libcraigscrape 0.7.0 → 0.8.0
This diff shows the content of publicly released package versions as published to one of the supported registries, and is provided for informational purposes only.
- data/CHANGELOG +19 -0
- data/README +27 -11
- data/Rakefile +44 -2
- data/bin/craig_report_schema.yml +30 -21
- data/bin/craigwatch +232 -67
- data/bin/report_mailer/craigslist_report.html.erb +12 -9
- data/bin/report_mailer/craigslist_report.plain.erb +4 -1
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +158 -650
- data/lib/listings.rb +144 -0
- data/lib/posting.rb +293 -0
- data/lib/scraper.rb +203 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/test_craigslist_geolisting.rb +476 -380
- metadata +28 -2
data/lib/listings.rb
ADDED
@@ -0,0 +1,144 @@
# = About listings.rb
#
# This file contains the parsing code, and logic relating to post-listing pages. You
# should never need to include this file directly, as all of libcraigscrape's objects and methods
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
#

require 'scraper'

# Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
class CraigScrape::Listings < CraigScrape::Scraper
  LABEL          = /^(.+?)[ ]*\-$/
  LOCATION       = /^[ ]*\((.*?)\)$/
  IMG_TYPE       = /^[ ]*(.+)[ ]*$/
  HEADER_DATE    = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
  SUMMARY_DATE   = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
  NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/

  # Array, PostSummary objects found in the listing
  def posts
    unless @posts
      current_date = nil
      @posts = []

      post_tags = html.get_elements_by_tag_name('p','h4')

      # The last p in the list is sometimes a 'next XXX postings' link. We don't want to include this in our PostSummary output:
      post_tags.pop if (
        post_tags.length > 0 and
        post_tags.last.at('a') and
        NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
      )

      # Now we iterate through the listings:
      post_tags.each do |el|
        case el.name
          when 'p'
            post_summary = self.class.parse_summary el, current_date

            # Validate that required fields are present:
            parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}

            post_summary[:url] = url_from_href post_summary[:href]

            @posts << CraigScrape::Posting.new(post_summary)
          when 'h4'
            # Let's make sense of the h4 tag, and then read all the p tags below it
            if HEADER_DATE.match he_decode(el.inner_html)
              # Generally, the H4 tags contain valid dates. When they do - this is easy:
              current_date = CraigScrape.most_recently_expired_time $1, $2
            elsif html.at('h4:last-of-type') == el
              # There's a specific bug where these nonsense h4's just appear without anything relevant inside them.
              # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
              # we need to pull up the full post in order to accurately tell the date.
              # Setting this to nil will achieve the eager-load.
              current_date = nil
            end
        end
      end
    end

    @posts
  end

  # String, URL Path href-fragment of the next page link
  def next_page_href
    unless @next_page_href
      cursor = html.at 'p:last-of-type'

      cursor = cursor.at 'a' if cursor

      # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
      next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html

      # Search listings put their next page in a link towards the top
      next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link

      # Some search pages have a bug whereby a 'next page' link isn't displayed,
      # even though we can see that there's another page listed in the page-number links block at the top
      # and bottom of the listing page
      unless next_link
        cursor = html % 'div.sh:first-of-type > b:last-of-type'

        # If there's no 'a' in the next sibling, we'll have just performed a nil assignment; otherwise
        # we're looking good.
        next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
      end

      # We have an anchor tag - so - let's assign the href:
      @next_page_href = next_link[:href] if next_link
    end

    @next_page_href
  end

  # String, Full URL Path of the 'next page' link
  def next_page_url
    (next_page_href) ? url_from_href(next_page_href) : nil
  end

  # Returns a Listings object of the next_page_url on the current listings object
  def next_page
    CraigScrape::Listings.new next_page_url if next_page_url
  end

  # Takes a paragraph element and returns a mostly-parsed Posting.
  # We separate this from the rest of the parsing both for readability and ease of testing
  def self.parse_summary(p_element, date = nil)  #:nodoc:
    ret = {}

    title_anchor, section_anchor = p_element.search 'a'
    location_tag = p_element.at 'font'
    has_pic_tag = p_element.at 'span'

    href = nil

    location = he_decode p_element.at('font').inner_html if location_tag
    ret[:location] = $1 if location and LOCATION.match location

    ret[:img_types] = []
    if has_pic_tag
      img_type = he_decode has_pic_tag.inner_html
      img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type

      ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
    end

    ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor

    ret[:post_date] = date
    if SUMMARY_DATE.match he_decode(p_element.children[0])
      ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
    end

    if title_anchor
      label = he_decode title_anchor.inner_html
      ret[:label] = $1 if LABEL.match label

      ret[:href] = title_anchor[:href]
    end

    ret
  end
end
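To make the flow above concrete, here is a minimal usage sketch of the new Listings class. It is an assumption-laden example rather than documentation from this release: the sfbay url is invented, and it presumes a reachable craigslist listing page.

require 'libcraigscrape'

# Hypothetical listing url - not taken from this diff:
listing = CraigScrape::Listings.new 'http://sfbay.craigslist.org/sss/'

# Walk every summary on every page, following next_page until it returns nil:
while listing
  listing.posts.each{|post| puts post.label}
  listing = listing.next_page
end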
data/lib/posting.rb
ADDED
@@ -0,0 +1,293 @@
# = About posting.rb
#
# This file contains the parsing code, and logic relating to craigslist postings. You
# should never need to include this file directly, as all of libcraigscrape's objects and methods
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
#

require 'scraper'

# Posting represents a fully downloaded, and parsed, Craigslist post.
# Objects of this class are generally returned by the listing scrape methods,
# which initialize them from the post summaries of a specific search url, or a general listing category
class CraigScrape::Posting < CraigScrape::Scraper

  POST_DATE       = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
  LOCATION        = /Location\:[ ]+(.+)/
  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
  POSTING_ID      = /PostingID\:[ ]+([\d]+)/
  REPLY_TO        = /(.+)/
  PRICE           = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
  USERBODY_PARTS  = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
  IMAGE_SRC       = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/

  # This is really just for testing; in production use, uri.path is a better solution
  attr_reader :href #:nodoc:

  # Create a new Post via a url (String), or supplied parameters (Hash)
  def initialize(*args)
    super(*args)

    # Validate that required fields are present, at least - if we've downloaded it from a url
    parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
      contents,posting_id,post_time,header,title,full_section
    ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
  end

  # String, The contents of the item's html body heading
  def header
    unless @header
      h2 = html.at 'h2' if html
      @header = he_decode h2.inner_html if h2
    end

    @header
  end

  # String, the item's title
  def title
    unless @title
      title_tag = html.at 'title' if html
      @title = he_decode title_tag.inner_html if title_tag
      @title = nil if @title and @title.length == 0
    end

    @title
  end

  # Array, hierarchical representation of the post's section
  def full_section
    unless @full_section
      @full_section = []

      (html/"div[@class='bchead']//a").each do |a|
        @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
      end if html
    end

    @full_section
  end

  # String, represents the post's reply-to address, if listed
  def reply_to
    unless @reply_to
      cursor = html.at 'hr' if html
      cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
      @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
    end

    @reply_to
  end

  # Time, reflects the full timestamp of the posting
  def post_time
    unless @post_time
      cursor = html.at 'hr' if html
      cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
      @post_time = Time.parse $1 if $1
    end

    @post_time
  end

  # Integer, Craigslist's unique posting id
  def posting_id
    unless @posting_id
      cursor = (html/"#userbody").first if html
      cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
      @posting_id = $1.to_i if $1
    end

    @posting_id
  end

  # String, The full-html contents of the post
  def contents
    unless @contents
      @contents = user_body if html
      @contents = he_decode @contents.strip if @contents
    end

    @contents
  end

  # String, the location of the item, as best could be parsed
  def location
    if @location.nil? and craigslist_body and html
      # Location (when explicitly defined):
      cursor = craigslist_body.at 'ul' unless @location

      # The apa section includes other things in the li's (cats/dogs ok fields)
      cursor.children.each do |li|
        if LOCATION.match li.inner_html
          @location = he_decode($1)
          break
        end
      end if cursor

      # Real estate listings can work a little differently for location:
      unless @location
        cursor = craigslist_body.at 'small'
        cursor = cursor.previous_node until cursor.nil? or cursor.text?

        @location = he_decode(cursor.to_s.strip) if cursor
      end

      # So, *sometimes* the location just ends up being in the header, I don't know why:
      @location = $1 if @location.nil? and HEADER_LOCATION.match header
    end

    @location
  end

  # Array, urls of the post's images that are *not* hosted on craigslist
  def images
    # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
    @images = (
      contents ?
        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
        []
    ) unless @images

    @images
  end

  # Array, urls of the post's craigslist-hosted images
  def pics
    unless @pics
      @pics = []

      if html and craigslist_body
        # Now let's find the craigslist hosted images:
        img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}

        @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
      end
    end

    @pics
  end

  # Returns true if this Post was parsed, and was merely a 'Flagged for Removal' page
  def flagged_for_removal?
    @flagged_for_removal = (
      system_post? and header_as_plain == "This posting has been flagged for removal"
    ) if @flagged_for_removal.nil?

    @flagged_for_removal
  end

  # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
  def deleted_by_author?
    @deleted_by_author = (
      system_post? and header_as_plain == "This posting has been deleted by its author."
    ) if @deleted_by_author.nil?

    @deleted_by_author
  end

  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
  # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
  def post_date
    @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?

    @post_date
  end

  # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
  # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
  # Sometimes there's additional information in the header, i.e. '(map)' on rea listings, that isn't to be listed in the label.
  # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post that won't result
  # in a full page load from the post's url.
  def label
    unless @label or system_post?
      @label = header

      @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
    end

    @label
  end

  # Array, which image types are listed for the post.
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
  def img_types
    unless @img_types
      @img_types = []

      @img_types << :img if images.length > 0
      @img_types << :pic if pics.length > 0
    end

    @img_types
  end

  # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
  # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
  def section
    unless @section
      @section = full_section.last if full_section
    end

    @section
  end

  # true if the post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's servers.
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
  def has_img?
    img_types.include? :img
  end

  # true if the post summary has 'pic(s)'. 'pics' are different than imgs, in that craigslist is hosting the resource on craigslist's servers.
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
  def has_pic?
    img_types.include? :pic
  end

  # true if the post summary has either the img or pic label.
  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
  def has_pic_or_img?
    img_types.length > 0
  end

  # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
  # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
  def price
    $1.tr('$','').to_f if label and PRICE.match label
  end

  # Returns the post contents with all html tags removed
  def contents_as_plain
    strip_html contents
  end

  # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
  # 'system_post' we may get tags in here
  def header_as_plain
    strip_html header
  end

  # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original.
  # This returns true or false depending on whether that case applies
  def system_post?
    [contents,posting_id,post_time,title].all?{|f| f.nil?}
  end

  private

  # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place.
  # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so.
  # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
  def user_body
    $1 if USERBODY_PARTS.match html.to_s
  end

  # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
  # So - we'll return it as an Hpricot object.
  def craigslist_body
    Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
  end

end
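As a quick orientation to the accessors above, a hedged sketch of pulling a single post follows; the posting url is invented for illustration and is not taken from this diff.

require 'libcraigscrape'

# Hypothetical post url - fetching happens lazily, on first accessor call:
post = CraigScrape::Posting.new 'http://miami.craigslist.org/mdc/sys/1234567890.html'

unless post.flagged_for_removal? or post.deleted_by_author?
  puts post.title
  puts post.posting_id
  puts post.price         # best-guess Float, parsed from the label
  puts post.pics.inspect  # craigslist-hosted image urls
end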
data/lib/scraper.rb
ADDED
@@ -0,0 +1,203 @@
# = About scraper.rb
#
# This file defines:
# - the base class from which other parse objects inherit
# - Basic http and connection handling methods
# - html utility methods used by objects
# - Common Errors
# You should never need to include this file directly, as all of libcraigscrape's objects and methods
# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
#

require 'net/http'
require 'zlib'
require 'stringio'

require 'rubygems'
require 'activesupport'
require 'hpricot'
require 'htmlentities'

# Scraper is a general-purpose base class for all libcraigscrape objects. Scraper facilitates all http-related
# functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
# methods. It also contains the http-related cattr_accessors:
#
# <b>logger</b> - a Logger object to which http notices are logged. Defaults to nil
#
# <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
#
# <b>sleep_between_fetch_retries</b> - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 30.
#
# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http response code 404). Defaults to 3.
#
# <b>sleep_between_404_retries</b> - The number of seconds to sleep between successive attempts in the case of a Resource Not Found error. Defaults to 3.
#
class CraigScrape::Scraper
  cattr_accessor :logger
  cattr_accessor :sleep_between_fetch_retries
  cattr_accessor :retries_on_fetch_fail
  cattr_accessor :retries_on_404_fail
  cattr_accessor :sleep_between_404_retries

  URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
  HTML_TAG  = /<\/?[^>]*>/

  # Returns the full url that corresponds to this resource
  attr_reader :url

  # Set some defaults:
  self.retries_on_fetch_fail = 8
  self.sleep_between_fetch_retries = 30

  self.retries_on_404_fail = 3
  self.sleep_between_404_retries = 3

  class BadConstructionError < StandardError #:nodoc:
  end

  class ParseError < StandardError #:nodoc:
  end

  class BadUrlError < StandardError #:nodoc:
  end

  class FetchError < StandardError #:nodoc:
  end

  class ResourceNotFoundError < StandardError #:nodoc:
  end

  # Scraper objects can be created from either a full URL (string), or a Hash.
  # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
  # if you know what you're doing - feel free to try this out.
  #
  # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
  #
  # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
  # This is useful to create an object without actually making an html request; this is used to set up an
  # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
  # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash.
  # Otherwise this will fail to eager load.
  def initialize(init_via = nil)
    if init_via.nil?
      # Do nothing - possibly not a great idea, but we'll allow it
    elsif init_via.kind_of? String
      @url = init_via
    elsif init_via.kind_of? Hash
      init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
    else
      raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
    end
  end

  # Indicates whether the resource has yet been retrieved from its associated url.
  # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
  # but hasn't yet been fetched.
  def downloaded?; !@html.nil?; end

  # A URI object corresponding to this Scraped URL
  def uri
    @uri ||= URI.parse @url if @url
    @uri
  end

  private

  # Returns text with all html tags removed.
  def strip_html(str)
    str.gsub HTML_TAG, "" if str
  end

  # Easy way to fail noisily:
  def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end

  # Returns text with all html entities converted to their respective ascii characters.
  def he_decode(text); self.class.he_decode text; end

  # Returns text with all html entities converted to their respective ascii characters.
  def self.he_decode(text); HTMLEntities.new.decode text; end

  # Derives a full url, using the current object's url and the provided href
  def url_from_href(href) #:nodoc:
    scheme, host, path = $1, $2, $3 if URL_PARTS.match href

    scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme

    host = uri.host if host.nil? or host.empty? and uri.respond_to? :host

    path = (
      (/\/$/.match(uri.path)) ?
        '%s%s' % [uri.path,path] :
        '%s/%s' % [File.dirname(uri.path),path]
    ) unless /^\//.match path

    '%s://%s%s' % [scheme, host, path]
  end

  def fetch_uri(uri)
    logger.info "Requesting: %s" % @url if logger

    case uri.scheme
      when 'file'
        # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
        File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
      when /^http[s]?/
        fetch_http uri
      else
        raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
    end
  end

  def fetch_http(uri)
    fetch_attempts = 0
    resource_not_found_attempts = 0

    begin
      # This handles the redirects for us
      resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil

      if resp.response.code == "200"
        # Check for gzip, and decode:
        data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'

        data
      elsif resp.response['Location']
        redirect_to = resp.response['Location']

        fetch_uri URI.parse(url_from_href(redirect_to))
      else
        # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
      end
    rescue ResourceNotFoundError => err
      logger.info err.message if logger

      resource_not_found_attempts += 1

      if resource_not_found_attempts <= self.retries_on_404_fail
        sleep self.sleep_between_404_retries if self.sleep_between_404_retries
        logger.info 'Retrying ....' if logger
        retry
      else
        raise err
      end
    rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
      logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
      logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET

      fetch_attempts += 1

      if fetch_attempts <= self.retries_on_fetch_fail
        sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
        logger.info 'Retrying fetch ....' if logger
        retry
      else
        raise err
      end
    end
  end

  def html
    @html ||= Hpricot.parse fetch_uri(uri) if uri
    @html
  end
end
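Because these settings are cattr_accessors, http behavior can be tuned globally for every scrape object. A brief sketch follows; the values are illustrative, not recommendations from this release.

require 'logger'
require 'libcraigscrape'

# Log http activity, and shorten the retry policy (example values):
CraigScrape::Scraper.logger = Logger.new $stderr
CraigScrape::Scraper.retries_on_fetch_fail = 3
CraigScrape::Scraper.sleep_between_fetch_retries = 10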
data/test/geolisting_samples/hierarchy_test071009/index.html
ADDED
@@ -0,0 +1,31 @@

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>craigslist: classifieds for jobs, apartments, personals, for sale, services, community, and events</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="stylesheet" title="craigslist" href="http://www.craigslist.org/styles/craigslist.css" type="text/css" media="all">
<style type="text/css"><!--
a { text-decoration: none; }
#list { line-height: 2.00em;; }
#list em { font-size: smaller; font-weight: normal; }
-->
</style>
</head>
<body>
<blockquote>
<h3><b><a href="http://www.craigslist.org/">craigslist</a> > </b> <sup><a href="http://en.wikipedia.org/wiki/">w</a></sup></h3>

<blockquote>
<blockquote>
<h4>choose the site nearest you (<a href="http://forums.craigslist.org/?forumID=1">or suggest a new one</a>):</h4>
<blockquote>
<div id="list"><a href="http://caribbean.craigslist.org/">caribbean islands</a> <br>
<a href="http://micronesia.craigslist.org/">guam-micronesia</a> <br>
</div>
</blockquote>
</blockquote>
</blockquote>
</blockquote>
</body>
</html>