libcraigscrape 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/listings.rb ADDED
@@ -0,0 +1,144 @@
1
+ # = About listings.rb
2
+ #
3
+ # This file contains the parsing code, and logic relating to post-listing pages. You
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
+ #
7
+
8
+ require 'scraper'
9
+
10
+ # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
11
+ class CraigScrape::Listings < CraigScrape::Scraper
12
+ LABEL = /^(.+?)[ ]*\-$/
13
+ LOCATION = /^[ ]*\((.*?)\)$/
14
+ IMG_TYPE = /^[ ]*(.+)[ ]*$/
15
+ HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
16
+ SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
17
+ NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
18
+
19
+ # Array, PostSummary objects found in the listing
20
+ def posts
21
+ unless @posts
22
+ current_date = nil
23
+ @posts = []
24
+
25
+ post_tags = html.get_elements_by_tag_name('p','h4')
26
+
27
+ # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
28
+ post_tags.pop if (
29
+ post_tags.length > 0 and
30
+ post_tags.last.at('a') and
31
+ NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
32
+ )
33
+
34
+ # Now we iterate though the listings:
35
+ post_tags.each do |el|
36
+ case el.name
37
+ when 'p'
38
+ post_summary = self.class.parse_summary el, current_date
39
+
40
+ # Validate that required fields are present:
41
+ parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
42
+
43
+ post_summary[:url] = url_from_href post_summary[:href]
44
+
45
+ @posts << CraigScrape::Posting.new(post_summary)
46
+ when 'h4'
47
+ # Let's make sense of the h4 tag, and then read all the p tags below it
48
+ if HEADER_DATE.match he_decode(el.inner_html)
49
+ # Generally, the H4 tags contain valid dates. When they do - this is easy:
50
+ current_date = CraigScrape.most_recently_expired_time $1, $2
51
+ elsif html.at('h4:last-of-type') == el
52
+ # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
53
+ # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
54
+ # we need to pull up the full post in order to accurately tell the date.
55
+ # Setting this to nil will achieve the eager-load.
56
+ current_date = nil
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ @posts
63
+ end
64
+
65
+ # String, URL Path href-fragment of the next page link
66
+ def next_page_href
67
+ unless @next_page_href
68
+ cursor = html.at 'p:last-of-type'
69
+
70
+ cursor = cursor.at 'a' if cursor
71
+
72
+ # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
73
+ next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
74
+
75
+ # Search listings put their next page in a link towards the top
76
+ next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
77
+
78
+ # Some search pages have a bug, whereby a 'next page' link isn't displayed,
79
+ # even though we can see that there's another page listed in the page-number links block at the top
80
+ # and bottom of the listing page
81
+ unless next_link
82
+ cursor = html % 'div.sh:first-of-type > b:last-of-type'
83
+
84
+ # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
85
+ # We're looking good.
86
+ next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
87
+ end
88
+
89
+ # We have an anchor tag - so - let's assign the href:
90
+ @next_page_href = next_link[:href] if next_link
91
+ end
92
+
93
+ @next_page_href
94
+ end
95
+
96
+ # String, Full URL Path of the 'next page' link
97
+ def next_page_url
98
+ (next_page_href) ? url_from_href(next_page_href) : nil
99
+ end
100
+
101
+ # Returns a Listings object of the next_page_url on the current listings object
102
+ def next_page
103
+ CraigScrape::Listings.new next_page_url if next_page_url
104
+ end
105
+
106
+ # Takes a paragraph element and returns a mostly-parsed Posting
107
+ # We separate this from the rest of the parsing both for readability and ease of testing
108
+ def self.parse_summary(p_element, date = nil) #:nodoc:
109
+ ret = {}
110
+
111
+ title_anchor, section_anchor = p_element.search 'a'
112
+ location_tag = p_element.at 'font'
113
+ has_pic_tag = p_element.at 'span'
114
+
115
+ href = nil
116
+
117
+ location = he_decode p_element.at('font').inner_html if location_tag
118
+ ret[:location] = $1 if location and LOCATION.match location
119
+
120
+ ret[:img_types] = []
121
+ if has_pic_tag
122
+ img_type = he_decode has_pic_tag.inner_html
123
+ img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
124
+
125
+ ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
126
+ end
127
+
128
+ ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
129
+
130
+ ret[:post_date] = date
131
+ if SUMMARY_DATE.match he_decode(p_element.children[0])
132
+ ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
133
+ end
134
+
135
+ if title_anchor
136
+ label = he_decode title_anchor.inner_html
137
+ ret[:label] = $1 if LABEL.match label
138
+
139
+ ret[:href] = title_anchor[:href]
140
+ end
141
+
142
+ ret
143
+ end
144
+ end
data/lib/posting.rb ADDED
@@ -0,0 +1,293 @@
1
+ # = About posting.rb
2
+ #
3
+ # This file contains the parsing code, and logic relating to craigslist postings. You
4
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
5
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
6
+ #
7
+
8
+ require 'scraper'
9
+
10
+ # Posting represents a fully downloaded, and parsed, Craigslist post.
11
+ # This class is generally returned by the listing scrape methods, and
12
+ # contains the post summaries for a specific search url, or a general listing category
13
+ class CraigScrape::Posting < CraigScrape::Scraper
14
+
15
+ POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
16
+ LOCATION = /Location\:[ ]+(.+)/
17
+ HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
18
+ POSTING_ID = /PostingID\:[ ]+([\d]+)/
19
+ REPLY_TO = /(.+)/
20
+ PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
21
+ USERBODY_PARTS = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
22
+ IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
23
+
24
+ # This is really just for testing, in production use, uri.path is a better solution
25
+ attr_reader :href #:nodoc:
26
+
27
+ # Create a new Post via a url (String), or supplied parameters (Hash)
28
+ def initialize(*args)
29
+ super(*args)
30
+
31
+ # Validate that required fields are present, at least - if we've downloaded it from a url
32
+ parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
33
+ contents,posting_id,post_time,header,title,full_section
34
+ ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
35
+ end
36
+
37
+
38
+ # String, The contents of the item's html body heading
39
+ def header
40
+ unless @header
41
+ h2 = html.at 'h2' if html
42
+ @header = he_decode h2.inner_html if h2
43
+ end
44
+
45
+ @header
46
+ end
47
+
48
+ # String, the item's title
49
+ def title
50
+ unless @title
51
+ title_tag = html.at 'title' if html
52
+ @title = he_decode title_tag.inner_html if title_tag
53
+ @title = nil if @title and @title.length == 0
54
+ end
55
+
56
+ @title
57
+ end
58
+
59
+ # Array, hierarchical representation of the post's section
60
+ def full_section
61
+ unless @full_section
62
+ @full_section = []
63
+
64
+ (html/"div[@class='bchead']//a").each do |a|
65
+ @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
66
+ end if html
67
+ end
68
+
69
+ @full_section
70
+ end
71
+
72
+ # String, represents the post's reply-to address, if listed
73
+ def reply_to
74
+ unless @reply_to
75
+ cursor = html.at 'hr' if html
76
+ cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
77
+ @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
78
+ end
79
+
80
+ @reply_to
81
+ end
82
+
83
+ # Time, reflects the full timestamp of the posting
84
+ def post_time
85
+ unless @post_time
86
+ cursor = html.at 'hr' if html
87
+ cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
88
+ @post_time = Time.parse $1 if $1
89
+ end
90
+
91
+ @post_time
92
+ end
93
+
94
+ # Integer, Craigslist's unique posting id
95
+ def posting_id
96
+ unless @posting_id
97
+ cursor = (html/"#userbody").first if html
98
+ cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
99
+ @posting_id = $1.to_i if $1
100
+ end
101
+
102
+ @posting_id
103
+ end
104
+
105
+ # String, The full-html contents of the post
106
+ def contents
107
+ unless @contents
108
+ @contents = user_body if html
109
+ @contents = he_decode @contents.strip if @contents
110
+ end
111
+
112
+ @contents
113
+ end
114
+
115
+ # String, the location of the item, as best could be parsed
116
+ def location
117
+ if @location.nil? and craigslist_body and html
118
+ # Location (when explicitly defined):
119
+ cursor = craigslist_body.at 'ul' unless @location
120
+
121
+ # Apa section includes other things in the li's (cats/dogs ok fields)
122
+ cursor.children.each do |li|
123
+ if LOCATION.match li.inner_html
124
+ @location = he_decode($1) and break
125
+ break
126
+ end
127
+ end if cursor
128
+
129
+ # Real estate listings can work a little different for location:
130
+ unless @location
131
+ cursor = craigslist_body.at 'small'
132
+ cursor = cursor.previous_node until cursor.nil? or cursor.text?
133
+
134
+ @location = he_decode(cursor.to_s.strip) if cursor
135
+ end
136
+
137
+ # So, *sometimes* the location just ends up being in the header, I don't know why:
138
+ @location = $1 if @location.nil? and HEADER_LOCATION.match header
139
+ end
140
+
141
+ @location
142
+ end
143
+
144
+ # Array, urls of the post's images that are *not* hosted on craigslist
145
+ def images
146
+ # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
147
+ @images = (
148
+ contents ?
149
+ contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
150
+ []
151
+ ) unless @images
152
+
153
+ @images
154
+ end
155
+
156
+ # Array, urls of the post's craigslist-hosted images
157
+ def pics
158
+ unless @pics
159
+ @pics = []
160
+
161
+ if html and craigslist_body
162
+ # Now let's find the craigslist hosted images:
163
+ img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
164
+
165
+ @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
166
+ end
167
+ end
168
+
169
+ @pics
170
+ end
171
+
172
+ # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
173
+ def flagged_for_removal?
174
+ @flagged_for_removal = (
175
+ system_post? and header_as_plain == "This posting has been flagged for removal"
176
+ ) if @flagged_for_removal.nil?
177
+
178
+ @flagged_for_removal
179
+ end
180
+
181
+ # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
182
+ def deleted_by_author?
183
+ @deleted_by_author = (
184
+ system_post? and header_as_plain == "This posting has been deleted by its author."
185
+ ) if @deleted_by_author.nil?
186
+
187
+ @deleted_by_author
188
+ end
189
+
190
+
191
+ # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
192
+ # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
193
+ def post_date
194
+ @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
195
+
196
+ @post_date
197
+ end
198
+
199
+ # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
200
+ # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
201
+ # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
202
+ # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
203
+ # in a full page load from the post's url.
204
+ def label
205
+ unless @label or system_post?
206
+ @label = header
207
+
208
+ @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
209
+ end
210
+
211
+ @label
212
+ end
213
+
214
+ # Array, which image types are listed for the post.
215
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
216
+ def img_types
217
+ unless @img_types
218
+ @img_types = []
219
+
220
+ @img_types << :img if images.length > 0
221
+ @img_types << :pic if pics.length > 0
222
+ end
223
+
224
+ @img_types
225
+ end
226
+
227
+ # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
228
+ # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
229
+ def section
230
+ unless @section
231
+ @section = full_section.last if full_section
232
+ end
233
+
234
+ @section
235
+ end
236
+
237
+ # true if post summary has 'img(s)'. 'imgs' are different from pics, in that the resource is *not* hosted on craigslist's server.
238
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
239
+ def has_img?
240
+ img_types.include? :img
241
+ end
242
+
243
+ # true if post summary has 'pic(s)'. 'pics' are different from imgs, in that craigslist is hosting the resource on craigslist's servers
244
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
245
+ def has_pic?
246
+ img_types.include? :pic
247
+ end
248
+
249
+ # true if post summary has either the img or pic label
250
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
251
+ def has_pic_or_img?
252
+ img_types.length > 0
253
+ end
254
+
255
+ # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
256
+ # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
257
+ def price
258
+ $1.tr('$','').to_f if label and PRICE.match label
259
+ end
260
+
261
+ # Returns the post contents with all html tags removed
262
+ def contents_as_plain
263
+ strip_html contents
264
+ end
265
+
266
+ # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
267
+ # 'system_post' we may get tags in here
268
+ def header_as_plain
269
+ strip_html header
270
+ end
271
+
272
+ # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
273
+ # This returns true or false if that case applies
274
+ def system_post?
275
+ [contents,posting_id,post_time,title].all?{|f| f.nil?}
276
+ end
277
+
278
+ private
279
+
280
+ # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
281
+ # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
282
+ # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
283
+ def user_body
284
+ $1 if USERBODY_PARTS.match html.to_s
285
+ end
286
+
287
+ # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
288
+ # So - we'll return it as an Hpricot object.
289
+ def craigslist_body
290
+ Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
291
+ end
292
+
293
+ end
data/lib/scraper.rb ADDED
@@ -0,0 +1,203 @@
1
+ # = About scraper.rb
2
+ #
3
+ # This file defines:
4
+ # - the base class from which other parse objects inherit
5
+ # - Basic http and connection handling methods
6
+ # - html utility methods used by objects
7
+ # - Common Errors
8
+ # You should never need to include this file directly, as all of libcraigscrape's objects and methods
9
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
10
+ #
11
+
12
+ require 'net/http'
13
+ require 'zlib'
14
+
15
+ require 'rubygems'
16
+ require 'activesupport'
17
+ require 'hpricot'
18
+ require 'htmlentities'
19
+
20
+ # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
21
+ # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
22
+ # methods. It also contains the http-related cattr_accessors:
23
+ #
24
+ # <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
25
+ #
26
+ # <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
27
+ #
28
+ # <b>sleep_between_fetch_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 30.
29
+ #
30
+ # <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http Response code 404). Defaults to 3.
31
+ #
32
+ # <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
33
+ #
34
+ class CraigScrape::Scraper
35
+ cattr_accessor :logger
36
+ cattr_accessor :sleep_between_fetch_retries
37
+ cattr_accessor :retries_on_fetch_fail
38
+ cattr_accessor :retries_on_404_fail
39
+ cattr_accessor :sleep_between_404_retries
40
+
41
+ URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
42
+ HTML_TAG = /<\/?[^>]*>/
43
+
44
+ # Returns the full url that corresponds to this resource
45
+ attr_reader :url
46
+
47
+ # Set some defaults:
48
+ self.retries_on_fetch_fail = 8
49
+ self.sleep_between_fetch_retries = 30
50
+
51
+ self.retries_on_404_fail = 3
52
+ self.sleep_between_404_retries = 3
53
+
54
+ class BadConstructionError < StandardError #:nodoc:
55
+ end
56
+
57
+ class ParseError < StandardError #:nodoc:
58
+ end
59
+
60
+ class BadUrlError < StandardError #:nodoc:
61
+ end
62
+
63
+ class FetchError < StandardError #:nodoc:
64
+ end
65
+
66
+ class ResourceNotFoundError < StandardError #:nodoc:
67
+ end
68
+
69
+ # Scraper Objects can be created from either a full URL (string), or a Hash.
70
+ # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
71
+ # if you know what you're doing - feel free to try this out.
72
+ #
73
+ # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
74
+ #
75
+ # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
76
+ # This is useful to create an object without actually making an html request, this is used to set-up an
77
+ # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
78
+ # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash,
79
+ # Otherwise this will fail to eager load.
80
+ def initialize(init_via = nil)
81
+ if init_via.nil?
82
+ # Do nothing - possibly not a great idea, but we'll allow it
83
+ elsif init_via.kind_of? String
84
+ @url = init_via
85
+ elsif init_via.kind_of? Hash
86
+ init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
87
+ else
88
+ raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
89
+ end
90
+ end
91
+
92
+ # Indicates whether the resource has yet been retrieved from its associated url.
93
+ # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
94
+ # but hasn't yet been fetched.
95
+ def downloaded?; !@html.nil?; end
96
+
97
+ # A URI object corresponding to this Scraped URL
98
+ def uri
99
+ @uri ||= URI.parse @url if @url
100
+ @uri
101
+ end
102
+
103
+ private
104
+
105
+ # Returns text with all html tags removed.
106
+ def strip_html(str)
107
+ str.gsub HTML_TAG, "" if str
108
+ end
109
+
110
+ # Easy way to fail noisily:
111
+ def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
112
+
113
+ # Returns text with all html entities converted to respective ascii character.
114
+ def he_decode(text); self.class.he_decode text; end
115
+
116
+ # Returns text with all html entities converted to respective ascii character.
117
+ def self.he_decode(text); HTMLEntities.new.decode text; end
118
+
119
+ # Derives a full url, using the current object's url and the provided href
120
+ def url_from_href(href) #:nodoc:
121
+ scheme, host, path = $1, $2, $3 if URL_PARTS.match href
122
+
123
+ scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
124
+
125
+ host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
126
+
127
+ path = (
128
+ (/\/$/.match(uri.path)) ?
129
+ '%s%s' % [uri.path,path] :
130
+ '%s/%s' % [File.dirname(uri.path),path]
131
+ ) unless /^\//.match path
132
+
133
+ '%s://%s%s' % [scheme, host, path]
134
+ end
135
+
136
+ def fetch_uri(uri)
137
+ logger.info "Requesting: %s" % @url if logger
138
+
139
+ case uri.scheme
140
+ when 'file'
141
+ # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
142
+ File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
143
+ when /^http[s]?/
144
+ fetch_http uri
145
+ else
146
+ raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
147
+ end
148
+ end
149
+
150
+ def fetch_http(uri)
151
+ fetch_attempts = 0
152
+ resource_not_found_attempts = 0
153
+
154
+ begin
155
+ # This handles the redirects for us
156
+ resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
157
+
158
+ if resp.response.code == "200"
159
+ # Check for gzip, and decode:
160
+ data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
161
+
162
+ data
163
+ elsif resp.response['Location']
164
+ redirect_to = resp.response['Location']
165
+
166
+ fetch_uri URI.parse(url_from_href(redirect_to))
167
+ else
168
+ # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
169
+ raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
170
+ end
171
+ rescue ResourceNotFoundError => err
172
+ logger.info err.message if logger
173
+
174
+ resource_not_found_attempts += 1
175
+
176
+ if resource_not_found_attempts <= self.retries_on_404_fail
177
+ sleep self.sleep_between_404_retries if self.sleep_between_404_retries
178
+ logger.info 'Retrying ....' if logger
179
+ retry
180
+ else
181
+ raise err
182
+ end
183
+ rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
184
+ logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
185
+ logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
186
+
187
+ fetch_attempts += 1
188
+
189
+ if fetch_attempts <= self.retries_on_fetch_fail
190
+ sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
191
+ logger.info 'Retrying fetch ....' if logger
192
+ retry
193
+ else
194
+ raise err
195
+ end
196
+ end
197
+ end
198
+
199
+ def html
200
+ @html ||= Hpricot.parse fetch_uri(uri) if uri
201
+ @html
202
+ end
203
+ end
@@ -0,0 +1,31 @@
1
+
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
3
+ <html>
4
+ <head>
5
+ <title>craigslist: classifieds for jobs, apartments, personals, for sale, services, community, and events</title>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <link rel="stylesheet" title="craigslist" href="http://www.craigslist.org/styles/craigslist.css" type="text/css" media="all">
8
+ <style type="text/css"><!--
9
+ a { text-decoration: none; }
10
+ #list { line-height: 2.00em;; }
11
+ #list em { font-size: smaller; font-weight: normal; }
12
+ -->
13
+ </style>
14
+ </head>
15
+ <body>
16
+ <blockquote>
17
+ <h3><b><a href="http://www.craigslist.org/">craigslist</a> &gt; </b> <sup><a href="http://en.wikipedia.org/wiki/">w</a></sup></h3>
18
+
19
+ <blockquote>&nbsp;
20
+ <blockquote>
21
+ <h4>choose the site nearest you (<a href="http://forums.craigslist.org/?forumID=1">or suggest a new one</a>):</h4>
22
+ <blockquote>&nbsp;
23
+ <div id="list"><a href="http://caribbean.craigslist.org/">caribbean islands</a> <br>
24
+ <a href="http://micronesia.craigslist.org/">guam-micronesia</a> <br>
25
+ </div>
26
+ </blockquote>
27
+ </blockquote>
28
+ </blockquote>
29
+ </blockquote>
30
+ </body>
31
+ </html>