olek-libcraigscrape 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. data/CHANGELOG +94 -0
  2. data/COPYING +674 -0
  3. data/COPYING.LESSER +165 -0
  4. data/README +89 -0
  5. data/Rakefile +125 -0
  6. data/bin/craig_report_schema.yml +68 -0
  7. data/bin/craigwatch +581 -0
  8. data/bin/report_mailer/craigslist_report.html.erb +17 -0
  9. data/bin/report_mailer/craigslist_report.plain.erb +18 -0
  10. data/lib/geo_listings.rb +144 -0
  11. data/lib/libcraigscrape.rb +217 -0
  12. data/lib/listings.rb +160 -0
  13. data/lib/posting.rb +324 -0
  14. data/lib/scraper.rb +212 -0
  15. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  16. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  17. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  18. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  19. data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
  20. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
  21. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
  22. data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
  23. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
  24. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
  25. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
  26. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
  27. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
  28. data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
  29. data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
  30. data/test/google.html +8 -0
  31. data/test/libcraigscrape_test_helpers.rb +37 -0
  32. data/test/listing_samples/category_output.html +231 -0
  33. data/test/listing_samples/category_output_2.html +217 -0
  34. data/test/listing_samples/empty_listings.html +128 -0
  35. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  36. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  37. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  38. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  39. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  40. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  41. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  42. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  43. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  44. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  45. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  46. data/test/listing_samples/long_search_output.html +137 -0
  47. data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
  48. data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
  49. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  50. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  51. data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
  52. data/test/listing_samples/short_search_output.html +133 -0
  53. data/test/post_samples/1207457727.html +92 -0
  54. data/test/post_samples/brw_reb_1224008903.html +101 -0
  55. data/test/post_samples/posting0.html +91 -0
  56. data/test/post_samples/posting1.html +106 -0
  57. data/test/post_samples/posting1796890756-061710.html +2318 -0
  58. data/test/post_samples/posting1808219423.html +2473 -0
  59. data/test/post_samples/posting1938291834-090610.html +188 -0
  60. data/test/post_samples/posting2.html +107 -0
  61. data/test/post_samples/posting3.html +92 -0
  62. data/test/post_samples/posting4.html +993 -0
  63. data/test/post_samples/posting5.html +38 -0
  64. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  65. data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
  66. data/test/post_samples/this_post_has_expired.html +48 -0
  67. data/test/test_craigslist_geolisting.rb +521 -0
  68. data/test/test_craigslist_listing.rb +362 -0
  69. data/test/test_craigslist_posting.rb +426 -0
  70. metadata +273 -0
data/lib/posting.rb ADDED
@@ -0,0 +1,324 @@
+ # = About posting.rb
+ #
+ # This file contains the parsing code, and logic relating to craigslist postings. You
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+
+ require 'scraper'
+
+ # Posting represents a fully downloaded, and parsed, Craigslist post.
+ # This class is generally returned by the listing scrape methods, and
+ # contains the fully parsed data of a single post from a search url or a general listing category
+ class CraigScrape::Posting < CraigScrape::Scraper
+
+   POST_DATE       = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
+   LOCATION        = /Location\:[ ]+(.+)/
+   HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
+   POSTING_ID      = /PostingID\:[ ]+([\d]+)/
+   REPLY_TO        = /(.+)/
+   PRICE           = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+   USERBODY_PARTS  = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
+   HTML_HEADER     = /^(.+)\<div id\=\"userbody\">/m
+   IMAGE_SRC       = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+
+   # This is really just for testing; in production use, uri.path is a better solution
+   attr_reader :href #:nodoc:
+
+   # Create a new Post via a url (String), or supplied parameters (Hash)
+   def initialize(*args)
+     super(*args)
+
+     # Validate that required fields are present, at least - if we've downloaded it from a url
+     parse_error! if (
+       args.first.kind_of? String and
+       !flagged_for_removal? and
+       !posting_has_expired? and
+       !deleted_by_author? and [
+         contents,posting_id,post_time,header,title,full_section
+       ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+     )
+   end
+
+
+   # String, the contents of the item's html body heading
+   def header
+     unless @header
+       h2 = html_head.at 'h2' if html_head
+       @header = he_decode h2.inner_html if h2
+     end
+
+     @header
+   end
+
+   # String, the item's title
+   def title
+     unless @title
+       title_tag = html_head.at 'title' if html_head
+       @title = he_decode title_tag.inner_html if title_tag
+       @title = nil if @title and @title.length == 0
+     end
+
+     @title
+   end
+
+   # Array, hierarchical representation of the post's section
+   def full_section
+     unless @full_section
+       @full_section = []
+
+       (html_head/"div[@class='bchead']//a").each do |a|
+         @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+       end if html_head
+     end
+
+     @full_section
+   end
+
+   # String, represents the post's reply-to address, if listed
+   def reply_to
+     unless @reply_to
+       cursor = html_head.at 'hr' if html_head
+       cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+       @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+     end
+
+     @reply_to
+   end
+
+   # Time, reflects the full timestamp of the posting
+   def post_time
+     unless @post_time
+       cursor = html_head.at 'hr' if html_head
+       cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
+       @post_time = Time.parse $1 if $1
+     end
+
+     @post_time
+   end
+
+   # Integer, Craigslist's unique posting id
+   def posting_id
+     unless @posting_id
+       cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
+       cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
+       @posting_id = $1.to_i if $1
+     end
+
+     @posting_id
+   end
+
+   # String, the full-html contents of the post
+   def contents
+     unless @contents
+       @contents = user_body if html_source
+       @contents = he_decode @contents.strip if @contents
+     end
+
+     @contents
+   end
+
+   # String, the location of the item, as best could be parsed
+   def location
+     if @location.nil? and craigslist_body and html
+       # Location (when explicitly defined):
+       cursor = craigslist_body.at 'ul' unless @location
+
+       # The apa section includes other things in the li's (cats/dogs-ok fields)
+       cursor.children.each do |li|
+         if LOCATION.match li.inner_html
+           @location = he_decode $1
+           break
+         end
+       end if cursor
+
+       # Real estate listings can work a little differently for location:
+       unless @location
+         cursor = craigslist_body.at 'small'
+         cursor = cursor.previous until cursor.nil? or cursor.text?
+
+         @location = he_decode(cursor.to_s.strip) if cursor
+       end
+
+       # So, *sometimes* the location just ends up being in the header; I don't know why:
+       @location = $1 if @location.nil? and HEADER_LOCATION.match header
+     end
+
+     @location
+   end
+
+   # Array, urls of the post's images that are *not* hosted on craigslist
+   def images
+     # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
+     @images = (
+       contents ?
+         contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
+         []
+     ) unless @images
+
+     @images
+   end
+
+   # Array, urls of the post's craigslist-hosted images
+   def pics
+     unless @pics
+       @pics = []
+
+       if html and craigslist_body
+         # Now let's find the craigslist hosted images:
+         img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+         @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+       end
+     end
+
+     @pics
+   end
+
+   # Returns true if this Post was parsed, and was merely a 'Flagged for Removal' page
+   def flagged_for_removal?
+     @flagged_for_removal = (
+       system_post? and header_as_plain == "This posting has been flagged for removal"
+     ) if @flagged_for_removal.nil?
+
+     @flagged_for_removal
+   end
+
+   # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
+   def deleted_by_author?
+     @deleted_by_author = (
+       system_post? and header_as_plain == "This posting has been deleted by its author."
+     ) if @deleted_by_author.nil?
+
+     @deleted_by_author
+   end
+
+   # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
+   def posting_has_expired?
+     @posting_has_expired = (
+       system_post? and header_as_plain == "This posting has expired."
+     ) if @posting_has_expired.nil?
+
+     @posting_has_expired
+   end
+
+
+   # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
+   # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+   def post_date
+     @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+
+     @post_date
+   end
+
+   # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
+   # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
+   # Sometimes there's additional information, ie. '(map)' on rea listings, included in the header that isn't listed in the label.
+   # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post that won't result
+   # in a full page load from the post's url.
+   def label
+     unless @label or system_post?
+       @label = header
+
+       @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
+     end
+
+     @label
+   end
+
+   # Array, which image types are listed for the post.
+   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+   def img_types
+     unless @img_types
+       @img_types = []
+
+       @img_types << :img if images.length > 0
+       @img_types << :pic if pics.length > 0
+     end
+
+     @img_types
+   end
+
+   # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+   # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
+   def section
+     unless @section
+       @section = full_section.last if full_section
+     end
+
+     @section
+   end
+
+   # true if the post summary has 'img(s)'. 'imgs' are different from pics, in that the resource is *not* hosted on craigslist's servers.
+   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+   def has_img?
+     img_types.include? :img
+   end
+
+   # true if the post summary has 'pic(s)'. 'pics' are different from imgs, in that craigslist is hosting the resource on craigslist's servers.
+   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+   def has_pic?
+     img_types.include? :pic
+   end
+
+   # true if the post summary has either the img or pic label
+   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+   def has_pic_or_img?
+     img_types.length > 0
+   end
+
+   # Returns the best guess of a price, judging by the label's contents. Price is available when pulled from the listing summary,
+   # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+   def price
+     $1.tr('$','').to_f if label and PRICE.match label
+   end
+
+   # Returns the post contents with all html tags removed
+   def contents_as_plain
+     strip_html contents
+   end
+
+   # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+   # 'system_post' we may get tags in here
+   def header_as_plain
+     strip_html header
+   end
+
+   # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original.
+   # This returns true or false depending on whether that case applies
+   def system_post?
+     [contents,posting_id,post_time,title].all?{|f| f.nil?}
+   end
+
+   private
+
+   # This is set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
+   # return everything above the user_body
+   def html_head
+     @html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
+     # We return html itself if HTML_HEADER doesn't match, which would be the case for a 404 page or something
+     @html_head ||= html
+
+     @html_head
+   end
+
+   # Since we started having so many problems with Hpricot flipping out on whack content bodies,
+   # I added this to return everything south of the user_body
+   def html_footer
+     $4 if USERBODY_PARTS.match html_source
+   end
+
+   # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place.
+   # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so.
+   # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
+   def user_body
+     $2 if USERBODY_PARTS.match html_source
+   end
+
+   # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+   # So - we'll return it as a Nokogiri object.
+   def craigslist_body
+     Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
+   end
+
+ end
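
For orientation, here is a minimal usage sketch of the Posting accessors defined above. The url is hypothetical, and the sketch assumes the gem and its dependencies (nokogiri, htmlentities) are installed; the page is fetched lazily the first time an accessor needs it:

    require 'libcraigscrape'

    # Hypothetical posting url; a 'file://' path to a saved page works as well
    post = CraigScrape::Posting.new 'http://miami.craigslist.org/art/1234567890.html'

    puts post.title                 # String, the item's title
    puts post.label                 # The header, minus any trailing '(location)'
    puts post.price                 # Float, best-guess parse of the label
    puts post.location              # String, as best could be parsed
    puts post.full_section.inspect  # Array, hierarchical section names
    puts post.contents_as_plain     # Post contents with html tags removed

    # These never force an extra page load when the post came from a listing scrape:
    puts post.has_pic_or_img?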
data/lib/scraper.rb ADDED
@@ -0,0 +1,212 @@
+ # = About scraper.rb
+ #
+ # This file defines:
+ # - the base class from which other parse objects inherit
+ # - Basic http and connection handling methods
+ # - html utility methods used by objects
+ # - Common Errors
+ # You should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+
+ # Scraper is a general-purpose base class for all libcraigscrape objects. Scraper facilitates all http-related
+ # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
+ # methods. It also contains the http-related cattr_accessors:
+ #
+ # <b>logger</b> - a Logger object to log http debug notices to. Defaults to nil
+ #
+ # <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
+ #
+ # <b>sleep_between_fetch_retries</b> - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 30.
+ #
+ # <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http response code 404). Defaults to 3.
+ #
+ # <b>sleep_between_404_retries</b> - The number of seconds to sleep between successive attempts in the case of a Resource Not Found error. Defaults to 3.
+ #
+ class CraigScrape::Scraper
+   cattr_accessor :logger
+   cattr_accessor :sleep_between_fetch_retries
+   cattr_accessor :retries_on_fetch_fail
+   cattr_accessor :retries_on_404_fail
+   cattr_accessor :sleep_between_404_retries
+   cattr_accessor :maximum_redirects_per_request
+
+   URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
+   HTML_TAG  = /<\/?[^>]*>/
+   # We have to specify this to nokogiri; otherwise it tries to figure out the encoding on its own, and craigslist users sometimes post crazy bytes
+   HTML_ENCODING = "UTF-8"
+
+   # Returns the full url that corresponds to this resource
+   attr_reader :url
+
+   # Set some defaults:
+   self.retries_on_fetch_fail = 8
+   self.sleep_between_fetch_retries = 30
+
+   self.retries_on_404_fail = 3
+   self.sleep_between_404_retries = 3
+
+   self.maximum_redirects_per_request = 20
+
+   class BadConstructionError < StandardError #:nodoc:
+   end
+
+   class ParseError < StandardError #:nodoc:
+   end
+
+   class BadUrlError < StandardError #:nodoc:
+   end
+
+   class MaxRedirectError < StandardError #:nodoc:
+   end
+
+   class FetchError < StandardError #:nodoc:
+   end
+
+   class ResourceNotFoundError < StandardError #:nodoc:
+   end
+
+   # Scraper objects can be created from either a full URL (string), or a Hash.
+   # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
+   # if you know what you're doing - feel free to try this out.
+   #
+   # A (string) url can be passed with an 'http://' scheme or a 'file://' scheme.
+   #
+   # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
+   # This is useful to create an object without actually making an html request; it is used to set up an
+   # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
+   # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash;
+   # otherwise this will fail to eager-load.
+   def initialize(init_via = nil)
+     if init_via.nil?
+       # Do nothing - possibly not a great idea, but we'll allow it
+     elsif init_via.kind_of? String
+       @url = init_via
+     elsif init_via.kind_of? Hash
+       init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
+     else
+       raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
+     end
+   end
+
+   # Indicates whether the resource has yet been retrieved from its associated url.
+   # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
+   # but hasn't yet been fetched.
+   def downloaded?; !@html_source.nil?; end
+
+   # A URI object corresponding to this scraped URL
+   def uri
+     @uri ||= URI.parse @url if @url
+     @uri
+   end
+
+   private
+
+   # Returns text with all html tags removed.
+   def strip_html(str)
+     str.gsub HTML_TAG, "" if str
+   end
+
+   # Easy way to fail noisily:
+   def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+
+   # Returns text with all html entities converted to their respective ascii characters.
+   def he_decode(text); self.class.he_decode text; end
+
+   # Returns text with all html entities converted to their respective ascii characters.
+   def self.he_decode(text); HTMLEntities.new.decode text; end
+
+   # Derives a full url, using the current object's url and the provided href
+   def url_from_href(href) #:nodoc:
+     scheme, host, path = $1, $2, $3 if URL_PARTS.match href
+
+     scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
+
+     host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
+
+     path = (
+       (/\/$/.match(uri.path)) ?
+         '%s%s' % [uri.path,path] :
+         '%s/%s' % [File.dirname(uri.path),path]
+     ) unless /^\//.match path
+
+     '%s://%s%s' % [scheme, host, path]
+   end
+
+   def fetch_uri(uri, redirect_count = 0)
+     logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
+
+     raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
+
+     case uri.scheme
+       when 'file'
+         # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
+         File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+       when /^http[s]?/
+         fetch_http uri, redirect_count
+       else
+         raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
+     end
+   end
+
+   def fetch_http(uri, redirect_count = 0)
+     fetch_attempts = 0
+     resource_not_found_attempts = 0
+
+     begin
+       # Note that Net::HTTP won't follow redirects for us; we handle those ourselves below
+       resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri
+
+       if resp.response.code == "200"
+         # Check for gzip, and decode:
+         data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
+
+         data
+       elsif resp.response['Location']
+         redirect_to = resp.response['Location']
+
+         fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
+       else
+         # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
+         raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+       end
+     rescue ResourceNotFoundError => err
+       logger.info err.message if logger
+
+       resource_not_found_attempts += 1
+
+       if resource_not_found_attempts <= self.retries_on_404_fail
+         sleep self.sleep_between_404_retries if self.sleep_between_404_retries
+         logger.info 'Retrying ....' if logger
+         retry
+       else
+         raise err
+       end
+     rescue FetchError, Timeout::Error, Errno::ECONNRESET, EOFError => err
+       logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
+       logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
+
+       fetch_attempts += 1
+
+       if fetch_attempts <= self.retries_on_fetch_fail
+         sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
+         logger.info 'Retrying fetch ....' if logger
+         retry
+       else
+         raise err
+       end
+     end
+   end
+
+   # Returns a string of the current URI's source code
+   def html_source
+     @html_source ||= fetch_uri uri if uri
+     @html_source
+   end
+
+   # Returns a Nokogiri parse of the current URI
+   def html
+     @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
+     @html
+   end
+ end
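
Because the retry and logging knobs are class-level cattr_accessors, they can be tuned once before any scraping begins and apply to every Scraper subclass. A minimal sketch; the values shown are arbitrary examples, not recommendations:

    require 'logger'
    require 'libcraigscrape'

    CraigScrape::Scraper.logger = Logger.new $stderr           # default: nil
    CraigScrape::Scraper.retries_on_fetch_fail = 3             # default: 8
    CraigScrape::Scraper.sleep_between_fetch_retries = 10      # default: 30
    CraigScrape::Scraper.retries_on_404_fail = 1               # default: 3
    CraigScrape::Scraper.sleep_between_404_retries = 1         # default: 3
    CraigScrape::Scraper.maximum_redirects_per_request = 5     # default: 20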