libcraigscrape 1.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/posting.rb
CHANGED
@@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
14
14
|
|
15
15
|
POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
|
16
16
|
LOCATION = /Location\:[ ]+(.+)/
|
17
|
-
HEADER_LOCATION =
|
18
|
-
POSTING_ID = /PostingID\:[ ]
|
17
|
+
HEADER_LOCATION = /\((.+)\)$/
|
18
|
+
POSTING_ID = /PostingID\:[ ]*([\d]+)/
|
19
19
|
REPLY_TO = /(.+)/
|
20
20
|
PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
|
21
|
+
|
22
|
+
# NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
|
23
|
+
# (As of 12/03's parse changes)
|
21
24
|
USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
|
22
25
|
HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
|
23
26
|
IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
|
24
27
|
|
28
|
+
# This is used to determine if there's a parse error
|
29
|
+
REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
|
30
|
+
|
31
|
+
XPATH_USERBODY = "//*[@id='userbody']"
|
32
|
+
XPATH_BLURBS = "//ul[@class='blurbs']"
|
33
|
+
XPATH_PICS = "//*[@class='tn']/a/@href"
|
34
|
+
XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
|
35
|
+
|
25
36
|
# This is really just for testing, in production use, uri.path is a better solution
|
26
37
|
attr_reader :href #:nodoc:
|
27
38
|
|
@@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
41
|
super(*args)
|
31
42
|
|
32
43
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
44
|
+
if args.first.kind_of? String and is_active_post?
|
45
|
+
unparsed_fields = REQUIRED_FIELDS.find_all{|f|
|
46
|
+
val = send(f)
|
47
|
+
val.nil? or (val.respond_to? :length and val.length == 0)
|
48
|
+
}
|
49
|
+
parse_error! unparsed_fields unless unparsed_fields.empty?
|
50
|
+
end
|
51
|
+
|
41
52
|
end
|
42
53
|
|
43
54
|
|
@@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
67
78
|
unless @full_section
|
68
79
|
@full_section = []
|
69
80
|
|
70
|
-
(html_head/"
|
81
|
+
(html_head / "*[@class='bchead']//a").each do |a|
|
71
82
|
@full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
|
72
83
|
end if html_head
|
73
84
|
end
|
@@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
78
89
|
# String, represents the post's reply-to address, if listed
|
79
90
|
def reply_to
|
80
91
|
unless @reply_to
|
81
|
-
|
82
|
-
|
83
|
-
|
92
|
+
if html.at_xpath(XPATH_REPLY_TO)
|
93
|
+
@reply_to = html.at_xpath(XPATH_REPLY_TO).content
|
94
|
+
else
|
95
|
+
cursor = html_head.at 'hr' if html_head
|
96
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
97
|
+
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
98
|
+
end
|
84
99
|
end
|
85
100
|
|
86
101
|
@reply_to
|
@@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
91
106
|
unless @post_time
|
92
107
|
cursor = html_head.at 'hr' if html_head
|
93
108
|
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
|
-
@post_time =
|
109
|
+
@post_time = DateTime.parse($1) if $1
|
95
110
|
end
|
96
111
|
|
97
112
|
@post_time
|
@@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
99
114
|
|
100
115
|
# Integer, Craigslist's unique posting id
|
101
116
|
def posting_id
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
117
|
+
if @posting_id
|
118
|
+
|
119
|
+
elsif USERBODY_PARTS.match html_source
|
120
|
+
# Old style:
|
121
|
+
html_footer = $4
|
122
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
|
123
|
+
cursor = cursor.next until cursor.nil? or
|
124
|
+
@posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
|
125
|
+
else
|
126
|
+
# Post 12/3
|
127
|
+
@posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
|
106
128
|
end
|
107
129
|
|
108
130
|
@posting_id
|
@@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
112
134
|
def contents
|
113
135
|
unless @contents
|
114
136
|
@contents = user_body if html_source
|
115
|
-
@contents = he_decode
|
137
|
+
@contents = he_decode(@contents).strip if @contents
|
116
138
|
end
|
117
139
|
|
118
140
|
@contents
|
@@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
120
142
|
|
121
143
|
# String, the location of the item, as best could be parsed
|
122
144
|
def location
|
123
|
-
if @location.nil? and
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
if
|
130
|
-
|
131
|
-
break
|
132
|
-
end
|
133
|
-
end if cursor
|
145
|
+
if @location.nil? and html
|
146
|
+
|
147
|
+
if html.at_xpath(XPATH_BLURBS)
|
148
|
+
# This is the post-12/3/12 style:
|
149
|
+
|
150
|
+
# Sometimes the Location is in the body :
|
151
|
+
@location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
|
152
|
+
LOCATION.match c.content}
|
134
153
|
|
135
|
-
|
136
|
-
|
137
|
-
cursor = craigslist_body.at '
|
138
|
-
|
154
|
+
elsif craigslist_body
|
155
|
+
# Location (when explicitly defined):
|
156
|
+
cursor = craigslist_body.at 'ul' unless @location
|
157
|
+
|
158
|
+
# This is the legacy style:
|
159
|
+
# Note: Apa section includes other things in the li's (cats/dogs ok fields)
|
160
|
+
cursor.children.each do |li|
|
161
|
+
if LOCATION.match li.inner_html
|
162
|
+
@location = he_decode($1) and break
|
163
|
+
break
|
164
|
+
end
|
165
|
+
end if cursor
|
166
|
+
|
167
|
+
# Real estate listings can work a little different for location:
|
168
|
+
unless @location
|
169
|
+
cursor = craigslist_body.at 'small'
|
170
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
171
|
+
|
172
|
+
@location = he_decode(cursor.to_s.strip) if cursor
|
173
|
+
end
|
139
174
|
|
140
|
-
@location = he_decode(cursor.to_s.strip) if cursor
|
141
175
|
end
|
142
176
|
|
143
|
-
# So, *sometimes* the location just ends up being in the header, I don't know why
|
177
|
+
# So, *sometimes* the location just ends up being in the header, I don't know why.
|
178
|
+
# This happens on old-style and new-style posts:
|
144
179
|
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
145
180
|
end
|
146
181
|
|
@@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
164
199
|
unless @pics
|
165
200
|
@pics = []
|
166
201
|
|
167
|
-
if html
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
202
|
+
if html
|
203
|
+
if html.at_xpath(XPATH_PICS)
|
204
|
+
@pics = html.xpath(XPATH_PICS).collect(&:value)
|
205
|
+
elsif craigslist_body
|
206
|
+
# This is the pre-12/3/12 style:
|
207
|
+
# Now let's find the craigslist hosted images:
|
208
|
+
img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
|
209
|
+
|
210
|
+
@pics = (img_table / 'img').collect{|i| i[:src]} if img_table
|
211
|
+
end
|
172
212
|
end
|
173
213
|
end
|
174
214
|
|
@@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
202
242
|
@posting_has_expired
|
203
243
|
end
|
204
244
|
|
205
|
-
|
206
245
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
207
246
|
# used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
208
247
|
def post_date
|
209
|
-
@post_date =
|
248
|
+
@post_date = post_time.to_date unless @post_date or post_time.nil?
|
210
249
|
|
211
250
|
@post_date
|
212
251
|
end
|
@@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
229
268
|
# Array, which image types are listed for the post.
|
230
269
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
231
270
|
def img_types
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
@img_types << :img if images.length > 0
|
236
|
-
@img_types << :pic if pics.length > 0
|
237
|
-
end
|
238
|
-
|
239
|
-
@img_types
|
271
|
+
@img_types || [ (images.length > 0) ? :img : nil,
|
272
|
+
(pics.length > 0) ? :pic : nil ].compact
|
240
273
|
end
|
241
274
|
|
242
275
|
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
@@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
270
303
|
# Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
|
271
304
|
# and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
272
305
|
def price
|
273
|
-
|
306
|
+
unless @price
|
307
|
+
(header and PRICE.match label) ?
|
308
|
+
@price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
|
309
|
+
end
|
310
|
+
@price
|
274
311
|
end
|
275
312
|
|
276
313
|
# Returns the post contents with all html tags removed
|
@@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
290
327
|
[contents,posting_id,post_time,title].all?{|f| f.nil?}
|
291
328
|
end
|
292
329
|
|
330
|
+
# This is mostly used to determine if the post should be checked for
|
331
|
+
# parse errors. Might be useful for someone else though
|
332
|
+
def is_active_post?
|
333
|
+
[flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
|
334
|
+
end
|
335
|
+
|
293
336
|
private
|
294
337
|
|
295
338
|
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
@@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
302
345
|
@html_head
|
303
346
|
end
|
304
347
|
|
305
|
-
# Since we started having so many problems with Hpricot flipping out on whack content bodies,
|
306
|
-
# I added this to return everything south of the user_body
|
307
|
-
def html_footer
|
308
|
-
$4 if USERBODY_PARTS.match html_source
|
309
|
-
end
|
310
|
-
|
311
348
|
# OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
|
312
|
-
# This bad html trips up
|
349
|
+
# This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
|
313
350
|
# We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
|
314
|
-
def user_body
|
315
|
-
|
351
|
+
def user_body
|
352
|
+
if USERBODY_PARTS.match html_source
|
353
|
+
# This is the pre-12/3/12 style:
|
354
|
+
$2
|
355
|
+
elsif html.at_xpath(XPATH_USERBODY)
|
356
|
+
# There's a bunch of junk in here that we don't want, so this loop removes
|
357
|
+
# everything after (and including) the last script tag, from the result
|
358
|
+
user_body = html.xpath(XPATH_USERBODY)
|
359
|
+
hit_delimeter = false
|
360
|
+
# Since some posts don't actually have the script tag:
|
361
|
+
delimeter = user_body.at_xpath('script') ? :script : :comment
|
362
|
+
user_body.first.children.to_a.reverse.reject{ |p|
|
363
|
+
if hit_delimeter
|
364
|
+
false
|
365
|
+
elsif ( (delimeter == :script and p.name == 'script') or
|
366
|
+
(delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
|
367
|
+
hit_delimeter = true
|
368
|
+
else
|
369
|
+
true
|
370
|
+
end
|
371
|
+
}.reverse.collect(&:to_s).join
|
372
|
+
end
|
316
373
|
end
|
317
374
|
|
318
375
|
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
@@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
321
378
|
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|
322
379
|
end
|
323
380
|
|
324
|
-
end
|
381
|
+
end
|
data/lib/scraper.rb
CHANGED
@@ -15,39 +15,27 @@
|
|
15
15
|
#
|
16
16
|
# <b>logger</b> - a Logger object to send http debug notices to. Defaults to nil
|
17
17
|
#
|
18
|
-
|
19
|
-
#
|
20
|
-
# <b>sleep_between_fetch_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 30.
|
21
|
-
#
|
22
|
-
# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http Response code 404). Defaults to 3.
|
23
|
-
#
|
24
|
-
# <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
|
25
|
-
#
|
18
|
+
|
26
19
|
class CraigScrape::Scraper
|
27
20
|
cattr_accessor :logger
|
28
|
-
cattr_accessor :sleep_between_fetch_retries
|
29
|
-
cattr_accessor :retries_on_fetch_fail
|
30
21
|
cattr_accessor :retries_on_404_fail
|
31
22
|
cattr_accessor :sleep_between_404_retries
|
32
|
-
|
23
|
+
|
24
|
+
self.retries_on_404_fail = 3
|
25
|
+
self.sleep_between_404_retries = 3
|
33
26
|
|
34
27
|
URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
|
35
28
|
HTML_TAG = /<\/?[^>]*>/
|
36
29
|
# We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
|
37
30
|
HTML_ENCODING = "UTF-8"
|
31
|
+
|
32
|
+
HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
|
33
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
34
|
+
"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
|
38
35
|
|
39
36
|
# Returns the full url that corresponds to this resource
|
40
37
|
attr_reader :url
|
41
38
|
|
42
|
-
# Set some defaults:
|
43
|
-
self.retries_on_fetch_fail = 8
|
44
|
-
self.sleep_between_fetch_retries = 30
|
45
|
-
|
46
|
-
self.retries_on_404_fail = 3
|
47
|
-
self.sleep_between_404_retries = 3
|
48
|
-
|
49
|
-
self.maximum_redirects_per_request = 20
|
50
|
-
|
51
39
|
class BadConstructionError < StandardError #:nodoc:
|
52
40
|
end
|
53
41
|
|
@@ -57,9 +45,6 @@ class CraigScrape::Scraper
|
|
57
45
|
class BadUrlError < StandardError #:nodoc:
|
58
46
|
end
|
59
47
|
|
60
|
-
class MaxRedirectError < StandardError #:nodoc:
|
61
|
-
end
|
62
|
-
|
63
48
|
class FetchError < StandardError #:nodoc:
|
64
49
|
end
|
65
50
|
|
@@ -100,21 +85,37 @@ class CraigScrape::Scraper
|
|
100
85
|
@uri
|
101
86
|
end
|
102
87
|
|
88
|
+
# This method is mostly useful for our specs, but it's included in case anyone
|
89
|
+
# else wants it. It returns all currently-defined instance variables, and is
|
90
|
+
# mostly useful for the specs. Probably this doesn't do what you think, and
|
91
|
+
# should only be used to determine what's been parsed by the object thus-far.
|
92
|
+
# (And does not include parseable attributes which have yet to be determined.)
|
93
|
+
def attributes
|
94
|
+
Hash[self.instance_variables.collect{|i|
|
95
|
+
[i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
|
96
|
+
end
|
97
|
+
|
103
98
|
private
|
104
99
|
|
105
100
|
# Returns text with all html tags removed.
|
106
101
|
def strip_html(str)
|
107
|
-
str.gsub HTML_TAG, "" if str
|
102
|
+
he_decode(str).gsub HTML_TAG, "" if str
|
108
103
|
end
|
109
104
|
|
110
105
|
# Easy way to fail noisily:
|
111
|
-
def parse_error
|
106
|
+
def parse_error!(fields = nil)
|
107
|
+
raise ParseError, "Error while parsing %s:\n %s%s" % [
|
108
|
+
self.class.to_s, html,
|
109
|
+
(fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
|
110
|
+
end
|
112
111
|
|
113
112
|
# Returns text with all html entities converted to respective ascii character.
|
114
113
|
def he_decode(text); self.class.he_decode text; end
|
115
114
|
|
116
115
|
# Returns text with all html entities converted to respective ascii character.
|
117
|
-
def self.he_decode(text)
|
116
|
+
def self.he_decode(text)
|
117
|
+
HTMLEntities.new.decode text
|
118
|
+
end
|
118
119
|
|
119
120
|
# Derives a full url, using the current object's url and the provided href
|
120
121
|
def url_from_href(href) #:nodoc:
|
@@ -133,42 +134,34 @@ class CraigScrape::Scraper
|
|
133
134
|
'%s://%s%s' % [scheme, host, path]
|
134
135
|
end
|
135
136
|
|
136
|
-
def fetch_uri(uri
|
137
|
-
logger.info "Requesting
|
137
|
+
def fetch_uri(uri)
|
138
|
+
logger.info "Requesting: %s" % [@url.inspect] if logger
|
138
139
|
|
139
|
-
|
140
|
-
|
141
|
-
case uri.scheme
|
140
|
+
(case uri.scheme
|
142
141
|
when 'file'
|
143
142
|
# If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
|
144
|
-
File.read( File.directory?(uri.path) ?
|
143
|
+
File.read( File.directory?(uri.path) ?
|
144
|
+
"#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
|
145
145
|
when /^http[s]?/
|
146
|
-
fetch_http uri
|
146
|
+
fetch_http uri
|
147
147
|
else
|
148
148
|
raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
|
149
|
-
end
|
149
|
+
end).force_encoding("ISO-8859-1").encode("UTF-8")
|
150
150
|
end
|
151
|
-
|
152
|
-
def fetch_http(uri
|
151
|
+
|
152
|
+
def fetch_http(uri)
|
153
153
|
fetch_attempts = 0
|
154
154
|
resource_not_found_attempts = 0
|
155
155
|
|
156
156
|
begin
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
if resp.
|
161
|
-
|
162
|
-
data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
|
163
|
-
|
164
|
-
data
|
165
|
-
elsif resp.response['Location']
|
166
|
-
redirect_to = resp.response['Location']
|
167
|
-
|
168
|
-
fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
|
157
|
+
resp = Typhoeus.get uri.to_s, :followlocation => true,
|
158
|
+
:headers => HTTP_HEADERS
|
159
|
+
|
160
|
+
if resp.response_code == 200
|
161
|
+
resp.response_body
|
169
162
|
else
|
170
163
|
# Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
|
171
|
-
raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.
|
164
|
+
raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
|
172
165
|
end
|
173
166
|
rescue ResourceNotFoundError => err
|
174
167
|
logger.info err.message if logger
|
@@ -182,19 +175,6 @@ class CraigScrape::Scraper
|
|
182
175
|
else
|
183
176
|
raise err
|
184
177
|
end
|
185
|
-
rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
|
186
|
-
logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
|
187
|
-
logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
|
188
|
-
|
189
|
-
fetch_attempts += 1
|
190
|
-
|
191
|
-
if fetch_attempts <= self.retries_on_fetch_fail
|
192
|
-
sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
|
193
|
-
logger.info 'Retrying fetch ....' if logger
|
194
|
-
retry
|
195
|
-
else
|
196
|
-
raise err
|
197
|
-
end
|
198
178
|
end
|
199
179
|
end
|
200
180
|
|
@@ -209,4 +189,4 @@ class CraigScrape::Scraper
|
|
209
189
|
@html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
|
210
190
|
@html
|
211
191
|
end
|
212
|
-
end
|
192
|
+
end
|