libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/posting.rb
CHANGED
@@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
14
14
|
|
15
15
|
POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
|
16
16
|
LOCATION = /Location\:[ ]+(.+)/
|
17
|
-
HEADER_LOCATION =
|
18
|
-
POSTING_ID = /PostingID\:[ ]
|
17
|
+
HEADER_LOCATION = /\((.+)\)$/
|
18
|
+
POSTING_ID = /PostingID\:[ ]*([\d]+)/
|
19
19
|
REPLY_TO = /(.+)/
|
20
20
|
PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
|
21
|
+
|
22
|
+
# NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
|
23
|
+
# (As of 12/03's parse changes)
|
21
24
|
USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
|
22
25
|
HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
|
23
26
|
IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
|
24
27
|
|
28
|
+
# This is used to determine if there's a parse error
|
29
|
+
REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
|
30
|
+
|
31
|
+
XPATH_USERBODY = "//*[@id='userbody']"
|
32
|
+
XPATH_BLURBS = "//ul[@class='blurbs']"
|
33
|
+
XPATH_PICS = "//*[@class='tn']/a/@href"
|
34
|
+
XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
|
35
|
+
|
25
36
|
# This is really just for testing, in production use, uri.path is a better solution
|
26
37
|
attr_reader :href #:nodoc:
|
27
38
|
|
@@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
41
|
super(*args)
|
31
42
|
|
32
43
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
44
|
+
if args.first.kind_of? String and is_active_post?
|
45
|
+
unparsed_fields = REQUIRED_FIELDS.find_all{|f|
|
46
|
+
val = send(f)
|
47
|
+
val.nil? or (val.respond_to? :length and val.length == 0)
|
48
|
+
}
|
49
|
+
parse_error! unparsed_fields unless unparsed_fields.empty?
|
50
|
+
end
|
51
|
+
|
41
52
|
end
|
42
53
|
|
43
54
|
|
@@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
67
78
|
unless @full_section
|
68
79
|
@full_section = []
|
69
80
|
|
70
|
-
(html_head/"
|
81
|
+
(html_head / "*[@class='bchead']//a").each do |a|
|
71
82
|
@full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
|
72
83
|
end if html_head
|
73
84
|
end
|
@@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
78
89
|
# String, represents the post's reply-to address, if listed
|
79
90
|
def reply_to
|
80
91
|
unless @reply_to
|
81
|
-
|
82
|
-
|
83
|
-
|
92
|
+
if html.at_xpath(XPATH_REPLY_TO)
|
93
|
+
@reply_to = html.at_xpath(XPATH_REPLY_TO).content
|
94
|
+
else
|
95
|
+
cursor = html_head.at 'hr' if html_head
|
96
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
97
|
+
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
98
|
+
end
|
84
99
|
end
|
85
100
|
|
86
101
|
@reply_to
|
@@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
91
106
|
unless @post_time
|
92
107
|
cursor = html_head.at 'hr' if html_head
|
93
108
|
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
|
-
@post_time =
|
109
|
+
@post_time = DateTime.parse($1) if $1
|
95
110
|
end
|
96
111
|
|
97
112
|
@post_time
|
@@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
99
114
|
|
100
115
|
# Integer, Craigslist's unique posting id
|
101
116
|
def posting_id
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
117
|
+
if @posting_id
|
118
|
+
|
119
|
+
elsif USERBODY_PARTS.match html_source
|
120
|
+
# Old style:
|
121
|
+
html_footer = $4
|
122
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
|
123
|
+
cursor = cursor.next until cursor.nil? or
|
124
|
+
@posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
|
125
|
+
else
|
126
|
+
# Post 12/3
|
127
|
+
@posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
|
106
128
|
end
|
107
129
|
|
108
130
|
@posting_id
|
@@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
112
134
|
def contents
|
113
135
|
unless @contents
|
114
136
|
@contents = user_body if html_source
|
115
|
-
@contents = he_decode
|
137
|
+
@contents = he_decode(@contents).strip if @contents
|
116
138
|
end
|
117
139
|
|
118
140
|
@contents
|
@@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
120
142
|
|
121
143
|
# String, the location of the item, as best could be parsed
|
122
144
|
def location
|
123
|
-
if @location.nil? and
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
if
|
130
|
-
|
131
|
-
break
|
132
|
-
end
|
133
|
-
end if cursor
|
145
|
+
if @location.nil? and html
|
146
|
+
|
147
|
+
if html.at_xpath(XPATH_BLURBS)
|
148
|
+
# This is the post-12/3/12 style:
|
149
|
+
|
150
|
+
# Sometimes the Location is in the body :
|
151
|
+
@location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
|
152
|
+
LOCATION.match c.content}
|
134
153
|
|
135
|
-
|
136
|
-
|
137
|
-
cursor = craigslist_body.at '
|
138
|
-
|
154
|
+
elsif craigslist_body
|
155
|
+
# Location (when explicitly defined):
|
156
|
+
cursor = craigslist_body.at 'ul' unless @location
|
157
|
+
|
158
|
+
# This is the legacy style:
|
159
|
+
# Note: Apa section includes other things in the li's (cats/dogs ok fields)
|
160
|
+
cursor.children.each do |li|
|
161
|
+
if LOCATION.match li.inner_html
|
162
|
+
@location = he_decode($1) and break
|
163
|
+
break
|
164
|
+
end
|
165
|
+
end if cursor
|
166
|
+
|
167
|
+
# Real estate listings can work a little different for location:
|
168
|
+
unless @location
|
169
|
+
cursor = craigslist_body.at 'small'
|
170
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
171
|
+
|
172
|
+
@location = he_decode(cursor.to_s.strip) if cursor
|
173
|
+
end
|
139
174
|
|
140
|
-
@location = he_decode(cursor.to_s.strip) if cursor
|
141
175
|
end
|
142
176
|
|
143
|
-
# So, *sometimes* the location just ends up being in the header, I don't know why
|
177
|
+
# So, *sometimes* the location just ends up being in the header, I don't know why.
|
178
|
+
# This happens on old-style and new-style posts:
|
144
179
|
@location = $1 if @location.nil? and HEADER_LOCATION.match header
|
145
180
|
end
|
146
181
|
|
@@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
164
199
|
unless @pics
|
165
200
|
@pics = []
|
166
201
|
|
167
|
-
if html
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
202
|
+
if html
|
203
|
+
if html.at_xpath(XPATH_PICS)
|
204
|
+
@pics = html.xpath(XPATH_PICS).collect(&:value)
|
205
|
+
elsif craigslist_body
|
206
|
+
# This is the pre-12/3/12 style:
|
207
|
+
# Now let's find the craigslist hosted images:
|
208
|
+
img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
|
209
|
+
|
210
|
+
@pics = (img_table / 'img').collect{|i| i[:src]} if img_table
|
211
|
+
end
|
172
212
|
end
|
173
213
|
end
|
174
214
|
|
@@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
202
242
|
@posting_has_expired
|
203
243
|
end
|
204
244
|
|
205
|
-
|
206
245
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
207
246
|
# used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
208
247
|
def post_date
|
209
|
-
@post_date =
|
248
|
+
@post_date = post_time.to_date unless @post_date or post_time.nil?
|
210
249
|
|
211
250
|
@post_date
|
212
251
|
end
|
@@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
229
268
|
# Array, which image types are listed for the post.
|
230
269
|
# This is always able to be pulled from the listing post-summary, and should never cause an additional page load
|
231
270
|
def img_types
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
@img_types << :img if images.length > 0
|
236
|
-
@img_types << :pic if pics.length > 0
|
237
|
-
end
|
238
|
-
|
239
|
-
@img_types
|
271
|
+
@img_types || [ (images.length > 0) ? :img : nil,
|
272
|
+
(pics.length > 0) ? :pic : nil ].compact
|
240
273
|
end
|
241
274
|
|
242
275
|
# Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
|
@@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
270
303
|
# Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
|
271
304
|
# and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
272
305
|
def price
|
273
|
-
|
306
|
+
unless @price
|
307
|
+
(header and PRICE.match label) ?
|
308
|
+
@price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
|
309
|
+
end
|
310
|
+
@price
|
274
311
|
end
|
275
312
|
|
276
313
|
# Returns the post contents with all html tags removed
|
@@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
290
327
|
[contents,posting_id,post_time,title].all?{|f| f.nil?}
|
291
328
|
end
|
292
329
|
|
330
|
+
# This is mostly used to determine if the post should be checked for
|
331
|
+
# parse errors. Might be useful for someone else though
|
332
|
+
def is_active_post?
|
333
|
+
[flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
|
334
|
+
end
|
335
|
+
|
293
336
|
private
|
294
337
|
|
295
338
|
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
@@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
302
345
|
@html_head
|
303
346
|
end
|
304
347
|
|
305
|
-
# Since we started having so many problems with Hpricot flipping out on whack content bodies,
|
306
|
-
# I added this to return everything south of the user_body
|
307
|
-
def html_footer
|
308
|
-
$4 if USERBODY_PARTS.match html_source
|
309
|
-
end
|
310
|
-
|
311
348
|
# OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
|
312
|
-
# This bad html trips up
|
349
|
+
# This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
|
313
350
|
# We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
|
314
|
-
def user_body
|
315
|
-
|
351
|
+
def user_body
|
352
|
+
if USERBODY_PARTS.match html_source
|
353
|
+
# This is the pre-12/3/12 style:
|
354
|
+
$2
|
355
|
+
elsif html.at_xpath(XPATH_USERBODY)
|
356
|
+
# There's a bunch of junk in here that we don't want, so this loop removes
|
357
|
+
# everything after (and including) the last script tag, from the result
|
358
|
+
user_body = html.xpath(XPATH_USERBODY)
|
359
|
+
hit_delimeter = false
|
360
|
+
# Since some posts don't actually have the script tag:
|
361
|
+
delimeter = user_body.at_xpath('script') ? :script : :comment
|
362
|
+
user_body.first.children.to_a.reverse.reject{ |p|
|
363
|
+
if hit_delimeter
|
364
|
+
false
|
365
|
+
elsif ( (delimeter == :script and p.name == 'script') or
|
366
|
+
(delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
|
367
|
+
hit_delimeter = true
|
368
|
+
else
|
369
|
+
true
|
370
|
+
end
|
371
|
+
}.reverse.collect(&:to_s).join
|
372
|
+
end
|
316
373
|
end
|
317
374
|
|
318
375
|
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
@@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
321
378
|
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|
322
379
|
end
|
323
380
|
|
324
|
-
end
|
381
|
+
end
|
data/lib/scraper.rb
CHANGED
@@ -15,39 +15,27 @@
|
|
15
15
|
#
|
16
16
|
# <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
|
17
17
|
#
|
18
|
-
|
19
|
-
#
|
20
|
-
# <b>sleep_between_fetch_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 30.
|
21
|
-
#
|
22
|
-
# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http Response code 404). Defaults to 3.
|
23
|
-
#
|
24
|
-
# <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
|
25
|
-
#
|
18
|
+
|
26
19
|
class CraigScrape::Scraper
|
27
20
|
cattr_accessor :logger
|
28
|
-
cattr_accessor :sleep_between_fetch_retries
|
29
|
-
cattr_accessor :retries_on_fetch_fail
|
30
21
|
cattr_accessor :retries_on_404_fail
|
31
22
|
cattr_accessor :sleep_between_404_retries
|
32
|
-
|
23
|
+
|
24
|
+
self.retries_on_404_fail = 3
|
25
|
+
self.sleep_between_404_retries = 3
|
33
26
|
|
34
27
|
URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
|
35
28
|
HTML_TAG = /<\/?[^>]*>/
|
36
29
|
# We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
|
37
30
|
HTML_ENCODING = "UTF-8"
|
31
|
+
|
32
|
+
HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
|
33
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
34
|
+
"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
|
38
35
|
|
39
36
|
# Returns the full url that corresponds to this resource
|
40
37
|
attr_reader :url
|
41
38
|
|
42
|
-
# Set some defaults:
|
43
|
-
self.retries_on_fetch_fail = 8
|
44
|
-
self.sleep_between_fetch_retries = 30
|
45
|
-
|
46
|
-
self.retries_on_404_fail = 3
|
47
|
-
self.sleep_between_404_retries = 3
|
48
|
-
|
49
|
-
self.maximum_redirects_per_request = 20
|
50
|
-
|
51
39
|
class BadConstructionError < StandardError #:nodoc:
|
52
40
|
end
|
53
41
|
|
@@ -57,9 +45,6 @@ class CraigScrape::Scraper
|
|
57
45
|
class BadUrlError < StandardError #:nodoc:
|
58
46
|
end
|
59
47
|
|
60
|
-
class MaxRedirectError < StandardError #:nodoc:
|
61
|
-
end
|
62
|
-
|
63
48
|
class FetchError < StandardError #:nodoc:
|
64
49
|
end
|
65
50
|
|
@@ -100,21 +85,37 @@ class CraigScrape::Scraper
|
|
100
85
|
@uri
|
101
86
|
end
|
102
87
|
|
88
|
+
# This method is mostly useful for our specs, but it's included in case anyone
|
89
|
+
# else wants it. It returns all currently-defined instance variables, and is
|
90
|
+
# mostly useful for the specs. Probably this doesn't do what you think, and
|
91
|
+
# should only be used to determine what's been parsed by the object thus-far.
|
92
|
+
# (And does not include parseable attributes which have yet to be determined.)
|
93
|
+
def attributes
|
94
|
+
Hash[self.instance_variables.collect{|i|
|
95
|
+
[i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
|
96
|
+
end
|
97
|
+
|
103
98
|
private
|
104
99
|
|
105
100
|
# Returns text with all html tags removed.
|
106
101
|
def strip_html(str)
|
107
|
-
str.gsub HTML_TAG, "" if str
|
102
|
+
he_decode(str).gsub HTML_TAG, "" if str
|
108
103
|
end
|
109
104
|
|
110
105
|
# Easy way to fail noisily:
|
111
|
-
def parse_error
|
106
|
+
def parse_error!(fields = nil)
|
107
|
+
raise ParseError, "Error while parsing %s:\n %s%s" % [
|
108
|
+
self.class.to_s, html,
|
109
|
+
(fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
|
110
|
+
end
|
112
111
|
|
113
112
|
# Returns text with all html entities converted to respective ascii character.
|
114
113
|
def he_decode(text); self.class.he_decode text; end
|
115
114
|
|
116
115
|
# Returns text with all html entities converted to respective ascii character.
|
117
|
-
def self.he_decode(text)
|
116
|
+
def self.he_decode(text)
|
117
|
+
HTMLEntities.new.decode text
|
118
|
+
end
|
118
119
|
|
119
120
|
# Derives a full url, using the current object's url and the provided href
|
120
121
|
def url_from_href(href) #:nodoc:
|
@@ -133,42 +134,34 @@ class CraigScrape::Scraper
|
|
133
134
|
'%s://%s%s' % [scheme, host, path]
|
134
135
|
end
|
135
136
|
|
136
|
-
def fetch_uri(uri
|
137
|
-
logger.info "Requesting
|
137
|
+
def fetch_uri(uri)
|
138
|
+
logger.info "Requesting: %s" % [@url.inspect] if logger
|
138
139
|
|
139
|
-
|
140
|
-
|
141
|
-
case uri.scheme
|
140
|
+
(case uri.scheme
|
142
141
|
when 'file'
|
143
142
|
# If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
|
144
|
-
File.read( File.directory?(uri.path) ?
|
143
|
+
File.read( File.directory?(uri.path) ?
|
144
|
+
"#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
|
145
145
|
when /^http[s]?/
|
146
|
-
fetch_http uri
|
146
|
+
fetch_http uri
|
147
147
|
else
|
148
148
|
raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
|
149
|
-
end
|
149
|
+
end).force_encoding("ISO-8859-1").encode("UTF-8")
|
150
150
|
end
|
151
|
-
|
152
|
-
def fetch_http(uri
|
151
|
+
|
152
|
+
def fetch_http(uri)
|
153
153
|
fetch_attempts = 0
|
154
154
|
resource_not_found_attempts = 0
|
155
155
|
|
156
156
|
begin
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
if resp.
|
161
|
-
|
162
|
-
data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
|
163
|
-
|
164
|
-
data
|
165
|
-
elsif resp.response['Location']
|
166
|
-
redirect_to = resp.response['Location']
|
167
|
-
|
168
|
-
fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
|
157
|
+
resp = Typhoeus.get uri.to_s, :followlocation => true,
|
158
|
+
:headers => HTTP_HEADERS
|
159
|
+
|
160
|
+
if resp.response_code == 200
|
161
|
+
resp.response_body
|
169
162
|
else
|
170
163
|
# Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
|
171
|
-
raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.
|
164
|
+
raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
|
172
165
|
end
|
173
166
|
rescue ResourceNotFoundError => err
|
174
167
|
logger.info err.message if logger
|
@@ -182,19 +175,6 @@ class CraigScrape::Scraper
|
|
182
175
|
else
|
183
176
|
raise err
|
184
177
|
end
|
185
|
-
rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
|
186
|
-
logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
|
187
|
-
logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
|
188
|
-
|
189
|
-
fetch_attempts += 1
|
190
|
-
|
191
|
-
if fetch_attempts <= self.retries_on_fetch_fail
|
192
|
-
sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
|
193
|
-
logger.info 'Retrying fetch ....' if logger
|
194
|
-
retry
|
195
|
-
else
|
196
|
-
raise err
|
197
|
-
end
|
198
178
|
end
|
199
179
|
end
|
200
180
|
|
@@ -209,4 +189,4 @@ class CraigScrape::Scraper
|
|
209
189
|
@html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
|
210
190
|
@html
|
211
191
|
end
|
212
|
-
end
|
192
|
+
end
|