libcraigscrape 0.6.5 → 0.7.0
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- data/CHANGELOG +17 -0
- data/Rakefile +1 -1
- data/bin/craigwatch +10 -10
- data/bin/report_mailer/craigslist_report.html.erb +2 -2
- data/bin/report_mailer/craigslist_report.plain.erb +2 -2
- data/lib/libcraigscrape.rb +585 -342
- data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
- data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
- data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
- data/test/geolisting_samples/geo_listing_us070209.html +355 -0
- data/test/libcraigscrape_test_helpers.rb +31 -0
- data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
- data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
- data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
- data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
- data/test/post_samples/brw_reb_1224008903.html +101 -0
- data/test/post_samples/sfbay_art_1223614914.html +94 -0
- data/test/test_craigslist_geolisting.rb +425 -0
- data/test/test_craigslist_listing.rb +179 -260
- data/test/test_craigslist_posting.rb +306 -0
- metadata +29 -2
data/lib/libcraigscrape.rb
CHANGED
@@ -10,17 +10,64 @@ require 'hpricot'
  10   10   require 'htmlentities'
  11   11   require 'activesupport'
  12   12
  13      - # A base class encapsulating the libcraigscrape
       13 + # A base class encapsulating the libcraigscrape objects, and providing some utility methods.
  14   14   class CraigScrape
  15      -   cattr_accessor :logger
  16   15     cattr_accessor :time_now
  17      -   cattr_accessor :retries_on_fetch_fail
  18      -   cattr_accessor :sleep_between_fetch_retries
  19   16
  20      -   #
  21      -
  22      -
       17 +   # Scrapes a single listing url and returns a Listings object representing the contents.
       18 +   # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
       19 +   # Consider this method 'marked for deprecation'
       20 +   def self.scrape_listing(listing_url)
       21 +     CraigScrape::Listings.new listing_url
       22 +   end
       23 +
       24 +   # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
       25 +   # until there's no more 'next page' links available to click on
       26 +   def self.scrape_until(listing_url, &post_condition)
       27 +     ret = []
       28 +
       29 +     current_url = listing_url
       30 +     catch "ScrapeBreak" do
       31 +       while current_url do
       32 +         listings = CraigScrape::Listings.new current_url
       33 +
       34 +         listings.posts.each do |post|
       35 +           throw "ScrapeBreak" if post_condition.call(post)
       36 +           ret << post
       37 +         end
       38 +
       39 +         current_url = listings.next_page_url
       40 +       end
       41 +     end
       42 +
       43 +     ret
       44 +   end
       45 +
       46 +   # Scrapes a single Post Url, and returns a Posting object representing its contents.
       47 +   # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
       48 +   # Consider this method 'marked for deprecation'
       49 +   def self.scrape_full_post(post_url)
       50 +     CraigScrape::Posting.new post_url
       51 +   end
       52 +
       53 +   # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
       54 +   # or no more 'next page' links are avialable to be clicked on. Returns an array of PostSummary objects.
       55 +   def self.scrape_posts(listing_url, count)
       56 +     count_so_far = 0
       57 +     self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
       58 +   end
  23   59
       60 +   # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
       61 +   # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
       62 +   # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
       63 +   # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
       64 +   #
       65 +   # <b>Note:<b> The results will not include post summaries having the newer_then date themselves.
       66 +   def self.scrape_posts_since(listing_url, newer_then)
       67 +     self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
       68 +   end
       69 +
       70 +   # Returns the most recentlt expired time for the provided month and day
  24   71     def self.most_recently_expired_time(month, day) #:nodoc:
  25   72       now = (time_now) ? time_now : Time.now
  26   73
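The class-level methods added above are thin wrappers around the new Listings and Posting objects introduced later in this diff. A minimal usage sketch against the 0.7.0 api (the listing url and cutoff date are illustrative, not taken from the diff):

    require 'libcraigscrape'

    listing_url = 'http://miami.craigslist.org/sss/'  # any category or search listing url

    # Collect the first 20 post summaries, following 'next page' links as needed:
    posts = CraigScrape.scrape_posts listing_url, 20

    # Or collect summaries newer than a cutoff; per the note above, summaries dated
    # exactly newer_then are excluded from the results:
    recent = CraigScrape.scrape_posts_since listing_url, Time.local(2009, 6, 1)

    recent.each{|post| puts post.label }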
@@ -30,425 +77,621 @@ class CraigScrape
  30   77
  31   78       ret
  32   79     end
       80 +
       81 +   # Scraper is a general-pupose base class for all libcraigscrape Objects. Scraper facilitates all http-related
       82 +   # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
       83 +   # methods. It also contains the http-related cattr_accessors:
       84 +   #
       85 +   # *logger* - a Logger object to debug http notices too. Defaults to nil
       86 +   #
       87 +   # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
       88 +   #
       89 +   # *sleep_between_fetch_retries* - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 15.
       90 +   class Scraper
       91 +     cattr_accessor :logger
       92 +     cattr_accessor :sleep_between_fetch_retries
       93 +     cattr_accessor :retries_on_fetch_fail
       94 +
       95 +     URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
       96 +     HTML_TAG = /<\/?[^>]*>/
       97 +
       98 +     # Returns the full url that corresponds to this resource
       99 +     attr_reader :url
      100 +
      101 +     # Set some defaults:
      102 +     self.retries_on_fetch_fail = 4
      103 +     self.sleep_between_fetch_retries = 15
  33  104
  34      -
      105 +     class BadConstructionError < StandardError #:nodoc:
      106 +     end
      107 +
      108 +     class ParseError < StandardError #:nodoc:
      109 +     end
      110 +
      111 +     class BadUrlError < StandardError #:nodoc:
      112 +     end
      113 +
      114 +     class FetchError < StandardError #:nodoc:
      115 +     end
      116 +
      117 +     # Scraper Objects can be created from either a full URL (string), or a Hash.
      118 +     # Currently, this initializer isn't intended to be called from libcraigslist API users, though
      119 +     # if you know what you're doing - feel free to try this out.
      120 +     #
      121 +     # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
      122 +     #
      123 +     # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
      124 +     # This is useful to create an object without actually making an html request, this is used to set-up an
      125 +     # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
      126 +     # you're going to be setting this object up for eager-loadnig, be sure to pass in a :url key in your hash,
      127 +     # Otherwise this will fail to eager load.
      128 +     def initialize(init_via = nil)
      129 +       if init_via.nil?
      130 +         # Do nothing - possibly not a great idea, but we'll allow it
      131 +       elsif init_via.kind_of? String
      132 +         @url = init_via
      133 +       elsif init_via.kind_of? Hash
      134 +         init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
      135 +       else
      136 +         raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
      137 +       end
      138 +     end
      139 +
      140 +     # Indicates whether the resource has yet been retrieved from its associated url.
      141 +     # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
      142 +     # but hasn't yet been fetched.
      143 +     def downloaded?; !@html.nil?; end
      144 +
      145 +     # A URI object corresponding to this Scraped URL
      146 +     def uri
      147 +       @uri ||= URI.parse @url if @url
      148 +       @uri
      149 +     end
      150 +
  35  151       private
  36      -
  37      -
      152 +
      153 +     # Returns text with all html tags removed.
      154 +     def strip_html(str)
      155 +       str.gsub HTML_TAG, "" if str
  38  156       end
  39      -
      157 +
      158 +     # Easy way to fail noisily:
      159 +     def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
      160 +
      161 +     # Returns text with all html entities converted to respective ascii character.
      162 +     def he_decode(text); self.class.he_decode text; end
  40  163
  41      -
  42      -
      164 +     # Returns text with all html entities converted to respective ascii character.
      165 +     def self.he_decode(text); HTMLEntities.new.decode text; end
      166 +
      167 +     # Derives a full url, using the current object's url and the provided href
      168 +     def url_from_href(href) #:nodoc:
      169 +       scheme, host, path = $1, $2, $3 if URL_PARTS.match href
      170 +
      171 +       scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
      172 +
      173 +       host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
      174 +
      175 +       path = (
      176 +         (/\/$/.match(uri.path)) ?
      177 +           '%s%s' % [uri.path,path] :
      178 +           '%s/%s' % [File.dirname(uri.path),path]
      179 +       ) unless /^\//.match path
      180 +
      181 +       '%s://%s%s' % [scheme, host, path]
      182 +     end
      183 +
      184 +     def fetch_uri(uri)
  43  185
  44      -
  45      -
      186 +       logger.info "Requesting: %s" % @url if logger
      187 +
      188 +       case uri.scheme
      189 +         when 'file'
      190 +           File.read uri.path
      191 +         when /^http[s]?/
      192 +           fetch_attempts = 0
      193 +
      194 +           begin
      195 +             # This handles the redirects for us
      196 +             resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
      197 +
      198 +             if resp.response.code == "200"
      199 +               # Check for gzip, and decode:
      200 +               data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
      201 +
      202 +               data
      203 +             elsif resp.response['Location']
      204 +               redirect_to = resp.response['Location']
      205 +
      206 +               fetch_uri URI.parse(url_from_href(redirect_to))
      207 +             else
      208 +               # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
      209 +               error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
      210 +
      211 +               logger.info error_description if logger
      212 +
      213 +               raise FetchError, error_description
      214 +             end
      215 +           rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
      216 +             logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
      217 +             logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
      218 +
      219 +             fetch_attempts += 1
  46  220
  47      -
      221 +             if fetch_attempts <= self.retries_on_fetch_fail
      222 +               sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
      223 +               logger.info 'Retrying fetch ....' if logger
      224 +               retry
      225 +             else
      226 +               raise err
      227 +             end
      228 +           end
      229 +         else
      230 +           raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
      231 +       end
      232 +     end
      233 +
      234 +     def html
      235 +       @html ||= Hpricot.parse fetch_uri(uri) if uri
      236 +       @html
      237 +     end
  48  238     end
  49  239
  50      -   #
      240 +   # Posting represents a fully downloaded, and parsed, Craigslist post.
  51  241     # This class is generally returned by the listing scrape methods, and
  52  242     # contains the post summaries for a specific search url, or a general listing category
  53      -   class
  54      -
  55      -
  56      -
  57      -
  58      -
  59      -
  60      -
  61      -
      243 +   class Posting < Scraper
      244 +
      245 +     POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
      246 +     LOCATION = /Location\:[ ]+(.+)/
      247 +     HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
      248 +     POSTING_ID = /PostingID\:[ ]+([\d]+)/
      249 +     REPLY_TO = /(.+)/
      250 +     PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
      251 +     USERBODY_PARTS = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
      252 +     IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
      253 +
      254 +     # This is really just for testing, in production use, uri.path is a better solution
      255 +     attr_reader :href #:nodoc:
      256 +
      257 +     # Create a new Post via a url (String), or supplied parameters (Hash)
      258 +     def initialize(*args)
      259 +       super(*args)
      260 +
      261 +       # Validate that required fields are present, at least - if we've downloaded it from a url
      262 +       parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
      263 +         contents,posting_id,post_time,header,title,full_section
      264 +       ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
      265 +     end
      266 +
      267 +
  62  268       # String, The contents of the item's html body heading
  63      -
      269 +     def header
      270 +       unless @header
      271 +         h2 = html.at 'h2' if html
      272 +         @header = he_decode h2.inner_html if h2
      273 +       end
      274 +
      275 +       @header
      276 +     end
  64  277
  65  278       # String, the item's title
  66      -
  67      -
  68      -
  69      -
  70      -
  71      -
  72      -     attr_reader :contents
  73      -
  74      -     # String, the location of the item, as best could be parsed
  75      -     attr_reader :location
      279 +     def title
      280 +       unless @title
      281 +         title_tag = html.at 'title' if html
      282 +         @title = he_decode title_tag.inner_html if title_tag
      283 +         @title = nil if @title and @title.length == 0
      284 +       end
  76  285
      286 +       @title
      287 +     end
      288 +
  77  289       # Array, hierarchial representation of the posts section
  78      -
      290 +     def full_section
      291 +       unless @full_section
      292 +         @full_section = []
      293 +
      294 +         (html/"div[@class='bchead']//a").each do |a|
      295 +           @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
      296 +         end if html
      297 +       end
      298 +
      299 +       @full_section
      300 +     end
      301 +
      302 +     # String, represents the post's reply-to address, if listed
      303 +     def reply_to
      304 +       unless @reply_to
      305 +         cursor = html.at 'hr' if html
      306 +         cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
      307 +         @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
      308 +       end
      309 +
      310 +       @reply_to
      311 +     end
  79  312
  80      -     #
  81      -
      313 +     # Time, reflects the full timestamp of the posting
      314 +     def post_time
      315 +       unless @post_time
      316 +         cursor = html.at 'hr' if html
      317 +         cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
      318 +         @post_time = Time.parse $1 if $1
      319 +       end
      320 +
      321 +       @post_time
      322 +     end
      323 +
      324 +     # Integer, Craigslist's unique posting id
      325 +     def posting_id
      326 +       unless @posting_id
      327 +         cursor = (html/"#userbody").first if html
      328 +         cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
      329 +         @posting_id = $1.to_i if $1
      330 +       end
  82  331
  83      -
  84      -
  85      -     POSTING_ID = /PostingID\:[ ]+([\d]+)/
  86      -     REPLY_TO = /(.+)/
  87      -     PRICE = /\$([\d]+(?:\.[\d]{2})?)/
  88      -     HTML_TAG = /<\/?[^>]*>/
      332 +       @posting_id
      333 +     end
  89  334
  90      -
  91      -
  92      -
  93      -
  94      -
  95      -       h2 = page.at('h2')
  96      -       @header = he_decode h2.inner_html if h2
  97      -
  98      -       title = page.at('title')
  99      -       @title = he_decode title.inner_html if title
 100      -       @title = nil if @title and @title.length ==0
 101      -
 102      -       @full_section = []
 103      -       (page/"div[@class='bchead']//a").each do |a|
 104      -         @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
      335 +     # String, The full-html contents of the post
      336 +     def contents
      337 +       unless @contents
      338 +         @contents = user_body if html
      339 +         @contents = he_decode @contents.strip if @contents
 105  340         end
 106  341
 107      -
 108      -
 109      -
 110      -
      342 +       @contents
      343 +     end
      344 +
      345 +     # String, the location of the item, as best could be parsed
      346 +     def location
      347 +       if @location.nil? and craigslist_body and html
      348 +         # Location (when explicitly defined):
      349 +         cursor = craigslist_body.at 'ul' unless @location
      350 +
      351 +         # Apa section includes other things in the li's (cats/dogs ok fields)
      352 +         cursor.children.each do |li|
      353 +           if LOCATION.match li.inner_html
      354 +             @location = he_decode($1) and break
      355 +             break
      356 +           end
      357 +         end if cursor
      358 +
      359 +         # Real estate listings can work a little different for location:
      360 +         unless @location
      361 +           cursor = craigslist_body.at 'small'
      362 +           cursor = cursor.previous_node until cursor.nil? or cursor.text?
      363 +
      364 +           @location = he_decode(cursor.to_s.strip) if cursor
      365 +         end
      366 +
      367 +         # So, *sometimes* the location just ends up being in the header, I don't know why:
      368 +         @location = $1 if @location.nil? and HEADER_LOCATION.match header
      369 +       end
 111  370
 112      -
 113      -
 114      -
 115      -
      371 +       @location
      372 +     end
      373 +
      374 +     # Array, urls of the post's images that are *not* hosted on craigslist
      375 +     def images
      376 +       # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
      377 +       @images = (
      378 +         contents ?
      379 +           contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
      380 +           []
      381 +       ) unless @images
 116  382
 117      -
 118      -
 119      -
 120      -
      383 +       @images
      384 +     end
      385 +
      386 +     # Array, urls of the post's craigslist-hosted images
      387 +     def pics
      388 +       unless @pics
      389 +         @pics = []
      390 +
      391 +         if html and craigslist_body
      392 +           # Now let's find the craigslist hosted images:
      393 +           img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
      394 +
      395 +           @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
      396 +         end
      397 +       end
 121  398
 122      -
 123      -
 124      -       userbody_as_s,craigbody_as_s = $1, $2 if /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m.match page.to_s
      399 +       @pics
      400 +     end
 125  401
 126      -
 127      -
      402 +     # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
      403 +     def flagged_for_removal?
      404 +       @flagged_for_removal = (
      405 +         system_post? and header_as_plain == "This posting has been flagged for removal"
      406 +       ) if @flagged_for_removal.nil?
 128  407
 129      -
 130      -
 131      -
      408 +       @flagged_for_removal
      409 +     end
      410 +
      411 +     # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
      412 +     def deleted_by_author?
      413 +       @deleted_by_author = (
      414 +         system_post? and header_as_plain == "This posting has been deleted by its author."
      415 +       ) if @deleted_by_author.nil?
 132  416
 133      -
 134      -
 135      -
      417 +       @deleted_by_author
      418 +     end
      419 +
      420 +
      421 +     # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
      422 +     # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
      423 +     def post_date
      424 +       @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
 136  425
 137      -
 138      -
 139      -
 140      -
 141      -
 142      -
 143      -
 144      -
      426 +       @post_date
      427 +     end
      428 +
      429 +     # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
      430 +     # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
      431 +     # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
      432 +     # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
      433 +     # in a full page load from the post's url.
      434 +     def label
      435 +       unless @label or system_post?
      436 +         @label = header
      437 +
      438 +         @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
 145  439         end
 146  440
 147      -
 148      -       raise ParseError, "Unable to parse PostFull: %s" % page.to_html if !flagged_for_removal? and !deleted_by_author? and [
 149      -         @contents,@posting_id,@post_time,@header,@title,@full_section
 150      -       ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
      441 +       @label
 151  442       end
 152      -
 153      -     # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
 154      -     def flagged_for_removal?; @flagged_for_removal; end
 155  443
 156      -     #
 157      -
      444 +     # Array, which image types are listed for the post.
      445 +     # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
      446 +     def img_types
      447 +       unless @img_types
      448 +         @img_types = []
      449 +
      450 +         @img_types << :img if images.length > 0
      451 +         @img_types << :pic if pics.length > 0
      452 +       end
      453 +
      454 +       @img_types
      455 +     end
 158  456
 159      -     #
      457 +     # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
      458 +     # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
      459 +     def section
      460 +       unless @section
      461 +         @section = full_section.last if full_section
      462 +       end
      463 +
      464 +       @section
      465 +     end
      466 +
      467 +     # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
      468 +     # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
      469 +     def has_img?
      470 +       img_types.include? :img
      471 +     end
      472 +
      473 +     # true if post summary has 'pic(s)'. 'pics' are different then imgs, in that craigslist is hosting the resource on craigslist's servers
      474 +     # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
      475 +     def has_pic?
      476 +       img_types.include? :pic
      477 +     end
      478 +
      479 +     # true if post summary has either the img or pic label
      480 +     # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
      481 +     def has_pic_or_img?
      482 +       img_types.length > 0
      483 +     end
      484 +
      485 +     # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
      486 +     # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
 160  487       def price
 161      -       $1.to_f if
      488 +       $1.tr('$','').to_f if label and PRICE.match label
 162  489       end
 163  490
 164  491       # Returns the post contents with all html tags removed
 165  492       def contents_as_plain
 166      -
      493 +       strip_html contents
 167  494       end
 168      -
      495 +
      496 +     # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
      497 +     # 'system_post' we may get tags in here
      498 +     def header_as_plain
      499 +       strip_html header
      500 +     end
      501 +
      502 +     # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
      503 +     # This returns true or false if that case applies
      504 +     def system_post?
      505 +       [contents,posting_id,post_time,title].all?{|f| f.nil?}
      506 +     end
      507 +
 169  508       private
      509 +
      510 +     # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
      511 +     # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
      512 +     # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
      513 +     def user_body
      514 +       $1 if USERBODY_PARTS.match html.to_s
      515 +     end
 170  516
 171      -     #
 172      -
 173      -
 174      -
 175      -
 176      -
 177      -
 178      -
 179      -
 180      -
 181      -
 182      -
      517 +     # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
      518 +     # So - we'll return it as an Hpricot object.
      519 +     def craigslist_body
      520 +       Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
      521 +     end
      522 +
      523 +   end
      524 +
      525 +   # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
      526 +   class Listings < Scraper
      527 +     LABEL = /^(.+?)[ ]*\-$/
      528 +     LOCATION = /^[ ]*\((.*?)\)$/
      529 +     IMG_TYPE = /^[ ]*(.+)[ ]*$/
      530 +     HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
      531 +     SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
      532 +     NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
 183  533
 184      -
 185      -
 186      -
 187      -
      534 +     # Array, PostSummary objects found in the listing
      535 +     def posts
      536 +       unless @posts
      537 +         current_date = nil
      538 +         @posts = []
      539 +
      540 +         post_tags = html.get_elements_by_tag_name('p','h4')
 188  541
 189      -
      542 +         # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
      543 +         post_tags.pop if (
      544 +           post_tags.length > 0 and
      545 +           post_tags.last.at('a') and
      546 +           NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
      547 +         )
      548 +
      549 +         # Now we iterate though the listings:
      550 +         post_tags.each do |el|
      551 +           case el.name
      552 +             when 'p'
      553 +               post_summary = self.class.parse_summary el, current_date
      554 +
      555 +               # Validate that required fields are present:
      556 +               parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
      557 +
      558 +               post_summary[:url] = url_from_href post_summary[:href]
      559 +
      560 +               @posts << CraigScrape::Posting.new(post_summary)
      561 +             when 'h4'
      562 +               # Let's make sense of the h4 tag, and then read all the p tags below it
      563 +               if HEADER_DATE.match he_decode(el.inner_html)
      564 +                 # Generally, the H4 tags contain valid dates. When they do - this is easy:
      565 +                 current_date = CraigScrape.most_recently_expired_time $1, $2
      566 +               elsif html.at('h4:last-of-type') == el
      567 +                 # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
      568 +                 # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
      569 +                 # we need to pull up the full post in order to accurate tell the date.
      570 +                 # Setting this to nil will achieve the eager-load.
      571 +                 current_date = nil
      572 +               end
      573 +           end
      574 +         end
 190  575         end
 191  576
 192      -
 193      -       img_table = (craigbody_els / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
 194      -
 195      -       @images = (img_table / 'img').collect{|i| i[:src]} if img_table
      577 +       @posts
 196  578       end
 197      -   end
 198  579
 199      -
 200      -
 201      -
 202      -
 203      -
 204      -
 205      -
 206      -
 207      -
      580 +     # String, URL Path href-fragment of the next page link
      581 +     def next_page_href
      582 +       unless @next_page_href
      583 +         cursor = html.at 'p:last-of-type'
      584 +
      585 +         cursor = cursor.at 'a' if cursor
      586 +
      587 +         # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
      588 +         next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
 208  589
 209      -
 210      -
 211      -
      590 +         # Search listings put their next page in a link towards the top
      591 +         next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
      592 +
      593 +         # Some search pages have a bug, whereby a 'next page' link isn't displayed,
      594 +         # even though we can see that theres another page listed in the page-number links block at the top
      595 +         # and bottom of the listing page
      596 +         unless next_link
      597 +           cursor = html % 'div.sh:first-of-type > b:last-of-type'
 212  598
 213      -
 214      -
 215      -
 216      -
 217      -
 218      -
 219      -
 220      -       tags_worth_parsing.each do |el|
 221      -         case el.name
 222      -           when 'p'
 223      -             @posts << CraigScrape::PostSummary.new(el, current_date, base_url)
 224      -           when 'h4'
 225      -             current_date = CraigScrape.most_recently_expired_time $1, $2 if /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/.match he_decode(el.inner_html)
 226      -         end
      599 +           # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
      600 +           # We're looking good.
      601 +           next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
      602 +         end
      603 +
      604 +         # We have an anchor tag - so - let's assign the href:
      605 +         @next_page_href = next_link[:href] if next_link
 227  606         end
 228      -
 229      -       next_link = (page / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
 230      -
 231      -       # This will find the link on 'search listing' pages (if there is one):
 232      -       @next_page_href = next_link[:href] if next_link
 233  607
 234      -
 235      -       raise ParseError, "Unable to parse Listings: %s" % page.to_html if tags_worth_parsing.length > 0 and @posts.length == 0
      608 +       @next_page_href
 236  609       end
 237  610
 238      -
 239      -
 240      -
 241      -
 242      -   class PostSummary
 243      -     include ParseObjectHelper
 244      -
 245      -     # Time, date of post, as a Time object. Does not include hours/minutes
 246      -     attr_reader :date
 247      -
 248      -     # String, The label of the post
 249      -     attr_reader :label
 250      -
 251      -     # String, The path fragment of the post's URI
 252      -     attr_reader :href
 253      -
 254      -     # String, The location of the post
 255      -     attr_reader :location
 256      -
 257      -     # String, The abbreviated section of the post
 258      -     attr_reader :section
 259      -
 260      -     # Array, which image types are listed for the post
 261      -     attr_reader :img_types
      611 +     # String, Full URL Path of the 'next page' link
      612 +     def next_page_url
      613 +       (next_page_href) ? url_from_href(next_page_href) : nil
      614 +     end
 262  615
 263      -
 264      -
 265      -
 266      -
 267      -
 268      -
 269      -     def initialize(p_element, date = nil, base_url = nil) #:nodoc:
      616 +     # Takes a paragraph element and returns a mostly-parsed Posting
      617 +     # We separate this from the rest of the parsing both for readability and ease of testing
      618 +     def self.parse_summary(p_element, date = nil) #:nodoc:
      619 +       ret = {}
      620 +
 270  621         title_anchor, section_anchor = p_element.search 'a'
 271  622         location_tag = p_element.at 'font'
 272  623         has_pic_tag = p_element.at 'span'
 273  624
      625 +       href = nil
      626 +
 274  627         location = he_decode p_element.at('font').inner_html if location_tag
 275      -
      628 +       ret[:location] = $1 if location and LOCATION.match location
 276  629
 277      -
      630 +       ret[:img_types] = []
 278  631         if has_pic_tag
 279  632           img_type = he_decode has_pic_tag.inner_html
 280  633           img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
 281  634
 282      -
      635 +         ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
 283  636         end
 284  637
 285      -
      638 +       ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
 286  639
 287      -
 288      -       if
 289      -
      640 +       ret[:post_date] = date
      641 +       if SUMMARY_DATE.match he_decode(p_element.children[0])
      642 +         ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
 290  643         end
 291  644
 292  645         if title_anchor
 293  646           label = he_decode title_anchor.inner_html
 294      -
      647 +         ret[:label] = $1 if LABEL.match label
 295  648
 296      -
      649 +         ret[:href] = title_anchor[:href]
 297  650         end
 298      -
 299      -
 300      -
 301      -       # Validate that required fields are present:
 302      -       raise ParseError, "Unable to parse PostSummary: %s" % p_element.to_html if [@label,@href].any?{|f| f.nil? or f.length == 0}
 303      -     end
 304      -
 305      -     # Returns the full uri including host and scheme, not just the href
 306      -     def full_url
 307      -       '%s%s' % [@base_url, @href]
      651 +
      652 +       ret
 308  653       end
      654 +   end
 309  655
 310      -
 311      -
 312      -
 313      -
      656 +   # GeoListings represents a parsed Craigslist geo lisiting page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
      657 +   # These list all the craigslist sites in a given region.
      658 +   class GeoListings < Scraper
      659 +     LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
      660 +     GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
 314  661
 315      -     #
 316      -
      662 +     # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
      663 +     # In addition though, here we'll accept an array like %w(us fl) which gets converted to
      664 +     # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
      665 +     def initialize(init_via = nil)
      666 +       super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
      667 +
      668 +       # Validate that required fields are present, at least - if we've downloaded it from a url
      669 +       parse_error! unless location
 318  670       end
 319  671
 320      -     #
 321      -     def
 322      -
 323      -
 324      -
 325      -
 326      -
 327      -       $1.tr('$','').to_f if @label and PRICE.match(@label)
 328      -     end
 329      -
 330      -     # Requests and returns the PostFull object that corresponds with this summary's full_url
 331      -     def full_post
 332      -       @full_post ||= CraigScrape.scrape_full_post full_url if full_url
      672 +     # Returns the GeoLocation's full name
      673 +     def location
      674 +       unless @name
      675 +         cursor = html % 'h3 > b > a:first-of-type'
      676 +         cursor = cursor.next_node if cursor
      677 +         @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
      678 +       end
 333  679
 334      -       @
      680 +       @name
 335  681       end
 336      -   end
 337      -
 338      -   # Scrapes a single listing url and returns a Listings object representing the contents
 339      -   def self.scrape_listing(listing_url)
 340      -     current_uri = ( listing_url.class == String ) ? URI.parse(listing_url) : listing_url
 341      -
 342      -     uri_contents = self.fetch_url(current_uri)
 343      -
 344      -     CraigScrape::Listings.new Hpricot.parse(uri_contents), '%s://%s' % [current_uri.scheme, current_uri.host]
 345      -
 346      -   rescue ParseError
 347      -     puts "Encountered error here! : #{uri_contents.inspect}"
 348      -     exit
 349      -   end
 350  682
 351      -
 352      -
 353      -
 354      -
 355      -
 356      -
 357      -
 358      -       while current_uri do
 359      -         listings = scrape_listing current_uri
 360      -
 361      -         listings.posts.each do |post|
 362      -           throw "ScrapeBreak" if post_condition.call(post)
 363      -           ret << post
      683 +     # Returns a hash of site name to urls in the current listing
      684 +     def sites
      685 +       unless @sites
      686 +         @sites = {}
      687 +         (html / 'div#list > a').each do |el_a|
      688 +           site_name = he_decode strip_html(el_a.inner_html)
      689 +           @sites[site_name] = el_a[:href]
 364  690             end
 365      -
 366      -         current_uri = (listings.next_page_href) ? self.uri_from_href( current_uri, listings.next_page_href ) : nil
 367  691           end
 368      -       end
 369      -
 370      -     ret
 371      -   end
 372      -
 373      -   # Scrapes a single Post Url, and returns a PostFull object representing its contents.
 374      -   def self.scrape_full_post(post_url)
 375      -     CraigScrape::PostFull.new Hpricot.parse(self.fetch_url(post_url))
 376      -   end
 377      -
 378      -   # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
 379      -   # or no more 'next page' links are avialable to be clicked on. Returns an array of PostSummary objects.
 380      -   def self.scrape_posts(listing_url, count)
 381      -     count_so_far = 0
 382      -     self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
 383      -   end
 384      -
 385      -   # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
 386      -   # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
 387      -   # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
 388      -   # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
 389      -   #
 390      -   # <b>Note:<b> The results will not include post summaries having the newer_then date themselves.
 391      -   def self.scrape_posts_since(listing_url, newer_then)
 392      -     self.scrape_until(listing_url) {|post| post.date <= newer_then}
 393      -   end
 394      -
 395      -   def self.fetch_url(uri) #:nodoc:
 396      -     uri_dest = ( uri.class == String ) ? URI.parse(uri) : uri
 397      -
 398      -     logger.info "Requesting: %s" % uri_dest.to_s if logger
 399      -
 400      -     case uri_dest.scheme
 401      -       when 'file'
 402      -         File.read uri_dest.path
 403      -       when /^http[s]?/
 404      -         fetch_attempts = 0
 405      -
 406      -         begin
 407      -           # This handles the redirects for us
 408      -           resp, data = Net::HTTP.new( uri_dest.host, uri_dest.port).get uri_dest.request_uri, nil
 409  692
 410      -
 411      -           # Check for gzip, and decode:
 412      -           data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
 413      -
 414      -           data
 415      -         elsif resp.response['Location']
 416      -           redirect_to = resp.response['Location']
 417      -           self.fetch_url(redirect_to)
 418      -         else
 419      -           # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
 420      -           error_description = 'Unable to fetch "%s" (%s)' % [ uri_dest.to_s, resp.response.code ]
 421      -
 422      -           logger.info error_description if logger
 423      -
 424      -           raise FetchError, error_description
 425      -         end
 426      -       rescue FetchError => err
 427      -         fetch_attempts += 1
 428      -
 429      -         if retries_on_fetch_fail <= CraigScrape.retries_on_fetch_fail
 430      -           sleep CraigScrape.sleep_between_fetch_retries if CraigScrape.sleep_between_fetch_retries
 431      -           retry
 432      -         else
 433      -           raise err
 434      -         end
 435      -       end
 436      -     else
 437      -       raise BadUrlError, "Unknown URI scheme for the url: #{uri_dest.to_s}"
      693 +       @sites
 438  694       end
 439  695     end
 440      -
 441      -   def self.uri_from_href(base_uri, href) #:nodoc:
 442      -     URI.parse(
 443      -       case href
 444      -         when /^http[s]?\:\/\// : href
 445      -         when /^\// : "%s://%s%s" % [ base_uri.scheme, base_uri.host, href ]
 446      -         else "%s://%s%s" % [
 447      -           base_uri.scheme, base_uri.host,
 448      -           /^(.*?\/)[^\/]+$/.match(base_uri.path) ? $1+href : base_uri.path+href
 449      -         ]
 450      -       end
 451      -     )
 452      -   end
 453  696
 454  697   end