libcraigscrape 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. data/CHANGELOG +17 -0
  2. data/Rakefile +1 -1
  3. data/bin/craigwatch +10 -10
  4. data/bin/report_mailer/craigslist_report.html.erb +2 -2
  5. data/bin/report_mailer/craigslist_report.plain.erb +2 -2
  6. data/lib/libcraigscrape.rb +585 -342
  7. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  8. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  9. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  10. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  11. data/test/libcraigscrape_test_helpers.rb +31 -0
  12. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  13. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  14. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  15. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  16. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  17. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  18. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  19. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  20. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  21. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  22. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  23. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  24. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  25. data/test/post_samples/brw_reb_1224008903.html +101 -0
  26. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  27. data/test/test_craigslist_geolisting.rb +425 -0
  28. data/test/test_craigslist_listing.rb +179 -260
  29. data/test/test_craigslist_posting.rb +306 -0
  30. metadata +29 -2
@@ -10,17 +10,64 @@ require 'hpricot'
  require 'htmlentities'
  require 'activesupport'
 
- # A base class encapsulating the libcraigscrape objests, and providing some utility methods.
+ # A base class encapsulating the libcraigscrape objects, and providing some utility methods.
  class CraigScrape
- cattr_accessor :logger
  cattr_accessor :time_now
- cattr_accessor :retries_on_fetch_fail
- cattr_accessor :sleep_between_fetch_retries
 
- # Set some defaults:
- self.retries_on_fetch_fail = 4
- self.sleep_between_fetch_retries = 4
+ # Scrapes a single listing url and returns a Listings object representing the contents.
+ # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Listings.new "listing_url" does the same thing.
+ # Consider this method 'marked for deprecation'.
+ def self.scrape_listing(listing_url)
+ CraigScrape::Listings.new listing_url
+ end
+
+ # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
+ # until there are no more 'next page' links available to click on.
+ def self.scrape_until(listing_url, &post_condition)
+ ret = []
+
+ current_url = listing_url
+ catch "ScrapeBreak" do
+ while current_url do
+ listings = CraigScrape::Listings.new current_url
+
+ listings.posts.each do |post|
+ throw "ScrapeBreak" if post_condition.call(post)
+ ret << post
+ end
+
+ current_url = listings.next_page_url
+ end
+ end
+
+ ret
+ end
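For instance, a minimal usage sketch of scrape_until; the listing url and the $500 cutoff are hypothetical:

    require 'libcraigscrape'

    # Walk the listing page-by-page, stopping at the first post over $500
    # (price can be nil on a summary, so guard for that):
    posts = CraigScrape.scrape_until('http://miami.craigslist.org/art/') do |post|
      post.price and post.price > 500.0
    end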
+
+ # Scrapes a single Post Url, and returns a Posting object representing its contents.
+ # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Posting.new "post_url" does the same thing.
+ # Consider this method 'marked for deprecation'.
+ def self.scrape_full_post(post_url)
+ CraigScrape::Posting.new post_url
+ end
+
+ # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
+ # or no more 'next page' links are available to be clicked on. Returns an array of Posting objects.
+ def self.scrape_posts(listing_url, count)
+ count_so_far = 0
+ self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
+ end
 
+ # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
+ # Returns an array of Posting objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+ # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the post_time of each fully-loaded Posting, could achieve
+ # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+ #
+ # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
+ def self.scrape_posts_since(listing_url, newer_then)
+ self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
+ end
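A sketch of the count- and date-bounded variants; urls and dates are illustrative:

    # The 30 newest post summaries from a for-sale listing:
    latest = CraigScrape.scrape_posts 'http://miami.craigslist.org/sss/', 30

    # Every summary dated after June 17th, 2009:
    recent = CraigScrape.scrape_posts_since 'http://miami.craigslist.org/sss/', Time.local(2009, 6, 17)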
+
+ # Returns the most recently expired time for the provided month and day
  def self.most_recently_expired_time(month, day) #:nodoc:
  now = (time_now) ? time_now : Time.now
 
@@ -30,425 +77,621 @@ class CraigScrape
 
  ret
  end
+
+ # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
+ # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
+ # methods. It also contains the http-related cattr_accessors:
+ #
+ # *logger* - a Logger object to write http debug notices to. Defaults to nil
+ #
+ # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
+ #
+ # *sleep_between_fetch_retries* - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 15.
+ class Scraper
+ cattr_accessor :logger
+ cattr_accessor :sleep_between_fetch_retries
+ cattr_accessor :retries_on_fetch_fail
+
+ URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
+ HTML_TAG = /<\/?[^>]*>/
+
+ # Returns the full url that corresponds to this resource
+ attr_reader :url
+
+ # Set some defaults:
+ self.retries_on_fetch_fail = 4
+ self.sleep_between_fetch_retries = 15
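Since these are cattr_accessors, the http behavior can be tuned once, globally, before any scraping begins; a minimal sketch:

    require 'logger'

    # Log http chatter to stderr, and back off more patiently between retries:
    CraigScrape::Scraper.logger = Logger.new $stderr
    CraigScrape::Scraper.retries_on_fetch_fail = 8
    CraigScrape::Scraper.sleep_between_fetch_retries = 30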
 
- module ParseObjectHelper #:nodoc:
+ class BadConstructionError < StandardError #:nodoc:
+ end
+
+ class ParseError < StandardError #:nodoc:
+ end
+
+ class BadUrlError < StandardError #:nodoc:
+ end
+
+ class FetchError < StandardError #:nodoc:
+ end
+
+ # Scraper Objects can be created from either a full URL (string), or a Hash.
+ # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
+ # if you know what you're doing - feel free to try this out.
+ #
+ # A (string) url can be passed with an 'http://' scheme or a 'file://' scheme.
+ #
+ # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
+ # This is useful for creating an object without actually making an html request; it is used to set up an
+ # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
+ # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash.
+ # Otherwise this will fail to eager load.
+ def initialize(init_via = nil)
+ if init_via.nil?
+ # Do nothing - possibly not a great idea, but we'll allow it
+ elsif init_via.kind_of? String
+ @url = init_via
+ elsif init_via.kind_of? Hash
+ init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
+ else
+ raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
+ end
+ end
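A sketch of both construction styles; the hash form is what Listings uses internally to pre-populate a Posting without an http request (the values here are hypothetical):

    # From a url - nothing is fetched until an accessor needs the html:
    post = CraigScrape::Posting.new 'http://miami.craigslist.org/art/1046596324.html'

    # From a hash - :url is included so the object can eager-load later:
    post = CraigScrape::Posting.new(
      :label => 'Dining table - $150', :href => '/art/1046596324.html',
      :url => 'http://miami.craigslist.org/art/1046596324.html'
    )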
+
+ # Indicates whether the resource has yet been retrieved from its associated url.
+ # This is useful for distinguishing an instance that was instantiated for the purpose of an eager-load
+ # but hasn't yet been fetched.
+ def downloaded?; !@html.nil?; end
+
+ # A URI object corresponding to this Scraped URL
+ def uri
+ @uri ||= URI.parse @url if @url
+ @uri
+ end
+
  private
- def he_decode(text)
- HTMLEntities.new.decode text
+
+ # Returns text with all html tags removed.
+ def strip_html(str)
+ str.gsub HTML_TAG, "" if str
  end
- end
+
+ # Easy way to fail noisily:
+ def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+
+ # Returns text with all html entities converted to their respective ascii characters.
+ def he_decode(text); self.class.he_decode text; end
 
- class BadUrlError < StandardError #:nodoc:
- end
+ # Returns text with all html entities converted to their respective ascii characters.
+ def self.he_decode(text); HTMLEntities.new.decode text; end
+
+ # Derives a full url, using the current object's url and the provided href
+ def url_from_href(href) #:nodoc:
+ scheme, host, path = $1, $2, $3 if URL_PARTS.match href
+
+ scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
+
+ host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
+
+ path = (
+ (/\/$/.match(uri.path)) ?
+ '%s%s' % [uri.path,path] :
+ '%s/%s' % [File.dirname(uri.path),path]
+ ) unless /^\//.match path
+
+ '%s://%s%s' % [scheme, host, path]
+ end
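A sketch of how url_from_href resolves hrefs against the object's url; it's private, so send is used here purely for illustration:

    l = CraigScrape::Listings.new 'http://miami.craigslist.org/art/index100.html'
    l.send :url_from_href, '/sss/'         # => "http://miami.craigslist.org/sss/"
    l.send :url_from_href, 'index200.html' # => "http://miami.craigslist.org/art/index200.html"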
+
+ def fetch_uri(uri)
 
- class ParseError < StandardError #:nodoc:
- end
+ logger.info "Requesting: %s" % @url if logger
+
+ case uri.scheme
+ when 'file'
+ File.read uri.path
+ when /^http[s]?/
+ fetch_attempts = 0
+
+ begin
+ # This handles the redirects for us
+ resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
+
+ if resp.response.code == "200"
+ # Check for gzip, and decode:
+ data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
+
+ data
+ elsif resp.response['Location']
+ redirect_to = resp.response['Location']
+
+ fetch_uri URI.parse(url_from_href(redirect_to))
+ else
+ # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
+ error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+
+ logger.info error_description if logger
+
+ raise FetchError, error_description
+ end
+ rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
+ logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
+ logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
+
+ fetch_attempts += 1
 
- class FetchError < StandardError #:nodoc:
+ if fetch_attempts <= self.retries_on_fetch_fail
+ sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
+ logger.info 'Retrying fetch ....' if logger
+ retry
+ else
+ raise err
+ end
+ end
+ else
+ raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
+ end
+ end
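Note the 'file' branch above: it's what lets the test suite parse saved pages without touching the network. A sketch, with a hypothetical local sample:

    post = CraigScrape::Posting.new 'file:///tmp/sfbay_art_1223614914.html'
    post.downloaded? # => false; nothing is read yet
    post.title       # forces fetch_uri/html, then parses lazily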
+
+ def html
+ @html ||= Hpricot.parse fetch_uri(uri) if uri
+ @html
+ end
  end
 
- # PostFull represents a fully downloaded, and parsed, Craigslist post.
+ # Posting represents a fully downloaded, and parsed, Craigslist post.
  # This class is generally returned by the listing scrape methods, and
  # contains the post summaries for a specific search url, or a general listing category
- class PostFull
- include ParseObjectHelper
-
- # String, represents the post's reply-to address, if listed
- attr_reader :reply_to
-
- # Time, reflects the full timestamp of the posting
- attr_reader :post_time
-
+ class Posting < Scraper
+
+ POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
+ LOCATION = /Location\:[ ]+(.+)/
+ HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
+ POSTING_ID = /PostingID\:[ ]+([\d]+)/
+ REPLY_TO = /(.+)/
+ PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+ USERBODY_PARTS = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
+ IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+
+ # This is really just for testing; in production use, uri.path is a better solution
+ attr_reader :href #:nodoc:
+
+ # Create a new Post via a url (String), or supplied parameters (Hash)
+ def initialize(*args)
+ super(*args)
+
+ # Validate that required fields are present, at least - if we've downloaded it from a url
+ parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
+ contents,posting_id,post_time,header,title,full_section
+ ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+ end
+
+
  # String, The contents of the item's html body heading
- attr_reader :header
+ def header
+ unless @header
+ h2 = html.at 'h2' if html
+ @header = he_decode h2.inner_html if h2
+ end
+
+ @header
+ end
 
  # String, the item's title
- attr_reader :title
-
- # Integer, Craigslist's unique posting id
- attr_reader :posting_id
-
- # String, The full-html contents of the post
- attr_reader :contents
-
- # String, the location of the item, as best could be parsed
- attr_reader :location
+ def title
+ unless @title
+ title_tag = html.at 'title' if html
+ @title = he_decode title_tag.inner_html if title_tag
+ @title = nil if @title and @title.length == 0
+ end
 
+ @title
+ end
+
  # Array, hierarchical representation of the posts section
- attr_reader :full_section
+ def full_section
+ unless @full_section
+ @full_section = []
+
+ (html/"div[@class='bchead']//a").each do |a|
+ @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+ end if html
+ end
+
+ @full_section
+ end
+
+ # String, represents the post's reply-to address, if listed
+ def reply_to
+ unless @reply_to
+ cursor = html.at 'hr' if html
+ cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
+ @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+ end
+
+ @reply_to
+ end
 
- # Array, urls of the post's craigslist-hosted images
- attr_reader :images
+ # Time, reflects the full timestamp of the posting
+ def post_time
+ unless @post_time
+ cursor = html.at 'hr' if html
+ cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
+ @post_time = Time.parse $1 if $1
+ end
+
+ @post_time
+ end
+
+ # Integer, Craigslist's unique posting id
+ def posting_id
+ unless @posting_id
+ cursor = (html/"#userbody").first if html
+ cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
+ @posting_id = $1.to_i if $1
+ end
 
- POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
- LOCATION = /Location\:[ ]+(.+)/
- POSTING_ID = /PostingID\:[ ]+([\d]+)/
- REPLY_TO = /(.+)/
- PRICE = /\$([\d]+(?:\.[\d]{2})?)/
- HTML_TAG = /<\/?[^>]*>/
+ @posting_id
+ end
 
- def initialize(page) #:nodoc:
- # We proceed from easy to difficult:
-
- @images = []
-
- h2 = page.at('h2')
- @header = he_decode h2.inner_html if h2
-
- title = page.at('title')
- @title = he_decode title.inner_html if title
- @title = nil if @title and @title.length ==0
-
- @full_section = []
- (page/"div[@class='bchead']//a").each do |a|
- @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+ # String, The full-html contents of the post
+ def contents
+ unless @contents
+ @contents = user_body if html
+ @contents = he_decode @contents.strip if @contents
  end
 
- # Reply To:
- cursor = page.at 'hr'
- cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
- @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+ @contents
+ end
+
+ # String, the location of the item, as best could be parsed
+ def location
+ if @location.nil? and craigslist_body and html
+ # Location (when explicitly defined):
+ cursor = craigslist_body.at 'ul' unless @location
+
+ # Apa section includes other things in the li's (cats/dogs ok fields)
+ cursor.children.each do |li|
+ if LOCATION.match li.inner_html
+ @location = he_decode($1) and break
+ break
+ end
+ end if cursor
+
+ # Real estate listings can work a little differently for location:
+ unless @location
+ cursor = craigslist_body.at 'small'
+ cursor = cursor.previous_node until cursor.nil? or cursor.text?
+
+ @location = he_decode(cursor.to_s.strip) if cursor
+ end
+
+ # So, *sometimes* the location just ends up being in the header, I don't know why:
+ @location = $1 if @location.nil? and HEADER_LOCATION.match header
+ end
 
- # Post Date:
- cursor = page.at 'hr'
- cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
- @post_time = Time.parse $1 if $1
+ @location
+ end
+
+ # Array, urls of the post's images that are *not* hosted on craigslist
+ def images
+ # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
+ @images = (
+ contents ?
+ contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
+ []
+ ) unless @images
 
- # Posting ID:
- cursor = (page/"#userbody").first
- cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
- @posting_id = $1.to_i if $1
+ @images
+ end
+
+ # Array, urls of the post's craigslist-hosted images
+ def pics
+ unless @pics
+ @pics = []
+
+ if html and craigslist_body
+ # Now let's find the craigslist hosted images:
+ img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+ @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+ end
+ end
 
- # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
- # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
- userbody_as_s,craigbody_as_s = $1, $2 if /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m.match page.to_s
+ @pics
+ end
 
- # Contents:
- @contents = he_decode(userbody_as_s.strip) if userbody_as_s
+ # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
+ def flagged_for_removal?
+ @flagged_for_removal = (
+ system_post? and header_as_plain == "This posting has been flagged for removal"
+ ) if @flagged_for_removal.nil?
 
- # I made this a separate method since we're not actually parsing everything in here as-is.
- # This will make it easier for the next guy to work with if wants to parse out the information we're disgarding...
- parse_craig_body Hpricot.parse(craigbody_as_s) if craigbody_as_s
+ @flagged_for_removal
+ end
+
+ # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
+ def deleted_by_author?
+ @deleted_by_author = (
+ system_post? and header_as_plain == "This posting has been deleted by its author."
+ ) if @deleted_by_author.nil?
 
- # We'll first set these edge cases to false, unless the block below decides otherwise
- @flagged_for_removal = false
- @deleted_by_author = false
+ @deleted_by_author
+ end
+
+
+ # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
+ # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+ def post_date
+ @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
 
- # Time to check for errors and edge cases
- if [@contents,@posting_id,@post_time,@title].all?{|f| f.nil?}
- case @header.gsub(HTML_TAG, "")
- when "This posting has been flagged for removal"
- @flagged_for_removal = true
- when "This posting has been deleted by its author."
- @deleted_by_author = true
- end
+ @post_date
+ end
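A sketch of the distinction between the two (url and times are illustrative):

    post = CraigScrape::Posting.new 'http://miami.craigslist.org/art/1046596324.html'
    post.post_time # => the full timestamp, e.g. Tue Jun 09 14:30:00 -0400 2009
    post.post_date # => the same day at 00:00:00, safe for date-only comparisons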
+
+ # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
+ # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
+ # Sometimes there's additional information, i.e. '(map)' on rea listings, included in the header that isn't to be listed in the label.
+ # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post that won't result
+ # in a full page load from the post's url.
+ def label
+ unless @label or system_post?
+ @label = header
+
+ @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
  end
 
- # Validate that required fields are present:
- raise ParseError, "Unable to parse PostFull: %s" % page.to_html if !flagged_for_removal? and !deleted_by_author? and [
- @contents,@posting_id,@post_time,@header,@title,@full_section
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+ @label
  end
-
- # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
- def flagged_for_removal?; @flagged_for_removal; end
 
- # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
- def deleted_by_author?; @deleted_by_author; end
+ # Array, which image types are listed for the post.
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+ def img_types
+ unless @img_types
+ @img_types = []
+
+ @img_types << :img if images.length > 0
+ @img_types << :pic if pics.length > 0
+ end
+
+ @img_types
+ end
 
- # Returns the price (as float) of the item, as best ascertained by the post header
+ # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+ # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
+ def section
+ unless @section
+ @section = full_section.last if full_section
+ end
+
+ @section
+ end
+
+ # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+ def has_img?
+ img_types.include? :img
+ end
+
+ # true if post summary has 'pic(s)'. 'pics' are different than imgs, in that craigslist is hosting the resource on craigslist's servers
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+ def has_pic?
+ img_types.include? :pic
+ end
+
+ # true if post summary has either the img or pic label
+ # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+ def has_pic_or_img?
+ img_types.length > 0
+ end
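A sketch of using these flags straight off a listing page, with no per-post page loads (url hypothetical):

    listing = CraigScrape::Listings.new 'http://miami.craigslist.org/art/'
    listing.posts.select{|post| post.has_pic_or_img?}.each{|post| puts post.label}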
+
+ # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
+ # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
  def price
- $1.to_f if @title and @header and PRICE.match(@header.gsub(/#{@title}/, ''))
+ $1.tr('$','').to_f if label and PRICE.match label
  end
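A sketch of what the PRICE pattern will and won't accept; the label below is hypothetical and is supplied via the hash constructor, so no page is fetched:

    post = CraigScrape::Posting.new :label => 'Dining table - $150'
    post.price # => 150.0; an amount mid-label (e.g. 'table $150 obo') yields nil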
 
  # Returns the post contents with all html tags removed
  def contents_as_plain
- @contents.gsub HTML_TAG, "" if @contents
+ strip_html contents
  end
-
+
+ # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+ # 'system_post' we may get tags in here
+ def header_as_plain
+ strip_html header
+ end
+
+ # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original.
+ # This returns true if that case applies.
+ def system_post?
+ [contents,posting_id,post_time,title].all?{|f| f.nil?}
+ end
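A sketch of guarding against these template pages before trusting a post's fields (url hypothetical):

    post = CraigScrape::Posting.new 'http://miami.craigslist.org/art/1046596324.html'
    if post.system_post?
      puts 'flagged' if post.flagged_for_removal?
      puts 'deleted' if post.deleted_by_author?
    else
      puts post.title
    end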
+
  private
+
+ # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place.
+ # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+ # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack.
+ def user_body
+ $1 if USERBODY_PARTS.match html.to_s
+ end
 
- # I left this here as a stub, since someone may want to parse more then what I'm currently scraping from this part of the page
- def parse_craig_body(craigbody_els) #:nodoc:
- # Location (when explicitly defined):
- cursor = craigbody_els.at 'ul' unless @location
-
- # Apa section includes other things in the li's (cats/dogs ok fields)
- cursor.children.each do |li|
- if LOCATION.match li.inner_html
- @location = he_decode($1) and break
- break
- end
- end if cursor
+ # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+ # So - we'll return it as an Hpricot object.
+ def craigslist_body
+ Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
+ end
+
+ end
+
+ # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
+ class Listings < Scraper
+ LABEL = /^(.+?)[ ]*\-$/
+ LOCATION = /^[ ]*\((.*?)\)$/
+ IMG_TYPE = /^[ ]*(.+)[ ]*$/
+ HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
+ SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
+ NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
 
- # Real estate listings can work a little different for location:
- unless @location
- cursor = craigbody_els.at 'small'
- cursor = cursor.previous_node until cursor.nil? or cursor.text?
+ # Array, Posting objects found in the listing
+ def posts
+ unless @posts
+ current_date = nil
+ @posts = []
+
+ post_tags = html.get_elements_by_tag_name('p','h4')
 
- @location = he_decode(cursor.to_s.strip) if cursor
+ # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our posts output:
+ post_tags.pop if (
+ post_tags.length > 0 and
+ post_tags.last.at('a') and
+ NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
+ )
+
+ # Now we iterate through the listings:
+ post_tags.each do |el|
+ case el.name
+ when 'p'
+ post_summary = self.class.parse_summary el, current_date
+
+ # Validate that required fields are present:
+ parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
+
+ post_summary[:url] = url_from_href post_summary[:href]
+
+ @posts << CraigScrape::Posting.new(post_summary)
+ when 'h4'
+ # Let's make sense of the h4 tag, and then read all the p tags below it
+ if HEADER_DATE.match he_decode(el.inner_html)
+ # Generally, the H4 tags contain valid dates. When they do - this is easy:
+ current_date = CraigScrape.most_recently_expired_time $1, $2
+ elsif html.at('h4:last-of-type') == el
+ # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
+ # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
+ # we need to pull up the full post in order to accurately tell the date.
+ # Setting this to nil will achieve the eager-load.
+ current_date = nil
+ end
+ end
+ end
  end
 
- # Now let's find the craigslist hosted images:
- img_table = (craigbody_els / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
- @images = (img_table / 'img').collect{|i| i[:src]} if img_table
+ @posts
  end
- end
 
- # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
- class Listings
- include ParseObjectHelper
-
- # Array, PostSummary objects found in the listing
- attr_reader :posts
-
- # String, URL Path of the next page link
- attr_reader :next_page_href
+ # String, URL Path href-fragment of the next page link
+ def next_page_href
+ unless @next_page_href
+ cursor = html.at 'p:last-of-type'
+
+ cursor = cursor.at 'a' if cursor
+
+ # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+ next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
 
- def initialize(page, base_url = nil) #:nodoc:
- current_date = nil
- @posts = []
+ # Search listings put their next page in a link towards the top
+ next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+ # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+ # even though we can see that there's another page listed in the page-number links block at the top
+ # and bottom of the listing page
+ unless next_link
+ cursor = html % 'div.sh:first-of-type > b:last-of-type'
 
- tags_worth_parsing = page.get_elements_by_tag_name('p','h4')
-
- # This will find the link on 'general listing' pages, if there is one:
- last_twp_a = tags_worth_parsing.last.at('a') if tags_worth_parsing.last
- next_link = tags_worth_parsing.pop.at('a') if last_twp_a and /^[ ]*next [\d]+ postings[ ]*$/.match last_twp_a.inner_html
-
- # Now we iterate though the listings:
- tags_worth_parsing.each do |el|
- case el.name
- when 'p'
- @posts << CraigScrape::PostSummary.new(el, current_date, base_url)
- when 'h4'
- current_date = CraigScrape.most_recently_expired_time $1, $2 if /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/.match he_decode(el.inner_html)
- end
+ # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+ # we're looking good.
+ next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
+ end
+
+ # We have an anchor tag - so - let's assign the href:
+ @next_page_href = next_link[:href] if next_link
  end
-
- next_link = (page / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
-
- # This will find the link on 'search listing' pages (if there is one):
- @next_page_href = next_link[:href] if next_link
 
- # Validate that required fields are present:
- raise ParseError, "Unable to parse Listings: %s" % page.to_html if tags_worth_parsing.length > 0 and @posts.length == 0
+ @next_page_href
  end
 
- end
-
- # PostSummary represents a parsed summary posting, typically found on a Listing page.
- # This object is returned by the CraigScrape.scrape_listing methods
- class PostSummary
- include ParseObjectHelper
-
- # Time, date of post, as a Time object. Does not include hours/minutes
- attr_reader :date
-
- # String, The label of the post
- attr_reader :label
-
- # String, The path fragment of the post's URI
- attr_reader :href
-
- # String, The location of the post
- attr_reader :location
-
- # String, The abbreviated section of the post
- attr_reader :section
-
- # Array, which image types are listed for the post
- attr_reader :img_types
+ # String, Full URL Path of the 'next page' link
+ def next_page_url
+ (next_page_href) ? url_from_href(next_page_href) : nil
+ end
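A sketch of walking pages by hand with next_page_url; scrape_until performs this same loop internally (url hypothetical):

    listing = CraigScrape::Listings.new 'http://miami.craigslist.org/sss/'
    while listing
      listing.posts.each{|post| puts post.label}
      next_url = listing.next_page_url
      listing = next_url ? CraigScrape::Listings.new(next_url) : nil
    end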
 
- PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
- DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
- LABEL = /^(.+?)[ ]*\-$/
- LOCATION = /^[ ]*\((.*?)\)$/
- IMG_TYPE = /^[ ]*(.+)[ ]*$/
-
- def initialize(p_element, date = nil, base_url = nil) #:nodoc:
+ # Takes a paragraph element and returns a mostly-parsed hash of Posting attributes.
+ # We separate this from the rest of the parsing both for readability and ease of testing
+ def self.parse_summary(p_element, date = nil) #:nodoc:
+ ret = {}
+
  title_anchor, section_anchor = p_element.search 'a'
  location_tag = p_element.at 'font'
  has_pic_tag = p_element.at 'span'
 
+ href = nil
+
  location = he_decode p_element.at('font').inner_html if location_tag
- @location = $1 if location and LOCATION.match location
+ ret[:location] = $1 if location and LOCATION.match location
 
- @img_types = []
+ ret[:img_types] = []
  if has_pic_tag
  img_type = he_decode has_pic_tag.inner_html
  img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
 
- @img_types = img_type.split(' ').collect{|t| t.to_sym}
+ ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
  end
 
- @section = he_decode section_anchor.inner_html if section_anchor
+ ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
 
- @date = date
- if DATE.match he_decode(p_element.children[0])
- @date = CraigScrape.most_recently_expired_time $1, $2.to_i
+ ret[:post_date] = date
+ if SUMMARY_DATE.match he_decode(p_element.children[0])
+ ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
  end
 
  if title_anchor
  label = he_decode title_anchor.inner_html
- @label = $1 if LABEL.match label
+ ret[:label] = $1 if LABEL.match label
 
- @href = title_anchor[:href]
+ ret[:href] = title_anchor[:href]
  end
-
- @base_url = base_url
-
- # Validate that required fields are present:
- raise ParseError, "Unable to parse PostSummary: %s" % p_element.to_html if [@label,@href].any?{|f| f.nil? or f.length == 0}
- end
-
- # Returns the full uri including host and scheme, not just the href
- def full_url
- '%s%s' % [@base_url, @href]
+
+ ret
  end
+ end
 
- # true if post summary has the img label
- def has_img?
- img_types.include? :img
- end
+ # GeoListings represents a parsed Craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+ # These list all the craigslist sites in a given region.
+ class GeoListings < Scraper
+ LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+ GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
 
- # true if post summary has the pic label
- def has_pic?
- img_types.include? :pic
+ # The geolisting constructor works like all other Scraper constructors, in that it accepts a string 'url'.
+ # In addition though, here we'll accept an array like %w(us fl) which gets converted to
+ # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
+ def initialize(init_via = nil)
+ super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
+
+ # Validate that required fields are present, at least - if we've downloaded it from a url
+ parse_error! unless location
  end
 
- # true if post summary has either the img or pic label
- def has_pic_or_img?
- img_types.length > 0
- end
-
- # Returns the best-guess of a price, judging by the label's contents.
- def price
- $1.tr('$','').to_f if @label and PRICE.match(@label)
- end
-
- # Requests and returns the PostFull object that corresponds with this summary's full_url
- def full_post
- @full_post ||= CraigScrape.scrape_full_post full_url if full_url
+ # Returns the GeoLocation's full name
+ def location
+ unless @name
+ cursor = html % 'h3 > b > a:first-of-type'
+ cursor = cursor.next_node if cursor
+ @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+ end
 
- @full_post
+ @name
  end
- end
-
- # Scrapes a single listing url and returns a Listings object representing the contents
- def self.scrape_listing(listing_url)
- current_uri = ( listing_url.class == String ) ? URI.parse(listing_url) : listing_url
-
- uri_contents = self.fetch_url(current_uri)
-
- CraigScrape::Listings.new Hpricot.parse(uri_contents), '%s://%s' % [current_uri.scheme, current_uri.host]
-
- rescue ParseError
- puts "Encountered error here! : #{uri_contents.inspect}"
- exit
- end
 
- # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
- # until there's no more 'next page' links available to click on
- def self.scrape_until(listing_url, &post_condition)
- ret = []
-
- current_uri = URI.parse listing_url
- catch "ScrapeBreak" do
- while current_uri do
- listings = scrape_listing current_uri
-
- listings.posts.each do |post|
- throw "ScrapeBreak" if post_condition.call(post)
- ret << post
+ # Returns a hash of site names to urls in the current listing
+ def sites
+ unless @sites
+ @sites = {}
+ (html / 'div#list > a').each do |el_a|
+ site_name = he_decode strip_html(el_a.inner_html)
+ @sites[site_name] = el_a[:href]
  end
-
- current_uri = (listings.next_page_href) ? self.uri_from_href( current_uri, listings.next_page_href ) : nil
  end
- end
-
- ret
- end
-
- # Scrapes a single Post Url, and returns a PostFull object representing its contents.
- def self.scrape_full_post(post_url)
- CraigScrape::PostFull.new Hpricot.parse(self.fetch_url(post_url))
- end
-
- # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
- # or no more 'next page' links are avialable to be clicked on. Returns an array of PostSummary objects.
- def self.scrape_posts(listing_url, count)
- count_so_far = 0
- self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
- end
-
- # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
- # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
- # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
- # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
- #
- # <b>Note:<b> The results will not include post summaries having the newer_then date themselves.
- def self.scrape_posts_since(listing_url, newer_then)
- self.scrape_until(listing_url) {|post| post.date <= newer_then}
- end
-
- def self.fetch_url(uri) #:nodoc:
- uri_dest = ( uri.class == String ) ? URI.parse(uri) : uri
-
- logger.info "Requesting: %s" % uri_dest.to_s if logger
-
- case uri_dest.scheme
- when 'file'
- File.read uri_dest.path
- when /^http[s]?/
- fetch_attempts = 0
-
- begin
- # This handles the redirects for us
- resp, data = Net::HTTP.new( uri_dest.host, uri_dest.port).get uri_dest.request_uri, nil
 
- if resp.response.code == "200"
- # Check for gzip, and decode:
- data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
- data
- elsif resp.response['Location']
- redirect_to = resp.response['Location']
- self.fetch_url(redirect_to)
- else
- # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
- error_description = 'Unable to fetch "%s" (%s)' % [ uri_dest.to_s, resp.response.code ]
-
- logger.info error_description if logger
-
- raise FetchError, error_description
- end
- rescue FetchError => err
- fetch_attempts += 1
-
- if retries_on_fetch_fail <= CraigScrape.retries_on_fetch_fail
- sleep CraigScrape.sleep_between_fetch_retries if CraigScrape.sleep_between_fetch_retries
- retry
- else
- raise err
- end
- end
- else
- raise BadUrlError, "Unknown URI scheme for the url: #{uri_dest.to_s}"
+ @sites
  end
  end
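A sketch of enumerating craigslist sites for a region; the exact location string returned is illustrative:

    geo = CraigScrape::GeoListings.new %w(us fl)
    geo.location # => e.g. "united states > florida"
    geo.sites.each{|name, href| puts '%s => %s' % [name, href]}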
-
- def self.uri_from_href(base_uri, href) #:nodoc:
- URI.parse(
- case href
- when /^http[s]?\:\/\// : href
- when /^\// : "%s://%s%s" % [ base_uri.scheme, base_uri.host, href ]
- else "%s://%s%s" % [
- base_uri.scheme, base_uri.host,
- /^(.*?\/)[^\/]+$/.match(base_uri.path) ? $1+href : base_uri.path+href
- ]
- end
- )
- end
 
  end