libcraigscrape 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,696 +2,204 @@
  #
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
  #
- require 'net/http'
- require 'zlib'

- require 'rubygems'
- require 'hpricot'
- require 'htmlentities'
- require 'activesupport'
+ # A base class encapsulating the various libcraigscrape objects, and providing most of the
+ # craigslist interaction methods. Currently, we're supporting the old Class methods
+ # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
+ # create an instance of the Craigslist object, and use its Public Instance methods.
+ # See the README for easy to follow examples.
+ class CraigScrape; end
+
+ require 'listings'
+ require 'posting'
+ require 'geo_listings'
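
To make the instance-oriented usage described above concrete, here is a minimal sketch of the 0.8.0 API. The 'us/fl/miami' site specifier and the 'sss' (for-sale) path fragment are hypothetical examples, not values taken from this diff:

    require 'libcraigscrape'

    # One or more site/path specifiers (hypothetical values):
    craigslist = CraigScrape.new 'us/fl/miami'

    # Stream every post from the matched sites' 'sss' listings:
    craigslist.each_post('sss') {|post| puts post.label }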

- # A base class encapsulating the libcraigscrape objects, and providing some utility methods.
  class CraigScrape
  cattr_accessor :time_now
+ cattr_accessor :site_to_url_prefix
+
+ #--
+ # NOTE:
+ # The only reason I took this out is b/c I might want to test with a file://
+ # prefix at some point
+ #++
+ self.site_to_url_prefix = 'http://'

- # Scrapes a single listing url and returns a Listings object representing the contents.
- # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
- # Consider this method 'marked for deprecation'
- def self.scrape_listing(listing_url)
- CraigScrape::Listings.new listing_url
+
+ # Takes a variable number of site/path specifiers (strings) as an argument.
+ # This list gets flattened and passed to CraigScrape::GeoListings.find_sites.
+ # See that method's rdoc for a complete set of rules on what arguments are allowed here.
+ def initialize(*args)
+ @sites_specs = args.flatten
  end

- # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
- # until there are no more 'next page' links available to click on
- def self.scrape_until(listing_url, &post_condition)
- ret = []
-
- current_url = listing_url
- catch "ScrapeBreak" do
- while current_url do
- listings = CraigScrape::Listings.new current_url
-
- listings.posts.each do |post|
- throw "ScrapeBreak" if post_condition.call(post)
- ret << post
- end
-
- current_url = listings.next_page_url
+ # Returns which sites are included in any operations performed by this object. This is directly
+ # ascertained from the initial constructor's spec-list.
+ def sites
+ @sites ||= GeoListings.find_sites @sites_specs
+ @sites
+ end
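
Since sites is memoized, GeoListings.find_sites only runs on the first call. A sketch, reusing the hypothetical specifier from the example above:

    craigslist = CraigScrape.new 'us/fl/miami'
    craigslist.sites    # First call resolves the spec-list via GeoListings.find_sites
    craigslist.sites    # Subsequent calls return the memoized array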
+
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
+ #
+ # Passes the <b>first page listing</b> of each of these urls to the provided block.
+ def each_listing(*fragments)
+ listing_urls_for(fragments).each{|url| yield Listings.new(url) }
+ end
+
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
+ #
+ # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
+ def each_page_in_each_listing(*fragments)
+ each_listing(*fragments) do |listing|
+ while listing
+ yield listing
+ listing = listing.next_page
  end
  end
-
- ret
- end
-
- # Scrapes a single Post Url, and returns a Posting object representing its contents.
- # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
- # Consider this method 'marked for deprecation'
- def self.scrape_full_post(post_url)
- CraigScrape::Posting.new post_url
  end
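
each_page_in_each_listing follows next_page links until they run out. A sketch of counting pages, again assuming the hypothetical 'sss' fragment:

    pages = 0
    craigslist.each_page_in_each_listing('sss') do |listing|
      pages += 1
      puts "page #{pages}: #{listing.posts.length} posts"
    end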
-
- # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
- # or no more 'next page' links are available to be clicked on. Returns an array of PostSummary objects.
- def self.scrape_posts(listing_url, count)
- count_so_far = 0
- self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
+
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
+ #
+ # Returns the <b>first page listing</b> of each of these urls.
+ def listings(*fragments)
+ listing_urls_for(fragments).collect{|url| Listings.new url }
  end

- # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
- # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
- # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method, could achieve
- # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
  #
- # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
- def self.scrape_posts_since(listing_url, newer_then)
- self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
+ # Passes all posts from each of these urls to the provided block, in the order they're parsed
+ # (for each listing, newest posts are returned first).
+ def each_post(*fragments)
+ each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
  end
-
- # Returns the most recently expired time for the provided month and day
- def self.most_recently_expired_time(month, day) #:nodoc:
- now = (time_now) ? time_now : Time.now
-
- # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
- ret = Time.local now.year, month, day
- ret = Time.local now.year-1, month, day if ret > now
-
+
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
+ #
+ # Returns all posts from each of these urls, in the order they're parsed
+ # (newest posts first).
+ def posts(*fragments)
+ ret = []
+ each_page_in_each_listing(*fragments){ |l| ret += l.posts }
  ret
  end
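
posts accumulates every page into one array, while each_post yields one Posting at a time; the streaming form keeps memory flat on large listings. A sketch, with the same hypothetical fragment:

    all_posts = craigslist.posts 'sss'                 # one big Array of Posting objects
    craigslist.each_post('sss') {|p| puts p.label }    # streaming equivalent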
-
- # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
- # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
- # methods. It also contains the http-related cattr_accessors:
- #
- # *logger* - a Logger object to debug http notices to. Defaults to nil
- #
- # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
- #
- # *sleep_between_fetch_retries* - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 15.
- class Scraper
- cattr_accessor :logger
- cattr_accessor :sleep_between_fetch_retries
- cattr_accessor :retries_on_fetch_fail
-
- URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
- HTML_TAG = /<\/?[^>]*>/
-
- # Returns the full url that corresponds to this resource
- attr_reader :url
-
- # Set some defaults:
- self.retries_on_fetch_fail = 4
- self.sleep_between_fetch_retries = 15

- class BadConstructionError < StandardError #:nodoc:
- end
-
- class ParseError < StandardError #:nodoc:
- end
-
- class BadUrlError < StandardError #:nodoc:
- end
-
- class FetchError < StandardError #:nodoc:
- end
-
- # Scraper Objects can be created from either a full URL (string), or a Hash.
- # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
- # if you know what you're doing - feel free to try this out.
- #
- # A (string) url can be passed with an 'http://' scheme or a 'file://' scheme.
- #
- # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
- # This is useful to create an object without actually making an html request; it is used to set up an
- # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
- # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash,
- # otherwise this will fail to eager load.
- def initialize(init_via = nil)
- if init_via.nil?
- # Do nothing - possibly not a great idea, but we'll allow it
- elsif init_via.kind_of? String
- @url = init_via
- elsif init_via.kind_of? Hash
- init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
- else
- raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
+ # Determines all listings which can be construed by combining the sites specified in the object
+ # constructor with the provided url-path fragments.
+ #
+ # Returns all posts from each of these urls which are newer than the provided 'newer_then' date.
+ # (Returns 'newest' posts first).
+ def posts_since(newer_then, *fragments)
+ ret = []
+ fragments.each do |frag|
+ each_post(frag) do |p|
+ break if p.post_date <= newer_then
+ ret << p
  end
  end
-
- # Indicates whether the resource has yet been retrieved from its associated url.
- # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
- # but hasn't yet been fetched.
- def downloaded?; !@html.nil?; end
-
- # A URI object corresponding to this Scraped URL
- def uri
- @uri ||= URI.parse @url if @url
- @uri
- end

- private
-
- # Returns text with all html tags removed.
- def strip_html(str)
- str.gsub HTML_TAG, "" if str
- end
-
- # Easy way to fail noisily:
- def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
-
- # Returns text with all html entities converted to their respective ascii characters.
- def he_decode(text); self.class.he_decode text; end
-
- # Returns text with all html entities converted to their respective ascii characters.
- def self.he_decode(text); HTMLEntities.new.decode text; end
-
- # Derives a full url, using the current object's url and the provided href
- def url_from_href(href) #:nodoc:
- scheme, host, path = $1, $2, $3 if URL_PARTS.match href
-
- scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
-
- host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
-
- path = (
- (/\/$/.match(uri.path)) ?
- '%s%s' % [uri.path,path] :
- '%s/%s' % [File.dirname(uri.path),path]
- ) unless /^\//.match path
-
- '%s://%s%s' % [scheme, host, path]
- end
-
- def fetch_uri(uri)
-
- logger.info "Requesting: %s" % @url if logger
-
- case uri.scheme
- when 'file'
- File.read uri.path
- when /^http[s]?/
- fetch_attempts = 0
-
- begin
- # This handles the redirects for us
- resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
- if resp.response.code == "200"
- # Check for gzip, and decode:
- data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
- data
- elsif resp.response['Location']
- redirect_to = resp.response['Location']
-
- fetch_uri URI.parse(url_from_href(redirect_to))
- else
- # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
- error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
-
- logger.info error_description if logger
-
- raise FetchError, error_description
- end
- rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
- logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
- logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
- fetch_attempts += 1
-
- if fetch_attempts <= self.retries_on_fetch_fail
- sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
- logger.info 'Retrying fetch ....' if logger
- retry
- else
- raise err
- end
- end
- else
- raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
- end
- end
-
- def html
- @html ||= Hpricot.parse fetch_uri(uri) if uri
- @html
- end
+ ret
  end
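
A sketch of the posts_since cutoff behavior, reusing the hypothetical scraper and fragment from the earlier examples. Because posts are parsed newest-first, the break on post_date <= newer_then stops each fragment's scrape at its first too-old post:

    yesterday = Time.now - 24*3600
    recent = craigslist.posts_since yesterday, 'sss'
    # Roughly the last day's posts, newest first (post_date is day-granular)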
+
+ class << self # Class methods

- # Posting represents a fully downloaded, and parsed, Craigslist post.
- # This class is generally returned by the listing scrape methods, and
- # contains the post summaries for a specific search url, or a general listing category
- class Posting < Scraper
-
- POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
- LOCATION = /Location\:[ ]+(.+)/
- HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
- POSTING_ID = /PostingID\:[ ]+([\d]+)/
- REPLY_TO = /(.+)/
- PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
- USERBODY_PARTS = /\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
- IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
-
- # This is really just for testing; in production use, uri.path is a better solution
- attr_reader :href #:nodoc:
-
- # Create a new Post via a url (String), or supplied parameters (Hash)
- def initialize(*args)
- super(*args)
-
- # Validate that required fields are present, at least - if we've downloaded it from a url
- parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
- contents,posting_id,post_time,header,title,full_section
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
- end
-
-
- # String, The contents of the item's html body heading
- def header
- unless @header
- h2 = html.at 'h2' if html
- @header = he_decode h2.inner_html if h2
- end
-
- @header
- end
-
- # String, the item's title
- def title
- unless @title
- title_tag = html.at 'title' if html
- @title = he_decode title_tag.inner_html if title_tag
- @title = nil if @title and @title.length == 0
- end
-
- @title
- end
-
- # Array, hierarchical representation of the post's section
- def full_section
- unless @full_section
- @full_section = []
-
- (html/"div[@class='bchead']//a").each do |a|
- @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
- end if html
- end
-
- @full_section
- end
+ #--
+ # NOTE: These Class methods are all marked for deprecation as of
+ # version 0.8.0, and should not be used with any new project code
+ #++

- # String, represents the post's reply-to address, if listed
- def reply_to
- unless @reply_to
- cursor = html.at 'hr' if html
- cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
- @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
- end
-
- @reply_to
- end
-
- # Time, reflects the full timestamp of the posting
- def post_time
- unless @post_time
- cursor = html.at 'hr' if html
- cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
- @post_time = Time.parse $1 if $1
- end
-
- @post_time
+ # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+ # Instead, consider using CraigScrape::Listings.new
+ #
+ # Scrapes a single listing url and returns a Listings object representing the contents.
+ # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Listings.new "listing_url" does the same thing
+ def scrape_listing(listing_url)
+ CraigScrape::Listings.new listing_url
  end
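
As the deprecation note above states, the legacy class method and its documented replacement are equivalent; a sketch with a hypothetical listing url:

    url = 'http://miami.craigslist.org/sss/'
    listing = CraigScrape.scrape_listing url    # legacy class method
    listing = CraigScrape::Listings.new url     # preferred form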

- # Integer, Craigslist's unique posting id
- def posting_id
- unless @posting_id
- cursor = (html/"#userbody").first if html
- cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
- @posting_id = $1.to_i if $1
- end
-
- @posting_id
- end
-
- # String, The full-html contents of the post
- def contents
- unless @contents
- @contents = user_body if html
- @contents = he_decode @contents.strip if @contents
- end
-
- @contents
- end
-
- # String, the location of the item, as best could be parsed
- def location
- if @location.nil? and craigslist_body and html
- # Location (when explicitly defined):
- cursor = craigslist_body.at 'ul' unless @location
-
- # Apa section includes other things in the li's (cats/dogs ok fields)
- cursor.children.each do |li|
- if LOCATION.match li.inner_html
- @location = he_decode($1) and break
- break
+ # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+ # Instead, consider using the CraigScrape#each_post method.
+ #
+ # Continually scrapes listings, using the supplied url as a starting point, until the supplied block returns true or
+ # until there are no more 'next page' links available to click on
+ def scrape_until(listing_url, &post_condition)
+ ret = []
+
+ listings = CraigScrape::Listings.new listing_url
+ catch "ScrapeBreak" do
+ while listings do
+ listings.posts.each do |post|
+ throw "ScrapeBreak" if post_condition.call(post)
+ ret << post
  end
- end if cursor
-
- # Real estate listings can work a little differently for location:
- unless @location
- cursor = craigslist_body.at 'small'
- cursor = cursor.previous_node until cursor.nil? or cursor.text?
-
- @location = he_decode(cursor.to_s.strip) if cursor
- end
-
- # So, *sometimes* the location just ends up being in the header, I don't know why:
- @location = $1 if @location.nil? and HEADER_LOCATION.match header
- end
-
- @location
- end
-
- # Array, urls of the post's images that are *not* hosted on craigslist
- def images
- # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
- @images = (
- contents ?
- contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
- []
- ) unless @images
-
- @images
- end
-
- # Array, urls of the post's craigslist-hosted images
- def pics
- unless @pics
- @pics = []
-
- if html and craigslist_body
- # Now let's find the craigslist hosted images:
- img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
- @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+
+ listings = listings.next_page
  end
  end
-
- @pics
- end
-
- # Returns true if this Post was parsed, and is merely a 'Flagged for Removal' page
- def flagged_for_removal?
- @flagged_for_removal = (
- system_post? and header_as_plain == "This posting has been flagged for removal"
- ) if @flagged_for_removal.nil?
-
- @flagged_for_removal
- end
-
- # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
- def deleted_by_author?
- @deleted_by_author = (
- system_post? and header_as_plain == "This posting has been deleted by its author."
- ) if @deleted_by_author.nil?
-
- @deleted_by_author
- end
-
-
- # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
- # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
- def post_date
- @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
-
- @post_date
- end
-
- # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
- # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
- # Sometimes there's additional information, i.e. '(map)' on rea listings, included in the header that isn't to be listed in the label.
- # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post that won't result
- # in a full page load from the post's url.
- def label
- unless @label or system_post?
- @label = header
-
- @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
- end
-
- @label
- end
-
- # Array, which image types are listed for the post.
- # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
- def img_types
- unless @img_types
- @img_types = []
-
- @img_types << :img if images.length > 0
- @img_types << :pic if pics.length > 0
- end
-
- @img_types
- end
-
- # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
- # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
- def section
- unless @section
- @section = full_section.last if full_section
- end
-
- @section
- end
-
- # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
- # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
- def has_img?
- img_types.include? :img
- end
-
- # true if post summary has 'pic(s)'. 'pics' are different than imgs, in that craigslist is hosting the resource on craigslist's servers
- # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
- def has_pic?
- img_types.include? :pic
- end
-
- # true if post summary has either the img or pic label
- # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
- def has_pic_or_img?
- img_types.length > 0
- end
-
- # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
- # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
- def price
- $1.tr('$','').to_f if label and PRICE.match label
- end
-
- # Returns the post contents with all html tags removed
- def contents_as_plain
- strip_html contents
+
+ ret
  end
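
scrape_until pages through a listing and stops as soon as the block returns true. A sketch with a hypothetical url and a one-week cutoff, using the post_date comparison the other legacy methods also rely on:

    url = 'http://miami.craigslist.org/sss/'
    week_ago = Time.now - 7*24*3600
    posts = CraigScrape.scrape_until(url) {|post| post.post_date < week_ago }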

- # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
- # 'system_post' we may get tags in here
- def header_as_plain
- strip_html header
+ # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+ # Instead, consider using CraigScrape::Posting.new
+ #
+ # Scrapes a single Post Url, and returns a Posting object representing its contents.
+ # Mostly here to preserve backwards-compatibility with the older api; CraigScrape::Posting.new "post_url" does the same thing
+ def scrape_full_post(post_url)
+ CraigScrape::Posting.new post_url
  end

- # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original.
- # This returns true or false, depending on whether that case applies
- def system_post?
- [contents,posting_id,post_time,title].all?{|f| f.nil?}
+ # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+ # Instead, consider using the CraigScrape#each_post method.
+ #
+ # Continually scrapes listings, using the supplied url as a starting point, until 'count' summaries have been retrieved
+ # or no more 'next page' links are available to be clicked on. Returns an array of PostSummary objects.
+ def scrape_posts(listing_url, count)
+ count_so_far = 0
+ self.scrape_until(listing_url) {|post| count_so_far+=1; count < count_so_far }
  end

- private
-
- # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place.
- # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
- # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
- def user_body
- $1 if USERBODY_PARTS.match html.to_s
- end
-
- # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
- # So - we'll return it as an Hpricot object.
- def craigslist_body
- Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
+ # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
+ # Instead, consider using the CraigScrape#posts_since method.
+ #
+ # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
+ # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+ # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method, could achieve
+ # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
+ #
+ # <b>Note:</b> The results will not include post summaries having the newer_then date themselves.
+ def scrape_posts_since(listing_url, newer_then)
+ self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
  end
-
  end

-
- # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
- class Listings < Scraper
- LABEL = /^(.+?)[ ]*\-$/
- LOCATION = /^[ ]*\((.*?)\)$/
- IMG_TYPE = /^[ ]*(.+)[ ]*$/
- HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
- SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
- NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
-
- # Array, PostSummary objects found in the listing
- def posts
- unless @posts
- current_date = nil
- @posts = []

- post_tags = html.get_elements_by_tag_name('p','h4')
-
- # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
- post_tags.pop if (
- post_tags.length > 0 and
- post_tags.last.at('a') and
- NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
- )
-
- # Now we iterate through the listings:
- post_tags.each do |el|
- case el.name
- when 'p'
- post_summary = self.class.parse_summary el, current_date
-
- # Validate that required fields are present:
- parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
-
- post_summary[:url] = url_from_href post_summary[:href]
-
- @posts << CraigScrape::Posting.new(post_summary)
- when 'h4'
- # Let's make sense of the h4 tag, and then read all the p tags below it
- if HEADER_DATE.match he_decode(el.inner_html)
- # Generally, the H4 tags contain valid dates. When they do - this is easy:
- current_date = CraigScrape.most_recently_expired_time $1, $2
- elsif html.at('h4:last-of-type') == el
- # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
- # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
- # we need to pull up the full post in order to accurately tell the date.
- # Setting this to nil will achieve the eager-load.
- current_date = nil
- end
- end
- end
- end
-
- @posts
- end
-
- # String, URL Path href-fragment of the next page link
- def next_page_href
- unless @next_page_href
- cursor = html.at 'p:last-of-type'
-
- cursor = cursor.at 'a' if cursor
-
- # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
- next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-
- # Search listings put their next page in a link towards the top
- next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
-
- # Some search pages have a bug, whereby a 'next page' link isn't displayed,
- # even though we can see that there's another page listed in the page-number links block at the top
- # and bottom of the listing page
- unless next_link
- cursor = html % 'div.sh:first-of-type > b:last-of-type'
-
- # If there's no 'a' in the next sibling, we'll have just performed a nil assignment; otherwise
- # we're looking good.
- next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
- end
-
- # We have an anchor tag - so - let's assign the href:
- @next_page_href = next_link[:href] if next_link
- end
-
- @next_page_href
- end
+ private
+
+ # This takes a fragments parameter, and turns it into actual urls
+ def listing_urls_for(listing_fragments)
+ listing_fragments.collect{ |lf|
+ # This removes any /'s from the beginning of the fragment
+ lf = $1 if /^\/(.*)/.match lf
+ # This adds a '/' to the end of a path, so long as it's not a query we're dealing with...
+ lf += '/' unless lf.index '?'
+ sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
+ }.flatten
+ end
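
A worked example of the url construction above, assuming the default 'http://' prefix and a resolved site of 'miami.craigslist.org' (hypothetical):

    # listing_urls_for ['/sss']
    #   '/sss'  ->  'sss'  ->  'sss/'  ->  'http://miami.craigslist.org/sss/'
    # A fragment containing a query keeps its form (no trailing slash added):
    # listing_urls_for ['search/sss?query=guitar']
    #   ->  'http://miami.craigslist.org/search/sss?query=guitar'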

- # String, Full URL Path of the 'next page' link
- def next_page_url
- (next_page_href) ? url_from_href(next_page_href) : nil
- end
+ # Returns the most recently expired time for the provided month and day
+ def self.most_recently_expired_time(month, day) #:nodoc:
+ now = (time_now) ? time_now : Time.now

- # Takes a paragraph element and returns a mostly-parsed Posting.
- # We separate this from the rest of the parsing, both for readability and ease of testing
- def self.parse_summary(p_element, date = nil) #:nodoc:
- ret = {}
-
- title_anchor, section_anchor = p_element.search 'a'
- location_tag = p_element.at 'font'
- has_pic_tag = p_element.at 'span'
-
- href = nil
-
- location = he_decode p_element.at('font').inner_html if location_tag
- ret[:location] = $1 if location and LOCATION.match location
-
- ret[:img_types] = []
- if has_pic_tag
- img_type = he_decode has_pic_tag.inner_html
- img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
-
- ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
- end
-
- ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
-
- ret[:post_date] = date
- if SUMMARY_DATE.match he_decode(p_element.children[0])
- ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
- end
-
- if title_anchor
- label = he_decode title_anchor.inner_html
- ret[:label] = $1 if LABEL.match label
+ # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
+ ret = Time.local now.year, month, day
+ ret = Time.local now.year-1, month, day if ret > now

- ret[:href] = title_anchor[:href]
- end
-
- ret
- end
- end
-
- # GeoListings represents a parsed Craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
- # These list all the craigslist sites in a given region.
- class GeoListings < Scraper
- LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
- GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
-
- # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
- # In addition though, here we'll accept an array like %w(us fl) which gets converted to
- # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
- def initialize(init_via = nil)
- super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
-
- # Validate that required fields are present, at least - if we've downloaded it from a url
- parse_error! unless location
- end
-
- # Returns the GeoLocation's full name
- def location
- unless @name
- cursor = html % 'h3 > b > a:first-of-type'
- cursor = cursor.next_node if cursor
- @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
- end
-
- @name
- end
-
- # Returns a hash of site names to urls in the current listing
- def sites
- unless @sites
- @sites = {}
- (html / 'div#list > a').each do |el_a|
- site_name = he_decode strip_html(el_a.inner_html)
- @sites[site_name] = el_a[:href]
- end
- end
-
- @sites
- end
+ ret
  end
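
The year-guessing step above can be illustrated with concrete values; a sketch assuming time_now is unset and "today" is 2010-01-15:

    CraigScrape.most_recently_expired_time 6, 1   # Jun 1 of this year would be in the future,
                                                  # so this returns Time.local(2009, 6, 1)
    CraigScrape.most_recently_expired_time 1, 1   # Jan 1 has already passed,
                                                  # so this returns Time.local(2010, 1, 1)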

  end