libcraigscrape 0.7.0 → 0.8.0

data/bin/craigwatch CHANGED
@@ -15,10 +15,14 @@
  # - price_required - yes/no
  # - price_greater_than - (int)
  # - price_less_than - (int)
- # - full_post_has - (string or regexp) Only posts whose full-post's contents contain/match
- # - full_post_has_no - (string or regexp) Only posts whose full-post's contents don't contain/match
- # - summary_post_has - (string or regexp) Only posts whose listing's label contains/matches
- # - summary_post_has_no - (string or regexp) Only posts whose listing's label doesn't contain/match
+ # - full_post_has - (array of string or regexp) Only posts whose full-post's contents contain/match
+ # - full_post_has_no - (array of string or regexp) Only posts whose full-post's contents don't contain/match
+ # - summary_post_has - (array of string or regexp) Only posts whose listing's label contains/matches
+ # - summary_post_has_no - (array of string or regexp) Only posts whose listing's label doesn't contain/match
+ # - summary_or_full_post_has - (array of string or regexp) Filters out results which don't match either the post label <b>or</b> the post contents
+ # - summary_or_full_post_has_no - (array of string or regexp) Filters out results which match either the post label <b>or</b> the post contents
+ # - location_has - (array of string or regexp) Only include posts which match against the post location
+ # - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
  #
  # Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
  #
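
For orientation, the new array-valued keys slot into a search definition like any other filter. A minimal sketch (the key names come from the option list above; the site, listing path, and patterns are hypothetical):

    searches:
      - name: Example search using the new 0.8.0 filters
        sites: [ us/fl/miami ]
        listings: [ /search/sss?query=bike ]
        summary_or_full_post_has: [ /cruiser/i ]
        location_has_no: [ /broward/i ]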
@@ -58,8 +62,44 @@
  # - debug_craigscrape
  #
  # == Definition File Sample
- #
- # Here's a simple annotated report which uses most of the available craigwatch features:
+ #
+ # Let's start with a minimal report, just enough to get something working quickly:
+ # # We need some kind of destination to send this to
+ # email_to: Chris DeRose <cderose@derosetechnologies.com>
+ #
+ # # This is an array of specific 'searches' we'll be performing in this report:
+ # searches:
+ # # We're looking for a 90's era cadillac, something cheap, comfortable and in white...
+ # - name: 90's White/Creme Convertible Cadillacs
+ #
+ # # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build.
+ # # It's optional, and if omitted, craigwatch defaults to 'yesterday'
+ # starting: 9/10/09
+ #
+ # # We want to check all the labels, and filter out years not in the 90's, and cars not made by cadillac
+ # summary_post_has:
+ # - /(?:^|[^\d]|19)9[\d](?:[^\dk]|$)/i
+ # - /cadillac/i
+ #
+ # # I said we're looking for something *comfortable* !
+ # summary_post_has_no: [ /xlr/i ]
+ #
+ # # We want a convertible, in white/cream/etc:
+ # full_post_has:
+ # - /convertible/i
+ # - /(white|yellow|banana|creme|cream)/i
+ #
+ # # Convertible - not *simulated* convertible!
+ # full_post_has_no:
+ # - /simulated[^a-z]{0,2}convertible/i
+ #
+ # # We want to search all of craigslist's US sites, and we'll want to find it using
+ # # the '/search/cta?hasPic=1&query=cadillac' url on the site
+ # sites: [ us ]
+ # listings:
+ # - /search/cta?hasPic=1&query=cadillac
+ #
+ # Here's another annotated report which uses most of the other available craigwatch features:
  #
  # # The report_name is fed into Time.now.strftime, hence the formatting characters
  # report_name: Craig Watch For Johnathan on %D at %I:%M %p
@@ -73,13 +113,13 @@
  # searches:
  # # Search #1:
  # - name: Schwinn Bikes For Sale in/near New York
+ # starting: 9/10/2009
+ #
+ # # Scrape the following sites/servers:
+ # sites: [ us/ny/newyork, us/nj/southjersey ]
  #
  # # Scrape the following listings pages:
- # listing:
- # - http://newyork.craigslist.org/bik/
- # - http://newyork.craigslist.org/jsy/bik/
- # # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build
- # starting: 5/2/2009
+ # listings: [ bik ]
  #
  # # We want listings with Schwinn in the summary
  # summary_post_has: [ /schwinn/i ]
@@ -92,10 +132,13 @@
  #
  # # Search #2
  # - name: Large apartment rentals in San Francisco
+ # sites: [ us/ca/sfbay ]
+ # starting: 9/10/2009
+ #
  # # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
  # # want to conserve some bandwidth
- # listing: [ http://sfbay.craigslist.org/search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
- # starting: 5/2/2009
+ # listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
+ #
  # # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
  # price_required: yes
  #
@@ -149,24 +192,52 @@ end
  class CraigReportDefinition #:nodoc:
  include Kwalify::Util::HashLike

+ EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
+
  attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, :smtp_settings

  def debug_database?; @debug_database; end
  def debug_mailer?; @debug_mailer; end
  def debug_craigscrape?; @debug_craigscrape; end
-
- def each_search(&block); searches.each &block; end

  def email_from
  (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
  end

+ def email_to_name
+ EMAIL_NAME_PARTS.match(email_to) ? $1 : email_to
+ end
+
+ def report_name
+ @report_name ? @report_name : "Craigslist Watch For #{email_to_name} on %D at %I:%M %p"
+ end
+
+ # We allow people to use relative (sqlite) dbfiles, by taking the definition file's path as a parameter
+ def tracking_database(for_yaml_file = nil)
+ # We'll set up a SQLite db using some defaults if needed
+ @tracking_database ||= {
+ :adapter => 'sqlite3',
+ :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
+ } if for_yaml_file
+
+ # This is a little hack to make sqlite definitions a little more portable, by allowing them
+ # to specify dbfiles relative to the yml's directory:
+ ret = @tracking_database
+ ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
+ for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
+ )
+
+ ret
+ end
+
  class SearchDefinition #:nodoc:
  include Kwalify::Util::HashLike

- attr_reader :name, :listing
+ attr_reader :name, :sites, :listings
+ attr_reader :location_has, :location_has_no
  attr_reader :full_post_has, :full_post_has_no
  attr_reader :summary_post_has, :summary_post_has_no
+ attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no

  attr_reader :price_greater_than,:price_less_than

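To make the relative-dbfile hack concrete, here's a hedged sketch of what tracking_database returns (hypothetical file names; string keys as they'd be parsed from the YAML):

    # Given a definition file at reports/watch_bikes.yml containing:
    #   tracking_database: { adapter: sqlite3, dbfile: bikes.db }
    # the relative dbfile gets re-anchored to the yml's directory:
    craig_report.tracking_database 'reports/watch_bikes.yml'
    # => { 'adapter' => 'sqlite3', 'dbfile' => 'reports/bikes.db' }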
@@ -188,15 +259,24 @@ class CraigReportDefinition #:nodoc:
  return false if @price_less_than and post.price >= @price_less_than
  end

+ # Label Filters:
  return false unless matches_all? summary_post_has, post.label
  return false unless doesnt_match_any? summary_post_has_no, post.label

- if full_post_has or full_post_has_no
+ # Location Filters:
+ return false unless matches_all? location_has, post.location
+ return false unless doesnt_match_any? location_has_no, post.location
+
+ # Full post Filters:
+ if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
  # We're going to download the page, so let's make sure we didn't hit a "This posting has been flagged for removal"
  return false if post.system_post?

  return false unless matches_all? full_post_has, post.contents_as_plain
  return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
+
+ return false unless matches_all? summary_or_full_post_has, [post.contents_as_plain, post.label]
+ return false unless doesnt_match_any? summary_or_full_post_has_no, [post.contents_as_plain, post.label]
  end

  true
@@ -205,37 +285,66 @@ class CraigReportDefinition #:nodoc:
  private

  def matches_all?(conditions, against)
- (conditions.nil? or conditions.all?{|c| match_against c, against}) ? true : false
+ against = against.to_a
+ (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
  end

  def doesnt_match_any?(conditions, against)
- (conditions.nil? or conditions.all?{|c| !match_against c, against}) ? true : false
+ against = against.to_a
+ (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
  end

  def match_against(condition, against)
- (against.scan( condition.is_re? ? condition.to_re : condition).length > 0) ? true : false
+ (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
  end
  end
  end

  class TrackedSearch < ActiveRecord::Base #:nodoc:
- has_many :tracked_posts, :dependent => :destroy
+ has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
  validates_uniqueness_of :search_name
  validates_presence_of :search_name

- def already_tracked?(url)
- ( self.tracked_posts.find :first, :conditions => ['url = ?', url]) ? true : false
+ def self.find_by_name(name)
+ self.find :first, :conditions => ['search_name = ?',name]
  end

+ def find_listing_by_url(url)
+ listings.find :first, :conditions => ['url = ?', url]
+ end
+ end
+
+ class TrackedListing < ActiveRecord::Base #:nodoc:
+ has_many :posts, :dependent => :destroy, :class_name => 'TrackedPost'
+ validates_presence_of :url, :tracked_search_id
+
+ def already_tracked?(url)
+ ( self.posts.find :first, :conditions => ['url = ?', url]) ? true : false
+ end
+
  def last_tracked_at
- self.tracked_posts.maximum 'created_at'
+ self.posts.maximum 'created_at'
+ end
+
+ def delete_posts_older_than(cutoff_date)
+ # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
+ TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
  end
  end

  class TrackedPost < ActiveRecord::Base #:nodoc:
- belongs_to :tracked_search
- validates_presence_of :url, :tracked_search_id
- validates_uniqueness_of :url, :scope => :tracked_search_id
+ validates_presence_of :url, :tracked_listing_id
+
+ def self.activate_all!
+ TrackedPost.update_all(
+ { :active => true },
+ [ 'active = ?', false ]
+ )
+ end
+
+ def self.destroy_inactive!
+ TrackedPost.delete_all [ 'active = ?', false ]
+ end
  end

  class ReportMailer < ActionMailer::Base #:nodoc:
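
Two behavioral details in the private helpers above are easy to miss: a plain-string condition is now compiled into a case-insensitive regexp, and 'against' may be an array (as it is for summary_or_full_post_has), where a hit on any element satisfies the condition. A standalone illustration of those semantics (plain Ruby, not code from the package):

    condition = 'schwinn'
    'Schwinn Cruiser For Sale'.scan(/#{condition}/i).length > 0    # => true

    # With multiple targets, matching either one is enough:
    targets = ['full post contents here', 'Summary Label Here']
    [/label/i].all?{|c| targets.any?{|t| t.scan(c).length > 0 } }  # => true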
@@ -279,9 +388,9 @@ end
  parser = Kwalify::Yaml::Parser.new(
  Kwalify::Validator.new(
  Kwalify::Yaml.load_file(File.dirname(__FILE__)+'/craig_report_schema.yml')
- )
+ ),
+ :data_binding => true
  )
- parser.data_binding = true

  craig_report = parser.parse_file report_definition_file

@@ -300,7 +409,7 @@ ReportMailer.template_root = File.dirname __FILE__

  # Initialize the database:
  ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
- ActiveRecord::Base.establish_connection craig_report.tracking_database
+ ActiveRecord::Base.establish_connection craig_report.tracking_database(report_definition_file)

  # Initialize CraigScrape (sorta)
  CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
@@ -311,68 +420,123 @@ ActiveRecord::Schema.define do
  create_table :tracked_searches do |t|
  t.column :search_name, :string
  end unless table_exists? :tracked_searches
-
- create_table :tracked_posts do |t|
+
+ create_table :tracked_listings do |t|
  t.column :url, :string
  t.column :tracked_search_id, :integer
+ end unless table_exists? :tracked_listings
+
+ create_table :tracked_posts do |t|
+ t.column :url, :string
+ t.column :tracked_listing_id, :integer
  t.column :created_at, :date
+ t.column :active, :boolean, :default => 0
  end unless table_exists? :tracked_posts
  end
  end

+ # Remove all posts which are inactive. They would be in there if the prior run was a failure.
+ TrackedPost.destroy_inactive!
+
  # We'll need these outside this next loop:
- report_summaries = []
  newly_tracked_posts = []

  # Now let's run a report:
- craig_report.each_search do |search|
-
+ report_summaries = craig_report.searches.collect do |search|
  # Load our tracking info
- search_track = TrackedSearch.find :first, :conditions => ['search_name = ?',search.name]
+ search_track = TrackedSearch.find_by_name search.name

  # No Tracking found - let's set one up:
  search_track = TrackedSearch.create! :search_name => search.name unless search_track
+
+ # This hash tracks what makes it into the report on this search.
+ # NOTE that keys are url's b/c sometimes the same posting will end up in multiple listings,
+ # and doing this ensures that we don't end up reporting the same post twice.
+ new_summaries = {}
+
+ # And now we actually scrape:
+ CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
+ # Keep in mind that listing.url does change in the while loop.
+ # But, this first one is a good base_url that will never change between runs.

- last_tracked_at = (search_track.last_tracked_at) ? search_track.last_tracked_at : search.starting_at
+ tracked_listing = search_track.find_listing_by_url listing.url
+ tracked_listing ||= search_track.listings.create! :url => listing.url
+
+ # Gives us a sane stopping point (hopefully) :
+ last_tracked_at = tracked_listing.last_tracked_at
+ last_tracked_at ||= search.starting_at

- already_tracked_urls = search_track.tracked_posts.collect{|tp| tp.url}
+ # Some more stopping points (probably):
+ already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}

- # Let's collect all the summaries that could apply:
- new_summaries = {}
- search.listing.each do |listing|
- CraigScrape.scrape_until(listing){|p| p.post_date <= last_tracked_at or already_tracked_urls.include? p.url }.each do |p_s|
- new_summaries[p_s.url] = p_s unless new_summaries.has_key? p_s.url
+ # We'll use this in the loop to decide what posts to track:
+ newest_post_date = last_tracked_at
+
+ # OK - Now let's go!
+ catch :list_break do
+ while listing
+ listing.posts.each do |post|
+ begin
+ # Are we at a point in the scrape, past which we don't need to proceed?
+ throw :list_break if (
+ post.post_date < last_tracked_at or
+ already_tracked_urls.include? post.url
+ )
+
+ # If we want to report this post, add it to the collection:
+ new_summaries[post.url] = post if (
+ !new_summaries.has_key? post.url and
+ search.passes_filter? post
+ )
+ rescue CraigScrape::Scraper::ResourceNotFoundError => e
+ # Sometimes we do end up with 404's that will never load, and we don't want to
+ # abort a run simply b/c we found some anomaly due to the craigslist index
+ # being out of date. This ResourceNotFoundError can occur due to
+ # loading the post url in full, only to see that it was yanked - or craigslist
+ # is acting funny.
+ next
+ end
+
+ # Now let's see if the url should be kept in our tracking database for the future...
+
+ # This post-date sets a limit for the tracked_listing.posts.create below
+ newest_post_date = post.post_date if post.post_date > newest_post_date
+
+ # Now let's add these urls to the database so as to reduce memory overhead.
+ # Keep in mind - they're not active until the email goes out.
+ # Also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
+ # the newest are always the first ones parsed:
+ tracked_listing.posts.create(
+ :url => post.url,
+ :created_at => newest_post_date
+ ) unless post.post_date < newest_post_date
+
+ end
+
+ listing = listing.next_page
+ end
  end
  end

  # Let's flatten the unique'd hash into a more usable array:
  new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom

- # Let's tag all the newest tracked posts that should go into the database:
- # NOTE: Since all the dates are at_begining_of_day, we'll effectively have a chunk of dates tied for latest
- new_summaries.reject{|p| p.post_date < new_summaries.last.post_date}.each do |p_s|
- newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.url, :created_at => p_s.post_date)
- end
-
- # Reject anything from this report which doesn't match the has/has_no :
- new_summaries.reject!{|s| !search.passes_filter? s }
-
  # Now let's manage the tracking database:
  if new_summaries.length > 0

  # We'll use this in the cleanup at the bottom:
  latest_post_date = new_summaries.last.post_date

- new_summaries.reverse! if search.newest_first?
-
- # We'll want to email these...
- report_summaries << {
- :postings => new_summaries,
- :search => search,
- :search_track => search_track,
- :latest_post_date => latest_post_date
- }
+ new_summaries.reverse! if search.newest_first?
  end
+
+ # We'll want to email these...
+ {
+ :latest_post_date => latest_post_date,
+ :search_track => search_track,
+ :postings => new_summaries,
+ :search => search
+ }
  end

  # Time to send the email:
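
The new :active column amounts to a crude two-phase commit: posts are created inactive during the scrape, and only flipped active after the report email actually goes out, so a crashed run leaves nothing falsely marked as reported. Condensed from the code above and below (a sketch, not a literal excerpt):

    TrackedPost.destroy_inactive!   # sweep rows left over from a failed prior run
    # ... scrape, creating tracked posts with :active => false ...
    # ... ReportMailer.deliver_report ...
    TrackedPost.activate_all!       # commit: reported posts are now tracked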
@@ -383,11 +547,12 @@ ReportMailer.deliver_report(
  {:summaries => report_summaries, :definition => craig_report}
  ) if report_summaries.length > 0

- # Save the newly created posts:
- newly_tracked_posts.each{|tp| tp.save!}
+ # Commit (make 'active') all newly created tracked post urls:
+ TrackedPost.activate_all!

- # Now that we know the user has been informed, let's commit all our database changes and end this scrape 'transaction':
- report_summaries.each do |s|
- # Let's do some light cleanup to keep the database size down, by removing all the old posts we're no longer tracking:
- TrackedPost.delete_all [ 'tracked_search_id = ? AND created_at < ?', s[:search_track].id, s[:latest_post_date] ]
+ # Now remove all the no-longer-needed posts from the prior run:
+ report_summaries.each do |summary|
+ summary[:search_track].listings.each do |listing|
+ listing.delete_posts_older_than listing.last_tracked_at
+ end
  end
@@ -1,14 +1,17 @@
  <h2><%=h @subject %></h2>
-
  <%@summaries.each do |summary| %>
  <h3><%=h summary[:search].name%></h3>
- <%summary[:postings].each do |post|%>
- <%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
- h(post.post_date.strftime('%b %d')),
- post.url,
- h(post.label),
- (post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
- (post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
- ] -%>
+ <% if summary[:postings].length > 0 %>
+ <%summary[:postings].each do |post|%>
+ <%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
+ h(post.post_date.strftime('%b %d')),
+ post.url,
+ h(post.label),
+ (post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
+ (post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
+ ] -%>
+ <% end %>
+ <% else %>
+ <p><i>No new postings were found which matched the search criteria.</i></p>
  <% end %>
  <% end %>
@@ -3,6 +3,7 @@ CRAIGSLIST REPORTER
  <%@summaries.each do |summary| -%>
  <%=summary[:search].name %>
  <% summary[:postings].collect do |post| -%>
+ <% if summary[:postings].length > 0 %>
  <%='%s : %s %s %s %s' % [
  post.post_date.strftime('%b %d'),
  post.label,
@@ -10,6 +11,8 @@ CRAIGSLIST REPORTER
  (post.has_pic_or_img?) ? ' [img]': '',
  post.url
  ] -%>
-
+ <% else %>
+ No new postings were found which matched the search criteria.
+ <% end %>
  <% end %>
  <% end -%>
@@ -0,0 +1,144 @@
+ # = About geo_listings.rb
+ #
+ # This file contains the parsing code, and logic relating to geographic site pages and paths. You
+ # should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+ #
+
+ require 'scraper'
+
+ class CraigScrape
+
+ # GeoListings represents a parsed Craigslist geo listing page. (e.g. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+ # These list all the craigslist sites in a given region.
+ class GeoListings < Scraper
+ GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+ LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+ PATH_SCANNER = /(?:\\\/|[^\/])+/
+ URL_HOST_PART = /^[^\:]+\:\/\/([^\/]+)[\/]?$/
+ SITE_PREFIX = /^([^\.]+)/
+ FIND_SITES_PARTS = /^[ ]*([\+|\-]?)[ ]*(.+)[ ]*/
+
+ class BadGeoListingPath < StandardError #:nodoc:
+ end
+
+ # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+ # See CraigScrape.find_sites for a more powerful way to find craigslist sites.
+ def initialize(init_via = nil)
+ super(init_via)
+
+ # Validate that required fields are present, at least - if we've downloaded it from a url
+ parse_error! unless location
+ end
+
+ # Returns the GeoLocation's full name
+ def location
+ unless @location
+ cursor = html % 'h3 > b > a:first-of-type'
+ cursor = cursor.next_node if cursor
+ @location = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+ end
+
+ @location
+ end
+
+ # Returns a hash of site name to urls in the current listing
+ def sites
+ unless @sites
+ @sites = {}
+ (html / 'div#list > a').each do |el_a|
+ site_name = he_decode strip_html(el_a.inner_html)
+ @sites[site_name] = $1 if URL_HOST_PART.match el_a[:href]
+ end
+ end
+
+ @sites
+ end
+
+ # This method will return an array of all possible sites that match the specified location path.
+ # Sample location paths:
+ # - us/ca
+ # - us/fl/miami
+ # - jp/fukuoka
+ # - mx
+ # Here's how location paths work.
+ # - The components of the path are to be separated by '/' 's.
+ # - Up to (and optionally, not including) the last component, the path should correspond to a valid GeoLocation url with the prefix of 'http://geo.craigslist.org/iso/'
+ # - The last component can either be a site's 'prefix' on a GeoLocation page, or the last component can just be a geolocation page itself, in which case all the sites on that page are selected.
+ # - The site prefix is the first dns record in a website listed on a GeoLocation page. (So, for the case of us/fl/miami, the last 'miami' corresponds to the 'south florida' link on {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl])
+ def self.sites_in_path(full_path, base_url = GEOLISTING_BASE_URL)
+ # the base_url parameter is mostly so we can test this method
+
+ # Unfortunately - the easiest way to understand much of this is to see how craigslist returns
+ # these geolocations. Watch what happens when you request us/fl/non-existant/page/here.
+ # I also made this a little forgiving in a couple ways not specified with official support, per
+ # the rules above.
+ full_path_parts = full_path.scan PATH_SCANNER
+
+ # We'll either find a single site in this loop and return that, or we'll find a whole listing
+ # and set the geo_listing object to reflect that
+ geo_listing = nil
+ full_path_parts.each_with_index do |part, i|
+
+ # Let's un-escape the path-part, if needed:
+ part.gsub! "\\/", "/"
+
+ # If they're specifying a single site, this will catch and return it immediately
+ site = geo_listing.sites.find{ |n,s|
+ (SITE_PREFIX.match s and $1 == part) or n == part
+ } if geo_listing
+
+ # This returns the site component of the found array
+ return [site.last] if site
+
+ begin
+ # The URI escape is mostly needed to translate the space characters
+ l = GeoListings.new base_url+full_path_parts[0...i+1].collect{|p| URI.escape p}.join('/')
+ rescue CraigScrape::Scraper::FetchError
+ bad_geo_path! full_path
+ end
+
+ # This probably tells us the first part of the path was 'correct', but not the rest:
+ bad_geo_path! full_path if geo_listing and geo_listing.location == l.location
+
+ geo_listing = l
+ end
+
+ # We found a valid listing page, and we can just return all the sites on it:
+ geo_listing.sites.collect{|n,s| s }
+ end
+
+ # find_sites takes a single array of strings as an argument. Each string is to be either a location path
+ # (see sites_in_path), or a full site (in canonical form - ie "memphis.craigslist.org"). Optionally,
+ # each of these may/should contain a '+' or '-' prefix to indicate whether the string is supposed to
+ # include sites from the master list, or remove them from the list. If no '+' or '-' is
+ # specified, the default assumption is '+'. Strings are processed from left to right, which gives
+ # a high degree of control over the selection set. Examples:
+ # - find_sites "us/fl", "- miami.craigslist.org"
+ # - find_sites "us", "- us/nm"
+ # - find_sites "us", "- us/ny", "+ newyork.craigslist.org"
+ # - find_sites "us/ny", "us/id", "caribbean.craigslist.org"
+ # There's a lot of flexibility here, you get the idea.
+ def self.find_sites(specs, base_url = GEOLISTING_BASE_URL)
+ ret = []
+
+ specs.each do |spec|
+ (op,spec = $1,$2) if FIND_SITES_PARTS.match spec
+
+ spec = (spec.include? '.') ? [spec] : sites_in_path(spec, base_url)
+
+ (op == '-') ? ret -= spec : ret |= spec
+ end
+
+ ret
+ end
+
+ private
+
+ def self.bad_geo_path!(path)
+ raise BadGeoListingPath, "Unable to load path #{path.inspect}, either you're having problems connecting to Craigslist, or your path is invalid."
+ end
+
+ end
+ end
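
As a usage sketch, this is roughly how craigwatch's sites: arrays resolve to craigslist servers (the paths are hypothetical; both methods are documented above, and each call fetches live pages from geo.craigslist.org):

    require 'libcraigscrape'

    # All Florida sites except Miami:
    CraigScrape::GeoListings.find_sites ["us/fl", "- miami.craigslist.org"]

    # Every site listed on the us/fl geo page:
    CraigScrape::GeoListings.sites_in_path "us/fl"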