libcraigscrape 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +19 -0
- data/README +27 -11
- data/Rakefile +44 -2
- data/bin/craig_report_schema.yml +30 -21
- data/bin/craigwatch +232 -67
- data/bin/report_mailer/craigslist_report.html.erb +12 -9
- data/bin/report_mailer/craigslist_report.plain.erb +4 -1
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +158 -650
- data/lib/listings.rb +144 -0
- data/lib/posting.rb +293 -0
- data/lib/scraper.rb +203 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/test_craigslist_geolisting.rb +476 -380
- metadata +28 -2
data/bin/craigwatch
CHANGED
@@ -15,10 +15,14 @@
 # - price_required - yes/no
 # - price_greater_than - (int)
 # - price_less_than - (int)
-# - full_post_has - (string or regexp) Only post whose full-post's contents contains/matches
-# - full_post_has_no - (string or regexp) Only post whose full-post's contents doesn't contain/match
-# - summary_post_has - (string or regexp) Only post whose listing's label contains/matches
-# - summary_post_has_no - (string or regexp) Only post whose listing's label doesn't contain/match
+# - full_post_has - (array of string or regexp) Only post whose full-post's contents contains/matches
+# - full_post_has_no - (array of string or regexp) Only post whose full-post's contents doesn't contain/match
+# - summary_post_has - (array of string or regexp) Only post whose listing's label contains/matches
+# - summary_post_has_no - (array of string or regexp) Only post whose listing's label doesn't contain/match
+# - summary_or_full_post_has - (array of string or regexp) Filters out results which don't match either the post label <b>or</b> the post contents
+# - summary_or_full_post_has_no - (array of string or regexp) Filters out results which match either the post label <b>or</b> the post contents
+# - location_has - (array of string or regexp) Only include posts which match against the post location
+# - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
 #
 # Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
 #
@@ -58,8 +62,44 @@
 # - debug_craigscrape
 #
 # == Definition File Sample
-#
-#
+#
+# Let's start with a minimal report, just enough to get something quick working:
+# # We need some kind of destination to send this to
+# email_to: Chris DeRose <cderose@derosetechnologies.com>
+#
+# # This is an array of specific 'searches' we'll be performing in this report:
+# searches:
+# # We're looking for a 90's era cadillac, something cheap, comfortable and in white...
+# - name: 90's White/Creme Convertible Cadillacs
+#
+# # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build.
+# # It's optional, and if omitted, craigwatch defaults to 'yesterday'
+# starting: 9/10/09
+#
+# # We want to check all the labels, and filter out years not in the 90's, and cars not made by cadillac
+# summary_post_has:
+# - /(?:^|[^\d]|19)9[\d](?:[^\dk]|$)/i
+# - /cadillac/i
+#
+# # I said we're looking for something *comfortable* !
+# summary_post_has_no: [ /xlr/i ]
+#
+# # We want convertible, and white/cream/etc:
+# full_post_has:
+# - /convertible/i
+# - /(white|yellow|banana|creme|cream)/i
+#
+# # Convertible - not *simulated* convertible!
+# full_post_has_no:
+# - /simulated[^a-z]{0,2}convertible/i
+#
+# # We want to search all of craigslist's sites in the us, and we'll want to find it using
+# # the '/search/cta?hasPic=1&query=cadillac' url on the site
+# sites: [ us ]
+# listings:
+# - /search/cta?hasPic=1&query=cadillac
+#
+# Here's another annotated report which uses most of the other available craigwatch features:
 #
 # # The report_name is fed into Time.now.strftime, hence the formatting characters
 # report_name: Craig Watch For Johnathan on %D at %I:%M %p
@@ -73,13 +113,13 @@
 # searches:
 # # Search #1:
 # - name: Schwinn Bikes For Sale in/near New York
+# starting: 9/10/2009
+#
+# # Scrape the following sites/servers:
+# sites: [ us/ny/newyork, us/nj/southjersey ]
 #
 # # Scrape the following listings pages:
-#
-# - http://newyork.craigslist.org/bik/
-# - http://newyork.craigslist.org/jsy/bik/
-# # This starting date is mostly for the first run, and gives us a reasonable cut-off point from whcih to build
-# starting: 5/2/2009
+# listings: [ bik ]
 #
 # # We want listings with Schwinn in the summary
 # summary_post_has: [ /schwinn/i ]
@@ -92,10 +132,13 @@
 #
 # # Search #2
 # - name: Large apartment rentals in San Francisco
+# sites: [ us/ca/sfbay ]
+# starting: 9/10/2009
+#
 # # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
 # # want to conserve some bandwidth
-#
-#
+# listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
+#
 # # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
 # price_required: yes
 #
@@ -149,24 +192,52 @@ end
 class CraigReportDefinition #:nodoc:
   include Kwalify::Util::HashLike
 
+  EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
+
   attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, :smtp_settings
 
   def debug_database?; @debug_database; end
   def debug_mailer?; @debug_mailer; end
   def debug_craigscrape?; @debug_craigscrape; end
-
-  def each_search(&block); searches.each &block; end
 
   def email_from
     (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
   end
 
+  def email_to_name
+    EMAIL_NAME_PARTS.match(email_to) ? $1 : email_to
+  end
+
+  def report_name
+    @report_name ? @report_name : "Craigslist Watch For #{email_to_name} on %D at %I:%M %p"
+  end
+
+  # We allow people to use relative (sqlite) dbfiles by taking the yml's path as a parameter
+  def tracking_database(for_yaml_file = nil)
+    # We'll setup a SQLite db using some defaults if needed
+    @tracking_database ||= {
+      :adapter => 'sqlite3',
+      :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
+    } if for_yaml_file
+
+    # This is a little hack to make sqlite definitions a little more portable, by allowing them
+    # to specify dbfiles relative to the yml's directory:
+    ret = @tracking_database
+    ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
+      for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
+    )
+
+    ret
+  end
+
   class SearchDefinition #:nodoc:
     include Kwalify::Util::HashLike
 
-    attr_reader :name, :
+    attr_reader :name, :sites, :listings
+    attr_reader :location_has, :location_has_no
     attr_reader :full_post_has, :full_post_has_no
     attr_reader :summary_post_has, :summary_post_has_no
+    attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
 
     attr_reader :price_greater_than,:price_less_than
 
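The new tracking_database(for_yaml_file) above rewrites a relative sqlite dbfile against the definition file's directory. A minimal standalone sketch of that resolution logic (resolve_dbfile is an illustrative name, not part of the gem):

    # Sketch of the relative-dbfile rewrite performed by tracking_database:
    def resolve_dbfile(tracking_database, yaml_file)
      ret = tracking_database.dup
      # A dbfile with no leading '/' is rewritten relative to the yml's directory:
      ret['dbfile'] = '%s/%s' % [File.dirname(yaml_file), $1] if (
        ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
      )
      ret
    end

    resolve_dbfile({ 'dbfile' => 'searches.db' }, '/home/user/reports/searches.yml')
    # => { 'dbfile' => '/home/user/reports/searches.db' }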
@@ -188,15 +259,24 @@ class CraigReportDefinition #:nodoc:
       return false if @price_less_than and post.price >= @price_less_than
     end
 
+    # Label Filters:
     return false unless matches_all? summary_post_has, post.label
     return false unless doesnt_match_any? summary_post_has_no, post.label
 
-
+    # Location Filters:
+    return false unless matches_all? location_has, post.location
+    return false unless doesnt_match_any? location_has_no, post.location
+
+    # Full post Filters:
+    if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
       # We're going to download the page, so let's make sure we didn't hit a "This posting has been flagged for removal"
       return false if post.system_post?
 
       return false unless matches_all? full_post_has, post.contents_as_plain
       return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
+
+      return false unless matches_all? summary_or_full_post_has, [post.contents_as_plain, post.label]
+      return false unless doesnt_match_any? summary_or_full_post_has_no, [post.contents_as_plain, post.label]
     end
 
     true
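The two summary_or_full_post checks added above pass a two-element array (contents, label) to the matchers, so a condition passes if it matches either text. A self-contained sketch of that semantic, using a hypothetical label and body:

    conditions = [/cadillac/i, /convertible/i]  # e.g. a summary_or_full_post_has value
    against    = ['Runs great, low miles', '1995 Cadillac Convertible']  # [contents, label]

    # Each condition must match at least one of the two texts:
    passes = conditions.all? { |c| against.any? { |a| a.scan(c).length > 0 } }
    # => true - both conditions matched the label, so the body alone didn't have to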
@@ -205,37 +285,66 @@ class CraigReportDefinition #:nodoc:
     private
 
     def matches_all?(conditions, against)
-
+      against = against.to_a
+      (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
     end
 
     def doesnt_match_any?(conditions, against)
-
+      against = against.to_a
+      (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
     end
 
     def match_against(condition, against)
-      (against.scan( condition.is_re? ? condition.to_re : condition).length > 0) ? true : false
+      (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
     end
   end
 end
 
 class TrackedSearch < ActiveRecord::Base #:nodoc:
-  has_many :
+  has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
   validates_uniqueness_of :search_name
   validates_presence_of :search_name
 
-  def
-
+  def self.find_by_name(name)
+    self.find :first, :conditions => ['search_name = ?',name]
   end
 
+  def find_listing_by_url(url)
+    listings.find :first, :conditions => ['url = ?', url]
+  end
+end
+
+class TrackedListing < ActiveRecord::Base #:nodoc:
+  has_many :posts, :dependent => :destroy, :class_name => 'TrackedPost'
+  validates_presence_of :url, :tracked_search_id
+
+  def already_tracked?(url)
+    ( self.posts.find :first, :conditions => ['url = ?', url]) ? true : false
+  end
+
   def last_tracked_at
-    self.
+    self.posts.maximum 'created_at'
+  end
+
+  def delete_posts_older_than(cutoff_date)
+    # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
+    TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
   end
 end
 
 class TrackedPost < ActiveRecord::Base #:nodoc:
-
-
-
+  validates_presence_of :url, :tracked_listing_id
+
+  def self.activate_all!
+    TrackedPost.update_all(
+      { :active => true },
+      [ 'active = ?', false ]
+    )
+  end
+
+  def self.destroy_inactive!
+    TrackedPost.delete_all [ 'active = ?', false ]
+  end
 end
 
 class ReportMailer < ActionMailer::Base #:nodoc:
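One behavioral change worth noting in match_against: a plain-string condition is now interpolated into a case-insensitive regexp rather than scanned as a literal. A quick sketch of the difference:

    condition = 'schwinn'
    label     = 'SCHWINN Cruiser - $100'

    label.scan(condition).length > 0        # => false (0.7.0: literal, case-sensitive scan)
    label.scan(/#{condition}/i).length > 0  # => true  (0.8.0: case-insensitive regexp)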
@@ -279,9 +388,9 @@ end
 parser = Kwalify::Yaml::Parser.new(
   Kwalify::Validator.new(
     Kwalify::Yaml.load_file(File.dirname(__FILE__)+'/craig_report_schema.yml')
-  )
+  ),
+  :data_binding => true
 )
-parser.data_binding = true
 
 craig_report = parser.parse_file report_definition_file
 
@@ -300,7 +409,7 @@ ReportMailer.template_root = File.dirname __FILE__
 
 # Initialize the database:
 ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
-ActiveRecord::Base.establish_connection craig_report.tracking_database
+ActiveRecord::Base.establish_connection craig_report.tracking_database(report_definition_file)
 
 # Initialize CraigScrape (sorta)
 CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
@@ -311,68 +420,123 @@ ActiveRecord::Schema.define do
   create_table :tracked_searches do |t|
     t.column :search_name, :string
   end unless table_exists? :tracked_searches
-
-  create_table :
+
+  create_table :tracked_listings do |t|
     t.column :url, :string
     t.column :tracked_search_id, :integer
+  end unless table_exists? :tracked_listings
+
+  create_table :tracked_posts do |t|
+    t.column :url, :string
+    t.column :tracked_listing_id, :integer
     t.column :created_at, :date
+    t.column :active, :boolean, :default => 0
   end unless table_exists? :tracked_posts
 end
 end
 
+# Remove all posts which are inactive. They would be in there if the prior run was a failure.
+TrackedPost.destroy_inactive!
+
 # We'll need these outside this next loop:
-report_summaries = []
 newly_tracked_posts = []
 
 # Now let's run a report:
-craig_report.
-
+report_summaries = craig_report.searches.collect do |search|
   # Load our tracking info
-  search_track = TrackedSearch.
+  search_track = TrackedSearch.find_by_name search.name
 
   # No Tracking found - let's set one up:
   search_track = TrackedSearch.create! :search_name => search.name unless search_track
+
+  # This hash tracks what makes it into the report on this search.
+  # NOTE that keys are urls, b/c sometimes the same posting will end up in multiple listings,
+  # and doing this ensures that we don't end up reporting the same post twice.
+  new_summaries = {}
+
+  # And now we actually scrape:
+  CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
+    # Keep in mind that listing.url does change in the while loop.
+    # But, this first one is a good base_url that will never change between runs.
 
-
+    tracked_listing = search_track.find_listing_by_url listing.url
+    tracked_listing ||= search_track.listings.create! :url => listing.url
+
+    # Gives us a sane stopping point (hopefully) :
+    last_tracked_at = tracked_listing.last_tracked_at
+    last_tracked_at ||= search.starting_at
 
-
+    # Some more stopping points (probably):
+    already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
 
-
-
-
-
-
+    # We'll use this in the loop to decide what posts to track:
+    newest_post_date = last_tracked_at
+
+    # OK - Now let's go!
+    catch :list_break do
+      while listing
+        listing.posts.each do |post|
+          begin
+            # Are we at a point in the scrape, past which we don't need to proceed?
+            throw :list_break if (
+              post.post_date < last_tracked_at or
+              already_tracked_urls.include? post.url
+            )
+
+            # If we want to report this post, add it to the collection:
+            new_summaries[post.url] = post if (
+              !new_summaries.has_key? post.url and
+              search.passes_filter? post
+            )
+          rescue CraigScrape::Scraper::ResourceNotFoundError => e
+            # Sometimes we do end up with 404's that will never load, and we don't want to
+            # abort a run simply b/c we found some anomaly due to the craigslist index
+            # being out of date. This ResourceNotFoundError can occur due to
+            # loading the post url in full, only to see that it was yanked - or craigslist
+            # is acting funny.
+            next
+          end
+
+          # Now let's see if the url should be kept in our tracking database for the future...
+
+          # This post-date sets a limit for the tracked_listing.posts.create below
+          newest_post_date = post.post_date if post.post_date > newest_post_date
+
+          # Now let's add these urls to the database so as to reduce memory overhead.
+          # Keep in mind - they're not active until the email goes out.
+          # Also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
+          # the newest are always the first ones parsed:
+          tracked_listing.posts.create(
+            :url => post.url,
+            :created_at => newest_post_date
+          ) unless post.post_date < newest_post_date
+
+        end
+
+        listing = listing.next_page
+      end
     end
   end
 
   # Let's flatten the unique'd hash into a more usable array:
   new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
 
-  # Let's tag all the newest tracked posts that should go into the database:
-  # NOTE: Since all the dates are at_begining_of_day, we'll effectively have a chunk of dates tied for latest
-  new_summaries.reject{|p| p.post_date < new_summaries.last.post_date}.each do |p_s|
-    newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.url, :created_at => p_s.post_date)
-  end
-
-  # Reject anything from this report which doesn't match the has/has_no :
-  new_summaries.reject!{|s| !search.passes_filter? s }
-
   # Now Let's manage the tracking database:
   if new_summaries.length > 0
 
     # We'll use this in the cleanup at the bottom:
     latest_post_date = new_summaries.last.post_date
 
-    new_summaries.reverse! if search.newest_first?
-
-    # We'll want to email these...
-    report_summaries << {
-      :postings => new_summaries,
-      :search => search,
-      :search_track => search_track,
-      :latest_post_date => latest_post_date
-    }
+    new_summaries.reverse! if search.newest_first?
   end
+
+  # We'll want to email these...
+  {
+    :latest_post_date => latest_post_date,
+    :search_track => search_track,
+    :postings => new_summaries,
+    :search => search
+  }
 end
 
 # Time to send the email:
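The loop above drives the scrape through the new multi-site entry point, which can also be used directly. A minimal sketch (this hits the live site; the 'us/ny' path and 'bik' listing abbreviation are only examples, and only the first page of each listing is read here):

    require 'libcraigscrape'

    # One scraper across every site under us/ny, reading each site's 'bik' listing:
    CraigScrape.new('us/ny').each_listing('bik') do |listing|
      listing.posts.each { |post| puts '%s %s' % [post.post_date, post.label] }
    end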
@@ -383,11 +547,12 @@ ReportMailer.deliver_report(
 {:summaries => report_summaries, :definition => craig_report}
 ) if report_summaries.length > 0
 
-#
-
+# Commit (make 'active') all newly created tracked post urls:
+TrackedPost.activate_all!
 
-# Now
-report_summaries.each do |
-
-
+# Now remove all the no-longer-needed posts from the prior run:
+report_summaries.each do |summary|
+  summary[:search_track].listings.each do |listing|
+    listing.delete_posts_older_than listing.last_tracked_at
+  end
 end
data/bin/report_mailer/craigslist_report.html.erb
CHANGED
@@ -1,14 +1,17 @@
 <h2><%=h @subject %></h2>
-
 <%@summaries.each do |summary| %>
 <h3><%=h summary[:search].name%></h3>
-<%summary[:postings].
-
-
-
-
-
-
+<% if summary[:postings].length > 0 %>
+<%summary[:postings].each do |post|%>
+<%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
+h(post.post_date.strftime('%b %d')),
+post.url,
+h(post.label),
+(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
+(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
+] -%>
+<% end %>
+<% else %>
+<p><i>No new postings were found which matched the search criteria.</i></p>
 <% end %>
 <% end %>
data/bin/report_mailer/craigslist_report.plain.erb
CHANGED
@@ -3,6 +3,7 @@ CRAIGSLIST REPORTER
 <%@summaries.each do |summary| -%>
 <%=summary[:search].name %>
 <% summary[:postings].collect do |post| -%>
+<% if summary[:postings].length > 0 %>
 <%='%s : %s %s %s %s' % [
 post.post_date.strftime('%b %d'),
 post.label,
@@ -10,6 +11,8 @@ CRAIGSLIST REPORTER
 (post.has_pic_or_img?) ? ' [img]': '',
 post.url
 ] -%>
-
+<% else %>
+No new postings were found which matched the search criteria.
+<% end %>
 <% end %>
 <% end -%>
data/lib/geo_listings.rb
ADDED
@@ -0,0 +1,144 @@
+# = About geo_listings.rb
+#
+# This file contains the parsing code and logic relating to geographic site pages and paths. You
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+
+require 'scraper'
+
+class CraigScrape
+
+  # GeoListings represents a parsed craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+  # These list all the craigslist sites in a given region.
+  class GeoListings < Scraper
+    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+    PATH_SCANNER = /(?:\\\/|[^\/])+/
+    URL_HOST_PART = /^[^\:]+\:\/\/([^\/]+)[\/]?$/
+    SITE_PREFIX = /^([^\.]+)/
+    FIND_SITES_PARTS = /^[ ]*([\+|\-]?)[ ]*(.+)[ ]*/
+
+    class BadGeoListingPath < StandardError #:nodoc:
+    end
+
+    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+    # See CraigScrape.find_sites for a more powerful way to find craigslist sites.
+    def initialize(init_via = nil)
+      super(init_via)
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! unless location
+    end
+
+    # Returns the GeoLocation's full name
+    def location
+      unless @location
+        cursor = html % 'h3 > b > a:first-of-type'
+        cursor = cursor.next_node if cursor
+        @location = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+      end
+
+      @location
+    end
+
+    # Returns a hash of site name to urls in the current listing
+    def sites
+      unless @sites
+        @sites = {}
+        (html / 'div#list > a').each do |el_a|
+          site_name = he_decode strip_html(el_a.inner_html)
+          @sites[site_name] = $1 if URL_HOST_PART.match el_a[:href]
+        end
+      end
+
+      @sites
+    end
+
+    # This method will return an array of all possible sites that match the specified location path.
+    # Sample location paths:
+    # - us/ca
+    # - us/fl/miami
+    # - jp/fukuoka
+    # - mx
+    # Here's how location paths work:
+    # - The components of the path are separated by '/'s.
+    # - Up to (and optionally, not including) the last component, the path should correspond to a valid GeoLocation url with the prefix 'http://geo.craigslist.org/iso/'
+    # - The last component can either be a site's 'prefix' on a GeoLocation page, or the last component can just be a geolocation page itself, in which case all the sites on that page are selected.
+    # - The site prefix is the first dns record in a website listed on a GeoLocation page. (So, for the case of us/fl/miami, the last 'miami' corresponds to the 'south florida' link on {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl])
+    def self.sites_in_path(full_path, base_url = GEOLISTING_BASE_URL)
+      # the base_url parameter is mostly so we can test this method
+
+      # Unfortunately - the easiest way to understand much of this is to see how craigslist returns
+      # these geolocations. Watch what happens when you request us/fl/non-existant/page/here.
+      # I also made this a little forgiving in a couple ways not specified with official support, per
+      # the rules above.
+      full_path_parts = full_path.scan PATH_SCANNER
+
+      # We'll either find a single site in this loop and return that, or, we'll find a whole listing
+      # and set the geo_listing object to reflect that
+      geo_listing = nil
+      full_path_parts.each_with_index do |part, i|
+
+        # Let's un-escape the path-part, if needed:
+        part.gsub! "\\/", "/"
+
+        # If they're specifying a single site, this will catch and return it immediately
+        site = geo_listing.sites.find{ |n,s|
+          (SITE_PREFIX.match s and $1 == part) or n == part
+        } if geo_listing
+
+        # This returns the site component of the found array
+        return [site.last] if site
+
+        begin
+          # The URI escape is mostly needed to translate the space characters
+          l = GeoListings.new base_url+full_path_parts[0...i+1].collect{|p| URI.escape p}.join('/')
+        rescue CraigScrape::Scraper::FetchError
+          bad_geo_path! full_path
+        end
+
+        # This probably tells us the first part of the path was 'correct', but not the rest:
+        bad_geo_path! full_path if geo_listing and geo_listing.location == l.location
+
+        geo_listing = l
+      end
+
+      # We have a valid listing page we found, and we can just return all the sites on it:
+      geo_listing.sites.collect{|n,s| s }
+    end
+
+    # find_sites takes a single array of strings as an argument. Each string is to be either a location path
+    # (see sites_in_path), or a full site (in canonical form - ie "memphis.craigslist.org"). Optionally,
+    # each of these may/should contain a '+' or '-' prefix to indicate whether the string is supposed to
+    # add sites to the master list, or remove them from the list. If no '+' or '-' is
+    # specified, the default assumption is '+'. Strings are processed from left to right, which gives
+    # a high degree of control over the selection set. Examples:
+    # - find_sites "us/fl", "- miami.craigslist.org"
+    # - find_sites "us", "- us/nm"
+    # - find_sites "us", "- us/ny", "+ newyork.craigslist.org"
+    # - find_sites "us/ny", "us/id", "caribbean.craigslist.org"
+    # There's a lot of flexibility here; you get the idea.
+    def self.find_sites(specs, base_url = GEOLISTING_BASE_URL)
+      ret = []
+
+      specs.each do |spec|
+        (op,spec = $1,$2) if FIND_SITES_PARTS.match spec
+
+        spec = (spec.include? '.') ? [spec] : sites_in_path(spec, base_url)
+
+        (op == '-') ? ret -= spec : ret |= spec
+      end
+
+      ret
+    end
+
+    private
+
+    def self.bad_geo_path!(path)
+      raise BadGeoListingPath, "Unable to load path #{path.inspect}; either you're having problems connecting to Craigslist, or your path is invalid."
+    end
+
+  end
+end
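A usage sketch for the selection rules documented in find_sites (results depend on craigslist's live geo pages):

    require 'libcraigscrape'

    # All Florida sites, minus the south florida server (whose site prefix is 'miami'):
    sites = CraigScrape::GeoListings.find_sites ['us/fl', '- miami.craigslist.org']
    # => an array of hostnames, e.g. 'jacksonville.craigslist.org', per the live pages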