libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/lib/libcraigscrape.rb
CHANGED
@@ -3,38 +3,34 @@
 # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
 require 'rubygems'
-
-
-gem 'nokogiri', '~> 1.4.4'
-gem 'htmlentities', '~> 4.0.0'
-
-
-require 'net/http'
-require 'zlib'
-require 'nokogiri'
+require 'time'
+require 'uri'
 require 'htmlentities'
-require 'active_support'
-
+require 'active_support/core_ext/class/attribute_accessors'
+require 'active_support/core_ext/time/calculations'
+require 'htmlentities'
+require 'nokogiri'
+require 'typhoeus'
+require 'money'
 
-# A base class encapsulating the various libcraigscrape objects, and providing most of the
-# craigslist interaction methods. Currently, we're supporting the old Class methods
+# A base class encapsulating the various libcraigscrape objects, and providing most of the
+# craigslist interaction methods. Currently, we're supporting the old Class methods
 # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
 # create an instance of the Craigslist object, and use its Public Instance methods.
 # See the README for easy to follow examples.
 
 class CraigScrape
-  cattr_accessor :time_now
   cattr_accessor :site_to_url_prefix
-
+
   #--
   # NOTE:
-  # The only reason I took this out is b/c I might want to test with a file://
+  # The only reason I took this out is b/c I might want to test with a file://
   # prefix at some point
   #++
   self.site_to_url_prefix = 'http://'
 
-
-  # Takes a variable number of site/path specifiers (strings) as an argument.
+
+  # Takes a variable number of site/path specifiers (strings) as an argument.
   # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
   # See that method's rdoc for a complete set of rules on what arguments are allowed here.
   def initialize(*args)
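Two things stand out in this hunk. First, many of the paired -/+ lines here (and in the hunks below) appear to differ only in trailing whitespace. Second, the 1.0 pattern of pinning gems from inside the library (gem 'nokogiri', '~> 1.4.4') is gone; per the file list above, dependencies now live in a new data/Gemfile (+12 lines). That Gemfile isn't reproduced in this section, so the following is a hypothetical sketch only, with gem names inferred from the new require list and the source line and version constraints assumed:

    # Hypothetical sketch - the actual data/Gemfile is not shown in this diff.
    source 'https://rubygems.org'

    gem 'nokogiri'      # HTML parsing (pinned to '~> 1.4.4' inside the lib in 1.0)
    gem 'htmlentities'  # entity decoding (pinned to '~> 4.0.0' in 1.0)
    gem 'activesupport' # cattr_accessor, Object#try, Time calculations
    gem 'typhoeus'      # new in 1.1: replaces the Net::HTTP + Zlib fetching
    gem 'money'         # new in 1.1: listing prices come back as Money objects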
@@ -44,50 +40,50 @@ class CraigScrape
   # Returns which sites are included in any operations performed by this object. This is directly
   # ascertained from the initial constructor's spec-list
   def sites
-    @sites ||= GeoListings.find_sites @sites_specs
+    @sites ||= GeoListings.find_sites @sites_specs
     @sites
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes the <b>first page listing</b> of each of these urls to the provided block.
   def each_listing(*fragments)
     listing_urls_for(fragments).each{|url| yield Listings.new(url) }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
   def each_page_in_each_listing(*fragments)
     each_listing(*fragments) do |listing|
       while listing
         yield listing
-        listing = listing.next_page
+        listing = listing.next_page
       end
     end
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Returns the <b>first page listing</b> of each of these urls to the provided block.
   def listings(*fragments)
     listing_urls_for(fragments).collect{|url| Listings.new url }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Passes all posts from each of these urls to the provided block, in the order they're parsed
   # (for each listing, newest posts are returned first).
   def each_post(*fragments)
     each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
   # Returns all posts from each of these urls, in the order they're parsed
   # (newest posts first).
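These enumerators compose: each_post drives each_page_in_each_listing, which drives each_listing, which expands the constructor's site specs against the given path fragments. A usage sketch based on the rdoc above - the site spec 'us/fl/miami' and the 'sss' fragment are illustrative values, not taken from this diff:

    require 'libcraigscrape'

    craigslist = CraigScrape.new 'us/fl/miami'  # resolved via GeoListings.find_sites

    # First page only: one Listings object per constructed url.
    craigslist.each_listing('sss') { |listing| puts listing.posts.length }

    # Every page of every listing, newest posts first within each listing.
    craigslist.each_post('sss') { |post| puts post.label }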
@@ -96,24 +92,32 @@ class CraigScrape
     each_page_in_each_listing(*fragments){ |l| ret += l.posts }
     ret
   end
-
+
   # Determines all listings which can be construed by combining the sites specified in the object
-  # constructor with the provided url-path fragments.
+  # constructor with the provided url-path fragments.
   #
-  # Returns all posts from each of these urls, which are newer than the provider 'newer_then' date.
+  # Returns all posts from each of these urls, which are newer than (or equal to) the provider 'newer_then' date.
   # (Returns 'newest' posts first).
+  #
+  # NOTE: New to version 1.1, if newer_then is a date, we compare to the post_date
+  # if newer_then is a Time, we compare to post_time. Be aware that post_time
+  # requires the entire post be loaded, and not just the summary - which will
+  # take longer to download.
   def posts_since(newer_then, *fragments)
+    accessor = (newer_then.kind_of? Date) ? :post_date : :post_time
     ret = []
     fragments.each do |frag|
       each_post(frag) do |p|
-
+        # We have to try the comparison, since post_time could conceivably be nil
+        # for the case of a system_post?
+        break if p.send(accessor).try(:<=, newer_then)
         ret << p
       end
     end
 
-    ret
+    ret
   end
-
+
   class << self # Class methods
 
     #--
|
|
122
126
|
#++
|
123
127
|
|
124
128
|
# <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
|
125
|
-
# Instead, consider using CraigScrape::Listings.new
|
129
|
+
# Instead, consider using CraigScrape::Listings.new
|
126
130
|
#
|
127
|
-
# Scrapes a single listing url and returns a Listings object representing the contents.
|
131
|
+
# Scrapes a single listing url and returns a Listings object representing the contents.
|
128
132
|
# Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
|
129
|
-
def scrape_listing(listing_url)
|
133
|
+
def scrape_listing(listing_url)
|
130
134
|
CraigScrape::Listings.new listing_url
|
131
135
|
end
|
132
136
|
|
@@ -137,24 +141,24 @@ class CraigScrape
     # until there's no more 'next page' links available to click on
     def scrape_until(listing_url, &post_condition)
       ret = []
-
+
       listings = CraigScrape::Listings.new listing_url
       catch "ScrapeBreak" do
-        while listings do
+        while listings do
           listings.posts.each do |post|
             throw "ScrapeBreak" if post_condition.call(post)
             ret << post
           end
-
+
           listings = listings.next_page
         end
       end
-
+
       ret
     end
 
     # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
-    # Instead, consider using CraigScrape::Posting.new
+    # Instead, consider using CraigScrape::Posting.new
     #
     # Scrapes a single Post Url, and returns a Posting object representing its contents.
     # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
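The catch/throw pair lets the post_condition block stop pagination mid-page, not just between pages. Usage of this (legacy) class method, with an illustrative listing url:

    cutoff = Date.today - 7

    # Collects posts until the block first returns true, then stops paging.
    posts = CraigScrape.scrape_until('http://miami.craigslist.org/sss/') do |post|
      post.post_date < cutoff
    end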
@@ -176,7 +180,7 @@ class CraigScrape
     # Instead, consider using the CraigScrape::posts_since method.
     #
    # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are avialable to be clicked on.
-    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
+    # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
     # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
     # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
     #
@@ -185,9 +189,9 @@ class CraigScrape
       self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
     end
   end
-
+
   private
-
+
   # This takes a fragments paramter, and turns it into actual urls
   def listing_urls_for(listing_fragments)
     listing_fragments.collect{ |lf|
@@ -198,20 +202,9 @@ class CraigScrape
       sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
     }.flatten
   end
-
-  # Returns the most recentlt expired time for the provided month and day
-  def self.most_recently_expired_time(month, day) #:nodoc:
-    now = (time_now) ? time_now : Time.now
-
-    # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
-    ret = Time.local now.year, month, day
-    ret = Time.local now.year-1, month, day if ret > now
-
-    ret
-  end
 
 end
 
 require 'listings'
 require 'posting'
-require 'geo_listings'
+require 'geo_listings'
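Two things to note in this final hunk. Given the '%s%s/%s' format above, each site/fragment pair expands predictably (values illustrative):

    '%s%s/%s' % ['http://', 'miami.craigslist.org', 'sss']
    #=> "http://miami.craigslist.org/sss"

And most_recently_expired_time - along with the time_now hook removed in the first hunk - is gone entirely: 1.1 derives summary dates via Date.parse in listings.rb (see below) instead of guessing the year from a month/day datestamp.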
data/lib/listings.rb
CHANGED
@@ -13,7 +13,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
   IMG_TYPE = /^[ ]*(.+)[ ]*$/
   HEADER_DATE = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
   SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-  NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+  NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
+
+  XPATH_POST_DATE = "*[@class='itemdate']"
+  XPATH_POST_IMGPIC = "*[@class='itempx']/*[@class='p']"
+  XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
+  # There's a couple places that the price hangs out. We search in this order
+  XPATHS_POST_PRICE = ["*[@class='itempp']", "*[@class='itemph']"]
 
   # Array, PostSummary objects found in the listing
   def posts
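The new constants target craigslist's post-12/3 markup (itemdate, itempx, itempp/itemph, and the 'ban' page-nav block). A minimal sketch of the search-in-order price lookup, run against hand-built markup - the sample HTML is illustrative, not taken from this diff:

    require 'nokogiri'

    # Illustrative post-12/3 summary row: price may sit in 'itempp' or 'itemph'.
    row = Nokogiri::HTML.fragment(
      "<p><span class='itempp'>$450</span><span class='itemph'></span></p>"
    ).at('p')

    xpaths = ["*[@class='itempp']", "*[@class='itemph']"]  # XPATHS_POST_PRICE
    price_path = xpaths.find do |path|
      content = row.at_xpath(path) && row.at_xpath(path).content
      content && !content.empty?
    end

    puts row.at_xpath(price_path).content  #=> $450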
@@ -35,7 +41,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
     post_tags.each do |el|
       case el.name
         when 'p'
-          post_summary =
+          post_summary = parse_summary el, current_date
 
           # Validate that required fields are present:
           parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
@@ -47,7 +53,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
           # Let's make sense of the h4 tag, and then read all the p tags below it
           if HEADER_DATE.match he_decode(el.inner_html)
             # Generally, the H4 tags contain valid dates. When they do - this is easy:
-            current_date =
+            current_date = Date.parse [$1, $2].join('/')
           elsif html.at('h4:last-of-type') == el
             # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
             # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
@@ -65,29 +71,37 @@ class CraigScrape::Listings < CraigScrape::Scraper
   # String, URL Path href-fragment of the next page link
   def next_page_href
     unless @next_page_href
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      #
-
+
+      if html.at_xpath(XPATH_PAGENAV_LINKS)
+        # Post 12/3
+        next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
+        @next_page_href = next_link[:href] if next_link
+      else
+        # Old style
+        cursor = html.at 'p:last-of-type'
+
+        cursor = cursor.at 'a' if cursor
+
+        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+        # Search listings put their next page in a link towards the top
+        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+
+        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+        # even though we can see that theres another page listed in the page-number links block at the top
+        # and bottom of the listing page
+        unless next_link
+          cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+          # We're looking good.
+          next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+        end
+
+        # We have an anchor tag - so - let's assign the href:
+        @next_page_href = next_link[:href] if next_link
       end
-
-      # We have an anchor tag - so - let's assign the href:
-      @next_page_href = next_link[:href] if next_link
     end
 
     @next_page_href
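The widened NEXT_PAGE_LINK from the first hunk is what makes both branches above work - it accepts the old category-listing text as well as the post-12/3 'Next >>' label:

    NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/

    NEXT_PAGE_LINK.match 'next 100 postings'  #=> MatchData (old-style pages)
    NEXT_PAGE_LINK.match 'Next >>'            #=> MatchData (post 12/3 pages)
    NEXT_PAGE_LINK.match 'see all postings'   #=> nil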
@@ -100,12 +114,14 @@ class CraigScrape::Listings < CraigScrape::Scraper
 
   # Returns a Listings object of the next_page_url on the current listings object
   def next_page
-    CraigScrape::Listings.new next_page_url if next_page_url
+    CraigScrape::Listings.new URI.encode(next_page_url) if next_page_url
   end
-
+
+  private
+
   # Takes a paragraph element and returns a mostly-parsed Posting
   # We separate this from the rest of the parsing both for readability and ease of testing
-  def
+  def parse_summary(p_element, date = nil) #:nodoc:
     ret = {}
 
     title_anchor = nil
@@ -126,26 +142,45 @@ class CraigScrape::Listings < CraigScrape::Scraper
     end
 
     location_tag = p_element.at 'font'
-    has_pic_tag = p_element.at 'span'
 
     href = nil
 
     location = he_decode p_element.at('font').inner_html if location_tag
     ret[:location] = $1 if location and LOCATION.match location
 
-
-
-
-
+    price_path = XPATHS_POST_PRICE.find{|path|
+      content = p_element.at_xpath(path).try(:content)
+      (!content.nil? and !content.empty?)
+    }
+    ret[:price] = Money.new($1.to_i * 100, 'USD') if price_path and
+      /\$([\d]+)/.match(p_element.at_xpath(price_path).content)
 
-
+    ret[:img_types] = []
+    if p_element.at_xpath XPATH_POST_IMGPIC
+      # Post 12/3
+      ret[:img_types] = p_element.at_xpath(XPATH_POST_IMGPIC).content.scan(/\w+/).collect(&:to_sym)
+    else
+      # Old style:
+      has_pic_tag = p_element.at 'span'
+      if has_pic_tag
+        img_type = he_decode has_pic_tag.inner_html
+        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+      end
     end
 
-    ret[:section] = he_decode(section_anchor.inner_html)
-
+    ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
+
     ret[:post_date] = date
-    if
-
+    if p_element.at_xpath(XPATH_POST_DATE)
+      # Post 12/3
+      if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
+        ret[:post_date] = Date.parse [$1, $2].join('/')
+      end
+    elsif SUMMARY_DATE.match he_decode(p_element.children[0])
+      # Old style
+      ret[:post_date] = Date.parse [$1, $2].join('/')
    end
 
     if title_anchor
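Taken together, a summary hash out of parse_summary for a post-12/3 row now carries a Money price and symbol-ized image types alongside the existing fields. An illustrative shape - every value below is invented for the example; only the keys and types follow from the code above:

    {
      :label     => "Oceanfront condo",
      :href      => "/reb/3456789012.html",
      :location  => "miami beach",
      :section   => "real estate - by broker",
      :price     => Money.new(45000, 'USD'),  # $450.00, stored as cents
      :img_types => [:pic, :img],
      :post_date => Date.parse('Dec/3')       # as produced by the code above
    }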
@@ -157,4 +192,5 @@ class CraigScrape::Listings < CraigScrape::Scraper
 
     ret
   end
-
+
+end