libcraigscrape 1.0 → 1.1.0

data/lib/posting.rb CHANGED
@@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
   LOCATION = /Location\:[ ]+(.+)/
-  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-  POSTING_ID = /PostingID\:[ ]+([\d]+)/
+  HEADER_LOCATION = /\((.+)\)$/
+  POSTING_ID = /PostingID\:[ ]*([\d]+)/
   REPLY_TO = /(.+)/
   PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+
+  # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new' style
+  # (as of the 12/03 parse changes)
   USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
   HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
   IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
 
+  # This is used to determine if there's a parse error
+  REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
+
+  XPATH_USERBODY = "//*[@id='userbody']"
+  XPATH_BLURBS = "//ul[@class='blurbs']"
+  XPATH_PICS = "//*[@class='tn']/a/@href"
+  XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
+
   # This is really just for testing; in production use, uri.path is a better solution
   attr_reader :href #:nodoc:
 
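Note: the loosened HEADER_LOCATION pattern is what lets location parsing survive headers that lack the old "- $price" segment. A quick before/after check (the sample header is invented for illustration):

    old_re = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
    new_re = /\((.+)\)$/

    header = "great apartment (mission district)"  # no price segment
    old_re.match(header)     #=> nil - the old pattern required "- $1200"-style text
    new_re.match(header)[1]  #=> "mission district"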
@@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
     super(*args)
 
     # Validate that required fields are present, at least - if we've downloaded it from a url
-    parse_error! if (
-      args.first.kind_of? String and
-      !flagged_for_removal? and
-      !posting_has_expired? and
-      !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    )
+    if args.first.kind_of? String and is_active_post?
+      unparsed_fields = REQUIRED_FIELDS.find_all{|f|
+        val = send(f)
+        val.nil? or (val.respond_to? :length and val.length == 0)
+      }
+      parse_error! unparsed_fields unless unparsed_fields.empty?
+    end
+
   end
 
 
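Note: the constructor now walks REQUIRED_FIELDS by name via send, instead of testing a hard-coded array of method calls. The same pattern as a standalone sketch (Record and its fields are hypothetical stand-ins for a Posting):

    class Record
      REQUIRED_FIELDS = %w(title body)

      def title; "For sale"; end
      def body;  "";         end  # blank, so it should be flagged

      # Returns the names of required fields that parsed to nil/empty
      def unparsed_fields
        REQUIRED_FIELDS.find_all do |f|
          val = send(f)
          val.nil? or (val.respond_to?(:length) and val.length == 0)
        end
      end
    end

    Record.new.unparsed_fields #=> ["body"]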
@@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @full_section
       @full_section = []
 
-      (html_head/"div[@class='bchead']//a").each do |a|
+      (html_head / "*[@class='bchead']//a").each do |a|
         @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
       end if html_head
     end
@@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # String, represents the post's reply-to address, if listed
   def reply_to
     unless @reply_to
-      cursor = html_head.at 'hr' if html_head
-      cursor = cursor.next until cursor.nil? or cursor.name == 'a'
-      @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      if html.at_xpath(XPATH_REPLY_TO)
+        @reply_to = html.at_xpath(XPATH_REPLY_TO).content
+      else
+        cursor = html_head.at 'hr' if html_head
+        cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      end
     end
 
     @reply_to
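Note: the new-style branch is a plain Nokogiri xpath lookup. A minimal sketch against a synthetic fragment (the markup below is invented, loosely mimicking the post-12/3 craigslist layout):

    require 'nokogiri'

    XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
    html = Nokogiri::HTML(<<-EOS)
      <div class="dateReplyBar"><small>
        Reply to: <a href="mailto:x@example.com">x@example.com</a>
      </small></div>
    EOS

    html.at_xpath(XPATH_REPLY_TO).content #=> "x@example.com"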
@@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @post_time
       cursor = html_head.at 'hr' if html_head
       cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
-      @post_time = Time.parse $1 if $1
+      @post_time = DateTime.parse($1) if $1
     end
 
     @post_time
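Note: post_time is now a DateTime rather than a Time. One practical difference is that DateTime.parse preserves the parsed utc offset, instead of converting the instant into the machine's local zone (the timestamp is invented for illustration):

    require 'date'
    require 'time'

    s = '2012-12-03 9:15PM PST'
    Time.parse(s)     # an epoch-based Time, rendered in the local zone
    DateTime.parse(s) #=> 2012-12-03T21:15:00-08:00, offset preserved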
@@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   # Integer, Craigslist's unique posting id
   def posting_id
-    unless @posting_id
-      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
-      cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
-      @posting_id = $1.to_i if $1
+    if @posting_id
+
+    elsif USERBODY_PARTS.match html_source
+      # Old style:
+      html_footer = $4
+      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
+      cursor = cursor.next until cursor.nil? or
+      @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
+    else
+      # Post 12/3
+      @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
     end
 
     @posting_id
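Note: the post-12/3 branch simply runs POSTING_ID over the serialized postingidtext node. A sketch with invented markup:

    require 'nokogiri'

    POSTING_ID = /PostingID\:[ ]*([\d]+)/
    html = Nokogiri::HTML("<div class='postingidtext'>PostingID: 3456789012</div>")

    POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
    $1.to_i #=> 3456789012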
@@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def contents
     unless @contents
       @contents = user_body if html_source
-      @contents = he_decode @contents.strip if @contents
+      @contents = he_decode(@contents).strip if @contents
     end
 
     @contents
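Note: the reordering (decode first, then strip) matters because decoding can itself produce leading or trailing whitespace. For example, with the HTMLEntities decoder the library already uses:

    require 'htmlentities'

    raw = "&#10;  for sale  &#10;"           # &#10; decodes to "\n"
    HTMLEntities.new.decode(raw.strip)  #=> "\n  for sale  \n"  (old order)
    HTMLEntities.new.decode(raw).strip  #=> "for sale"          (new order)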
@@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper
 
   # String, the location of the item, as best could be parsed
   def location
-    if @location.nil? and craigslist_body and html
-      # Location (when explicitly defined):
-      cursor = craigslist_body.at 'ul' unless @location
-
-      # Apa section includes other things in the li's (cats/dogs ok fields)
-      cursor.children.each do |li|
-        if LOCATION.match li.inner_html
-          @location = he_decode($1) and break
-          break
-        end
-      end if cursor
+    if @location.nil? and html
+
+      if html.at_xpath(XPATH_BLURBS)
+        # This is the post-12/3/12 style:
+
+        # Sometimes the Location is in the body:
+        @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
+          LOCATION.match c.content}
 
-      # Real estate listings can work a little differently for location:
-      unless @location
-        cursor = craigslist_body.at 'small'
-        cursor = cursor.previous until cursor.nil? or cursor.text?
+      elsif craigslist_body
+        # Location (when explicitly defined):
+        cursor = craigslist_body.at 'ul' unless @location
+
+        # This is the legacy style:
+        # Note: the apa section includes other things in the li's (cats/dogs ok fields)
+        cursor.children.each do |li|
+          if LOCATION.match li.inner_html
+            @location = he_decode($1) and break
+            break
+          end
+        end if cursor
+
+        # Real estate listings can work a little differently for location:
+        unless @location
+          cursor = craigslist_body.at 'small'
+          cursor = cursor.previous until cursor.nil? or cursor.text?
+
+          @location = he_decode(cursor.to_s.strip) if cursor
+        end
 
-        @location = he_decode(cursor.to_s.strip) if cursor
       end
 
-      # So, *sometimes* the location just ends up being in the header, I don't know why:
+      # So, *sometimes* the location just ends up being in the header, I don't know why.
+      # This happens on old-style and new-style posts:
       @location = $1 if @location.nil? and HEADER_LOCATION.match header
     end
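Note: the new branch scans the children of the post-12/3 <ul class="blurbs"> for a "Location:" line. A sketch with invented markup:

    require 'nokogiri'

    LOCATION = /Location\:[ ]+(.+)/
    XPATH_BLURBS = "//ul[@class='blurbs']"

    html = Nokogiri::HTML("<ul class='blurbs'><li>Location: SOMA</li></ul>")
    location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
      LOCATION.match c.content}
    location #=> "SOMA"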
@@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper
     unless @pics
       @pics = []
 
-      if html and craigslist_body
-        # Now let's find the craigslist hosted images:
-        img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
-        @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+      if html
+        if html.at_xpath(XPATH_PICS)
+          @pics = html.xpath(XPATH_PICS).collect(&:value)
+        elsif craigslist_body
+          # This is the pre-12/3/12 style:
+          # Now let's find the craigslist hosted images:
+          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+        end
       end
     end
 
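Note: XPATH_PICS selects href attributes directly (the trailing /@href step), so the matches are Nokogiri attribute nodes and .value yields the urls. A sketch with invented markup:

    require 'nokogiri'

    XPATH_PICS = "//*[@class='tn']/a/@href"
    html = Nokogiri::HTML(
      "<div class='tn'><a href='pic1.jpg'></a><a href='pic2.jpg'></a></div>")

    html.xpath(XPATH_PICS).collect(&:value) #=> ["pic1.jpg", "pic2.jpg"]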
@@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @posting_has_expired
   end
 
-
   # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
   # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
   def post_date
-    @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+    @post_date = post_time.to_date unless @post_date or post_time.nil?
 
     @post_date
   end
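Note: the old line rebuilt a midnight Time by splatting post_time.to_a with zeroed seconds/minutes/hours; since post_time is now a DateTime, DateTime#to_date performs the same truncation directly:

    require 'date'

    post_time = DateTime.parse('2012-12-03 9:15PM PST')  # invented timestamp
    post_time.to_date.to_s #=> "2012-12-03"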
@@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # Array, which image types are listed for the post.
   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
   def img_types
-    unless @img_types
-      @img_types = []
-
-      @img_types << :img if images.length > 0
-      @img_types << :pic if pics.length > 0
-    end
-
-    @img_types
+    @img_types || [ (images.length > 0) ? :img : nil,
+      (pics.length > 0) ? :pic : nil ].compact
   end
 
   # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
@@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
   # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
   def price
-    $1.tr('$','').to_f if label and PRICE.match label
+    unless @price
+      (header and PRICE.match label) ?
+        @price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
+    end
+    @price
   end
 
   # Returns the post contents with all html tags removed
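Note: price now returns a Money object (whole dollars times 100 cents, presumably via the money gem) instead of a Float. A hypothetical caller-side comparison:

    require 'money'

    price = Money.new(120000, 'USD')  # cents, as the new code constructs it
    price.to_f    #=> 1200.0, roughly what the old Float return value was
    price.format  #=> "$1,200.00"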
@@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper
     [contents,posting_id,post_time,title].all?{|f| f.nil?}
   end
 
+  # This is mostly used to determine if the post should be checked for
+  # parse errors. It might be useful for someone else, though.
+  def is_active_post?
+    [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
+  end
+
   private
 
   # I set this apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
@@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @html_head
   end
 
-  # Since we started having so many problems with Hpricot flipping out on whack content bodies,
-  # I added this to return everything south of the user_body
-  def html_footer
-    $4 if USERBODY_PARTS.match html_source
-  end
-
   # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
-  # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+  # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
   # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
-  def user_body
-    $2 if USERBODY_PARTS.match html_source
+  def user_body
+    if USERBODY_PARTS.match html_source
+      # This is the pre-12/3/12 style:
+      $2
+    elsif html.at_xpath(XPATH_USERBODY)
+      # There's a bunch of junk in here that we don't want, so this loop removes
+      # everything after (and including) the last script tag from the result
+      user_body = html.xpath(XPATH_USERBODY)
+      hit_delimeter = false
+      # Since some posts don't actually have the script tag:
+      delimeter = user_body.at_xpath('script') ? :script : :comment
+      user_body.first.children.to_a.reverse.reject{ |p|
+        if hit_delimeter
+          false
+        elsif ( (delimeter == :script and p.name == 'script') or
+            (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
+          hit_delimeter = true
+        else
+          true
+        end
+      }.reverse.collect(&:to_s).join
+    end
   end
 
   # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
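Note: the delimiter loop is easier to see on a toy array. The same reverse/reject idiom, with plain strings standing in for DOM nodes:

    # Keep everything before the last "SCRIPT" marker, dropping the marker
    # and everything after it - which is what user_body does to child nodes.
    nodes = %w(text1 SCRIPT text2 SCRIPT junk1 junk2)

    hit = false
    nodes.reverse.reject { |n|
      if hit then false                    # already past the marker: keep
      elsif n == "SCRIPT" then hit = true  # the marker itself: drop
      else true                            # trailing junk: drop
      end
    }.reverse #=> ["text1", "SCRIPT", "text2"]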
@@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper
     Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
   end
 
- end
+ end
data/lib/scraper.rb CHANGED
@@ -15,39 +15,27 @@
 #
 # <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
 #
-# <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
-#
-# <b>sleep_between_fetch_retries</b> - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 30.
-#
-# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http response code 404). Defaults to 3.
-#
-# <b>sleep_between_404_retries</b> - The number of seconds to sleep between successive attempts in the case of a Resource Not Found error. Defaults to 3.
-#
+
 class CraigScrape::Scraper
   cattr_accessor :logger
-  cattr_accessor :sleep_between_fetch_retries
-  cattr_accessor :retries_on_fetch_fail
   cattr_accessor :retries_on_404_fail
   cattr_accessor :sleep_between_404_retries
-  cattr_accessor :maximum_redirects_per_request
+
+  self.retries_on_404_fail = 3
+  self.sleep_between_404_retries = 3
 
   URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
   HTML_TAG = /<\/?[^>]*>/
   # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
   HTML_ENCODING = "UTF-8"
+
+  HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
+    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
 
   # Returns the full url that corresponds to this resource
   attr_reader :url
 
-  # Set some defaults:
-  self.retries_on_fetch_fail = 8
-  self.sleep_between_fetch_retries = 30
-
-  self.retries_on_404_fail = 3
-  self.sleep_between_404_retries = 3
-
-  self.maximum_redirects_per_request = 20
-
   class BadConstructionError < StandardError #:nodoc:
   end
 
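Note: with the fetch-retry and redirect knobs gone, only the 404 retry behavior remains tunable. Hypothetical usage:

    # Both are class-level settings, now defaulting to 3:
    CraigScrape::Scraper.retries_on_404_fail = 5
    CraigScrape::Scraper.sleep_between_404_retries = 10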
@@ -57,9 +45,6 @@ class CraigScrape::Scraper
   class BadUrlError < StandardError #:nodoc:
   end
 
-  class MaxRedirectError < StandardError #:nodoc:
-  end
-
   class FetchError < StandardError #:nodoc:
   end
 
@@ -100,21 +85,37 @@ class CraigScrape::Scraper
     @uri
   end
 
+  # This method is mostly useful for our specs, but it's included in case anyone
+  # else wants it. It returns all currently-defined instance variables. Probably
+  # this doesn't do what you think: it should only be used to determine what's
+  # been parsed by the object thus far, and does not include parseable
+  # attributes which have yet to be determined.
+  def attributes
+    Hash[self.instance_variables.collect{|i|
+      [i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
+  end
+
   private
 
   # Returns text with all html tags removed.
   def strip_html(str)
-    str.gsub HTML_TAG, "" if str
+    he_decode(str).gsub HTML_TAG, "" if str
   end
 
   # Easy way to fail noisily:
-  def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+  def parse_error!(fields = nil)
+    raise ParseError, "Error while parsing %s:\n %s%s" % [
+      self.class.to_s, html,
+      (fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
+  end
 
   # Returns text with all html entities converted to the respective ascii character.
   def he_decode(text); self.class.he_decode text; end
   # Returns text with all html entities converted to the respective ascii character.
-  def self.he_decode(text); HTMLEntities.new.decode text; end
+  def self.he_decode(text)
+    HTMLEntities.new.decode text
+  end
 
   # Derives a full url, using the current object's url and the provided href
   def url_from_href(href) #:nodoc:
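Note: attributes is just Hash[] over instance_variables. The same reflection idiom, standalone:

    class Example
      def initialize; @a = 1; @b = "two"; end

      def attributes
        Hash[instance_variables.collect{|i|
          [i.to_s.tr('@','').to_sym, instance_variable_get(i)] }]
      end
    end

    Example.new.attributes #=> {:a=>1, :b=>"two"}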
@@ -133,42 +134,34 @@ class CraigScrape::Scraper
     '%s://%s%s' % [scheme, host, path]
   end
 
-  def fetch_uri(uri, redirect_count = 0)
-    logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
+  def fetch_uri(uri)
+    logger.info "Requesting: %s" % [@url.inspect] if logger
 
-    raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
-
-    case uri.scheme
+    (case uri.scheme
       when 'file'
         # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
-        File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+        File.read( File.directory?(uri.path) ?
+          "#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
       when /^http[s]?/
-        fetch_http uri, redirect_count
+        fetch_http uri
       else
         raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
-    end
+    end).force_encoding("ISO-8859-1").encode("UTF-8")
   end
-
-  def fetch_http(uri, redirect_count = 0)
+
+  def fetch_http(uri)
     fetch_attempts = 0
     resource_not_found_attempts = 0
 
     begin
-      # This handles the redirects for us
-      resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
-      if resp.response.code == "200"
-        # Check for gzip, and decode:
-        data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
-        data
-      elsif resp.response['Location']
-        redirect_to = resp.response['Location']
-
-        fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
+      resp = Typhoeus.get uri.to_s, :followlocation => true,
+        :headers => HTTP_HEADERS
+
+      if resp.response_code == 200
+        resp.response_body
       else
         # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
-        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
       end
     rescue ResourceNotFoundError => err
       logger.info err.message if logger
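Note: the Net::HTTP loop is replaced by Typhoeus, with the hand-rolled redirect handling moving into libcurl via :followlocation (and the gzip-decode branch dropped, since HTTP_HEADERS no longer advertises gzip support). A minimal standalone request (the url is illustrative):

    require 'typhoeus'

    resp = Typhoeus.get "http://example.com/", :followlocation => true,
      :headers => { "User-Agent" => "Mozilla/5.0" }

    resp.response_code #=> 200 on success
    resp.response_body # raw body; any redirects were already followed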
@@ -182,19 +175,6 @@ class CraigScrape::Scraper
       else
         raise err
       end
-    rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
-      logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
-      logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
-      fetch_attempts += 1
-
-      if fetch_attempts <= self.retries_on_fetch_fail
-        sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
-        logger.info 'Retrying fetch ....' if logger
-        retry
-      else
-        raise err
-      end
     end
   end
 
@@ -209,4 +189,4 @@ class CraigScrape::Scraper
     @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
     @html
   end
- end
+ end