olek-libcraigscrape 1.0.3 → 1.1.0

data/lib/scraper.rb CHANGED
@@ -5,14 +5,14 @@
  # - Basic http and connection handling methods
  # - html utility methods used by objects
  # - Common Errors
- # You should never need to include this file directly, as all of libcraigscrape's objects and methods
+ # You should never need to include this file directly, as all of libcraigscrape's objects and methods
  # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
  #
 
- # Scraper is a general-pupose base class for all libcraigscrape Objects. Scraper facilitates all http-related
+ # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
  # methods. It also contains the http-related cattr_accessors:
- #
+ #
  # <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
  #
  # <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
@@ -23,31 +23,22 @@
  #
  # <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
  #
+
  class CraigScrape::Scraper
  cattr_accessor :logger
- cattr_accessor :sleep_between_fetch_retries
- cattr_accessor :retries_on_fetch_fail
- cattr_accessor :retries_on_404_fail
- cattr_accessor :sleep_between_404_retries
- cattr_accessor :maximum_redirects_per_request
 
  URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
  HTML_TAG = /<\/?[^>]*>/
- # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
+ # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
  HTML_ENCODING = "UTF-8"
 
+ HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache",
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
+
  # Returns the full url that corresponds to this resource
  attr_reader :url
 
- # Set some defaults:
- self.retries_on_fetch_fail = 8
- self.sleep_between_fetch_retries = 30
-
- self.retries_on_404_fail = 3
- self.sleep_between_404_retries = 3
-
- self.maximum_redirects_per_request = 20
-
  class BadConstructionError < StandardError #:nodoc:
  end
 
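
With the retry and redirect accessors removed (Typhoeus, introduced in the fetch_uri hunk below, follows redirects on its own), logger is the only remaining cattr_accessor to configure. A minimal sketch of wiring it up; the STDOUT destination is just an assumption for illustration:

    require 'logger'
    require 'libcraigscrape'

    # fetch_uri logs each requested URL through this, when set:
    CraigScrape::Scraper.logger = Logger.new($stdout)
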
@@ -57,15 +48,9 @@ class CraigScrape::Scraper
  class BadUrlError < StandardError #:nodoc:
  end
 
- class MaxRedirectError < StandardError #:nodoc:
- end
-
  class FetchError < StandardError #:nodoc:
  end
-
- class ResourceNotFoundError < StandardError #:nodoc:
- end
-
+
  # Scraper Objects can be created from either a full URL (string), or a Hash.
  # Currently, this initializer isn't intended to be called from libcraigslist API users, though
  # if you know what you're doing - feel free to try this out.
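
Code that rescued the removed MaxRedirectError or ResourceNotFoundError will now hit a NameError on the constant reference itself; only the surviving classes can be caught. A hedged sketch of the narrowed rescue, assuming the usual CraigScrape::Listings entry point and an arbitrary listings URL:

    require 'libcraigscrape'

    begin
      CraigScrape::Listings.new('https://miami.craigslist.org/sss/').posts
    rescue CraigScrape::Scraper::FetchError,
           CraigScrape::Scraper::BadUrlError => e
      warn "scrape failed: #{e.message}"
    end
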
@@ -88,7 +73,7 @@ class CraigScrape::Scraper
  raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
  end
  end
-
+
  # Indicates whether the resource has yet been retrieved from its associated url.
  # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
  # but hasn't yet been fetched.
@@ -101,21 +86,27 @@ class CraigScrape::Scraper
  end
 
  private
-
+
  # Returns text with all html tags removed.
  def strip_html(str)
- str.gsub HTML_TAG, "" if str
+ he_decode(str).gsub HTML_TAG, "" if str
  end
-
+
  # Easy way to fail noisily:
- def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
-
+ def parse_error!(fields = nil)
+ raise ParseError, "Error while parsing %s:\n %s%s" % [
+ self.class.to_s, html,
+ (fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : '']
+ end
+
  # Returns text with all html entities converted to respective ascii character.
  def he_decode(text); self.class.he_decode text; end
 
  # Returns text with all html entities converted to respective ascii character.
- def self.he_decode(text); HTMLEntities.new.decode text; end
-
+ def self.he_decode(text)
+ HTMLEntities.new.decode text
+ end
+
  # Derives a full url, using the current object's url and the provided href
  def url_from_href(href) #:nodoc:
  scheme, host, path = $1, $2, $3 if URL_PARTS.match href
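
Two behavioral changes land in this hunk: strip_html now entity-decodes before removing tags, and parse_error! can name the fields that failed to parse. A sketch of both as they would be called from inside a Scraper subclass (both methods are private; the inputs are made-up examples):

    strip_html "<b>Ben &amp; Jerry&#39;s</b>"
    #=> "Ben & Jerry's"  (under the old tag-only strip, the entities survived)

    parse_error! %w(label post_date)
    # raises ParseError whose message now ends with:
    #   "Required fields missing: label, post_date"
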
@@ -132,81 +123,33 @@ class CraigScrape::Scraper
 
  '%s://%s%s' % [scheme, host, path]
  end
+
+ def fetch_uri(uri)
+ logger.info "Requesting: %s" % [@url.inspect] if logger
 
- def fetch_uri(uri, redirect_count = 0)
- logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
-
- raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
-
- case uri.scheme
+ (case uri.scheme
  when 'file'
  # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
- File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+ File.read( File.directory?(uri.path) ?
+ "#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
  when /^http[s]?/
- fetch_http uri, redirect_count
+ resp = Typhoeus.get uri.to_s, :followlocation => true,
+ :headers => HTTP_HEADERS
+ resp.response_body
  else
  raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
- end
- end
-
- def fetch_http(uri, redirect_count = 0)
- fetch_attempts = 0
- resource_not_found_attempts = 0
-
- begin
- # This handles the redirects for us
- resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri
-
- if resp.response.code == "200"
- # Check for gzip, and decode:
- data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
- data
- elsif resp.response['Location']
- redirect_to = resp.response['Location']
-
- fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
- else
- # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
- raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
- end
- rescue ResourceNotFoundError => err
- logger.info err.message if logger
-
- resource_not_found_attempts += 1
-
- if resource_not_found_attempts <= self.retries_on_404_fail
- sleep self.sleep_between_404_retries if self.sleep_between_404_retries
- logger.info 'Retrying ....' if logger
- retry
- else
- raise err
- end
- rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
- logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
- logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
- fetch_attempts += 1
-
- if fetch_attempts <= self.retries_on_fetch_fail
- sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
- logger.info 'Retrying fetch ....' if logger
- retry
- else
- raise err
- end
- end
+ end).force_encoding("ISO-8859-1").encode("UTF-8")
  end
-
+
  # Returns a string, of the current URI's source code
  def html_source
  @html_source ||= fetch_uri uri if uri
  @html_source
  end
-
+
  # Returns an Nokogiri parse, of the current URI
  def html
  @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
  @html
  end
- end
+ end
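
The whole Net::HTTP stack above (manual redirect chasing, gzip inflation, the 404 and timeout retry loops) collapses into one Typhoeus call, with redirect handling delegated to libcurl via :followlocation. The trailing force_encoding/encode chain re-labels the raw bytes as ISO-8859-1 and transcodes them to UTF-8; since every byte sequence is valid Latin-1, the transcode cannot raise no matter what bytes a craigslist user posted. A standalone sketch of the equivalent request (the URL is an arbitrary example):

    require 'typhoeus'
    require 'libcraigscrape'

    resp = Typhoeus.get 'https://geo.craigslist.org/iso/us/fl',
      :followlocation => true,
      :headers        => CraigScrape::Scraper::HTTP_HEADERS

    # Tag the bytes as Latin-1, then transcode to guaranteed-valid UTF-8:
    body = resp.response_body.force_encoding('ISO-8859-1').encode('UTF-8')
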
@@ -2,36 +2,36 @@ module LibcraigscrapeTestHelpers
  def relative_uri_for(filename)
  'file://%s/%s' % [File.dirname(File.expand_path(__FILE__)), filename]
  end
-
+
  def pp_assertions(obj, obj_name)
  probable_accessors = (obj.methods-obj.class.superclass.methods)
 
  puts
  probable_accessors.sort.each do |m|
  val = obj.send(m.to_sym)
-
+
  # There's a good number of transformations worth doing here, I'll just start like this for now:
  if val.kind_of? Time
  # I've decided this is the easiest way to understand and test a time
  val = val.to_a
  m = "#{m}.to_a"
  end
-
- if val.kind_of? Hash and val.length > 5
+
+ if val.kind_of? Hash and val.length > 5
  puts "assert_equal %s, %s.%s.length" % [val.length.inspect,obj_name,m]
-
- val.keys.sort{|a,b| a <=> b }.each do |k|
+
+ val.keys.sort{|a,b| a <=> b }.each do |k|
  puts "assert_equal %s, %s.%s[%s]" % [val[k].inspect,obj_name,m,k.inspect]
  end
  # elsif val.kind_of? Array
  # puts "assert_equal %s, %s.%s.length" % [val.length.inspect,obj_name,m]
- #
- # val.each_index do |i|
+ #
+ # val.each_index do |i|
  # pp_assertions val[i], "%s.%s[%s]" % [obj_name,m,i.inspect]
  # end
  else
  puts "assert_equal %s, %s.%s" % [val.inspect,obj_name,m]
  end
- end
+ end
  end
- end
+ end
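
Aside from trailing whitespace, pp_assertions is unchanged: it reflects over an object's non-inherited methods and prints ready-to-paste assert_equal lines, presumably how fixture-heavy tests like the ones below get drafted. A sketch of invoking it from a test, using a sample file this suite already ships:

    geo = CraigScrape::GeoListings.new relative_uri_for(
      'geolisting_samples/geo_listing_cn070209.html'
    )
    pp_assertions geo, 'geo'
    # emits lines such as:
    #   assert_equal "china", geo.location
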
@@ -6,13 +6,13 @@ require File.dirname(__FILE__)+'/libcraigscrape_test_helpers'
 
  class CraigslistGeolistingTest < Test::Unit::TestCase
  include LibcraigscrapeTestHelpers
-
+
  def test_pukes
  assert_raise(CraigScrape::Scraper::ParseError) do
  CraigScrape::GeoListings.new( relative_uri_for('google.html') ).sites
  end
  end
-
+
  def test_geo_listings
  geo_listing_us070209 = CraigScrape::GeoListings.new relative_uri_for(
  'geolisting_samples/geo_listing_us070209.html'
@@ -345,10 +345,10 @@ class CraigslistGeolistingTest < Test::Unit::TestCase
  assert_equal "youngstown.craigslist.org", geo_listing_us070209.sites["youngstown"]
  assert_equal "yubasutter.craigslist.org", geo_listing_us070209.sites["yuba-sutter"]
  assert_equal "yuma.craigslist.org", geo_listing_us070209.sites["yuma"]
-
+
  geo_listing_cn070209 = CraigScrape::GeoListings.new relative_uri_for(
  'geolisting_samples/geo_listing_cn070209.html'
- )
+ )
  assert_equal "china", geo_listing_cn070209.location
  assert_equal 6, geo_listing_cn070209.sites.length
  assert_equal "beijing.craigslist.com.cn", geo_listing_cn070209.sites["beijing"]
@@ -357,10 +357,10 @@ class CraigslistGeolistingTest < Test::Unit::TestCase
  assert_equal "hongkong.craigslist.org", geo_listing_cn070209.sites["hong kong"]
  assert_equal "shanghai.craigslist.com.cn", geo_listing_cn070209.sites["shanghai"]
  assert_equal "shenzhen.craigslist.org", geo_listing_cn070209.sites["shenzhen"]
-
+
  geo_listing_ca070209 = CraigScrape::GeoListings.new relative_uri_for(
  'geolisting_samples/geo_listing_ca070209.html'
- )
+ )
  assert_equal "canada", geo_listing_ca070209.location
  assert_equal 47, geo_listing_ca070209.sites.length
  assert_equal "barrie.craigslist.ca", geo_listing_ca070209.sites["barrie"]
@@ -410,28 +410,28 @@ class CraigslistGeolistingTest < Test::Unit::TestCase
  assert_equal "whistler.craigslist.ca", geo_listing_ca070209.sites["whistler, BC"]
  assert_equal "windsor.craigslist.ca", geo_listing_ca070209.sites["windsor"]
  assert_equal "winnipeg.craigslist.ca", geo_listing_ca070209.sites["winnipeg"]
-
+
  geo_listing_ca_sk07020 = CraigScrape::GeoListings.new relative_uri_for(
  'geolisting_samples/geo_listing_ca_sk070209.html'
- )
+ )
  assert_equal "canada", geo_listing_ca_sk07020.location
- assert_equal(
- { "saskatoon" => "saskatoon.craigslist.ca", "regina" => "regina.craigslist.ca" },
+ assert_equal(
+ { "saskatoon" => "saskatoon.craigslist.ca", "regina" => "regina.craigslist.ca" },
  geo_listing_ca_sk07020.sites
  )
  end
-
+
  def test_sites_in_path
  # This was really tough to test, and in the end, I don't know just how useful this really is...
  hier_dir = relative_uri_for 'geolisting_samples/hierarchy_test071009/'
-
+
  %w(
- us/fl/miami /us/fl/miami/ us/fl/miami/ /us/fl/miami us/fl/miami/nonsense
+ us/fl/miami /us/fl/miami/ us/fl/miami/ /us/fl/miami us/fl/miami/nonsense
  us/fl/miami/nonsense/more-nonsense us/fl/miami/south\ florida
  ).each do |path|
  assert_equal ["miami.craigslist.org"], CraigScrape::GeoListings.sites_in_path( path, hier_dir )
  end
-
+
  %w( us/fl /us/fl us/fl/ /us/fl/ ).each do |path|
  assert_equal(
  %w(
@@ -441,20 +441,20 @@ class CraigslistGeolistingTest < Test::Unit::TestCase
  CraigScrape::GeoListings.sites_in_path( path, hier_dir )
  )
  end
-
+
  # This tests those escaped funky paths. I *think* this file-based test is actually indicative
  # that the http-retrieval version works as well;
  us_fl_mia_ftmeyers = CraigScrape::GeoListings.sites_in_path(
  "us/fl/ft myers \\/ SW florida", hier_dir
  )
  assert_equal ["fortmyers.craigslist.org"], us_fl_mia_ftmeyers
-
+
  # make sure we puke on obvious bad-stuff. I *think* this file-based test is actually indicative
  # that the http-retrieval version works as well:
  assert_raise(CraigScrape::GeoListings::BadGeoListingPath) do
  CraigScrape::GeoListings.sites_in_path "us/fl/nonexist", hier_dir
  end
-
+
  assert_raise(CraigScrape::GeoListings::BadGeoListingPath) do
  # You'll notice that we could actually guess a decent match, but we won't:
  CraigScrape::GeoListings.sites_in_path "us/fl/miami/nonexist", hier_dir
@@ -465,57 +465,57 @@ class CraigslistGeolistingTest < Test::Unit::TestCase
  hier_dir = relative_uri_for 'geolisting_samples/hierarchy_test071009/'
 
  assert_equal(
- %w(miami.craigslist.org),
- CraigScrape::GeoListings.find_sites(
- ["us/fl/south florida","+ us/fl/south florida", "-newyork.craigslist.org"],
+ %w(miami.craigslist.org),
+ CraigScrape::GeoListings.find_sites(
+ ["us/fl/south florida","+ us/fl/south florida", "-newyork.craigslist.org"],
  hier_dir
  )
  )
-
+
  assert_equal(
  %w(
- jacksonville panamacity orlando fortmyers keys tallahassee ocala gainesville tampa
+ jacksonville panamacity orlando fortmyers keys tallahassee ocala gainesville tampa
  pensacola daytona treasure sarasota staugustine spacecoast lakeland newyork
- ).collect{|p| "#{p}.craigslist.org"},
- CraigScrape::GeoListings.find_sites( ["us/fl","-us/fl/miami", "+ newyork.craigslist.org"], hier_dir)
+ ).collect{|p| "#{p}.craigslist.org"}.sort,
+ CraigScrape::GeoListings.find_sites( ["us/fl","-us/fl/miami", "+ newyork.craigslist.org"], hier_dir).sort
  )
 
  assert_equal(
  %w(
- westmd fortcollins charleston fayetteville dallas mendocino wichita valdosta terrahaute rockford erie
- decatur cedarrapids stillwater collegestation charlestonwv albany sacramento houston kalamazoo fortsmith
+ westmd fortcollins charleston fayetteville dallas mendocino wichita valdosta terrahaute rockford erie
+ decatur cedarrapids stillwater collegestation charlestonwv albany sacramento houston kalamazoo fortsmith
  maine minneapolis stockton pennstate bend grandisland palmsprings nmi waterloo topeka eastnc greenbay york
- utica stgeorge oklahomacity grandrapids eastidaho lancaster gulfport sandiego reading kpr fresno iowacity
- chicago tuscaloosa smd monterey yubasutter victoriatx sd knoxville gadsden jonesboro ksu youngstown toledo
- lascruces annarbor danville delaware parkersburg appleton stcloud richmond muskegon jerseyshore redding
- ithaca hartford evansville corpuschristi binghamton chico modesto lynchburg hattiesburg morgantown
- harrisonburg lubbock carbondale florencesc imperial wenatchee semo savannah prescott lacrosse longisland
- huntsville santabarbara janesville mankato santafe pullman louisville lexington brunswick duluth columbus
- hudsonvalley pittsburgh wheeling westky waco shreveport eastoregon corvallis winstonsalem denver
- tippecanoe newhaven shoals wv greenville lansing detroit athensohio easttexas sanantonio raleigh phoenix
- honolulu inlandempire pueblo chattanooga lawton worcester twinfalls roseburg roanoke fredericksburg
- annapolis asheville seattle scranton quadcities oregoncoast stlouis newyork mobile atlanta visalia
- clarksville providence kansascity galveston madison bham harrisburg muncie bloomington anchorage ventura
- up tricities rockies elpaso slo indianapolis fayar columbusga bellingham abilene wichitafalls boston
- mcallen bn sierravista lasvegas sanmarcos nwct farmington mansfield jacksontn bgky altoona eugene
- lafayette boone odessa spokane norfolk hickory burlington nashville lawrence hiltonhead elmira westernmass
- southjersey myrtlebeach dothan goldcountry lincoln martinsburg dubuque brownsville washingtondc tucson
- columbiamo jxn yakima sheboygan olympic humboldt newjersey cosprings springfield beaumont macon eauclaire
- batonrouge buffalo mohave wilmington rochester sfbay northmiss bakersfield neworleans catskills wausau
- akroncanton cnj merced chambana flint capecod nh yuma tulsa charlottesville easternshore desmoines
- athensga austin newlondon outerbanks fortwayne dayton wyoming watertown provo medford texarkana cleveland
- memphis amarillo limaohio augusta flagstaff jackson plattsburgh peoria skagit saltlakecity saginaw
- portland syracuse swmi baltimore monroe littlerock boise laredo boulder philadelphia sandusky salem rmn
- montgomery blacksburg centralmich logan albuquerque losangeles poconos westslope southbend siouxcity reno
- porthuron greensboro orangecounty fargo ogden charlotte allentown joplin chautauqua lakecharles omaha
- springfieldil roswell montana killeen milwaukee nd williamsport columbia racine southcoast ames huntington
+ utica stgeorge oklahomacity grandrapids eastidaho lancaster gulfport sandiego reading kpr fresno iowacity
+ chicago tuscaloosa smd monterey yubasutter victoriatx sd knoxville gadsden jonesboro ksu youngstown toledo
+ lascruces annarbor danville delaware parkersburg appleton stcloud richmond muskegon jerseyshore redding
+ ithaca hartford evansville corpuschristi binghamton chico modesto lynchburg hattiesburg morgantown
+ harrisonburg lubbock carbondale florencesc imperial wenatchee semo savannah prescott lacrosse longisland
+ huntsville santabarbara janesville mankato santafe pullman louisville lexington brunswick duluth columbus
+ hudsonvalley pittsburgh wheeling westky waco shreveport eastoregon corvallis winstonsalem denver
+ tippecanoe newhaven shoals wv greenville lansing detroit athensohio easttexas sanantonio raleigh phoenix
+ honolulu inlandempire pueblo chattanooga lawton worcester twinfalls roseburg roanoke fredericksburg
+ annapolis asheville seattle scranton quadcities oregoncoast stlouis newyork mobile atlanta visalia
+ clarksville providence kansascity galveston madison bham harrisburg muncie bloomington anchorage ventura
+ up tricities rockies elpaso slo indianapolis fayar columbusga bellingham abilene wichitafalls boston
+ mcallen bn sierravista lasvegas sanmarcos nwct farmington mansfield jacksontn bgky altoona eugene
+ lafayette boone odessa spokane norfolk hickory burlington nashville lawrence hiltonhead elmira westernmass
+ southjersey myrtlebeach dothan goldcountry lincoln martinsburg dubuque brownsville washingtondc tucson
+ columbiamo jxn yakima sheboygan olympic humboldt newjersey cosprings springfield beaumont macon eauclaire
+ batonrouge buffalo mohave wilmington rochester sfbay northmiss bakersfield neworleans catskills wausau
+ akroncanton cnj merced chambana flint capecod nh yuma tulsa charlottesville easternshore desmoines
+ athensga austin newlondon outerbanks fortwayne dayton wyoming watertown provo medford texarkana cleveland
+ memphis amarillo limaohio augusta flagstaff jackson plattsburgh peoria skagit saltlakecity saginaw
+ portland syracuse swmi baltimore monroe littlerock boise laredo boulder philadelphia sandusky salem rmn
+ montgomery blacksburg centralmich logan albuquerque losangeles poconos westslope southbend siouxcity reno
+ porthuron greensboro orangecounty fargo ogden charlotte allentown joplin chautauqua lakecharles omaha
+ springfieldil roswell montana killeen milwaukee nd williamsport columbia racine southcoast ames huntington
  cincinnati auburn miami
- ).collect{|p| "#{p}.craigslist.org"},
+ ).collect{|p| "#{p}.craigslist.org"}.sort,
  CraigScrape::GeoListings.find_sites(
  ["us","- us/fl", "+ us/fl/miami", ' -jacksonville.craigslist.org'], hier_dir
- )
+ ).sort
  )
-
+
  end
 
  end
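
The only substantive change in this final hunk is the trailing .sort appended to both the expected list and the find_sites result, which makes the assertions insensitive to the order in which sites come back. The pattern in isolation (sites stands in for any scraped list):

    expected = %w(miami newyork).collect{|p| "#{p}.craigslist.org"}.sort
    assert_equal expected, sites.sort   # passes for any ordering of sites
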