olek-libcraigscrape 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +94 -0
- data/COPYING +674 -0
- data/COPYING.LESSER +165 -0
- data/README +89 -0
- data/Rakefile +125 -0
- data/bin/craig_report_schema.yml +68 -0
- data/bin/craigwatch +581 -0
- data/bin/report_mailer/craigslist_report.html.erb +17 -0
- data/bin/report_mailer/craigslist_report.plain.erb +18 -0
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +217 -0
- data/lib/listings.rb +160 -0
- data/lib/posting.rb +324 -0
- data/lib/scraper.rb +212 -0
- data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
- data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
- data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
- data/test/geolisting_samples/geo_listing_us070209.html +355 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/google.html +8 -0
- data/test/libcraigscrape_test_helpers.rb +37 -0
- data/test/listing_samples/category_output.html +231 -0
- data/test/listing_samples/category_output_2.html +217 -0
- data/test/listing_samples/empty_listings.html +128 -0
- data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
- data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
- data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
- data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
- data/test/listing_samples/long_search_output.html +137 -0
- data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
- data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
- data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
- data/test/listing_samples/short_search_output.html +133 -0
- data/test/post_samples/1207457727.html +92 -0
- data/test/post_samples/brw_reb_1224008903.html +101 -0
- data/test/post_samples/posting0.html +91 -0
- data/test/post_samples/posting1.html +106 -0
- data/test/post_samples/posting1796890756-061710.html +2318 -0
- data/test/post_samples/posting1808219423.html +2473 -0
- data/test/post_samples/posting1938291834-090610.html +188 -0
- data/test/post_samples/posting2.html +107 -0
- data/test/post_samples/posting3.html +92 -0
- data/test/post_samples/posting4.html +993 -0
- data/test/post_samples/posting5.html +38 -0
- data/test/post_samples/sfbay_art_1223614914.html +94 -0
- data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
- data/test/post_samples/this_post_has_expired.html +48 -0
- data/test/test_craigslist_geolisting.rb +521 -0
- data/test/test_craigslist_listing.rb +362 -0
- data/test/test_craigslist_posting.rb +426 -0
- metadata +273 -0
data/lib/posting.rb
ADDED
@@ -0,0 +1,324 @@
+# = About posting.rb
+#
+# This file contains the parsing code, and logic relating to craigslist postings. You
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+
+require 'scraper'
+
+# Posting represents a fully downloaded, and parsed, Craigslist post.
+# This class is generally returned by the listing scrape methods, and
+# represents each post summary for a specific search url, or a general listing category
+class CraigScrape::Posting < CraigScrape::Scraper
+
+  POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
+  LOCATION = /Location\:[ ]+(.+)/
+  HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
+  POSTING_ID = /PostingID\:[ ]+([\d]+)/
+  REPLY_TO = /(.+)/
+  PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+  USERBODY_PARTS = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
+  HTML_HEADER = /^(.+)\<div id\=\"userbody\">/m
+  IMAGE_SRC = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+
+  # This is really just for testing; in production use, uri.path is a better solution
+  attr_reader :href #:nodoc:
+
+  # Create a new Post via a url (String), or supplied parameters (Hash)
+  def initialize(*args)
+    super(*args)
+
+    # Validate that required fields are present, at least - if we've downloaded it from a url
+    parse_error! if (
+      args.first.kind_of? String and
+      !flagged_for_removal? and
+      !posting_has_expired? and
+      !deleted_by_author? and [
+        contents,posting_id,post_time,header,title,full_section
+      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+    )
+  end
+
+
+  # String, the contents of the item's html body heading
+  def header
+    unless @header
+      h2 = html_head.at 'h2' if html_head
+      @header = he_decode h2.inner_html if h2
+    end
+
+    @header
+  end
+
+  # String, the item's title
+  def title
+    unless @title
+      title_tag = html_head.at 'title' if html_head
+      @title = he_decode title_tag.inner_html if title_tag
+      @title = nil if @title and @title.length == 0
+    end
+
+    @title
+  end
+
+  # Array, hierarchical representation of the post's section
+  def full_section
+    unless @full_section
+      @full_section = []
+
+      (html_head/"div[@class='bchead']//a").each do |a|
+        @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+      end if html_head
+    end
+
+    @full_section
+  end
+
+  # String, represents the post's reply-to address, if listed
+  def reply_to
+    unless @reply_to
+      cursor = html_head.at 'hr' if html_head
+      cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+      @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+    end
+
+    @reply_to
+  end
+
+  # Time, reflects the full timestamp of the posting
+  def post_time
+    unless @post_time
+      cursor = html_head.at 'hr' if html_head
+      cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
+      @post_time = Time.parse $1 if $1
+    end
+
+    @post_time
+  end
+
+  # Integer, Craigslist's unique posting id
+  def posting_id
+    unless @posting_id
+      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
+      cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
+      @posting_id = $1.to_i if $1
+    end
+
+    @posting_id
+  end
+
+  # String, the full-html contents of the post
+  def contents
+    unless @contents
+      @contents = user_body if html_source
+      @contents = he_decode @contents.strip if @contents
+    end
+
+    @contents
+  end
+
+  # String, the location of the item, as best could be parsed
+  def location
+    if @location.nil? and craigslist_body and html
+      # Location (when explicitly defined):
+      cursor = craigslist_body.at 'ul' unless @location
+
+      # Apa section includes other things in the li's (cats/dogs ok fields)
+      cursor.children.each do |li|
+        if LOCATION.match li.inner_html
+          @location = he_decode($1) and break
+          break
+        end
+      end if cursor
+
+      # Real estate listings can work a little differently for location:
+      unless @location
+        cursor = craigslist_body.at 'small'
+        cursor = cursor.previous until cursor.nil? or cursor.text?
+
+        @location = he_decode(cursor.to_s.strip) if cursor
+      end
+
+      # So, *sometimes* the location just ends up being in the header, I don't know why:
+      @location = $1 if @location.nil? and HEADER_LOCATION.match header
+    end
+
+    @location
+  end
+
+  # Array, urls of the post's images that are *not* hosted on craigslist
+  def images
+    # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
+    @images = (
+      contents ?
+        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
+        []
+    ) unless @images
+
+    @images
+  end
+
+  # Array, urls of the post's craigslist-hosted images
+  def pics
+    unless @pics
+      @pics = []
+
+      if html and craigslist_body
+        # Now let's find the craigslist hosted images:
+        img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+        @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+      end
+    end
+
+    @pics
+  end
+
+  # Returns true if this Post was parsed, and is merely a 'Flagged for Removal' page
+  def flagged_for_removal?
+    @flagged_for_removal = (
+      system_post? and header_as_plain == "This posting has been flagged for removal"
+    ) if @flagged_for_removal.nil?
+
+    @flagged_for_removal
+  end
+
+  # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
+  def deleted_by_author?
+    @deleted_by_author = (
+      system_post? and header_as_plain == "This posting has been deleted by its author."
+    ) if @deleted_by_author.nil?
+
+    @deleted_by_author
+  end
+
+  # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
+  def posting_has_expired?
+    @posting_has_expired = (
+      system_post? and header_as_plain == "This posting has expired."
+    ) if @posting_has_expired.nil?
+
+    @posting_has_expired
+  end
+
+
+  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
+  # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+  def post_date
+    @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+
+    @post_date
+  end
+
+  # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
+  # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
+  # Sometimes there's additional information, i.e. '(map)' on rea listings, included in the header that isn't to be listed in the label.
+  # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
+  # in a full page load from the post's url.
+  def label
+    unless @label or system_post?
+      @label = header
+
+      @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
+    end
+
+    @label
+  end
+
+  # Array, which image types are listed for the post.
+  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+  def img_types
+    unless @img_types
+      @img_types = []
+
+      @img_types << :img if images.length > 0
+      @img_types << :pic if pics.length > 0
+    end
+
+    @img_types
+  end
+
+  # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+  # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
+  def section
+    unless @section
+      @section = full_section.last if full_section
+    end
+
+    @section
+  end
+
+  # true if post summary has 'img(s)'. 'imgs' are different from pics, in that the resource is *not* hosted on craigslist's server.
+  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+  def has_img?
+    img_types.include? :img
+  end
+
+  # true if post summary has 'pic(s)'. 'pics' are different from imgs, in that craigslist is hosting the resource on craigslist's servers
+  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+  def has_pic?
+    img_types.include? :pic
+  end
+
+  # true if post summary has either the img or pic label
+  # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+  def has_pic_or_img?
+    img_types.length > 0
+  end
+
+  # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
+  # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+  def price
+    $1.tr('$','').to_f if label and PRICE.match label
+  end
+
+  # Returns the post contents with all html tags removed
+  def contents_as_plain
+    strip_html contents
+  end
+
+  # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+  # 'system_post' we may get tags in here
+  def header_as_plain
+    strip_html header
+  end
+
+  # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original.
+  # This returns true if that is the case, and false otherwise
+  def system_post?
+    [contents,posting_id,post_time,title].all?{|f| f.nil?}
+  end
+
+  private
+
+  # I set this apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
+  # return everything above the user_body
+  def html_head
+    @html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
+    # We return html itself if HTML_HEADER doesn't match, which would be the case for a 404 page or something
+    @html_head ||= html
+
+    @html_head
+  end
+
+  # Since we started having so many problems with Hpricot flipping out on whack content bodies,
+  # I added this to return everything south of the user_body
+  def html_footer
+    $4 if USERBODY_PARTS.match html_source
+  end
+
+  # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place.
+  # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+  # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
+  def user_body
+    $2 if USERBODY_PARTS.match html_source
+  end
+
+  # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+  # So - we'll return it as a Nokogiri object.
+  def craigslist_body
+    Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
+  end
+
+end
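
As a rough illustration of the Posting API above (an editorial sketch, not part of the package diff): a Posting can be built from a post url, or seeded from a Hash to avoid a page load. The 'file://' url below points at one of the gem's own test samples from the file manifest above, and assumes the script is run from the gem's data/ directory; an 'http://' craigslist post url works the same way. The :label value in the second example is hypothetical.

    require 'libcraigscrape'

    # Fetch-and-parse construction. The file:// scheme is supported by
    # Scraper#fetch_uri (see scraper.rb below); an http url works identically.
    post = CraigScrape::Posting.new "file://#{File.expand_path 'test/post_samples/posting0.html'}"

    unless post.system_post? # true for flagged/expired/deleted template pages
      puts post.title
      puts post.post_time
      puts post.location.inspect
      puts post.contents_as_plain
    end

    # Hash construction sets instance variables directly (see Scraper#initialize
    # below), which is how listing scrapes seed posts without a full page load:
    stub = CraigScrape::Posting.new :url => post.url, :label => 'Sofa - $100'
    stub.downloaded? # => false; nothing is fetched until html_source is needed
    stub.price       # => 100.0, parsed from the seeded label alone
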
data/lib/scraper.rb
ADDED
@@ -0,0 +1,212 @@
+# = About scraper.rb
+#
+# This file defines:
+# - the base class from which other parse objects inherit
+# - basic http and connection handling methods
+# - html utility methods used by objects
+# - common errors
+# You should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+
+# Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
+# functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
+# methods. It also contains the http-related cattr_accessors:
+#
+# <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
+#
+# <b>retries_on_fetch_fail</b> - The number of times to retry a failed uri download. Defaults to 8
+#
+# <b>sleep_between_fetch_retries</b> - The number of seconds to sleep between successive attempts in the case of a failed download. Defaults to 30.
+#
+# <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http response code 404). Defaults to 3.
+#
+# <b>sleep_between_404_retries</b> - The number of seconds to sleep between successive attempts in the case of a Resource Not Found error. Defaults to 3.
+#
+class CraigScrape::Scraper
+  cattr_accessor :logger
+  cattr_accessor :sleep_between_fetch_retries
+  cattr_accessor :retries_on_fetch_fail
+  cattr_accessor :retries_on_404_fail
+  cattr_accessor :sleep_between_404_retries
+  cattr_accessor :maximum_redirects_per_request
+
+  URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
+  HTML_TAG = /<\/?[^>]*>/
+  # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes
+  HTML_ENCODING = "UTF-8"
+
+  # Returns the full url that corresponds to this resource
+  attr_reader :url
+
+  # Set some defaults:
+  self.retries_on_fetch_fail = 8
+  self.sleep_between_fetch_retries = 30
+
+  self.retries_on_404_fail = 3
+  self.sleep_between_404_retries = 3
+
+  self.maximum_redirects_per_request = 20
+
+  class BadConstructionError < StandardError #:nodoc:
+  end
+
+  class ParseError < StandardError #:nodoc:
+  end
+
+  class BadUrlError < StandardError #:nodoc:
+  end
+
+  class MaxRedirectError < StandardError #:nodoc:
+  end
+
+  class FetchError < StandardError #:nodoc:
+  end
+
+  class ResourceNotFoundError < StandardError #:nodoc:
+  end
+
+  # Scraper objects can be created from either a full URL (string), or a Hash.
+  # Currently, this initializer isn't intended to be called by libcraigscrape API users, though
+  # if you know what you're doing - feel free to try this out.
+  #
+  # A (string) url can be passed with an 'http://' scheme or a 'file://' scheme.
+  #
+  # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
+  # This is useful to create an object without actually making an html request; it is used to set up an
+  # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
+  # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash.
+  # Otherwise this will fail to eager load.
+  def initialize(init_via = nil)
+    if init_via.nil?
+      # Do nothing - possibly not a great idea, but we'll allow it
+    elsif init_via.kind_of? String
+      @url = init_via
+    elsif init_via.kind_of? Hash
+      init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
+    else
+      raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
+    end
+  end
+
+  # Indicates whether the resource has yet been retrieved from its associated url.
+  # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
+  # but hasn't yet been fetched.
+  def downloaded?; !@html_source.nil?; end
+
+  # A URI object corresponding to this scraped URL
+  def uri
+    @uri ||= URI.parse @url if @url
+    @uri
+  end
+
+  private
+
+  # Returns text with all html tags removed.
+  def strip_html(str)
+    str.gsub HTML_TAG, "" if str
+  end
+
+  # Easy way to fail noisily:
+  def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+
+  # Returns text with all html entities converted to their respective ascii characters.
+  def he_decode(text); self.class.he_decode text; end
+
+  # Returns text with all html entities converted to their respective ascii characters.
+  def self.he_decode(text); HTMLEntities.new.decode text; end
+
+  # Derives a full url, using the current object's url and the provided href
+  def url_from_href(href) #:nodoc:
+    scheme, host, path = $1, $2, $3 if URL_PARTS.match href
+
+    scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
+
+    host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
+
+    path = (
+      (/\/$/.match(uri.path)) ?
+        '%s%s' % [uri.path,path] :
+        '%s/%s' % [File.dirname(uri.path),path]
+    ) unless /^\//.match path
+
+    '%s://%s%s' % [scheme, host, path]
+  end
+
+  def fetch_uri(uri, redirect_count = 0)
+    logger.info "Requesting (%d): %s" % [redirect_count, @url.inspect] if logger
+
+    raise MaxRedirectError, "Max redirects (#{redirect_count}) reached for URL: #{@url}" if redirect_count > self.maximum_redirects_per_request-1
+
+    case uri.scheme
+    when 'file'
+      # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
+      File.read( File.directory?(uri.path) ? "#{uri.path}/index.html" : uri.path )
+    when /^http[s]?/
+      fetch_http uri, redirect_count
+    else
+      raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
+    end
+  end
+
+  def fetch_http(uri, redirect_count = 0)
+    fetch_attempts = 0
+    resource_not_found_attempts = 0
+
+    begin
+      # Note that this get doesn't follow redirects itself; we follow the Location header below
+      resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri
+
+      if resp.response.code == "200"
+        # Check for gzip, and decode:
+        data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
+
+        data
+      elsif resp.response['Location']
+        redirect_to = resp.response['Location']
+
+        fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
+      else
+        # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
+        raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+      end
+    rescue ResourceNotFoundError => err
+      logger.info err.message if logger
+
+      resource_not_found_attempts += 1
+
+      if resource_not_found_attempts <= self.retries_on_404_fail
+        sleep self.sleep_between_404_retries if self.sleep_between_404_retries
+        logger.info 'Retrying ....' if logger
+        retry
+      else
+        raise err
+      end
+    rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
+      logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
+      logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
+
+      fetch_attempts += 1
+
+      if fetch_attempts <= self.retries_on_fetch_fail
+        sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
+        logger.info 'Retrying fetch ....' if logger
+        retry
+      else
+        raise err
+      end
+    end
+  end
+
+  # Returns a string of the current URI's source code
+  def html_source
+    @html_source ||= fetch_uri uri if uri
+    @html_source
+  end
+
+  # Returns a Nokogiri parse of the current URI
+  def html
+    @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
+    @html
+  end
+end
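
To close with a usage sketch (editorial, not part of the package diff): because the settings above are cattr_accessors, they are assigned once on CraigScrape::Scraper and apply to every scraper subclass, Posting included. The values below are illustrative; the shipped defaults are 8 fetch retries at 30 seconds apart, 3 retries on a 404 at 3 seconds apart, and a cap of 20 redirects per request.

    require 'logger'
    require 'libcraigscrape'

    # Log request/retry notices, and fail faster than the defaults:
    CraigScrape::Scraper.logger = Logger.new $stderr
    CraigScrape::Scraper.retries_on_fetch_fail = 2
    CraigScrape::Scraper.sleep_between_fetch_retries = 5
    CraigScrape::Scraper.retries_on_404_fail = 1
    CraigScrape::Scraper.sleep_between_404_retries = 1
    CraigScrape::Scraper.maximum_redirects_per_request = 5
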