RubyGems - olek-libcraigscrape - Versions diffs - 1.0.3 → 1.1.0 - Mend

olek-libcraigscrape 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

data/CHANGELOG +12 -6
data/COPYING.LESSER +1 -1
data/README +10 -10
data/Rakefile +5 -54
data/bin/craig_report_schema.yml +3 -3
data/bin/craigwatch +32 -44
data/bin/report_mailer/report.html.erb +17 -0
data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +6 -6
data/lib/geo_listings.rb +24 -24
data/lib/libcraigscrape.rb +6 -11
data/lib/listings.rb +62 -45
data/lib/posting.rb +153 -106
data/lib/scraper.rb +37 -94
data/test/libcraigscrape_test_helpers.rb +10 -10
data/test/test_craigslist_geolisting.rb +53 -53
data/test/test_craigslist_listing.rb +26 -26
data/test/test_craigslist_posting.rb +39 -38
metadata +38 -114
data/bin/report_mailer/craigslist_report.html.erb +0 -17

data/lib/libcraigscrape.rb CHANGED

@@ -3,18 +3,13 @@
 # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
 require 'rubygems'
-gem 'activesupport', '~> 2.3'
-gem 'nokogiri',      '>= 1.4.4'
-gem 'htmlentities',  '>= 4.0.0'
-require 'net/http'
-require 'zlib'
-require 'nokogiri'
+require 'time'
+require 'uri'
 require 'htmlentities'
-require 'active_support'
+require 'active_support/core_ext/class/attribute_accessors'
+require 'htmlentities'
+require 'nokogiri'
+require 'typhoeus'
 # A base class encapsulating the various libcraigscrape objects, and providing most of the
 # craigslist interaction methods. Currently, we're supporting the old Class methods

data/lib/listings.rb CHANGED

@@ -1,7 +1,7 @@
 # = About listings.rb
 #
 # This file contains the parsing code, and logic relating to post-listing pages. You
-# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
 # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
 require 'scraper'
@@ -13,7 +13,10 @@ class CraigScrape::Listings < CraigScrape::Scraper
   IMG_TYPE       = /^[ ]*(.+)[ ]*$/
   HEADER_DATE    = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
   SUMMARY_DATE   = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-  NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+  NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
+  XPATH_POST_DATE = "*[@class='itemdate']"
+  XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
   # Array, PostSummary objects found in the listing
   def posts
@@ -22,12 +25,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
       @posts = []
       # All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
-      post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
+      post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
       # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
       post_tags.pop if (
-        post_tags.length > 0 and
-        post_tags.last.at('a') and
+        post_tags.length > 0 and
+        post_tags.last.at('a') and
         NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
       )
@@ -39,7 +42,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
            # Validate that required fields are present:
            parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
            post_summary[:url] = url_from_href post_summary[:href]
            @posts << CraigScrape::Posting.new(post_summary)
@@ -50,13 +53,13 @@ class CraigScrape::Listings < CraigScrape::Scraper
             current_date = CraigScrape.most_recently_expired_time $1, $2
           elsif html.at('h4:last-of-type') == el
             # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
-            # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
+            # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
             # we need to pull up the full post in order to accurate tell the date.
             # Setting this to nil will achieve the eager-load.
             current_date = nil
           end
-        end
-      end
+        end
+      end
     end
     @posts
@@ -65,44 +68,52 @@ class CraigScrape::Listings < CraigScrape::Scraper
   # String, URL Path href-fragment of the next page link
   def next_page_href
     unless @next_page_href
-      cursor = html.at 'p:last-of-type'
-      cursor = cursor.at 'a' if cursor
-      # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
-      next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-      # Search listings put their next page in a link towards the top
-      next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
-      # Some search pages have a bug, whereby a 'next page' link isn't displayed,
-      # even though we can see that theres another page listed in the page-number links block at the top
-      # and bottom of the listing page
-      unless next_link
-        cursor = html % 'div.sh:first-of-type > b:last-of-type'
-        # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
-        # We're looking good.
-        next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+      if html.at_xpath(XPATH_PAGENAV_LINKS)
+        # Post 12/3
+        next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
+        @next_page_href = next_link[:href]
+      else
+        # Old style
+        cursor = html.at 'p:last-of-type'
+        cursor = cursor.at 'a' if cursor
+        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+        # Search listings put their next page in a link towards the top
+        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
+        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+        # even though we can see that theres another page listed in the page-number links block at the top
+        # and bottom of the listing page
+        unless next_link
+          cursor = html % 'div.sh:first-of-type > b:last-of-type'
+          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+          # We're looking good.
+          next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
+        end
+        # We have an anchor tag - so - let's assign the href:
+        @next_page_href = next_link[:href] if next_link
       end
-      # We have an anchor tag - so - let's assign the href:
-      @next_page_href = next_link[:href] if next_link
     end
     @next_page_href
   end
   # String, Full URL Path of the 'next page' link
   def next_page_url
     (next_page_href) ? url_from_href(next_page_href) : nil
   end
   # Returns a Listings object of the next_page_url on the current listings object
   def next_page
     CraigScrape::Listings.new next_page_url if next_page_url
   end
   # Takes a paragraph element and returns a mostly-parsed Posting
   # We separate this from the rest of the parsing both for readability and ease of testing
   def self.parse_summary(p_element, date = nil)  #:nodoc:
@@ -111,8 +122,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
     title_anchor   = nil
     section_anchor = nil
-    # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
-    # its list summary postings (See test_new_listing_span051710)
+    # This loop got a little more complicated after Craigslist start inserting weird <spans>'s in
+    # its list summary postings (See test_new_listing_span051710)
     p_element.search('a').each do |a_el|
       # We want the first a-tag that doesn't have spans in it to be the title anchor
       if title_anchor.nil?
@@ -124,12 +135,12 @@ class CraigScrape::Listings < CraigScrape::Scraper
         break
       end
     end
     location_tag = p_element.at 'font'
     has_pic_tag = p_element.at 'span'
     href = nil
     location = he_decode p_element.at('font').inner_html if location_tag
     ret[:location] = $1 if location and LOCATION.match location
@@ -141,20 +152,26 @@ class CraigScrape::Listings < CraigScrape::Scraper
       ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
     end
-    ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
+    ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
     ret[:post_date] = date
-    if SUMMARY_DATE.match he_decode(p_element.children[0])
+    if p_element.at_xpath(XPATH_POST_DATE)
+      # Post 12/3
+      if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
+        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+      end
+    elsif SUMMARY_DATE.match he_decode(p_element.children[0])
+      # Old style
       ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
     end
     if title_anchor
       label = he_decode title_anchor.inner_html
       ret[:label] = $1 if LABEL.match label
       ret[:href] = title_anchor[:href]
     end
     ret
   end
 end

data/lib/posting.rb CHANGED

@@ -1,27 +1,37 @@
 # = About posting.rb
 #
 # This file contains the parsing code, and logic relating to craiglist postings. You
-# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
 # are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
 require 'scraper'
 # Posting represents a fully downloaded, and parsed, Craigslist post.
-# This class is generally returned by the listing scrape methods, and
-# contains the post summaries for a specific search url, or a general listing category
+# This class is generally returned by the listing scrape methods, and
+# contains the post summaries for a specific search url, or a general listing category
 class CraigScrape::Posting < CraigScrape::Scraper
   POST_DATE       = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
   LOCATION        = /Location\:[ ]+(.+)/
   HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-  POSTING_ID      = /PostingID\:[ ]+([\d]+)/
+  POSTING_ID      = /PostingID\:[ ]*([\d]+)/
   REPLY_TO        = /(.+)/
   PRICE           = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+  # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
+  # (As of 12/03's parse changes)
   USERBODY_PARTS  = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
   HTML_HEADER     = /^(.+)\<div id\=\"userbody\">/m
   IMAGE_SRC       = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+  # This is used to determine if there's a parse error
+  REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
+  XPATH_USERBODY = "//*[@id='userbody']"
+  XPATH_BLURBS = "//ul[@class='blurbs']"
+  XPATH_PICS = "//*[@class='tn']/a/@href"
+  XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
   # This is really just for testing, in production use, uri.path is a better solution
   attr_reader :href #:nodoc:
@@ -30,14 +40,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
     super(*args)
     # Validate that required fields are present, at least - if we've downloaded it from a url
-    parse_error! if (
-      args.first.kind_of? String and
-      !flagged_for_removal? and
-      !posting_has_expired? and
-      !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    )
+    if args.first.kind_of? String and is_active_post?
+      unparsed_fields = REQUIRED_FIELDS.find_all{|f|
+        val = send(f)
+        val.nil? or (val.respond_to? :length and val.length == 0)
+      }
+      parse_error! unparsed_fields unless unparsed_fields.empty?
+    end
   end
@@ -47,10 +57,10 @@ class CraigScrape::Posting < CraigScrape::Scraper
       h2 = html_head.at 'h2' if html_head
       @header = he_decode h2.inner_html if h2
     end
     @header
   end
   # String, the item's title
   def title
     unless @title
@@ -58,7 +68,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
       @title = he_decode title_tag.inner_html if title_tag
       @title = nil if @title and @title.length == 0
     end
     @title
   end
@@ -66,8 +76,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def full_section
     unless @full_section
       @full_section = []
-      (html_head/"div[@class='bchead']//a").each do |a|
+      (html_head / "*[@class='bchead']//a").each do |a|
         @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
       end if html_head
     end
@@ -78,84 +88,103 @@ class CraigScrape::Posting < CraigScrape::Scraper
   # String, represents the post's reply-to address, if listed
   def reply_to
     unless @reply_to
-      cursor = html_head.at 'hr' if html_head
-      cursor = cursor.next until cursor.nil? or cursor.name == 'a'
-      @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      if html.at_xpath(XPATH_REPLY_TO)
+        @reply_to = html.at_xpath(XPATH_REPLY_TO).content
+      else
+        cursor = html_head.at 'hr' if html_head
+        cursor = cursor.next until cursor.nil? or cursor.name == 'a'
+        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      end
     end
     @reply_to
   end
-  # Time, reflects the full timestamp of the posting
+  # Time, reflects the full timestamp of the posting
   def post_time
     unless @post_time
       cursor = html_head.at 'hr' if html_head
       cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
       @post_time = Time.parse $1 if $1
     end
     @post_time
   end
   # Integer, Craigslist's unique posting id
   def posting_id
-    unless @posting_id
-      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
-      cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
-      @posting_id = $1.to_i if $1
+    if @posting_id
+    elsif USERBODY_PARTS.match html_source
+      # Old style:
+      html_footer = $4
+      cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING
+      cursor = cursor.next until cursor.nil? or
+      @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
+    else
+      # Post 12/3
+      @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
     end
     @posting_id
   end
   # String, The full-html contents of the post
   def contents
     unless @contents
       @contents = user_body if html_source
-      @contents = he_decode @contents.strip if @contents
+      @contents = he_decode(@contents).strip if @contents
     end
     @contents
   end
   # String, the location of the item, as best could be parsed
   def location
-    if @location.nil? and craigslist_body and html
-      # Location (when explicitly defined):
-      cursor = craigslist_body.at 'ul' unless @location
-      # Apa section includes other things in the li's (cats/dogs ok fields)
-      cursor.children.each do |li|
-        if LOCATION.match li.inner_html
-          @location = he_decode($1) and break
-          break
+    if @location.nil? and html
+      if html.at_xpath(XPATH_BLURBS)
+        # This is the post-12/3/12 style:
+        @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c|
+          LOCATION.match c.content}
+      elsif craigslist_body
+        # Location (when explicitly defined):
+        cursor = craigslist_body.at 'ul' unless @location
+        # This is the legacy style:
+        # Note: Apa section includes other things in the li's (cats/dogs ok fields)
+        cursor.children.each do |li|
+          if LOCATION.match li.inner_html
+            @location = he_decode($1) and break
+            break
+          end
+        end if cursor
+        # Real estate listings can work a little different for location:
+        unless @location
+          cursor = craigslist_body.at 'small'
+          cursor = cursor.previous until cursor.nil? or cursor.text?
+          @location = he_decode(cursor.to_s.strip) if cursor
         end
-      end if cursor
-      # Real estate listings can work a little different for location:
-      unless @location
-        cursor = craigslist_body.at 'small'
-        cursor = cursor.previous until cursor.nil? or cursor.text?
-        @location = he_decode(cursor.to_s.strip) if cursor
+        # So, *sometimes* the location just ends up being in the header, I don't know why:
+        @location = $1 if @location.nil? and HEADER_LOCATION.match header
       end
-      # So, *sometimes* the location just ends up being in the header, I don't know why:
-      @location = $1 if @location.nil? and HEADER_LOCATION.match header
     end
     @location
   end
   # Array, urls of the post's images that are *not* hosted on craigslist
   def images
     # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
-    @images = (
-      contents ?
+    @images = (
+      contents ?
         contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
-        []
+        []
     ) unless @images
     @images
   end
@@ -163,15 +192,20 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def pics
     unless @pics
       @pics = []
-      if html and craigslist_body
-        # Now let's find the craigslist hosted images:
-        img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-        @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+      if html
+        if html.at_xpath(XPATH_PICS)
+          @pics = html.xpath(XPATH_PICS).collect(&:value)
+        elsif craigslist_body
+          # This is the pre-12/3/12 style:
+          # Now let's find the craigslist hosted images:
+          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+        end
       end
     end
     @pics
   end
@@ -180,38 +214,37 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @flagged_for_removal = (
       system_post? and header_as_plain == "This posting has been flagged for removal"
     ) if @flagged_for_removal.nil?
     @flagged_for_removal
   end
   # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
   def deleted_by_author?
     @deleted_by_author = (
       system_post? and header_as_plain == "This posting has been deleted by its author."
     ) if @deleted_by_author.nil?
     @deleted_by_author
   end
   # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
   def posting_has_expired?
     @posting_has_expired = (
       system_post? and header_as_plain == "This posting has expired."
     ) if @posting_has_expired.nil?
     @posting_has_expired
   end
   # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
   # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
   def post_date
     @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
     @post_date
   end
-  # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
+  # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
   # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
   # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
   # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
@@ -219,37 +252,31 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def label
     unless @label or system_post?
       @label = header
       @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
     end
     @label
   end
   # Array, which image types are listed for the post.
   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
   def img_types
-    unless @img_types
-      @img_types = []
-      @img_types << :img if images.length > 0
-      @img_types << :pic if pics.length > 0
-    end
-    @img_types
+    @img_types || [ (images.length > 0) ? :img : nil,
+      (pics.length > 0) ? :pic : nil ].compact
   end
-  # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+  # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
   # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
   def section
     unless @section
-      @section = full_section.last if full_section
+      @section = full_section.last if full_section
     end
     @section
   end
-  # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
+  # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
   # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
   def has_img?
     img_types.include? :img
@@ -272,50 +299,70 @@ class CraigScrape::Posting < CraigScrape::Scraper
   def price
     $1.tr('$','').to_f if label and PRICE.match label
   end
   # Returns the post contents with all html tags removed
   def contents_as_plain
     strip_html contents
   end
-  # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+  # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
   # 'system_post' we may get tags in here
   def header_as_plain
     strip_html header
   end
-  # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
+  # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
   # This returns true or false if that case applies
   def system_post?
     [contents,posting_id,post_time,title].all?{|f| f.nil?}
   end
+  # This is mostly used to determine if the post should be checked for
+  # parse errors. Might be useful for someone else though
+  def is_active_post?
+    [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
+  end
   private
-  # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
+  # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
   # return everything above the user_body
   def html_head
     @html_head = Nokogiri::HTML  $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
     # We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
     @html_head ||= html
     @html_head
   end
-  # Since we started having so many problems with Hpricot flipping out on whack content bodies,
-  # I added this to return everything south of the user_body
-  def html_footer
-    $4 if USERBODY_PARTS.match html_source
-  end
   # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
-  # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+  # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
   # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
   def user_body
-    $2 if USERBODY_PARTS.match html_source
+    if USERBODY_PARTS.match html_source
+      # This is the pre-12/3/12 style:
+      $2
+    elsif html.at_xpath(XPATH_USERBODY)
+      # There's a bunch of junk in here that we don't want, so this loop removes
+      # everything after (and including) the last script tag, from the result
+      user_body = html.xpath(XPATH_USERBODY)
+      hit_delimeter = false
+      # Since some posts don't actually have the script tag:
+      delimeter = user_body.at_xpath('script') ? :script : :comment
+      user_body.first.children.to_a.reverse.reject{ |p|
+        if hit_delimeter
+          false
+        elsif ( (delimeter == :script and p.name == 'script') or
+          (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
+          hit_delimeter = true
+        else
+          true
+        end
+      }.reverse.collect(&:to_s).join
+    end
   end
-  # Read the notes on user_body. However,  unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+  # Read the notes on user_body. However,  unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
   # So - we'll return it as a Nokogiri object.
   def craigslist_body
     Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source