libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
    
        data/lib/libcraigscrape.rb
    CHANGED
    
    | @@ -3,38 +3,34 @@ | |
| 3 3 | 
             
            # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
         | 
| 4 4 | 
             
            #
         | 
| 5 5 | 
             
            require 'rubygems'
         | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
            gem 'nokogiri',      '~> 1.4.4'
         | 
| 9 | 
            -
            gem 'htmlentities',  '~> 4.0.0'
         | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
            require 'net/http'
         | 
| 13 | 
            -
            require 'zlib'
         | 
| 14 | 
            -
            require 'nokogiri'
         | 
| 6 | 
            +
            require 'time'
         | 
| 7 | 
            +
            require 'uri'
         | 
| 15 8 | 
             
            require 'htmlentities'
         | 
| 16 | 
            -
            require 'active_support'
         | 
| 17 | 
            -
             | 
| 9 | 
            +
            require 'active_support/core_ext/class/attribute_accessors'
         | 
| 10 | 
            +
            require 'active_support/core_ext/time/calculations'
         | 
| 11 | 
            +
            require 'htmlentities'
         | 
| 12 | 
            +
            require 'nokogiri'
         | 
| 13 | 
            +
            require 'typhoeus'
         | 
| 14 | 
            +
            require 'money'
         | 
| 18 15 |  | 
| 19 | 
            -
            # A base class encapsulating the various libcraigscrape objects, and providing most of the | 
| 20 | 
            -
            # craigslist interaction methods. Currently, we're supporting the old Class methods | 
| 16 | 
            +
            # A base class encapsulating the various libcraigscrape objects, and providing most of the
         | 
| 17 | 
            +
            # craigslist interaction methods. Currently, we're supporting the old Class methods
         | 
| 21 18 | 
             
            # in a legacy-compatibility mode, but these methods are marked for deprecation. Instead,
         | 
| 22 19 | 
             
            # create an instance of the Craigslist object, and use its Public Instance methods.
         | 
| 23 20 | 
             
            # See the README for easy to follow examples.
         | 
| 24 21 |  | 
| 25 22 | 
             
            class CraigScrape
         | 
| 26 | 
            -
              cattr_accessor :time_now
         | 
| 27 23 | 
             
              cattr_accessor :site_to_url_prefix
         | 
| 28 | 
            -
             | 
| 24 | 
            +
             | 
| 29 25 | 
             
              #--
         | 
| 30 26 | 
             
              # NOTE:
         | 
| 31 | 
            -
              # The only reason I took this out is b/c I might want to test with a file:// | 
| 27 | 
            +
              # The only reason I took this out is b/c I might want to test with a file://
         | 
| 32 28 | 
             
              # prefix at some point
         | 
| 33 29 | 
             
              #++
         | 
| 34 30 | 
             
              self.site_to_url_prefix = 'http://'
         | 
| 35 31 |  | 
| 36 | 
            -
             | 
| 37 | 
            -
              # Takes a variable number of site/path specifiers (strings) as an argument. | 
| 32 | 
            +
             | 
| 33 | 
            +
              # Takes a variable number of site/path specifiers (strings) as an argument.
         | 
| 38 34 | 
             
              # This list gets flattened and passed to CraigScrape::GeoListings.find_sites .
         | 
| 39 35 | 
             
              # See that method's rdoc for a complete set of rules on what arguments are allowed here.
         | 
| 40 36 | 
             
              def initialize(*args)
         | 
| @@ -44,50 +40,50 @@ class CraigScrape | |
| 44 40 | 
             
              # Returns which sites are included in any operations performed by this object. This is directly
         | 
| 45 41 | 
             
              # ascertained from the initial constructor's spec-list
         | 
| 46 42 | 
             
              def sites
         | 
| 47 | 
            -
                @sites ||= GeoListings.find_sites @sites_specs | 
| 43 | 
            +
                @sites ||= GeoListings.find_sites @sites_specs
         | 
| 48 44 | 
             
                @sites
         | 
| 49 45 | 
             
              end
         | 
| 50 | 
            -
             | 
| 46 | 
            +
             | 
| 51 47 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 52 | 
            -
              # constructor with the provided url-path fragments. | 
| 48 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 53 49 | 
             
              #
         | 
| 54 50 | 
             
              # Passes the <b>first page listing</b> of each of these urls to the provided block.
         | 
| 55 51 | 
             
              def each_listing(*fragments)
         | 
| 56 52 | 
             
                listing_urls_for(fragments).each{|url| yield Listings.new(url) }
         | 
| 57 53 | 
             
              end
         | 
| 58 | 
            -
             | 
| 54 | 
            +
             | 
| 59 55 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 60 | 
            -
              # constructor with the provided url-path fragments. | 
| 56 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 61 57 | 
             
              #
         | 
| 62 58 | 
             
              # Passes <b>each page on every listing</b> for the passed URLs to the provided block.
         | 
| 63 59 | 
             
              def each_page_in_each_listing(*fragments)
         | 
| 64 60 | 
             
                each_listing(*fragments) do |listing|
         | 
| 65 61 | 
             
                  while listing
         | 
| 66 62 | 
             
                    yield listing
         | 
| 67 | 
            -
                    listing = listing.next_page | 
| 63 | 
            +
                    listing = listing.next_page
         | 
| 68 64 | 
             
                  end
         | 
| 69 65 | 
             
                end
         | 
| 70 66 | 
             
              end
         | 
| 71 | 
            -
             | 
| 67 | 
            +
             | 
| 72 68 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 73 | 
            -
              # constructor with the provided url-path fragments. | 
| 69 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 74 70 | 
             
              #
         | 
| 75 71 | 
             
              # Returns the <b>first page listing</b> of each of these urls, as an array.
         | 
| 76 72 | 
             
              def listings(*fragments)
         | 
| 77 73 | 
             
                listing_urls_for(fragments).collect{|url| Listings.new url }
         | 
| 78 74 | 
             
              end
         | 
| 79 | 
            -
             | 
| 75 | 
            +
             | 
| 80 76 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 81 | 
            -
              # constructor with the provided url-path fragments. | 
| 77 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 82 78 | 
             
              #
         | 
| 83 79 | 
             
              # Passes all posts from each of these urls to the provided block, in the order they're parsed
         | 
| 84 80 | 
             
              # (for each listing, newest posts are returned first).
         | 
| 85 81 | 
             
              def each_post(*fragments)
         | 
| 86 82 | 
             
                each_page_in_each_listing(*fragments){ |l| l.posts.each{|p| yield p} }
         | 
| 87 83 | 
             
              end
         | 
| 88 | 
            -
             | 
| 84 | 
            +
             | 
| 89 85 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 90 | 
            -
              # constructor with the provided url-path fragments. | 
| 86 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 91 87 | 
             
              #
         | 
| 92 88 | 
             
              # Returns all posts from each of these urls, in the order they're parsed
         | 
| 93 89 | 
             
              # (newest posts first).
         | 
| @@ -96,24 +92,32 @@ class CraigScrape | |
| 96 92 | 
             
                each_page_in_each_listing(*fragments){ |l| ret += l.posts }
         | 
| 97 93 | 
             
                ret
         | 
| 98 94 | 
             
              end
         | 
| 99 | 
            -
             | 
| 95 | 
            +
             | 
| 100 96 | 
             
              # Determines all listings which can be construed by combining the sites specified in the object
         | 
| 101 | 
            -
              # constructor with the provided url-path fragments. | 
| 97 | 
            +
              # constructor with the provided url-path fragments.
         | 
| 102 98 | 
             
              #
         | 
| 103 | 
            -
              # Returns all posts from each of these urls, which are newer than the provider 'newer_then' date.
         | 
| 99 | 
            +
              # Returns all posts from each of these urls, which are newer than (or equal to) the provided 'newer_then' date.
         | 
| 104 100 | 
             
              # (Returns 'newest' posts first).
         | 
| 101 | 
            +
              #
         | 
| 102 | 
            +
              # NOTE: New to version 1.1, if newer_then is a date, we compare to the post_date
         | 
| 103 | 
            +
              # if newer_then is a Time, we compare to post_time. Be aware that post_time 
         | 
| 104 | 
            +
              # requires the entire post be loaded, and not just the summary - which will
         | 
| 105 | 
            +
              # take longer to download.
         | 
| 105 106 | 
             
              def posts_since(newer_then, *fragments)
         | 
| 107 | 
            +
                accessor =  (newer_then.kind_of? Date) ? :post_date : :post_time
         | 
| 106 108 | 
             
                ret = []
         | 
| 107 109 | 
             
                fragments.each do |frag|
         | 
| 108 110 | 
             
                  each_post(frag) do |p|
         | 
| 109 | 
            -
                     | 
| 111 | 
            +
                    # We have to try the comparison, since post_time could conceivably be nil 
         | 
| 112 | 
            +
                    # for the case of a system_post?
         | 
| 113 | 
            +
                    break if p.send(accessor).try(:<=, newer_then)
         | 
| 110 114 | 
             
                    ret << p
         | 
| 111 115 | 
             
                  end
         | 
| 112 116 | 
             
                end
         | 
| 113 117 |  | 
| 114 | 
            -
                ret | 
| 118 | 
            +
                ret
         | 
| 115 119 | 
             
              end
         | 
| 116 | 
            -
             | 
| 120 | 
            +
             | 
| 117 121 | 
             
              class << self # Class methods
         | 
| 118 122 |  | 
| 119 123 | 
             
                #--
         | 
| @@ -122,11 +126,11 @@ class CraigScrape | |
| 122 126 | 
             
                #++
         | 
| 123 127 |  | 
| 124 128 | 
             
                # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
         | 
| 125 | 
            -
                # Instead, consider using CraigScrape::Listings.new | 
| 129 | 
            +
                # Instead, consider using CraigScrape::Listings.new
         | 
| 126 130 | 
             
                #
         | 
| 127 | 
            -
                # Scrapes a single listing url and returns a Listings object representing the contents. | 
| 131 | 
            +
                # Scrapes a single listing url and returns a Listings object representing the contents.
         | 
| 128 132 | 
             
                # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Listings.new "listing_url" does the same thing
         | 
| 129 | 
            -
                def scrape_listing(listing_url) | 
| 133 | 
            +
                def scrape_listing(listing_url)
         | 
| 130 134 | 
             
                  CraigScrape::Listings.new listing_url
         | 
| 131 135 | 
             
                end
         | 
| 132 136 |  | 
| @@ -137,24 +141,24 @@ class CraigScrape | |
| 137 141 | 
             
                # until there's no more 'next page' links available to click on
         | 
| 138 142 | 
             
                def scrape_until(listing_url, &post_condition)
         | 
| 139 143 | 
             
                  ret = []
         | 
| 140 | 
            -
             | 
| 144 | 
            +
             | 
| 141 145 | 
             
                  listings = CraigScrape::Listings.new listing_url
         | 
| 142 146 | 
             
                  catch "ScrapeBreak" do
         | 
| 143 | 
            -
                    while listings do | 
| 147 | 
            +
                    while listings do
         | 
| 144 148 | 
             
                      listings.posts.each do |post|
         | 
| 145 149 | 
             
                        throw "ScrapeBreak" if post_condition.call(post)
         | 
| 146 150 | 
             
                        ret << post
         | 
| 147 151 | 
             
                      end
         | 
| 148 | 
            -
             | 
| 152 | 
            +
             | 
| 149 153 | 
             
                      listings = listings.next_page
         | 
| 150 154 | 
             
                    end
         | 
| 151 155 | 
             
                  end
         | 
| 152 | 
            -
             | 
| 156 | 
            +
             | 
| 153 157 | 
             
                  ret
         | 
| 154 158 | 
             
                end
         | 
| 155 159 |  | 
| 156 160 | 
             
                # <b>This method is for legacy compatibility and is not recommended for use by new projects.</b>
         | 
| 157 | 
            -
                # Instead, consider using CraigScrape::Posting.new | 
| 161 | 
            +
                # Instead, consider using CraigScrape::Posting.new
         | 
| 158 162 | 
             
                #
         | 
| 159 163 | 
             
                # Scrapes a single Post Url, and returns a Posting object representing its contents.
         | 
| 160 164 | 
             
                # Mostly here to preserve backwards-compatibility with the older api, CraigScrape::Posting.new "post_url" does the same thing
         | 
| @@ -176,7 +180,7 @@ class CraigScrape | |
| 176 180 | 
             
                # Instead, consider using the CraigScrape::posts_since method.
         | 
| 177 181 | 
             
                #
         | 
| 178 182 | 
             
                # Continually scrapes listings, until the date newer_then has been reached, or no more 'next page' links are available to be clicked on.
         | 
| 179 | 
            -
                # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries. | 
| 183 | 
            +
                # Returns an array of PostSummary objects. Dates are based on the Month/Day 'datestamps' reported in the listing summaries.
         | 
| 180 184 | 
             
                # As such, time-based cutoffs are not supported here. The scrape_until method, utilizing the SummaryPost.full_post method could achieve
         | 
| 181 185 | 
             
                # time-based cutoffs, at the expense of retrieving every post in full during enumerations.
         | 
| 182 186 | 
             
                #
         | 
| @@ -185,9 +189,9 @@ class CraigScrape | |
| 185 189 | 
             
                  self.scrape_until(listing_url) {|post| post.post_date <= newer_then}
         | 
| 186 190 | 
             
                end
         | 
| 187 191 | 
             
              end
         | 
| 188 | 
            -
             | 
| 192 | 
            +
             | 
| 189 193 | 
             
              private
         | 
| 190 | 
            -
             | 
| 194 | 
            +
             | 
| 191 195 | 
             
              # This takes a fragments parameter, and turns it into actual urls
         | 
| 192 196 | 
             
              def listing_urls_for(listing_fragments)
         | 
| 193 197 | 
             
                listing_fragments.collect{ |lf|
         | 
| @@ -198,20 +202,9 @@ class CraigScrape | |
| 198 202 | 
             
                  sites.collect { |site| '%s%s/%s' % [site_to_url_prefix,site,lf] }
         | 
| 199 203 | 
             
                }.flatten
         | 
| 200 204 | 
             
              end
         | 
| 201 | 
            -
                
         | 
| 202 | 
            -
              # Returns the most recentlt expired  time for the provided month and day
         | 
| 203 | 
            -
              def self.most_recently_expired_time(month, day)  #:nodoc:
         | 
| 204 | 
            -
                now = (time_now) ? time_now : Time.now
         | 
| 205 | 
            -
             | 
| 206 | 
            -
                # This ensures we always generate a time in the past, by guessing the year and subtracting one if we guessed wrong
         | 
| 207 | 
            -
                ret = Time.local now.year, month, day
         | 
| 208 | 
            -
                ret = Time.local now.year-1, month, day if ret > now 
         | 
| 209 | 
            -
                
         | 
| 210 | 
            -
                ret
         | 
| 211 | 
            -
              end
         | 
| 212 205 |  | 
| 213 206 | 
             
            end
         | 
| 214 207 |  | 
| 215 208 | 
             
            require 'listings'
         | 
| 216 209 | 
             
            require 'posting'
         | 
| 217 | 
            -
            require 'geo_listings'
         | 
| 210 | 
            +
            require 'geo_listings'
         | 
    
        data/lib/listings.rb
    CHANGED
    
    | @@ -13,7 +13,13 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 13 13 | 
             
              IMG_TYPE       = /^[ ]*(.+)[ ]*$/
         | 
| 14 14 | 
             
              HEADER_DATE    = /^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i
         | 
| 15 15 | 
             
              SUMMARY_DATE   = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
         | 
| 16 | 
            -
              NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
         | 
| 16 | 
            +
              NEXT_PAGE_LINK = /^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/
         | 
| 17 | 
            +
             | 
| 18 | 
            +
              XPATH_POST_DATE = "*[@class='itemdate']"
         | 
| 19 | 
            +
              XPATH_POST_IMGPIC = "*[@class='itempx']/*[@class='p']"
         | 
| 20 | 
            +
              XPATH_PAGENAV_LINKS = "//*[@class='ban']//a"
         | 
| 21 | 
            +
              # There's a couple places that the price hangs out. We search in this order
         | 
| 22 | 
            +
              XPATHS_POST_PRICE = ["*[@class='itempp']", "*[@class='itemph']"]
         | 
| 17 23 |  | 
| 18 24 | 
             
              # Array, PostSummary objects found in the listing
         | 
| 19 25 | 
             
              def posts
         | 
| @@ -35,7 +41,7 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 35 41 | 
             
                  post_tags.each do |el|
         | 
| 36 42 | 
             
                    case el.name
         | 
| 37 43 | 
             
                      when 'p'
         | 
| 38 | 
            -
                       post_summary =  | 
| 44 | 
            +
                       post_summary = parse_summary el, current_date
         | 
| 39 45 |  | 
| 40 46 | 
             
                       # Validate that required fields are present:
         | 
| 41 47 | 
             
                       parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
         | 
| @@ -47,7 +53,7 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 47 53 | 
             
                      # Let's make sense of the h4 tag, and then read all the p tags below it
         | 
| 48 54 | 
             
                      if HEADER_DATE.match he_decode(el.inner_html)
         | 
| 49 55 | 
             
                        # Generally, the H4 tags contain valid dates. When they do - this is easy:
         | 
| 50 | 
            -
                        current_date =  | 
| 56 | 
            +
                        current_date = Date.parse [$1, $2].join('/')
         | 
| 51 57 | 
             
                      elsif html.at('h4:last-of-type') == el
         | 
| 52 58 | 
             
                        # There's a specific bug in craigslist, where these nonsense h4's just appear without anything relevant inside them.
         | 
| 53 59 | 
             
                        # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page, 
         | 
| @@ -65,29 +71,37 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 65 71 | 
             
              # String, URL Path href-fragment of the next page link
         | 
| 66 72 | 
             
              def next_page_href
         | 
| 67 73 | 
             
                unless @next_page_href
         | 
| 68 | 
            -
             | 
| 69 | 
            -
                  
         | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
                   | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
                     | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
                    #  | 
| 86 | 
            -
                     | 
| 74 | 
            +
                 
         | 
| 75 | 
            +
                  if html.at_xpath(XPATH_PAGENAV_LINKS)
         | 
| 76 | 
            +
                    # Post 12/3
         | 
| 77 | 
            +
                    next_link = html.xpath(XPATH_PAGENAV_LINKS).find{|link| NEXT_PAGE_LINK.match link.content}
         | 
| 78 | 
            +
                    @next_page_href = next_link[:href] if next_link
         | 
| 79 | 
            +
                  else 
         | 
| 80 | 
            +
                    # Old style
         | 
| 81 | 
            +
                    cursor = html.at 'p:last-of-type'
         | 
| 82 | 
            +
                    
         | 
| 83 | 
            +
                    cursor = cursor.at 'a' if cursor
         | 
| 84 | 
            +
                    
         | 
| 85 | 
            +
                    # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag 
         | 
| 86 | 
            +
                    next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                    # Search listings put their next page in a link towards the top
         | 
| 89 | 
            +
                    next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == '<b>Next>></b>' } unless next_link
         | 
| 90 | 
            +
                            
         | 
| 91 | 
            +
                    # Some search pages have a bug, whereby a 'next page' link isn't displayed,
         | 
| 92 | 
            +
                    # even though we can see that there's another page listed in the page-number links block at the top
         | 
| 93 | 
            +
                    # and bottom of the listing page
         | 
| 94 | 
            +
                    unless next_link
         | 
| 95 | 
            +
                      cursor = html % 'div.sh:first-of-type > b:last-of-type'
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                      # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
         | 
| 98 | 
            +
                      # We're looking good.
         | 
| 99 | 
            +
                      next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
         | 
| 100 | 
            +
                    end
         | 
| 101 | 
            +
                    
         | 
| 102 | 
            +
                    # We have an anchor tag - so - let's assign the href:
         | 
| 103 | 
            +
                    @next_page_href = next_link[:href] if next_link
         | 
| 87 104 | 
             
                  end
         | 
| 88 | 
            -
                  
         | 
| 89 | 
            -
                  # We have an anchor tag - so - let's assign the href:
         | 
| 90 | 
            -
                  @next_page_href = next_link[:href] if next_link
         | 
| 91 105 | 
             
                end
         | 
| 92 106 |  | 
| 93 107 | 
             
                @next_page_href
         | 
| @@ -100,12 +114,14 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 100 114 |  | 
| 101 115 | 
             
              # Returns a Listings object of the next_page_url on the current listings object
         | 
| 102 116 | 
             
              def next_page
         | 
| 103 | 
            -
                CraigScrape::Listings.new next_page_url if next_page_url
         | 
| 117 | 
            +
                CraigScrape::Listings.new URI.encode(next_page_url) if next_page_url
         | 
| 104 118 | 
             
              end
         | 
| 105 | 
            -
             | 
| 119 | 
            +
             
         | 
| 120 | 
            +
              private
         | 
| 121 | 
            +
             | 
| 106 122 | 
             
              # Takes a paragraph element and returns a mostly-parsed Posting
         | 
| 107 123 | 
             
              # We separate this from the rest of the parsing both for readability and ease of testing
         | 
| 108 | 
            -
              def  | 
| 124 | 
            +
              def parse_summary(p_element, date = nil)  #:nodoc:
         | 
| 109 125 | 
             
                ret = {}
         | 
| 110 126 |  | 
| 111 127 | 
             
                title_anchor   = nil
         | 
| @@ -126,26 +142,45 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 126 142 | 
             
                end
         | 
| 127 143 |  | 
| 128 144 | 
             
                location_tag = p_element.at 'font'
         | 
| 129 | 
            -
                has_pic_tag = p_element.at 'span'
         | 
| 130 145 |  | 
| 131 146 | 
             
                href = nil
         | 
| 132 147 |  | 
| 133 148 | 
             
                location = he_decode p_element.at('font').inner_html if location_tag
         | 
| 134 149 | 
             
                ret[:location] = $1 if location and LOCATION.match location
         | 
| 135 150 |  | 
| 136 | 
            -
                 | 
| 137 | 
            -
             | 
| 138 | 
            -
                   | 
| 139 | 
            -
             | 
| 151 | 
            +
                price_path = XPATHS_POST_PRICE.find{|path| 
         | 
| 152 | 
            +
                  content = p_element.at_xpath(path).try(:content)
         | 
| 153 | 
            +
                  (!content.nil? and !content.empty?)
         | 
| 154 | 
            +
                }
         | 
| 155 | 
            +
                ret[:price] = Money.new($1.to_i * 100, 'USD') if price_path and 
         | 
| 156 | 
            +
                  /\$([\d]+)/.match(p_element.at_xpath(price_path).content) 
         | 
| 140 157 |  | 
| 141 | 
            -
             | 
| 158 | 
            +
                ret[:img_types] = []
         | 
| 159 | 
            +
                if p_element.at_xpath XPATH_POST_IMGPIC
         | 
| 160 | 
            +
                  # Post 12/3
         | 
| 161 | 
            +
                  ret[:img_types] = p_element.at_xpath(XPATH_POST_IMGPIC).content.scan(/\w+/).collect(&:to_sym)
         | 
| 162 | 
            +
                else
         | 
| 163 | 
            +
                  # Old style:
         | 
| 164 | 
            +
                  has_pic_tag = p_element.at 'span'
         | 
| 165 | 
            +
                  if has_pic_tag
         | 
| 166 | 
            +
                    img_type = he_decode has_pic_tag.inner_html
         | 
| 167 | 
            +
                    img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                    ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
         | 
| 170 | 
            +
                  end
         | 
| 142 171 | 
             
                end
         | 
| 143 172 |  | 
| 144 | 
            -
                ret[:section] = he_decode(section_anchor.inner_html) | 
| 145 | 
            -
             | 
| 173 | 
            +
                ret[:section] = he_decode(section_anchor.inner_html) if section_anchor
         | 
| 174 | 
            +
               
         | 
| 146 175 | 
             
                ret[:post_date] = date
         | 
| 147 | 
            -
                if  | 
| 148 | 
            -
                   | 
| 176 | 
            +
                if p_element.at_xpath(XPATH_POST_DATE)
         | 
| 177 | 
            +
                  # Post 12/3
         | 
| 178 | 
            +
                  if /\A([^ ]+) ([\d]+)\Z/.match p_element.at_xpath(XPATH_POST_DATE).content.strip
         | 
| 179 | 
            +
                    ret[:post_date] = Date.parse [$1, $2].join('/')
         | 
| 180 | 
            +
                  end
         | 
| 181 | 
            +
                elsif SUMMARY_DATE.match he_decode(p_element.children[0])
         | 
| 182 | 
            +
                  # Old style
         | 
| 183 | 
            +
                    ret[:post_date] = Date.parse [$1, $2].join('/')
         | 
| 149 184 | 
             
                end
         | 
| 150 185 |  | 
| 151 186 | 
             
                if title_anchor
         | 
| @@ -157,4 +192,5 @@ class CraigScrape::Listings < CraigScrape::Scraper | |
| 157 192 |  | 
| 158 193 | 
             
                ret
         | 
| 159 194 | 
             
              end
         | 
| 160 | 
            -
             | 
| 195 | 
            +
             | 
| 196 | 
            +
            end
         |