libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
    
        data/lib/posting.rb
    CHANGED
    
    | @@ -14,14 +14,25 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 14 14 |  | 
| 15 15 | 
             
              POST_DATE       = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
         | 
| 16 16 | 
             
              LOCATION        = /Location\:[ ]+(.+)/
         | 
| 17 | 
            -
              HEADER_LOCATION =  | 
| 18 | 
            -
              POSTING_ID      = /PostingID\:[ ] | 
| 17 | 
            +
              HEADER_LOCATION = /\((.+)\)$/
         | 
| 18 | 
            +
              POSTING_ID      = /PostingID\:[ ]*([\d]+)/
         | 
| 19 19 | 
             
              REPLY_TO        = /(.+)/
         | 
| 20 20 | 
             
              PRICE           = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
         | 
| 21 | 
            +
               
         | 
| 22 | 
            +
              # NOTE: we implement the (?:) to first check the 'old' style format, and then the 'new style'
         | 
| 23 | 
            +
              # (As of 12/03's parse changes)
         | 
| 21 24 | 
             
              USERBODY_PARTS  = /^(.+)\<div id\=\"userbody\">(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>(.+)$/m
         | 
| 22 25 | 
             
              HTML_HEADER     = /^(.+)\<div id\=\"userbody\">/m
         | 
| 23 26 | 
             
              IMAGE_SRC       = /\<im[a]?g[e]?[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
         | 
| 24 27 |  | 
| 28 | 
            +
              # This is used to determine if there's a parse error
         | 
| 29 | 
            +
              REQUIRED_FIELDS = %w(contents posting_id post_time header title full_section)
         | 
| 30 | 
            +
             | 
| 31 | 
            +
              XPATH_USERBODY = "//*[@id='userbody']"
         | 
| 32 | 
            +
              XPATH_BLURBS = "//ul[@class='blurbs']"
         | 
| 33 | 
            +
              XPATH_PICS = "//*[@class='tn']/a/@href"
         | 
| 34 | 
            +
              XPATH_REPLY_TO = "//*[@class='dateReplyBar']/small/a"
         | 
| 35 | 
            +
             | 
| 25 36 | 
             
              # This is really just for testing, in production use, uri.path is a better solution
         | 
| 26 37 | 
             
              attr_reader :href #:nodoc:
         | 
| 27 38 |  | 
| @@ -30,14 +41,14 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 30 41 | 
             
                super(*args)
         | 
| 31 42 |  | 
| 32 43 | 
             
                # Validate that required fields are present, at least - if we've downloaded it from a url
         | 
| 33 | 
            -
                 | 
| 34 | 
            -
                   | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
                   | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 44 | 
            +
                if args.first.kind_of? String and is_active_post?
         | 
| 45 | 
            +
                  unparsed_fields = REQUIRED_FIELDS.find_all{|f| 
         | 
| 46 | 
            +
                    val = send(f)
         | 
| 47 | 
            +
                    val.nil? or (val.respond_to? :length and val.length == 0)
         | 
| 48 | 
            +
                  } 
         | 
| 49 | 
            +
                  parse_error! unparsed_fields unless unparsed_fields.empty?
         | 
| 50 | 
            +
                end  
         | 
| 51 | 
            +
             | 
| 41 52 | 
             
              end
         | 
| 42 53 |  | 
| 43 54 |  | 
| @@ -67,7 +78,7 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 67 78 | 
             
                unless @full_section
         | 
| 68 79 | 
             
                  @full_section = []
         | 
| 69 80 |  | 
| 70 | 
            -
                  (html_head/" | 
| 81 | 
            +
                  (html_head / "*[@class='bchead']//a").each do |a|
         | 
| 71 82 | 
             
                    @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
         | 
| 72 83 | 
             
                  end if html_head
         | 
| 73 84 | 
             
                end
         | 
| @@ -78,9 +89,13 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 78 89 | 
             
              # String, represents the post's reply-to address, if listed
         | 
| 79 90 | 
             
              def reply_to
         | 
| 80 91 | 
             
                unless @reply_to
         | 
| 81 | 
            -
                   | 
| 82 | 
            -
             | 
| 83 | 
            -
                   | 
| 92 | 
            +
                  if html.at_xpath(XPATH_REPLY_TO)
         | 
| 93 | 
            +
                    @reply_to = html.at_xpath(XPATH_REPLY_TO).content
         | 
| 94 | 
            +
                  else
         | 
| 95 | 
            +
                    cursor = html_head.at 'hr' if html_head
         | 
| 96 | 
            +
                    cursor = cursor.next until cursor.nil? or cursor.name == 'a'
         | 
| 97 | 
            +
                    @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
         | 
| 98 | 
            +
                  end
         | 
| 84 99 | 
             
                end
         | 
| 85 100 |  | 
| 86 101 | 
             
                @reply_to
         | 
| @@ -91,7 +106,7 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 91 106 | 
             
                unless @post_time
         | 
| 92 107 | 
             
                  cursor = html_head.at 'hr' if html_head
         | 
| 93 108 | 
             
                  cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
         | 
| 94 | 
            -
                  @post_time =  | 
| 109 | 
            +
                  @post_time = DateTime.parse($1) if $1
         | 
| 95 110 | 
             
                end
         | 
| 96 111 |  | 
| 97 112 | 
             
                @post_time
         | 
| @@ -99,10 +114,17 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 99 114 |  | 
| 100 115 | 
             
              # Integer, Craigslist's unique posting id
         | 
| 101 116 | 
             
              def posting_id
         | 
| 102 | 
            -
                 | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
                   | 
| 117 | 
            +
                if @posting_id 
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                elsif USERBODY_PARTS.match html_source
         | 
| 120 | 
            +
                  # Old style:
         | 
| 121 | 
            +
                  html_footer = $4
         | 
| 122 | 
            +
                  cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING 
         | 
| 123 | 
            +
                  cursor = cursor.next until cursor.nil? or 
         | 
| 124 | 
            +
                  @posting_id = $1.to_i if POSTING_ID.match html_footer.to_s
         | 
| 125 | 
            +
                else
         | 
| 126 | 
            +
                  # Post 12/3
         | 
| 127 | 
            +
                  @posting_id = $1.to_i if POSTING_ID.match html.xpath("//*[@class='postingidtext']").to_s
         | 
| 106 128 | 
             
                end
         | 
| 107 129 |  | 
| 108 130 | 
             
                @posting_id
         | 
| @@ -112,7 +134,7 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 112 134 | 
             
              def contents
         | 
| 113 135 | 
             
                unless @contents
         | 
| 114 136 | 
             
                  @contents = user_body if html_source
         | 
| 115 | 
            -
                  @contents = he_decode | 
| 137 | 
            +
                  @contents = he_decode(@contents).strip if @contents
         | 
| 116 138 | 
             
                end
         | 
| 117 139 |  | 
| 118 140 | 
             
                @contents
         | 
| @@ -120,27 +142,40 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 120 142 |  | 
| 121 143 | 
             
              # String, the location of the item, as best could be parsed
         | 
| 122 144 | 
             
              def location
         | 
| 123 | 
            -
                if @location.nil? and  | 
| 124 | 
            -
             | 
| 125 | 
            -
                   | 
| 126 | 
            -
             | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
                    if  | 
| 130 | 
            -
                       | 
| 131 | 
            -
                      break
         | 
| 132 | 
            -
                    end
         | 
| 133 | 
            -
                  end if cursor
         | 
| 145 | 
            +
                if @location.nil? and html
         | 
| 146 | 
            +
                 
         | 
| 147 | 
            +
                  if html.at_xpath(XPATH_BLURBS)
         | 
| 148 | 
            +
                    # This is the post-12/3/12 style:
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    # Sometimes the Location is in the body :
         | 
| 151 | 
            +
                    @location = $1 if html.xpath(XPATH_BLURBS).first.children.any?{|c| 
         | 
| 152 | 
            +
                      LOCATION.match c.content}
         | 
| 134 153 |  | 
| 135 | 
            -
                   | 
| 136 | 
            -
             | 
| 137 | 
            -
                    cursor = craigslist_body.at ' | 
| 138 | 
            -
             | 
| 154 | 
            +
                  elsif craigslist_body
         | 
| 155 | 
            +
                    # Location (when explicitly defined):
         | 
| 156 | 
            +
                    cursor = craigslist_body.at 'ul' unless @location
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    # This is the legacy style:
         | 
| 159 | 
            +
                    # Note: Apa section includes other things in the li's (cats/dogs ok fields)
         | 
| 160 | 
            +
                    cursor.children.each do |li|
         | 
| 161 | 
            +
                      if LOCATION.match li.inner_html
         | 
| 162 | 
            +
                        @location = he_decode($1) and break
         | 
| 163 | 
            +
                        break
         | 
| 164 | 
            +
                      end
         | 
| 165 | 
            +
                    end if cursor
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                    # Real estate listings can work a little different for location:
         | 
| 168 | 
            +
                    unless @location
         | 
| 169 | 
            +
                      cursor = craigslist_body.at 'small'
         | 
| 170 | 
            +
                      cursor = cursor.previous until cursor.nil? or cursor.text?
         | 
| 171 | 
            +
                      
         | 
| 172 | 
            +
                      @location = he_decode(cursor.to_s.strip) if cursor
         | 
| 173 | 
            +
                    end
         | 
| 139 174 |  | 
| 140 | 
            -
                    @location = he_decode(cursor.to_s.strip) if cursor
         | 
| 141 175 | 
             
                  end
         | 
| 142 176 |  | 
| 143 | 
            -
                  # So, *sometimes* the location just ends up being in the header, I don't know why | 
| 177 | 
            +
                  # So, *sometimes* the location just ends up being in the header, I don't know why.
         | 
| 178 | 
            +
                  # This happens on old-style and new-style posts:
         | 
| 144 179 | 
             
                  @location = $1 if @location.nil? and HEADER_LOCATION.match header
         | 
| 145 180 | 
             
                end
         | 
| 146 181 |  | 
| @@ -164,11 +199,16 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 164 199 | 
             
                unless @pics
         | 
| 165 200 | 
             
                  @pics = []
         | 
| 166 201 |  | 
| 167 | 
            -
                  if html  | 
| 168 | 
            -
                     | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 202 | 
            +
                  if html 
         | 
| 203 | 
            +
                    if html.at_xpath(XPATH_PICS)
         | 
| 204 | 
            +
                      @pics = html.xpath(XPATH_PICS).collect(&:value)
         | 
| 205 | 
            +
                    elsif craigslist_body
         | 
| 206 | 
            +
                      # This is the pre-12/3/12 style:
         | 
| 207 | 
            +
                      # Now let's find the craigslist hosted images:
         | 
| 208 | 
            +
                      img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
         | 
| 209 | 
            +
                    
         | 
| 210 | 
            +
                      @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
         | 
| 211 | 
            +
                    end
         | 
| 172 212 | 
             
                  end
         | 
| 173 213 | 
             
                end
         | 
| 174 214 |  | 
| @@ -202,11 +242,10 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 202 242 | 
             
                @posting_has_expired
         | 
| 203 243 | 
             
              end
         | 
| 204 244 |  | 
| 205 | 
            -
              
         | 
| 206 245 | 
             
              # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
         | 
| 207 246 | 
             
              # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
         | 
| 208 247 | 
             
              def post_date
         | 
| 209 | 
            -
                @post_date =  | 
| 248 | 
            +
                @post_date = post_time.to_date unless @post_date or post_time.nil?
         | 
| 210 249 |  | 
| 211 250 | 
             
                @post_date
         | 
| 212 251 | 
             
              end
         | 
| @@ -229,14 +268,8 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 229 268 | 
             
              # Array, which image types are listed for the post.
         | 
| 230 269 | 
             
              # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
         | 
| 231 270 | 
             
              def img_types
         | 
| 232 | 
            -
                 | 
| 233 | 
            -
                   | 
| 234 | 
            -
                  
         | 
| 235 | 
            -
                  @img_types << :img if images.length > 0
         | 
| 236 | 
            -
                  @img_types << :pic if pics.length > 0
         | 
| 237 | 
            -
                end
         | 
| 238 | 
            -
                
         | 
| 239 | 
            -
                @img_types
         | 
| 271 | 
            +
                @img_types || [ (images.length > 0) ? :img : nil, 
         | 
| 272 | 
            +
                  (pics.length > 0) ? :pic : nil ].compact
         | 
| 240 273 | 
             
              end
         | 
| 241 274 |  | 
| 242 275 | 
             
              # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However, 
         | 
| @@ -270,7 +303,11 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 270 303 | 
             
              # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
         | 
| 271 304 | 
             
              # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
         | 
| 272 305 | 
             
              def price
         | 
| 273 | 
            -
                 | 
| 306 | 
            +
                unless @price
         | 
| 307 | 
            +
                  (header and PRICE.match label) ? 
         | 
| 308 | 
            +
                    @price = Money.new($1.tr('$','').to_i*100, 'USD') : nil
         | 
| 309 | 
            +
                end
         | 
| 310 | 
            +
                @price
         | 
| 274 311 | 
             
              end
         | 
| 275 312 |  | 
| 276 313 | 
             
              # Returns the post contents with all html tags removed
         | 
| @@ -290,6 +327,12 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 290 327 | 
             
                [contents,posting_id,post_time,title].all?{|f| f.nil?}
         | 
| 291 328 | 
             
              end
         | 
| 292 329 |  | 
| 330 | 
            +
              # This is mostly used to determine if the post should be checked for
         | 
| 331 | 
            +
              # parse errors. Might be useful for someone else though
         | 
| 332 | 
            +
              def is_active_post?
         | 
| 333 | 
            +
                [flagged_for_removal?, posting_has_expired?, deleted_by_author?].none?
         | 
| 334 | 
            +
              end 
         | 
| 335 | 
            +
             | 
| 293 336 | 
             
              private
         | 
| 294 337 |  | 
| 295 338 | 
             
              # I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we 
         | 
| @@ -302,17 +345,31 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 302 345 | 
             
                @html_head
         | 
| 303 346 | 
             
              end
         | 
| 304 347 |  | 
| 305 | 
            -
              # Since we started having so many problems with Hpricot flipping out on whack content bodies, 
         | 
| 306 | 
            -
              # I added this to return everything south of the user_body
         | 
| 307 | 
            -
              def html_footer     
         | 
| 308 | 
            -
                $4 if USERBODY_PARTS.match html_source
         | 
| 309 | 
            -
              end
         | 
| 310 | 
            -
             | 
| 311 348 | 
             
              # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
         | 
| 312 | 
            -
              # This bad html trips up  | 
| 349 | 
            +
              # This bad html trips up html parsers, and I've resorted to splitting the page up using string parsing like so:
         | 
| 313 350 | 
             
              # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
         | 
| 314 | 
            -
              def user_body | 
| 315 | 
            -
                 | 
| 351 | 
            +
              def user_body
         | 
| 352 | 
            +
                if USERBODY_PARTS.match html_source
         | 
| 353 | 
            +
                  # This is the pre-12/3/12 style:
         | 
| 354 | 
            +
                  $2
         | 
| 355 | 
            +
                elsif html.at_xpath(XPATH_USERBODY)
         | 
| 356 | 
            +
                  # There's a bunch of junk in here that we don't want, so this loop removes
         | 
| 357 | 
            +
                  # everything after (and including) the last script tag, from the result
         | 
| 358 | 
            +
                  user_body = html.xpath(XPATH_USERBODY)
         | 
| 359 | 
            +
                  hit_delimeter = false
         | 
| 360 | 
            +
                  # Since some posts don't actually have the script tag:
         | 
| 361 | 
            +
                  delimeter = user_body.at_xpath('script') ? :script : :comment
         | 
| 362 | 
            +
                  user_body.first.children.to_a.reverse.reject{ |p|
         | 
| 363 | 
            +
                    if hit_delimeter
         | 
| 364 | 
            +
                      false
         | 
| 365 | 
            +
                    elsif ( (delimeter == :script and p.name == 'script') or 
         | 
| 366 | 
            +
                      (delimeter == :comment and p.comment? and p.content.strip == "START CLTAGS") )
         | 
| 367 | 
            +
                      hit_delimeter = true 
         | 
| 368 | 
            +
                    else
         | 
| 369 | 
            +
                      true
         | 
| 370 | 
            +
                    end
         | 
| 371 | 
            +
                  }.reverse.collect(&:to_s).join
         | 
| 372 | 
            +
                end
         | 
| 316 373 | 
             
              end
         | 
| 317 374 |  | 
| 318 375 | 
             
              # Read the notes on user_body. However,  unlike the user_body, the craigslist portion of this div can be relied upon to be valid html. 
         | 
| @@ -321,4 +378,4 @@ class CraigScrape::Posting < CraigScrape::Scraper | |
| 321 378 | 
             
                Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
         | 
| 322 379 | 
             
              end
         | 
| 323 380 |  | 
| 324 | 
            -
            end
         | 
| 381 | 
            +
            end
         | 
    
        data/lib/scraper.rb
    CHANGED
    
    | @@ -15,39 +15,27 @@ | |
| 15 15 | 
             
            # 
         | 
| 16 16 | 
             
            # <b>logger</b> - a Logger object to debug http notices to. Defaults to nil
         | 
| 17 17 | 
             
            #
         | 
| 18 | 
            -
             | 
| 19 | 
            -
            #
         | 
| 20 | 
            -
            # <b>sleep_between_fetch_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 30.
         | 
| 21 | 
            -
            #
         | 
| 22 | 
            -
            # <b>retries_on_404_fail</b> - The number of times to retry a Resource Not Found error (http Response code 404). Defaults to 3.
         | 
| 23 | 
            -
            #
         | 
| 24 | 
            -
            # <b>sleep_between_404_retries</b> - The amount of seconds to sleep, between successive attempts in the case of a Resource Not Found error. Defaults to 3.
         | 
| 25 | 
            -
            #
         | 
| 18 | 
            +
             | 
| 26 19 | 
             
            class CraigScrape::Scraper
         | 
| 27 20 | 
             
              cattr_accessor :logger
         | 
| 28 | 
            -
              cattr_accessor :sleep_between_fetch_retries
         | 
| 29 | 
            -
              cattr_accessor :retries_on_fetch_fail
         | 
| 30 21 | 
             
              cattr_accessor :retries_on_404_fail
         | 
| 31 22 | 
             
              cattr_accessor :sleep_between_404_retries
         | 
| 32 | 
            -
             | 
| 23 | 
            +
             | 
| 24 | 
            +
              self.retries_on_404_fail = 3
         | 
| 25 | 
            +
              self.sleep_between_404_retries = 3
         | 
| 33 26 |  | 
| 34 27 | 
             
              URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
         | 
| 35 28 | 
             
              HTML_TAG  = /<\/?[^>]*>/
         | 
| 36 29 | 
             
              # We have to specify this to nokogiri. Sometimes it tries to figure out encoding on its own, and craigslist users post crazy bytes sometimes  
         | 
| 37 30 | 
             
              HTML_ENCODING = "UTF-8"
         | 
| 31 | 
            +
             | 
| 32 | 
            +
              HTTP_HEADERS = { "Cache-Control" => "no-cache", "Pragma" => "no-cache", 
         | 
| 33 | 
            +
                "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
         | 
| 34 | 
            +
                "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19"}
         | 
| 38 35 |  | 
| 39 36 | 
             
              # Returns the full url that corresponds to this resource
         | 
| 40 37 | 
             
              attr_reader :url
         | 
| 41 38 |  | 
| 42 | 
            -
              # Set some defaults:
         | 
| 43 | 
            -
              self.retries_on_fetch_fail = 8
         | 
| 44 | 
            -
              self.sleep_between_fetch_retries = 30
         | 
| 45 | 
            -
              
         | 
| 46 | 
            -
              self.retries_on_404_fail = 3
         | 
| 47 | 
            -
              self.sleep_between_404_retries = 3
         | 
| 48 | 
            -
              
         | 
| 49 | 
            -
              self.maximum_redirects_per_request = 20
         | 
| 50 | 
            -
             | 
| 51 39 | 
             
              class BadConstructionError < StandardError #:nodoc:
         | 
| 52 40 | 
             
              end
         | 
| 53 41 |  | 
| @@ -57,9 +45,6 @@ class CraigScrape::Scraper | |
| 57 45 | 
             
              class BadUrlError < StandardError #:nodoc:
         | 
| 58 46 | 
             
              end
         | 
| 59 47 |  | 
| 60 | 
            -
              class MaxRedirectError < StandardError #:nodoc:
         | 
| 61 | 
            -
              end
         | 
| 62 | 
            -
             | 
| 63 48 | 
             
              class FetchError < StandardError #:nodoc:
         | 
| 64 49 | 
             
              end
         | 
| 65 50 |  | 
| @@ -100,21 +85,37 @@ class CraigScrape::Scraper | |
| 100 85 | 
             
                @uri
         | 
| 101 86 | 
             
              end
         | 
| 102 87 |  | 
| 88 | 
            +
              # This method is mostly useful for our specs, but it's included in case anyone
         | 
| 89 | 
            +
              # else wants it. It returns all currently-defined instance variables, and is 
         | 
| 90 | 
            +
              # mostly useful for the specs. Probably this doesn't do what you think, and
         | 
| 91 | 
            +
              # should only be used to determine what's been parsed by the object thus-far.
         | 
| 92 | 
            +
              # (And does not include parseable attributes which have yet to be determined.)
         | 
| 93 | 
            +
              def attributes
         | 
| 94 | 
            +
                Hash[self.instance_variables.collect{|i| 
         | 
| 95 | 
            +
                  [i.to_s.tr('@','').to_sym, instance_variable_get(i) ] }]
         | 
| 96 | 
            +
              end
         | 
| 97 | 
            +
             | 
| 103 98 | 
             
              private
         | 
| 104 99 |  | 
| 105 100 | 
             
              # Returns text with all html tags removed.
         | 
| 106 101 | 
             
              def strip_html(str)
         | 
| 107 | 
            -
                str.gsub HTML_TAG, "" if str
         | 
| 102 | 
            +
                he_decode(str).gsub HTML_TAG, "" if str
         | 
| 108 103 | 
             
              end
         | 
| 109 104 |  | 
| 110 105 | 
             
              # Easy way to fail noisily:
         | 
| 111 | 
            -
              def parse_error | 
| 106 | 
            +
              def parse_error!(fields = nil)
         | 
| 107 | 
            +
                raise ParseError, "Error while parsing %s:\n %s%s" % [
         | 
| 108 | 
            +
                  self.class.to_s, html, 
         | 
| 109 | 
            +
                  (fields) ? ("\nRequired fields missing: %s" % fields.join(', ')) : ''] 
         | 
| 110 | 
            +
              end
         | 
| 112 111 |  | 
| 113 112 | 
             
              # Returns text with all html entities converted to respective ascii character.
         | 
| 114 113 | 
             
              def he_decode(text); self.class.he_decode text; end
         | 
| 115 114 |  | 
| 116 115 | 
             
              # Returns text with all html entities converted to respective ascii character.
         | 
| 117 | 
            -
              def self.he_decode(text) | 
| 116 | 
            +
              def self.he_decode(text)
         | 
| 117 | 
            +
                HTMLEntities.new.decode text
         | 
| 118 | 
            +
              end
         | 
| 118 119 |  | 
| 119 120 | 
             
              # Derives a full url, using the current object's url and the provided href
         | 
| 120 121 | 
             
              def url_from_href(href) #:nodoc:
         | 
| @@ -133,42 +134,34 @@ class CraigScrape::Scraper | |
| 133 134 | 
             
                '%s://%s%s' % [scheme, host, path]
         | 
| 134 135 | 
             
              end
         | 
| 135 136 |  | 
| 136 | 
            -
              def fetch_uri(uri | 
| 137 | 
            -
                logger.info "Requesting | 
| 137 | 
            +
              def fetch_uri(uri)
         | 
| 138 | 
            +
                logger.info "Requesting: %s" % [@url.inspect] if logger
         | 
| 138 139 |  | 
| 139 | 
            -
                 | 
| 140 | 
            -
             | 
| 141 | 
            -
                case uri.scheme
         | 
| 140 | 
            +
                (case uri.scheme
         | 
| 142 141 | 
             
                  when 'file'
         | 
| 143 142 | 
             
                    # If this is a directory, we'll try to approximate http a bit by loading a '/index.html'
         | 
| 144 | 
            -
                    File.read( File.directory?(uri.path) ?  | 
| 143 | 
            +
                    File.read( File.directory?(uri.path) ? 
         | 
| 144 | 
            +
                      "#{uri.path}/index.html" : uri.path , :encoding => 'BINARY')
         | 
| 145 145 | 
             
                  when /^http[s]?/
         | 
| 146 | 
            -
                    fetch_http uri | 
| 146 | 
            +
                    fetch_http uri
         | 
| 147 147 | 
             
                  else
         | 
| 148 148 | 
             
                    raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
         | 
| 149 | 
            -
                end
         | 
| 149 | 
            +
                end).force_encoding("ISO-8859-1").encode("UTF-8")
         | 
| 150 150 | 
             
              end
         | 
| 151 | 
            -
             | 
| 152 | 
            -
              def fetch_http(uri | 
| 151 | 
            +
             | 
| 152 | 
            +
              def fetch_http(uri)
         | 
| 153 153 | 
             
                fetch_attempts = 0
         | 
| 154 154 | 
             
                resource_not_found_attempts = 0
         | 
| 155 155 |  | 
| 156 156 | 
             
                begin
         | 
| 157 | 
            -
                   | 
| 158 | 
            -
             | 
| 159 | 
            -
             | 
| 160 | 
            -
                  if resp. | 
| 161 | 
            -
                     | 
| 162 | 
            -
                    data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
         | 
| 163 | 
            -
                    
         | 
| 164 | 
            -
                    data
         | 
| 165 | 
            -
                  elsif resp.response['Location']
         | 
| 166 | 
            -
                    redirect_to = resp.response['Location']
         | 
| 167 | 
            -
                    
         | 
| 168 | 
            -
                    fetch_uri URI.parse(url_from_href(redirect_to)), redirect_count+1
         | 
| 157 | 
            +
                  resp = Typhoeus.get uri.to_s, :followlocation =>  true, 
         | 
| 158 | 
            +
                    :headers => HTTP_HEADERS
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                  if resp.response_code == 200
         | 
| 161 | 
            +
                    resp.response_body
         | 
| 169 162 | 
             
                  else
         | 
| 170 163 | 
             
                    # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
         | 
| 171 | 
            -
                    raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp. | 
| 164 | 
            +
                    raise ResourceNotFoundError, 'Unable to fetch "%s" (%s)' % [ @url, resp.response_code ]
         | 
| 172 165 | 
             
                  end
         | 
| 173 166 | 
             
                rescue ResourceNotFoundError => err
         | 
| 174 167 | 
             
                  logger.info err.message if logger
         | 
| @@ -182,19 +175,6 @@ class CraigScrape::Scraper | |
| 182 175 | 
             
                  else
         | 
| 183 176 | 
             
                    raise err
         | 
| 184 177 | 
             
                  end      
         | 
| 185 | 
            -
                rescue FetchError,Timeout::Error,Errno::ECONNRESET,EOFError => err
         | 
| 186 | 
            -
                  logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
         | 
| 187 | 
            -
                  logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
         | 
| 188 | 
            -
                  
         | 
| 189 | 
            -
                  fetch_attempts += 1
         | 
| 190 | 
            -
                  
         | 
| 191 | 
            -
                  if fetch_attempts <= self.retries_on_fetch_fail
         | 
| 192 | 
            -
                    sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
         | 
| 193 | 
            -
                    logger.info 'Retrying fetch ....' if logger
         | 
| 194 | 
            -
                    retry
         | 
| 195 | 
            -
                  else
         | 
| 196 | 
            -
                    raise err
         | 
| 197 | 
            -
                  end
         | 
| 198 178 | 
             
                end
         | 
| 199 179 | 
             
              end
         | 
| 200 180 |  | 
| @@ -209,4 +189,4 @@ class CraigScrape::Scraper | |
| 209 189 | 
             
                @html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
         | 
| 210 190 | 
             
                @html
         | 
| 211 191 | 
             
              end
         | 
| 212 | 
            -
            end  
         | 
| 192 | 
            +
            end  
         |