libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
    
        data/bin/craigwatch
    CHANGED
    
    | @@ -1,9 +1,10 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            # encoding: UTF-8
         | 
| 2 3 | 
             
            #
         | 
| 3 4 | 
             
            # =craigwatch - An email-based "post monitoring" solution
         | 
| 4 5 | 
             
            #
         | 
| 5 | 
            -
            # Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular | 
| 6 | 
            -
            # craiglist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab | 
| 6 | 
            +
            # Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
         | 
| 7 | 
            +
            # craiglist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
         | 
| 7 8 | 
             
            # and report all new postings within a listing or search url, since its last run, by email.
         | 
| 8 9 | 
             
            #
         | 
| 9 10 | 
             
            # For more information, head to the {craiglist monitoring}[http://www.derosetechnologies.com/community/libcraigscrape] help section of our website.
         | 
| @@ -25,29 +26,19 @@ | |
| 25 26 | 
             
            # - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
         | 
| 26 27 | 
             
            #
         | 
| 27 28 | 
             
            # Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
         | 
| 28 | 
            -
            # | 
| 29 | 
            +
            #
         | 
| 29 30 | 
             
            # Reporting output is easily customized html, handled by ActionMailer, and emails can be delivered via smtp or sendmail.
         | 
| 30 | 
            -
            # Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the | 
| 31 | 
            +
            # Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
         | 
| 31 32 | 
             
            # major backends (sqlite/mysql/postgres/probably-all-others). Database sizes are contained by automatically pruning old results
         | 
| 32 33 | 
             
            # that are no longer required at the end of each run.
         | 
| 33 34 | 
             
            #
         | 
| 34 35 | 
             
            # Pretty useful, no?
         | 
| 35 | 
            -
            # | 
| 36 | 
            +
            #
         | 
| 36 37 | 
             
            # == Installation
         | 
| 37 | 
            -
            # craigwatch is coupled with libcraigscrape, and is installed via ruby gems.  | 
| 38 | 
            -
            # | 
| 39 | 
            -
            # gem itself. 
         | 
| 40 | 
            -
            #
         | 
| 41 | 
            -
            # This should take care of the craigwatch install on all systems:
         | 
| 42 | 
            -
            #    sudo gem install libcraigscrape kwalify activerecord actionmailer
         | 
| 43 | 
            -
            # Alternatively, if you've already installed libcraigscrape and want to start working with craigwatch:
         | 
| 44 | 
            -
            #    sudo gem install kwalify activerecord actionmailer
         | 
| 45 | 
            -
            # 
         | 
| 46 | 
            -
            # This script was initially developed with activerecord 2.3, actionmailer 2.3 and kwalify 0.7, but will likely work with most 
         | 
| 47 | 
            -
            # prior and future versions of these libraries.
         | 
| 48 | 
            -
            # 
         | 
| 38 | 
            +
            # craigwatch is coupled with libcraigscrape, and is installed via ruby gems. 
         | 
| 39 | 
            +
            #
         | 
| 49 40 | 
             
            # == Usage
         | 
| 50 | 
            -
            # When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and | 
| 41 | 
            +
            # When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
         | 
| 51 42 | 
             
            # this parameter is the path to a valid report-definition yml file. ie:
         | 
| 52 43 | 
             
            #    craigwatch johns_daily_watch.yml
         | 
| 53 44 | 
             
            #
         | 
| @@ -55,6 +46,9 @@ | |
| 55 46 | 
             
            # Probably, the best way to understand the report definition files, is to look at the annotated sample file below, and use it as a
         | 
| 56 47 | 
             
            # starting point for your own.
         | 
| 57 48 | 
             
            #
         | 
| 49 | 
            +
            # New in version 1.1.0 is ERB evaluation of the report-definition file. This feature is automatic: just include the erb blocks you'd 
         | 
| 50 | 
            +
            # like, and the file will be evaluated at runtime.
         | 
| 51 | 
            +
            #
         | 
| 58 52 | 
             
            # By default there is no program output; however, setting any of the following parameters to 'yes' in your definition file will turn on
         | 
| 59 53 | 
             
            # useful debugging/logging output:
         | 
| 60 54 | 
             
            # - debug_database
         | 
| @@ -63,10 +57,10 @@ | |
| 63 57 | 
             
            #
         | 
| 64 58 | 
             
            # == Definition File Sample
         | 
| 65 59 | 
             
            #
         | 
| 66 | 
            -
            # Let's start with a minimal report, just enough needed to get something quick working: | 
| 60 | 
            +
            # Let's start with a minimal report, just enough needed to get something quick working:
         | 
| 67 61 | 
             
            #    # We need some kind of destination to send this to
         | 
| 68 62 | 
             
            #    email_to: Chris DeRose <cderose@derosetechnologies.com>
         | 
| 69 | 
            -
            # | 
| 63 | 
            +
            #
         | 
| 70 64 | 
             
            #    # This is an array of specific 'searches' we'll be performing in this report:
         | 
| 71 65 | 
             
            #    searches:
         | 
| 72 66 | 
             
            #         # We're looking for 90's era cadillac, something cheap, comfortable and in white...
         | 
| @@ -85,7 +79,7 @@ | |
| 85 79 | 
             
            #         summary_post_has_no: [ /xlr/i ]
         | 
| 86 80 | 
             
            #
         | 
| 87 81 | 
             
            #         # We want a convertible, and white/cream/etc:
         | 
| 88 | 
            -
            #         full_post_has: | 
| 82 | 
            +
            #         full_post_has:
         | 
| 89 83 | 
             
            #            - /convertible/i
         | 
| 90 84 | 
             
            #            - /(white|yellow|banana|creme|cream)/i
         | 
| 91 85 | 
             
            #
         | 
| @@ -93,7 +87,7 @@ | |
| 93 87 | 
             
            #         full_post_has_no:
         | 
| 94 88 | 
             
            #            - /simulated[^a-z]{0,2}convertible/i
         | 
| 95 89 | 
             
            #
         | 
| 96 | 
            -
            #         # We want to search all of craigslist's in the us, and we'll want to find it using | 
| 90 | 
            +
            #         # We want to search all of craigslist's in the us, and we'll want to find it using
         | 
| 97 91 | 
             
            #         # the '/search/cta?hasPic=1&query=cadillac' url on the site
         | 
| 98 92 | 
             
            #         sites: [ us ]
         | 
| 99 93 | 
             
            #         listings:
         | 
| @@ -104,6 +98,9 @@ | |
| 104 98 | 
             
            #    # The report_name is fed into Time.now.strftime, hence the formatting characters
         | 
| 105 99 | 
             
            #    report_name: Craig Watch For Johnathan on %D at %I:%M %p
         | 
| 106 100 | 
             
            #
         | 
| 101 | 
            +
            #    # Overrides the default system time zone with an EST zone
         | 
| 102 | 
            +
            #    tz: EST
         | 
| 103 | 
            +
            #
         | 
| 107 104 | 
             
            #    email_to: Johnathan Peabody <john@example.local>
         | 
| 108 105 | 
             
            #
         | 
| 109 106 | 
             
            #    # This is sent straight into ActiveRecord, so there's plenty of options available here. the following is an easy
         | 
| @@ -129,21 +126,21 @@ | |
| 129 126 | 
             
            #
         | 
| 130 127 | 
             
            #         # Oh, and we're on a budget:
         | 
| 131 128 | 
             
            #         price_less_than: 120
         | 
| 132 | 
            -
            # | 
| 129 | 
            +
            #
         | 
| 133 130 | 
             
            #       # Search #2
         | 
| 134 131 | 
             
            #       - name: Large apartment rentals in San Francisco
         | 
| 135 132 | 
             
            #         sites: [ us/ca/sfbay ]
         | 
| 136 133 | 
             
            #         starting: 9/10/2009
         | 
| 137 | 
            -
            # | 
| 138 | 
            -
            #         # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we | 
| 134 | 
            +
            #
         | 
| 135 | 
            +
            #         # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
         | 
| 139 136 | 
             
            #         # want to conserve some bandwidth
         | 
| 140 137 | 
             
            #         listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
         | 
| 141 138 | 
             
            #
         | 
| 142 139 | 
             
            #         # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
         | 
| 143 140 | 
             
            #         price_required: yes
         | 
| 144 | 
            -
            # | 
| 141 | 
            +
            #
         | 
| 145 142 | 
             
            #         # Hopefully this will keep us away from a bad part of town:
         | 
| 146 | 
            -
            #         price_greater_than: 1000 | 
| 143 | 
            +
            #         price_greater_than: 1000
         | 
| 147 144 | 
             
            #
         | 
| 148 145 | 
             
            #         # Since we don't have time to drive to each location, we'll require only listings with pictures
         | 
| 149 146 | 
             
            #         has_image: yes
         | 
| @@ -160,9 +157,9 @@ $: << File.dirname(__FILE__) + '/../lib' | |
| 160 157 |  | 
| 161 158 | 
             
            require 'rubygems'
         | 
| 162 159 |  | 
| 163 | 
            -
            gem 'kwalify' | 
| 164 | 
            -
            gem 'activerecord' | 
| 165 | 
            -
            gem 'actionmailer' | 
| 160 | 
            +
            gem 'kwalify'
         | 
| 161 | 
            +
            gem 'activerecord'
         | 
| 162 | 
            +
            gem 'actionmailer'
         | 
| 166 163 |  | 
| 167 164 | 
             
            require 'kwalify'
         | 
| 168 165 | 
             
            require 'active_record'
         | 
| @@ -170,19 +167,20 @@ require 'action_mailer' | |
| 170 167 | 
             
            require 'kwalify/util/hashlike'
         | 
| 171 168 | 
             
            require 'libcraigscrape'
         | 
| 172 169 | 
             
            require "socket"
         | 
| 170 | 
            +
            require 'active_support/all'
         | 
| 173 171 |  | 
| 174 172 | 
             
            class String #:nodoc:
         | 
| 175 173 | 
             
              RE = /^\/(.*)\/([ixm]*)$/
         | 
| 176 | 
            -
             | 
| 174 | 
            +
             | 
| 177 175 | 
             
              def is_re?
         | 
| 178 176 | 
             
                (RE.match self) ? true : false
         | 
| 179 177 | 
             
              end
         | 
| 180 | 
            -
             | 
| 178 | 
            +
             | 
| 181 179 | 
             
              def to_re
         | 
| 182 180 | 
             
                source, options = ( RE.match(self) )? [$1, $2] : [self,nil]
         | 
| 183 181 | 
             
                mods = 0
         | 
| 184 182 |  | 
| 185 | 
            -
                options.each_char do |c| | 
| 183 | 
            +
                options.each_char do |c|
         | 
| 186 184 | 
             
                  mods |= case c
         | 
| 187 185 | 
             
                    when 'i' then Regexp::IGNORECASE
         | 
| 188 186 | 
             
                    when 'x' then Regexp::EXTENDED
         | 
| @@ -199,12 +197,19 @@ class CraigReportDefinition #:nodoc: | |
| 199 197 |  | 
| 200 198 | 
             
              EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
         | 
| 201 199 |  | 
| 202 | 
            -
              attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches,  | 
| 200 | 
            +
              attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, 
         | 
| 201 | 
            +
                :smtp_settings, :tz
         | 
| 203 202 |  | 
| 204 203 | 
             
              def debug_database?;    @debug_database; end
         | 
| 205 204 | 
             
              def debug_mailer?;      @debug_mailer; end
         | 
| 206 205 | 
             
              def debug_craigscrape?; @debug_craigscrape; end
         | 
| 207 206 |  | 
| 207 | 
            +
              # Returns the configuration report zone, if defined. Otherwise pulls the zone
         | 
| 208 | 
            +
              # from the system's default local zone
         | 
| 209 | 
            +
              def tz
         | 
| 210 | 
            +
                @tz || Time.new.zone
         | 
| 211 | 
            +
              end
         | 
| 212 | 
            +
             | 
| 208 213 | 
             
              def email_from
         | 
| 209 214 | 
             
                (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
         | 
| 210 215 | 
             
              end
         | 
| @@ -224,59 +229,66 @@ class CraigReportDefinition #:nodoc: | |
| 224 229 | 
             
                  :adapter => 'sqlite3',
         | 
| 225 230 | 
             
                  :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
         | 
| 226 231 | 
             
                } if for_yaml_file
         | 
| 227 | 
            -
             | 
| 228 | 
            -
                # This is a little hack to make sqlite definitions a little more portable, by allowing them | 
| 232 | 
            +
             | 
| 233 | 
            +
                # This is a little hack to make sqlite definitions a little more portable, by allowing them
         | 
| 229 234 | 
             
                # to be specify dbfile's relative to the yml's directory:
         | 
| 230 235 | 
             
                ret = @tracking_database
         | 
| 231 236 | 
             
                ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
         | 
| 232 237 | 
             
                  for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
         | 
| 233 238 | 
             
                )
         | 
| 234 | 
            -
             | 
| 239 | 
            +
             | 
| 235 240 | 
             
                ret
         | 
| 236 241 | 
             
              end
         | 
| 237 242 |  | 
| 238 243 | 
             
              class SearchDefinition #:nodoc:
         | 
| 239 | 
            -
                include Kwalify::Util::HashLike | 
| 240 | 
            -
             | 
| 244 | 
            +
                include Kwalify::Util::HashLike
         | 
| 245 | 
            +
             | 
| 241 246 | 
             
                attr_reader :name, :sites, :listings
         | 
| 242 247 | 
             
                attr_reader :location_has, :location_has_no
         | 
| 243 248 | 
             
                attr_reader :full_post_has, :full_post_has_no
         | 
| 244 249 | 
             
                attr_reader :summary_post_has, :summary_post_has_no
         | 
| 245 250 | 
             
                attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
         | 
| 246 | 
            -
                
         | 
| 247 | 
            -
                attr_reader :price_greater_than,:price_less_than
         | 
| 248 251 |  | 
| 249 252 | 
             
                def has_image?; @has_image; end
         | 
| 250 253 | 
             
                def newest_first?; @newest_first; end
         | 
| 251 254 | 
             
                def price_required?; @price_required; end
         | 
| 252 | 
            -
             | 
| 255 | 
            +
             | 
| 256 | 
            +
                def price_greater_than
         | 
| 257 | 
            +
                  Money.new(@price_greater_than*100, 'USD') if @price_greater_than
         | 
| 258 | 
            +
                end
         | 
| 259 | 
            +
             | 
| 260 | 
            +
                def price_less_than
         | 
| 261 | 
            +
                  Money.new(@price_less_than*100, 'USD') if @price_less_than
         | 
| 262 | 
            +
                end
         | 
| 263 | 
            +
             | 
| 253 264 | 
             
                def starting_at
         | 
| 254 | 
            -
                  (@starting) ? | 
| 255 | 
            -
                     | 
| 256 | 
            -
             | 
| 265 | 
            +
                  (@starting) ?
         | 
| 266 | 
            +
                    Date.strptime(@starting, ['%m','%d',
         | 
| 267 | 
            +
                      /\/(?:[\d]{4})$/.match(@starting) ? '%Y' : '%y'].join('/') ) :
         | 
| 268 | 
            +
                    Date.yesterday
         | 
| 257 269 | 
             
                end
         | 
| 258 | 
            -
             | 
| 259 | 
            -
                def passes_filter?(post) | 
| 270 | 
            +
             | 
| 271 | 
            +
                def passes_filter?(post)
         | 
| 260 272 | 
             
                  if post.price.nil?
         | 
| 261 273 | 
             
                    return false if price_required?
         | 
| 262 274 | 
             
                  else
         | 
| 263 | 
            -
                    return false if  | 
| 264 | 
            -
                    return false if  | 
| 275 | 
            +
                    return false if price_greater_than and post.price <= price_greater_than
         | 
| 276 | 
            +
                    return false if price_less_than and post.price >= price_less_than
         | 
| 265 277 | 
             
                  end
         | 
| 266 | 
            -
             | 
| 278 | 
            +
             | 
| 267 279 | 
             
                  # Label Filters:
         | 
| 268 280 | 
             
                  return false unless matches_all? summary_post_has, post.label
         | 
| 269 281 | 
             
                  return false unless doesnt_match_any? summary_post_has_no, post.label
         | 
| 270 | 
            -
             | 
| 282 | 
            +
             | 
| 271 283 | 
             
                  # Location Filters:
         | 
| 272 284 | 
             
                  return false unless matches_all? location_has, post.location
         | 
| 273 285 | 
             
                  return false unless doesnt_match_any? location_has_no, post.location
         | 
| 274 | 
            -
             | 
| 286 | 
            +
             | 
| 275 287 | 
             
                  # Full post Filters:
         | 
| 276 288 | 
             
                  if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
         | 
| 277 289 | 
             
                    # We're going to download the page, so let's make sure we didnt hit a "This posting has been flagged for removal"
         | 
| 278 290 | 
             
                    return false if post.system_post?
         | 
| 279 | 
            -
             | 
| 291 | 
            +
             | 
| 280 292 | 
             
                    return false unless matches_all? full_post_has, post.contents_as_plain
         | 
| 281 293 | 
             
                    return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
         | 
| 282 294 |  | 
| @@ -286,21 +298,27 @@ class CraigReportDefinition #:nodoc: | |
| 286 298 |  | 
| 287 299 | 
             
                  true
         | 
| 288 300 | 
             
                end
         | 
| 289 | 
            -
             | 
| 301 | 
            +
             | 
| 290 302 | 
             
                private
         | 
| 291 | 
            -
             | 
| 303 | 
            +
             | 
| 292 304 | 
             
                def matches_all?(conditions, against)
         | 
| 293 | 
            -
                   | 
| 294 | 
            -
                  (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
         | 
| 305 | 
            +
                  (conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| match_against c, a } }) ? true : false
         | 
| 295 306 | 
             
                end
         | 
| 296 | 
            -
             | 
| 307 | 
            +
             | 
| 297 308 | 
             
                def doesnt_match_any?(conditions, against)
         | 
| 298 | 
            -
                   | 
| 299 | 
            -
                  (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
         | 
| 309 | 
            +
                  (conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| !match_against c, a } }) ? true : false
         | 
| 300 310 | 
             
                end
         | 
| 301 | 
            -
             | 
| 311 | 
            +
             | 
| 302 312 | 
             
                def match_against(condition, against)
         | 
| 303 | 
            -
                  (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
         | 
| 313 | 
            +
                  (CraigScrape::Scraper.he_decode(against).scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
         | 
| 314 | 
            +
                end
         | 
| 315 | 
            +
             | 
| 316 | 
            +
                # This is kind of a hack to deal with ruby 1.9. Really the filtering mechanism 
         | 
| 317 | 
            +
                # needs to be factored out and tested....
         | 
| 318 | 
            +
                def sanitized_against(against)
         | 
| 319 | 
            +
                  against = against.lines if against.respond_to? :lines
         | 
| 320 | 
            +
                  against = against.to_a if against.respond_to? :to_a
         | 
| 321 | 
            +
                  (against.nil?) ? [] : against.compact
         | 
| 304 322 | 
             
                end
         | 
| 305 323 | 
             
              end
         | 
| 306 324 | 
             
            end
         | 
| @@ -309,11 +327,11 @@ class TrackedSearch < ActiveRecord::Base #:nodoc: | |
| 309 327 | 
             
              has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
         | 
| 310 328 | 
             
              validates_uniqueness_of :search_name
         | 
| 311 329 | 
             
              validates_presence_of   :search_name
         | 
| 312 | 
            -
             | 
| 330 | 
            +
             | 
| 313 331 | 
             
              def self.find_by_name(name)
         | 
| 314 332 | 
             
                self.find :first, :conditions => ['search_name = ?',name]
         | 
| 315 333 | 
             
              end
         | 
| 316 | 
            -
             | 
| 334 | 
            +
             | 
| 317 335 | 
             
              def find_listing_by_url(url)
         | 
| 318 336 | 
             
                listings.find :first, :conditions => ['url = ?',  url]
         | 
| 319 337 | 
             
              end
         | 
| @@ -330,9 +348,8 @@ class TrackedListing < ActiveRecord::Base #:nodoc: | |
| 330 348 | 
             
              def last_tracked_at
         | 
| 331 349 | 
             
                self.posts.maximum 'created_at'
         | 
| 332 350 | 
             
              end
         | 
| 333 | 
            -
             | 
| 351 | 
            +
             | 
| 334 352 | 
             
              def delete_posts_older_than(cutoff_date)
         | 
| 335 | 
            -
                # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
         | 
| 336 353 | 
             
                TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
         | 
| 337 354 | 
             
              end
         | 
| 338 355 | 
             
            end
         | 
| @@ -342,11 +359,11 @@ class TrackedPost < ActiveRecord::Base #:nodoc: | |
| 342 359 |  | 
| 343 360 | 
             
              def self.activate_all!
         | 
| 344 361 | 
             
                TrackedPost.update_all(
         | 
| 345 | 
            -
                  { :active => true }, | 
| 346 | 
            -
                  [ 'active = ?', false ] | 
| 362 | 
            +
                  { :active => true },
         | 
| 363 | 
            +
                  [ 'active = ?', false ]
         | 
| 347 364 | 
             
                )
         | 
| 348 365 | 
             
              end
         | 
| 349 | 
            -
             | 
| 366 | 
            +
             | 
| 350 367 | 
             
              def self.destroy_inactive!
         | 
| 351 368 | 
             
                TrackedPost.delete_all [ 'active = ?', false ]
         | 
| 352 369 | 
             
              end
         | 
| @@ -354,23 +371,9 @@ end | |
| 354 371 |  | 
| 355 372 | 
             
            class ReportMailer < ActionMailer::Base #:nodoc:
         | 
| 356 373 | 
             
              def report(to, sender, subject_template, report_tmpl)
         | 
| 357 | 
            -
                
         | 
| 358 | 
            -
                formatted_subject = Time.now.strftime(subject_template)
         | 
| 359 | 
            -
                
         | 
| 360 | 
            -
                recipients  to
         | 
| 361 | 
            -
                from        sender
         | 
| 362 | 
            -
                subject     formatted_subject
         | 
| 363 | 
            -
             | 
| 364 | 
            -
                generate_view_parts 'craigslist_report', report_tmpl.merge({:subject =>formatted_subject})
         | 
| 365 | 
            -
              end
         | 
| 374 | 
            +
                @summaries = report_tmpl[:summaries]
         | 
| 366 375 |  | 
| 367 | 
            -
             | 
| 368 | 
            -
                part( :content_type => "multipart/alternative" ) do |p|
         | 
| 369 | 
            -
                  [
         | 
| 370 | 
            -
                    { :content_type => "text/plain", :body => render_message("#{view_name.to_s}.plain.erb", tmpl) },
         | 
| 371 | 
            -
                    { :content_type => "text/html",  :body => render_message("#{view_name.to_s}.html.erb",  tmpl.merge({:part_container => p})) }
         | 
| 372 | 
            -
                  ].each { |parms| p.part parms.merge( { :charset => "UTF-8", :transfer_encoding => "7bit" } ) }
         | 
| 373 | 
            -
                end
         | 
| 376 | 
            +
                mail :to => to, :subject => Time.zone.now.strftime(subject_template), :from => sender
         | 
| 374 377 | 
             
              end
         | 
| 375 378 | 
             
            end
         | 
| 376 379 |  | 
| @@ -383,7 +386,7 @@ unless report_definition_file | |
| 383 386 | 
             
              puts <<EOD
         | 
| 384 387 | 
             
            Usage:
         | 
| 385 388 | 
             
                #{File.basename($0)} [report_definition_file]
         | 
| 386 | 
            -
             | 
| 389 | 
            +
             | 
| 387 390 | 
             
            Run 'gem server' and browse the libcraigscrape rdoc for 'bin/craigscrape' for specific usage details.
         | 
| 388 391 | 
             
            EOD
         | 
| 389 392 | 
             
              exit
         | 
| @@ -397,20 +400,25 @@ parser = Kwalify::Yaml::Parser.new( | |
| 397 400 | 
             
              :data_binding => true
         | 
| 398 401 | 
             
            )
         | 
| 399 402 |  | 
| 400 | 
            -
             | 
| 403 | 
            +
            report_definition_file_content = ERB.new(File.read(report_definition_file)).result
         | 
| 404 | 
            +
            craig_report = parser.parse(report_definition_file_content, filename: report_definition_file)
         | 
| 401 405 |  | 
| 402 406 | 
             
            parser.errors.each do |e|
         | 
| 403 407 | 
             
              puts "Definition Validation Error (line #{e.linenum}, char #{e.column}): #{e.message}"
         | 
| 404 408 | 
             
            end and exit if parser.errors.length > 0
         | 
| 405 409 |  | 
| 410 | 
            +
            # Set the time zone:
         | 
| 411 | 
            +
            Time.zone = craig_report.tz
         | 
| 412 | 
            +
             | 
| 406 413 | 
             
            # Initialize Action Mailer:
         | 
| 414 | 
            +
            ActionMailer::Base.prepend_view_path(File.dirname(__FILE__))
         | 
| 407 415 | 
             
            ActionMailer::Base.logger = Logger.new STDERR if craig_report.debug_mailer?
         | 
| 408 416 | 
             
            if craig_report.smtp_settings
         | 
| 409 | 
            -
               | 
| 417 | 
            +
              ActionMailer::Base.smtp_settings = craig_report.smtp_settings.symbolize_keys
         | 
| 418 | 
            +
              ActionMailer::Base.delivery_method = :smtp
         | 
| 410 419 | 
             
            else
         | 
| 411 | 
            -
               | 
| 420 | 
            +
              ActionMailer::Base.delivery_method = :sendmail
         | 
| 412 421 | 
             
            end
         | 
| 413 | 
            -
            ReportMailer.template_root = File.dirname __FILE__
         | 
| 414 422 |  | 
| 415 423 | 
             
            # Initialize the database:
         | 
| 416 424 | 
             
            ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
         | 
| @@ -421,16 +429,16 @@ CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrap | |
| 421 429 |  | 
| 422 430 | 
             
            # Perform migrations if needed?
         | 
| 423 431 | 
             
            ActiveRecord::Schema.define do
         | 
| 424 | 
            -
              suppress_messages do | 
| 432 | 
            +
              suppress_messages do
         | 
| 425 433 | 
             
                create_table :tracked_searches do |t|
         | 
| 426 434 | 
             
                  t.column :search_name,      :string
         | 
| 427 435 | 
             
                end unless table_exists? :tracked_searches
         | 
| 428 | 
            -
             | 
| 436 | 
            +
             | 
| 429 437 | 
             
                create_table :tracked_listings do |t|
         | 
| 430 438 | 
             
                  t.column :url,                :string
         | 
| 431 439 | 
             
                  t.column :tracked_search_id,  :integer
         | 
| 432 | 
            -
                end unless table_exists? :tracked_listings | 
| 433 | 
            -
             | 
| 440 | 
            +
                end unless table_exists? :tracked_listings
         | 
| 441 | 
            +
             | 
| 434 442 | 
             
                create_table :tracked_posts do |t|
         | 
| 435 443 | 
             
                  t.column :url,                :string
         | 
| 436 444 | 
             
                  t.column :tracked_listing_id, :integer
         | 
| @@ -440,7 +448,7 @@ ActiveRecord::Schema.define do | |
| 440 448 | 
             
              end
         | 
| 441 449 | 
             
            end
         | 
| 442 450 |  | 
| 443 | 
            -
            # Remove all posts which are inactive. They would be in there if the prior run was a failure. | 
| 451 | 
            +
            # Remove all posts which are inactive. They would be in there if the prior run was a failure.
         | 
| 444 452 | 
             
            TrackedPost.destroy_inactive!
         | 
| 445 453 |  | 
| 446 454 | 
             
            # We'll need these outside this next loop:
         | 
| @@ -450,80 +458,80 @@ newly_tracked_posts = [] | |
| 450 458 | 
             
            report_summaries = craig_report.searches.collect do |search|
         | 
| 451 459 | 
             
              # Load our tracking info
         | 
| 452 460 | 
             
              search_track = TrackedSearch.find_by_name search.name
         | 
| 453 | 
            -
             | 
| 461 | 
            +
             | 
| 454 462 | 
             
              # No Tracking found - let's set one up:
         | 
| 455 463 | 
             
              search_track = TrackedSearch.create! :search_name => search.name unless search_track
         | 
| 456 | 
            -
             | 
| 464 | 
            +
             | 
| 457 465 | 
             
              # This hash tracks what makes it into the report on this search.
         | 
| 458 466 | 
             
              # NOTE that keys are url's b/c sometimes the same posting will end up in multiple listings,
         | 
| 459 467 | 
             
              # And doing this ensures that we don't end-up reporting the same post twice.
         | 
| 460 468 | 
             
              new_summaries = {}
         | 
| 461 | 
            -
             | 
| 469 | 
            +
             | 
| 462 470 | 
             
              # And now we actually scrape:
         | 
| 463 471 | 
             
              CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
         | 
| 464 | 
            -
                # Keep in mind that listing.url does change in the while loop. | 
| 472 | 
            +
                # Keep in mind that listing.url does change in the while loop.
         | 
| 465 473 | 
             
                # But, this first one is a good base_url that will never change between runs.
         | 
| 466 474 |  | 
| 467 475 | 
             
                tracked_listing = search_track.find_listing_by_url listing.url
         | 
| 468 476 | 
             
                tracked_listing ||= search_track.listings.create! :url => listing.url
         | 
| 469 | 
            -
             | 
| 470 | 
            -
                # Gives us a sane stopping point (hopefully) : | 
| 471 | 
            -
                last_tracked_at = tracked_listing.last_tracked_at
         | 
| 477 | 
            +
             | 
| 478 | 
            +
                # Gives us a sane stopping point (hopefully) :
         | 
| 479 | 
            +
                last_tracked_at = tracked_listing.last_tracked_at.try(:to_date)
         | 
| 472 480 | 
             
                last_tracked_at ||= search.starting_at
         | 
| 473 481 |  | 
| 474 482 | 
             
                # Some more stopping points (probably):
         | 
| 475 483 | 
             
                already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
         | 
| 476 484 |  | 
| 477 485 | 
             
                # We'll use this in the loop to decide what posts to track:
         | 
| 478 | 
            -
                newest_post_date = last_tracked_at | 
| 479 | 
            -
             | 
| 486 | 
            +
                newest_post_date = last_tracked_at
         | 
| 487 | 
            +
             | 
| 480 488 | 
             
                # We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
         | 
| 481 489 | 
             
                # but have no post.post_date since the posting was removed and it parsed to nil
         | 
| 482 | 
            -
                most_recent_posting_date =  | 
| 483 | 
            -
             | 
| 490 | 
            +
                most_recent_posting_date = Date.new
         | 
| 491 | 
            +
             | 
| 484 492 | 
             
                # OK - Now let's go!
         | 
| 485 493 | 
             
                catch :list_break do
         | 
| 486 494 | 
             
                  while listing
         | 
| 487 495 | 
             
                    listing.posts.each do |post|
         | 
| 488 496 | 
             
                      begin
         | 
| 489 497 | 
             
                        most_recent_posting_date = post.post_date if post.post_date
         | 
| 490 | 
            -
             | 
| 498 | 
            +
             | 
| 491 499 | 
             
                        # Are we at a point in the scrape, past which we don't need to proceed?
         | 
| 492 500 | 
             
                        throw :list_break if (
         | 
| 493 | 
            -
                          most_recent_posting_date < last_tracked_at or
         | 
| 501 | 
            +
                          most_recent_posting_date.to_time < last_tracked_at or
         | 
| 494 502 | 
             
                          already_tracked_urls.include? post.url
         | 
| 495 503 | 
             
                        )
         | 
| 496 | 
            -
             | 
| 504 | 
            +
             | 
| 497 505 | 
             
                        # If we want to report this post, add it to the collection:
         | 
| 498 506 | 
             
                        new_summaries[post.url] = post if (
         | 
| 499 | 
            -
                          !new_summaries.has_key? post.url and | 
| 507 | 
            +
                          !new_summaries.has_key? post.url and
         | 
| 500 508 | 
             
                          search.passes_filter? post
         | 
| 501 509 | 
             
                        )
         | 
| 502 | 
            -
                      rescue CraigScrape::Scraper::ResourceNotFoundError | 
| 510 | 
            +
                      rescue CraigScrape::Scraper::ResourceNotFoundError => e
         | 
| 503 511 | 
             
                        # Sometimes we do end up with 404's that will never load, and we dont want to
         | 
| 504 512 | 
             
                        # abort a run simply b/c we found some anomaly due to the craigslist index.
         | 
| 505 | 
            -
                        # being out of date. This ResourceNotFoundError can occur due to | 
| 506 | 
            -
                        # loading the post url in full, only to see that it was yanked - or craigslist | 
| 513 | 
            +
                        # being out of date. This ResourceNotFoundError can occur due to
         | 
| 514 | 
            +
                        # loading the post url in full, only to see that it was yanked - or craigslist
         | 
| 507 515 | 
             
                        # is acting funny.
         | 
| 508 516 | 
             
                        next
         | 
| 509 517 | 
             
                      end
         | 
| 510 | 
            -
             | 
| 518 | 
            +
             | 
| 511 519 | 
             
                      # Now let's see if the url should be kept in our tracking database for the future...
         | 
| 512 520 |  | 
| 513 521 | 
             
                      # This post-date sets a limit for the tracked_listing.posts.create below
         | 
| 514 522 | 
             
                      newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
         | 
| 515 | 
            -
             | 
| 523 | 
            +
             | 
| 516 524 | 
             
                      # Now let's add these urls to the database so as to reduce memory overhead.
         | 
| 517 525 | 
             
                      # Keep in mind - they're not active until the email goes out.
         | 
| 518 | 
            -
                      # also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since | 
| 519 | 
            -
                      # the  | 
| 526 | 
            +
                      # also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
         | 
| 527 | 
            +
                      # the newest are always the first ones parsed:
         | 
| 520 528 | 
             
                      tracked_listing.posts.create(
         | 
| 521 | 
            -
                        :url => post.url, | 
| 522 | 
            -
                        :created_at => newest_post_date | 
| 529 | 
            +
                        :url => post.url,
         | 
| 530 | 
            +
                        :created_at => newest_post_date
         | 
| 523 531 | 
             
                      ) unless most_recent_posting_date < newest_post_date
         | 
| 524 532 |  | 
| 525 533 | 
             
                    end
         | 
| 526 | 
            -
             | 
| 534 | 
            +
             | 
| 527 535 | 
             
                    listing = listing.next_page
         | 
| 528 536 | 
             
                  end
         | 
| 529 537 | 
             
                end
         | 
| @@ -532,41 +540,35 @@ report_summaries = craig_report.searches.collect do |search| | |
| 532 540 |  | 
| 533 541 |  | 
| 534 542 | 
             
              # Let's flatten the unique'd hash into a more useable array:
         | 
| 535 | 
            -
               | 
| 536 | 
            -
             | 
| 537 | 
            -
              #  * Its possible that we met all the criterion of the passes_filter? with merely a header, and
         | 
| 538 | 
            -
              #    if so we add a url to the summaries stack
         | 
| 539 | 
            -
              #  * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired? 
         | 
| 540 | 
            -
              #    or flagged_for_removal?, etc.
         | 
| 541 | 
            -
              #  * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
         | 
| 542 | 
            -
              #  * So - before we sort, we run a quick reject on nil post_dates 
         | 
| 543 | 
            -
              new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
         | 
| 544 | 
            -
              
         | 
| 543 | 
            +
              new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
         | 
| 544 | 
            +
             | 
| 545 545 | 
             
              # Now Let's manage the tracking database:
         | 
| 546 | 
            -
              if new_summaries.length > 0 | 
| 546 | 
            +
              if new_summaries.length > 0
         | 
| 547 547 |  | 
| 548 548 | 
             
                # We'll use this in the cleanup at the bottom:
         | 
| 549 549 | 
             
                latest_post_date = new_summaries.last.post_date
         | 
| 550 | 
            -
             | 
| 551 | 
            -
                new_summaries.reverse! if search.newest_first? | 
| 550 | 
            +
             | 
| 551 | 
            +
                new_summaries.reverse! if search.newest_first?
         | 
| 552 552 | 
             
              end
         | 
| 553 | 
            -
             | 
| 553 | 
            +
             | 
| 554 554 | 
             
              # We'll want to email these...
         | 
| 555 | 
            -
              { | 
| 555 | 
            +
              {
         | 
| 556 556 | 
             
                :latest_post_date => latest_post_date,
         | 
| 557 | 
            -
                :search_track => search_track, | 
| 558 | 
            -
                :postings => new_summaries, | 
| 557 | 
            +
                :search_track => search_track,
         | 
| 558 | 
            +
                :postings => new_summaries,
         | 
| 559 559 | 
             
                :search => search
         | 
| 560 560 | 
             
              }
         | 
| 561 561 | 
             
            end
         | 
| 562 562 |  | 
| 563 | 
            -
            # Time to send the email:
         | 
| 564 | 
            -
             | 
| 565 | 
            -
               | 
| 566 | 
            -
             | 
| 567 | 
            -
             | 
| 568 | 
            -
             | 
| 569 | 
            -
             | 
| 563 | 
            +
            # Time to send the email (maybe):
         | 
| 564 | 
            +
            unless report_summaries.select { |s| !s[:postings].empty? }.empty?
         | 
| 565 | 
            +
              ReportMailer.report(
         | 
| 566 | 
            +
                craig_report.email_to,
         | 
| 567 | 
            +
                craig_report.email_from,
         | 
| 568 | 
            +
                craig_report.report_name,
         | 
| 569 | 
            +
                {:summaries => report_summaries, :definition => craig_report}
         | 
| 570 | 
            +
              ).deliver
         | 
| 571 | 
            +
            end
         | 
| 570 572 |  | 
| 571 573 | 
             
            # Commit (make 'active') all newly created tracked post urls:
         | 
| 572 574 | 
             
            TrackedPost.activate_all!
         | 
| @@ -576,4 +578,4 @@ report_summaries.each do |summary| | |
| 576 578 | 
             
              summary[:search_track].listings.each do |listing|
         | 
| 577 579 | 
             
                listing.delete_posts_older_than listing.last_tracked_at
         | 
| 578 580 | 
             
              end
         | 
| 579 | 
            -
            end
         | 
| 581 | 
            +
            end
         | 
| @@ -0,0 +1,20 @@ | |
| 1 | 
            +
            <h2><%=h @subject %></h2>
         | 
| 2 | 
            +
            <%@summaries.each do |summary| %>
         | 
| 3 | 
            +
              <h3><%=h summary[:search].name%></h3>
         | 
| 4 | 
            +
              <% if summary[:postings].length > 0 %>
         | 
| 5 | 
            +
                <%summary[:postings].each do |post|%>
         | 
| 6 | 
            +
                  <p>
         | 
| 7 | 
            +
                  <%=('%s <a href="%s">%s</a>' % [ 
         | 
| 8 | 
            +
               			h(post.post_date.strftime('%b %d')), post.url, h(post.title)
         | 
| 9 | 
            +
                  ]).html_safe %>
         | 
| 10 | 
            +
                  <%=([
         | 
| 11 | 
            +
                    (post.price) ? h(post.price.try(:format, :no_cents => true)) : nil,
         | 
| 12 | 
            +
               			(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : nil,
         | 
| 13 | 
            +
               			(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': nil
         | 
| 14 | 
            +
                  ].compact.join(' ')).html_safe -%>
         | 
| 15 | 
            +
                  </p>
         | 
| 16 | 
            +
                <% end %>
         | 
| 17 | 
            +
              <% else %>
         | 
| 18 | 
            +
                <p><i>No new postings were found, which matched the search criteria.</i></p>
         | 
| 19 | 
            +
              <% end %>
         | 
| 20 | 
            +
            <% end %>
         | 
| @@ -1,18 +1,19 @@ | |
| 1 1 | 
             
            CRAIGSLIST REPORTER
         | 
| 2 2 |  | 
| 3 | 
            -
             | 
| 3 | 
            +
            <% @summaries.each do |summary| -%>
         | 
| 4 4 | 
             
               <%=summary[:search].name %>
         | 
| 5 5 | 
             
               <% summary[:postings].collect do |post| -%>
         | 
| 6 6 | 
             
                  <% if summary[:postings].length > 0 %>
         | 
| 7 | 
            -
                  <%='%s : %s %s %s %s' % [
         | 
| 7 | 
            +
                  <%='%s : %s %s %s %s %s' % [
         | 
| 8 8 | 
             
            			post.post_date.strftime('%b %d'),
         | 
| 9 | 
            -
            			post. | 
| 10 | 
            -
             | 
| 11 | 
            -
            			(post. | 
| 9 | 
            +
            			post.title,
         | 
| 10 | 
            +
                  post.price.try(:format, :no_cents => true),
         | 
| 11 | 
            +
            			(post.location) ? " (#{post.location})" : nil,
         | 
| 12 | 
            +
            			(post.has_pic_or_img?) ? ' [img]': nil,
         | 
| 12 13 | 
             
            			post.url
         | 
| 13 14 | 
             
                  ] -%>
         | 
| 14 15 | 
             
                  <% else %>
         | 
| 15 16 | 
             
                  No new postings were found, which matched the search criteria.
         | 
| 16 17 | 
             
                  <% end %>
         | 
| 17 18 | 
             
               <% end %>
         | 
| 18 | 
            -
            <% end -%>
         | 
| 19 | 
            +
            <% end -%>
         | 
    
        data/lib/geo_listings.rb
    CHANGED