olek-libcraigscrape 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. data/CHANGELOG +94 -0
  2. data/COPYING +674 -0
  3. data/COPYING.LESSER +165 -0
  4. data/README +89 -0
  5. data/Rakefile +125 -0
  6. data/bin/craig_report_schema.yml +68 -0
  7. data/bin/craigwatch +581 -0
  8. data/bin/report_mailer/craigslist_report.html.erb +17 -0
  9. data/bin/report_mailer/craigslist_report.plain.erb +18 -0
  10. data/lib/geo_listings.rb +144 -0
  11. data/lib/libcraigscrape.rb +217 -0
  12. data/lib/listings.rb +160 -0
  13. data/lib/posting.rb +324 -0
  14. data/lib/scraper.rb +212 -0
  15. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  16. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  17. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  18. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  19. data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
  20. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
  21. data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
  22. data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
  23. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
  24. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
  25. data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
  26. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
  27. data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
  28. data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
  29. data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
  30. data/test/google.html +8 -0
  31. data/test/libcraigscrape_test_helpers.rb +37 -0
  32. data/test/listing_samples/category_output.html +231 -0
  33. data/test/listing_samples/category_output_2.html +217 -0
  34. data/test/listing_samples/empty_listings.html +128 -0
  35. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  36. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  37. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  38. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  39. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  40. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  41. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  42. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  43. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  44. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  45. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  46. data/test/listing_samples/long_search_output.html +137 -0
  47. data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
  48. data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
  49. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  50. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  51. data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
  52. data/test/listing_samples/short_search_output.html +133 -0
  53. data/test/post_samples/1207457727.html +92 -0
  54. data/test/post_samples/brw_reb_1224008903.html +101 -0
  55. data/test/post_samples/posting0.html +91 -0
  56. data/test/post_samples/posting1.html +106 -0
  57. data/test/post_samples/posting1796890756-061710.html +2318 -0
  58. data/test/post_samples/posting1808219423.html +2473 -0
  59. data/test/post_samples/posting1938291834-090610.html +188 -0
  60. data/test/post_samples/posting2.html +107 -0
  61. data/test/post_samples/posting3.html +92 -0
  62. data/test/post_samples/posting4.html +993 -0
  63. data/test/post_samples/posting5.html +38 -0
  64. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  65. data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
  66. data/test/post_samples/this_post_has_expired.html +48 -0
  67. data/test/test_craigslist_geolisting.rb +521 -0
  68. data/test/test_craigslist_listing.rb +362 -0
  69. data/test/test_craigslist_posting.rb +426 -0
  70. metadata +273 -0
data/bin/craigwatch ADDED
@@ -0,0 +1,581 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # =craigwatch - A email-based "post monitoring" solution
4
+ #
5
+ # Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
6
+ # craigslist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
7
+ # and report all new postings within a listing or search url, since its last run, by email.
8
+ #
9
+ # For more information, head to the {craiglist monitoring}[http://www.derosetechnologies.com/community/libcraigscrape] help section of our website.
10
+ #
11
+ # == Features
12
+ # In addition to its report tracking, craigwatch offers many post search and filtering options that offer much improved
13
+ # and more accurate results than do craigslist's search functions. Post filtering options include:
14
+ # - has_image - yes/no
15
+ # - price_required - yes/no
16
+ # - price_greater_than - (int)
17
+ # - price_less_than - (int)
18
+ # - full_post_has - (array of string or regexp) Only post whose full-post's contents contains/matches
19
+ # - full_post_has_no - (array of string or regexp) Only posts whose full-post's contents don't contain/match
20
+ # - summary_post_has - (array of string or regexp) Only post whose listing's label contains/matches
21
+ # - summary_post_has_no - (array of string or regexp) Only post whose listing's label doesn't contain/match
22
+ # - summary_or_full_post_has - (array of string or regexp) Filter's out results which don't match either the post label <b>or</b> the post contents
23
+ # - summary_or_full_post_has_no - (array of string or regexp) Filter's out results which match either the post label <b>or</b> the post contents
24
+ # - location_has - (array of string or regexp) Only include posts which match against the post location
25
+ # - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
26
+ #
27
+ # Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
28
+ #
29
+ # Reporting output is easily customized html, handled by ActionMailer, and emails can be delivered via smtp or sendmail.
30
+ # Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
31
+ # major backends (sqllite/mysql/postgres/probably-all-others). Database sizes are contained by automatically pruning old results
32
+ # that are no longer required at the end of each run.
33
+ #
34
+ # Pretty useful, no?
35
+ #
36
+ # == Installation
37
+ # craigwatch is coupled with libcraigscrape, and is installed via ruby gems. However, since we focused on keeping the
38
+ # libcraigscrape download 'lightweight' some additional gems need to be installed in addition to the initial libcraigscrape
39
+ # gem itself.
40
+ #
41
+ # This should take care of the craigwatch install on all systems:
42
+ # sudo gem install libcraigscrape kwalify activerecord actionmailer
43
+ # Alternatively, if you've already installed libcraigscrape and want to start working with craigwatch:
44
+ # sudo gem install kwalify activerecord actionmailer
45
+ #
46
+ # This script was initially developed with activerecord 2.3, actionmailer 2.3 and kwalify 0.7, but will likely work with most
47
+ # prior and future versions of these libraries.
48
+ #
49
+ # == Usage
50
+ # When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
51
+ # this parameter is the path to a valid report-definition yml file. ie:
52
+ # craigwatch johns_daily_watch.yml
53
+ #
54
+ # There is an included kwalify schema which can validate your definition files, but craigwatch will automatically do so at startup.
55
+ # Probably, the best way to understand the report definition files, is to look at the annotated sample file below, and use it as a
56
+ # starting point for your own.
57
+ #
58
+ # By default there is no program output, however, setting any of the following parameters to 'yes' in your definition file will turn on
59
+ # useful debugging/logging output:
60
+ # - debug_database
61
+ # - debug_mailer
62
+ # - debug_craigscrape
63
+ #
64
+ # == Definition File Sample
65
+ #
66
+ # Let's start with a minimal report, just enough needed to get something quick working:
67
+ # # We need some kind of destination to send this to
68
+ # email_to: Chris DeRose <cderose@derosetechnologies.com>
69
+ #
70
+ # # This is an array of specific 'searches' we'll be performing in this report:
71
+ # searches:
72
+ # # We're looking for 90's era cadillac, something cheap, comfortable and in white...
73
+ # - name: 90's White/Creme Convertible Cadillacs
74
+ #
75
+ # # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build.
76
+ # # Its optional, and if omitted, craigwatch defaults to 'yesterday'
77
+ # starting: 9/10/09
78
+ #
79
+ # # We want to check all the labels, and filter out years not in the 90's, and cars not made by cadillac
80
+ # summary_post_has:
81
+ # - /(?:^|[^\d]|19)9[\d](?:[^\dk]|$)/i
82
+ # - /cadillac/i
83
+ #
84
+ # # I said we're looking for something *comfortable* !
85
+ # summary_post_has_no: [ /xlr/i ]
86
+ #
87
+ # # We want a convertible, and white/cream/etc:
88
+ # full_post_has:
89
+ # - /convertible/i
90
+ # - /(white|yellow|banana|creme|cream)/i
91
+ #
92
+ # # Convertible - not *simulated* convertible!
93
+ # full_post_has_no:
94
+ # - /simulated[^a-z]{0,2}convertible/i
95
+ #
96
+ # # We want to search all of craigslist's in the us, and we'll want to find it using
97
+ # # the '/search/cta?hasPic=1&query=cadillac' url on the site
98
+ # sites: [ us ]
99
+ # listings:
100
+ # - /search/cta?hasPic=1&query=cadillac
101
+ #
102
+ # Here's another annotated report which uses most of the other available craigwatch features:
103
+ #
104
+ # # The report_name is fed into Time.now.strftime, hence the formatting characters
105
+ # report_name: Craig Watch For Johnathan on %D at %I:%M %p
106
+ #
107
+ # email_to: Johnathan Peabody <john@example.local>
108
+ #
109
+ # # This is sent straight into ActiveRecord, so there's plenty of options available here. the following is an easy
110
+ # # default sqlite store that should work on most any system with a minimal overhead
111
+ # tracking_database: { adapter: sqlite3, dbfile: /home/john/john_cwatch_report.db }
112
+ #
113
+ # searches:
114
+ # # Search #1:
115
+ # - name: Schwinn Bikes For Sale in/near New York
116
+ # starting: 9/10/2009
117
+ #
118
+ # # Scrape the following sites/servers:
119
+ # sites: [ us/ny/newyork, us/nj/southjersey ]
120
+ #
121
+ # # Scrape the following listings pages:
122
+ # listings: [ bik ]
123
+ #
124
+ # # We want listings with Schwinn in the summary
125
+ # summary_post_has: [ /schwinn/i ]
126
+ #
127
+ # # We're only interested in adult bikes, so scrap any results that mention children or kids
128
+ # full_post_has_no: [ /(children|kids)/i ]
129
+ #
130
+ # # Oh, and we're on a budget:
131
+ # price_less_than: 120
132
+ #
133
+ # # Search #2
134
+ # - name: Large apartment rentals in San Francisco
135
+ # sites: [ us/ca/sfbay ]
136
+ # starting: 9/10/2009
137
+ #
138
+ # # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
139
+ # # want to conserve some bandwidth
140
+ # listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
141
+ #
142
+ # # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
143
+ # price_required: yes
144
+ #
145
+ # # Hopefully this will keep us away from a bad part of town:
146
+ # price_greater_than: 1000
147
+ #
148
+ # # Since we don't have time to drive to each location, we'll require only listings with pictures
149
+ # has_image: yes
150
+ #
151
+ # == Author
152
+ # - Chris DeRose (cderose@derosetechnologies.com)
153
+ # - DeRose Technologies, Inc. http://www.derosetechnologies.com
154
+ #
155
+ # == License
156
+ #
157
+ # See COPYING[link:files/COPYING.html]
158
+ #
159
+ $: << File.dirname(__FILE__) + '/../lib'
160
+
161
+ require 'rubygems'
162
+
163
+ gem 'kwalify', '~> 0.7'
164
+ gem 'activerecord', '~> 2.3'
165
+ gem 'actionmailer', '~> 2.3'
166
+
167
+ require 'kwalify'
168
+ require 'active_record'
169
+ require 'action_mailer'
170
+ require 'kwalify/util/hashlike'
171
+ require 'libcraigscrape'
172
+ require "socket"
173
+
174
+ class String #:nodoc:
175
+ RE = /^\/(.*)\/([ixm]*)$/
176
+
177
+ def is_re?
178
+ (RE.match self) ? true : false
179
+ end
180
+
181
+ def to_re
182
+ source, options = ( RE.match(self) )? [$1, $2] : [self,nil]
183
+ mods = 0
184
+
185
+ options.each_char do |c|
186
+ mods |= case c
187
+ when 'i' then Regexp::IGNORECASE
188
+ when 'x' then Regexp::EXTENDED
189
+ when 'm' then Regexp::MULTILINE
190
+ end
191
+ end unless options.nil? or options.empty?
192
+
193
+ Regexp.new source, mods
194
+ end
195
+ end
196
+
197
+ class CraigReportDefinition #:nodoc:
198
+ include Kwalify::Util::HashLike
199
+
200
+ EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
201
+
202
+ attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, :smtp_settings
203
+
204
+ def debug_database?; @debug_database; end
205
+ def debug_mailer?; @debug_mailer; end
206
+ def debug_craigscrape?; @debug_craigscrape; end
207
+
208
+ def email_from
209
+ (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
210
+ end
211
+
212
+ def email_to_name
213
+ EMAIL_NAME_PARTS.match(email_to) ? $1 : email_to
214
+ end
215
+
216
+ def report_name
217
+ @report_name ? @report_name : "Craigslist Watch For #{email_to_name} on %D at %I:%M %p"
218
+ end
219
+
220
+ # We allow people to rewrite relative (sqlite) dbfiles by taking the use_cwd as a parameter
221
+ def tracking_database(for_yaml_file = nil)
222
+ # We'll setup a SQLite db using some defaults if needed
223
+ @tracking_database ||= {
224
+ :adapter => 'sqlite3',
225
+ :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
226
+ } if for_yaml_file
227
+
228
+ # This is a little hack to make sqlite definitions a little more portable, by allowing them
229
+ # to specify dbfiles relative to the yml's directory:
230
+ ret = @tracking_database
231
+ ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
232
+ for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
233
+ )
234
+
235
+ ret
236
+ end
237
+
238
+ class SearchDefinition #:nodoc:
239
+ include Kwalify::Util::HashLike
240
+
241
+ attr_reader :name, :sites, :listings
242
+ attr_reader :location_has, :location_has_no
243
+ attr_reader :full_post_has, :full_post_has_no
244
+ attr_reader :summary_post_has, :summary_post_has_no
245
+ attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
246
+
247
+ attr_reader :price_greater_than,:price_less_than
248
+
249
+ def has_image?; @has_image; end
250
+ def newest_first?; @newest_first; end
251
+ def price_required?; @price_required; end
252
+
253
+ def starting_at
254
+ (@starting) ?
255
+ Time.parse(@starting) :
256
+ Time.now.yesterday.beginning_of_day
257
+ end
258
+
259
+ def passes_filter?(post)
260
+ if post.price.nil?
261
+ return false if price_required?
262
+ else
263
+ return false if @price_greater_than and post.price <= @price_greater_than
264
+ return false if @price_less_than and post.price >= @price_less_than
265
+ end
266
+
267
+ # Label Filters:
268
+ return false unless matches_all? summary_post_has, post.label
269
+ return false unless doesnt_match_any? summary_post_has_no, post.label
270
+
271
+ # Location Filters:
272
+ return false unless matches_all? location_has, post.location
273
+ return false unless doesnt_match_any? location_has_no, post.location
274
+
275
+ # Full post Filters:
276
+ if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
277
+ # We're going to download the page, so let's make sure we didnt hit a "This posting has been flagged for removal"
278
+ return false if post.system_post?
279
+
280
+ return false unless matches_all? full_post_has, post.contents_as_plain
281
+ return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
282
+
283
+ return false unless matches_all? summary_or_full_post_has, [post.contents_as_plain, post.label]
284
+ return false unless doesnt_match_any? summary_or_full_post_has_no, [post.contents_as_plain, post.label]
285
+ end
286
+
287
+ true
288
+ end
289
+
290
+ private
291
+
292
+ def matches_all?(conditions, against)
293
+ against = against.to_a
294
+ (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
295
+ end
296
+
297
+ def doesnt_match_any?(conditions, against)
298
+ against = against.to_a
299
+ (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
300
+ end
301
+
302
+ def match_against(condition, against)
303
+ (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
304
+ end
305
+ end
306
+ end
307
+
308
+ class TrackedSearch < ActiveRecord::Base #:nodoc:
309
+ has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
310
+ validates_uniqueness_of :search_name
311
+ validates_presence_of :search_name
312
+
313
+ def self.find_by_name(name)
314
+ self.find :first, :conditions => ['search_name = ?',name]
315
+ end
316
+
317
+ def find_listing_by_url(url)
318
+ listings.find :first, :conditions => ['url = ?', url]
319
+ end
320
+ end
321
+
322
+ class TrackedListing < ActiveRecord::Base #:nodoc:
323
+ has_many :posts, :dependent => :destroy, :class_name => 'TrackedPost'
324
+ validates_presence_of :url, :tracked_search_id
325
+
326
+ def already_tracked?(url)
327
+ ( self.posts.find :first, :conditions => ['url = ?', url]) ? true : false
328
+ end
329
+
330
+ def last_tracked_at
331
+ self.posts.maximum 'created_at'
332
+ end
333
+
334
+ def delete_posts_older_than(cutoff_date)
335
+ # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
336
+ TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
337
+ end
338
+ end
339
+
340
+ class TrackedPost < ActiveRecord::Base #:nodoc:
341
+ validates_presence_of :url, :tracked_listing_id
342
+
343
+ def self.activate_all!
344
+ TrackedPost.update_all(
345
+ { :active => true },
346
+ [ 'active = ?', false ]
347
+ )
348
+ end
349
+
350
+ def self.destroy_inactive!
351
+ TrackedPost.delete_all [ 'active = ?', false ]
352
+ end
353
+ end
354
+
355
+ class ReportMailer < ActionMailer::Base #:nodoc:
356
+ def report(to, sender, subject_template, report_tmpl)
357
+
358
+ formatted_subject = Time.now.strftime(subject_template)
359
+
360
+ recipients to
361
+ from sender
362
+ subject formatted_subject
363
+
364
+ generate_view_parts 'craigslist_report', report_tmpl.merge({:subject =>formatted_subject})
365
+ end
366
+
367
+ def generate_view_parts(view_name, tmpl)
368
+ part( :content_type => "multipart/alternative" ) do |p|
369
+ [
370
+ { :content_type => "text/plain", :body => render_message("#{view_name.to_s}.plain.erb", tmpl) },
371
+ { :content_type => "text/html", :body => render_message("#{view_name.to_s}.html.erb", tmpl.merge({:part_container => p})) }
372
+ ].each { |parms| p.part parms.merge( { :charset => "UTF-8", :transfer_encoding => "7bit" } ) }
373
+ end
374
+ end
375
+ end
376
+
377
+ #############
378
+
379
+ # Let's start our program now:
380
+ report_definition_file = ARGV[0] if ARGV[0] and File.readable?(ARGV[0])
381
+
382
+ unless report_definition_file
383
+ puts <<EOD
384
+ Usage:
385
+ #{File.basename($0)} [report_definition_file]
386
+
387
+ Run 'gem server' and browse the libcraigscrape rdoc for 'bin/craigscrape' for specific usage details.
388
+ EOD
389
+ exit
390
+ end
391
+
392
+ # Validate/Parse our input file:
393
+ parser = Kwalify::Yaml::Parser.new(
394
+ Kwalify::Validator.new(
395
+ Kwalify::Yaml.load_file(File.dirname(__FILE__)+'/craig_report_schema.yml')
396
+ ),
397
+ :data_binding => true
398
+ )
399
+
400
+ craig_report = parser.parse_file report_definition_file
401
+
402
+ parser.errors.each do |e|
403
+ puts "Definition Validation Error (line #{e.linenum}, char #{e.column}): #{e.message}"
404
+ end and exit if parser.errors.length > 0
405
+
406
+ # Initialize Action Mailer:
407
+ ActionMailer::Base.logger = Logger.new STDERR if craig_report.debug_mailer?
408
+ if craig_report.smtp_settings
409
+ ReportMailer.smtp_settings = craig_report.smtp_settings.symbolize_keys
410
+ else
411
+ ReportMailer.delivery_method = :sendmail
412
+ end
413
+ ReportMailer.template_root = File.dirname __FILE__
414
+
415
+ # Initialize the database:
416
+ ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
417
+ ActiveRecord::Base.establish_connection craig_report.tracking_database(report_definition_file)
418
+
419
+ # Initialize CraigScrape (sorta)
420
+ CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
421
+
422
+ # Perform migrations if needed?
423
+ ActiveRecord::Schema.define do
424
+ suppress_messages do
425
+ create_table :tracked_searches do |t|
426
+ t.column :search_name, :string
427
+ end unless table_exists? :tracked_searches
428
+
429
+ create_table :tracked_listings do |t|
430
+ t.column :url, :string
431
+ t.column :tracked_search_id, :integer
432
+ end unless table_exists? :tracked_listings
433
+
434
+ create_table :tracked_posts do |t|
435
+ t.column :url, :string
436
+ t.column :tracked_listing_id, :integer
437
+ t.column :created_at, :date
438
+ t.column :active, :boolean, :default => 0
439
+ end unless table_exists? :tracked_posts
440
+ end
441
+ end
442
+
443
+ # Remove all posts which are inactive. They would be in there if the prior run was a failure.
444
+ TrackedPost.destroy_inactive!
445
+
446
+ # We'll need these outside this next loop:
447
+ newly_tracked_posts = []
448
+
449
+ # Now let's run a report:
450
+ report_summaries = craig_report.searches.collect do |search|
451
+ # Load our tracking info
452
+ search_track = TrackedSearch.find_by_name search.name
453
+
454
+ # No Tracking found - let's set one up:
455
+ search_track = TrackedSearch.create! :search_name => search.name unless search_track
456
+
457
+ # This hash tracks what makes it into the report on this search.
458
+ # NOTE that keys are url's b/c sometimes the same posting will end up in multiple listings,
459
+ # And doing this ensures that we don't end-up reporting the same post twice.
460
+ new_summaries = {}
461
+
462
+ # And now we actually scrape:
463
+ CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
464
+ # Keep in mind that listing.url does change in the while loop.
465
+ # But, this first one is a good base_url that will never change between runs.
466
+
467
+ tracked_listing = search_track.find_listing_by_url listing.url
468
+ tracked_listing ||= search_track.listings.create! :url => listing.url
469
+
470
+ # Gives us a sane stopping point (hopefully) :
471
+ last_tracked_at = tracked_listing.last_tracked_at
472
+ last_tracked_at ||= search.starting_at
473
+
474
+ # Some more stopping points (probably):
475
+ already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
476
+
477
+ # We'll use this in the loop to decide what posts to track:
478
+ newest_post_date = last_tracked_at
479
+
480
+ # We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
481
+ # but have no post.post_date since the posting was removed and it parsed to nil
482
+ most_recent_posting_date = Time.now
483
+
484
+ # OK - Now let's go!
485
+ catch :list_break do
486
+ while listing
487
+ listing.posts.each do |post|
488
+ begin
489
+ most_recent_posting_date = post.post_date if post.post_date
490
+
491
+ # Are we at a point in the scrape, past which we don't need to proceed?
492
+ throw :list_break if (
493
+ most_recent_posting_date < last_tracked_at or
494
+ already_tracked_urls.include? post.url
495
+ )
496
+
497
+ # If we want to report this post, add it to the collection:
498
+ new_summaries[post.url] = post if (
499
+ !new_summaries.has_key? post.url and
500
+ search.passes_filter? post
501
+ )
502
+ rescue CraigScrape::Scraper::ResourceNotFoundError,CraigScrape::Scraper::MaxRedirectError => e
503
+ # Sometimes we do end up with 404's that will never load, and we dont want to
504
+ # abort a run simply b/c we found some anomaly due to the craigslist index
505
+ # being out of date. This ResourceNotFoundError can occur due to
506
+ # loading the post url in full, only to see that it was yanked - or craigslist
507
+ # is acting funny.
508
+ next
509
+ end
510
+
511
+ # Now let's see if the url should be kept in our tracking database for the future...
512
+
513
+ # This post-date sets a limit for the tracked_listing.posts.create below
514
+ newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
515
+
516
+ # Now let's add these urls to the database so as to reduce memory overhead.
517
+ # Keep in mind - they're not active until the email goes out.
518
+ # also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
519
+ # the newest are always the first ones parsed:
520
+ tracked_listing.posts.create(
521
+ :url => post.url,
522
+ :created_at => newest_post_date
523
+ ) unless most_recent_posting_date < newest_post_date
524
+
525
+ end
526
+
527
+ listing = listing.next_page
528
+ end
529
+ end
530
+ end
531
+
532
+
533
+
534
+ # Let's flatten the unique'd hash into a more usable array:
535
+ # NOTE: The reason we included a reject is a little complicated, but here's the gist:
536
+ # * We try not to load the whole post if we don't have to
537
+ # * Its possible that we met all the criterion of the passes_filter? with merely a header, and
538
+ # if so we add a url to the summaries stack
539
+ # * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
540
+ # or flagged_for_removal?, etc.
541
+ # * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
542
+ # * So - before we sort, we run a quick reject on nil post_dates
543
+ new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
544
+
545
+ # Now Let's manage the tracking database:
546
+ if new_summaries.length > 0
547
+
548
+ # We'll use this in the cleanup at the bottom:
549
+ latest_post_date = new_summaries.last.post_date
550
+
551
+ new_summaries.reverse! if search.newest_first?
552
+ end
553
+
554
+ # We'll want to email these...
555
+ {
556
+ :latest_post_date => latest_post_date,
557
+ :search_track => search_track,
558
+ :postings => new_summaries,
559
+ :search => search
560
+ }
561
+ end
562
+
563
+ # Time to send the email (maybe):
564
+ unless report_summaries.select { |s| ! s[:postings].empty? }.empty?
565
+ ReportMailer.deliver_report(
566
+ craig_report.email_to,
567
+ craig_report.email_from,
568
+ craig_report.report_name,
569
+ {:summaries => report_summaries, :definition => craig_report}
570
+ )
571
+ end
572
+
573
+ # Commit (make 'active') all newly created tracked post urls:
574
+ TrackedPost.activate_all!
575
+
576
+ # Now remove all the no-longer-need posts from the prior run:
577
+ report_summaries.each do |summary|
578
+ summary[:search_track].listings.each do |listing|
579
+ listing.delete_posts_older_than listing.last_tracked_at
580
+ end
581
+ end