libcraigscrape 1.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/bin/craigwatch
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
2
3
|
#
|
3
4
|
# =craigwatch - An email-based "post monitoring" solution
|
4
5
|
#
|
5
|
-
# Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
|
6
|
-
# craiglist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
|
6
|
+
# Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
|
7
|
+
# craiglist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
|
7
8
|
# and report all new postings within a listing or search url, since its last run, by email.
|
8
9
|
#
|
9
10
|
# For more information, head to the {craigslist monitoring}[http://www.derosetechnologies.com/community/libcraigscrape] help section of our website.
|
@@ -25,29 +26,19 @@
|
|
25
26
|
# - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
|
26
27
|
#
|
27
28
|
# Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
|
28
|
-
#
|
29
|
+
#
|
29
30
|
# Reporting output is easily customized html, handled by ActionMailer, and emails can be delivered via smtp or sendmail.
|
30
|
-
# Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
|
31
|
+
# Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
|
31
32
|
# major backends (sqlite/mysql/postgres/probably-all-others). Database sizes are contained by automatically pruning old results
|
32
33
|
# that are no longer required at the end of each run.
|
33
34
|
#
|
34
35
|
# Pretty useful, no?
|
35
|
-
#
|
36
|
+
#
|
36
37
|
# == Installation
|
37
|
-
# craigwatch is coupled with libcraigscrape, and is installed via ruby gems.
|
38
|
-
#
|
39
|
-
# gem itself.
|
40
|
-
#
|
41
|
-
# This should take care of the craigwatch install on all systems:
|
42
|
-
# sudo gem install libcraigscrape kwalify activerecord actionmailer
|
43
|
-
# Alternatively, if you've already installed libcraigscrape and want to start working with craigwatch:
|
44
|
-
# sudo gem install kwalify activerecord actionmailer
|
45
|
-
#
|
46
|
-
# This script was initially developed with activerecord 2.3, actionmailer 2.3 and kwalify 0.7, but will likely work with most
|
47
|
-
# prior and future versions of these libraries.
|
48
|
-
#
|
38
|
+
# craigwatch is coupled with libcraigscrape, and is installed via ruby gems.
|
39
|
+
#
|
49
40
|
# == Usage
|
50
|
-
# When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
|
41
|
+
# When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
|
51
42
|
# this parameter is the path to a valid report-definition yml file. ie:
|
52
43
|
# craigwatch johns_daily_watch.yml
|
53
44
|
#
|
@@ -55,6 +46,9 @@
|
|
55
46
|
# Probably, the best way to understand the report definition files, is to look at the annotated sample file below, and use it as a
|
56
47
|
# starting point for your own.
|
57
48
|
#
|
49
|
+
# New in version 1.1.0 is ERB evaluation of the report-definition file. This feature is automatic, just include the erb blocks you'd
|
50
|
+
# like, and the file will be evaluated at runtime.
|
51
|
+
#
|
58
52
|
# By default there is no program output, however, setting any of the following parameters to 'yes' in your definition file will turn on
|
59
53
|
# useful debugging/logging output:
|
60
54
|
# - debug_database
|
@@ -63,10 +57,10 @@
|
|
63
57
|
#
|
64
58
|
# == Definition File Sample
|
65
59
|
#
|
66
|
-
# Let's start with a minimal report, just enough needed to get something quick working:
|
60
|
+
# Let's start with a minimal report, just enough needed to get something quick working:
|
67
61
|
# # We need some kind of destination to send this to
|
68
62
|
# email_to: Chris DeRose <cderose@derosetechnologies.com>
|
69
|
-
#
|
63
|
+
#
|
70
64
|
# # This is an array of specific 'searches' we'll be performing in this report:
|
71
65
|
# searches:
|
72
66
|
# # We're looking for 90's era cadillac, something cheap, comfortable and in white...
|
@@ -85,7 +79,7 @@
|
|
85
79
|
# summary_post_has_no: [ /xlr/i ]
|
86
80
|
#
|
87
81
|
# # We want a convertible, and white/cream/etc:
|
88
|
-
# full_post_has:
|
82
|
+
# full_post_has:
|
89
83
|
# - /convertible/i
|
90
84
|
# - /(white|yellow|banana|creme|cream)/i
|
91
85
|
#
|
@@ -93,7 +87,7 @@
|
|
93
87
|
# full_post_has_no:
|
94
88
|
# - /simulated[^a-z]{0,2}convertible/i
|
95
89
|
#
|
96
|
-
# # We want to search all of craigslist's in the us, and we'll want to find it using
|
90
|
+
# # We want to search all of craigslist's in the us, and we'll want to find it using
|
97
91
|
# # the '/search/cta?hasPic=1&query=cadillac' url on the site
|
98
92
|
# sites: [ us ]
|
99
93
|
# listings:
|
@@ -104,6 +98,9 @@
|
|
104
98
|
# # The report_name is fed into Time.now.strftime, hence the formatting characters
|
105
99
|
# report_name: Craig Watch For Johnathan on %D at %I:%M %p
|
106
100
|
#
|
101
|
+
# # Overrides the default system time zone with an EST zone
|
102
|
+
# tz: EST
|
103
|
+
#
|
107
104
|
# email_to: Johnathan Peabody <john@example.local>
|
108
105
|
#
|
109
106
|
# # This is sent straight into ActiveRecord, so there's plenty of options available here. the following is an easy
|
@@ -129,21 +126,21 @@
|
|
129
126
|
#
|
130
127
|
# # Oh, and we're on a budget:
|
131
128
|
# price_less_than: 120
|
132
|
-
#
|
129
|
+
#
|
133
130
|
# # Search #2
|
134
131
|
# - name: Large apartment rentals in San Francisco
|
135
132
|
# sites: [ us/ca/sfbay ]
|
136
133
|
# starting: 9/10/2009
|
137
|
-
#
|
138
|
-
# # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
|
134
|
+
#
|
135
|
+
# # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
|
139
136
|
# # want to conserve some bandwidth
|
140
137
|
# listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
|
141
138
|
#
|
142
139
|
# # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
|
143
140
|
# price_required: yes
|
144
|
-
#
|
141
|
+
#
|
145
142
|
# # Hopefully this will keep us away from a bad part of town:
|
146
|
-
# price_greater_than: 1000
|
143
|
+
# price_greater_than: 1000
|
147
144
|
#
|
148
145
|
# # Since we don't have time to drive to each location, we'll require only listings with pictures
|
149
146
|
# has_image: yes
|
@@ -160,9 +157,9 @@ $: << File.dirname(__FILE__) + '/../lib'
|
|
160
157
|
|
161
158
|
require 'rubygems'
|
162
159
|
|
163
|
-
gem 'kwalify'
|
164
|
-
gem 'activerecord'
|
165
|
-
gem 'actionmailer'
|
160
|
+
gem 'kwalify'
|
161
|
+
gem 'activerecord'
|
162
|
+
gem 'actionmailer'
|
166
163
|
|
167
164
|
require 'kwalify'
|
168
165
|
require 'active_record'
|
@@ -170,19 +167,20 @@ require 'action_mailer'
|
|
170
167
|
require 'kwalify/util/hashlike'
|
171
168
|
require 'libcraigscrape'
|
172
169
|
require "socket"
|
170
|
+
require 'active_support/all'
|
173
171
|
|
174
172
|
class String #:nodoc:
|
175
173
|
RE = /^\/(.*)\/([ixm]*)$/
|
176
|
-
|
174
|
+
|
177
175
|
def is_re?
|
178
176
|
(RE.match self) ? true : false
|
179
177
|
end
|
180
|
-
|
178
|
+
|
181
179
|
def to_re
|
182
180
|
source, options = ( RE.match(self) )? [$1, $2] : [self,nil]
|
183
181
|
mods = 0
|
184
182
|
|
185
|
-
options.each_char do |c|
|
183
|
+
options.each_char do |c|
|
186
184
|
mods |= case c
|
187
185
|
when 'i' then Regexp::IGNORECASE
|
188
186
|
when 'x' then Regexp::EXTENDED
|
@@ -199,12 +197,19 @@ class CraigReportDefinition #:nodoc:
|
|
199
197
|
|
200
198
|
EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
|
201
199
|
|
202
|
-
attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches,
|
200
|
+
attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches,
|
201
|
+
:smtp_settings, :tz
|
203
202
|
|
204
203
|
def debug_database?; @debug_database; end
|
205
204
|
def debug_mailer?; @debug_mailer; end
|
206
205
|
def debug_craigscrape?; @debug_craigscrape; end
|
207
206
|
|
207
|
+
# Returns the configuration report zone, if defined. Otherwise pulls the zone
|
208
|
+
# from the system's default local zone
|
209
|
+
def tz
|
210
|
+
@tz || Time.new.zone
|
211
|
+
end
|
212
|
+
|
208
213
|
def email_from
|
209
214
|
(@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
|
210
215
|
end
|
@@ -224,59 +229,66 @@ class CraigReportDefinition #:nodoc:
|
|
224
229
|
:adapter => 'sqlite3',
|
225
230
|
:database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
|
226
231
|
} if for_yaml_file
|
227
|
-
|
228
|
-
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
232
|
+
|
233
|
+
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
229
234
|
# to specify dbfiles relative to the yml's directory:
|
230
235
|
ret = @tracking_database
|
231
236
|
ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
|
232
237
|
for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
|
233
238
|
)
|
234
|
-
|
239
|
+
|
235
240
|
ret
|
236
241
|
end
|
237
242
|
|
238
243
|
class SearchDefinition #:nodoc:
|
239
|
-
include Kwalify::Util::HashLike
|
240
|
-
|
244
|
+
include Kwalify::Util::HashLike
|
245
|
+
|
241
246
|
attr_reader :name, :sites, :listings
|
242
247
|
attr_reader :location_has, :location_has_no
|
243
248
|
attr_reader :full_post_has, :full_post_has_no
|
244
249
|
attr_reader :summary_post_has, :summary_post_has_no
|
245
250
|
attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
|
246
|
-
|
247
|
-
attr_reader :price_greater_than,:price_less_than
|
248
251
|
|
249
252
|
def has_image?; @has_image; end
|
250
253
|
def newest_first?; @newest_first; end
|
251
254
|
def price_required?; @price_required; end
|
252
|
-
|
255
|
+
|
256
|
+
def price_greater_than
|
257
|
+
Money.new(@price_greater_than*100, 'USD') if @price_greater_than
|
258
|
+
end
|
259
|
+
|
260
|
+
def price_less_than
|
261
|
+
Money.new(@price_less_than*100, 'USD') if @price_less_than
|
262
|
+
end
|
263
|
+
|
253
264
|
def starting_at
|
254
|
-
(@starting) ?
|
255
|
-
|
256
|
-
|
265
|
+
(@starting) ?
|
266
|
+
Date.strptime(@starting, ['%m','%d',
|
267
|
+
/\/(?:[\d]{4})$/.match(@starting) ? '%Y' : '%y'].join('/') ) :
|
268
|
+
Date.yesterday
|
257
269
|
end
|
258
|
-
|
259
|
-
def passes_filter?(post)
|
270
|
+
|
271
|
+
def passes_filter?(post)
|
260
272
|
if post.price.nil?
|
261
273
|
return false if price_required?
|
262
274
|
else
|
263
|
-
return false if
|
264
|
-
return false if
|
275
|
+
return false if price_greater_than and post.price <= price_greater_than
|
276
|
+
return false if price_less_than and post.price >= price_less_than
|
265
277
|
end
|
266
|
-
|
278
|
+
|
267
279
|
# Label Filters:
|
268
280
|
return false unless matches_all? summary_post_has, post.label
|
269
281
|
return false unless doesnt_match_any? summary_post_has_no, post.label
|
270
|
-
|
282
|
+
|
271
283
|
# Location Filters:
|
272
284
|
return false unless matches_all? location_has, post.location
|
273
285
|
return false unless doesnt_match_any? location_has_no, post.location
|
274
|
-
|
286
|
+
|
275
287
|
# Full post Filters:
|
276
288
|
if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
|
277
289
|
# We're going to download the page, so let's make sure we didnt hit a "This posting has been flagged for removal"
|
278
290
|
return false if post.system_post?
|
279
|
-
|
291
|
+
|
280
292
|
return false unless matches_all? full_post_has, post.contents_as_plain
|
281
293
|
return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
|
282
294
|
|
@@ -286,21 +298,27 @@ class CraigReportDefinition #:nodoc:
|
|
286
298
|
|
287
299
|
true
|
288
300
|
end
|
289
|
-
|
301
|
+
|
290
302
|
private
|
291
|
-
|
303
|
+
|
292
304
|
def matches_all?(conditions, against)
|
293
|
-
|
294
|
-
(conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
|
305
|
+
(conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| match_against c, a } }) ? true : false
|
295
306
|
end
|
296
|
-
|
307
|
+
|
297
308
|
def doesnt_match_any?(conditions, against)
|
298
|
-
|
299
|
-
(conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
|
309
|
+
(conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| !match_against c, a } }) ? true : false
|
300
310
|
end
|
301
|
-
|
311
|
+
|
302
312
|
def match_against(condition, against)
|
303
|
-
(against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
|
313
|
+
(CraigScrape::Scraper.he_decode(against).scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
|
314
|
+
end
|
315
|
+
|
316
|
+
# This is kind of a hack to deal with ruby 1.9. Really the filtering mechanism
|
317
|
+
# needs to be factored out and tested....
|
318
|
+
def sanitized_against(against)
|
319
|
+
against = against.lines if against.respond_to? :lines
|
320
|
+
against = against.to_a if against.respond_to? :to_a
|
321
|
+
(against.nil?) ? [] : against.compact
|
304
322
|
end
|
305
323
|
end
|
306
324
|
end
|
@@ -309,11 +327,11 @@ class TrackedSearch < ActiveRecord::Base #:nodoc:
|
|
309
327
|
has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
|
310
328
|
validates_uniqueness_of :search_name
|
311
329
|
validates_presence_of :search_name
|
312
|
-
|
330
|
+
|
313
331
|
def self.find_by_name(name)
|
314
332
|
self.find :first, :conditions => ['search_name = ?',name]
|
315
333
|
end
|
316
|
-
|
334
|
+
|
317
335
|
def find_listing_by_url(url)
|
318
336
|
listings.find :first, :conditions => ['url = ?', url]
|
319
337
|
end
|
@@ -330,9 +348,8 @@ class TrackedListing < ActiveRecord::Base #:nodoc:
|
|
330
348
|
def last_tracked_at
|
331
349
|
self.posts.maximum 'created_at'
|
332
350
|
end
|
333
|
-
|
351
|
+
|
334
352
|
def delete_posts_older_than(cutoff_date)
|
335
|
-
# TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
|
336
353
|
TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
|
337
354
|
end
|
338
355
|
end
|
@@ -342,11 +359,11 @@ class TrackedPost < ActiveRecord::Base #:nodoc:
|
|
342
359
|
|
343
360
|
def self.activate_all!
|
344
361
|
TrackedPost.update_all(
|
345
|
-
{ :active => true },
|
346
|
-
[ 'active = ?', false ]
|
362
|
+
{ :active => true },
|
363
|
+
[ 'active = ?', false ]
|
347
364
|
)
|
348
365
|
end
|
349
|
-
|
366
|
+
|
350
367
|
def self.destroy_inactive!
|
351
368
|
TrackedPost.delete_all [ 'active = ?', false ]
|
352
369
|
end
|
@@ -354,23 +371,9 @@ end
|
|
354
371
|
|
355
372
|
class ReportMailer < ActionMailer::Base #:nodoc:
|
356
373
|
def report(to, sender, subject_template, report_tmpl)
|
357
|
-
|
358
|
-
formatted_subject = Time.now.strftime(subject_template)
|
359
|
-
|
360
|
-
recipients to
|
361
|
-
from sender
|
362
|
-
subject formatted_subject
|
363
|
-
|
364
|
-
generate_view_parts 'craigslist_report', report_tmpl.merge({:subject =>formatted_subject})
|
365
|
-
end
|
374
|
+
@summaries = report_tmpl[:summaries]
|
366
375
|
|
367
|
-
|
368
|
-
part( :content_type => "multipart/alternative" ) do |p|
|
369
|
-
[
|
370
|
-
{ :content_type => "text/plain", :body => render_message("#{view_name.to_s}.plain.erb", tmpl) },
|
371
|
-
{ :content_type => "text/html", :body => render_message("#{view_name.to_s}.html.erb", tmpl.merge({:part_container => p})) }
|
372
|
-
].each { |parms| p.part parms.merge( { :charset => "UTF-8", :transfer_encoding => "7bit" } ) }
|
373
|
-
end
|
376
|
+
mail :to => to, :subject => Time.zone.now.strftime(subject_template), :from => sender
|
374
377
|
end
|
375
378
|
end
|
376
379
|
|
@@ -383,7 +386,7 @@ unless report_definition_file
|
|
383
386
|
puts <<EOD
|
384
387
|
Usage:
|
385
388
|
#{File.basename($0)} [report_definition_file]
|
386
|
-
|
389
|
+
|
387
390
|
Run 'gem server' and browse the libcraigscrape rdoc for 'bin/craigscrape' for specific usage details.
|
388
391
|
EOD
|
389
392
|
exit
|
@@ -397,20 +400,25 @@ parser = Kwalify::Yaml::Parser.new(
|
|
397
400
|
:data_binding => true
|
398
401
|
)
|
399
402
|
|
400
|
-
|
403
|
+
report_definition_file_content = ERB.new(File.read(report_definition_file)).result
|
404
|
+
craig_report = parser.parse(report_definition_file_content, filename: report_definition_file)
|
401
405
|
|
402
406
|
parser.errors.each do |e|
|
403
407
|
puts "Definition Validation Error (line #{e.linenum}, char #{e.column}): #{e.message}"
|
404
408
|
end and exit if parser.errors.length > 0
|
405
409
|
|
410
|
+
# Set the time zone:
|
411
|
+
Time.zone = craig_report.tz
|
412
|
+
|
406
413
|
# Initialize Action Mailer:
|
414
|
+
ActionMailer::Base.prepend_view_path(File.dirname(__FILE__))
|
407
415
|
ActionMailer::Base.logger = Logger.new STDERR if craig_report.debug_mailer?
|
408
416
|
if craig_report.smtp_settings
|
409
|
-
|
417
|
+
ActionMailer::Base.smtp_settings = craig_report.smtp_settings.symbolize_keys
|
418
|
+
ActionMailer::Base.delivery_method = :smtp
|
410
419
|
else
|
411
|
-
|
420
|
+
ActionMailer::Base.delivery_method = :sendmail
|
412
421
|
end
|
413
|
-
ReportMailer.template_root = File.dirname __FILE__
|
414
422
|
|
415
423
|
# Initialize the database:
|
416
424
|
ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
|
@@ -421,16 +429,16 @@ CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrap
|
|
421
429
|
|
422
430
|
# Perform migrations if needed?
|
423
431
|
ActiveRecord::Schema.define do
|
424
|
-
suppress_messages do
|
432
|
+
suppress_messages do
|
425
433
|
create_table :tracked_searches do |t|
|
426
434
|
t.column :search_name, :string
|
427
435
|
end unless table_exists? :tracked_searches
|
428
|
-
|
436
|
+
|
429
437
|
create_table :tracked_listings do |t|
|
430
438
|
t.column :url, :string
|
431
439
|
t.column :tracked_search_id, :integer
|
432
|
-
end unless table_exists? :tracked_listings
|
433
|
-
|
440
|
+
end unless table_exists? :tracked_listings
|
441
|
+
|
434
442
|
create_table :tracked_posts do |t|
|
435
443
|
t.column :url, :string
|
436
444
|
t.column :tracked_listing_id, :integer
|
@@ -440,7 +448,7 @@ ActiveRecord::Schema.define do
|
|
440
448
|
end
|
441
449
|
end
|
442
450
|
|
443
|
-
# Remove all posts which are inactive. They would be in there if the prior run was a failure.
|
451
|
+
# Remove all posts which are inactive. They would be in there if the prior run was a failure.
|
444
452
|
TrackedPost.destroy_inactive!
|
445
453
|
|
446
454
|
# We'll need these outside this next loop:
|
@@ -450,80 +458,80 @@ newly_tracked_posts = []
|
|
450
458
|
report_summaries = craig_report.searches.collect do |search|
|
451
459
|
# Load our tracking info
|
452
460
|
search_track = TrackedSearch.find_by_name search.name
|
453
|
-
|
461
|
+
|
454
462
|
# No Tracking found - let's set one up:
|
455
463
|
search_track = TrackedSearch.create! :search_name => search.name unless search_track
|
456
|
-
|
464
|
+
|
457
465
|
# This hash tracks what makes it into the report on this search.
|
458
466
|
# NOTE that keys are url's b/c sometimes the same posting will end up in multiple listings,
|
459
467
|
# And doing this ensures that we don't end-up reporting the same post twice.
|
460
468
|
new_summaries = {}
|
461
|
-
|
469
|
+
|
462
470
|
# And now we actually scrape:
|
463
471
|
CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
|
464
|
-
# Keep in mind that listing.url does change in the while loop.
|
472
|
+
# Keep in mind that listing.url does change in the while loop.
|
465
473
|
# But, this first one is a good base_url that will never change between runs.
|
466
474
|
|
467
475
|
tracked_listing = search_track.find_listing_by_url listing.url
|
468
476
|
tracked_listing ||= search_track.listings.create! :url => listing.url
|
469
|
-
|
470
|
-
# Gives us a sane stopping point (hopefully) :
|
471
|
-
last_tracked_at = tracked_listing.last_tracked_at
|
477
|
+
|
478
|
+
# Gives us a sane stopping point (hopefully) :
|
479
|
+
last_tracked_at = tracked_listing.last_tracked_at.try(:to_date)
|
472
480
|
last_tracked_at ||= search.starting_at
|
473
481
|
|
474
482
|
# Some more stopping points (probably):
|
475
483
|
already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
|
476
484
|
|
477
485
|
# We'll use this in the loop to decide what posts to track:
|
478
|
-
newest_post_date = last_tracked_at
|
479
|
-
|
486
|
+
newest_post_date = last_tracked_at
|
487
|
+
|
480
488
|
# We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
|
481
489
|
# but have no post.post_date since the posting was removed and it parsed to nil
|
482
|
-
most_recent_posting_date =
|
483
|
-
|
490
|
+
most_recent_posting_date = Date.new
|
491
|
+
|
484
492
|
# OK - Now let's go!
|
485
493
|
catch :list_break do
|
486
494
|
while listing
|
487
495
|
listing.posts.each do |post|
|
488
496
|
begin
|
489
497
|
most_recent_posting_date = post.post_date if post.post_date
|
490
|
-
|
498
|
+
|
491
499
|
# Are we at a point in the scrape, past which we don't need to proceed?
|
492
500
|
throw :list_break if (
|
493
|
-
most_recent_posting_date < last_tracked_at or
|
501
|
+
most_recent_posting_date.to_time < last_tracked_at or
|
494
502
|
already_tracked_urls.include? post.url
|
495
503
|
)
|
496
|
-
|
504
|
+
|
497
505
|
# If we want to report this post, add it to the collection:
|
498
506
|
new_summaries[post.url] = post if (
|
499
|
-
!new_summaries.has_key? post.url and
|
507
|
+
!new_summaries.has_key? post.url and
|
500
508
|
search.passes_filter? post
|
501
509
|
)
|
502
|
-
rescue CraigScrape::Scraper::ResourceNotFoundError
|
510
|
+
rescue CraigScrape::Scraper::ResourceNotFoundError => e
|
503
511
|
# Sometimes we do end up with 404's that will never load, and we dont want to
|
504
512
|
# abort a run simply b/c we found some anomaly due to the craigslist index.
|
505
|
-
# being out of date. This ResourceNotFoundError can occur due to
|
506
|
-
# loading the post url in full, only to see that it was yanked - or craigslist
|
513
|
+
# being out of date. This ResourceNotFoundError can occur due to
|
514
|
+
# loading the post url in full, only to see that it was yanked - or craigslist
|
507
515
|
# is acting funny.
|
508
516
|
next
|
509
517
|
end
|
510
|
-
|
518
|
+
|
511
519
|
# Now let's see if the url should be kept in our tracking database for the future...
|
512
520
|
|
513
521
|
# This post-date sets a limit for the tracked_listing.posts.create below
|
514
522
|
newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
|
515
|
-
|
523
|
+
|
516
524
|
# Now let's add these urls to the database so as to reduce memory overhead.
|
517
525
|
# Keep in mind - they're not active until the email goes out.
|
518
|
-
# also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
|
519
|
-
# the
|
526
|
+
# also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
|
527
|
+
# the newest are always the first ones parsed:
|
520
528
|
tracked_listing.posts.create(
|
521
|
-
:url => post.url,
|
522
|
-
:created_at => newest_post_date
|
529
|
+
:url => post.url,
|
530
|
+
:created_at => newest_post_date
|
523
531
|
) unless most_recent_posting_date < newest_post_date
|
524
532
|
|
525
533
|
end
|
526
|
-
|
534
|
+
|
527
535
|
listing = listing.next_page
|
528
536
|
end
|
529
537
|
end
|
@@ -532,41 +540,35 @@ report_summaries = craig_report.searches.collect do |search|
|
|
532
540
|
|
533
541
|
|
534
542
|
# Let's flatten the unique'd hash into a more useable array:
|
535
|
-
|
536
|
-
|
537
|
-
# * Its possible that we met all the criterion of the passes_filter? with merely a header, and
|
538
|
-
# if so we add a url to the summaries stack
|
539
|
-
# * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
|
540
|
-
# or flagged_for_removal?, etc.
|
541
|
-
# * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
|
542
|
-
# * So - before we sort, we run a quick reject on nil post_dates
|
543
|
-
new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
544
|
-
|
543
|
+
new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
544
|
+
|
545
545
|
# Now Let's manage the tracking database:
|
546
|
-
if new_summaries.length > 0
|
546
|
+
if new_summaries.length > 0
|
547
547
|
|
548
548
|
# We'll use this in the cleanup at the bottom:
|
549
549
|
latest_post_date = new_summaries.last.post_date
|
550
|
-
|
551
|
-
new_summaries.reverse! if search.newest_first?
|
550
|
+
|
551
|
+
new_summaries.reverse! if search.newest_first?
|
552
552
|
end
|
553
|
-
|
553
|
+
|
554
554
|
# We'll want to email these...
|
555
|
-
{
|
555
|
+
{
|
556
556
|
:latest_post_date => latest_post_date,
|
557
|
-
:search_track => search_track,
|
558
|
-
:postings => new_summaries,
|
557
|
+
:search_track => search_track,
|
558
|
+
:postings => new_summaries,
|
559
559
|
:search => search
|
560
560
|
}
|
561
561
|
end
|
562
562
|
|
563
|
-
# Time to send the email:
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
563
|
+
# Time to send the email (maybe):
|
564
|
+
unless report_summaries.select { |s| !s[:postings].empty? }.empty?
|
565
|
+
ReportMailer.report(
|
566
|
+
craig_report.email_to,
|
567
|
+
craig_report.email_from,
|
568
|
+
craig_report.report_name,
|
569
|
+
{:summaries => report_summaries, :definition => craig_report}
|
570
|
+
).deliver
|
571
|
+
end
|
570
572
|
|
571
573
|
# Commit (make 'active') all newly created tracked post urls:
|
572
574
|
TrackedPost.activate_all!
|
@@ -576,4 +578,4 @@ report_summaries.each do |summary|
|
|
576
578
|
summary[:search_track].listings.each do |listing|
|
577
579
|
listing.delete_posts_older_than listing.last_tracked_at
|
578
580
|
end
|
579
|
-
end
|
581
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<h2><%=h @subject %></h2>
|
2
|
+
<%@summaries.each do |summary| %>
|
3
|
+
<h3><%=h summary[:search].name%></h3>
|
4
|
+
<% if summary[:postings].length > 0 %>
|
5
|
+
<%summary[:postings].each do |post|%>
|
6
|
+
<p>
|
7
|
+
<%=('%s <a href="%s">%s</a>' % [
|
8
|
+
h(post.post_date.strftime('%b %d')), post.url, h(post.title)
|
9
|
+
]).html_safe %>
|
10
|
+
<%=([
|
11
|
+
(post.price) ? h(post.price.try(:format, :no_cents => true)) : nil,
|
12
|
+
(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : nil,
|
13
|
+
(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': nil
|
14
|
+
].compact.join(' ')).html_safe -%>
|
15
|
+
</p>
|
16
|
+
<% end %>
|
17
|
+
<% else %>
|
18
|
+
<p><i>No new postings were found, which matched the search criteria.</i></p>
|
19
|
+
<% end %>
|
20
|
+
<% end %>
|
@@ -1,18 +1,19 @@
|
|
1
1
|
CRAIGSLIST REPORTER
|
2
2
|
|
3
|
-
|
3
|
+
<% @summaries.each do |summary| -%>
|
4
4
|
<%=summary[:search].name %>
|
5
5
|
<% summary[:postings].collect do |post| -%>
|
6
6
|
<% if summary[:postings].length > 0 %>
|
7
|
-
<%='%s : %s %s %s %s' % [
|
7
|
+
<%='%s : %s %s %s %s %s' % [
|
8
8
|
post.post_date.strftime('%b %d'),
|
9
|
-
post.
|
10
|
-
|
11
|
-
(post.
|
9
|
+
post.title,
|
10
|
+
post.price.try(:format, :no_cents => true),
|
11
|
+
(post.location) ? " (#{post.location})" : nil,
|
12
|
+
(post.has_pic_or_img?) ? ' [img]': nil,
|
12
13
|
post.url
|
13
14
|
] -%>
|
14
15
|
<% else %>
|
15
16
|
No new postings were found, which matched the search criteria.
|
16
17
|
<% end %>
|
17
18
|
<% end %>
|
18
|
-
<% end -%>
|
19
|
+
<% end -%>
|
data/lib/geo_listings.rb
CHANGED