libcraigscrape 1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -1
- data/Gemfile +12 -0
- data/Rakefile +1 -54
- data/bin/craig_report_schema.yml +4 -1
- data/bin/craigwatch +148 -146
- data/bin/report_mailer/report.html.erb +20 -0
- data/bin/report_mailer/{craigslist_report.plain.erb → report.text.erb} +7 -6
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +52 -59
- data/lib/listings.rb +75 -39
- data/lib/posting.rb +120 -63
- data/lib/scraper.rb +43 -63
- data/spec/assets/geolisting_iso_us_120412.html +441 -0
- data/spec/assets/listing_cta_ftl_112612.html +1470 -0
- data/spec/assets/listing_rea_miami_123012.html +1397 -0
- data/spec/assets/listing_search_ppa_nyc_121212.html +1584 -0
- data/spec/assets/posting_daytona_art_120512-2.html +160 -0
- data/spec/assets/posting_daytona_art_120512.html +153 -0
- data/spec/assets/posting_mdc_cto_ftl_112612.html +170 -0
- data/spec/assets/posting_mdc_reb_120612.html +183 -0
- data/spec/assets/posting_sfbay_1226.html +157 -0
- data/spec/assets/posting_sya_121012-2.html +122 -0
- data/spec/assets/posting_sya_121012.html +165 -0
- data/spec/assets/this_post_has_expired_old.html +48 -0
- data/spec/geolisting_spec.rb +9 -0
- data/spec/listings_spec.rb +77 -0
- data/spec/postings_spec.rb +157 -0
- data/spec/spec_helper.rb +8 -0
- data/test/test_craigslist_geolisting.rb +5 -5
- data/test/test_craigslist_listing.rb +30 -30
- data/test/test_craigslist_posting.rb +25 -145
- metadata +200 -114
- data/bin/report_mailer/craigslist_report.html.erb +0 -17
data/bin/craigwatch
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
2
3
|
#
|
3
4
|
# =craigwatch - An email-based "post monitoring" solution
|
4
5
|
#
|
5
|
-
# Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
|
6
|
-
# craiglist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
|
6
|
+
# Created alongside the libcraigscrape library, libcraigwatch was designed to take the monotony out of regular
|
7
|
+
# craigslist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc) through crontab
|
7
8
|
# and report all new postings within a listing or search url, since its last run, by email.
|
8
9
|
#
|
9
10
|
# For more information, head to the {craigslist monitoring}[http://www.derosetechnologies.com/community/libcraigscrape] help section of our website.
|
@@ -25,29 +26,19 @@
|
|
25
26
|
# - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
|
26
27
|
#
|
27
28
|
# Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
|
28
|
-
#
|
29
|
+
#
|
29
30
|
# Reporting output is easily customized html, handled by ActionMailer, and emails can be delivered via smtp or sendmail.
|
30
|
-
# Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
|
31
|
+
# Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
|
31
32
|
# major backends (sqlite/mysql/postgres/probably-all-others). Database sizes are contained by automatically pruning old results
|
32
33
|
# that are no longer required at the end of each run.
|
33
34
|
#
|
34
35
|
# Pretty useful, no?
|
35
|
-
#
|
36
|
+
#
|
36
37
|
# == Installation
|
37
|
-
# craigwatch is coupled with libcraigscrape, and is installed via ruby gems.
|
38
|
-
#
|
39
|
-
# gem itself.
|
40
|
-
#
|
41
|
-
# This should take care of the craigwatch install on all systems:
|
42
|
-
# sudo gem install libcraigscrape kwalify activerecord actionmailer
|
43
|
-
# Alternatively, if you've already installed libcraigscrape and want to start working with craigwatch:
|
44
|
-
# sudo gem install kwalify activerecord actionmailer
|
45
|
-
#
|
46
|
-
# This script was initially developed with activerecord 2.3, actionmailer 2.3 and kwalify 0.7, but will likely work with most
|
47
|
-
# prior and future versions of these libraries.
|
48
|
-
#
|
38
|
+
# craigwatch is coupled with libcraigscrape, and is installed via ruby gems.
|
39
|
+
#
|
49
40
|
# == Usage
|
50
|
-
# When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
|
41
|
+
# When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch, and
|
51
42
|
# this parameter is the path to a valid report-definition yml file. ie:
|
52
43
|
# craigwatch johns_daily_watch.yml
|
53
44
|
#
|
@@ -55,6 +46,9 @@
|
|
55
46
|
# Probably, the best way to understand the report definition files, is to look at the annotated sample file below, and use it as a
|
56
47
|
# starting point for your own.
|
57
48
|
#
|
49
|
+
# New in version 1.1.0 is ERB evaluation of the report-definition file. This feature is automatic, just include the erb blocks you'd
|
50
|
+
# like, and the file will be evaluated at runtime.
|
51
|
+
#
|
58
52
|
# By default there is no program output; however, setting any of the following parameters to 'yes' in your definition file will turn on
|
59
53
|
# useful debugging/logging output:
|
60
54
|
# - debug_database
|
@@ -63,10 +57,10 @@
|
|
63
57
|
#
|
64
58
|
# == Definition File Sample
|
65
59
|
#
|
66
|
-
# Let's start with a minimal report, just enough needed to get something quick working:
|
60
|
+
# Let's start with a minimal report, just enough needed to get something quick working:
|
67
61
|
# # We need some kind of destination to send this to
|
68
62
|
# email_to: Chris DeRose <cderose@derosetechnologies.com>
|
69
|
-
#
|
63
|
+
#
|
70
64
|
# # This is an array of specific 'searches' we'll be performing in this report:
|
71
65
|
# searches:
|
72
66
|
# # We're looking for a 90's era cadillac, something cheap, comfortable and in white...
|
@@ -85,7 +79,7 @@
|
|
85
79
|
# summary_post_has_no: [ /xlr/i ]
|
86
80
|
#
|
87
81
|
# # We want a convertible, and white/cream/etc:
|
88
|
-
# full_post_has:
|
82
|
+
# full_post_has:
|
89
83
|
# - /convertible/i
|
90
84
|
# - /(white|yellow|banana|creme|cream)/i
|
91
85
|
#
|
@@ -93,7 +87,7 @@
|
|
93
87
|
# full_post_has_no:
|
94
88
|
# - /simulated[^a-z]{0,2}convertible/i
|
95
89
|
#
|
96
|
-
# # We want to search all of craigslist's in the us, and we'll want to find it using
|
90
|
+
# # We want to search all of craigslist's in the us, and we'll want to find it using
|
97
91
|
# # the '/search/cta?hasPic=1&query=cadillac' url on the site
|
98
92
|
# sites: [ us ]
|
99
93
|
# listings:
|
@@ -104,6 +98,9 @@
|
|
104
98
|
# # The report_name is fed into Time.now.strftime, hence the formatting characters
|
105
99
|
# report_name: Craig Watch For Johnathan on %D at %I:%M %p
|
106
100
|
#
|
101
|
+
# # Overrides the default system time zone with an EST zone
|
102
|
+
# tz: EST
|
103
|
+
#
|
107
104
|
# email_to: Johnathan Peabody <john@example.local>
|
108
105
|
#
|
109
106
|
# # This is sent straight into ActiveRecord, so there's plenty of options available here. the following is an easy
|
@@ -129,21 +126,21 @@
|
|
129
126
|
#
|
130
127
|
# # Oh, and we're on a budget:
|
131
128
|
# price_less_than: 120
|
132
|
-
#
|
129
|
+
#
|
133
130
|
# # Search #2
|
134
131
|
# - name: Large apartment rentals in San Francisco
|
135
132
|
# sites: [ us/ca/sfbay ]
|
136
133
|
# starting: 9/10/2009
|
137
|
-
#
|
138
|
-
# # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
|
134
|
+
#
|
135
|
+
# # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
|
139
136
|
# # want to conserve some bandwidth
|
140
137
|
# listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
|
141
138
|
#
|
142
139
|
# # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
|
143
140
|
# price_required: yes
|
144
|
-
#
|
141
|
+
#
|
145
142
|
# # Hopefully this will keep us away from a bad part of town:
|
146
|
-
# price_greater_than: 1000
|
143
|
+
# price_greater_than: 1000
|
147
144
|
#
|
148
145
|
# # Since we don't have time to drive to each location, we'll require only listings with pictures
|
149
146
|
# has_image: yes
|
@@ -160,9 +157,9 @@ $: << File.dirname(__FILE__) + '/../lib'
|
|
160
157
|
|
161
158
|
require 'rubygems'
|
162
159
|
|
163
|
-
gem 'kwalify'
|
164
|
-
gem 'activerecord'
|
165
|
-
gem 'actionmailer'
|
160
|
+
gem 'kwalify'
|
161
|
+
gem 'activerecord'
|
162
|
+
gem 'actionmailer'
|
166
163
|
|
167
164
|
require 'kwalify'
|
168
165
|
require 'active_record'
|
@@ -170,19 +167,20 @@ require 'action_mailer'
|
|
170
167
|
require 'kwalify/util/hashlike'
|
171
168
|
require 'libcraigscrape'
|
172
169
|
require "socket"
|
170
|
+
require 'active_support/all'
|
173
171
|
|
174
172
|
class String #:nodoc:
|
175
173
|
RE = /^\/(.*)\/([ixm]*)$/
|
176
|
-
|
174
|
+
|
177
175
|
def is_re?
|
178
176
|
(RE.match self) ? true : false
|
179
177
|
end
|
180
|
-
|
178
|
+
|
181
179
|
def to_re
|
182
180
|
source, options = ( RE.match(self) )? [$1, $2] : [self,nil]
|
183
181
|
mods = 0
|
184
182
|
|
185
|
-
options.each_char do |c|
|
183
|
+
options.each_char do |c|
|
186
184
|
mods |= case c
|
187
185
|
when 'i' then Regexp::IGNORECASE
|
188
186
|
when 'x' then Regexp::EXTENDED
|
@@ -199,12 +197,19 @@ class CraigReportDefinition #:nodoc:
|
|
199
197
|
|
200
198
|
EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
|
201
199
|
|
202
|
-
attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches,
|
200
|
+
attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches,
|
201
|
+
:smtp_settings, :tz
|
203
202
|
|
204
203
|
def debug_database?; @debug_database; end
|
205
204
|
def debug_mailer?; @debug_mailer; end
|
206
205
|
def debug_craigscrape?; @debug_craigscrape; end
|
207
206
|
|
207
|
+
# Returns the configuration report zone, if defined. Otherwise pulls the zone
|
208
|
+
# from the system's default local zone
|
209
|
+
def tz
|
210
|
+
@tz || Time.new.zone
|
211
|
+
end
|
212
|
+
|
208
213
|
def email_from
|
209
214
|
(@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
|
210
215
|
end
|
@@ -224,59 +229,66 @@ class CraigReportDefinition #:nodoc:
|
|
224
229
|
:adapter => 'sqlite3',
|
225
230
|
:database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
|
226
231
|
} if for_yaml_file
|
227
|
-
|
228
|
-
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
232
|
+
|
233
|
+
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
229
234
|
# to specify dbfiles relative to the yml's directory:
|
230
235
|
ret = @tracking_database
|
231
236
|
ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
|
232
237
|
for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
|
233
238
|
)
|
234
|
-
|
239
|
+
|
235
240
|
ret
|
236
241
|
end
|
237
242
|
|
238
243
|
class SearchDefinition #:nodoc:
|
239
|
-
include Kwalify::Util::HashLike
|
240
|
-
|
244
|
+
include Kwalify::Util::HashLike
|
245
|
+
|
241
246
|
attr_reader :name, :sites, :listings
|
242
247
|
attr_reader :location_has, :location_has_no
|
243
248
|
attr_reader :full_post_has, :full_post_has_no
|
244
249
|
attr_reader :summary_post_has, :summary_post_has_no
|
245
250
|
attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
|
246
|
-
|
247
|
-
attr_reader :price_greater_than,:price_less_than
|
248
251
|
|
249
252
|
def has_image?; @has_image; end
|
250
253
|
def newest_first?; @newest_first; end
|
251
254
|
def price_required?; @price_required; end
|
252
|
-
|
255
|
+
|
256
|
+
def price_greater_than
|
257
|
+
Money.new(@price_greater_than*100, 'USD') if @price_greater_than
|
258
|
+
end
|
259
|
+
|
260
|
+
def price_less_than
|
261
|
+
Money.new(@price_less_than*100, 'USD') if @price_less_than
|
262
|
+
end
|
263
|
+
|
253
264
|
def starting_at
|
254
|
-
(@starting) ?
|
255
|
-
|
256
|
-
|
265
|
+
(@starting) ?
|
266
|
+
Date.strptime(@starting, ['%m','%d',
|
267
|
+
/\/(?:[\d]{4})$/.match(@starting) ? '%Y' : '%y'].join('/') ) :
|
268
|
+
Date.yesterday
|
257
269
|
end
|
258
|
-
|
259
|
-
def passes_filter?(post)
|
270
|
+
|
271
|
+
def passes_filter?(post)
|
260
272
|
if post.price.nil?
|
261
273
|
return false if price_required?
|
262
274
|
else
|
263
|
-
return false if
|
264
|
-
return false if
|
275
|
+
return false if price_greater_than and post.price <= price_greater_than
|
276
|
+
return false if price_less_than and post.price >= price_less_than
|
265
277
|
end
|
266
|
-
|
278
|
+
|
267
279
|
# Label Filters:
|
268
280
|
return false unless matches_all? summary_post_has, post.label
|
269
281
|
return false unless doesnt_match_any? summary_post_has_no, post.label
|
270
|
-
|
282
|
+
|
271
283
|
# Location Filters:
|
272
284
|
return false unless matches_all? location_has, post.location
|
273
285
|
return false unless doesnt_match_any? location_has_no, post.location
|
274
|
-
|
286
|
+
|
275
287
|
# Full post Filters:
|
276
288
|
if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
|
277
289
|
# We're going to download the page, so let's make sure we didnt hit a "This posting has been flagged for removal"
|
278
290
|
return false if post.system_post?
|
279
|
-
|
291
|
+
|
280
292
|
return false unless matches_all? full_post_has, post.contents_as_plain
|
281
293
|
return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
|
282
294
|
|
@@ -286,21 +298,27 @@ class CraigReportDefinition #:nodoc:
|
|
286
298
|
|
287
299
|
true
|
288
300
|
end
|
289
|
-
|
301
|
+
|
290
302
|
private
|
291
|
-
|
303
|
+
|
292
304
|
def matches_all?(conditions, against)
|
293
|
-
|
294
|
-
(conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
|
305
|
+
(conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| match_against c, a } }) ? true : false
|
295
306
|
end
|
296
|
-
|
307
|
+
|
297
308
|
def doesnt_match_any?(conditions, against)
|
298
|
-
|
299
|
-
(conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
|
309
|
+
(conditions.nil? or conditions.all?{|c| sanitized_against(against).any?{|a| !match_against c, a } }) ? true : false
|
300
310
|
end
|
301
|
-
|
311
|
+
|
302
312
|
def match_against(condition, against)
|
303
|
-
(against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
|
313
|
+
(CraigScrape::Scraper.he_decode(against).scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
|
314
|
+
end
|
315
|
+
|
316
|
+
# This is kind of a hack to deal with ruby 1.9. Really the filtering mechanism
|
317
|
+
# needs to be factored out and tested....
|
318
|
+
def sanitized_against(against)
|
319
|
+
against = against.lines if against.respond_to? :lines
|
320
|
+
against = against.to_a if against.respond_to? :to_a
|
321
|
+
(against.nil?) ? [] : against.compact
|
304
322
|
end
|
305
323
|
end
|
306
324
|
end
|
@@ -309,11 +327,11 @@ class TrackedSearch < ActiveRecord::Base #:nodoc:
|
|
309
327
|
has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
|
310
328
|
validates_uniqueness_of :search_name
|
311
329
|
validates_presence_of :search_name
|
312
|
-
|
330
|
+
|
313
331
|
def self.find_by_name(name)
|
314
332
|
self.find :first, :conditions => ['search_name = ?',name]
|
315
333
|
end
|
316
|
-
|
334
|
+
|
317
335
|
def find_listing_by_url(url)
|
318
336
|
listings.find :first, :conditions => ['url = ?', url]
|
319
337
|
end
|
@@ -330,9 +348,8 @@ class TrackedListing < ActiveRecord::Base #:nodoc:
|
|
330
348
|
def last_tracked_at
|
331
349
|
self.posts.maximum 'created_at'
|
332
350
|
end
|
333
|
-
|
351
|
+
|
334
352
|
def delete_posts_older_than(cutoff_date)
|
335
|
-
# TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
|
336
353
|
TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
|
337
354
|
end
|
338
355
|
end
|
@@ -342,11 +359,11 @@ class TrackedPost < ActiveRecord::Base #:nodoc:
|
|
342
359
|
|
343
360
|
def self.activate_all!
|
344
361
|
TrackedPost.update_all(
|
345
|
-
{ :active => true },
|
346
|
-
[ 'active = ?', false ]
|
362
|
+
{ :active => true },
|
363
|
+
[ 'active = ?', false ]
|
347
364
|
)
|
348
365
|
end
|
349
|
-
|
366
|
+
|
350
367
|
def self.destroy_inactive!
|
351
368
|
TrackedPost.delete_all [ 'active = ?', false ]
|
352
369
|
end
|
@@ -354,23 +371,9 @@ end
|
|
354
371
|
|
355
372
|
class ReportMailer < ActionMailer::Base #:nodoc:
|
356
373
|
def report(to, sender, subject_template, report_tmpl)
|
357
|
-
|
358
|
-
formatted_subject = Time.now.strftime(subject_template)
|
359
|
-
|
360
|
-
recipients to
|
361
|
-
from sender
|
362
|
-
subject formatted_subject
|
363
|
-
|
364
|
-
generate_view_parts 'craigslist_report', report_tmpl.merge({:subject =>formatted_subject})
|
365
|
-
end
|
374
|
+
@summaries = report_tmpl[:summaries]
|
366
375
|
|
367
|
-
|
368
|
-
part( :content_type => "multipart/alternative" ) do |p|
|
369
|
-
[
|
370
|
-
{ :content_type => "text/plain", :body => render_message("#{view_name.to_s}.plain.erb", tmpl) },
|
371
|
-
{ :content_type => "text/html", :body => render_message("#{view_name.to_s}.html.erb", tmpl.merge({:part_container => p})) }
|
372
|
-
].each { |parms| p.part parms.merge( { :charset => "UTF-8", :transfer_encoding => "7bit" } ) }
|
373
|
-
end
|
376
|
+
mail :to => to, :subject => Time.zone.now.strftime(subject_template), :from => sender
|
374
377
|
end
|
375
378
|
end
|
376
379
|
|
@@ -383,7 +386,7 @@ unless report_definition_file
|
|
383
386
|
puts <<EOD
|
384
387
|
Usage:
|
385
388
|
#{File.basename($0)} [report_definition_file]
|
386
|
-
|
389
|
+
|
387
390
|
Run 'gem server' and browse the libcraigscrape rdoc for 'bin/craigscrape' for specific usage details.
|
388
391
|
EOD
|
389
392
|
exit
|
@@ -397,20 +400,25 @@ parser = Kwalify::Yaml::Parser.new(
|
|
397
400
|
:data_binding => true
|
398
401
|
)
|
399
402
|
|
400
|
-
|
403
|
+
report_definition_file_content = ERB.new(File.read(report_definition_file)).result
|
404
|
+
craig_report = parser.parse(report_definition_file_content, filename: report_definition_file)
|
401
405
|
|
402
406
|
parser.errors.each do |e|
|
403
407
|
puts "Definition Validation Error (line #{e.linenum}, char #{e.column}): #{e.message}"
|
404
408
|
end and exit if parser.errors.length > 0
|
405
409
|
|
410
|
+
# Set the time zone:
|
411
|
+
Time.zone = craig_report.tz
|
412
|
+
|
406
413
|
# Initialize Action Mailer:
|
414
|
+
ActionMailer::Base.prepend_view_path(File.dirname(__FILE__))
|
407
415
|
ActionMailer::Base.logger = Logger.new STDERR if craig_report.debug_mailer?
|
408
416
|
if craig_report.smtp_settings
|
409
|
-
|
417
|
+
ActionMailer::Base.smtp_settings = craig_report.smtp_settings.symbolize_keys
|
418
|
+
ActionMailer::Base.delivery_method = :smtp
|
410
419
|
else
|
411
|
-
|
420
|
+
ActionMailer::Base.delivery_method = :sendmail
|
412
421
|
end
|
413
|
-
ReportMailer.template_root = File.dirname __FILE__
|
414
422
|
|
415
423
|
# Initialize the database:
|
416
424
|
ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
|
@@ -421,16 +429,16 @@ CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrap
|
|
421
429
|
|
422
430
|
# Perform migrations if needed?
|
423
431
|
ActiveRecord::Schema.define do
|
424
|
-
suppress_messages do
|
432
|
+
suppress_messages do
|
425
433
|
create_table :tracked_searches do |t|
|
426
434
|
t.column :search_name, :string
|
427
435
|
end unless table_exists? :tracked_searches
|
428
|
-
|
436
|
+
|
429
437
|
create_table :tracked_listings do |t|
|
430
438
|
t.column :url, :string
|
431
439
|
t.column :tracked_search_id, :integer
|
432
|
-
end unless table_exists? :tracked_listings
|
433
|
-
|
440
|
+
end unless table_exists? :tracked_listings
|
441
|
+
|
434
442
|
create_table :tracked_posts do |t|
|
435
443
|
t.column :url, :string
|
436
444
|
t.column :tracked_listing_id, :integer
|
@@ -440,7 +448,7 @@ ActiveRecord::Schema.define do
|
|
440
448
|
end
|
441
449
|
end
|
442
450
|
|
443
|
-
# Remove all posts which are inactive. They would be in there if the prior run was a failure.
|
451
|
+
# Remove all posts which are inactive. They would be in there if the prior run was a failure.
|
444
452
|
TrackedPost.destroy_inactive!
|
445
453
|
|
446
454
|
# We'll need these outside this next loop:
|
@@ -450,80 +458,80 @@ newly_tracked_posts = []
|
|
450
458
|
report_summaries = craig_report.searches.collect do |search|
|
451
459
|
# Load our tracking info
|
452
460
|
search_track = TrackedSearch.find_by_name search.name
|
453
|
-
|
461
|
+
|
454
462
|
# No Tracking found - let's set one up:
|
455
463
|
search_track = TrackedSearch.create! :search_name => search.name unless search_track
|
456
|
-
|
464
|
+
|
457
465
|
# This hash tracks what makes it into the report on this search.
|
458
466
|
# NOTE that keys are url's b/c sometimes the same posting will end up in multiple listings,
|
459
467
|
# And doing this ensures that we don't end-up reporting the same post twice.
|
460
468
|
new_summaries = {}
|
461
|
-
|
469
|
+
|
462
470
|
# And now we actually scrape:
|
463
471
|
CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
|
464
|
-
# Keep in mind that listing.url does change in the while loop.
|
472
|
+
# Keep in mind that listing.url does change in the while loop.
|
465
473
|
# But, this first one is a good base_url that will never change between runs.
|
466
474
|
|
467
475
|
tracked_listing = search_track.find_listing_by_url listing.url
|
468
476
|
tracked_listing ||= search_track.listings.create! :url => listing.url
|
469
|
-
|
470
|
-
# Gives us a sane stopping point (hopefully) :
|
471
|
-
last_tracked_at = tracked_listing.last_tracked_at
|
477
|
+
|
478
|
+
# Gives us a sane stopping point (hopefully) :
|
479
|
+
last_tracked_at = tracked_listing.last_tracked_at.try(:to_date)
|
472
480
|
last_tracked_at ||= search.starting_at
|
473
481
|
|
474
482
|
# Some more stopping points (probably):
|
475
483
|
already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
|
476
484
|
|
477
485
|
# We'll use this in the loop to decide what posts to track:
|
478
|
-
newest_post_date = last_tracked_at
|
479
|
-
|
486
|
+
newest_post_date = last_tracked_at
|
487
|
+
|
480
488
|
# We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
|
481
489
|
# but have no post.post_date since the posting was removed and it parsed to nil
|
482
|
-
most_recent_posting_date =
|
483
|
-
|
490
|
+
most_recent_posting_date = Date.new
|
491
|
+
|
484
492
|
# OK - Now let's go!
|
485
493
|
catch :list_break do
|
486
494
|
while listing
|
487
495
|
listing.posts.each do |post|
|
488
496
|
begin
|
489
497
|
most_recent_posting_date = post.post_date if post.post_date
|
490
|
-
|
498
|
+
|
491
499
|
# Are we at a point in the scrape, past which we don't need to proceed?
|
492
500
|
throw :list_break if (
|
493
|
-
most_recent_posting_date < last_tracked_at or
|
501
|
+
most_recent_posting_date.to_time < last_tracked_at or
|
494
502
|
already_tracked_urls.include? post.url
|
495
503
|
)
|
496
|
-
|
504
|
+
|
497
505
|
# If we want to report this post, add it to the collection:
|
498
506
|
new_summaries[post.url] = post if (
|
499
|
-
!new_summaries.has_key? post.url and
|
507
|
+
!new_summaries.has_key? post.url and
|
500
508
|
search.passes_filter? post
|
501
509
|
)
|
502
|
-
rescue CraigScrape::Scraper::ResourceNotFoundError
|
510
|
+
rescue CraigScrape::Scraper::ResourceNotFoundError => e
|
503
511
|
# Sometimes we do end up with 404's that will never load, and we dont want to
|
504
512
|
# abort a run simply b/c we found some anomaly due to the craigslist index.
|
505
|
-
# being out of date. This ResourceNotFoundError can occur due to
|
506
|
-
# loading the post url in full, only to see that it was yanked - or craigslist
|
513
|
+
# being out of date. This ResourceNotFoundError can occur due to
|
514
|
+
# loading the post url in full, only to see that it was yanked - or craigslist
|
507
515
|
# is acting funny.
|
508
516
|
next
|
509
517
|
end
|
510
|
-
|
518
|
+
|
511
519
|
# Now let's see if the url should be kept in our tracking database for the future...
|
512
520
|
|
513
521
|
# This post-date sets a limit for the tracked_listing.posts.create below
|
514
522
|
newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
|
515
|
-
|
523
|
+
|
516
524
|
# Now let's add these urls to the database so as to reduce memory overhead.
|
517
525
|
# Keep in mind - they're not active until the email goes out.
|
518
|
-
# also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
|
519
|
-
# the
|
526
|
+
# also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
|
527
|
+
# the newest are always the first ones parsed:
|
520
528
|
tracked_listing.posts.create(
|
521
|
-
:url => post.url,
|
522
|
-
:created_at => newest_post_date
|
529
|
+
:url => post.url,
|
530
|
+
:created_at => newest_post_date
|
523
531
|
) unless most_recent_posting_date < newest_post_date
|
524
532
|
|
525
533
|
end
|
526
|
-
|
534
|
+
|
527
535
|
listing = listing.next_page
|
528
536
|
end
|
529
537
|
end
|
@@ -532,41 +540,35 @@ report_summaries = craig_report.searches.collect do |search|
|
|
532
540
|
|
533
541
|
|
534
542
|
# Let's flatten the unique'd hash into a more useable array:
|
535
|
-
|
536
|
-
|
537
|
-
# * Its possible that we met all the criterion of the passes_filter? with merely a header, and
|
538
|
-
# if so we add a url to the summaries stack
|
539
|
-
# * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
|
540
|
-
# or flagged_for_removal?, etc.
|
541
|
-
# * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
|
542
|
-
# * So - before we sort, we run a quick reject on nil post_dates
|
543
|
-
new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
544
|
-
|
543
|
+
new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
544
|
+
|
545
545
|
# Now Let's manage the tracking database:
|
546
|
-
if new_summaries.length > 0
|
546
|
+
if new_summaries.length > 0
|
547
547
|
|
548
548
|
# We'll use this in the cleanup at the bottom:
|
549
549
|
latest_post_date = new_summaries.last.post_date
|
550
|
-
|
551
|
-
new_summaries.reverse! if search.newest_first?
|
550
|
+
|
551
|
+
new_summaries.reverse! if search.newest_first?
|
552
552
|
end
|
553
|
-
|
553
|
+
|
554
554
|
# We'll want to email these...
|
555
|
-
{
|
555
|
+
{
|
556
556
|
:latest_post_date => latest_post_date,
|
557
|
-
:search_track => search_track,
|
558
|
-
:postings => new_summaries,
|
557
|
+
:search_track => search_track,
|
558
|
+
:postings => new_summaries,
|
559
559
|
:search => search
|
560
560
|
}
|
561
561
|
end
|
562
562
|
|
563
|
-
# Time to send the email:
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
563
|
+
# Time to send the email (maybe):
|
564
|
+
unless report_summaries.select { |s| !s[:postings].empty? }.empty?
|
565
|
+
ReportMailer.report(
|
566
|
+
craig_report.email_to,
|
567
|
+
craig_report.email_from,
|
568
|
+
craig_report.report_name,
|
569
|
+
{:summaries => report_summaries, :definition => craig_report}
|
570
|
+
).deliver
|
571
|
+
end
|
570
572
|
|
571
573
|
# Commit (make 'active') all newly created tracked post urls:
|
572
574
|
TrackedPost.activate_all!
|
@@ -576,4 +578,4 @@ report_summaries.each do |summary|
|
|
576
578
|
summary[:search_track].listings.each do |listing|
|
577
579
|
listing.delete_posts_older_than listing.last_tracked_at
|
578
580
|
end
|
579
|
-
end
|
581
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<h2><%=h @subject %></h2>
|
2
|
+
<%@summaries.each do |summary| %>
|
3
|
+
<h3><%=h summary[:search].name%></h3>
|
4
|
+
<% if summary[:postings].length > 0 %>
|
5
|
+
<%summary[:postings].each do |post|%>
|
6
|
+
<p>
|
7
|
+
<%=('%s <a href="%s">%s</a>' % [
|
8
|
+
h(post.post_date.strftime('%b %d')), post.url, h(post.title)
|
9
|
+
]).html_safe %>
|
10
|
+
<%=([
|
11
|
+
(post.price) ? h(post.price.try(:format, :no_cents => true)) : nil,
|
12
|
+
(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : nil,
|
13
|
+
(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': nil
|
14
|
+
].compact.join(' ')).html_safe -%>
|
15
|
+
</p>
|
16
|
+
<% end %>
|
17
|
+
<% else %>
|
18
|
+
<p><i>No new postings were found, which matched the search criteria.</i></p>
|
19
|
+
<% end %>
|
20
|
+
<% end %>
|
@@ -1,18 +1,19 @@
|
|
1
1
|
CRAIGSLIST REPORTER
|
2
2
|
|
3
|
-
|
3
|
+
<% @summaries.each do |summary| -%>
|
4
4
|
<%=summary[:search].name %>
|
5
5
|
<% summary[:postings].collect do |post| -%>
|
6
6
|
<% if summary[:postings].length > 0 %>
|
7
|
-
<%='%s : %s %s %s %s' % [
|
7
|
+
<%='%s : %s %s %s %s %s' % [
|
8
8
|
post.post_date.strftime('%b %d'),
|
9
|
-
post.
|
10
|
-
|
11
|
-
(post.
|
9
|
+
post.title,
|
10
|
+
post.price.try(:format, :no_cents => true),
|
11
|
+
(post.location) ? " (#{post.location})" : nil,
|
12
|
+
(post.has_pic_or_img?) ? ' [img]': nil,
|
12
13
|
post.url
|
13
14
|
] -%>
|
14
15
|
<% else %>
|
15
16
|
No new postings were found, which matched the search criteria.
|
16
17
|
<% end %>
|
17
18
|
<% end %>
|
18
|
-
<% end -%>
|
19
|
+
<% end -%>
|
data/lib/geo_listings.rb
CHANGED