libcraigscrape 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +19 -0
- data/README +27 -11
- data/Rakefile +44 -2
- data/bin/craig_report_schema.yml +30 -21
- data/bin/craigwatch +232 -67
- data/bin/report_mailer/craigslist_report.html.erb +12 -9
- data/bin/report_mailer/craigslist_report.plain.erb +4 -1
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +158 -650
- data/lib/listings.rb +144 -0
- data/lib/posting.rb +293 -0
- data/lib/scraper.rb +203 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/test_craigslist_geolisting.rb +476 -380
- metadata +28 -2
data/bin/craigwatch
CHANGED
@@ -15,10 +15,14 @@
 # - price_required - yes/no
 # - price_greater_than - (int)
 # - price_less_than - (int)
-# - full_post_has - (string or regexp) Only post whose full-post's contents contains/matches
-# - full_post_has_no - (string or regexp) Only post whose full-post's contents doesn't contain/match
-# - summary_post_has - (string or regexp) Only post whose listing's label contains/matches
-# - summary_post_has_no - (string or regexp) Only post whose listing's label doesn't contain/match
+# - full_post_has - (array of string or regexp) Only post whose full-post's contents contains/matches
+# - full_post_has_no - (array of string or regexp) Only post whose full-post's contents doesn't contain/match
+# - summary_post_has - (array of string or regexp) Only post whose listing's label contains/matches
+# - summary_post_has_no - (array of string or regexp) Only post whose listing's label doesn't contain/match
+# - summary_or_full_post_has - (array of string or regexp) Filters out results which don't match either the post label <b>or</b> the post contents
+# - summary_or_full_post_has_no - (array of string or regexp) Filters out results which match either the post label <b>or</b> the post contents
+# - location_has - (array of string or regexp) Only include posts which match against the post location
+# - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
 #
 # Multiple searches can be combined into a single report, and results can be sorted by newest-first or oldest-first (default)
 #
@@ -58,8 +62,44 @@
 # - debug_craigscrape
 #
 # == Definition File Sample
-#
-#
+#
+# Let's start with a minimal report, just enough to get something quick working:
+# # We need some kind of destination to send this to
+# email_to: Chris DeRose <cderose@derosetechnologies.com>
+#
+# # This is an array of specific 'searches' we'll be performing in this report:
+# searches:
+# # We're looking for a 90's era cadillac, something cheap, comfortable and in white...
+# - name: 90's White/Creme Convertible Cadillacs
+#
+# # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build.
+# # It's optional, and if omitted, craigwatch defaults to 'yesterday'
+# starting: 9/10/09
+#
+# # We want to check all the labels, and filter out years not in the 90's, and cars not made by cadillac
+# summary_post_has:
+# - /(?:^|[^\d]|19)9[\d](?:[^\dk]|$)/i
+# - /cadillac/i
+#
+# # I said we're looking for something *comfortable* !
+# summary_post_has_no: [ /xlr/i ]
+#
+# # We want convertible, and white/cream/etc:
+# full_post_has:
+# - /convertible/i
+# - /(white|yellow|banana|creme|cream)/i
+#
+# # Convertible - not *simulated* convertible!
+# full_post_has_no:
+# - /simulated[^a-z]{0,2}convertible/i
+#
+# # We want to search all of craigslist's sites in the us, and we'll want to find it using
+# # the '/search/cta?hasPic=1&query=cadillac' url on the site
+# sites: [ us ]
+# listings:
+# - /search/cta?hasPic=1&query=cadillac
+#
+# Here's another annotated report which uses most of the other available craigwatch features:
 #
 # # The report_name is fed into Time.now.strftime, hence the formatting characters
 # report_name: Craig Watch For Johnathan on %D at %I:%M %p
@@ -73,13 +113,13 @@
 # searches:
 # # Search #1:
 # - name: Schwinn Bikes For Sale in/near New York
+# starting: 9/10/2009
+#
+# # Scrape the following sites/servers:
+# sites: [ us/ny/newyork, us/nj/southjersey ]
 #
 # # Scrape the following listings pages:
-#
-# - http://newyork.craigslist.org/bik/
-# - http://newyork.craigslist.org/jsy/bik/
-# # This starting date is mostly for the first run, and gives us a reasonable cut-off point from whcih to build
-# starting: 5/2/2009
+# listings: [ bik ]
 #
 # # We want listings with Schwinn in the summary
 # summary_post_has: [ /schwinn/i ]
@@ -92,10 +132,13 @@
 #
 # # Search #2
 # - name: Large apartment rentals in San Francisco
+# sites: [ us/ca/sfbay ]
+# starting: 9/10/2009
+#
 # # We're going to rely on craigslist's built-in search for this one since there's a lot of listings, and we
 # # want to conserve some bandwidth
-#
-#
+# listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
+#
 # # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
 # price_required: yes
 #
@@ -149,24 +192,52 @@ end
 class CraigReportDefinition #:nodoc:
   include Kwalify::Util::HashLike
 
+  EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/
+
   attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, :smtp_settings
 
   def debug_database?; @debug_database; end
   def debug_mailer?; @debug_mailer; end
   def debug_craigscrape?; @debug_craigscrape; end
-
-  def each_search(&block); searches.each &block; end
 
   def email_from
     (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
   end
 
+  def email_to_name
+    EMAIL_NAME_PARTS.match(email_to) ? $1 : email_to
+  end
+
+  def report_name
+    @report_name ? @report_name : "Craigslist Watch For #{email_to_name} on %D at %I:%M %p"
+  end
+
+  # We allow people to use relative (sqlite) dbfiles by taking the yml's path as a parameter
+  def tracking_database(for_yaml_file = nil)
+    # We'll setup a SQLite db using some defaults if needed
+    @tracking_database ||= {
+      :adapter => 'sqlite3',
+      :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
+    } if for_yaml_file
+
+    # This is a little hack to make sqlite definitions a little more portable, by allowing them
+    # to specify dbfiles relative to the yml's directory:
+    ret = @tracking_database
+    ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
+      for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
+    )
+
+    ret
+  end
+
   class SearchDefinition #:nodoc:
     include Kwalify::Util::HashLike
 
-    attr_reader :name, :
+    attr_reader :name, :sites, :listings
+    attr_reader :location_has, :location_has_no
     attr_reader :full_post_has, :full_post_has_no
     attr_reader :summary_post_has, :summary_post_has_no
+    attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no
 
     attr_reader :price_greater_than,:price_less_than
 
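The new tracking_database(for_yaml_file) above rewrites a relative sqlite dbfile against the definition file's directory. A minimal standalone sketch of that resolution logic (resolve_dbfile is an illustrative name, not part of the gem):

    # Sketch of the relative-dbfile rewrite performed by tracking_database:
    def resolve_dbfile(tracking_database, yaml_file)
      ret = tracking_database.dup
      # A dbfile with no leading '/' is rewritten relative to the yml's directory:
      ret['dbfile'] = '%s/%s' % [File.dirname(yaml_file), $1] if (
        ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
      )
      ret
    end

    resolve_dbfile({ 'dbfile' => 'searches.db' }, '/home/user/reports/searches.yml')
    # => { 'dbfile' => '/home/user/reports/searches.db' }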
@@ -188,15 +259,24 @@ class CraigReportDefinition #:nodoc:
       return false if @price_less_than and post.price >= @price_less_than
     end
 
+    # Label Filters:
     return false unless matches_all? summary_post_has, post.label
     return false unless doesnt_match_any? summary_post_has_no, post.label
 
-
+    # Location Filters:
+    return false unless matches_all? location_has, post.location
+    return false unless doesnt_match_any? location_has_no, post.location
+
+    # Full post Filters:
+    if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
       # We're going to download the page, so let's make sure we didn't hit a "This posting has been flagged for removal"
       return false if post.system_post?
 
       return false unless matches_all? full_post_has, post.contents_as_plain
       return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
+
+      return false unless matches_all? summary_or_full_post_has, [post.contents_as_plain, post.label]
+      return false unless doesnt_match_any? summary_or_full_post_has_no, [post.contents_as_plain, post.label]
     end
 
     true
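The two summary_or_full_post checks added above pass a two-element array (contents, label) to the matchers, so a condition passes if it matches either text. A self-contained sketch of that semantic, using a hypothetical label and body:

    conditions = [/cadillac/i, /convertible/i]  # e.g. a summary_or_full_post_has value
    against    = ['Runs great, low miles', '1995 Cadillac Convertible']  # [contents, label]

    # Each condition must match at least one of the two texts:
    passes = conditions.all? { |c| against.any? { |a| a.scan(c).length > 0 } }
    # => true - both conditions matched the label, so the body alone didn't have to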
@@ -205,37 +285,66 @@ class CraigReportDefinition #:nodoc:
     private
 
     def matches_all?(conditions, against)
-
+      against = against.to_a
+      (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
     end
 
     def doesnt_match_any?(conditions, against)
-
+      against = against.to_a
+      (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
     end
 
     def match_against(condition, against)
-      (against.scan( condition.is_re? ? condition.to_re : condition).length > 0) ? true : false
+      (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i).length > 0) ? true : false
     end
   end
 end
 
 class TrackedSearch < ActiveRecord::Base #:nodoc:
-  has_many :
+  has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
   validates_uniqueness_of :search_name
   validates_presence_of :search_name
 
-  def
-
+  def self.find_by_name(name)
+    self.find :first, :conditions => ['search_name = ?',name]
   end
 
+  def find_listing_by_url(url)
+    listings.find :first, :conditions => ['url = ?', url]
+  end
+end
+
+class TrackedListing < ActiveRecord::Base #:nodoc:
+  has_many :posts, :dependent => :destroy, :class_name => 'TrackedPost'
+  validates_presence_of :url, :tracked_search_id
+
+  def already_tracked?(url)
+    ( self.posts.find :first, :conditions => ['url = ?', url]) ? true : false
+  end
+
   def last_tracked_at
-    self.
+    self.posts.maximum 'created_at'
+  end
+
+  def delete_posts_older_than(cutoff_date)
+    # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
+    TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
   end
 end
 
 class TrackedPost < ActiveRecord::Base #:nodoc:
-
-
-
+  validates_presence_of :url, :tracked_listing_id
+
+  def self.activate_all!
+    TrackedPost.update_all(
+      { :active => true },
+      [ 'active = ?', false ]
+    )
+  end
+
+  def self.destroy_inactive!
+    TrackedPost.delete_all [ 'active = ?', false ]
+  end
 end
 
 class ReportMailer < ActionMailer::Base #:nodoc:
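One behavioral change worth noting in match_against: a plain-string condition is now interpolated into a case-insensitive regexp rather than scanned as a literal. A quick sketch of the difference:

    condition = 'schwinn'
    label     = 'SCHWINN Cruiser - $100'

    label.scan(condition).length > 0        # => false (0.7.0: literal, case-sensitive scan)
    label.scan(/#{condition}/i).length > 0  # => true  (0.8.0: case-insensitive regexp)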
@@ -279,9 +388,9 @@ end
 parser = Kwalify::Yaml::Parser.new(
   Kwalify::Validator.new(
     Kwalify::Yaml.load_file(File.dirname(__FILE__)+'/craig_report_schema.yml')
-  )
+  ),
+  :data_binding => true
 )
-parser.data_binding = true
 
 craig_report = parser.parse_file report_definition_file
 
@@ -300,7 +409,7 @@ ReportMailer.template_root = File.dirname __FILE__
 
 # Initialize the database:
 ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
-ActiveRecord::Base.establish_connection craig_report.tracking_database
+ActiveRecord::Base.establish_connection craig_report.tracking_database(report_definition_file)
 
 # Initialize CraigScrape (sorta)
 CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
@@ -311,68 +420,123 @@ ActiveRecord::Schema.define do
   create_table :tracked_searches do |t|
     t.column :search_name, :string
   end unless table_exists? :tracked_searches
-
-  create_table :
+
+  create_table :tracked_listings do |t|
     t.column :url, :string
     t.column :tracked_search_id, :integer
+  end unless table_exists? :tracked_listings
+
+  create_table :tracked_posts do |t|
+    t.column :url, :string
+    t.column :tracked_listing_id, :integer
     t.column :created_at, :date
+    t.column :active, :boolean, :default => 0
   end unless table_exists? :tracked_posts
 end
 end
 
+# Remove all posts which are inactive. They would be in there if the prior run was a failure.
+TrackedPost.destroy_inactive!
+
 # We'll need these outside this next loop:
-report_summaries = []
 newly_tracked_posts = []
 
 # Now let's run a report:
-craig_report.
-
+report_summaries = craig_report.searches.collect do |search|
   # Load our tracking info
-  search_track = TrackedSearch.
+  search_track = TrackedSearch.find_by_name search.name
 
   # No Tracking found - let's set one up:
   search_track = TrackedSearch.create! :search_name => search.name unless search_track
+
+  # This hash tracks what makes it into the report on this search.
+  # NOTE that keys are urls, b/c sometimes the same posting will end up in multiple listings,
+  # and doing this ensures that we don't end up reporting the same post twice.
+  new_summaries = {}
+
+  # And now we actually scrape:
+  CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
+    # Keep in mind that listing.url does change in the while loop.
+    # But, this first one is a good base_url that will never change between runs.
 
-
+    tracked_listing = search_track.find_listing_by_url listing.url
+    tracked_listing ||= search_track.listings.create! :url => listing.url
+
+    # Gives us a sane stopping point (hopefully) :
+    last_tracked_at = tracked_listing.last_tracked_at
+    last_tracked_at ||= search.starting_at
 
-
+    # Some more stopping points (probably):
+    already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}
 
-
-
-
-
-
+    # We'll use this in the loop to decide what posts to track:
+    newest_post_date = last_tracked_at
+
+    # OK - Now let's go!
+    catch :list_break do
+      while listing
+        listing.posts.each do |post|
+          begin
+            # Are we at a point in the scrape, past which we don't need to proceed?
+            throw :list_break if (
+              post.post_date < last_tracked_at or
+              already_tracked_urls.include? post.url
+            )
+
+            # If we want to report this post, add it to the collection:
+            new_summaries[post.url] = post if (
+              !new_summaries.has_key? post.url and
+              search.passes_filter? post
+            )
+          rescue CraigScrape::Scraper::ResourceNotFoundError => e
+            # Sometimes we do end up with 404's that will never load, and we don't want to
+            # abort a run simply b/c we found some anomaly due to the craigslist index
+            # being out of date. This ResourceNotFoundError can occur due to
+            # loading the post url in full, only to see that it was yanked - or craigslist
+            # is acting funny.
+            next
+          end
+
+          # Now let's see if the url should be kept in our tracking database for the future...
+
+          # This post-date sets a limit for the tracked_listing.posts.create below
+          newest_post_date = post.post_date if post.post_date > newest_post_date
+
+          # Now let's add these urls to the database so as to reduce memory overhead.
+          # Keep in mind - they're not active until the email goes out.
+          # Also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
+          # the newest are always the first ones parsed:
+          tracked_listing.posts.create(
+            :url => post.url,
+            :created_at => newest_post_date
+          ) unless post.post_date < newest_post_date
+
+        end
+
+        listing = listing.next_page
+      end
     end
   end
 
   # Let's flatten the unique'd hash into a more usable array:
   new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
 
-  # Let's tag all the newest tracked posts that should go into the database:
-  # NOTE: Since all the dates are at_begining_of_day, we'll effectively have a chunk of dates tied for latest
-  new_summaries.reject{|p| p.post_date < new_summaries.last.post_date}.each do |p_s|
-    newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.url, :created_at => p_s.post_date)
-  end
-
-  # Reject anything from this report which doesn't match the has/has_no :
-  new_summaries.reject!{|s| !search.passes_filter? s }
-
   # Now Let's manage the tracking database:
   if new_summaries.length > 0
 
     # We'll use this in the cleanup at the bottom:
     latest_post_date = new_summaries.last.post_date
 
-    new_summaries.reverse! if search.newest_first?
-
-    # We'll want to email these...
-    report_summaries << {
-      :postings => new_summaries,
-      :search => search,
-      :search_track => search_track,
-      :latest_post_date => latest_post_date
-    }
+    new_summaries.reverse! if search.newest_first?
   end
+
+  # We'll want to email these...
+  {
+    :latest_post_date => latest_post_date,
+    :search_track => search_track,
+    :postings => new_summaries,
+    :search => search
+  }
 end
 
 # Time to send the email:
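The loop above drives the scrape through the new multi-site entry point, which can also be used directly. A minimal sketch (this hits the live site; the 'us/ny' path and 'bik' listing abbreviation are only examples, and only the first page of each listing is read here):

    require 'libcraigscrape'

    # One scraper across every site under us/ny, reading each site's 'bik' listing:
    CraigScrape.new('us/ny').each_listing('bik') do |listing|
      listing.posts.each { |post| puts '%s %s' % [post.post_date, post.label] }
    end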
@@ -383,11 +547,12 @@ ReportMailer.deliver_report(
 {:summaries => report_summaries, :definition => craig_report}
 ) if report_summaries.length > 0
 
-#
-
+# Commit (make 'active') all newly created tracked post urls:
+TrackedPost.activate_all!
 
-# Now
-report_summaries.each do |
-
-
+# Now remove all the no-longer-needed posts from the prior run:
+report_summaries.each do |summary|
+  summary[:search_track].listings.each do |listing|
+    listing.delete_posts_older_than listing.last_tracked_at
+  end
 end
data/bin/report_mailer/craigslist_report.html.erb
CHANGED
@@ -1,14 +1,17 @@
 <h2><%=h @subject %></h2>
-
 <%@summaries.each do |summary| %>
 <h3><%=h summary[:search].name%></h3>
-<%summary[:postings].
-
-
-
-
-
-
+<% if summary[:postings].length > 0 %>
+<%summary[:postings].each do |post|%>
+<%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
+h(post.post_date.strftime('%b %d')),
+post.url,
+h(post.label),
+(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
+(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
+] -%>
+<% end %>
+<% else %>
+<p><i>No new postings were found which matched the search criteria.</i></p>
 <% end %>
 <% end %>
data/bin/report_mailer/craigslist_report.plain.erb
CHANGED
@@ -3,6 +3,7 @@ CRAIGSLIST REPORTER
 <%@summaries.each do |summary| -%>
 <%=summary[:search].name %>
 <% summary[:postings].collect do |post| -%>
+<% if summary[:postings].length > 0 %>
 <%='%s : %s %s %s %s' % [
 post.post_date.strftime('%b %d'),
 post.label,
@@ -10,6 +11,8 @@ CRAIGSLIST REPORTER
 (post.has_pic_or_img?) ? ' [img]': '',
 post.url
 ] -%>
-
+<% else %>
+No new postings were found which matched the search criteria.
+<% end %>
 <% end %>
 <% end -%>
data/lib/geo_listings.rb
ADDED
@@ -0,0 +1,144 @@
+# = About geo_listings.rb
+#
+# This file contains the parsing code and logic relating to geographic site pages and paths. You
+# should never need to include this file directly, as all of libcraigscrape's objects and methods
+# are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
+#
+
+require 'scraper'
+
+class CraigScrape
+
+  # GeoListings represents a parsed craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+  # These list all the craigslist sites in a given region.
+  class GeoListings < Scraper
+    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+    PATH_SCANNER = /(?:\\\/|[^\/])+/
+    URL_HOST_PART = /^[^\:]+\:\/\/([^\/]+)[\/]?$/
+    SITE_PREFIX = /^([^\.]+)/
+    FIND_SITES_PARTS = /^[ ]*([\+|\-]?)[ ]*(.+)[ ]*/
+
+    class BadGeoListingPath < StandardError #:nodoc:
+    end
+
+    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+    # See CraigScrape.find_sites for a more powerful way to find craigslist sites.
+    def initialize(init_via = nil)
+      super(init_via)
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! unless location
+    end
+
+    # Returns the GeoLocation's full name
+    def location
+      unless @location
+        cursor = html % 'h3 > b > a:first-of-type'
+        cursor = cursor.next_node if cursor
+        @location = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+      end
+
+      @location
+    end
+
+    # Returns a hash of site name to urls in the current listing
+    def sites
+      unless @sites
+        @sites = {}
+        (html / 'div#list > a').each do |el_a|
+          site_name = he_decode strip_html(el_a.inner_html)
+          @sites[site_name] = $1 if URL_HOST_PART.match el_a[:href]
+        end
+      end
+
+      @sites
+    end
+
+    # This method will return an array of all possible sites that match the specified location path.
+    # Sample location paths:
+    # - us/ca
+    # - us/fl/miami
+    # - jp/fukuoka
+    # - mx
+    # Here's how location paths work:
+    # - The components of the path are separated by '/'s.
+    # - Up to (and optionally, not including) the last component, the path should correspond to a valid GeoLocation url with the prefix 'http://geo.craigslist.org/iso/'
+    # - The last component can either be a site's 'prefix' on a GeoLocation page, or the last component can just be a geolocation page itself, in which case all the sites on that page are selected.
+    # - The site prefix is the first dns record in a website listed on a GeoLocation page. (So, for the case of us/fl/miami, the last 'miami' corresponds to the 'south florida' link on {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl])
+    def self.sites_in_path(full_path, base_url = GEOLISTING_BASE_URL)
+      # the base_url parameter is mostly so we can test this method
+
+      # Unfortunately - the easiest way to understand much of this is to see how craigslist returns
+      # these geolocations. Watch what happens when you request us/fl/non-existant/page/here.
+      # I also made this a little forgiving in a couple ways not specified with official support, per
+      # the rules above.
+      full_path_parts = full_path.scan PATH_SCANNER
+
+      # We'll either find a single site in this loop and return that, or, we'll find a whole listing
+      # and set the geo_listing object to reflect that
+      geo_listing = nil
+      full_path_parts.each_with_index do |part, i|
+
+        # Let's un-escape the path-part, if needed:
+        part.gsub! "\\/", "/"
+
+        # If they're specifying a single site, this will catch and return it immediately
+        site = geo_listing.sites.find{ |n,s|
+          (SITE_PREFIX.match s and $1 == part) or n == part
+        } if geo_listing
+
+        # This returns the site component of the found array
+        return [site.last] if site
+
+        begin
+          # The URI escape is mostly needed to translate the space characters
+          l = GeoListings.new base_url+full_path_parts[0...i+1].collect{|p| URI.escape p}.join('/')
+        rescue CraigScrape::Scraper::FetchError
+          bad_geo_path! full_path
+        end
+
+        # This probably tells us the first part of the path was 'correct', but not the rest:
+        bad_geo_path! full_path if geo_listing and geo_listing.location == l.location
+
+        geo_listing = l
+      end
+
+      # We have a valid listing page we found, and we can just return all the sites on it:
+      geo_listing.sites.collect{|n,s| s }
+    end
+
+    # find_sites takes a single array of strings as an argument. Each string is to be either a location path
+    # (see sites_in_path), or a full site (in canonical form - ie "memphis.craigslist.org"). Optionally,
+    # each of these may/should contain a '+' or '-' prefix to indicate whether the string is supposed to
+    # add sites to the master list, or remove them from the list. If no '+' or '-' is
+    # specified, the default assumption is '+'. Strings are processed from left to right, which gives
+    # a high degree of control over the selection set. Examples:
+    # - find_sites "us/fl", "- miami.craigslist.org"
+    # - find_sites "us", "- us/nm"
+    # - find_sites "us", "- us/ny", "+ newyork.craigslist.org"
+    # - find_sites "us/ny", "us/id", "caribbean.craigslist.org"
+    # There's a lot of flexibility here; you get the idea.
+    def self.find_sites(specs, base_url = GEOLISTING_BASE_URL)
+      ret = []
+
+      specs.each do |spec|
+        (op,spec = $1,$2) if FIND_SITES_PARTS.match spec
+
+        spec = (spec.include? '.') ? [spec] : sites_in_path(spec, base_url)
+
+        (op == '-') ? ret -= spec : ret |= spec
+      end
+
+      ret
+    end
+
+    private
+
+    def self.bad_geo_path!(path)
+      raise BadGeoListingPath, "Unable to load path #{path.inspect}; either you're having problems connecting to Craigslist, or your path is invalid."
+    end
+
+  end
+end
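A usage sketch for the selection rules documented in find_sites (results depend on craigslist's live geo pages):

    require 'libcraigscrape'

    # All Florida sites, minus the south florida server (whose site prefix is 'miami'):
    sites = CraigScrape::GeoListings.find_sites ['us/fl', '- miami.craigslist.org']
    # => an array of hostnames, e.g. 'jacksonville.craigslist.org', per the live pages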