olek-libcraigscrape 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +94 -0
- data/COPYING +674 -0
- data/COPYING.LESSER +165 -0
- data/README +89 -0
- data/Rakefile +125 -0
- data/bin/craig_report_schema.yml +68 -0
- data/bin/craigwatch +581 -0
- data/bin/report_mailer/craigslist_report.html.erb +17 -0
- data/bin/report_mailer/craigslist_report.plain.erb +18 -0
- data/lib/geo_listings.rb +144 -0
- data/lib/libcraigscrape.rb +217 -0
- data/lib/listings.rb +160 -0
- data/lib/posting.rb +324 -0
- data/lib/scraper.rb +212 -0
- data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
- data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
- data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
- data/test/geolisting_samples/geo_listing_us070209.html +355 -0
- data/test/geolisting_samples/hierarchy_test071009/index.html +31 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/%20SW%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/ft%20myers%20%5C/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/miami/nonsense/more-nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonexist/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/nonsense/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/fl/south%20florida/index.html +46 -0
- data/test/geolisting_samples/hierarchy_test071009/us/index.html +355 -0
- data/test/google.html +8 -0
- data/test/libcraigscrape_test_helpers.rb +37 -0
- data/test/listing_samples/category_output.html +231 -0
- data/test/listing_samples/category_output_2.html +217 -0
- data/test/listing_samples/empty_listings.html +128 -0
- data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
- data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
- data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
- data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
- data/test/listing_samples/long_search_output.html +137 -0
- data/test/listing_samples/mia_fua_index8900.5.21.09.html +226 -0
- data/test/listing_samples/mia_search_kitten.3.15.10.html +149 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
- data/test/listing_samples/new_listing_span.4.17.10.html +769 -0
- data/test/listing_samples/short_search_output.html +133 -0
- data/test/post_samples/1207457727.html +92 -0
- data/test/post_samples/brw_reb_1224008903.html +101 -0
- data/test/post_samples/posting0.html +91 -0
- data/test/post_samples/posting1.html +106 -0
- data/test/post_samples/posting1796890756-061710.html +2318 -0
- data/test/post_samples/posting1808219423.html +2473 -0
- data/test/post_samples/posting1938291834-090610.html +188 -0
- data/test/post_samples/posting2.html +107 -0
- data/test/post_samples/posting3.html +92 -0
- data/test/post_samples/posting4.html +993 -0
- data/test/post_samples/posting5.html +38 -0
- data/test/post_samples/sfbay_art_1223614914.html +94 -0
- data/test/post_samples/this_post_has_been_deleted_by_its_author.html +37 -0
- data/test/post_samples/this_post_has_expired.html +48 -0
- data/test/test_craigslist_geolisting.rb +521 -0
- data/test/test_craigslist_listing.rb +362 -0
- data/test/test_craigslist_posting.rb +426 -0
- metadata +273 -0
data/bin/craigwatch
ADDED
@@ -0,0 +1,581 @@
#!/usr/bin/ruby
#
# =craigwatch - An email-based "post monitoring" solution
#
# Created alongside the libcraigscrape library, craigwatch was designed to take the monotony out of regular
# craigslist monitoring. craigwatch is designed to be run at periodic intervals (hourly/daily/etc.) through crontab,
# and to report all new postings within a listing or search url, since its last run, by email.
#
# For more information, head to the {craigslist monitoring}[http://www.derosetechnologies.com/community/libcraigscrape] help section of our website.
#
# == Features
# In addition to its report tracking, craigwatch offers many post search and filtering options that deliver much improved
# and more accurate results than craigslist's own search functions. Post filtering options include:
# - has_image - yes/no
# - price_required - yes/no
# - price_greater_than - (int)
# - price_less_than - (int)
# - full_post_has - (array of string or regexp) Only posts whose full-post contents contain/match
# - full_post_has_no - (array of string or regexp) Only posts whose full-post contents don't contain/match
# - summary_post_has - (array of string or regexp) Only posts whose listing label contains/matches
# - summary_post_has_no - (array of string or regexp) Only posts whose listing label doesn't contain/match
# - summary_or_full_post_has - (array of string or regexp) Filters out results which match neither the post label <b>nor</b> the post contents
# - summary_or_full_post_has_no - (array of string or regexp) Filters out results which match either the post label <b>or</b> the post contents
# - location_has - (array of string or regexp) Only include posts which match against the post location
# - location_has_no - (array of string or regexp) Only include posts which don't match against the post location
#
# Multiple searches can be combined into a single report, and results can be sorted newest-first or oldest-first (the default).
#
# Reporting output is easily-customized html, handled by ActionMailer, and emails can be delivered via smtp or sendmail.
# Database tracking of already-delivered posts is handled by ActiveRecord, and its driver-agnostic SQL supports all the
# major backends (sqlite/mysql/postgres/probably-all-others). Database sizes are contained by automatically pruning old results
# that are no longer required at the end of each run.
#
# Pretty useful, no?
#
# == Installation
# craigwatch is coupled with libcraigscrape, and is installed via ruby gems. However, since we focused on keeping the
# libcraigscrape download 'lightweight', some additional gems need to be installed in addition to the initial libcraigscrape
# gem itself.
#
# This should take care of the craigwatch install on all systems:
#   sudo gem install libcraigscrape kwalify activerecord actionmailer
# Alternatively, if you've already installed libcraigscrape and want to start working with craigwatch:
#   sudo gem install kwalify activerecord actionmailer
#
# This script was initially developed with activerecord 2.3, actionmailer 2.3 and kwalify 0.7, but will likely work with most
# prior and future versions of these libraries.
#
# == Usage
# When craigwatch is invoked, it is designed to run a single report and then terminate. There is only one parameter to craigwatch,
# and this parameter is the path to a valid report-definition yml file, i.e.:
#   craigwatch johns_daily_watch.yml
#
# There is an included kwalify schema which can validate your definition files, and craigwatch will automatically do so at startup.
# Probably the best way to understand the report definition files is to look at the annotated sample file below and use it as a
# starting point for your own.
#
# By default there is no program output; however, setting any of the following parameters to 'yes' in your definition file will turn on
# useful debugging/logging output:
# - debug_database
# - debug_mailer
# - debug_craigscrape
#
# == Definition File Sample
#
# Let's start with a minimal report, just enough to get something quickly working:
#   # We need some kind of destination to send this to
#   email_to: Chris DeRose <cderose@derosetechnologies.com>
#
#   # This is an array of specific 'searches' we'll be performing in this report:
#   searches:
#   # We're looking for a 90's era cadillac, something cheap, comfortable and in white...
#   - name: 90's White/Creme Convertible Cadillacs
#
#     # This starting date is mostly for the first run, and gives us a reasonable cut-off point from which to build.
#     # It's optional, and if omitted, craigwatch defaults to 'yesterday'
#     starting: 9/10/09
#
#     # We want to check all the labels, and filter out years not in the 90's, and cars not made by cadillac
#     summary_post_has:
#     - /(?:^|[^\d]|19)9[\d](?:[^\dk]|$)/i
#     - /cadillac/i
#
#     # I said we're looking for something *comfortable* !
#     summary_post_has_no: [ /xlr/i ]
#
#     # We want a convertible, in white/cream/etc.:
#     full_post_has:
#     - /convertible/i
#     - /(white|yellow|banana|creme|cream)/i
#
#     # Convertible - not *simulated* convertible!
#     full_post_has_no:
#     - /simulated[^a-z]{0,2}convertible/i
#
#     # We want to search all of craigslist's sites in the us, and we'll want to find it using
#     # the '/search/cta?hasPic=1&query=cadillac' url on the site
#     sites: [ us ]
#     listings:
#     - /search/cta?hasPic=1&query=cadillac
#
# Here's another annotated report which uses most of the other available craigwatch features:
#
#   # The report_name is fed into Time.now.strftime, hence the formatting characters
#   report_name: Craig Watch For Johnathan on %D at %I:%M %p
#
#   email_to: Johnathan Peabody <john@example.local>
#
#   # This is sent straight into ActiveRecord, so there's plenty of options available here. The following is an easy
#   # default sqlite store that should work on most any system with minimal overhead:
#   tracking_database: { adapter: sqlite3, dbfile: /home/john/john_cwatch_report.db }
#
#   searches:
#   # Search #1:
#   - name: Schwinn Bikes For Sale in/near New York
#     starting: 9/10/2009
#
#     # Scrape the following sites/servers:
#     sites: [ us/ny/newyork, us/nj/southjersey ]
#
#     # Scrape the following listings pages:
#     listings: [ bik ]
#
#     # We want listings with Schwinn in the summary
#     summary_post_has: [ /schwinn/i ]
#
#     # We're only interested in adult bikes, so scrap any results that mention children or kids
#     full_post_has_no: [ /(children|kids)/i ]
#
#     # Oh, and we're on a budget:
#     price_less_than: 120
#
#   # Search #2
#   - name: Large apartment rentals in San Francisco
#     sites: [ us/ca/sfbay ]
#     starting: 9/10/2009
#
#     # We're going to rely on craigslist's built-in search for this one, since there's a lot of listings and we
#     # want to conserve some bandwidth
#     listings: [ /search/apa?query=pool&minAsk=min&maxAsk=max&bedrooms=5 ]
#
#     # We'll require a price to be listed, 'cause it keeps out some of the unwanted fluff
#     price_required: yes
#
#     # Hopefully this will keep us away from a bad part of town:
#     price_greater_than: 1000
#
#     # Since we don't have time to drive to each location, we'll require only listings with pictures
#     has_image: yes
#
# == Author
# - Chris DeRose (cderose@derosetechnologies.com)
# - DeRose Technologies, Inc. http://www.derosetechnologies.com
#
# == License
#
# See COPYING[link:files/COPYING.html]
#
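
Note: as mentioned in the Usage section above, the bundled kwalify schema can also be run against a definition file by hand, outside of craigwatch. A minimal sketch, assuming a checked-out gem tree and a definition file named johns_daily_watch.yml; it mirrors the parser setup at the bottom of this script, minus the :data_binding option:

    require 'rubygems'
    require 'kwalify'

    schema = Kwalify::Yaml.load_file 'bin/craig_report_schema.yml'
    parser = Kwalify::Yaml::Parser.new Kwalify::Validator.new(schema)
    parser.parse_file 'johns_daily_watch.yml'
    parser.errors.each{|e| puts "line #{e.linenum}: #{e.message}" }
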
$: << File.dirname(__FILE__) + '/../lib'

require 'rubygems'

gem 'kwalify', '~> 0.7'
gem 'activerecord', '~> 2.3'
gem 'actionmailer', '~> 2.3'

require 'kwalify'
require 'active_record'
require 'action_mailer'
require 'kwalify/util/hashlike'
require 'libcraigscrape'
require 'socket'

class String #:nodoc:
  RE = /^\/(.*)\/([ixm]*)$/

  def is_re?
    (RE.match self) ? true : false
  end

  def to_re
    source, options = (RE.match(self)) ? [$1, $2] : [self, nil]
    mods = 0

    options.each_char do |c|
      mods |= case c
        when 'i' then Regexp::IGNORECASE
        when 'x' then Regexp::EXTENDED
        when 'm' then Regexp::MULTILINE
      end
    end unless options.nil? or options.empty?

    Regexp.new source, mods
  end
end

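
Note: this String monkey-patch is what lets a definition file mix plain strings and /regexp/ literals in its filter arrays. A few illustrative calls:

    '/cadillac/i'.is_re?  # => true
    '/cadillac/i'.to_re   # => /cadillac/i
    'cadillac'.is_re?     # => false - match_against below wraps it as /cadillac/i instead
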
class CraigReportDefinition #:nodoc:
  include Kwalify::Util::HashLike

  EMAIL_NAME_PARTS = /^[ ]*(.+)[ ]*\<.+\>[ ]*/

  attr_reader :report_name, :email_to, :email_from, :tracking_database, :searches, :smtp_settings

  def debug_database?; @debug_database; end
  def debug_mailer?; @debug_mailer; end
  def debug_craigscrape?; @debug_craigscrape; end

  def email_from
    (@email_from) ? @email_from : ('%s@%s' % [ENV['USER'], Socket.gethostname])
  end

  def email_to_name
    EMAIL_NAME_PARTS.match(email_to) ? $1 : email_to
  end

  def report_name
    @report_name ? @report_name : "Craigslist Watch For #{email_to_name} on %D at %I:%M %p"
  end

  # We allow people to use relative (sqlite) dbfiles in their definitions by taking the yml's path as a parameter
  def tracking_database(for_yaml_file = nil)
    # We'll set up a SQLite db using some defaults, if needed:
    @tracking_database ||= {
      :adapter  => 'sqlite3',
      :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
    } if for_yaml_file

    # This is a little hack to make sqlite definitions a little more portable, by allowing them
    # to specify dbfiles relative to the yml's directory:
    ret = @tracking_database
    ret['dbfile'] = '%s/%s' % [File.dirname(for_yaml_file), $1] if (
      for_yaml_file and ret.has_key? 'dbfile' and /^([^\/].*)$/.match ret['dbfile']
    )

    ret
  end

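
Note: a quick trace of the dbfile rewrite above, with hypothetical values:

    yaml_file = '/home/john/report.yml'
    db        = { 'adapter' => 'sqlite3', 'dbfile' => 'watch.db' }
    db['dbfile'] = '%s/%s' % [File.dirname(yaml_file), $1] if /^([^\/].*)$/.match db['dbfile']
    db  # => { 'adapter' => 'sqlite3', 'dbfile' => '/home/john/watch.db' }

An absolute dbfile (anything starting with '/') fails the regexp and is left untouched.
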
  class SearchDefinition #:nodoc:
    include Kwalify::Util::HashLike

    attr_reader :name, :sites, :listings
    attr_reader :location_has, :location_has_no
    attr_reader :full_post_has, :full_post_has_no
    attr_reader :summary_post_has, :summary_post_has_no
    attr_reader :summary_or_full_post_has, :summary_or_full_post_has_no

    attr_reader :price_greater_than, :price_less_than

    def has_image?; @has_image; end
    def newest_first?; @newest_first; end
    def price_required?; @price_required; end

    def starting_at
      (@starting) ?
        Time.parse(@starting) :
        Time.now.yesterday.beginning_of_day
    end

    def passes_filter?(post)
      if post.price.nil?
        return false if price_required?
      else
        return false if @price_greater_than and post.price <= @price_greater_than
        return false if @price_less_than and post.price >= @price_less_than
      end

      # Label Filters:
      return false unless matches_all? summary_post_has, post.label
      return false unless doesnt_match_any? summary_post_has_no, post.label

      # Location Filters:
      return false unless matches_all? location_has, post.location
      return false unless doesnt_match_any? location_has_no, post.location

      # Full-post Filters:
      if full_post_has or full_post_has_no or summary_or_full_post_has or summary_or_full_post_has_no
        # We're going to download the page, so let's make sure we didn't hit a "This posting has been flagged for removal"
        return false if post.system_post?

        return false unless matches_all? full_post_has, post.contents_as_plain
        return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain

        return false unless matches_all? summary_or_full_post_has, [post.contents_as_plain, post.label]
        return false unless doesnt_match_any? summary_or_full_post_has_no, [post.contents_as_plain, post.label]
      end

      true
    end

    private

    def matches_all?(conditions, against)
      against = against.to_a
      (conditions.nil? or conditions.all?{|c| against.any?{|a| match_against c, a } }) ? true : false
    end

    def doesnt_match_any?(conditions, against)
      against = against.to_a
      (conditions.nil? or conditions.all?{|c| against.any?{|a| !match_against c, a } }) ? true : false
    end

    def match_against(condition, against)
      (against.scan( condition.is_re? ? condition.to_re : /#{condition}/i ).length > 0) ? true : false
    end
  end
end

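
Note: both price bounds in passes_filter? above are exclusive - with price_greater_than: 1000, a post priced at exactly $1000 is rejected. Restating the price branch with hypothetical bounds:

    price_greater_than, price_less_than = 1000, 2500
    [999, 1000, 1001, 2500].map{|price| price > price_greater_than && price < price_less_than }
    # => [false, false, true, false]
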
class TrackedSearch < ActiveRecord::Base #:nodoc:
  has_many :listings, :dependent => :destroy, :class_name => 'TrackedListing'
  validates_uniqueness_of :search_name
  validates_presence_of :search_name

  def self.find_by_name(name)
    self.find :first, :conditions => ['search_name = ?', name]
  end

  def find_listing_by_url(url)
    listings.find :first, :conditions => ['url = ?', url]
  end
end

class TrackedListing < ActiveRecord::Base #:nodoc:
  has_many :posts, :dependent => :destroy, :class_name => 'TrackedPost'
  validates_presence_of :url, :tracked_search_id

  def already_tracked?(url)
    ( self.posts.find :first, :conditions => ['url = ?', url] ) ? true : false
  end

  def last_tracked_at
    self.posts.maximum 'created_at'
  end

  def delete_posts_older_than(cutoff_date)
    # TODO: can't I use posts.delete 'created_at < ?' and keep it cleaner?
    TrackedPost.delete_all [ 'tracked_listing_id = ? AND created_at < ?', self.id, cutoff_date ]
  end
end

class TrackedPost < ActiveRecord::Base #:nodoc:
  validates_presence_of :url, :tracked_listing_id

  def self.activate_all!
    TrackedPost.update_all(
      { :active => true },
      [ 'active = ?', false ]
    )
  end

  def self.destroy_inactive!
    TrackedPost.delete_all [ 'active = ?', false ]
  end
end

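
Note: the tracking schema is a three-level hierarchy - one TrackedSearch per named search, one TrackedListing per listing url under it, and one TrackedPost per reported post url. A sketch of one run's bookkeeping, with hypothetical urls:

    search  = TrackedSearch.create! :search_name => 'Schwinn Bikes For Sale in/near New York'
    listing = search.listings.create! :url => 'http://newyork.craigslist.org/bik/'
    listing.posts.create :url => 'http://newyork.craigslist.org/bik/12345.html', :created_at => Time.now
    listing.already_tracked? 'http://newyork.craigslist.org/bik/12345.html' # => true
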
class ReportMailer < ActionMailer::Base #:nodoc:
  def report(to, sender, subject_template, report_tmpl)

    formatted_subject = Time.now.strftime(subject_template)

    recipients to
    from sender
    subject formatted_subject

    generate_view_parts 'craigslist_report', report_tmpl.merge({:subject => formatted_subject})
  end

  def generate_view_parts(view_name, tmpl)
    part( :content_type => "multipart/alternative" ) do |p|
      [
        { :content_type => "text/plain", :body => render_message("#{view_name.to_s}.plain.erb", tmpl) },
        { :content_type => "text/html", :body => render_message("#{view_name.to_s}.html.erb", tmpl.merge({:part_container => p})) }
      ].each { |parms| p.part parms.merge( { :charset => "UTF-8", :transfer_encoding => "7bit" } ) }
    end
  end
end

#############

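
Note: under ActionMailer 2.3, ReportMailer#report is never called directly; the dynamically generated class-level deliver_* method instantiates the mailer and performs delivery in one step, which is how the main program invokes it further down:

    ReportMailer.deliver_report to, sender, subject_template, report_tmpl

The two render_message calls resolve relative to template_root (set to this script's directory below), which is how the bundled report_mailer/craigslist_report.plain.erb and .html.erb templates are found.
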
# Let's start our program now:
report_definition_file = ARGV[0] if ARGV[0] and File.readable?(ARGV[0])

unless report_definition_file
  puts <<EOD
Usage:
  #{File.basename($0)} [report_definition_file]

Run 'gem server' and browse the libcraigscrape rdoc for 'bin/craigscrape' for specific usage details.
EOD
  exit
end

# Validate/Parse our input file:
parser = Kwalify::Yaml::Parser.new(
  Kwalify::Validator.new(
    Kwalify::Yaml.load_file(File.dirname(__FILE__)+'/craig_report_schema.yml')
  ),
  :data_binding => true
)

craig_report = parser.parse_file report_definition_file

parser.errors.each do |e|
  puts "Definition Validation Error (line #{e.linenum}, char #{e.column}): #{e.message}"
end and exit if parser.errors.length > 0

# Initialize Action Mailer:
ActionMailer::Base.logger = Logger.new STDERR if craig_report.debug_mailer?
if craig_report.smtp_settings
  ReportMailer.smtp_settings = craig_report.smtp_settings.symbolize_keys
else
  ReportMailer.delivery_method = :sendmail
end
ReportMailer.template_root = File.dirname __FILE__

# Initialize the database:
ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
ActiveRecord::Base.establish_connection craig_report.tracking_database(report_definition_file)

# Initialize CraigScrape (sorta):
CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?

# Perform migrations, if needed:
ActiveRecord::Schema.define do
  suppress_messages do
    create_table :tracked_searches do |t|
      t.column :search_name, :string
    end unless table_exists? :tracked_searches

    create_table :tracked_listings do |t|
      t.column :url, :string
      t.column :tracked_search_id, :integer
    end unless table_exists? :tracked_listings

    create_table :tracked_posts do |t|
      t.column :url, :string
      t.column :tracked_listing_id, :integer
      t.column :created_at, :date
      t.column :active, :boolean, :default => 0
    end unless table_exists? :tracked_posts
  end
end

# Remove all posts which are inactive. They would be in there if the prior run was a failure.
TrackedPost.destroy_inactive!

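
Note: that destroy_inactive! call, together with the :active default of 0 in the schema above and the activate_all! at the very bottom, is the script's crash-safety mechanism - posts recorded mid-run stay inactive until the report email has gone out. The lifecycle, in short:

    # during the scrape (see the loop below):
    tracked_listing.posts.create :url => post.url, :created_at => newest_post_date   # active = false
    # after ReportMailer.deliver_report returns:
    TrackedPost.activate_all!      # commits this run's posts
    # at the top of the *next* run:
    TrackedPost.destroy_inactive!  # a no-op, unless the prior run died before mailing
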
# We'll need these outside this next loop:
newly_tracked_posts = []

# Now let's run a report:
report_summaries = craig_report.searches.collect do |search|
  # Load our tracking info
  search_track = TrackedSearch.find_by_name search.name

  # No tracking found - let's set one up:
  search_track = TrackedSearch.create! :search_name => search.name unless search_track

  # This hash tracks what makes it into the report on this search.
  # NOTE that keys are urls, b/c sometimes the same posting will end up in multiple listings,
  # and doing this ensures that we don't end up reporting the same post twice.
  new_summaries = {}

  # And now we actually scrape:
  CraigScrape.new(*search.sites).each_listing(*search.listings) do |listing|
    # Keep in mind that listing.url does change in the while loop.
    # But, this first one is a good base_url that will never change between runs.

    tracked_listing = search_track.find_listing_by_url listing.url
    tracked_listing ||= search_track.listings.create! :url => listing.url

    # Gives us a sane stopping point (hopefully):
    last_tracked_at = tracked_listing.last_tracked_at
    last_tracked_at ||= search.starting_at

    # Some more stopping points (probably):
    already_tracked_urls = tracked_listing.posts.collect{|tp| tp.url}

    # We'll use this in the loop to decide which posts to track:
    newest_post_date = last_tracked_at

    # We keep track of post.post_date here, b/c in some circumstances you can be in the below loop
    # but have no post.post_date, since the posting was removed and it parsed to nil
    most_recent_posting_date = Time.now

    # OK - Now let's go!
    catch :list_break do
      while listing
        listing.posts.each do |post|
          begin
            most_recent_posting_date = post.post_date if post.post_date

            # Are we at a point in the scrape past which we don't need to proceed?
            throw :list_break if (
              most_recent_posting_date < last_tracked_at or
              already_tracked_urls.include? post.url
            )

            # If we want to report this post, add it to the collection:
            new_summaries[post.url] = post if (
              !new_summaries.has_key? post.url and
              search.passes_filter? post
            )
          rescue CraigScrape::Scraper::ResourceNotFoundError, CraigScrape::Scraper::MaxRedirectError => e
            # Sometimes we do end up with 404's that will never load, and we don't want to
            # abort a run simply b/c we found some anomaly due to the craigslist index
            # being out of date. This ResourceNotFoundError can occur due to
            # loading the post url in full, only to see that it was yanked - or craigslist
            # is acting funny.
            next
          end

          # Now let's see if the url should be kept in our tracking database for the future...

          # This post-date sets a limit for the tracked_listing.posts.create below
          newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date

          # Now let's add these urls to the database so as to reduce memory overhead.
          # Keep in mind - they're not active until the email goes out.
          # Also - we shouldn't have to worry about putting 'irrelevant' posts in the db, since
          # the newest are always the first ones parsed:
          tracked_listing.posts.create(
            :url => post.url,
            :created_at => newest_post_date
          ) unless most_recent_posting_date < newest_post_date
        end

        listing = listing.next_page
      end
    end
  end

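
Note: stripped of the tracking bookkeeping, the loop above is just craigslist pagination with an early exit. A minimal skeleton of the traversal (the site path, listing abbreviation, and cutoff are hypothetical):

    CraigScrape.new('us/fl/miami').each_listing('sss') do |listing|
      catch :list_break do
        while listing
          listing.posts.each do |post|
            throw :list_break if post.post_date and post.post_date < cutoff  # cutoff: a Time
          end
          listing = listing.next_page
        end
      end
    end
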
  # Let's flatten the unique'd hash into a more usable array.
  # NOTE: The reason we included a reject here is a little complicated, but here's the gist:
  # * We try not to load the whole post if we don't have to
  # * It's possible that we met all the criteria of passes_filter? with merely a header, and
  #   if so we add a url to the summaries stack
  # * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
  #   or flagged_for_removal?, etc.
  # * If this was the case, below we'd end up sorting against nil post_dates, which would fail.
  # * So - before we sort, we run a quick reject on nil post_dates
  new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest first

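
Note: the reject is load-bearing - Time#<=> returns nil when compared against nil, and Array#sort raises once the comparison block does. A two-liner shows the failure mode:

    [Time.at(200), nil, Time.at(100)].compact.sort{|a,b| a <=> b} # => [Time.at(100), Time.at(200)]
    [Time.at(200), nil, Time.at(100)].sort{|a,b| a <=> b}         # raises ArgumentError

Here the values are posts rather than Times, but the comparison fails identically for expired/removed posts whose post_date parsed to nil.
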
  # Now let's manage the tracking database:
  if new_summaries.length > 0

    # We'll use this in the cleanup at the bottom:
    latest_post_date = new_summaries.last.post_date

    new_summaries.reverse! if search.newest_first?
  end

  # We'll want to email these...
  {
    :latest_post_date => latest_post_date,
    :search_track => search_track,
    :postings => new_summaries,
    :search => search
  }
end

# Time to send the email (maybe):
unless report_summaries.select { |s| ! s[:postings].empty? }.empty?
  ReportMailer.deliver_report(
    craig_report.email_to,
    craig_report.email_from,
    craig_report.report_name,
    {:summaries => report_summaries, :definition => craig_report}
  )
end

# Commit (make 'active') all newly created tracked post urls:
TrackedPost.activate_all!

# Now remove all the no-longer-needed posts from the prior run:
report_summaries.each do |summary|
  summary[:search_track].listings.each do |listing|
    listing.delete_posts_older_than listing.last_tracked_at
  end
end