libcraigscrape 0.6.5 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/Rakefile +1 -1
- data/bin/craigwatch +10 -10
- data/bin/report_mailer/craigslist_report.html.erb +2 -2
- data/bin/report_mailer/craigslist_report.plain.erb +2 -2
- data/lib/libcraigscrape.rb +585 -342
- data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
- data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
- data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
- data/test/geolisting_samples/geo_listing_us070209.html +355 -0
- data/test/libcraigscrape_test_helpers.rb +31 -0
- data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
- data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
- data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
- data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
- data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
- data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
- data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
- data/test/post_samples/brw_reb_1224008903.html +101 -0
- data/test/post_samples/sfbay_art_1223614914.html +94 -0
- data/test/test_craigslist_geolisting.rb +425 -0
- data/test/test_craigslist_listing.rb +179 -260
- data/test/test_craigslist_posting.rb +306 -0
- metadata +29 -2
data/CHANGELOG
CHANGED
@@ -1,5 +1,22 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
+
=== Release 0.7.0 (Jul 5, 2009)
|
4
|
+
- A good bit of refactoring
|
5
|
+
- Eager-loading in the Post object without the need of the full_post method
|
6
|
+
- full_post is no longer needed or available
|
7
|
+
- Added a Base Scraper object. Maybe I'll make that its own gem...
|
8
|
+
- Post/Listing constructors now take either a Hash or a url (string) to use for its scrape. Regardless, either should always have a url object association now
|
9
|
+
- Removed the PostSummary object; now we just have Posting. Posts include all the functionality of both PostFull and PostSummary. Be careful with Posting if you're trying to minimize bandwidth. The rdoc labels which methods won't/will/might cause a page load.
|
10
|
+
- Things should fail better (always outputting relevant html that was unable to be parsed)
|
11
|
+
- Removed Posting::date in favor of Posting::post_date
|
12
|
+
- Posting::full_url is now url
|
13
|
+
- Post::href is no longer needed. Use Post::uri.path instead
|
14
|
+
- Posting.images renamed to Posting.pics, and a new method Posting.images better reflects the craigslist designations
|
15
|
+
- Fixed a bug with some search listings, where the last page might not get scraped b/c craigslist doesn't correctly include the next page link. See 'test_nasty_search_listings' for an example
|
16
|
+
- On some listings (with the empty h4 tags) we were eager-loading when we could have made some safe assumptions instead. Fixed.
|
17
|
+
- Preliminary support for GeoListing scrapes.. (Not exactly sure where this will go - but I have ideas..)
|
18
|
+
- Adjusted Posting::images and Posting::pics to better match the craigslist media-dichotomy notation.
|
19
|
+
|
3
20
|
=== Release 0.6.5 (Jun 8, 2009)
|
4
21
|
- Added PostFull::deleted_by_author? , added test case for said condition
|
5
22
|
- Fixed a bug that caused the library to die in weird ways if there wasn't a title tag on a parsed page
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.
|
14
|
+
VERS = ENV['VERSION'] || "0.7.0"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
data/bin/craigwatch
CHANGED
@@ -193,10 +193,10 @@ class CraigReportDefinition #:nodoc:
|
|
193
193
|
|
194
194
|
if full_post_has or full_post_has_no
|
195
195
|
# We're going to download the page, so let's make sure we didnt hit a "This posting has been flagged for removal"
|
196
|
-
return false if post.
|
196
|
+
return false if post.system_post?
|
197
197
|
|
198
|
-
return false unless matches_all? full_post_has, post.
|
199
|
-
return false unless doesnt_match_any? full_post_has_no, post.
|
198
|
+
return false unless matches_all? full_post_has, post.contents_as_plain
|
199
|
+
return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
|
200
200
|
end
|
201
201
|
|
202
202
|
true
|
@@ -303,7 +303,7 @@ ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
|
|
303
303
|
ActiveRecord::Base.establish_connection craig_report.tracking_database
|
304
304
|
|
305
305
|
# Initialize CraigScrape (sorta)
|
306
|
-
CraigScrape.logger = Logger.new STDERR if craig_report.debug_craigscrape?
|
306
|
+
CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
|
307
307
|
|
308
308
|
# Perform migrations if needed?
|
309
309
|
ActiveRecord::Schema.define do
|
@@ -340,18 +340,18 @@ craig_report.each_search do |search|
|
|
340
340
|
# Let's collect all the summaries that could apply:
|
341
341
|
new_summaries = {}
|
342
342
|
search.listing.each do |listing|
|
343
|
-
CraigScrape.scrape_until(listing){|p| p.
|
344
|
-
new_summaries[p_s.
|
343
|
+
CraigScrape.scrape_until(listing){|p| p.post_date <= last_tracked_at or already_tracked_urls.include? p.url }.each do |p_s|
|
344
|
+
new_summaries[p_s.url] = p_s unless new_summaries.has_key? p_s.url
|
345
345
|
end
|
346
346
|
end
|
347
347
|
|
348
348
|
# Let's flatten the unique'd hash into a more useable array:
|
349
|
-
new_summaries = new_summaries.values.sort{|a,b| a.
|
349
|
+
new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
350
350
|
|
351
351
|
# Let's tag all the newest tracked posts that should go into the database:
|
352
352
|
# NOTE: Since all the dates are at_begining_of_day, we'll effectively have a chunk of dates tied for latest
|
353
|
-
new_summaries.reject{|p| p.
|
354
|
-
newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.
|
353
|
+
new_summaries.reject{|p| p.post_date < new_summaries.last.post_date}.each do |p_s|
|
354
|
+
newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.url, :created_at => p_s.post_date)
|
355
355
|
end
|
356
356
|
|
357
357
|
# Reject anything from this report which doesn't match the has/has_no :
|
@@ -361,7 +361,7 @@ craig_report.each_search do |search|
|
|
361
361
|
if new_summaries.length > 0
|
362
362
|
|
363
363
|
# We'll use this in the cleanup at the bottom:
|
364
|
-
latest_post_date = new_summaries.last.
|
364
|
+
latest_post_date = new_summaries.last.post_date
|
365
365
|
|
366
366
|
new_summaries.reverse! if search.newest_first?
|
367
367
|
|
@@ -4,8 +4,8 @@
|
|
4
4
|
<h3><%=h summary[:search].name%></h3>
|
5
5
|
<%summary[:postings].each do |post|%>
|
6
6
|
<%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
|
7
|
-
h(post.
|
8
|
-
post.
|
7
|
+
h(post.post_date.strftime('%b %d')),
|
8
|
+
post.url,
|
9
9
|
h(post.label),
|
10
10
|
(post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
|
11
11
|
(post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
|
@@ -4,11 +4,11 @@ CRAIGSLIST REPORTER
|
|
4
4
|
<%=summary[:search].name %>
|
5
5
|
<% summary[:postings].collect do |post| -%>
|
6
6
|
<%='%s : %s %s %s %s' % [
|
7
|
-
post.
|
7
|
+
post.post_date.strftime('%b %d'),
|
8
8
|
post.label,
|
9
9
|
(post.location) ? " (#{post.location})" : '',
|
10
10
|
(post.has_pic_or_img?) ? ' [img]': '',
|
11
|
-
post.
|
11
|
+
post.url
|
12
12
|
] -%>
|
13
13
|
|
14
14
|
<% end %>
|