libcraigscrape 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. data/CHANGELOG +17 -0
  2. data/Rakefile +1 -1
  3. data/bin/craigwatch +10 -10
  4. data/bin/report_mailer/craigslist_report.html.erb +2 -2
  5. data/bin/report_mailer/craigslist_report.plain.erb +2 -2
  6. data/lib/libcraigscrape.rb +585 -342
  7. data/test/geolisting_samples/geo_listing_ca070209.html +76 -0
  8. data/test/geolisting_samples/geo_listing_ca_sk070209.html +31 -0
  9. data/test/geolisting_samples/geo_listing_cn070209.html +35 -0
  10. data/test/geolisting_samples/geo_listing_us070209.html +355 -0
  11. data/test/libcraigscrape_test_helpers.rb +31 -0
  12. data/test/listing_samples/fortmyers_art_index.060909/1046596324.html +93 -0
  13. data/test/listing_samples/fortmyers_art_index.060909/1053085283.html +92 -0
  14. data/test/listing_samples/fortmyers_art_index.060909/1112522674.html +89 -0
  15. data/test/listing_samples/fortmyers_art_index.060909/823516079.html +92 -0
  16. data/test/listing_samples/fortmyers_art_index.060909/825684735.html +89 -0
  17. data/test/listing_samples/fortmyers_art_index.060909/891513957.html +94 -0
  18. data/test/listing_samples/fortmyers_art_index.060909/897549505.html +99 -0
  19. data/test/listing_samples/fortmyers_art_index.060909/960826026.html +89 -0
  20. data/test/listing_samples/fortmyers_art_index.060909/993256300.html +89 -0
  21. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index500.060909.html +237 -0
  22. data/test/listing_samples/fortmyers_art_index.060909/fortmyers_art_index600.060909.html +132 -0
  23. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack1000.6.18.09.html +144 -0
  24. data/test/listing_samples/miami_search_sss_rack.6.18.09/miami_search_sss_rack900.6.18.09.html +146 -0
  25. data/test/post_samples/brw_reb_1224008903.html +101 -0
  26. data/test/post_samples/sfbay_art_1223614914.html +94 -0
  27. data/test/test_craigslist_geolisting.rb +425 -0
  28. data/test/test_craigslist_listing.rb +179 -260
  29. data/test/test_craigslist_posting.rb +306 -0
  30. metadata +29 -2
data/CHANGELOG CHANGED
@@ -1,5 +1,22 @@
1
1
  == Change Log
2
2
 
3
+ === Release 0.7.0 (Jul 5, 2009)
4
+ - A good bit of refactoring
5
+ - Eager-loading in the Post object without the need of the full_post method
6
+ - full_post is no longer needed or available
7
+ - Added a Base Scraper object. Maybe I'll make that its own gem...
8
+ - Post/Listing constructors now take either a Hash or a url (string) to use for its scrape. Regardless, either should always have a url object association now
9
+ - Removed the PostSummary object; now we just have Posting. Posts include all the functionality of both PostFull and PostSummary. Be careful with Posting if you're trying to minimize bandwidth. The rdoc labels which methods won't/will/might cause a page load.
10
+ - Things should fail better (always outputting relevant html that was unable to be parsed)
11
+ - Removed Posting::date in favor of Posting::post_date
12
+ - Posting::full_url is now url
13
+ - Post::href is no longer needed. Use Post::uri.path instead
14
+ - Posting.images renamed to Posting.pics, and a new method Posting.images better reflects the craigslist designations
15
+ - Fixed a bug with some search listings, where the last page might not get scraped b/c craigslist doesn't correctly include the next page link. See 'test_nasty_search_listings' for an example
16
+ - On some listings (with the empty h4 tags) we were eager-loading when we could have made some safe assumptions instead. Fixed.
17
+ - Preliminary support for GeoListing scrapes.. (Not exactly sure where this will go - but I have ideas..)
18
+ - Adjusted Posting::images and Posting::pics to better match the craigslist media-dichotomy notation.
19
+
3
20
  === Release 0.6.5 (Jun 8, 2009)
4
21
  - Added PostFull::deleted_by_author? , added test case for said condition
5
22
  - Fixed a bug that caused the library to die in weird ways if there wasn't a title tag on a parsed page
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
11
11
  RbConfig = Config unless defined? RbConfig
12
12
 
13
13
  NAME = "libcraigscrape"
14
- VERS = ENV['VERSION'] || "0.6.5"
14
+ VERS = ENV['VERSION'] || "0.7.0"
15
15
  PKG = "#{NAME}-#{VERS}"
16
16
 
17
17
  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
data/bin/craigwatch CHANGED
@@ -193,10 +193,10 @@ class CraigReportDefinition #:nodoc:
193
193
 
194
194
  if full_post_has or full_post_has_no
195
195
  # We're going to download the page, so let's make sure we didn't hit a "This posting has been flagged for removal"
196
- return false if post.full_post.title.nil?
196
+ return false if post.system_post?
197
197
 
198
- return false unless matches_all? full_post_has, post.full_post.contents_as_plain
199
- return false unless doesnt_match_any? full_post_has_no, post.full_post.contents_as_plain
198
+ return false unless matches_all? full_post_has, post.contents_as_plain
199
+ return false unless doesnt_match_any? full_post_has_no, post.contents_as_plain
200
200
  end
201
201
 
202
202
  true
@@ -303,7 +303,7 @@ ActiveRecord::Base.logger = Logger.new STDERR if craig_report.debug_database?
303
303
  ActiveRecord::Base.establish_connection craig_report.tracking_database
304
304
 
305
305
  # Initialize CraigScrape (sorta)
306
- CraigScrape.logger = Logger.new STDERR if craig_report.debug_craigscrape?
306
+ CraigScrape::Scraper.logger = Logger.new STDERR if craig_report.debug_craigscrape?
307
307
 
308
308
  # Perform migrations if needed?
309
309
  ActiveRecord::Schema.define do
@@ -340,18 +340,18 @@ craig_report.each_search do |search|
340
340
  # Let's collect all the summaries that could apply:
341
341
  new_summaries = {}
342
342
  search.listing.each do |listing|
343
- CraigScrape.scrape_until(listing){|p| p.date <= last_tracked_at or already_tracked_urls.include? p.full_url }.each do |p_s|
344
- new_summaries[p_s.full_url] = p_s unless new_summaries.has_key? p_s.full_url
343
+ CraigScrape.scrape_until(listing){|p| p.post_date <= last_tracked_at or already_tracked_urls.include? p.url }.each do |p_s|
344
+ new_summaries[p_s.url] = p_s unless new_summaries.has_key? p_s.url
345
345
  end
346
346
  end
347
347
 
348
348
  # Let's flatten the unique'd hash into a more useable array:
349
- new_summaries = new_summaries.values.sort{|a,b| a.date <=> b.date} # oldest goes to bottom
349
+ new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
350
350
 
351
351
  # Let's tag all the newest tracked posts that should go into the database:
352
352
  # NOTE: Since all the dates are at_beginning_of_day, we'll effectively have a chunk of dates tied for latest
353
- new_summaries.reject{|p| p.date < new_summaries.last.date}.each do |p_s|
354
- newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.full_url, :created_at => p_s.date)
353
+ new_summaries.reject{|p| p.post_date < new_summaries.last.post_date}.each do |p_s|
354
+ newly_tracked_posts << search_track.tracked_posts.build( :url => p_s.url, :created_at => p_s.post_date)
355
355
  end
356
356
 
357
357
  # Reject anything from this report which doesn't match the has/has_no :
@@ -361,7 +361,7 @@ craig_report.each_search do |search|
361
361
  if new_summaries.length > 0
362
362
 
363
363
  # We'll use this in the cleanup at the bottom:
364
- latest_post_date = new_summaries.last.date
364
+ latest_post_date = new_summaries.last.post_date
365
365
 
366
366
  new_summaries.reverse! if search.newest_first?
367
367
 
@@ -4,8 +4,8 @@
4
4
  <h3><%=h summary[:search].name%></h3>
5
5
  <%summary[:postings].each do |post|%>
6
6
  <%='<p>%s <a href="%s">%s -</a>%s%s</p>' % [
7
- h(post.date.strftime('%b %d')),
8
- post.full_url,
7
+ h(post.post_date.strftime('%b %d')),
8
+ post.url,
9
9
  h(post.label),
10
10
  (post.location) ? '<font size="-1"> (%s)</font>' % h(post.location) : '',
11
11
  (post.has_pic_or_img?) ? ' <span style="color: orange"> img</span>': ''
@@ -4,11 +4,11 @@ CRAIGSLIST REPORTER
4
4
  <%=summary[:search].name %>
5
5
  <% summary[:postings].collect do |post| -%>
6
6
  <%='%s : %s %s %s %s' % [
7
- post.date.strftime('%b %d'),
7
+ post.post_date.strftime('%b %d'),
8
8
  post.label,
9
9
  (post.location) ? " (#{post.location})" : '',
10
10
  (post.has_pic_or_img?) ? ' [img]': '',
11
- post.full_url
11
+ post.url
12
12
  ] -%>
13
13
 
14
14
  <% end %>