RubyGems - libcraigscrape - Versions diffs - 0.8.4 → 0.9.1 - Mend

libcraigscrape 0.8.4 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/CHANGELOG +14 -2
data/Rakefile +8 -7
data/bin/craigwatch +30 -9
data/lib/libcraigscrape.rb +8 -3
data/lib/posting.rb +17 -3
data/test/post_samples/this_post_has_expired.html +48 -0
data/test/test_craigslist_posting.rb +21 -0
metadata +45 -11

data/CHANGELOG CHANGED Viewed

@@ -1,8 +1,20 @@
 == Change Log
-=== Release 0.8.4 (TODO)
+=== Release 0.9.1
+- Added support for posting_has_expired? and expired post recognition
+- Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? was encountered when using certain (minimal) filtering
+=== Release 0.9 (Oct 01, 2010)
+- Minor adjustments to craigwatch to fix deprecation warnings in new ActiveRecord and ActionMailer gems
+- Added gem version specifiers to the Gem spec and to the require statements
+- Moved repo to github
+- Fixed an esoteric bug in craigwatch, affecting the last scraped post in a listing when that post was 'flagged for removal'.
+- Took all those extra package-building tasks out of the Rakefile since this is 2010 and we only party with gemfiles
+- Ruby 1.9 compatibility adjustments
+=== Release 0.8.4 (Sep 6, 2010)
 - Someone found a way to screw up hpricot's to_s method (posting1938291834-090610.html) and fixed by adding html_source to the craigslist Scraper object, which returns the body of the post without passing it through hpricot. It's a better way to go anyways, and re-wrote a couple incidentals to use the html_source method...
-- Adjusted the test cases a bit, since the user bodies being returned have slightly less cleanup in their output than they had prior
+- Adjusted the test cases a bit, since the user bodies being returned have less cleanup in their output than they had prior
 === Release 0.8.3 (August 2, 2010)
 - Someone was posting really bad html that was screwing up Hpricot. Such is to be expected when you're soliciting html from the general public I suppose. Added test_bugs_found061710 posting test, and fixed by stripping out the user body before parsing with Hpricot.

data/Rakefile CHANGED Viewed

@@ -11,7 +11,7 @@ include FileUtils
 RbConfig = Config unless defined? RbConfig
 NAME = "libcraigscrape"
-VERS = ENV['VERSION'] || "0.8.4"
+VERS = ENV['VERSION'] || "0.9.1"
 PKG = "#{NAME}-#{VERS}"
 RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -37,9 +37,10 @@ SPEC =
     s.files = PKG_FILES
     s.require_paths = ["lib"]
     s.test_files = FileList['test/test_*.rb']
-    s.add_dependency 'hpricot'
-    s.add_dependency 'htmlentities'
-    s.add_dependency 'activesupport'
+    s.add_dependency 'hpricot',      '> 0.8'
+    s.add_dependency 'htmlentities', '>= 4.0.0'
+    s.add_dependency 'activesupport','>= 2.3.0', '< 3'
+    s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
   end
 desc "Run all the tests"
@@ -59,9 +60,9 @@ end
 Rake::GemPackageTask.new(SPEC) do |p|
   p.need_tar = false
-  p.need_tar_gz = true
-  p.need_tar_bz2 = true
-  p.need_zip = true
+  p.need_tar_gz = false
+  p.need_tar_bz2 = false
+  p.need_zip = false
   p.gem_spec = SPEC
 end

data/bin/craigwatch CHANGED Viewed

@@ -159,10 +159,15 @@
 $: << File.dirname(__FILE__) + '/../lib'
 require 'rubygems'
+gem 'kwalify',      '~> 0.7'
+gem 'activerecord', '~> 2.3'
+gem 'actionmailer', '~> 2.3'
 require 'kwalify'
-require 'kwalify/util/hashlike'
 require 'active_record'
 require 'action_mailer'
+require 'kwalify/util/hashlike'
 require 'libcraigscrape'
 require "socket"
@@ -179,9 +184,9 @@ class String #:nodoc:
     options.each_char do |c|
       mods |= case c
-        when 'i': Regexp::IGNORECASE
-        when 'x': Regexp::EXTENDED
-        when 'm': Regexp::MULTILINE
+        when 'i' then Regexp::IGNORECASE
+        when 'x' then Regexp::EXTENDED
+        when 'm' then Regexp::MULTILINE
       end
     end unless options.nil? or options.empty?
@@ -217,7 +222,7 @@ class CraigReportDefinition #:nodoc:
     # We'll setup a SQLite db using some defaults if needed
     @tracking_database ||= {
       :adapter => 'sqlite3',
-      :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
+      :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
     } if for_yaml_file
     # This is a little hack to make sqlite definitions a little more portable, by allowing them
@@ -472,14 +477,20 @@ report_summaries = craig_report.searches.collect do |search|
     # We'll use this in the loop to decide what posts to track:
     newest_post_date = last_tracked_at
+    # We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
+    # but have no post.post_date since the posting was removed and it parsed to nil
+    most_recent_posting_date = Time.now
     # OK - Now let's go!
     catch :list_break do
       while listing
         listing.posts.each do |post|
           begin
+            most_recent_posting_date = post.post_date if post.post_date
             # Are we at a point in the scrape, past which we don't need to proceed?
             throw :list_break if (
-              post.post_date < last_tracked_at or
+              most_recent_posting_date < last_tracked_at or
               already_tracked_urls.include? post.url
             )
@@ -500,7 +511,7 @@ report_summaries = craig_report.searches.collect do |search|
           # Now let's see if the url should be kept in our tracking database for the future...
           # This post-date sets a limit for the tracked_listing.posts.create below
-          newest_post_date = post.post_date if post.post_date > newest_post_date
+          newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
           # Now let's add these urls to the database so as to reduce memory overhead.
           # Keep in mind - they're not active until the email goes out.
@@ -509,7 +520,7 @@ report_summaries = craig_report.searches.collect do |search|
           tracked_listing.posts.create(
             :url => post.url,
             :created_at => newest_post_date
-          ) unless post.post_date < newest_post_date
+          ) unless most_recent_posting_date < newest_post_date
         end
@@ -518,8 +529,18 @@ report_summaries = craig_report.searches.collect do |search|
     end
   end
   # Let's flatten the unique'd hash into a more useable array:
-  new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
+  # NOTE: The reason we included a reject is a little complicated, but here's the gist:
+  #  * We try not to load the whole post if we don't have to
+  #  * It's possible that we met all the criteria of the passes_filter? with merely a header, and
+  #    if so we add a url to the summaries stack
+  #  * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
+  #    or flagged_for_removal?, etc.
+  #  * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
+  #  * So - before we sort, we run a quick reject on nil post_dates
+  new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
   # Now Let's manage the tracking database:
   if new_summaries.length > 0

data/lib/libcraigscrape.rb CHANGED Viewed

@@ -2,13 +2,18 @@
 #
 # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
 #
+require 'rubygems'
+gem 'activesupport', '~> 2.3'
+gem 'hpricot',       '~> 0.8'
+gem 'htmlentities',  '~> 4.0.0'
 require 'net/http'
 require 'zlib'
-require 'rubygems'
-require 'active_support'
 require 'hpricot'
 require 'htmlentities'
+require 'active_support'
 # A base class encapsulating the various libcraigscrape objects, and providing most of the
 # craigslist interaction methods. Currently, we're supporting the old Class methods

data/lib/posting.rb CHANGED Viewed

@@ -30,9 +30,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
     super(*args)
     # Validate that required fields are present, at least - if we've downloaded it from a url
-    parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
-      contents,posting_id,post_time,header,title,full_section
-    ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+    parse_error! if (
+      args.first.kind_of? String and
+      !flagged_for_removal? and
+      !posting_has_expired? and
+      !deleted_by_author? and [
+        contents,posting_id,post_time,header,title,full_section
+      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+    )
   end
@@ -188,6 +193,15 @@ class CraigScrape::Posting < CraigScrape::Scraper
     @deleted_by_author
   end
+  # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
+  def posting_has_expired?
+    @posting_has_expired = (
+      system_post? and header_as_plain == "This posting has expired."
+    ) if @posting_has_expired.nil?
+    @posting_has_expired
+  end
   # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
   # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.

data/test/post_samples/this_post_has_expired.html ADDED Viewed

@@ -0,0 +1,48 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+	<title></title>
+	<meta name="robots" content="NOARCHIVE,NOFOLLOW">
+	<link type="text/css" rel="stylesheet" media="all" href="http://www.craigslist.org/styles/craigslist.css?v=8">
+</head>
+<body class="posting">
+<div class="bchead">
+	<a href="http://charleston.craigslist.org/">charleston craigslist</a> &gt;
+	 <a href="http://charleston.craigslist.org/sss/">for sale / wanted</a> &gt;
+    <a href="http://charleston.craigslist.org/cto/">cars &amp; trucks - by owner</a>
+</div>
+<hr>
+<br>
+<br>
+<h2>This posting has expired.</h2>
+<h5>(The title on the listings page will be removed in just a few minutes.)</h5>
+<br><br>
+<hr>
+<ul class="clfooter">
+	<li>Copyright &copy; 2011 craigslist, inc.</li>
+	<li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
+	<li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
+	<li><a href="/forums/?forumID=8">feedback forum</a></li>
+</ul>
+<script type="text/javascript" src="http://www.craigslist.org/js/jquery-1.4.2.js"></script>
+<script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
+<script type="text/javascript"><!--
+	pID = 1968731193;
+-->
+</script>
+</body>
+</html>

data/test/test_craigslist_posting.rb CHANGED Viewed

@@ -388,4 +388,25 @@ EOD
     assert_equal "2008 GMC Sierra 2500HD", posting_090610.title
   end
+  def test_expired_post
+    posting_expired = CraigScrape::Posting.new relative_uri_for('post_samples/this_post_has_expired.html')
+    assert_equal true, posting_expired.posting_has_expired?
+    assert_equal true, posting_expired.system_post?
+    assert_equal nil, posting_expired.contents
+    assert_equal ["charleston craigslist", "for sale / wanted", "cars & trucks - by owner" ], posting_expired.full_section
+    assert_equal "This posting has expired.", posting_expired.header
+    assert_equal nil, posting_expired.label
+    assert_equal nil, posting_expired.title
+    assert_equal nil, posting_expired.location
+    assert_equal nil, posting_expired.posting_id
+    assert_equal nil, posting_expired.reply_to
+    assert_equal nil, posting_expired.post_time
+    assert_equal [],  posting_expired.pics
+    assert_equal nil, posting_expired.contents_as_plain
+    assert_equal nil, posting_expired.price
+    assert_equal [], posting_expired.images
+    assert_equal [], posting_expired.img_types
+  end
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: libcraigscrape
 version: !ruby/object:Gem::Version
-  hash: 55
+  hash: 57
   prerelease: false
   segments:
   - 0
-  - 8
-  - 4
-  version: 0.8.4
+  - 9
+  - 1
+  version: 0.9.1
 platform: ruby
 authors:
 - Chris DeRose, DeRose Technologies, Inc.
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-09-06 00:00:00 -04:00
+date: 2011-01-05 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -24,12 +24,13 @@ dependencies:
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - ">="
+    - - ">"
       - !ruby/object:Gem::Version
-        hash: 3
+        hash: 27
         segments:
         - 0
-        version: "0"
+        - 8
+        version: "0.8"
   type: :runtime
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
@@ -40,10 +41,12 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
+        hash: 63
         segments:
+        - 4
         - 0
-        version: "0"
+        - 0
+        version: 4.0.0
   type: :runtime
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
@@ -56,10 +59,40 @@ dependencies:
       - !ruby/object:Gem::Version
         hash: 3
         segments:
+        - 2
+        - 3
         - 0
-        version: "0"
+        version: 2.3.0
+    - - <
+      - !ruby/object:Gem::Version
+        hash: 5
+        segments:
+        - 3
+        version: "3"
   type: :runtime
   version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: activerecord
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 2
+        - 3
+        - 0
+        version: 2.3.0
+    - - <
+      - !ruby/object:Gem::Version
+        hash: 5
+        segments:
+        - 3
+        version: "3"
+  type: :runtime
+  version_requirements: *id004
 description: quick, easy, craigslist parsing library that takes the monotony out of working with craigslist posts and listings
 email: cderose@derosetechnologies.com
 executables:
@@ -114,6 +147,7 @@ files:
 - test/post_samples/posting5.html
 - test/post_samples/posting1796890756-061710.html
 - test/post_samples/posting3.html
+- test/post_samples/this_post_has_expired.html
 - test/post_samples/posting1808219423.html
 - test/post_samples/sfbay_art_1223614914.html
 - test/post_samples/this_post_has_been_deleted_by_its_author.html