libcraigscrape 0.8.4 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,8 +1,20 @@
1
1
  == Change Log
2
2
 
3
- === Release 0.8.4 (TODO)
3
+ === Release 0.9.1
4
+ - Added support for posting_has_expired? and expired post recognition
5
+ - Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? was encountered when using certain (minimal) filtering
6
+
7
+ === Release 0.9 (Oct 01, 2010)
8
+ - Minor adjustments to craigwatch to fix deprecation warnings in new ActiveRecord and ActionMailer gems
9
+ - Added gem version specifiers to the Gem spec and to the require statements
10
+ - Moved repo to github
11
+ - Fixed an esoteric bug in craigwatch, affecting the last scraped post in a listing when that post was 'flagged for removal'.
12
+ - Took all those extra package-building tasks out of the Rakefile since this is 2010 and we only party with gemfiles
13
+ - Ruby 1.9 compatibility adjustments
14
+
15
+ === Release 0.8.4 (Sep 6, 2010)
4
16
  - Someone found a way to screw up hpricot's to_s method (posting1938291834-090610.html) and fixed by adding html_source to the craigslist Scraper object, which returns the body of the post without passing it through hpricot. It's a better way to go anyways, and re-wrote a couple incidentals to use the html_source method...
5
- - Adjusted the test cases a bit, since the user bodies being returned have slightly less cleanup in their output than they had prior
17
+ - Adjusted the test cases a bit, since the user bodies being returned have less cleanup in their output than they had prior
6
18
 
7
19
  === Release 0.8.3 (August 2, 2010)
8
20
  - Someone was posting really bad html that was screwing up Hpricot. Such is to be expected when you're soliciting html from the general public I suppose. Added test_bugs_found061710 posting test, and fixed by stripping out the user body before parsing with Hpricot.
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
11
11
  RbConfig = Config unless defined? RbConfig
12
12
 
13
13
  NAME = "libcraigscrape"
14
- VERS = ENV['VERSION'] || "0.8.4"
14
+ VERS = ENV['VERSION'] || "0.9.1"
15
15
  PKG = "#{NAME}-#{VERS}"
16
16
 
17
17
  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -37,9 +37,10 @@ SPEC =
37
37
  s.files = PKG_FILES
38
38
  s.require_paths = ["lib"]
39
39
  s.test_files = FileList['test/test_*.rb']
40
- s.add_dependency 'hpricot'
41
- s.add_dependency 'htmlentities'
42
- s.add_dependency 'activesupport'
40
+ s.add_dependency 'hpricot', '> 0.8'
41
+ s.add_dependency 'htmlentities', '>= 4.0.0'
42
+ s.add_dependency 'activesupport','>= 2.3.0', '< 3'
43
+ s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
43
44
  end
44
45
 
45
46
  desc "Run all the tests"
@@ -59,9 +60,9 @@ end
59
60
 
60
61
  Rake::GemPackageTask.new(SPEC) do |p|
61
62
  p.need_tar = false
62
- p.need_tar_gz = true
63
- p.need_tar_bz2 = true
64
- p.need_zip = true
63
+ p.need_tar_gz = false
64
+ p.need_tar_bz2 = false
65
+ p.need_zip = false
65
66
  p.gem_spec = SPEC
66
67
  end
67
68
 
data/bin/craigwatch CHANGED
@@ -159,10 +159,15 @@
159
159
  $: << File.dirname(__FILE__) + '/../lib'
160
160
 
161
161
  require 'rubygems'
162
+
163
+ gem 'kwalify', '~> 0.7'
164
+ gem 'activerecord', '~> 2.3'
165
+ gem 'actionmailer', '~> 2.3'
166
+
162
167
  require 'kwalify'
163
- require 'kwalify/util/hashlike'
164
168
  require 'active_record'
165
169
  require 'action_mailer'
170
+ require 'kwalify/util/hashlike'
166
171
  require 'libcraigscrape'
167
172
  require "socket"
168
173
 
@@ -179,9 +184,9 @@ class String #:nodoc:
179
184
 
180
185
  options.each_char do |c|
181
186
  mods |= case c
182
- when 'i': Regexp::IGNORECASE
183
- when 'x': Regexp::EXTENDED
184
- when 'm': Regexp::MULTILINE
187
+ when 'i' then Regexp::IGNORECASE
188
+ when 'x' then Regexp::EXTENDED
189
+ when 'm' then Regexp::MULTILINE
185
190
  end
186
191
  end unless options.nil? or options.empty?
187
192
 
@@ -217,7 +222,7 @@ class CraigReportDefinition #:nodoc:
217
222
  # We'll setup a SQLite db using some defaults if needed
218
223
  @tracking_database ||= {
219
224
  :adapter => 'sqlite3',
220
- :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
225
+ :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
221
226
  } if for_yaml_file
222
227
 
223
228
  # This is a little hack to make sqlite definitions a little more portable, by allowing them
@@ -472,14 +477,20 @@ report_summaries = craig_report.searches.collect do |search|
472
477
  # We'll use this in the loop to decide what posts to track:
473
478
  newest_post_date = last_tracked_at
474
479
 
480
+ # We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
481
+ # but have no post.post_date since the posting was removed and it parsed to nil
482
+ most_recent_posting_date = Time.now
483
+
475
484
  # OK - Now let's go!
476
485
  catch :list_break do
477
486
  while listing
478
487
  listing.posts.each do |post|
479
488
  begin
489
+ most_recent_posting_date = post.post_date if post.post_date
490
+
480
491
  # Are we at a point in the scrape, past which we don't need to proceed?
481
492
  throw :list_break if (
482
- post.post_date < last_tracked_at or
493
+ most_recent_posting_date < last_tracked_at or
483
494
  already_tracked_urls.include? post.url
484
495
  )
485
496
 
@@ -500,7 +511,7 @@ report_summaries = craig_report.searches.collect do |search|
500
511
  # Now let's see if the url should be kept in our tracking database for the future...
501
512
 
502
513
  # This post-date sets a limit for the tracked_listing.posts.create below
503
- newest_post_date = post.post_date if post.post_date > newest_post_date
514
+ newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
504
515
 
505
516
  # Now let's add these urls to the database so as to reduce memory overhead.
506
517
  # Keep in mind - they're not active until the email goes out.
@@ -509,7 +520,7 @@ report_summaries = craig_report.searches.collect do |search|
509
520
  tracked_listing.posts.create(
510
521
  :url => post.url,
511
522
  :created_at => newest_post_date
512
- ) unless post.post_date < newest_post_date
523
+ ) unless most_recent_posting_date < newest_post_date
513
524
 
514
525
  end
515
526
 
@@ -518,8 +529,18 @@ report_summaries = craig_report.searches.collect do |search|
518
529
  end
519
530
  end
520
531
 
532
+
533
+
521
534
  # Let's flatten the unique'd hash into a more useable array:
522
- new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
535
+ # NOTE: The reason we included a reject is a little complicated, but here's the gist:
536
+ # * We try not to load the whole post if we don't have to
537
+ # * It's possible that we met all the criteria of the passes_filter? with merely a header, and
538
+ # if so we add a url to the summaries stack
539
+ # * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
540
+ # or flagged_for_removal?, etc.
541
+ # * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
542
+ # * So - before we sort, we run a quick reject on nil post_dates
543
+ new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
523
544
 
524
545
  # Now Let's manage the tracking database:
525
546
  if new_summaries.length > 0
@@ -2,13 +2,18 @@
2
2
  #
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
+ require 'rubygems'
6
+
7
+ gem 'activesupport', '~> 2.3'
8
+ gem 'hpricot', '~> 0.8'
9
+ gem 'htmlentities', '~> 4.0.0'
10
+
5
11
  require 'net/http'
6
12
  require 'zlib'
7
-
8
- require 'rubygems'
9
- require 'active_support'
10
13
  require 'hpricot'
11
14
  require 'htmlentities'
15
+ require 'active_support'
16
+
12
17
 
13
18
  # A base class encapsulating the various libcraigscrape objects, and providing most of the
14
19
  # craigslist interaction methods. Currently, we're supporting the old Class methods
data/lib/posting.rb CHANGED
@@ -30,9 +30,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
30
30
  super(*args)
31
31
 
32
32
  # Validate that required fields are present, at least - if we've downloaded it from a url
33
- parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
34
- contents,posting_id,post_time,header,title,full_section
35
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
33
+ parse_error! if (
34
+ args.first.kind_of? String and
35
+ !flagged_for_removal? and
36
+ !posting_has_expired? and
37
+ !deleted_by_author? and [
38
+ contents,posting_id,post_time,header,title,full_section
39
+ ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
40
+ )
36
41
  end
37
42
 
38
43
 
@@ -188,6 +193,15 @@ class CraigScrape::Posting < CraigScrape::Scraper
188
193
  @deleted_by_author
189
194
  end
190
195
 
196
+ # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
197
+ def posting_has_expired?
198
+ @posting_has_expired = (
199
+ system_post? and header_as_plain == "This posting has expired."
200
+ ) if @posting_has_expired.nil?
201
+
202
+ @posting_has_expired
203
+ end
204
+
191
205
 
192
206
  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
193
207
  # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
@@ -0,0 +1,48 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title></title>
5
+ <meta name="robots" content="NOARCHIVE,NOFOLLOW">
6
+ <link type="text/css" rel="stylesheet" media="all" href="http://www.craigslist.org/styles/craigslist.css?v=8">
7
+ </head>
8
+
9
+ <body class="posting">
10
+
11
+
12
+ <div class="bchead">
13
+
14
+ <a href="http://charleston.craigslist.org/">charleston craigslist</a> &gt;
15
+
16
+ <a href="http://charleston.craigslist.org/sss/">for sale / wanted</a> &gt;
17
+ <a href="http://charleston.craigslist.org/cto/">cars &amp; trucks - by owner</a>
18
+ </div>
19
+
20
+
21
+
22
+
23
+
24
+ <hr>
25
+ <br>
26
+ <br>
27
+ <h2>This posting has expired.</h2>
28
+ <h5>(The title on the listings page will be removed in just a few minutes.)</h5>
29
+
30
+ <br><br>
31
+
32
+ <hr>
33
+ <ul class="clfooter">
34
+ <li>Copyright &copy; 2011 craigslist, inc.</li>
35
+ <li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
36
+ <li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
37
+ <li><a href="/forums/?forumID=8">feedback forum</a></li>
38
+ </ul>
39
+
40
+ <script type="text/javascript" src="http://www.craigslist.org/js/jquery-1.4.2.js"></script>
41
+ <script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
42
+ <script type="text/javascript"><!--
43
+ pID = 1968731193;
44
+ -->
45
+ </script>
46
+ </body>
47
+ </html>
48
+
@@ -388,4 +388,25 @@ EOD
388
388
  assert_equal "2008 GMC Sierra 2500HD", posting_090610.title
389
389
  end
390
390
 
391
+ def test_expired_post
392
+ posting_expired = CraigScrape::Posting.new relative_uri_for('post_samples/this_post_has_expired.html')
393
+ assert_equal true, posting_expired.posting_has_expired?
394
+ assert_equal true, posting_expired.system_post?
395
+ assert_equal nil, posting_expired.contents
396
+ assert_equal ["charleston craigslist", "for sale / wanted", "cars & trucks - by owner" ], posting_expired.full_section
397
+ assert_equal "This posting has expired.", posting_expired.header
398
+ assert_equal nil, posting_expired.label
399
+ assert_equal nil, posting_expired.title
400
+ assert_equal nil, posting_expired.location
401
+ assert_equal nil, posting_expired.posting_id
402
+ assert_equal nil, posting_expired.reply_to
403
+ assert_equal nil, posting_expired.post_time
404
+ assert_equal [], posting_expired.pics
405
+ assert_equal nil, posting_expired.contents_as_plain
406
+ assert_equal nil, posting_expired.price
407
+ assert_equal [], posting_expired.images
408
+ assert_equal [], posting_expired.img_types
409
+
410
+ end
411
+
391
412
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libcraigscrape
3
3
  version: !ruby/object:Gem::Version
4
- hash: 55
4
+ hash: 57
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 8
9
- - 4
10
- version: 0.8.4
8
+ - 9
9
+ - 1
10
+ version: 0.9.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Chris DeRose, DeRose Technologies, Inc.
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-06 00:00:00 -04:00
18
+ date: 2011-01-05 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -24,12 +24,13 @@ dependencies:
24
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ">="
27
+ - - ">"
28
28
  - !ruby/object:Gem::Version
29
- hash: 3
29
+ hash: 27
30
30
  segments:
31
31
  - 0
32
- version: "0"
32
+ - 8
33
+ version: "0.8"
33
34
  type: :runtime
34
35
  version_requirements: *id001
35
36
  - !ruby/object:Gem::Dependency
@@ -40,10 +41,12 @@ dependencies:
40
41
  requirements:
41
42
  - - ">="
42
43
  - !ruby/object:Gem::Version
43
- hash: 3
44
+ hash: 63
44
45
  segments:
46
+ - 4
45
47
  - 0
46
- version: "0"
48
+ - 0
49
+ version: 4.0.0
47
50
  type: :runtime
48
51
  version_requirements: *id002
49
52
  - !ruby/object:Gem::Dependency
@@ -56,10 +59,40 @@ dependencies:
56
59
  - !ruby/object:Gem::Version
57
60
  hash: 3
58
61
  segments:
62
+ - 2
63
+ - 3
59
64
  - 0
60
- version: "0"
65
+ version: 2.3.0
66
+ - - <
67
+ - !ruby/object:Gem::Version
68
+ hash: 5
69
+ segments:
70
+ - 3
71
+ version: "3"
61
72
  type: :runtime
62
73
  version_requirements: *id003
74
+ - !ruby/object:Gem::Dependency
75
+ name: activerecord
76
+ prerelease: false
77
+ requirement: &id004 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 2
85
+ - 3
86
+ - 0
87
+ version: 2.3.0
88
+ - - <
89
+ - !ruby/object:Gem::Version
90
+ hash: 5
91
+ segments:
92
+ - 3
93
+ version: "3"
94
+ type: :runtime
95
+ version_requirements: *id004
63
96
  description: quick, easy, craigslist parsing library that takes the monotony out of working with craigslist posts and listings
64
97
  email: cderose@derosetechnologies.com
65
98
  executables:
@@ -114,6 +147,7 @@ files:
114
147
  - test/post_samples/posting5.html
115
148
  - test/post_samples/posting1796890756-061710.html
116
149
  - test/post_samples/posting3.html
150
+ - test/post_samples/this_post_has_expired.html
117
151
  - test/post_samples/posting1808219423.html
118
152
  - test/post_samples/sfbay_art_1223614914.html
119
153
  - test/post_samples/this_post_has_been_deleted_by_its_author.html