libcraigscrape 0.8.4 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,8 +1,20 @@
1
1
  == Change Log
2
2
 
3
- === Release 0.8.4 (TODO)
3
+ === Release 0.9.1
4
+ - Added support for posting_has_expired? and expired post recognition
5
+ - Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? post was encountered when using certain (minimal) filtering
6
+
7
+ === Release 0.9 (Oct 01, 2010)
8
+ - Minor adjustments to craigwatch to fix deprecation warnings in new ActiveRecord and ActionMailer gems
9
+ - Added gem version specifiers to the Gem spec and to the require statements
10
+ - Moved repo to github
11
+ - Fixed an esoteric bug in craigwatch, affecting the last scraped post in a listing when that post was 'flagged for removal'.
12
+ - Took all those extra package-building tasks out of the Rakefile since this is 2010 and we only party with gemfiles
13
+ - Ruby 1.9 compatibility adjustments
14
+
15
+ === Release 0.8.4 (Sep 6, 2010)
4
16
  - Someone found a way to screw up hpricot's to_s method (posting1938291834-090610.html) and fixed by adding html_source to the craigslist Scraper object, which returns the body of the post without passing it through hpricot. It's a better way to go anyways, and re-wrote a couple incidentals to use the html_source method...
5
- - Adjusted the test cases a bit, since the user bodies being returned have slightly less cleanup in their output than they had prior
17
+ - Adjusted the test cases a bit, since the user bodies being returned have less cleanup in their output than they had prior
6
18
 
7
19
  === Release 0.8.3 (August 2, 2010)
8
20
  - Someone was posting really bad html that was screwing up Hpricot. Such is to be expected when you're soliciting html from the general public I suppose. Added test_bugs_found061710 posting test, and fixed by stripping out the user body before parsing with Hpricot.
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ include FileUtils
11
11
  RbConfig = Config unless defined? RbConfig
12
12
 
13
13
  NAME = "libcraigscrape"
14
- VERS = ENV['VERSION'] || "0.8.4"
14
+ VERS = ENV['VERSION'] || "0.9.1"
15
15
  PKG = "#{NAME}-#{VERS}"
16
16
 
17
17
  RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
@@ -37,9 +37,10 @@ SPEC =
37
37
  s.files = PKG_FILES
38
38
  s.require_paths = ["lib"]
39
39
  s.test_files = FileList['test/test_*.rb']
40
- s.add_dependency 'hpricot'
41
- s.add_dependency 'htmlentities'
42
- s.add_dependency 'activesupport'
40
+ s.add_dependency 'hpricot', '> 0.8'
41
+ s.add_dependency 'htmlentities', '>= 4.0.0'
42
+ s.add_dependency 'activesupport','>= 2.3.0', '< 3'
43
+ s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
43
44
  end
44
45
 
45
46
  desc "Run all the tests"
@@ -59,9 +60,9 @@ end
59
60
 
60
61
  Rake::GemPackageTask.new(SPEC) do |p|
61
62
  p.need_tar = false
62
- p.need_tar_gz = true
63
- p.need_tar_bz2 = true
64
- p.need_zip = true
63
+ p.need_tar_gz = false
64
+ p.need_tar_bz2 = false
65
+ p.need_zip = false
65
66
  p.gem_spec = SPEC
66
67
  end
67
68
 
data/bin/craigwatch CHANGED
@@ -159,10 +159,15 @@
159
159
  $: << File.dirname(__FILE__) + '/../lib'
160
160
 
161
161
  require 'rubygems'
162
+
163
+ gem 'kwalify', '~> 0.7'
164
+ gem 'activerecord', '~> 2.3'
165
+ gem 'actionmailer', '~> 2.3'
166
+
162
167
  require 'kwalify'
163
- require 'kwalify/util/hashlike'
164
168
  require 'active_record'
165
169
  require 'action_mailer'
170
+ require 'kwalify/util/hashlike'
166
171
  require 'libcraigscrape'
167
172
  require "socket"
168
173
 
@@ -179,9 +184,9 @@ class String #:nodoc:
179
184
 
180
185
  options.each_char do |c|
181
186
  mods |= case c
182
- when 'i': Regexp::IGNORECASE
183
- when 'x': Regexp::EXTENDED
184
- when 'm': Regexp::MULTILINE
187
+ when 'i' then Regexp::IGNORECASE
188
+ when 'x' then Regexp::EXTENDED
189
+ when 'm' then Regexp::MULTILINE
185
190
  end
186
191
  end unless options.nil? or options.empty?
187
192
 
@@ -217,7 +222,7 @@ class CraigReportDefinition #:nodoc:
217
222
  # We'll setup a SQLite db using some defaults if needed
218
223
  @tracking_database ||= {
219
224
  :adapter => 'sqlite3',
220
- :dbfile => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
225
+ :database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
221
226
  } if for_yaml_file
222
227
 
223
228
  # This is a little hack to make sqlite definitions a little more portable, by allowing them
@@ -472,14 +477,20 @@ report_summaries = craig_report.searches.collect do |search|
472
477
  # We'll use this in the loop to decide what posts to track:
473
478
  newest_post_date = last_tracked_at
474
479
 
480
+ # We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
481
+ # but have no post.post_date since the posting was removed and it parsed to nil
482
+ most_recent_posting_date = Time.now
483
+
475
484
  # OK - Now let's go!
476
485
  catch :list_break do
477
486
  while listing
478
487
  listing.posts.each do |post|
479
488
  begin
489
+ most_recent_posting_date = post.post_date if post.post_date
490
+
480
491
  # Are we at a point in the scrape, past which we don't need to proceed?
481
492
  throw :list_break if (
482
- post.post_date < last_tracked_at or
493
+ most_recent_posting_date < last_tracked_at or
483
494
  already_tracked_urls.include? post.url
484
495
  )
485
496
 
@@ -500,7 +511,7 @@ report_summaries = craig_report.searches.collect do |search|
500
511
  # Now let's see if the url should be kept in our tracking database for the future...
501
512
 
502
513
  # This post-date sets a limit for the tracked_listing.posts.create below
503
- newest_post_date = post.post_date if post.post_date > newest_post_date
514
+ newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
504
515
 
505
516
  # Now let's add these urls to the database so as to reduce memory overhead.
506
517
  # Keep in mind - they're not active until the email goes out.
@@ -509,7 +520,7 @@ report_summaries = craig_report.searches.collect do |search|
509
520
  tracked_listing.posts.create(
510
521
  :url => post.url,
511
522
  :created_at => newest_post_date
512
- ) unless post.post_date < newest_post_date
523
+ ) unless most_recent_posting_date < newest_post_date
513
524
 
514
525
  end
515
526
 
@@ -518,8 +529,18 @@ report_summaries = craig_report.searches.collect do |search|
518
529
  end
519
530
  end
520
531
 
532
+
533
+
521
534
  # Let's flatten the unique'd hash into a more useable array:
522
- new_summaries = new_summaries.values.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
535
+ # NOTE: The reason we included a reject is a little complicated, but here's the gist:
536
+ # * We try not to load the whole post if we don't have to
537
+ # * It's possible that we met all the criteria of the passes_filter? with merely a header, and
538
+ # if so we add a url to the summaries stack
539
+ # * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
540
+ # or flagged_for_removal?, etc.
541
+ # * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
542
+ # * So - before we sort, we run a quick reject on nil post_dates
543
+ new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
523
544
 
524
545
  # Now Let's manage the tracking database:
525
546
  if new_summaries.length > 0
@@ -2,13 +2,18 @@
2
2
  #
3
3
  # All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
4
4
  #
5
+ require 'rubygems'
6
+
7
+ gem 'activesupport', '~> 2.3'
8
+ gem 'hpricot', '~> 0.8'
9
+ gem 'htmlentities', '~> 4.0.0'
10
+
5
11
  require 'net/http'
6
12
  require 'zlib'
7
-
8
- require 'rubygems'
9
- require 'active_support'
10
13
  require 'hpricot'
11
14
  require 'htmlentities'
15
+ require 'active_support'
16
+
12
17
 
13
18
  # A base class encapsulating the various libcraigscrape objects, and providing most of the
14
19
  # craigslist interaction methods. Currently, we're supporting the old Class methods
data/lib/posting.rb CHANGED
@@ -30,9 +30,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
30
30
  super(*args)
31
31
 
32
32
  # Validate that required fields are present, at least - if we've downloaded it from a url
33
- parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
34
- contents,posting_id,post_time,header,title,full_section
35
- ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
33
+ parse_error! if (
34
+ args.first.kind_of? String and
35
+ !flagged_for_removal? and
36
+ !posting_has_expired? and
37
+ !deleted_by_author? and [
38
+ contents,posting_id,post_time,header,title,full_section
39
+ ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
40
+ )
36
41
  end
37
42
 
38
43
 
@@ -188,6 +193,15 @@ class CraigScrape::Posting < CraigScrape::Scraper
188
193
  @deleted_by_author
189
194
  end
190
195
 
196
+ # Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
197
+ def posting_has_expired?
198
+ @posting_has_expired = (
199
+ system_post? and header_as_plain == "This posting has expired."
200
+ ) if @posting_has_expired.nil?
201
+
202
+ @posting_has_expired
203
+ end
204
+
191
205
 
192
206
  # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
193
207
  # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
@@ -0,0 +1,48 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title></title>
5
+ <meta name="robots" content="NOARCHIVE,NOFOLLOW">
6
+ <link type="text/css" rel="stylesheet" media="all" href="http://www.craigslist.org/styles/craigslist.css?v=8">
7
+ </head>
8
+
9
+ <body class="posting">
10
+
11
+
12
+ <div class="bchead">
13
+
14
+ <a href="http://charleston.craigslist.org/">charleston craigslist</a> &gt;
15
+
16
+ <a href="http://charleston.craigslist.org/sss/">for sale / wanted</a> &gt;
17
+ <a href="http://charleston.craigslist.org/cto/">cars &amp; trucks - by owner</a>
18
+ </div>
19
+
20
+
21
+
22
+
23
+
24
+ <hr>
25
+ <br>
26
+ <br>
27
+ <h2>This posting has expired.</h2>
28
+ <h5>(The title on the listings page will be removed in just a few minutes.)</h5>
29
+
30
+ <br><br>
31
+
32
+ <hr>
33
+ <ul class="clfooter">
34
+ <li>Copyright &copy; 2011 craigslist, inc.</li>
35
+ <li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
36
+ <li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
37
+ <li><a href="/forums/?forumID=8">feedback forum</a></li>
38
+ </ul>
39
+
40
+ <script type="text/javascript" src="http://www.craigslist.org/js/jquery-1.4.2.js"></script>
41
+ <script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
42
+ <script type="text/javascript"><!--
43
+ pID = 1968731193;
44
+ -->
45
+ </script>
46
+ </body>
47
+ </html>
48
+
@@ -388,4 +388,25 @@ EOD
388
388
  assert_equal "2008 GMC Sierra 2500HD", posting_090610.title
389
389
  end
390
390
 
391
+ def test_expired_post
392
+ posting_expired = CraigScrape::Posting.new relative_uri_for('post_samples/this_post_has_expired.html')
393
+ assert_equal true, posting_expired.posting_has_expired?
394
+ assert_equal true, posting_expired.system_post?
395
+ assert_equal nil, posting_expired.contents
396
+ assert_equal ["charleston craigslist", "for sale / wanted", "cars & trucks - by owner" ], posting_expired.full_section
397
+ assert_equal "This posting has expired.", posting_expired.header
398
+ assert_equal nil, posting_expired.label
399
+ assert_equal nil, posting_expired.title
400
+ assert_equal nil, posting_expired.location
401
+ assert_equal nil, posting_expired.posting_id
402
+ assert_equal nil, posting_expired.reply_to
403
+ assert_equal nil, posting_expired.post_time
404
+ assert_equal [], posting_expired.pics
405
+ assert_equal nil, posting_expired.contents_as_plain
406
+ assert_equal nil, posting_expired.price
407
+ assert_equal [], posting_expired.images
408
+ assert_equal [], posting_expired.img_types
409
+
410
+ end
411
+
391
412
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libcraigscrape
3
3
  version: !ruby/object:Gem::Version
4
- hash: 55
4
+ hash: 57
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 8
9
- - 4
10
- version: 0.8.4
8
+ - 9
9
+ - 1
10
+ version: 0.9.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Chris DeRose, DeRose Technologies, Inc.
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-06 00:00:00 -04:00
18
+ date: 2011-01-05 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -24,12 +24,13 @@ dependencies:
24
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ">="
27
+ - - ">"
28
28
  - !ruby/object:Gem::Version
29
- hash: 3
29
+ hash: 27
30
30
  segments:
31
31
  - 0
32
- version: "0"
32
+ - 8
33
+ version: "0.8"
33
34
  type: :runtime
34
35
  version_requirements: *id001
35
36
  - !ruby/object:Gem::Dependency
@@ -40,10 +41,12 @@ dependencies:
40
41
  requirements:
41
42
  - - ">="
42
43
  - !ruby/object:Gem::Version
43
- hash: 3
44
+ hash: 63
44
45
  segments:
46
+ - 4
45
47
  - 0
46
- version: "0"
48
+ - 0
49
+ version: 4.0.0
47
50
  type: :runtime
48
51
  version_requirements: *id002
49
52
  - !ruby/object:Gem::Dependency
@@ -56,10 +59,40 @@ dependencies:
56
59
  - !ruby/object:Gem::Version
57
60
  hash: 3
58
61
  segments:
62
+ - 2
63
+ - 3
59
64
  - 0
60
- version: "0"
65
+ version: 2.3.0
66
+ - - <
67
+ - !ruby/object:Gem::Version
68
+ hash: 5
69
+ segments:
70
+ - 3
71
+ version: "3"
61
72
  type: :runtime
62
73
  version_requirements: *id003
74
+ - !ruby/object:Gem::Dependency
75
+ name: activerecord
76
+ prerelease: false
77
+ requirement: &id004 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 2
85
+ - 3
86
+ - 0
87
+ version: 2.3.0
88
+ - - <
89
+ - !ruby/object:Gem::Version
90
+ hash: 5
91
+ segments:
92
+ - 3
93
+ version: "3"
94
+ type: :runtime
95
+ version_requirements: *id004
63
96
  description: quick, easy, craigslist parsing library that takes the monotony out of working with craigslist posts and listings
64
97
  email: cderose@derosetechnologies.com
65
98
  executables:
@@ -114,6 +147,7 @@ files:
114
147
  - test/post_samples/posting5.html
115
148
  - test/post_samples/posting1796890756-061710.html
116
149
  - test/post_samples/posting3.html
150
+ - test/post_samples/this_post_has_expired.html
117
151
  - test/post_samples/posting1808219423.html
118
152
  - test/post_samples/sfbay_art_1223614914.html
119
153
  - test/post_samples/this_post_has_been_deleted_by_its_author.html