libcraigscrape 0.8.4 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +14 -2
- data/Rakefile +8 -7
- data/bin/craigwatch +30 -9
- data/lib/libcraigscrape.rb +8 -3
- data/lib/posting.rb +17 -3
- data/test/post_samples/this_post_has_expired.html +48 -0
- data/test/test_craigslist_posting.rb +21 -0
- metadata +45 -11
data/CHANGELOG
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
-
=== Release 0.8.4 (Sep 6, 2010)
|
3
|
+
=== Release 0.9.1
|
4
|
+
- Added support for posting_has_expired? and expired post recognition
|
5
|
+
- Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? was encountered when using certain (minimal) filtering
|
6
|
+
|
7
|
+
=== Release 0.9 (Oct 01, 2010)
|
8
|
+
- Minor adjustments to craigwatch to fix deprecation warnings in new ActiveRecord and ActionMailer gems
|
9
|
+
- Added gem version specifiers to the Gem spec and to the require statements
|
10
|
+
- Moved repo to github
|
11
|
+
- Fixed an esoteric bug in craigwatch, affecting the last scraped post in a listing when that post was 'flagged for removal'.
|
12
|
+
- Took all those extra package-building tasks out of the Rakefile since this is 2010 and we only party with gemfiles
|
13
|
+
- Ruby 1.9 compatibility adjustments
|
14
|
+
|
15
|
+
=== Release 0.8.4 (Sep 6, 2010)
|
4
16
|
- Someone found a way to screw up hpricot's to_s method (posting1938291834-090610.html) and fixed by adding html_source to the craigslist Scraper object, which returns the body of the post without passing it through hpricot. It's a better way to go anyway, and re-wrote a couple incidentals to use the html_source method...
|
5
|
-
- Adjusted the test cases a bit, since the user bodies being returned have
|
17
|
+
- Adjusted the test cases a bit, since the user bodies being returned have less cleanup in their output than they had prior
|
6
18
|
|
7
19
|
=== Release 0.8.3 (August 2, 2010)
|
8
20
|
- Someone was posting really bad html that was screwing up Hpricot. Such is to be expected when you're soliciting html from the general public I suppose. Added test_bugs_found061710 posting test, and fixed by stripping out the user body before parsing with Hpricot.
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.
|
14
|
+
VERS = ENV['VERSION'] || "0.9.1"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
@@ -37,9 +37,10 @@ SPEC =
|
|
37
37
|
s.files = PKG_FILES
|
38
38
|
s.require_paths = ["lib"]
|
39
39
|
s.test_files = FileList['test/test_*.rb']
|
40
|
-
s.add_dependency 'hpricot'
|
41
|
-
s.add_dependency 'htmlentities'
|
42
|
-
s.add_dependency 'activesupport'
|
40
|
+
s.add_dependency 'hpricot', '> 0.8'
|
41
|
+
s.add_dependency 'htmlentities', '>= 4.0.0'
|
42
|
+
s.add_dependency 'activesupport','>= 2.3.0', '< 3'
|
43
|
+
s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
|
43
44
|
end
|
44
45
|
|
45
46
|
desc "Run all the tests"
|
@@ -59,9 +60,9 @@ end
|
|
59
60
|
|
60
61
|
Rake::GemPackageTask.new(SPEC) do |p|
|
61
62
|
p.need_tar = false
|
62
|
-
p.need_tar_gz =
|
63
|
-
p.need_tar_bz2 =
|
64
|
-
p.need_zip =
|
63
|
+
p.need_tar_gz = false
|
64
|
+
p.need_tar_bz2 = false
|
65
|
+
p.need_zip = false
|
65
66
|
p.gem_spec = SPEC
|
66
67
|
end
|
67
68
|
|
data/bin/craigwatch
CHANGED
@@ -159,10 +159,15 @@
|
|
159
159
|
$: << File.dirname(__FILE__) + '/../lib'
|
160
160
|
|
161
161
|
require 'rubygems'
|
162
|
+
|
163
|
+
gem 'kwalify', '~> 0.7'
|
164
|
+
gem 'activerecord', '~> 2.3'
|
165
|
+
gem 'actionmailer', '~> 2.3'
|
166
|
+
|
162
167
|
require 'kwalify'
|
163
|
-
require 'kwalify/util/hashlike'
|
164
168
|
require 'active_record'
|
165
169
|
require 'action_mailer'
|
170
|
+
require 'kwalify/util/hashlike'
|
166
171
|
require 'libcraigscrape'
|
167
172
|
require "socket"
|
168
173
|
|
@@ -179,9 +184,9 @@ class String #:nodoc:
|
|
179
184
|
|
180
185
|
options.each_char do |c|
|
181
186
|
mods |= case c
|
182
|
-
when 'i'
|
183
|
-
when 'x'
|
184
|
-
when 'm'
|
187
|
+
when 'i' then Regexp::IGNORECASE
|
188
|
+
when 'x' then Regexp::EXTENDED
|
189
|
+
when 'm' then Regexp::MULTILINE
|
185
190
|
end
|
186
191
|
end unless options.nil? or options.empty?
|
187
192
|
|
@@ -217,7 +222,7 @@ class CraigReportDefinition #:nodoc:
|
|
217
222
|
# We'll setup a SQLite db using some defaults if needed
|
218
223
|
@tracking_database ||= {
|
219
224
|
:adapter => 'sqlite3',
|
220
|
-
:
|
225
|
+
:database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
|
221
226
|
} if for_yaml_file
|
222
227
|
|
223
228
|
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
@@ -472,14 +477,20 @@ report_summaries = craig_report.searches.collect do |search|
|
|
472
477
|
# We'll use this in the loop to decide what posts to track:
|
473
478
|
newest_post_date = last_tracked_at
|
474
479
|
|
480
|
+
# We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
|
481
|
+
# but have no post.post_date since the posting was removed and it parsed to nil
|
482
|
+
most_recent_posting_date = Time.now
|
483
|
+
|
475
484
|
# OK - Now let's go!
|
476
485
|
catch :list_break do
|
477
486
|
while listing
|
478
487
|
listing.posts.each do |post|
|
479
488
|
begin
|
489
|
+
most_recent_posting_date = post.post_date if post.post_date
|
490
|
+
|
480
491
|
# Are we at a point in the scrape, past which we don't need to proceed?
|
481
492
|
throw :list_break if (
|
482
|
-
|
493
|
+
most_recent_posting_date < last_tracked_at or
|
483
494
|
already_tracked_urls.include? post.url
|
484
495
|
)
|
485
496
|
|
@@ -500,7 +511,7 @@ report_summaries = craig_report.searches.collect do |search|
|
|
500
511
|
# Now let's see if the url should be kept in our tracking database for the future...
|
501
512
|
|
502
513
|
# This post-date sets a limit for the tracked_listing.posts.create below
|
503
|
-
newest_post_date =
|
514
|
+
newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
|
504
515
|
|
505
516
|
# Now let's add these urls to the database so as to reduce memory overhead.
|
506
517
|
# Keep in mind - they're not active until the email goes out.
|
@@ -509,7 +520,7 @@ report_summaries = craig_report.searches.collect do |search|
|
|
509
520
|
tracked_listing.posts.create(
|
510
521
|
:url => post.url,
|
511
522
|
:created_at => newest_post_date
|
512
|
-
) unless
|
523
|
+
) unless most_recent_posting_date < newest_post_date
|
513
524
|
|
514
525
|
end
|
515
526
|
|
@@ -518,8 +529,18 @@ report_summaries = craig_report.searches.collect do |search|
|
|
518
529
|
end
|
519
530
|
end
|
520
531
|
|
532
|
+
|
533
|
+
|
521
534
|
# Let's flatten the unique'd hash into a more useable array:
|
522
|
-
|
535
|
+
# NOTE: The reason we included a reject is a little complicated, but here's the gist:
|
536
|
+
# * We try not to load the whole post if we don't have to
|
537
|
+
# * It's possible that we met all the criteria of the passes_filter? with merely a header, and
|
538
|
+
# if so we add a url to the summaries stack
|
539
|
+
# * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
|
540
|
+
# or flagged_for_removal?, etc.
|
541
|
+
# * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
|
542
|
+
# * So - before we sort, we run a quick reject on nil post_dates
|
543
|
+
new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
523
544
|
|
524
545
|
# Now Let's manage the tracking database:
|
525
546
|
if new_summaries.length > 0
|
data/lib/libcraigscrape.rb
CHANGED
@@ -2,13 +2,18 @@
|
|
2
2
|
#
|
3
3
|
# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
4
4
|
#
|
5
|
+
require 'rubygems'
|
6
|
+
|
7
|
+
gem 'activesupport', '~> 2.3'
|
8
|
+
gem 'hpricot', '~> 0.8'
|
9
|
+
gem 'htmlentities', '~> 4.0.0'
|
10
|
+
|
5
11
|
require 'net/http'
|
6
12
|
require 'zlib'
|
7
|
-
|
8
|
-
require 'rubygems'
|
9
|
-
require 'active_support'
|
10
13
|
require 'hpricot'
|
11
14
|
require 'htmlentities'
|
15
|
+
require 'active_support'
|
16
|
+
|
12
17
|
|
13
18
|
# A base class encapsulating the various libcraigscrape objects, and providing most of the
|
14
19
|
# craigslist interaction methods. Currently, we're supporting the old Class methods
|
data/lib/posting.rb
CHANGED
@@ -30,9 +30,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
30
|
super(*args)
|
31
31
|
|
32
32
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
parse_error! if
|
34
|
-
|
35
|
-
|
33
|
+
parse_error! if (
|
34
|
+
args.first.kind_of? String and
|
35
|
+
!flagged_for_removal? and
|
36
|
+
!posting_has_expired? and
|
37
|
+
!deleted_by_author? and [
|
38
|
+
contents,posting_id,post_time,header,title,full_section
|
39
|
+
].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
|
40
|
+
)
|
36
41
|
end
|
37
42
|
|
38
43
|
|
@@ -188,6 +193,15 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
188
193
|
@deleted_by_author
|
189
194
|
end
|
190
195
|
|
196
|
+
# Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
|
197
|
+
def posting_has_expired?
|
198
|
+
@posting_has_expired = (
|
199
|
+
system_post? and header_as_plain == "This posting has expired."
|
200
|
+
) if @posting_has_expired.nil?
|
201
|
+
|
202
|
+
@posting_has_expired
|
203
|
+
end
|
204
|
+
|
191
205
|
|
192
206
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
193
207
|
# used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
|
@@ -0,0 +1,48 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title></title>
|
5
|
+
<meta name="robots" content="NOARCHIVE,NOFOLLOW">
|
6
|
+
<link type="text/css" rel="stylesheet" media="all" href="http://www.craigslist.org/styles/craigslist.css?v=8">
|
7
|
+
</head>
|
8
|
+
|
9
|
+
<body class="posting">
|
10
|
+
|
11
|
+
|
12
|
+
<div class="bchead">
|
13
|
+
|
14
|
+
<a href="http://charleston.craigslist.org/">charleston craigslist</a> >
|
15
|
+
|
16
|
+
<a href="http://charleston.craigslist.org/sss/">for sale / wanted</a> >
|
17
|
+
<a href="http://charleston.craigslist.org/cto/">cars & trucks - by owner</a>
|
18
|
+
</div>
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
<hr>
|
25
|
+
<br>
|
26
|
+
<br>
|
27
|
+
<h2>This posting has expired.</h2>
|
28
|
+
<h5>(The title on the listings page will be removed in just a few minutes.)</h5>
|
29
|
+
|
30
|
+
<br><br>
|
31
|
+
|
32
|
+
<hr>
|
33
|
+
<ul class="clfooter">
|
34
|
+
<li>Copyright © 2011 craigslist, inc.</li>
|
35
|
+
<li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
|
36
|
+
<li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
|
37
|
+
<li><a href="/forums/?forumID=8">feedback forum</a></li>
|
38
|
+
</ul>
|
39
|
+
|
40
|
+
<script type="text/javascript" src="http://www.craigslist.org/js/jquery-1.4.2.js"></script>
|
41
|
+
<script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
|
42
|
+
<script type="text/javascript"><!--
|
43
|
+
pID = 1968731193;
|
44
|
+
-->
|
45
|
+
</script>
|
46
|
+
</body>
|
47
|
+
</html>
|
48
|
+
|
@@ -388,4 +388,25 @@ EOD
|
|
388
388
|
assert_equal "2008 GMC Sierra 2500HD", posting_090610.title
|
389
389
|
end
|
390
390
|
|
391
|
+
def test_expired_post
|
392
|
+
posting_expired = CraigScrape::Posting.new relative_uri_for('post_samples/this_post_has_expired.html')
|
393
|
+
assert_equal true, posting_expired.posting_has_expired?
|
394
|
+
assert_equal true, posting_expired.system_post?
|
395
|
+
assert_equal nil, posting_expired.contents
|
396
|
+
assert_equal ["charleston craigslist", "for sale / wanted", "cars & trucks - by owner" ], posting_expired.full_section
|
397
|
+
assert_equal "This posting has expired.", posting_expired.header
|
398
|
+
assert_equal nil, posting_expired.label
|
399
|
+
assert_equal nil, posting_expired.title
|
400
|
+
assert_equal nil, posting_expired.location
|
401
|
+
assert_equal nil, posting_expired.posting_id
|
402
|
+
assert_equal nil, posting_expired.reply_to
|
403
|
+
assert_equal nil, posting_expired.post_time
|
404
|
+
assert_equal [], posting_expired.pics
|
405
|
+
assert_equal nil, posting_expired.contents_as_plain
|
406
|
+
assert_equal nil, posting_expired.price
|
407
|
+
assert_equal [], posting_expired.images
|
408
|
+
assert_equal [], posting_expired.img_types
|
409
|
+
|
410
|
+
end
|
411
|
+
|
391
412
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libcraigscrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 57
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 9
|
9
|
+
- 1
|
10
|
+
version: 0.9.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Chris DeRose, DeRose Technologies, Inc.
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -24,12 +24,13 @@ dependencies:
|
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
|
-
- - "
|
27
|
+
- - ">"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
29
|
+
hash: 27
|
30
30
|
segments:
|
31
31
|
- 0
|
32
|
-
|
32
|
+
- 8
|
33
|
+
version: "0.8"
|
33
34
|
type: :runtime
|
34
35
|
version_requirements: *id001
|
35
36
|
- !ruby/object:Gem::Dependency
|
@@ -40,10 +41,12 @@ dependencies:
|
|
40
41
|
requirements:
|
41
42
|
- - ">="
|
42
43
|
- !ruby/object:Gem::Version
|
43
|
-
hash:
|
44
|
+
hash: 63
|
44
45
|
segments:
|
46
|
+
- 4
|
45
47
|
- 0
|
46
|
-
|
48
|
+
- 0
|
49
|
+
version: 4.0.0
|
47
50
|
type: :runtime
|
48
51
|
version_requirements: *id002
|
49
52
|
- !ruby/object:Gem::Dependency
|
@@ -56,10 +59,40 @@ dependencies:
|
|
56
59
|
- !ruby/object:Gem::Version
|
57
60
|
hash: 3
|
58
61
|
segments:
|
62
|
+
- 2
|
63
|
+
- 3
|
59
64
|
- 0
|
60
|
-
version:
|
65
|
+
version: 2.3.0
|
66
|
+
- - <
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
hash: 5
|
69
|
+
segments:
|
70
|
+
- 3
|
71
|
+
version: "3"
|
61
72
|
type: :runtime
|
62
73
|
version_requirements: *id003
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: activerecord
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 2
|
85
|
+
- 3
|
86
|
+
- 0
|
87
|
+
version: 2.3.0
|
88
|
+
- - <
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
hash: 5
|
91
|
+
segments:
|
92
|
+
- 3
|
93
|
+
version: "3"
|
94
|
+
type: :runtime
|
95
|
+
version_requirements: *id004
|
63
96
|
description: quick, easy, craigslist parsing library that takes the monotony out of working with craigslist posts and listings
|
64
97
|
email: cderose@derosetechnologies.com
|
65
98
|
executables:
|
@@ -114,6 +147,7 @@ files:
|
|
114
147
|
- test/post_samples/posting5.html
|
115
148
|
- test/post_samples/posting1796890756-061710.html
|
116
149
|
- test/post_samples/posting3.html
|
150
|
+
- test/post_samples/this_post_has_expired.html
|
117
151
|
- test/post_samples/posting1808219423.html
|
118
152
|
- test/post_samples/sfbay_art_1223614914.html
|
119
153
|
- test/post_samples/this_post_has_been_deleted_by_its_author.html
|