libcraigscrape 0.8.4 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +14 -2
- data/Rakefile +8 -7
- data/bin/craigwatch +30 -9
- data/lib/libcraigscrape.rb +8 -3
- data/lib/posting.rb +17 -3
- data/test/post_samples/this_post_has_expired.html +48 -0
- data/test/test_craigslist_posting.rb +21 -0
- metadata +45 -11
data/CHANGELOG
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
-
=== Release 0.
|
3
|
+
=== Release 0.9.1
|
4
|
+
- Added support for posting_has_expired? and expired post recognition
|
5
|
+
- Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? post was encountered when using certain (minimal) filtering
|
6
|
+
|
7
|
+
=== Release 0.9 (Oct 01, 2010)
|
8
|
+
- Minor adjustments to craigwatch to fix deprecation warnings in new ActiveRecord and ActionMailer gems
|
9
|
+
- Added gem version specifiers to the Gem spec and to the require statements
|
10
|
+
- Moved repo to github
|
11
|
+
- Fixed an esoteric bug in craigwatch, affecting the last scraped post in a listing when that post was 'flagged for removal'.
|
12
|
+
- Took all those extra package-building tasks out of the Rakefile since this is 2010 and we only party with gemfiles
|
13
|
+
- Ruby 1.9 compatibility adjustments
|
14
|
+
|
15
|
+
=== Release 0.8.4 (Sep 6, 2010)
|
4
16
|
- Someone found a way to screw up hpricot's to_s method (posting1938291834-090610.html) and fixed by adding html_source to the craigslist Scraper object, which returns the body of the post without passing it through hpricot. It's a better way to go anyway, and re-wrote a couple incidentals to use the html_source method...
|
5
|
-
- Adjusted the test cases a bit, since the user bodies being returned have
|
17
|
+
- Adjusted the test cases a bit, since the user bodies being returned have less cleanup in their output than they had prior
|
6
18
|
|
7
19
|
=== Release 0.8.3 (August 2, 2010)
|
8
20
|
- Someone was posting really bad html that was screwing up Hpricot. Such is to be expected when you're soliciting html from the general public I suppose. Added test_bugs_found061710 posting test, and fixed by stripping out the user body before parsing with Hpricot.
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.
|
14
|
+
VERS = ENV['VERSION'] || "0.9.1"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
@@ -37,9 +37,10 @@ SPEC =
|
|
37
37
|
s.files = PKG_FILES
|
38
38
|
s.require_paths = ["lib"]
|
39
39
|
s.test_files = FileList['test/test_*.rb']
|
40
|
-
s.add_dependency 'hpricot'
|
41
|
-
s.add_dependency 'htmlentities'
|
42
|
-
s.add_dependency 'activesupport'
|
40
|
+
s.add_dependency 'hpricot', '> 0.8'
|
41
|
+
s.add_dependency 'htmlentities', '>= 4.0.0'
|
42
|
+
s.add_dependency 'activesupport','>= 2.3.0', '< 3'
|
43
|
+
s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
|
43
44
|
end
|
44
45
|
|
45
46
|
desc "Run all the tests"
|
@@ -59,9 +60,9 @@ end
|
|
59
60
|
|
60
61
|
Rake::GemPackageTask.new(SPEC) do |p|
|
61
62
|
p.need_tar = false
|
62
|
-
p.need_tar_gz =
|
63
|
-
p.need_tar_bz2 =
|
64
|
-
p.need_zip =
|
63
|
+
p.need_tar_gz = false
|
64
|
+
p.need_tar_bz2 = false
|
65
|
+
p.need_zip = false
|
65
66
|
p.gem_spec = SPEC
|
66
67
|
end
|
67
68
|
|
data/bin/craigwatch
CHANGED
@@ -159,10 +159,15 @@
|
|
159
159
|
$: << File.dirname(__FILE__) + '/../lib'
|
160
160
|
|
161
161
|
require 'rubygems'
|
162
|
+
|
163
|
+
gem 'kwalify', '~> 0.7'
|
164
|
+
gem 'activerecord', '~> 2.3'
|
165
|
+
gem 'actionmailer', '~> 2.3'
|
166
|
+
|
162
167
|
require 'kwalify'
|
163
|
-
require 'kwalify/util/hashlike'
|
164
168
|
require 'active_record'
|
165
169
|
require 'action_mailer'
|
170
|
+
require 'kwalify/util/hashlike'
|
166
171
|
require 'libcraigscrape'
|
167
172
|
require "socket"
|
168
173
|
|
@@ -179,9 +184,9 @@ class String #:nodoc:
|
|
179
184
|
|
180
185
|
options.each_char do |c|
|
181
186
|
mods |= case c
|
182
|
-
when 'i'
|
183
|
-
when 'x'
|
184
|
-
when 'm'
|
187
|
+
when 'i' then Regexp::IGNORECASE
|
188
|
+
when 'x' then Regexp::EXTENDED
|
189
|
+
when 'm' then Regexp::MULTILINE
|
185
190
|
end
|
186
191
|
end unless options.nil? or options.empty?
|
187
192
|
|
@@ -217,7 +222,7 @@ class CraigReportDefinition #:nodoc:
|
|
217
222
|
# We'll setup a SQLite db using some defaults if needed
|
218
223
|
@tracking_database ||= {
|
219
224
|
:adapter => 'sqlite3',
|
220
|
-
:
|
225
|
+
:database => File.basename(for_yaml_file, File.extname(for_yaml_file))+'.db'
|
221
226
|
} if for_yaml_file
|
222
227
|
|
223
228
|
# This is a little hack to make sqlite definitions a little more portable, by allowing them
|
@@ -472,14 +477,20 @@ report_summaries = craig_report.searches.collect do |search|
|
|
472
477
|
# We'll use this in the loop to decide what posts to track:
|
473
478
|
newest_post_date = last_tracked_at
|
474
479
|
|
480
|
+
# We keep track of post.post_date here, b/c in some circumstances, you can be in the below loop
|
481
|
+
# but have no post.post_date since the posting was removed and it parsed to nil
|
482
|
+
most_recent_posting_date = Time.now
|
483
|
+
|
475
484
|
# OK - Now let's go!
|
476
485
|
catch :list_break do
|
477
486
|
while listing
|
478
487
|
listing.posts.each do |post|
|
479
488
|
begin
|
489
|
+
most_recent_posting_date = post.post_date if post.post_date
|
490
|
+
|
480
491
|
# Are we at a point in the scrape, past which we don't need to proceed?
|
481
492
|
throw :list_break if (
|
482
|
-
|
493
|
+
most_recent_posting_date < last_tracked_at or
|
483
494
|
already_tracked_urls.include? post.url
|
484
495
|
)
|
485
496
|
|
@@ -500,7 +511,7 @@ report_summaries = craig_report.searches.collect do |search|
|
|
500
511
|
# Now let's see if the url should be kept in our tracking database for the future...
|
501
512
|
|
502
513
|
# This post-date sets a limit for the tracked_listing.posts.create below
|
503
|
-
newest_post_date =
|
514
|
+
newest_post_date = most_recent_posting_date if most_recent_posting_date > newest_post_date
|
504
515
|
|
505
516
|
# Now let's add these urls to the database so as to reduce memory overhead.
|
506
517
|
# Keep in mind - they're not active until the email goes out.
|
@@ -509,7 +520,7 @@ report_summaries = craig_report.searches.collect do |search|
|
|
509
520
|
tracked_listing.posts.create(
|
510
521
|
:url => post.url,
|
511
522
|
:created_at => newest_post_date
|
512
|
-
) unless
|
523
|
+
) unless most_recent_posting_date < newest_post_date
|
513
524
|
|
514
525
|
end
|
515
526
|
|
@@ -518,8 +529,18 @@ report_summaries = craig_report.searches.collect do |search|
|
|
518
529
|
end
|
519
530
|
end
|
520
531
|
|
532
|
+
|
533
|
+
|
521
534
|
# Let's flatten the unique'd hash into a more useable array:
|
522
|
-
|
535
|
+
# NOTE: The reason we included a reject is a little complicated, but here's the gist:
|
536
|
+
# * We try not to load the whole post if we don't have to
|
537
|
+
# * It's possible that we met all the criteria of the passes_filter? with merely a header, and
|
538
|
+
# if so we add a url to the summaries stack
|
539
|
+
# * Unfortunately, when we later load that post in full, we may find that the post was posting_has_expired?
|
540
|
+
# or flagged_for_removal?, etc.
|
541
|
+
# * If this was the case, below we'll end up sorting against nil post_dates. This would fail.
|
542
|
+
# * So - before we sort, we run a quick reject on nil post_dates
|
543
|
+
new_summaries = new_summaries.values.reject{|v| v.post_date.nil? }.sort{|a,b| a.post_date <=> b.post_date} # oldest goes to bottom
|
523
544
|
|
524
545
|
# Now Let's manage the tracking database:
|
525
546
|
if new_summaries.length > 0
|
data/lib/libcraigscrape.rb
CHANGED
@@ -2,13 +2,18 @@
|
|
2
2
|
#
|
3
3
|
# All of libcraigscrape's objects and methods are loaded when you use <tt>require 'libcraigscrape'</tt> in your code.
|
4
4
|
#
|
5
|
+
require 'rubygems'
|
6
|
+
|
7
|
+
gem 'activesupport', '~> 2.3'
|
8
|
+
gem 'hpricot', '~> 0.8'
|
9
|
+
gem 'htmlentities', '~> 4.0.0'
|
10
|
+
|
5
11
|
require 'net/http'
|
6
12
|
require 'zlib'
|
7
|
-
|
8
|
-
require 'rubygems'
|
9
|
-
require 'active_support'
|
10
13
|
require 'hpricot'
|
11
14
|
require 'htmlentities'
|
15
|
+
require 'active_support'
|
16
|
+
|
12
17
|
|
13
18
|
# A base class encapsulating the various libcraigscrape objects, and providing most of the
|
14
19
|
# craigslist interaction methods. Currently, we're supporting the old Class methods
|
data/lib/posting.rb
CHANGED
@@ -30,9 +30,14 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
30
30
|
super(*args)
|
31
31
|
|
32
32
|
# Validate that required fields are present, at least - if we've downloaded it from a url
|
33
|
-
parse_error! if
|
34
|
-
|
35
|
-
|
33
|
+
parse_error! if (
|
34
|
+
args.first.kind_of? String and
|
35
|
+
!flagged_for_removal? and
|
36
|
+
!posting_has_expired? and
|
37
|
+
!deleted_by_author? and [
|
38
|
+
contents,posting_id,post_time,header,title,full_section
|
39
|
+
].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
|
40
|
+
)
|
36
41
|
end
|
37
42
|
|
38
43
|
|
@@ -188,6 +193,15 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
188
193
|
@deleted_by_author
|
189
194
|
end
|
190
195
|
|
196
|
+
# Returns true if this Post was parsed, and represents a 'This posting has expired.' notice
|
197
|
+
def posting_has_expired?
|
198
|
+
@posting_has_expired = (
|
199
|
+
system_post? and header_as_plain == "This posting has expired."
|
200
|
+
) if @posting_has_expired.nil?
|
201
|
+
|
202
|
+
@posting_has_expired
|
203
|
+
end
|
204
|
+
|
191
205
|
|
192
206
|
# Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
|
193
207
|
# used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
|
@@ -0,0 +1,48 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title></title>
|
5
|
+
<meta name="robots" content="NOARCHIVE,NOFOLLOW">
|
6
|
+
<link type="text/css" rel="stylesheet" media="all" href="http://www.craigslist.org/styles/craigslist.css?v=8">
|
7
|
+
</head>
|
8
|
+
|
9
|
+
<body class="posting">
|
10
|
+
|
11
|
+
|
12
|
+
<div class="bchead">
|
13
|
+
|
14
|
+
<a href="http://charleston.craigslist.org/">charleston craigslist</a> >
|
15
|
+
|
16
|
+
<a href="http://charleston.craigslist.org/sss/">for sale / wanted</a> >
|
17
|
+
<a href="http://charleston.craigslist.org/cto/">cars & trucks - by owner</a>
|
18
|
+
</div>
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
<hr>
|
25
|
+
<br>
|
26
|
+
<br>
|
27
|
+
<h2>This posting has expired.</h2>
|
28
|
+
<h5>(The title on the listings page will be removed in just a few minutes.)</h5>
|
29
|
+
|
30
|
+
<br><br>
|
31
|
+
|
32
|
+
<hr>
|
33
|
+
<ul class="clfooter">
|
34
|
+
<li>Copyright © 2011 craigslist, inc.</li>
|
35
|
+
<li><a href="http://www.craigslist.org/about/terms.of.use.html">terms of use</a></li>
|
36
|
+
<li><a href="http://www.craigslist.org/about/privacy_policy">privacy policy</a></li>
|
37
|
+
<li><a href="/forums/?forumID=8">feedback forum</a></li>
|
38
|
+
</ul>
|
39
|
+
|
40
|
+
<script type="text/javascript" src="http://www.craigslist.org/js/jquery-1.4.2.js"></script>
|
41
|
+
<script type="text/javascript" src="http://www.craigslist.org/js/postings.js"></script>
|
42
|
+
<script type="text/javascript"><!--
|
43
|
+
pID = 1968731193;
|
44
|
+
-->
|
45
|
+
</script>
|
46
|
+
</body>
|
47
|
+
</html>
|
48
|
+
|
@@ -388,4 +388,25 @@ EOD
|
|
388
388
|
assert_equal "2008 GMC Sierra 2500HD", posting_090610.title
|
389
389
|
end
|
390
390
|
|
391
|
+
def test_expired_post
|
392
|
+
posting_expired = CraigScrape::Posting.new relative_uri_for('post_samples/this_post_has_expired.html')
|
393
|
+
assert_equal true, posting_expired.posting_has_expired?
|
394
|
+
assert_equal true, posting_expired.system_post?
|
395
|
+
assert_equal nil, posting_expired.contents
|
396
|
+
assert_equal ["charleston craigslist", "for sale / wanted", "cars & trucks - by owner" ], posting_expired.full_section
|
397
|
+
assert_equal "This posting has expired.", posting_expired.header
|
398
|
+
assert_equal nil, posting_expired.label
|
399
|
+
assert_equal nil, posting_expired.title
|
400
|
+
assert_equal nil, posting_expired.location
|
401
|
+
assert_equal nil, posting_expired.posting_id
|
402
|
+
assert_equal nil, posting_expired.reply_to
|
403
|
+
assert_equal nil, posting_expired.post_time
|
404
|
+
assert_equal [], posting_expired.pics
|
405
|
+
assert_equal nil, posting_expired.contents_as_plain
|
406
|
+
assert_equal nil, posting_expired.price
|
407
|
+
assert_equal [], posting_expired.images
|
408
|
+
assert_equal [], posting_expired.img_types
|
409
|
+
|
410
|
+
end
|
411
|
+
|
391
412
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libcraigscrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 57
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 9
|
9
|
+
- 1
|
10
|
+
version: 0.9.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Chris DeRose, DeRose Technologies, Inc.
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-05 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -24,12 +24,13 @@ dependencies:
|
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
|
-
- - "
|
27
|
+
- - ">"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
29
|
+
hash: 27
|
30
30
|
segments:
|
31
31
|
- 0
|
32
|
-
|
32
|
+
- 8
|
33
|
+
version: "0.8"
|
33
34
|
type: :runtime
|
34
35
|
version_requirements: *id001
|
35
36
|
- !ruby/object:Gem::Dependency
|
@@ -40,10 +41,12 @@ dependencies:
|
|
40
41
|
requirements:
|
41
42
|
- - ">="
|
42
43
|
- !ruby/object:Gem::Version
|
43
|
-
hash:
|
44
|
+
hash: 63
|
44
45
|
segments:
|
46
|
+
- 4
|
45
47
|
- 0
|
46
|
-
|
48
|
+
- 0
|
49
|
+
version: 4.0.0
|
47
50
|
type: :runtime
|
48
51
|
version_requirements: *id002
|
49
52
|
- !ruby/object:Gem::Dependency
|
@@ -56,10 +59,40 @@ dependencies:
|
|
56
59
|
- !ruby/object:Gem::Version
|
57
60
|
hash: 3
|
58
61
|
segments:
|
62
|
+
- 2
|
63
|
+
- 3
|
59
64
|
- 0
|
60
|
-
version:
|
65
|
+
version: 2.3.0
|
66
|
+
- - <
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
hash: 5
|
69
|
+
segments:
|
70
|
+
- 3
|
71
|
+
version: "3"
|
61
72
|
type: :runtime
|
62
73
|
version_requirements: *id003
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: activerecord
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 2
|
85
|
+
- 3
|
86
|
+
- 0
|
87
|
+
version: 2.3.0
|
88
|
+
- - <
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
hash: 5
|
91
|
+
segments:
|
92
|
+
- 3
|
93
|
+
version: "3"
|
94
|
+
type: :runtime
|
95
|
+
version_requirements: *id004
|
63
96
|
description: quick, easy, craigslist parsing library that takes the monotony out of working with craigslist posts and listings
|
64
97
|
email: cderose@derosetechnologies.com
|
65
98
|
executables:
|
@@ -114,6 +147,7 @@ files:
|
|
114
147
|
- test/post_samples/posting5.html
|
115
148
|
- test/post_samples/posting1796890756-061710.html
|
116
149
|
- test/post_samples/posting3.html
|
150
|
+
- test/post_samples/this_post_has_expired.html
|
117
151
|
- test/post_samples/posting1808219423.html
|
118
152
|
- test/post_samples/sfbay_art_1223614914.html
|
119
153
|
- test/post_samples/this_post_has_been_deleted_by_its_author.html
|