libcraigscrape 0.9.1 → 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +3 -0
- data/Rakefile +2 -2
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +3 -2
- data/lib/listings.rb +3 -2
- data/lib/posting.rb +8 -8
- data/lib/scraper.rb +4 -2
- data/test/test_craigslist_listing.rb +21 -21
- data/test/test_craigslist_posting.rb +39 -25
- metadata +11 -11
data/CHANGELOG
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
+
=== Release 1.0
|
4
|
+
- Replaced hpricot dependency with Nokogiri. Nokogiri should be faster and more reliable. Whoo-hoo!
|
5
|
+
|
3
6
|
=== Release 0.9.1
|
4
7
|
- Added support for posting_has_expired? and expired post recognition
|
5
8
|
- Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? was encountered when using certain (minimal) filtering
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0
|
14
|
+
VERS = ENV['VERSION'] || "1.0"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
@@ -37,7 +37,7 @@ SPEC =
|
|
37
37
|
s.files = PKG_FILES
|
38
38
|
s.require_paths = ["lib"]
|
39
39
|
s.test_files = FileList['test/test_*.rb']
|
40
|
-
s.add_dependency '
|
40
|
+
s.add_dependency 'nokogiri', '>= 1.4.4'
|
41
41
|
s.add_dependency 'htmlentities', '>= 4.0.0'
|
42
42
|
s.add_dependency 'activesupport','>= 2.3.0', '< 3'
|
43
43
|
s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
|
data/lib/geo_listings.rb
CHANGED
data/lib/libcraigscrape.rb
CHANGED
@@ -5,12 +5,13 @@
|
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
7
|
gem 'activesupport', '~> 2.3'
|
8
|
-
gem '
|
8
|
+
gem 'nokogiri', '~> 1.4.4'
|
9
9
|
gem 'htmlentities', '~> 4.0.0'
|
10
10
|
|
11
|
+
|
11
12
|
require 'net/http'
|
12
13
|
require 'zlib'
|
13
|
-
require '
|
14
|
+
require 'nokogiri'
|
14
15
|
require 'htmlentities'
|
15
16
|
require 'active_support'
|
16
17
|
|
data/lib/listings.rb
CHANGED
@@ -21,7 +21,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
21
21
|
current_date = nil
|
22
22
|
@posts = []
|
23
23
|
|
24
|
-
|
24
|
+
# All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
|
25
|
+
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
25
26
|
|
26
27
|
# The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
|
27
28
|
post_tags.pop if (
|
@@ -82,7 +83,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
82
83
|
|
83
84
|
# If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
|
84
85
|
# We're looking good.
|
85
|
-
next_link = cursor.
|
86
|
+
next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
|
86
87
|
end
|
87
88
|
|
88
89
|
# We have an anchor tag - so - let's assign the href:
|
data/lib/posting.rb
CHANGED
@@ -79,7 +79,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
79
79
|
def reply_to
|
80
80
|
unless @reply_to
|
81
81
|
cursor = html_head.at 'hr' if html_head
|
82
|
-
cursor = cursor.
|
82
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
83
83
|
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
84
84
|
end
|
85
85
|
|
@@ -90,7 +90,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
90
90
|
def post_time
|
91
91
|
unless @post_time
|
92
92
|
cursor = html_head.at 'hr' if html_head
|
93
|
-
cursor = cursor.
|
93
|
+
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
94
|
@post_time = Time.parse $1 if $1
|
95
95
|
end
|
96
96
|
|
@@ -100,8 +100,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
100
100
|
# Integer, Craigslist's unique posting id
|
101
101
|
def posting_id
|
102
102
|
unless @posting_id
|
103
|
-
cursor =
|
104
|
-
cursor = cursor.
|
103
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
|
104
|
+
cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
|
105
105
|
@posting_id = $1.to_i if $1
|
106
106
|
end
|
107
107
|
|
@@ -135,7 +135,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
135
135
|
# Real estate listings can work a little different for location:
|
136
136
|
unless @location
|
137
137
|
cursor = craigslist_body.at 'small'
|
138
|
-
cursor = cursor.
|
138
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
139
139
|
|
140
140
|
@location = he_decode(cursor.to_s.strip) if cursor
|
141
141
|
end
|
@@ -295,7 +295,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
295
295
|
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
296
296
|
# return everything above the user_body
|
297
297
|
def html_head
|
298
|
-
@html_head =
|
298
|
+
@html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
|
299
299
|
# We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
|
300
300
|
@html_head ||= html
|
301
301
|
|
@@ -316,9 +316,9 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
316
316
|
end
|
317
317
|
|
318
318
|
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
319
|
-
# So - we'll return it as
|
319
|
+
# So - we'll return it as a Nokogiri object.
|
320
320
|
def craigslist_body
|
321
|
-
|
321
|
+
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|
322
322
|
end
|
323
323
|
|
324
324
|
end
|
data/lib/scraper.rb
CHANGED
@@ -33,6 +33,8 @@ class CraigScrape::Scraper
|
|
33
33
|
|
34
34
|
URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
|
35
35
|
HTML_TAG = /<\/?[^>]*>/
|
36
|
+
# We have to specify this to Nokogiri. Otherwise it sometimes tries to figure out the encoding on its own, and craigslist users occasionally post crazy bytes
|
37
|
+
HTML_ENCODING = "UTF-8"
|
36
38
|
|
37
39
|
# Returns the full url that corresponds to this resource
|
38
40
|
attr_reader :url
|
@@ -202,9 +204,9 @@ class CraigScrape::Scraper
|
|
202
204
|
@html_source
|
203
205
|
end
|
204
206
|
|
205
|
-
# Returns an
|
207
|
+
# Returns a Nokogiri parse of the current URI
|
206
208
|
def html
|
207
|
-
@html ||=
|
209
|
+
@html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
|
208
210
|
@html
|
209
211
|
end
|
210
212
|
end
|
@@ -210,47 +210,47 @@ class CraigslistListingTest < Test::Unit::TestCase
|
|
210
210
|
|
211
211
|
mia_search_kitten031510 = CraigScrape::Listings.new relative_uri_for('listing_samples/mia_search_kitten.3.15.10.html')
|
212
212
|
assert_equal "Adopt a 7 month on kitten- $75", mia_search_kitten031510.posts[0].label
|
213
|
-
assert_equal [
|
213
|
+
assert_equal [15, 3], mia_search_kitten031510.posts[0].post_date.to_a[3..4]
|
214
214
|
assert_equal "Adorable Kitten! Free!!!", mia_search_kitten031510.posts[1].label
|
215
|
-
assert_equal [
|
215
|
+
assert_equal [15, 3], mia_search_kitten031510.posts[1].post_date.to_a[3..4]
|
216
216
|
assert_equal "KITTENS,5 months, 1 Russian blue, 1 grey & white,vac spy/neu,$35fee ea", mia_search_kitten031510.posts[2].label
|
217
|
-
assert_equal [
|
217
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[2].post_date.to_a[3..4]
|
218
218
|
assert_equal "Kitties need a good home", mia_search_kitten031510.posts[3].label
|
219
|
-
assert_equal [
|
219
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[3].post_date.to_a[3..4]
|
220
220
|
assert_equal "7 week old kittens for adoption", mia_search_kitten031510.posts[4].label
|
221
|
-
assert_equal [
|
221
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[4].post_date.to_a[3..4]
|
222
222
|
assert_equal "Adorable Orange Kitten Free to Good Home", mia_search_kitten031510.posts[5].label
|
223
|
-
assert_equal [
|
223
|
+
assert_equal [12, 3], mia_search_kitten031510.posts[5].post_date.to_a[3..4]
|
224
224
|
assert_equal "7 month old kitten free to good home", mia_search_kitten031510.posts[6].label
|
225
|
-
assert_equal [
|
225
|
+
assert_equal [12, 3], mia_search_kitten031510.posts[6].post_date.to_a[3..4]
|
226
226
|
assert_equal "FEMALE KITTEN FOR GOOD HOME", mia_search_kitten031510.posts[7].label
|
227
|
-
assert_equal [
|
227
|
+
assert_equal [9, 3], mia_search_kitten031510.posts[7].post_date.to_a[3..4]
|
228
228
|
assert_equal "Kitten", mia_search_kitten031510.posts[8].label
|
229
|
-
assert_equal [
|
229
|
+
assert_equal [4, 3], mia_search_kitten031510.posts[8].post_date.to_a[3..4]
|
230
230
|
assert_equal "Kitties need a good home", mia_search_kitten031510.posts[9].label
|
231
|
-
assert_equal [
|
231
|
+
assert_equal [4, 3], mia_search_kitten031510.posts[9].post_date.to_a[3..4]
|
232
232
|
assert_equal "Persain Cat And Tabby Cat", mia_search_kitten031510.posts[10].label
|
233
|
-
assert_equal [
|
233
|
+
assert_equal [1, 3], mia_search_kitten031510.posts[10].post_date.to_a[3..4]
|
234
234
|
assert_equal "Tabby female kitten in a parking lot needs your help", mia_search_kitten031510.posts[11].label
|
235
|
-
assert_equal [
|
235
|
+
assert_equal [23, 2], mia_search_kitten031510.posts[11].post_date.to_a[3..4]
|
236
236
|
assert_equal "Spring is almost officially here, grow your family, adopt a kitty!", mia_search_kitten031510.posts[12].label
|
237
|
-
assert_equal [
|
237
|
+
assert_equal [22, 2], mia_search_kitten031510.posts[12].post_date.to_a[3..4]
|
238
238
|
assert_equal "Many adorable kittens for adoption!", mia_search_kitten031510.posts[13].label
|
239
|
-
assert_equal [
|
239
|
+
assert_equal [22, 2], mia_search_kitten031510.posts[13].post_date.to_a[3..4]
|
240
240
|
assert_equal "2 free cats/kitten to good home", mia_search_kitten031510.posts[14].label
|
241
|
-
assert_equal [
|
241
|
+
assert_equal [19, 2], mia_search_kitten031510.posts[14].post_date.to_a[3..4]
|
242
242
|
assert_equal "BEAUTIFUL KITTENS", mia_search_kitten031510.posts[15].label
|
243
|
-
assert_equal [
|
243
|
+
assert_equal [19, 2], mia_search_kitten031510.posts[15].post_date.to_a[3..4]
|
244
244
|
assert_equal "MANY new adorable kittens for good homes!!!", mia_search_kitten031510.posts[16].label
|
245
|
-
assert_equal [
|
245
|
+
assert_equal [18, 2], mia_search_kitten031510.posts[16].post_date.to_a[3..4]
|
246
246
|
assert_equal "Kitten living in a parking lot needs your help", mia_search_kitten031510.posts[17].label
|
247
|
-
assert_equal [
|
247
|
+
assert_equal [16, 2], mia_search_kitten031510.posts[17].post_date.to_a[3..4]
|
248
248
|
assert_equal "BEAUTIFUL 8 WEEK KITTENS", mia_search_kitten031510.posts[18].label
|
249
|
-
assert_equal [
|
249
|
+
assert_equal [16, 2], mia_search_kitten031510.posts[18].post_date.to_a[3..4]
|
250
250
|
assert_equal "ORANGE TABBY KITTEN", mia_search_kitten031510.posts[19].label
|
251
|
-
assert_equal [
|
251
|
+
assert_equal [13, 2], mia_search_kitten031510.posts[19].post_date.to_a[3..4]
|
252
252
|
assert_equal "Lots of kittens to choose from! Pics!!", mia_search_kitten031510.posts[20].label
|
253
|
-
assert_equal [
|
253
|
+
assert_equal [13, 2], mia_search_kitten031510.posts[20].post_date.to_a[3..4]
|
254
254
|
|
255
255
|
end
|
256
256
|
|
@@ -37,7 +37,9 @@ EOD
|
|
37
37
|
<p><a href="/mdc/jwl/1128691192.html">925 Sterling Silver Dragonfly Charm Bracelet - $25 -</a> <span class="p"> img</span></p>
|
38
38
|
EOD
|
39
39
|
|
40
|
-
one = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
40
|
+
one = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
41
|
+
Nokogiri::HTML(search_html_one, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
42
|
+
)
|
41
43
|
assert_equal true, one.has_img?
|
42
44
|
assert_equal false, one.has_pic?
|
43
45
|
assert_equal true, one.has_pic_or_img?
|
@@ -49,7 +51,9 @@ EOD
|
|
49
51
|
assert_equal 18, one.post_date.day
|
50
52
|
assert_equal nil, one.price
|
51
53
|
|
52
|
-
two = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
54
|
+
two = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
55
|
+
Nokogiri::HTML(search_html_two, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
56
|
+
)
|
53
57
|
assert_equal true, two.has_img?
|
54
58
|
assert_equal true, two.has_pic?
|
55
59
|
assert_equal true, two.has_pic_or_img?
|
@@ -61,7 +65,9 @@ EOD
|
|
61
65
|
assert_equal 4, two.post_date.day
|
62
66
|
assert_equal 348000.0, two.price
|
63
67
|
|
64
|
-
three = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
68
|
+
three = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
69
|
+
Nokogiri::HTML(search_html_three, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
70
|
+
)
|
65
71
|
assert_equal false, three.has_img?
|
66
72
|
assert_equal true, three.has_pic?
|
67
73
|
assert_equal true, three.has_pic_or_img?
|
@@ -73,7 +79,9 @@ EOD
|
|
73
79
|
assert_equal 31, three.post_date.day
|
74
80
|
assert_equal 22.0, three.price
|
75
81
|
|
76
|
-
four = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
82
|
+
four = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
83
|
+
Nokogiri::HTML(search_html_four, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
84
|
+
)
|
77
85
|
assert_equal false, four.has_img?
|
78
86
|
assert_equal false, four.has_pic?
|
79
87
|
assert_equal false, four.has_pic_or_img?
|
@@ -85,7 +93,9 @@ EOD
|
|
85
93
|
assert_equal 22, four.post_date.day
|
86
94
|
assert_equal 325000.0, four.price
|
87
95
|
|
88
|
-
five = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
96
|
+
five = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
97
|
+
Nokogiri::HTML(search_html_five, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
98
|
+
)
|
89
99
|
assert_equal false, five.has_img?
|
90
100
|
assert_equal true, five.has_pic?
|
91
101
|
assert_equal true, five.has_pic_or_img?
|
@@ -97,27 +107,31 @@ EOD
|
|
97
107
|
assert_equal 9, five.post_date.day
|
98
108
|
assert_equal 105000.0, five.price
|
99
109
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
assert_equal
|
104
|
-
assert_equal
|
105
|
-
assert_equal "$2995000 / 5br - Downtown Boca New Home To Be Built", five.label
|
106
|
-
assert_equal "real estate - by broker", five.section
|
107
|
-
assert_equal "Boca Raton", five.location
|
108
|
-
assert_equal nil, five.post_date
|
109
|
-
assert_equal 2995000.0, five.price
|
110
|
-
|
111
|
-
six = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(Hpricot.parse(category_listing_two).at('p'))
|
112
|
-
assert_equal true, six.has_img?
|
113
|
-
assert_equal false, six.has_pic?
|
110
|
+
six = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
111
|
+
Nokogiri::HTML(category_listing_one, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
112
|
+
)
|
113
|
+
assert_equal false, six.has_img?
|
114
|
+
assert_equal true, six.has_pic?
|
114
115
|
assert_equal true, six.has_pic_or_img?
|
115
|
-
assert_equal '/
|
116
|
-
assert_equal "
|
117
|
-
assert_equal
|
118
|
-
assert_equal
|
116
|
+
assert_equal '/pbc/reb/1128661387.html', six.href
|
117
|
+
assert_equal "$2995000 / 5br - Downtown Boca New Home To Be Built", six.label
|
118
|
+
assert_equal "real estate - by broker", six.section
|
119
|
+
assert_equal "Boca Raton", six.location
|
119
120
|
assert_equal nil, six.post_date
|
120
|
-
assert_equal
|
121
|
+
assert_equal 2995000.0, six.price
|
122
|
+
|
123
|
+
seven = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
124
|
+
Nokogiri::HTML(category_listing_two, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
125
|
+
)
|
126
|
+
assert_equal true, seven.has_img?
|
127
|
+
assert_equal false, seven.has_pic?
|
128
|
+
assert_equal true, seven.has_pic_or_img?
|
129
|
+
assert_equal '/mdc/jwl/1128691192.html', seven.href
|
130
|
+
assert_equal "925 Sterling Silver Dragonfly Charm Bracelet - $25", seven.label
|
131
|
+
assert_equal nil, seven.section
|
132
|
+
assert_equal nil, seven.location
|
133
|
+
assert_equal nil, seven.post_date
|
134
|
+
assert_equal 25.0, seven.price
|
121
135
|
end
|
122
136
|
|
123
137
|
|
@@ -302,7 +316,7 @@ EOD
|
|
302
316
|
assert_equal [:pic], sfbay_art_1223614914.img_types
|
303
317
|
end
|
304
318
|
|
305
|
-
# This
|
319
|
+
# This was actually a 'bug' with hpricot itself when the ulimit is set too low.
|
306
320
|
# The easy fix is running "ulimit -s 16384" before the tests. But the better fix was
|
307
321
|
# to remove the userbody sending these pages to be parsed by Hpricot
|
308
322
|
def test_bugs_found061710
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libcraigscrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
-
- 0
|
8
|
-
- 9
|
9
7
|
- 1
|
10
|
-
|
8
|
+
- 0
|
9
|
+
version: "1.0"
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Chris DeRose, DeRose Technologies, Inc.
|
@@ -15,22 +14,23 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-02-13 00:00:00 -05:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
21
|
+
name: nokogiri
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
26
25
|
requirements:
|
27
|
-
- - "
|
26
|
+
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
28
|
+
hash: 15
|
30
29
|
segments:
|
31
|
-
-
|
32
|
-
-
|
33
|
-
|
30
|
+
- 1
|
31
|
+
- 4
|
32
|
+
- 4
|
33
|
+
version: 1.4.4
|
34
34
|
type: :runtime
|
35
35
|
version_requirements: *id001
|
36
36
|
- !ruby/object:Gem::Dependency
|