libcraigscrape 0.9.1 → 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/Rakefile +2 -2
- data/lib/geo_listings.rb +1 -1
- data/lib/libcraigscrape.rb +3 -2
- data/lib/listings.rb +3 -2
- data/lib/posting.rb +8 -8
- data/lib/scraper.rb +4 -2
- data/test/test_craigslist_listing.rb +21 -21
- data/test/test_craigslist_posting.rb +39 -25
- metadata +11 -11
data/CHANGELOG
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
== Change Log
|
2
2
|
|
3
|
+
=== Release 1.0
|
4
|
+
- Replaced hpricot dependency with Nokogiri. Nokogiri should be faster and more reliable. Whoo-hoo!
|
5
|
+
|
3
6
|
=== Release 0.9.1
|
4
7
|
- Added support for posting_has_expired? and expired post recognition
|
5
8
|
- Fixed a weird bug in craigwatch that would cause a scrape to abort if a flagged_for_removal? was encountered when using certain (minimal) filtering
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ include FileUtils
|
|
11
11
|
RbConfig = Config unless defined? RbConfig
|
12
12
|
|
13
13
|
NAME = "libcraigscrape"
|
14
|
-
VERS = ENV['VERSION'] || "0.9.1"
|
14
|
+
VERS = ENV['VERSION'] || "1.0"
|
15
15
|
PKG = "#{NAME}-#{VERS}"
|
16
16
|
|
17
17
|
RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
|
@@ -37,7 +37,7 @@ SPEC =
|
|
37
37
|
s.files = PKG_FILES
|
38
38
|
s.require_paths = ["lib"]
|
39
39
|
s.test_files = FileList['test/test_*.rb']
|
40
|
-
s.add_dependency '
|
40
|
+
s.add_dependency 'nokogiri', '>= 1.4.4'
|
41
41
|
s.add_dependency 'htmlentities', '>= 4.0.0'
|
42
42
|
s.add_dependency 'activesupport','>= 2.3.0', '< 3'
|
43
43
|
s.add_dependency 'activerecord', '>= 2.3.0', '< 3'
|
data/lib/geo_listings.rb
CHANGED
data/lib/libcraigscrape.rb
CHANGED
@@ -5,12 +5,13 @@
|
|
5
5
|
require 'rubygems'
|
6
6
|
|
7
7
|
gem 'activesupport', '~> 2.3'
|
8
|
-
gem '
|
8
|
+
gem 'nokogiri', '~> 1.4.4'
|
9
9
|
gem 'htmlentities', '~> 4.0.0'
|
10
10
|
|
11
|
+
|
11
12
|
require 'net/http'
|
12
13
|
require 'zlib'
|
13
|
-
require 'hpricot'
|
14
|
+
require 'nokogiri'
|
14
15
|
require 'htmlentities'
|
15
16
|
require 'active_support'
|
16
17
|
|
data/lib/listings.rb
CHANGED
@@ -21,7 +21,8 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
21
21
|
current_date = nil
|
22
22
|
@posts = []
|
23
23
|
|
24
|
-
|
24
|
+
# All we care about are p and h4 tags. This seemed to be the only way I could do this on Nokogiri:
|
25
|
+
post_tags = html.search('*').reject{|n| !/^(?:p|h4)$/i.match n.name }
|
25
26
|
|
26
27
|
# The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
|
27
28
|
post_tags.pop if (
|
@@ -82,7 +83,7 @@ class CraigScrape::Listings < CraigScrape::Scraper
|
|
82
83
|
|
83
84
|
# If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
|
84
85
|
# We're looking good.
|
85
|
-
next_link = cursor.
|
86
|
+
next_link = cursor.next_element if cursor and /^[\d]+$/.match cursor.inner_html
|
86
87
|
end
|
87
88
|
|
88
89
|
# We have an anchor tag - so - let's assign the href:
|
data/lib/posting.rb
CHANGED
@@ -79,7 +79,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
79
79
|
def reply_to
|
80
80
|
unless @reply_to
|
81
81
|
cursor = html_head.at 'hr' if html_head
|
82
|
-
cursor = cursor.
|
82
|
+
cursor = cursor.next until cursor.nil? or cursor.name == 'a'
|
83
83
|
@reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
|
84
84
|
end
|
85
85
|
|
@@ -90,7 +90,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
90
90
|
def post_time
|
91
91
|
unless @post_time
|
92
92
|
cursor = html_head.at 'hr' if html_head
|
93
|
-
cursor = cursor.
|
93
|
+
cursor = cursor.next until cursor.nil? or POST_DATE.match cursor.to_s
|
94
94
|
@post_time = Time.parse $1 if $1
|
95
95
|
end
|
96
96
|
|
@@ -100,8 +100,8 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
100
100
|
# Integer, Craigslist's unique posting id
|
101
101
|
def posting_id
|
102
102
|
unless @posting_id
|
103
|
-
cursor =
|
104
|
-
cursor = cursor.
|
103
|
+
cursor = Nokogiri::HTML html_footer, nil, HTML_ENCODING if html_footer
|
104
|
+
cursor = cursor.next until cursor.nil? or POSTING_ID.match cursor.to_s
|
105
105
|
@posting_id = $1.to_i if $1
|
106
106
|
end
|
107
107
|
|
@@ -135,7 +135,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
135
135
|
# Real estate listings can work a little different for location:
|
136
136
|
unless @location
|
137
137
|
cursor = craigslist_body.at 'small'
|
138
|
-
cursor = cursor.
|
138
|
+
cursor = cursor.previous until cursor.nil? or cursor.text?
|
139
139
|
|
140
140
|
@location = he_decode(cursor.to_s.strip) if cursor
|
141
141
|
end
|
@@ -295,7 +295,7 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
295
295
|
# I set apart from html to work around the SystemStackError bugs in test_bugs_found061710. Essentially we
|
296
296
|
# return everything above the user_body
|
297
297
|
def html_head
|
298
|
-
@html_head =
|
298
|
+
@html_head = Nokogiri::HTML $1, nil, HTML_ENCODING if @html_head.nil? and HTML_HEADER.match html_source
|
299
299
|
# We return html itself if HTML_HEADER doesn't match, which would be case for a 404 page or something
|
300
300
|
@html_head ||= html
|
301
301
|
|
@@ -316,9 +316,9 @@ class CraigScrape::Posting < CraigScrape::Scraper
|
|
316
316
|
end
|
317
317
|
|
318
318
|
# Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
|
319
|
-
# So - we'll return it as
|
319
|
+
# So - we'll return it as a Nokogiri object.
|
320
320
|
def craigslist_body
|
321
|
-
|
321
|
+
Nokogiri::HTML $3, nil, HTML_ENCODING if USERBODY_PARTS.match html_source
|
322
322
|
end
|
323
323
|
|
324
324
|
end
|
data/lib/scraper.rb
CHANGED
@@ -33,6 +33,8 @@ class CraigScrape::Scraper
|
|
33
33
|
|
34
34
|
URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
|
35
35
|
HTML_TAG = /<\/?[^>]*>/
|
36
|
+
# We have to specify this to Nokogiri. Otherwise it sometimes tries to figure out the encoding on its own, and craigslist users occasionally post crazy bytes
|
37
|
+
HTML_ENCODING = "UTF-8"
|
36
38
|
|
37
39
|
# Returns the full url that corresponds to this resource
|
38
40
|
attr_reader :url
|
@@ -202,9 +204,9 @@ class CraigScrape::Scraper
|
|
202
204
|
@html_source
|
203
205
|
end
|
204
206
|
|
205
|
-
# Returns an
|
207
|
+
# Returns a Nokogiri parse of the current URI
|
206
208
|
def html
|
207
|
-
@html ||=
|
209
|
+
@html ||= Nokogiri::HTML html_source, nil, HTML_ENCODING if html_source
|
208
210
|
@html
|
209
211
|
end
|
210
212
|
end
|
@@ -210,47 +210,47 @@ class CraigslistListingTest < Test::Unit::TestCase
|
|
210
210
|
|
211
211
|
mia_search_kitten031510 = CraigScrape::Listings.new relative_uri_for('listing_samples/mia_search_kitten.3.15.10.html')
|
212
212
|
assert_equal "Adopt a 7 month on kitten- $75", mia_search_kitten031510.posts[0].label
|
213
|
-
assert_equal [
|
213
|
+
assert_equal [15, 3], mia_search_kitten031510.posts[0].post_date.to_a[3..4]
|
214
214
|
assert_equal "Adorable Kitten! Free!!!", mia_search_kitten031510.posts[1].label
|
215
|
-
assert_equal [
|
215
|
+
assert_equal [15, 3], mia_search_kitten031510.posts[1].post_date.to_a[3..4]
|
216
216
|
assert_equal "KITTENS,5 months, 1 Russian blue, 1 grey & white,vac spy/neu,$35fee ea", mia_search_kitten031510.posts[2].label
|
217
|
-
assert_equal [
|
217
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[2].post_date.to_a[3..4]
|
218
218
|
assert_equal "Kitties need a good home", mia_search_kitten031510.posts[3].label
|
219
|
-
assert_equal [
|
219
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[3].post_date.to_a[3..4]
|
220
220
|
assert_equal "7 week old kittens for adoption", mia_search_kitten031510.posts[4].label
|
221
|
-
assert_equal [
|
221
|
+
assert_equal [13, 3], mia_search_kitten031510.posts[4].post_date.to_a[3..4]
|
222
222
|
assert_equal "Adorable Orange Kitten Free to Good Home", mia_search_kitten031510.posts[5].label
|
223
|
-
assert_equal [
|
223
|
+
assert_equal [12, 3], mia_search_kitten031510.posts[5].post_date.to_a[3..4]
|
224
224
|
assert_equal "7 month old kitten free to good home", mia_search_kitten031510.posts[6].label
|
225
|
-
assert_equal [
|
225
|
+
assert_equal [12, 3], mia_search_kitten031510.posts[6].post_date.to_a[3..4]
|
226
226
|
assert_equal "FEMALE KITTEN FOR GOOD HOME", mia_search_kitten031510.posts[7].label
|
227
|
-
assert_equal [
|
227
|
+
assert_equal [9, 3], mia_search_kitten031510.posts[7].post_date.to_a[3..4]
|
228
228
|
assert_equal "Kitten", mia_search_kitten031510.posts[8].label
|
229
|
-
assert_equal [
|
229
|
+
assert_equal [4, 3], mia_search_kitten031510.posts[8].post_date.to_a[3..4]
|
230
230
|
assert_equal "Kitties need a good home", mia_search_kitten031510.posts[9].label
|
231
|
-
assert_equal [
|
231
|
+
assert_equal [4, 3], mia_search_kitten031510.posts[9].post_date.to_a[3..4]
|
232
232
|
assert_equal "Persain Cat And Tabby Cat", mia_search_kitten031510.posts[10].label
|
233
|
-
assert_equal [
|
233
|
+
assert_equal [1, 3], mia_search_kitten031510.posts[10].post_date.to_a[3..4]
|
234
234
|
assert_equal "Tabby female kitten in a parking lot needs your help", mia_search_kitten031510.posts[11].label
|
235
|
-
assert_equal [
|
235
|
+
assert_equal [23, 2], mia_search_kitten031510.posts[11].post_date.to_a[3..4]
|
236
236
|
assert_equal "Spring is almost officially here, grow your family, adopt a kitty!", mia_search_kitten031510.posts[12].label
|
237
|
-
assert_equal [
|
237
|
+
assert_equal [22, 2], mia_search_kitten031510.posts[12].post_date.to_a[3..4]
|
238
238
|
assert_equal "Many adorable kittens for adoption!", mia_search_kitten031510.posts[13].label
|
239
|
-
assert_equal [
|
239
|
+
assert_equal [22, 2], mia_search_kitten031510.posts[13].post_date.to_a[3..4]
|
240
240
|
assert_equal "2 free cats/kitten to good home", mia_search_kitten031510.posts[14].label
|
241
|
-
assert_equal [
|
241
|
+
assert_equal [19, 2], mia_search_kitten031510.posts[14].post_date.to_a[3..4]
|
242
242
|
assert_equal "BEAUTIFUL KITTENS", mia_search_kitten031510.posts[15].label
|
243
|
-
assert_equal [
|
243
|
+
assert_equal [19, 2], mia_search_kitten031510.posts[15].post_date.to_a[3..4]
|
244
244
|
assert_equal "MANY new adorable kittens for good homes!!!", mia_search_kitten031510.posts[16].label
|
245
|
-
assert_equal [
|
245
|
+
assert_equal [18, 2], mia_search_kitten031510.posts[16].post_date.to_a[3..4]
|
246
246
|
assert_equal "Kitten living in a parking lot needs your help", mia_search_kitten031510.posts[17].label
|
247
|
-
assert_equal [
|
247
|
+
assert_equal [16, 2], mia_search_kitten031510.posts[17].post_date.to_a[3..4]
|
248
248
|
assert_equal "BEAUTIFUL 8 WEEK KITTENS", mia_search_kitten031510.posts[18].label
|
249
|
-
assert_equal [
|
249
|
+
assert_equal [16, 2], mia_search_kitten031510.posts[18].post_date.to_a[3..4]
|
250
250
|
assert_equal "ORANGE TABBY KITTEN", mia_search_kitten031510.posts[19].label
|
251
|
-
assert_equal [
|
251
|
+
assert_equal [13, 2], mia_search_kitten031510.posts[19].post_date.to_a[3..4]
|
252
252
|
assert_equal "Lots of kittens to choose from! Pics!!", mia_search_kitten031510.posts[20].label
|
253
|
-
assert_equal [
|
253
|
+
assert_equal [13, 2], mia_search_kitten031510.posts[20].post_date.to_a[3..4]
|
254
254
|
|
255
255
|
end
|
256
256
|
|
@@ -37,7 +37,9 @@ EOD
|
|
37
37
|
<p><a href="/mdc/jwl/1128691192.html">925 Sterling Silver Dragonfly Charm Bracelet - $25 -</a> <span class="p"> img</span></p>
|
38
38
|
EOD
|
39
39
|
|
40
|
-
one = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
40
|
+
one = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
41
|
+
Nokogiri::HTML(search_html_one, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
42
|
+
)
|
41
43
|
assert_equal true, one.has_img?
|
42
44
|
assert_equal false, one.has_pic?
|
43
45
|
assert_equal true, one.has_pic_or_img?
|
@@ -49,7 +51,9 @@ EOD
|
|
49
51
|
assert_equal 18, one.post_date.day
|
50
52
|
assert_equal nil, one.price
|
51
53
|
|
52
|
-
two = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
54
|
+
two = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
55
|
+
Nokogiri::HTML(search_html_two, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
56
|
+
)
|
53
57
|
assert_equal true, two.has_img?
|
54
58
|
assert_equal true, two.has_pic?
|
55
59
|
assert_equal true, two.has_pic_or_img?
|
@@ -61,7 +65,9 @@ EOD
|
|
61
65
|
assert_equal 4, two.post_date.day
|
62
66
|
assert_equal 348000.0, two.price
|
63
67
|
|
64
|
-
three = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
68
|
+
three = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
69
|
+
Nokogiri::HTML(search_html_three, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
70
|
+
)
|
65
71
|
assert_equal false, three.has_img?
|
66
72
|
assert_equal true, three.has_pic?
|
67
73
|
assert_equal true, three.has_pic_or_img?
|
@@ -73,7 +79,9 @@ EOD
|
|
73
79
|
assert_equal 31, three.post_date.day
|
74
80
|
assert_equal 22.0, three.price
|
75
81
|
|
76
|
-
four = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
82
|
+
four = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
83
|
+
Nokogiri::HTML(search_html_four, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
84
|
+
)
|
77
85
|
assert_equal false, four.has_img?
|
78
86
|
assert_equal false, four.has_pic?
|
79
87
|
assert_equal false, four.has_pic_or_img?
|
@@ -85,7 +93,9 @@ EOD
|
|
85
93
|
assert_equal 22, four.post_date.day
|
86
94
|
assert_equal 325000.0, four.price
|
87
95
|
|
88
|
-
five = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
96
|
+
five = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
97
|
+
Nokogiri::HTML(search_html_five, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
98
|
+
)
|
89
99
|
assert_equal false, five.has_img?
|
90
100
|
assert_equal true, five.has_pic?
|
91
101
|
assert_equal true, five.has_pic_or_img?
|
@@ -97,27 +107,31 @@ EOD
|
|
97
107
|
assert_equal 9, five.post_date.day
|
98
108
|
assert_equal 105000.0, five.price
|
99
109
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
assert_equal
|
104
|
-
assert_equal
|
105
|
-
assert_equal "$2995000 / 5br - Downtown Boca New Home To Be Built", five.label
|
106
|
-
assert_equal "real estate - by broker", five.section
|
107
|
-
assert_equal "Boca Raton", five.location
|
108
|
-
assert_equal nil, five.post_date
|
109
|
-
assert_equal 2995000.0, five.price
|
110
|
-
|
111
|
-
six = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(Hpricot.parse(category_listing_two).at('p'))
|
112
|
-
assert_equal true, six.has_img?
|
113
|
-
assert_equal false, six.has_pic?
|
110
|
+
six = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
111
|
+
Nokogiri::HTML(category_listing_one, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
112
|
+
)
|
113
|
+
assert_equal false, six.has_img?
|
114
|
+
assert_equal true, six.has_pic?
|
114
115
|
assert_equal true, six.has_pic_or_img?
|
115
|
-
assert_equal '/
|
116
|
-
assert_equal "
|
117
|
-
assert_equal
|
118
|
-
assert_equal
|
116
|
+
assert_equal '/pbc/reb/1128661387.html', six.href
|
117
|
+
assert_equal "$2995000 / 5br - Downtown Boca New Home To Be Built", six.label
|
118
|
+
assert_equal "real estate - by broker", six.section
|
119
|
+
assert_equal "Boca Raton", six.location
|
119
120
|
assert_equal nil, six.post_date
|
120
|
-
assert_equal
|
121
|
+
assert_equal 2995000.0, six.price
|
122
|
+
|
123
|
+
seven = CraigScrape::Posting.new CraigScrape::Listings.parse_summary(
|
124
|
+
Nokogiri::HTML(category_listing_two, nil, CraigScrape::Scraper::HTML_ENCODING).at('p')
|
125
|
+
)
|
126
|
+
assert_equal true, seven.has_img?
|
127
|
+
assert_equal false, seven.has_pic?
|
128
|
+
assert_equal true, seven.has_pic_or_img?
|
129
|
+
assert_equal '/mdc/jwl/1128691192.html', seven.href
|
130
|
+
assert_equal "925 Sterling Silver Dragonfly Charm Bracelet - $25", seven.label
|
131
|
+
assert_equal nil, seven.section
|
132
|
+
assert_equal nil, seven.location
|
133
|
+
assert_equal nil, seven.post_date
|
134
|
+
assert_equal 25.0, seven.price
|
121
135
|
end
|
122
136
|
|
123
137
|
|
@@ -302,7 +316,7 @@ EOD
|
|
302
316
|
assert_equal [:pic], sfbay_art_1223614914.img_types
|
303
317
|
end
|
304
318
|
|
305
|
-
# This
|
319
|
+
# This was actually a 'bug' with hpricot itself when the ulimit is set too low.
|
306
320
|
# the Easy fix is running "ulimit -s 16384" before the tests. But the better fix was
|
307
321
|
# to remove the userbody sending these pages to be parsed by Hpricot
|
308
322
|
def test_bugs_found061710
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libcraigscrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
-
- 0
|
8
|
-
- 9
|
9
7
|
- 1
|
10
|
-
|
8
|
+
- 0
|
9
|
+
version: "1.0"
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Chris DeRose, DeRose Technologies, Inc.
|
@@ -15,22 +14,23 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-02-13 00:00:00 -05:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
21
|
+
name: nokogiri
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
26
25
|
requirements:
|
27
|
-
- - "
|
26
|
+
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
28
|
+
hash: 15
|
30
29
|
segments:
|
31
|
-
-
|
32
|
-
-
|
33
|
-
|
30
|
+
- 1
|
31
|
+
- 4
|
32
|
+
- 4
|
33
|
+
version: 1.4.4
|
34
34
|
type: :runtime
|
35
35
|
version_requirements: *id001
|
36
36
|
- !ruby/object:Gem::Dependency
|