scrapouille 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scrapouille/scraper.rb +10 -3
- data/scrapouille.gemspec +1 -1
- data/test/test_scraping.rb +30 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4fe2677b4492ce2c30d305949f0b8f72a48859d
|
4
|
+
data.tar.gz: e3891caa4fd74331d483e4308d2d42d1f9fc0944
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54ca09257d43ad187090d6cd35d4de71328574d4ab4682253033bb00d32f8788720ac65841195d844e381e564a20147ddc53ab1741f4ca6fd75769425c9645f3
|
7
|
+
data.tar.gz: 44ed891ee0c667918c945708c5344848042a4f2d3e26427e03a4746bf4e5601b4d0aacbf8a1bbdf9c6e9e79f1355a3af0461416479cefd18205c25e83b8195ef
|
data/lib/scrapouille/scraper.rb
CHANGED
@@ -22,9 +22,16 @@ module Scrapouille
|
|
22
22
|
add_rule(:collect_unique, property, xpath_options, block)
|
23
23
|
end
|
24
24
|
|
25
|
-
def scrap_each!(uris)
|
26
|
-
|
27
|
-
|
25
|
+
def scrap_each!(*uris)
|
26
|
+
if uris.length == 1
|
27
|
+
full_uris = uris.first
|
28
|
+
elsif uris.length == 2
|
29
|
+
root, relative_uris = *uris
|
30
|
+
full_uris = relative_uris.map do |uri| "#{root}/#{uri}" end
|
31
|
+
else
|
32
|
+
raise ArgumentError, "Expecting 1 or 2 arguments when calling #{__callee__}"
|
33
|
+
end
|
34
|
+
full_uris.map do |uri|
|
28
35
|
scrap!(uri)
|
29
36
|
end
|
30
37
|
end
|
data/scrapouille.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "scrapouille"
|
5
|
-
spec.version = "0.0.6"
|
5
|
+
spec.version = "0.0.7"
|
6
6
|
spec.authors = ["simcap"]
|
7
7
|
spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
|
8
8
|
spec.description = %q{Interactive and declarative XPath driven HTML scraper}
|
data/test/test_scraping.rb
CHANGED
@@ -52,6 +52,36 @@ class TestScraping < MiniTest::Unit::TestCase
|
|
52
52
|
)
|
53
53
|
end
|
54
54
|
|
55
|
+
|
56
|
+
def test_scrap_each_using_root_and_relative_uri
|
57
|
+
scraper = Scrapouille.configure do
|
58
|
+
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
59
|
+
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
60
|
+
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
61
|
+
Integer(c.sub('#', ''))
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
results = scraper.scrap_each!("#{__dir__}/fixtures", ['tennis-player.html', 'other-tennis-player.html'])
|
66
|
+
|
67
|
+
assert Array === results
|
68
|
+
assert_equal({
|
69
|
+
'fullname' => 'Richard Gasquet',
|
70
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
|
71
|
+
'rank' => 21
|
72
|
+
},
|
73
|
+
results[0]
|
74
|
+
)
|
75
|
+
assert_equal({
|
76
|
+
'fullname' => 'Rafael Nadal',
|
77
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg',
|
78
|
+
'rank' => 2
|
79
|
+
},
|
80
|
+
results[1]
|
81
|
+
)
|
82
|
+
end
|
83
|
+
|
84
|
+
|
55
85
|
def test_scrap_attribute_value
|
56
86
|
scraper = Scrapouille.configure do
|
57
87
|
scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
|