scrapouille 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +9 -1
- data/README.md +2 -0
- data/bin/scrapouille +4 -5
- data/lib/scrapouille.rb +5 -28
- data/lib/scrapouille/scraper.rb +65 -0
- data/lib/scrapouille/xpath_runner.rb +29 -0
- data/scrapouille.gemspec +2 -1
- data/test/fixtures/tennis-players-listing.html +2847 -0
- data/test/helper.rb +1 -0
- data/test/test_scraping.rb +64 -6
- metadata +20 -2
data/test/helper.rb
CHANGED
data/test/test_scraping.rb
CHANGED
@@ -2,8 +2,8 @@ require 'helper'
|
|
2
2
|
|
3
3
|
class TestScraping < MiniTest::Unit::TestCase
|
4
4
|
|
5
|
-
def
|
6
|
-
scraper = Scrapouille.
|
5
|
+
def test_scrap_text
|
6
|
+
scraper = Scrapouille.configure do
|
7
7
|
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
8
8
|
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
9
9
|
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
@@ -14,16 +14,74 @@ class TestScraping < MiniTest::Unit::TestCase
|
|
14
14
|
results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-player.html'))
|
15
15
|
|
16
16
|
assert_equal({
|
17
|
-
fullname
|
18
|
-
image_url
|
19
|
-
rank
|
17
|
+
'fullname' => 'Richard Gasquet',
|
18
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
|
19
|
+
'rank' => 21
|
20
20
|
},
|
21
21
|
results)
|
22
22
|
end
|
23
23
|
|
24
|
+
def test_scrap_attribute_value
|
25
|
+
scraper = Scrapouille.configure do
|
26
|
+
scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
|
27
|
+
end
|
28
|
+
|
29
|
+
results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
|
30
|
+
|
31
|
+
assert_equal(
|
32
|
+
{ djokovic_picture_src: 'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg' },
|
33
|
+
results
|
34
|
+
)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_scrap_all_attributes_value
|
38
|
+
scraper = Scrapouille.configure do
|
39
|
+
scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
|
40
|
+
end
|
41
|
+
|
42
|
+
results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
|
43
|
+
|
44
|
+
assert results[:players_hrefs]
|
45
|
+
assert results[:players_hrefs].all? {|p| p.start_with? '/player/'}
|
46
|
+
assert_equal 119, results[:players_hrefs].count
|
47
|
+
assert_equal '/player/532/novak-djokovic/', results[:players_hrefs].first
|
48
|
+
end
|
49
|
+
|
50
|
+
def test_scrap_all_text
|
51
|
+
scraper = Scrapouille.configure do
|
52
|
+
scrap_all 'players_names', at: "//table[contains(@class, 'ranking-table')]//a[not(child::img)]/text()" do |c|
|
53
|
+
c.downcase
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
|
58
|
+
|
59
|
+
assert results['players_names']
|
60
|
+
assert_equal 119, results['players_names'].count
|
61
|
+
assert_equal 'stan wawrinka', results['players_names'][3]
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_both_scrap_and_scrap_all
|
65
|
+
scraper = Scrapouille.configure do
|
66
|
+
scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
|
67
|
+
scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
|
68
|
+
end
|
69
|
+
|
70
|
+
results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
|
71
|
+
|
72
|
+
assert results[:players_hrefs]
|
73
|
+
assert_equal 119, results[:players_hrefs].count
|
74
|
+
|
75
|
+
assert_equal(
|
76
|
+
'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg',
|
77
|
+
results[:djokovic_picture_src]
|
78
|
+
)
|
79
|
+
end
|
80
|
+
|
81
|
+
|
24
82
|
def test_raise_when_no_at_options
|
25
83
|
error = assert_raises(RuntimeError) do
|
26
|
-
scraper = Scrapouille.
|
84
|
+
scraper = Scrapouille.configure do
|
27
85
|
scrap 'fullname', {}
|
28
86
|
end
|
29
87
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapouille
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- simcap
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: minitest
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -81,8 +95,11 @@ files:
|
|
81
95
|
- Rakefile
|
82
96
|
- bin/scrapouille
|
83
97
|
- lib/scrapouille.rb
|
98
|
+
- lib/scrapouille/scraper.rb
|
99
|
+
- lib/scrapouille/xpath_runner.rb
|
84
100
|
- scrapouille.gemspec
|
85
101
|
- test/fixtures/tennis-player.html
|
102
|
+
- test/fixtures/tennis-players-listing.html
|
86
103
|
- test/helper.rb
|
87
104
|
- test/test_scraping.rb
|
88
105
|
homepage: https://github.com/simcap/scrapouille
|
@@ -110,5 +127,6 @@ specification_version: 4
|
|
110
127
|
summary: Interactive and declarative XPath driven HTML scraper
|
111
128
|
test_files:
|
112
129
|
- test/fixtures/tennis-player.html
|
130
|
+
- test/fixtures/tennis-players-listing.html
|
113
131
|
- test/helper.rb
|
114
132
|
- test/test_scraping.rb
|