scrapouille 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/Rakefile +1 -0
- data/lib/scrapouille/sanitizer.rb +16 -0
- data/lib/scrapouille/scraper.rb +16 -5
- data/lib/scrapouille.rb +1 -0
- data/scrapouille.gemspec +1 -1
- data/test/fixtures/other-tennis-player.html +63 -0
- data/test/scrapouille/test_sanitizer.rb +27 -0
- data/test/test_scraping.rb +31 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ac7508a3cc01e967f6cf32e986578b99f5ddf23
|
4
|
+
data.tar.gz: 3bb5e1f56cfbe7c1c5668a387c679f6b8ac4b8ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5a8462493a89d40936cbf7ae3ef85e69c5e36e89581c13ea12bfdd40abb9336c2d056352b67a35d3f88719ddd15f24aa02c67ba9c30b4dd4d3c343d4eec7154
|
7
|
+
data.tar.gz: ddf07efb781bb461c4e12e8b467b5cbc04f9bd75d9e07ab7eda3ceb2a4fdc673be3cbe939ede43a168fce6183892e58f9b0c0843b4a172d0e9379a08dacb3111
|
data/Gemfile.lock
CHANGED
data/Rakefile
CHANGED
data/lib/scrapouille/scraper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require_relative 'xpath_runner'
|
3
|
+
require_relative 'sanitizer'
|
3
4
|
|
4
5
|
module Scrapouille
|
5
6
|
class Scraper
|
@@ -10,15 +11,24 @@ module Scrapouille
|
|
10
11
|
end
|
11
12
|
|
12
13
|
def scrap_all(property, xpath_options)
|
14
|
+
ensure_valid_definition(property, xpath_options)
|
13
15
|
block = Proc.new if block_given?
|
14
16
|
add_rule(:collect_all, property, xpath_options, block)
|
15
17
|
end
|
16
18
|
|
17
19
|
def scrap(property, xpath_options)
|
20
|
+
ensure_valid_definition(property, xpath_options)
|
18
21
|
block = Proc.new if block_given?
|
19
22
|
add_rule(:collect_unique, property, xpath_options, block)
|
20
23
|
end
|
21
24
|
|
25
|
+
def scrap_each!(uris)
|
26
|
+
raise ArgumentError, 'Expecting enumerable as argument' unless uris.respond_to? :map
|
27
|
+
uris.map do |uri|
|
28
|
+
scrap!(uri)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
22
32
|
def scrap!(uri)
|
23
33
|
page = open(uri).read
|
24
34
|
|
@@ -43,7 +53,8 @@ module Scrapouille
|
|
43
53
|
property, xpath, block = rule
|
44
54
|
|
45
55
|
items = XpathRunner.new(xpath, page).get
|
46
|
-
|
56
|
+
|
57
|
+
Sanitizer.clean!(items)
|
47
58
|
|
48
59
|
items.map! do |i|
|
49
60
|
block.call(i)
|
@@ -52,13 +63,13 @@ module Scrapouille
|
|
52
63
|
[property, items]
|
53
64
|
end
|
54
65
|
|
55
|
-
def
|
56
|
-
|
66
|
+
def add_rule(bucket, property, xpath_options, block = nil)
|
67
|
+
@rules[bucket] << ([property, xpath_options[:at], block].compact)
|
57
68
|
end
|
58
69
|
|
59
|
-
def
|
70
|
+
def ensure_valid_definition(property, xpath_options)
|
71
|
+
raise ArgumentError, 'Expecting Hash as second argument for scraping rules' unless Hash === xpath_options
|
60
72
|
raise "Missing 'at:' option for '#{property}'" unless xpath_options[:at]
|
61
|
-
@rules[bucket] << ([property, xpath_options[:at], block].compact)
|
62
73
|
end
|
63
74
|
|
64
75
|
end
|
data/lib/scrapouille.rb
CHANGED
data/scrapouille.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "scrapouille"
|
5
|
-
spec.version = "0.0.5"
|
5
|
+
spec.version = "0.0.6"
|
6
6
|
spec.authors = ["simcap"]
|
7
7
|
spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
|
8
8
|
spec.description = %q{Interactive and declarative XPath driven HTML scraper}
|
@@ -0,0 +1,63 @@
|
|
1
|
+
<html>
|
2
|
+
<body class="players-profile" >
|
3
|
+
<div class="player-bio">
|
4
|
+
<div class="player-name">
|
5
|
+
<h1 class="left">Rafael Nadal <span>(Spain)</span></h1>
|
6
|
+
<div class="right player-flag">
|
7
|
+
<div class="frame"></div>
|
8
|
+
<img src="http://cdn.tennis.com/statics/assets/images/flags/spain.jpg" border="0">
|
9
|
+
</div>
|
10
|
+
</div>
|
11
|
+
<div id="basic" class="padding-top-10">
|
12
|
+
<div class="bg-ranking clearfix">
|
13
|
+
<div class="image left">
|
14
|
+
<img src="http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg">
|
15
|
+
<p>Getty Images</p>
|
16
|
+
</div>
|
17
|
+
<div class="left bio-detail">
|
18
|
+
<div class="info-row clearfix">
|
19
|
+
<div class="label left">Birthdate:</div>
|
20
|
+
<div class="value left">June 03, 1986 (28 years old)</div>
|
21
|
+
</div>
|
22
|
+
<div class="info-row clearfix">
|
23
|
+
<div class="label left">Birthplace:</div>
|
24
|
+
<div class="value left">Manacor, Spain</div>
|
25
|
+
</div>
|
26
|
+
<div class="info-row clearfix">
|
27
|
+
<div class="label left">Residence:</div>
|
28
|
+
<div class="value left">Manacor, Spain</div>
|
29
|
+
</div>
|
30
|
+
<div class="info-row clearfix">
|
31
|
+
<div class="label left">Height:</div>
|
32
|
+
<div class="value left">6' 1" (185 cm)</div>
|
33
|
+
</div>
|
34
|
+
<div class="info-row clearfix">
|
35
|
+
<div class="label left">Weight:</div>
|
36
|
+
<div class="value left">187 lb (85 kg)</div>
|
37
|
+
</div>
|
38
|
+
<div class="info-row clearfix">
|
39
|
+
<div class="label left">Plays:</div>
|
40
|
+
<div class="value left">Left-handed</div>
|
41
|
+
</div>
|
42
|
+
<div class="info-row clearfix">
|
43
|
+
<div class="label left">Turned Pro:</div>
|
44
|
+
<div class="value left">2000 (15 years on tour)</div>
|
45
|
+
</div>
|
46
|
+
<div class="info-row clearfix">
|
47
|
+
<div class="label left">Official Site:</div>
|
48
|
+
<div class="value left"><a href="http://www.rafaelnadal.com/">http://www.rafaelnadal.com/</a></div>
|
49
|
+
</div>
|
50
|
+
<div class="info-row clearfix">
|
51
|
+
<div class="label left">Nicknames:</div>
|
52
|
+
<div class="value left">Rafa</div>
|
53
|
+
</div>
|
54
|
+
</div>
|
55
|
+
<div class="right ranking">
|
56
|
+
<div class="position">#2</div>
|
57
|
+
<a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
|
58
|
+
</div>
|
59
|
+
</div>
|
60
|
+
</div>
|
61
|
+
</div>
|
62
|
+
</body>
|
63
|
+
</html>
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
class TestSanitizer < Minitest::Unit::TestCase
|
5
|
+
|
6
|
+
def test_special_cases
|
7
|
+
items = [nil, '', ' ']
|
8
|
+
Scrapouille::Sanitizer.clean!(items)
|
9
|
+
assert_equal [nil, '', ''], items
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_squeeze_and_strip_whitespaces
|
13
|
+
items = ['1 2 3 4 ', ' a b c d ']
|
14
|
+
Scrapouille::Sanitizer.clean!(items)
|
15
|
+
assert_equal ['1 2 3 4', 'a b c d'], items
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_replace_html_non_breaking_spaces_by_whitespaces
|
19
|
+
with_html_nbsp = 'B. Firat F'
|
20
|
+
refute with_html_nbsp.ascii_only?
|
21
|
+
items = [with_html_nbsp]
|
22
|
+
Scrapouille::Sanitizer.clean!(items)
|
23
|
+
assert_equal ['B. Firat F'], items, 'Should have removed (\u00A0) char'
|
24
|
+
assert items.first.ascii_only?
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
data/test/test_scraping.rb
CHANGED
@@ -21,6 +21,37 @@ class TestScraping < MiniTest::Unit::TestCase
|
|
21
21
|
results)
|
22
22
|
end
|
23
23
|
|
24
|
+
def test_scrap_each
|
25
|
+
scraper = Scrapouille.configure do
|
26
|
+
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
27
|
+
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
28
|
+
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
29
|
+
Integer(c.sub('#', ''))
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
results = scraper.scrap_each!([
|
34
|
+
File.join(__dir__, 'fixtures', 'tennis-player.html'),
|
35
|
+
File.join(__dir__, 'fixtures', 'other-tennis-player.html')
|
36
|
+
])
|
37
|
+
|
38
|
+
assert Array === results
|
39
|
+
assert_equal({
|
40
|
+
'fullname' => 'Richard Gasquet',
|
41
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
|
42
|
+
'rank' => 21
|
43
|
+
},
|
44
|
+
results[0]
|
45
|
+
)
|
46
|
+
assert_equal({
|
47
|
+
'fullname' => 'Rafael Nadal',
|
48
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg',
|
49
|
+
'rank' => 2
|
50
|
+
},
|
51
|
+
results[1]
|
52
|
+
)
|
53
|
+
end
|
54
|
+
|
24
55
|
def test_scrap_attribute_value
|
25
56
|
scraper = Scrapouille.configure do
|
26
57
|
scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapouille
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- simcap
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -95,12 +95,15 @@ files:
|
|
95
95
|
- Rakefile
|
96
96
|
- bin/scrapouille
|
97
97
|
- lib/scrapouille.rb
|
98
|
+
- lib/scrapouille/sanitizer.rb
|
98
99
|
- lib/scrapouille/scraper.rb
|
99
100
|
- lib/scrapouille/xpath_runner.rb
|
100
101
|
- scrapouille.gemspec
|
102
|
+
- test/fixtures/other-tennis-player.html
|
101
103
|
- test/fixtures/tennis-player.html
|
102
104
|
- test/fixtures/tennis-players-listing.html
|
103
105
|
- test/helper.rb
|
106
|
+
- test/scrapouille/test_sanitizer.rb
|
104
107
|
- test/test_scraping.rb
|
105
108
|
homepage: https://github.com/simcap/scrapouille
|
106
109
|
licenses: []
|
@@ -126,7 +129,9 @@ signing_key:
|
|
126
129
|
specification_version: 4
|
127
130
|
summary: Interactive and declarative XPath driven HTML scraper
|
128
131
|
test_files:
|
132
|
+
- test/fixtures/other-tennis-player.html
|
129
133
|
- test/fixtures/tennis-player.html
|
130
134
|
- test/fixtures/tennis-players-listing.html
|
131
135
|
- test/helper.rb
|
136
|
+
- test/scrapouille/test_sanitizer.rb
|
132
137
|
- test/test_scraping.rb
|