scrapouille 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/Rakefile +1 -0
- data/lib/scrapouille/sanitizer.rb +16 -0
- data/lib/scrapouille/scraper.rb +16 -5
- data/lib/scrapouille.rb +1 -0
- data/scrapouille.gemspec +1 -1
- data/test/fixtures/other-tennis-player.html +63 -0
- data/test/scrapouille/test_sanitizer.rb +27 -0
- data/test/test_scraping.rb +31 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ac7508a3cc01e967f6cf32e986578b99f5ddf23
|
4
|
+
data.tar.gz: 3bb5e1f56cfbe7c1c5668a387c679f6b8ac4b8ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5a8462493a89d40936cbf7ae3ef85e69c5e36e89581c13ea12bfdd40abb9336c2d056352b67a35d3f88719ddd15f24aa02c67ba9c30b4dd4d3c343d4eec7154
|
7
|
+
data.tar.gz: ddf07efb781bb461c4e12e8b467b5cbc04f9bd75d9e07ab7eda3ceb2a4fdc673be3cbe939ede43a168fce6183892e58f9b0c0843b4a172d0e9379a08dacb3111
|
data/Gemfile.lock
CHANGED
data/Rakefile
CHANGED
data/lib/scrapouille/scraper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require_relative 'xpath_runner'
|
3
|
+
require_relative 'sanitizer'
|
3
4
|
|
4
5
|
module Scrapouille
|
5
6
|
class Scraper
|
@@ -10,15 +11,24 @@ module Scrapouille
|
|
10
11
|
end
|
11
12
|
|
12
13
|
def scrap_all(property, xpath_options)
|
14
|
+
ensure_valid_definition(property, xpath_options)
|
13
15
|
block = Proc.new if block_given?
|
14
16
|
add_rule(:collect_all, property, xpath_options, block)
|
15
17
|
end
|
16
18
|
|
17
19
|
def scrap(property, xpath_options)
|
20
|
+
ensure_valid_definition(property, xpath_options)
|
18
21
|
block = Proc.new if block_given?
|
19
22
|
add_rule(:collect_unique, property, xpath_options, block)
|
20
23
|
end
|
21
24
|
|
25
|
+
def scrap_each!(uris)
|
26
|
+
raise ArgumentError, 'Expecting enumerable as argument' unless uris.respond_to? :map
|
27
|
+
uris.map do |uri|
|
28
|
+
scrap!(uri)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
22
32
|
def scrap!(uri)
|
23
33
|
page = open(uri).read
|
24
34
|
|
@@ -43,7 +53,8 @@ module Scrapouille
|
|
43
53
|
property, xpath, block = rule
|
44
54
|
|
45
55
|
items = XpathRunner.new(xpath, page).get
|
46
|
-
|
56
|
+
|
57
|
+
Sanitizer.clean!(items)
|
47
58
|
|
48
59
|
items.map! do |i|
|
49
60
|
block.call(i)
|
@@ -52,13 +63,13 @@ module Scrapouille
|
|
52
63
|
[property, items]
|
53
64
|
end
|
54
65
|
|
55
|
-
def
|
56
|
-
|
66
|
+
def add_rule(bucket, property, xpath_options, block = nil)
|
67
|
+
@rules[bucket] << ([property, xpath_options[:at], block].compact)
|
57
68
|
end
|
58
69
|
|
59
|
-
def
|
70
|
+
def ensure_valid_definition(property, xpath_options)
|
71
|
+
raise ArgumentError, 'Expecting Hash as second argument for scraping rules' unless Hash === xpath_options
|
60
72
|
raise "Missing 'at:' option for '#{property}'" unless xpath_options[:at]
|
61
|
-
@rules[bucket] << ([property, xpath_options[:at], block].compact)
|
62
73
|
end
|
63
74
|
|
64
75
|
end
|
data/lib/scrapouille.rb
CHANGED
data/scrapouille.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "scrapouille"
|
5
|
-
spec.version = "0.0.5"
|
5
|
+
spec.version = "0.0.6"
|
6
6
|
spec.authors = ["simcap"]
|
7
7
|
spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
|
8
8
|
spec.description = %q{Interactive and declarative XPath driven HTML scraper}
|
@@ -0,0 +1,63 @@
|
|
1
|
+
<html>
|
2
|
+
<body class="players-profile" >
|
3
|
+
<div class="player-bio">
|
4
|
+
<div class="player-name">
|
5
|
+
<h1 class="left">Rafael Nadal <span>(Spain)</span></h1>
|
6
|
+
<div class="right player-flag">
|
7
|
+
<div class="frame"></div>
|
8
|
+
<img src="http://cdn.tennis.com/statics/assets/images/flags/spain.jpg" border="0">
|
9
|
+
</div>
|
10
|
+
</div>
|
11
|
+
<div id="basic" class="padding-top-10">
|
12
|
+
<div class="bg-ranking clearfix">
|
13
|
+
<div class="image left">
|
14
|
+
<img src="http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg">
|
15
|
+
<p>Getty Images</p>
|
16
|
+
</div>
|
17
|
+
<div class="left bio-detail">
|
18
|
+
<div class="info-row clearfix">
|
19
|
+
<div class="label left">Birthdate:</div>
|
20
|
+
<div class="value left">June 03, 1986 (28 years old)</div>
|
21
|
+
</div>
|
22
|
+
<div class="info-row clearfix">
|
23
|
+
<div class="label left">Birthplace:</div>
|
24
|
+
<div class="value left">Manacor, Spain</div>
|
25
|
+
</div>
|
26
|
+
<div class="info-row clearfix">
|
27
|
+
<div class="label left">Residence:</div>
|
28
|
+
<div class="value left">Manacor, Spain</div>
|
29
|
+
</div>
|
30
|
+
<div class="info-row clearfix">
|
31
|
+
<div class="label left">Height:</div>
|
32
|
+
<div class="value left">6' 1" (185 cm)</div>
|
33
|
+
</div>
|
34
|
+
<div class="info-row clearfix">
|
35
|
+
<div class="label left">Weight:</div>
|
36
|
+
<div class="value left">187 lb (85 kg)</div>
|
37
|
+
</div>
|
38
|
+
<div class="info-row clearfix">
|
39
|
+
<div class="label left">Plays:</div>
|
40
|
+
<div class="value left">Left-handed</div>
|
41
|
+
</div>
|
42
|
+
<div class="info-row clearfix">
|
43
|
+
<div class="label left">Turned Pro:</div>
|
44
|
+
<div class="value left">2000 (15 years on tour)</div>
|
45
|
+
</div>
|
46
|
+
<div class="info-row clearfix">
|
47
|
+
<div class="label left">Official Site:</div>
|
48
|
+
<div class="value left"><a href="http://www.rafaelnadal.com/">http://www.rafaelnadal.com/</a></div>
|
49
|
+
</div>
|
50
|
+
<div class="info-row clearfix">
|
51
|
+
<div class="label left">Nicknames:</div>
|
52
|
+
<div class="value left">Rafa</div>
|
53
|
+
</div>
|
54
|
+
</div>
|
55
|
+
<div class="right ranking">
|
56
|
+
<div class="position">#2</div>
|
57
|
+
<a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
|
58
|
+
</div>
|
59
|
+
</div>
|
60
|
+
</div>
|
61
|
+
</div>
|
62
|
+
</body>
|
63
|
+
</html>
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
class TestSanitizer < Minitest::Unit::TestCase
|
5
|
+
|
6
|
+
def test_special_cases
|
7
|
+
items = [nil, '', ' ']
|
8
|
+
Scrapouille::Sanitizer.clean!(items)
|
9
|
+
assert_equal [nil, '', ''], items
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_squeeze_and_strip_whitespaces
|
13
|
+
items = ['1 2 3 4 ', ' a b c d ']
|
14
|
+
Scrapouille::Sanitizer.clean!(items)
|
15
|
+
assert_equal ['1 2 3 4', 'a b c d'], items
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_replace_html_non_breaking_spaces_by_whitespaces
|
19
|
+
with_html_nbsp = 'B. Firat F'
|
20
|
+
refute with_html_nbsp.ascii_only?
|
21
|
+
items = [with_html_nbsp]
|
22
|
+
Scrapouille::Sanitizer.clean!(items)
|
23
|
+
assert_equal ['B. Firat F'], items, 'Should have removed (\u00A0) char'
|
24
|
+
assert items.first.ascii_only?
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
data/test/test_scraping.rb
CHANGED
@@ -21,6 +21,37 @@ class TestScraping < MiniTest::Unit::TestCase
|
|
21
21
|
results)
|
22
22
|
end
|
23
23
|
|
24
|
+
def test_scrap_each
|
25
|
+
scraper = Scrapouille.configure do
|
26
|
+
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
27
|
+
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
28
|
+
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
29
|
+
Integer(c.sub('#', ''))
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
results = scraper.scrap_each!([
|
34
|
+
File.join(__dir__, 'fixtures', 'tennis-player.html'),
|
35
|
+
File.join(__dir__, 'fixtures', 'other-tennis-player.html')
|
36
|
+
])
|
37
|
+
|
38
|
+
assert Array === results
|
39
|
+
assert_equal({
|
40
|
+
'fullname' => 'Richard Gasquet',
|
41
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
|
42
|
+
'rank' => 21
|
43
|
+
},
|
44
|
+
results[0]
|
45
|
+
)
|
46
|
+
assert_equal({
|
47
|
+
'fullname' => 'Rafael Nadal',
|
48
|
+
'image_url' => 'http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg',
|
49
|
+
'rank' => 2
|
50
|
+
},
|
51
|
+
results[1]
|
52
|
+
)
|
53
|
+
end
|
54
|
+
|
24
55
|
def test_scrap_attribute_value
|
25
56
|
scraper = Scrapouille.configure do
|
26
57
|
scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapouille
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- simcap
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -95,12 +95,15 @@ files:
|
|
95
95
|
- Rakefile
|
96
96
|
- bin/scrapouille
|
97
97
|
- lib/scrapouille.rb
|
98
|
+
- lib/scrapouille/sanitizer.rb
|
98
99
|
- lib/scrapouille/scraper.rb
|
99
100
|
- lib/scrapouille/xpath_runner.rb
|
100
101
|
- scrapouille.gemspec
|
102
|
+
- test/fixtures/other-tennis-player.html
|
101
103
|
- test/fixtures/tennis-player.html
|
102
104
|
- test/fixtures/tennis-players-listing.html
|
103
105
|
- test/helper.rb
|
106
|
+
- test/scrapouille/test_sanitizer.rb
|
104
107
|
- test/test_scraping.rb
|
105
108
|
homepage: https://github.com/simcap/scrapouille
|
106
109
|
licenses: []
|
@@ -126,7 +129,9 @@ signing_key:
|
|
126
129
|
specification_version: 4
|
127
130
|
summary: Interactive and declarative XPath driven HTML scraper
|
128
131
|
test_files:
|
132
|
+
- test/fixtures/other-tennis-player.html
|
129
133
|
- test/fixtures/tennis-player.html
|
130
134
|
- test/fixtures/tennis-players-listing.html
|
131
135
|
- test/helper.rb
|
136
|
+
- test/scrapouille/test_sanitizer.rb
|
132
137
|
- test/test_scraping.rb
|