scrapouille 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a8b59829e9883445bc47efc7c0993b08ea024356
4
- data.tar.gz: c5737c0c8985d8276faf2aa107eb7b371a25afab
3
+ metadata.gz: 9ac7508a3cc01e967f6cf32e986578b99f5ddf23
4
+ data.tar.gz: 3bb5e1f56cfbe7c1c5668a387c679f6b8ac4b8ed
5
5
  SHA512:
6
- metadata.gz: 0f569bca9605d73531734c39d96f5cf9d07f0b687027c70b6d4a198808ce91289211cf3e2c36486a104167f42c4d494e79c103b1f0cae25ec95101114b1838bc
7
- data.tar.gz: b1ab5b5d53c130dab864a3daee6368759710107777a10d7f57d5de344df4b7257aae115decdaa7539e5f94776ae280029633f1b3ea80c1dfc13fa973c2eafc74
6
+ metadata.gz: a5a8462493a89d40936cbf7ae3ef85e69c5e36e89581c13ea12bfdd40abb9336c2d056352b67a35d3f88719ddd15f24aa02c67ba9c30b4dd4d3c343d4eec7154
7
+ data.tar.gz: ddf07efb781bb461c4e12e8b467b5cbc04f9bd75d9e07ab7eda3ceb2a4fdc673be3cbe939ede43a168fce6183892e58f9b0c0843b4a172d0e9379a08dacb3111
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrapouille (0.0.5)
4
+ scrapouille (0.0.6)
5
5
  nokogiri
6
6
 
7
7
  GEM
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require 'rake/testtask'
4
4
 
5
5
  Rake::TestTask.new do |t|
6
6
  t.libs << 'test'
7
+ t.test_files = FileList['test/**/test*.rb']
7
8
  end
8
9
 
9
10
  task default: :test
@@ -0,0 +1,16 @@
1
+ module Scrapouille
2
+ module Sanitizer
3
+
4
+ HTML_NBSP_ENTITY = "\u00A0".freeze
5
+
6
+ def self.clean!(items)
7
+ items.map! do |i|
8
+ next if i.nil?
9
+ n = i.gsub(HTML_NBSP_ENTITY, ' ')
10
+ n = n.squeeze(' ')
11
+ n.strip
12
+ end
13
+ end
14
+
15
+ end
16
+ end
@@ -1,5 +1,6 @@
1
1
  require 'open-uri'
2
2
  require_relative 'xpath_runner'
3
+ require_relative 'sanitizer'
3
4
 
4
5
  module Scrapouille
5
6
  class Scraper
@@ -10,15 +11,24 @@ module Scrapouille
10
11
  end
11
12
 
12
13
  def scrap_all(property, xpath_options)
14
+ ensure_valid_definition(property, xpath_options)
13
15
  block = Proc.new if block_given?
14
16
  add_rule(:collect_all, property, xpath_options, block)
15
17
  end
16
18
 
17
19
  def scrap(property, xpath_options)
20
+ ensure_valid_definition(property, xpath_options)
18
21
  block = Proc.new if block_given?
19
22
  add_rule(:collect_unique, property, xpath_options, block)
20
23
  end
21
24
 
25
+ def scrap_each!(uris)
26
+ raise ArgumentError, 'Expecting enumerable as argument' unless uris.respond_to? :map
27
+ uris.map do |uri|
28
+ scrap!(uri)
29
+ end
30
+ end
31
+
22
32
  def scrap!(uri)
23
33
  page = open(uri).read
24
34
 
@@ -43,7 +53,8 @@ module Scrapouille
43
53
  property, xpath, block = rule
44
54
 
45
55
  items = XpathRunner.new(xpath, page).get
46
- sanitize!(items)
56
+
57
+ Sanitizer.clean!(items)
47
58
 
48
59
  items.map! do |i|
49
60
  block.call(i)
@@ -52,13 +63,13 @@ module Scrapouille
52
63
  [property, items]
53
64
  end
54
65
 
55
- def sanitize!(items)
56
- items.map!(&:strip)
66
+ def add_rule(bucket, property, xpath_options, block = nil)
67
+ @rules[bucket] << ([property, xpath_options[:at], block].compact)
57
68
  end
58
69
 
59
- def add_rule(bucket, property, xpath_options, block = nil)
70
+ def ensure_valid_definition(property, xpath_options)
71
+ raise ArgumentError, 'Expecting Hash as second argument for scraping rules' unless Hash === xpath_options
60
72
  raise "Missing 'at:' option for '#{property}'" unless xpath_options[:at]
61
- @rules[bucket] << ([property, xpath_options[:at], block].compact)
62
73
  end
63
74
 
64
75
  end
data/lib/scrapouille.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require_relative 'scrapouille/scraper'
2
+ require_relative 'scrapouille/sanitizer'
2
3
  require_relative 'scrapouille/xpath_runner'
3
4
 
4
5
  module Scrapouille
data/scrapouille.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "scrapouille"
5
- spec.version = "0.0.5"
5
+ spec.version = "0.0.6"
6
6
  spec.authors = ["simcap"]
7
7
  spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
8
8
  spec.description = %q{Interactive and declarative XPath driven HTML scraper}
@@ -0,0 +1,63 @@
1
+ <html>
2
+ <body class="players-profile" >
3
+ <div class="player-bio">
4
+ <div class="player-name">
5
+ <h1 class="left">Rafael Nadal <span>(Spain)</span></h1>
6
+ <div class="right player-flag">
7
+ <div class="frame"></div>
8
+ <img src="http://cdn.tennis.com/statics/assets/images/flags/spain.jpg" border="0">
9
+ </div>
10
+ </div>
11
+ <div id="basic" class="padding-top-10">
12
+ <div class="bg-ranking clearfix">
13
+ <div class="image left">
14
+ <img src="http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg">
15
+ <p>Getty Images</p>
16
+ </div>
17
+ <div class="left bio-detail">
18
+ <div class="info-row clearfix">
19
+ <div class="label left">Birthdate:</div>
20
+ <div class="value left">June 03, 1986 (28 years old)</div>
21
+ </div>
22
+ <div class="info-row clearfix">
23
+ <div class="label left">Birthplace:</div>
24
+ <div class="value left">Manacor, Spain</div>
25
+ </div>
26
+ <div class="info-row clearfix">
27
+ <div class="label left">Residence:</div>
28
+ <div class="value left">Manacor, Spain</div>
29
+ </div>
30
+ <div class="info-row clearfix">
31
+ <div class="label left">Height:</div>
32
+ <div class="value left">6' 1" (185 cm)</div>
33
+ </div>
34
+ <div class="info-row clearfix">
35
+ <div class="label left">Weight:</div>
36
+ <div class="value left">187 lb (85 kg)</div>
37
+ </div>
38
+ <div class="info-row clearfix">
39
+ <div class="label left">Plays:</div>
40
+ <div class="value left">Left-handed</div>
41
+ </div>
42
+ <div class="info-row clearfix">
43
+ <div class="label left">Turned Pro:</div>
44
+ <div class="value left">2000 (15 years on tour)</div>
45
+ </div>
46
+ <div class="info-row clearfix">
47
+ <div class="label left">Official Site:</div>
48
+ <div class="value left"><a href="http://www.rafaelnadal.com/">http://www.rafaelnadal.com/</a></div>
49
+ </div>
50
+ <div class="info-row clearfix">
51
+ <div class="label left">Nicknames:</div>
52
+ <div class="value left">Rafa</div>
53
+ </div>
54
+ </div>
55
+ <div class="right ranking">
56
+ <div class="position">#2</div>
57
+ <a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
58
+ </div>
59
+ </div>
60
+ </div>
61
+ </div>
62
+ </body>
63
+ </html>
@@ -0,0 +1,27 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
+ class TestSanitizer < Minitest::Unit::TestCase
5
+
6
+ def test_special_cases
7
+ items = [nil, '', ' ']
8
+ Scrapouille::Sanitizer.clean!(items)
9
+ assert_equal [nil, '', ''], items
10
+ end
11
+
12
+ def test_squeeze_and_strip_whitespaces
13
+ items = ['1 2 3 4 ', ' a b c d ']
14
+ Scrapouille::Sanitizer.clean!(items)
15
+ assert_equal ['1 2 3 4', 'a b c d'], items
16
+ end
17
+
18
+ def test_replace_html_non_breaking_spaces_by_whitespaces
19
+ with_html_nbsp = 'B. Firat F'
20
+ refute with_html_nbsp.ascii_only?
21
+ items = [with_html_nbsp]
22
+ Scrapouille::Sanitizer.clean!(items)
23
+ assert_equal ['B. Firat F'], items, 'Should have removed &nbsp; (\u00A0) char'
24
+ assert items.first.ascii_only?
25
+ end
26
+
27
+ end
@@ -21,6 +21,37 @@ class TestScraping < MiniTest::Unit::TestCase
21
21
  results)
22
22
  end
23
23
 
24
+ def test_scrap_each
25
+ scraper = Scrapouille.configure do
26
+ scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
27
+ scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
28
+ scrap 'rank', at: "//div[@class='position']/text()" do |c|
29
+ Integer(c.sub('#', ''))
30
+ end
31
+ end
32
+
33
+ results = scraper.scrap_each!([
34
+ File.join(__dir__, 'fixtures', 'tennis-player.html'),
35
+ File.join(__dir__, 'fixtures', 'other-tennis-player.html')
36
+ ])
37
+
38
+ assert Array === results
39
+ assert_equal({
40
+ 'fullname' => 'Richard Gasquet',
41
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
42
+ 'rank' => 21
43
+ },
44
+ results[0]
45
+ )
46
+ assert_equal({
47
+ 'fullname' => 'Rafael Nadal',
48
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg',
49
+ 'rank' => 2
50
+ },
51
+ results[1]
52
+ )
53
+ end
54
+
24
55
  def test_scrap_attribute_value
25
56
  scraper = Scrapouille.configure do
26
57
  scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapouille
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - simcap
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-20 00:00:00.000000000 Z
11
+ date: 2014-09-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -95,12 +95,15 @@ files:
95
95
  - Rakefile
96
96
  - bin/scrapouille
97
97
  - lib/scrapouille.rb
98
+ - lib/scrapouille/sanitizer.rb
98
99
  - lib/scrapouille/scraper.rb
99
100
  - lib/scrapouille/xpath_runner.rb
100
101
  - scrapouille.gemspec
102
+ - test/fixtures/other-tennis-player.html
101
103
  - test/fixtures/tennis-player.html
102
104
  - test/fixtures/tennis-players-listing.html
103
105
  - test/helper.rb
106
+ - test/scrapouille/test_sanitizer.rb
104
107
  - test/test_scraping.rb
105
108
  homepage: https://github.com/simcap/scrapouille
106
109
  licenses: []
@@ -126,7 +129,9 @@ signing_key:
126
129
  specification_version: 4
127
130
  summary: Interactive and declarative XPath driven HTML scraper
128
131
  test_files:
132
+ - test/fixtures/other-tennis-player.html
129
133
  - test/fixtures/tennis-player.html
130
134
  - test/fixtures/tennis-players-listing.html
131
135
  - test/helper.rb
136
+ - test/scrapouille/test_sanitizer.rb
132
137
  - test/test_scraping.rb