scrapouille 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a8b59829e9883445bc47efc7c0993b08ea024356
4
- data.tar.gz: c5737c0c8985d8276faf2aa107eb7b371a25afab
3
+ metadata.gz: 9ac7508a3cc01e967f6cf32e986578b99f5ddf23
4
+ data.tar.gz: 3bb5e1f56cfbe7c1c5668a387c679f6b8ac4b8ed
5
5
  SHA512:
6
- metadata.gz: 0f569bca9605d73531734c39d96f5cf9d07f0b687027c70b6d4a198808ce91289211cf3e2c36486a104167f42c4d494e79c103b1f0cae25ec95101114b1838bc
7
- data.tar.gz: b1ab5b5d53c130dab864a3daee6368759710107777a10d7f57d5de344df4b7257aae115decdaa7539e5f94776ae280029633f1b3ea80c1dfc13fa973c2eafc74
6
+ metadata.gz: a5a8462493a89d40936cbf7ae3ef85e69c5e36e89581c13ea12bfdd40abb9336c2d056352b67a35d3f88719ddd15f24aa02c67ba9c30b4dd4d3c343d4eec7154
7
+ data.tar.gz: ddf07efb781bb461c4e12e8b467b5cbc04f9bd75d9e07ab7eda3ceb2a4fdc673be3cbe939ede43a168fce6183892e58f9b0c0843b4a172d0e9379a08dacb3111
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrapouille (0.0.5)
4
+ scrapouille (0.0.6)
5
5
  nokogiri
6
6
 
7
7
  GEM
data/Rakefile CHANGED
@@ -4,6 +4,7 @@ require 'rake/testtask'
4
4
 
5
5
  Rake::TestTask.new do |t|
6
6
  t.libs << 'test'
7
+ t.test_files = FileList['test/**/test*.rb']
7
8
  end
8
9
 
9
10
  task default: :test
@@ -0,0 +1,16 @@
1
+ module Scrapouille
2
+ module Sanitizer
3
+
4
+ HTML_NBSP_ENTITY = "\u00A0".freeze
5
+
6
+ def self.clean!(items)
7
+ items.map! do |i|
8
+ next if i.nil?
9
+ n = i.gsub(HTML_NBSP_ENTITY, ' ')
10
+ n = n.squeeze(' ')
11
+ n.strip
12
+ end
13
+ end
14
+
15
+ end
16
+ end
@@ -1,5 +1,6 @@
1
1
  require 'open-uri'
2
2
  require_relative 'xpath_runner'
3
+ require_relative 'sanitizer'
3
4
 
4
5
  module Scrapouille
5
6
  class Scraper
@@ -10,15 +11,24 @@ module Scrapouille
10
11
  end
11
12
 
12
13
  def scrap_all(property, xpath_options)
14
+ ensure_valid_definition(property, xpath_options)
13
15
  block = Proc.new if block_given?
14
16
  add_rule(:collect_all, property, xpath_options, block)
15
17
  end
16
18
 
17
19
  def scrap(property, xpath_options)
20
+ ensure_valid_definition(property, xpath_options)
18
21
  block = Proc.new if block_given?
19
22
  add_rule(:collect_unique, property, xpath_options, block)
20
23
  end
21
24
 
25
+ def scrap_each!(uris)
26
+ raise ArgumentError, 'Expecting enumerable as argument' unless uris.respond_to? :map
27
+ uris.map do |uri|
28
+ scrap!(uri)
29
+ end
30
+ end
31
+
22
32
  def scrap!(uri)
23
33
  page = open(uri).read
24
34
 
@@ -43,7 +53,8 @@ module Scrapouille
43
53
  property, xpath, block = rule
44
54
 
45
55
  items = XpathRunner.new(xpath, page).get
46
- sanitize!(items)
56
+
57
+ Sanitizer.clean!(items)
47
58
 
48
59
  items.map! do |i|
49
60
  block.call(i)
@@ -52,13 +63,13 @@ module Scrapouille
52
63
  [property, items]
53
64
  end
54
65
 
55
- def sanitize!(items)
56
- items.map!(&:strip)
66
+ def add_rule(bucket, property, xpath_options, block = nil)
67
+ @rules[bucket] << ([property, xpath_options[:at], block].compact)
57
68
  end
58
69
 
59
- def add_rule(bucket, property, xpath_options, block = nil)
70
+ def ensure_valid_definition(property, xpath_options)
71
+ raise ArgumentError, 'Expecting Hash as second argument for scraping rules' unless Hash === xpath_options
60
72
  raise "Missing 'at:' option for '#{property}'" unless xpath_options[:at]
61
- @rules[bucket] << ([property, xpath_options[:at], block].compact)
62
73
  end
63
74
 
64
75
  end
data/lib/scrapouille.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require_relative 'scrapouille/scraper'
2
+ require_relative 'scrapouille/sanitizer'
2
3
  require_relative 'scrapouille/xpath_runner'
3
4
 
4
5
  module Scrapouille
data/scrapouille.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "scrapouille"
5
- spec.version = "0.0.5"
5
+ spec.version = "0.0.6"
6
6
  spec.authors = ["simcap"]
7
7
  spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
8
8
  spec.description = %q{Interactive and declarative XPath driven HTML scraper}
@@ -0,0 +1,63 @@
1
+ <html>
2
+ <body class="players-profile" >
3
+ <div class="player-bio">
4
+ <div class="player-name">
5
+ <h1 class="left">Rafael Nadal <span>(Spain)</span></h1>
6
+ <div class="right player-flag">
7
+ <div class="frame"></div>
8
+ <img src="http://cdn.tennis.com/statics/assets/images/flags/spain.jpg" border="0">
9
+ </div>
10
+ </div>
11
+ <div id="basic" class="padding-top-10">
12
+ <div class="bg-ranking clearfix">
13
+ <div class="image left">
14
+ <img src="http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg">
15
+ <p>Getty Images</p>
16
+ </div>
17
+ <div class="left bio-detail">
18
+ <div class="info-row clearfix">
19
+ <div class="label left">Birthdate:</div>
20
+ <div class="value left">June 03, 1986 (28 years old)</div>
21
+ </div>
22
+ <div class="info-row clearfix">
23
+ <div class="label left">Birthplace:</div>
24
+ <div class="value left">Manacor, Spain</div>
25
+ </div>
26
+ <div class="info-row clearfix">
27
+ <div class="label left">Residence:</div>
28
+ <div class="value left">Manacor, Spain</div>
29
+ </div>
30
+ <div class="info-row clearfix">
31
+ <div class="label left">Height:</div>
32
+ <div class="value left">6' 1" (185 cm)</div>
33
+ </div>
34
+ <div class="info-row clearfix">
35
+ <div class="label left">Weight:</div>
36
+ <div class="value left">187 lb (85 kg)</div>
37
+ </div>
38
+ <div class="info-row clearfix">
39
+ <div class="label left">Plays:</div>
40
+ <div class="value left">Left-handed</div>
41
+ </div>
42
+ <div class="info-row clearfix">
43
+ <div class="label left">Turned Pro:</div>
44
+ <div class="value left">2000 (15 years on tour)</div>
45
+ </div>
46
+ <div class="info-row clearfix">
47
+ <div class="label left">Official Site:</div>
48
+ <div class="value left"><a href="http://www.rafaelnadal.com/">http://www.rafaelnadal.com/</a></div>
49
+ </div>
50
+ <div class="info-row clearfix">
51
+ <div class="label left">Nicknames:</div>
52
+ <div class="value left">Rafa</div>
53
+ </div>
54
+ </div>
55
+ <div class="right ranking">
56
+ <div class="position">#2</div>
57
+ <a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
58
+ </div>
59
+ </div>
60
+ </div>
61
+ </div>
62
+ </body>
63
+ </html>
@@ -0,0 +1,27 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
+ class TestSanitizer < Minitest::Unit::TestCase
5
+
6
+ def test_special_cases
7
+ items = [nil, '', ' ']
8
+ Scrapouille::Sanitizer.clean!(items)
9
+ assert_equal [nil, '', ''], items
10
+ end
11
+
12
+ def test_squeeze_and_strip_whitespaces
13
+ items = ['1 2 3 4 ', ' a b c d ']
14
+ Scrapouille::Sanitizer.clean!(items)
15
+ assert_equal ['1 2 3 4', 'a b c d'], items
16
+ end
17
+
18
+ def test_replace_html_non_breaking_spaces_by_whitespaces
19
+ with_html_nbsp = 'B. Firat F'
20
+ refute with_html_nbsp.ascii_only?
21
+ items = [with_html_nbsp]
22
+ Scrapouille::Sanitizer.clean!(items)
23
+ assert_equal ['B. Firat F'], items, 'Should have removed &nbsp; (\u00A0) char'
24
+ assert items.first.ascii_only?
25
+ end
26
+
27
+ end
@@ -21,6 +21,37 @@ class TestScraping < MiniTest::Unit::TestCase
21
21
  results)
22
22
  end
23
23
 
24
+ def test_scrap_each
25
+ scraper = Scrapouille.configure do
26
+ scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
27
+ scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
28
+ scrap 'rank', at: "//div[@class='position']/text()" do |c|
29
+ Integer(c.sub('#', ''))
30
+ end
31
+ end
32
+
33
+ results = scraper.scrap_each!([
34
+ File.join(__dir__, 'fixtures', 'tennis-player.html'),
35
+ File.join(__dir__, 'fixtures', 'other-tennis-player.html')
36
+ ])
37
+
38
+ assert Array === results
39
+ assert_equal({
40
+ 'fullname' => 'Richard Gasquet',
41
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
42
+ 'rank' => 21
43
+ },
44
+ results[0]
45
+ )
46
+ assert_equal({
47
+ 'fullname' => 'Rafael Nadal',
48
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/1201/01/01/rnadal/regular.jpg',
49
+ 'rank' => 2
50
+ },
51
+ results[1]
52
+ )
53
+ end
54
+
24
55
  def test_scrap_attribute_value
25
56
  scraper = Scrapouille.configure do
26
57
  scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapouille
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - simcap
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-20 00:00:00.000000000 Z
11
+ date: 2014-09-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -95,12 +95,15 @@ files:
95
95
  - Rakefile
96
96
  - bin/scrapouille
97
97
  - lib/scrapouille.rb
98
+ - lib/scrapouille/sanitizer.rb
98
99
  - lib/scrapouille/scraper.rb
99
100
  - lib/scrapouille/xpath_runner.rb
100
101
  - scrapouille.gemspec
102
+ - test/fixtures/other-tennis-player.html
101
103
  - test/fixtures/tennis-player.html
102
104
  - test/fixtures/tennis-players-listing.html
103
105
  - test/helper.rb
106
+ - test/scrapouille/test_sanitizer.rb
104
107
  - test/test_scraping.rb
105
108
  homepage: https://github.com/simcap/scrapouille
106
109
  licenses: []
@@ -126,7 +129,9 @@ signing_key:
126
129
  specification_version: 4
127
130
  summary: Interactive and declarative XPath driven HTML scraper
128
131
  test_files:
132
+ - test/fixtures/other-tennis-player.html
129
133
  - test/fixtures/tennis-player.html
130
134
  - test/fixtures/tennis-players-listing.html
131
135
  - test/helper.rb
136
+ - test/scrapouille/test_sanitizer.rb
132
137
  - test/test_scraping.rb