scrapouille 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/helper.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  $LOAD_PATH << File.join(__dir__, '../lib')
2
2
  require 'scrapouille'
3
3
 
4
+ require 'pry'
4
5
  require 'minitest/autorun'
@@ -2,8 +2,8 @@ require 'helper'
2
2
 
3
3
  class TestScraping < MiniTest::Unit::TestCase
4
4
 
5
- def test_one_player_scrapping
6
- scraper = Scrapouille.new do
5
+ def test_scrap_text
6
+ scraper = Scrapouille.configure do
7
7
  scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
8
8
  scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
9
9
  scrap 'rank', at: "//div[@class='position']/text()" do |c|
@@ -14,16 +14,74 @@ class TestScraping < MiniTest::Unit::TestCase
14
14
  results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-player.html'))
15
15
 
16
16
  assert_equal({
17
- fullname: 'Richard Gasquet',
18
- image_url: 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
19
- rank: 21
17
+ 'fullname' => 'Richard Gasquet',
18
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
19
+ 'rank' => 21
20
20
  },
21
21
  results)
22
22
  end
23
23
 
24
+ def test_scrap_attribute_value
25
+ scraper = Scrapouille.configure do
26
+ scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
27
+ end
28
+
29
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
30
+
31
+ assert_equal(
32
+ { djokovic_picture_src: 'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg' },
33
+ results
34
+ )
35
+ end
36
+
37
+ def test_scrap_all_attributes_value
38
+ scraper = Scrapouille.configure do
39
+ scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
40
+ end
41
+
42
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
43
+
44
+ assert results[:players_hrefs]
45
+ assert results[:players_hrefs].all? {|p| p.start_with? '/player/'}
46
+ assert_equal 119, results[:players_hrefs].count
47
+ assert_equal '/player/532/novak-djokovic/', results[:players_hrefs].first
48
+ end
49
+
50
+ def test_scrap_all_text
51
+ scraper = Scrapouille.configure do
52
+ scrap_all 'players_names', at: "//table[contains(@class, 'ranking-table')]//a[not(child::img)]/text()" do |c|
53
+ c.downcase
54
+ end
55
+ end
56
+
57
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
58
+
59
+ assert results['players_names']
60
+ assert_equal 119, results['players_names'].count
61
+ assert_equal 'stan wawrinka', results['players_names'][3]
62
+ end
63
+
64
+ def test_both_scrap_and_scrap_all
65
+ scraper = Scrapouille.configure do
66
+ scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
67
+ scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
68
+ end
69
+
70
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
71
+
72
+ assert results[:players_hrefs]
73
+ assert_equal 119, results[:players_hrefs].count
74
+
75
+ assert_equal(
76
+ 'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg',
77
+ results[:djokovic_picture_src]
78
+ )
79
+ end
80
+
81
+
24
82
  def test_raise_when_no_at_options
25
83
  error = assert_raises(RuntimeError) do
26
- scraper = Scrapouille.new do
84
+ scraper = Scrapouille.configure do
27
85
  scrap 'fullname', {}
28
86
  end
29
87
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapouille
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - simcap
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-18 00:00:00.000000000 Z
11
+ date: 2014-09-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: minitest
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -81,8 +95,11 @@ files:
81
95
  - Rakefile
82
96
  - bin/scrapouille
83
97
  - lib/scrapouille.rb
98
+ - lib/scrapouille/scraper.rb
99
+ - lib/scrapouille/xpath_runner.rb
84
100
  - scrapouille.gemspec
85
101
  - test/fixtures/tennis-player.html
102
+ - test/fixtures/tennis-players-listing.html
86
103
  - test/helper.rb
87
104
  - test/test_scraping.rb
88
105
  homepage: https://github.com/simcap/scrapouille
@@ -110,5 +127,6 @@ specification_version: 4
110
127
  summary: Interactive and declarative XPath driven HTML scraper
111
128
  test_files:
112
129
  - test/fixtures/tennis-player.html
130
+ - test/fixtures/tennis-players-listing.html
113
131
  - test/helper.rb
114
132
  - test/test_scraping.rb