scrapouille 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/test/helper.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  $LOAD_PATH << File.join(__dir__, '../lib')
2
2
  require 'scrapouille'
3
3
 
4
+ require 'pry'
4
5
  require 'minitest/autorun'
@@ -2,8 +2,8 @@ require 'helper'
2
2
 
3
3
  class TestScraping < MiniTest::Unit::TestCase
4
4
 
5
- def test_one_player_scrapping
6
- scraper = Scrapouille.new do
5
+ def test_scrap_text
6
+ scraper = Scrapouille.configure do
7
7
  scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
8
8
  scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
9
9
  scrap 'rank', at: "//div[@class='position']/text()" do |c|
@@ -14,16 +14,74 @@ class TestScraping < MiniTest::Unit::TestCase
14
14
  results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-player.html'))
15
15
 
16
16
  assert_equal({
17
- fullname: 'Richard Gasquet',
18
- image_url: 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
19
- rank: 21
17
+ 'fullname' => 'Richard Gasquet',
18
+ 'image_url' => 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
19
+ 'rank' => 21
20
20
  },
21
21
  results)
22
22
  end
23
23
 
24
+ def test_scrap_attribute_value
25
+ scraper = Scrapouille.configure do
26
+ scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
27
+ end
28
+
29
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
30
+
31
+ assert_equal(
32
+ { djokovic_picture_src: 'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg' },
33
+ results
34
+ )
35
+ end
36
+
37
+ def test_scrap_all_attributes_value
38
+ scraper = Scrapouille.configure do
39
+ scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
40
+ end
41
+
42
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
43
+
44
+ assert results[:players_hrefs]
45
+ assert results[:players_hrefs].all? {|p| p.start_with? '/player/'}
46
+ assert_equal 119, results[:players_hrefs].count
47
+ assert_equal '/player/532/novak-djokovic/', results[:players_hrefs].first
48
+ end
49
+
50
+ def test_scrap_all_text
51
+ scraper = Scrapouille.configure do
52
+ scrap_all 'players_names', at: "//table[contains(@class, 'ranking-table')]//a[not(child::img)]/text()" do |c|
53
+ c.downcase
54
+ end
55
+ end
56
+
57
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
58
+
59
+ assert results['players_names']
60
+ assert_equal 119, results['players_names'].count
61
+ assert_equal 'stan wawrinka', results['players_names'][3]
62
+ end
63
+
64
+ def test_both_scrap_and_scrap_all
65
+ scraper = Scrapouille.configure do
66
+ scrap :djokovic_picture_src, at: "//img[contains(@src, 'djokovicz')]/@src"
67
+ scrap_all :players_hrefs, at: "//table[contains(@class, 'ranking-table')]//a[child::img]/@href"
68
+ end
69
+
70
+ results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-players-listing.html'))
71
+
72
+ assert results[:players_hrefs]
73
+ assert_equal 119, results[:players_hrefs].count
74
+
75
+ assert_equal(
76
+ 'http://cdn.tennis.com/uploads/img/2014/06/12/djokoviczz/regular.jpg',
77
+ results[:djokovic_picture_src]
78
+ )
79
+ end
80
+
81
+
24
82
  def test_raise_when_no_at_options
25
83
  error = assert_raises(RuntimeError) do
26
- scraper = Scrapouille.new do
84
+ scraper = Scrapouille.configure do
27
85
  scrap 'fullname', {}
28
86
  end
29
87
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapouille
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - simcap
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-18 00:00:00.000000000 Z
11
+ date: 2014-09-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: minitest
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -81,8 +95,11 @@ files:
81
95
  - Rakefile
82
96
  - bin/scrapouille
83
97
  - lib/scrapouille.rb
98
+ - lib/scrapouille/scraper.rb
99
+ - lib/scrapouille/xpath_runner.rb
84
100
  - scrapouille.gemspec
85
101
  - test/fixtures/tennis-player.html
102
+ - test/fixtures/tennis-players-listing.html
86
103
  - test/helper.rb
87
104
  - test/test_scraping.rb
88
105
  homepage: https://github.com/simcap/scrapouille
@@ -110,5 +127,6 @@ specification_version: 4
110
127
  summary: Interactive and declarative XPath driven HTML scraper
111
128
  test_files:
112
129
  - test/fixtures/tennis-player.html
130
+ - test/fixtures/tennis-players-listing.html
113
131
  - test/helper.rb
114
132
  - test/test_scraping.rb