scrapouille 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3da429237821d2f8347a236528a5989909a48d9e
4
+ data.tar.gz: 4a751718b909aae9e44acd53534ed0959ec7eb4b
5
+ SHA512:
6
+ metadata.gz: 323db63080ebb8cdf6979c89e0913353ea279317d50a82627e6c0b791788522074bd54ce5deb83de84cc53ae78db430a201b761f4d96a17e6412266d1773f572
7
+ data.tar.gz: 311f25868a32077f97f40892e0cf1aeaa31d36eda50a9ed080d2bffdec7e0affcbdfed7ebb95298410475a9a99113bdda3b577197784239de43fc062fe44c825
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
data/Gemfile.lock ADDED
@@ -0,0 +1,21 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scrapouille (0.0.1)
5
+ nokogiri
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ mini_portile (0.6.0)
11
+ nokogiri (1.6.2.1)
12
+ mini_portile (= 0.6.0)
13
+ rake (10.3.2)
14
+
15
+ PLATFORMS
16
+ ruby
17
+
18
+ DEPENDENCIES
19
+ bundler (~> 1.6)
20
+ rake
21
+ scrapouille!
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 simcap
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ Declarative scraper
2
+
3
+ # Usage
4
+
5
+ Test
6
+
7
+ rake
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << 'test'
7
+ t.test_files = FileList['test/**/*_spec.rb']
8
+ end
9
+
10
+ task default: :test
@@ -0,0 +1,33 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
# Declarative HTML scraper.
#
# Rules are registered with #scrap inside the block given to the constructor
# and are later applied to a document with #scrap!:
#
#   scraper = Scrapouille.new do
#     scrap 'rank', at: "//div[@class='position']/text()" do |c|
#       Integer(c.sub('#', ''))
#     end
#   end
#   scraper.scrap!('player.html') # => { rank: 21 }
class Scrapouille
  # Matches strings that start with a URI scheme ("http://", "ftp://", ...).
  REMOTE_URI = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
  private_constant :REMOTE_URI

  # Builds a scraper. The optional block is evaluated in the context of the
  # new instance, so bare `scrap ...` calls inside it register rules.
  def initialize(&block)
    @rules = []
    instance_eval(&block) if block
  end

  # Registers one extraction rule.
  #
  # property      - name of the key under which the value is returned.
  # xpath_options - Hash; its :at entry is the XPath expression to evaluate.
  # block         - optional post-processor called with the stripped text;
  #                 its return value becomes the property's value.
  #
  # Raises RuntimeError when the :at option is missing.
  # Returns the list of rules registered so far.
  def scrap(property, xpath_options, &block)
    xpath_string = xpath_options[:at]
    raise "Missing 'at:' option for '#{property}'" unless xpath_string

    rule = [property, xpath_string]
    # Capture the block explicitly: a block-less Proc.new raises on Ruby 3.0+.
    rule << block if block
    @rules << rule
  end

  # Loads the document designated by +uri+ and applies every registered rule.
  #
  # uri - a URL string, a local file path, or any object responding to #read.
  #
  # Returns a Hash mapping each property (as a Symbol) to its extracted value.
  def scrap!(uri)
    html = Nokogiri::HTML(read_source(uri))
    @rules.each_with_object({}) do |(property, xpath, block), result|
      content = html.xpath(xpath).text.strip
      content = block.call(content) if block
      result[property.to_sym] = content
    end
  end

  private

  # Returns the raw markup for +source+ without going through Kernel#open,
  # which would treat a string starting with "|" as a command to run and
  # which no longer opens URLs on Ruby 3.0+.
  def read_source(source)
    if source.respond_to?(:read)
      source.read
    elsif source.to_s =~ REMOTE_URI
      # The block form closes the underlying stream once it has been read.
      OpenURI.open_uri(source.to_s, &:read)
    else
      File.read(source)
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
# Packaging metadata for the scrapouille gem.
Gem::Specification.new do |spec|
  spec.name          = "scrapouille"
  spec.version       = "0.0.1"
  spec.authors       = ["simcap"]
  spec.summary       = %q{Simple declarative scraper}
  spec.description   = %q{Simple declarative scraper}
  spec.homepage      = "https://github.com/simcap/scrapouille"
  # Declares the license whose text ships in LICENSE.txt, so the gem's
  # license list is no longer empty.
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,74 @@
1
+ <html>
2
+ <body class="players-profile" >
3
+ <div class="player-bio">
4
+ <div class="player-name">
5
+ <h1 class="left">Richard Gasquet <span>(France)</span></h1>
6
+ <div class="right player-flag">
7
+ <div class="frame"></div>
8
+ <img src="http://cdn.tennis.com/statics/assets/images/flags/france.jpg" border="0" />
9
+ </div>
10
+ </div>
11
+ <div id="basic" class="padding-top-10">
12
+ <div class="bg-ranking clearfix">
13
+ <div class="image left">
14
+ <img src="http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg" />
15
+ <p>Getty Images</p>
16
+ </div>
17
+ <div class="left bio-detail">
18
+
19
+ <div class="info-row clearfix">
20
+ <div class="label left">Birthdate:</div>
21
+ <div class="value left">June 18, 1986 (28 years old)</div>
22
+ </div>
23
+
24
+
25
+ <div class="info-row clearfix">
26
+ <div class="label left">Birthplace:</div>
27
+ <div class="value left">Beziers, France</div>
28
+ </div>
29
+
30
+
31
+ <div class="info-row clearfix">
32
+ <div class="label left">Residence:</div>
33
+ <div class="value left">Neuchatel, Switzerland</div>
34
+ </div>
35
+
36
+
37
+ <div class="info-row clearfix">
38
+ <div class="label left">Height:</div>
39
+ <div class="value left">6&#39; 1&#34; (185 cm)</div>
40
+ </div>
41
+
42
+
43
+ <div class="info-row clearfix">
44
+ <div class="label left">Weight:</div>
45
+ <div class="value left">165 lb (75 kg)</div>
46
+ </div>
47
+
48
+ <div class="info-row clearfix">
49
+ <div class="label left">Plays:</div>
50
+ <div class="value left">Right-handed</div>
51
+ </div>
52
+
53
+ <div class="info-row clearfix">
54
+ <div class="label left">Turned Pro:</div>
55
+ <div class="value left">2002 (13 years on tour)</div>
56
+ </div>
57
+
58
+
59
+ <div class="info-row clearfix">
60
+ <div class="label left">Official Site:</div>
61
+ <div class="value left"><a href="http://www.richardgasquet.net/">http://www.richardgasquet.net/</a></div>
62
+ </div>
63
+
64
+
65
+ </div>
66
+ <div class="right ranking">
67
+ <div class="position">#21</div>
68
+ <a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
69
+ </div>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </body>
74
+ </html>
data/test/helper.rb ADDED
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH << File.join(__dir__, '../lib')
2
+
3
+ require 'minitest/autorun'
4
+ require 'scrapouille'
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
# End-to-end checks of the Scrapouille DSL against a static HTML fixture.
#
# Minitest 5 names the base class Minitest::Test, and recent releases no
# longer load the MiniTest::Unit::TestCase alias by default; the old name is
# kept as a fallback for Minitest 4.
class ScrapingTest < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)

  # Scrapes three properties from the fixture page, including one that is
  # post-processed by a block.
  def test_one_player_scrapping
    scraper = Scrapouille.new do
      scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
      scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
      scrap 'rank', at: "//div[@class='position']/text()" do |c|
        Integer(c.sub('#', ''))
      end
    end

    results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-player.html'))

    assert_equal({
      fullname: 'Richard Gasquet',
      image_url: 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
      rank: 21
    },
    results)
  end

  # A rule declared without the at: option must fail and name the property.
  def test_raise_when_no_at_options
    error = assert_raises(RuntimeError) do
      Scrapouille.new do
        scrap 'fullname', {}
      end
    end
    # Parenthesised so the regexp literal is not parsed ambiguously.
    assert_match(/fullname/, error.message)
  end

end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapouille
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - simcap
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Simpe declarative scrapper
56
+ email:
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - ".gitignore"
62
+ - Gemfile
63
+ - Gemfile.lock
64
+ - LICENSE.txt
65
+ - README.md
66
+ - Rakefile
67
+ - lib/scrapouille.rb
68
+ - scrapouille.gemspec
69
+ - test/fixtures/tennis-player.html
70
+ - test/helper.rb
71
+ - test/scraping_spec.rb
72
+ homepage: https://github.com/simcap/scrapouille
73
+ licenses: []
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.2.2
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Simpe declarative scrapper
95
+ test_files:
96
+ - test/fixtures/tennis-player.html
97
+ - test/helper.rb
98
+ - test/scraping_spec.rb