scrapouille 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3da429237821d2f8347a236528a5989909a48d9e
4
+ data.tar.gz: 4a751718b909aae9e44acd53534ed0959ec7eb4b
5
+ SHA512:
6
+ metadata.gz: 323db63080ebb8cdf6979c89e0913353ea279317d50a82627e6c0b791788522074bd54ce5deb83de84cc53ae78db430a201b761f4d96a17e6412266d1773f572
7
+ data.tar.gz: 311f25868a32077f97f40892e0cf1aeaa31d36eda50a9ed080d2bffdec7e0affcbdfed7ebb95298410475a9a99113bdda3b577197784239de43fc062fe44c825
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
data/Gemfile.lock ADDED
@@ -0,0 +1,21 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scrapouille (0.0.1)
5
+ nokogiri
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ mini_portile (0.6.0)
11
+ nokogiri (1.6.2.1)
12
+ mini_portile (= 0.6.0)
13
+ rake (10.3.2)
14
+
15
+ PLATFORMS
16
+ ruby
17
+
18
+ DEPENDENCIES
19
+ bundler (~> 1.6)
20
+ rake
21
+ scrapouille!
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 simcap
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ Declarative scraper
2
+
3
+ # Usage
4
+
5
+ Run the test suite:
6
+
7
+ rake
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rake/testtask'
4
+
5
# Test task: puts test/ on the load path and runs every *_spec.rb file
# found anywhere under test/.
Rake::TestTask.new do |test_task|
  test_task.libs.push('test')
  test_task.test_files = FileList['test/**/*_spec.rb']
end

# A bare `rake` invocation runs the test task.
task :default => :test
@@ -0,0 +1,33 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
# Declarative HTML scraper.
#
# Rules are registered inside the constructor block with #scrap; each rule maps
# a property name to an XPath expression. #scrap! then evaluates every rule
# against one page and returns the extracted values in a Hash.
#
#   scraper = Scrapouille.new do
#     scrap 'rank', at: "//div[@class='position']/text()" do |c|
#       Integer(c.sub('#', ''))
#     end
#   end
#   scraper.scrap!('player.html') # => { rank: 21 }
class Scrapouille
  # Matches strings that begin with a URI scheme such as "http://".
  URL_PATTERN = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}

  # Builds a scraper; the block is evaluated in the context of the new
  # instance so that it can call #scrap directly. Without a block the scraper
  # simply starts with no rules.
  def initialize(&block)
    @rules = []
    instance_eval(&block) if block
  end

  # Registers one extraction rule.
  #
  # property      - name of the key in the result (converted with to_sym).
  # xpath_options - Hash that must contain :at, the XPath expression to read.
  # block         - optional converter; receives the stripped text and its
  #                 return value becomes the property value.
  #
  # Raises RuntimeError when the :at option is missing.
  def scrap(property, xpath_options, &block)
    xpath = xpath_options[:at]
    raise "Missing 'at:' option for '#{property}'" unless xpath

    # The block is captured explicitly: a bare `Proc.new` inside a method no
    # longer picks up the caller's block on Ruby 3.0+ (it raises ArgumentError).
    @rules << (block ? [property, xpath, block] : [property, xpath])
  end

  # Loads the page at +uri+ (a URL, a local file path, or any object that
  # responds to #open) and applies every registered rule to it.
  #
  # Returns a Hash of symbolized property name => extracted (and optionally
  # converted) text.
  def scrap!(uri)
    html = Nokogiri::HTML(read_source(uri))
    @rules.each_with_object({}) do |(property, xpath, block), result|
      content = html.xpath(xpath).text.strip
      content = block.call(content) if block
      result[property.to_sym] = content
    end
  end

  private

  # Reads the raw page body and closes the underlying handle afterwards.
  #
  # Kernel#open is deliberately avoided: since Ruby 3.0 it no longer fetches
  # URLs, and on Rubies that support pipe-open it runs a subprocess for
  # strings that start with "|".
  def read_source(uri)
    return uri.open(&:read) if uri.respond_to?(:open)

    location = uri.to_str
    if location =~ URL_PATTERN
      remote = URI.parse(location)
      # Only schemes that open-uri extends (http, https, ftp) respond to #open.
      return remote.open(&:read) if remote.respond_to?(:open)
    end
    File.read(location)
  end
end
@@ -0,0 +1,22 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
# Packaging metadata for the scrapouille gem.
Gem::Specification.new do |spec|
  spec.name          = "scrapouille"
  spec.version       = "0.0.1"
  spec.authors       = ["simcap"]
  spec.summary       = %q{Simple declarative scraper}
  # Kept distinct from the summary; RubyGems warns when the two are identical.
  spec.description   = %q{Declare XPath rules in a small Ruby DSL and scrape them from a web page into a hash.}
  spec.homepage      = "https://github.com/simcap/scrapouille"
  # Matches the LICENSE.txt shipped with the gem.
  spec.license       = "MIT"

  # Package exactly what git tracks; -z/NUL splitting keeps odd file names intact.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,74 @@
1
+ <html>
2
+ <body class="players-profile" >
3
+ <div class="player-bio">
4
+ <div class="player-name">
5
+ <h1 class="left">Richard Gasquet <span>(France)</span></h1>
6
+ <div class="right player-flag">
7
+ <div class="frame"></div>
8
+ <img src="http://cdn.tennis.com/statics/assets/images/flags/france.jpg" border="0" />
9
+ </div>
10
+ </div>
11
+ <div id="basic" class="padding-top-10">
12
+ <div class="bg-ranking clearfix">
13
+ <div class="image left">
14
+ <img src="http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg" />
15
+ <p>Getty Images</p>
16
+ </div>
17
+ <div class="left bio-detail">
18
+
19
+ <div class="info-row clearfix">
20
+ <div class="label left">Birthdate:</div>
21
+ <div class="value left">June 18, 1986 (28 years old)</div>
22
+ </div>
23
+
24
+
25
+ <div class="info-row clearfix">
26
+ <div class="label left">Birthplace:</div>
27
+ <div class="value left">Beziers, France</div>
28
+ </div>
29
+
30
+
31
+ <div class="info-row clearfix">
32
+ <div class="label left">Residence:</div>
33
+ <div class="value left">Neuchatel, Switzerland</div>
34
+ </div>
35
+
36
+
37
+ <div class="info-row clearfix">
38
+ <div class="label left">Height:</div>
39
+ <div class="value left">6&#39; 1&#34; (185 cm)</div>
40
+ </div>
41
+
42
+
43
+ <div class="info-row clearfix">
44
+ <div class="label left">Weight:</div>
45
+ <div class="value left">165 lb (75 kg)</div>
46
+ </div>
47
+
48
+ <div class="info-row clearfix">
49
+ <div class="label left">Plays:</div>
50
+ <div class="value left">Right-handed</div>
51
+ </div>
52
+
53
+ <div class="info-row clearfix">
54
+ <div class="label left">Turned Pro:</div>
55
+ <div class="value left">2002 (13 years on tour)</div>
56
+ </div>
57
+
58
+
59
+ <div class="info-row clearfix">
60
+ <div class="label left">Official Site:</div>
61
+ <div class="value left"><a href="http://www.richardgasquet.net/">http://www.richardgasquet.net/</a></div>
62
+ </div>
63
+
64
+
65
+ </div>
66
+ <div class="right ranking">
67
+ <div class="position">#21</div>
68
+ <a href="/rankings/ATP/">Singles Ranking <span class="link-arrow"></span></a>
69
+ </div>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </body>
74
+ </html>
data/test/helper.rb ADDED
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH << File.join(__dir__, '../lib')
2
+
3
+ require 'minitest/autorun'
4
+ require 'scrapouille'
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
# End-to-end checks for the Scrapouille DSL, run against the saved
# tennis-player HTML fixture.
class ScrapingTest < Minitest::Test

  # Text, attribute and block-converted rules all end up in a single hash
  # keyed by the symbolized property name.
  def test_one_player_scrapping
    scraper = Scrapouille.new do
      scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
      scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
      scrap 'rank', at: "//div[@class='position']/text()" do |c|
        Integer(c.sub('#', ''))
      end
    end

    results = scraper.scrap!(File.join(__dir__, 'fixtures', 'tennis-player.html'))

    assert_equal({
      fullname: 'Richard Gasquet',
      image_url: 'http://cdn.tennis.com/uploads/img/2014/06/12/gasquet/regular.jpg',
      rank: 21
    },
    results)
  end

  # A rule declared without the `at:` option is rejected, and the error
  # message names the offending property.
  def test_raise_when_no_at_options
    error = assert_raises(RuntimeError) do
      Scrapouille.new do
        scrap 'fullname', {}
      end
    end
    # Parentheses avoid Ruby's "ambiguous first argument" regexp warning.
    assert_match(/fullname/, error.message)
  end

end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapouille
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - simcap
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Simpe declarative scrapper
56
+ email:
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - ".gitignore"
62
+ - Gemfile
63
+ - Gemfile.lock
64
+ - LICENSE.txt
65
+ - README.md
66
+ - Rakefile
67
+ - lib/scrapouille.rb
68
+ - scrapouille.gemspec
69
+ - test/fixtures/tennis-player.html
70
+ - test/helper.rb
71
+ - test/scraping_spec.rb
72
+ homepage: https://github.com/simcap/scrapouille
73
+ licenses: []
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.2.2
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Simpe declarative scrapper
95
+ test_files:
96
+ - test/fixtures/tennis-player.html
97
+ - test/helper.rb
98
+ - test/scraping_spec.rb