scrapouille 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c0edc7fb52550844caa8d189373a577b113462c8
4
- data.tar.gz: 5b76257e9540d9f92e46d502d8346518fba1589b
3
+ metadata.gz: 9aa9621a90f3ebb7e73aa8d9ea6e46826d78bc7f
4
+ data.tar.gz: 95f49609599be7100ff1b3ebd05c90ae6364a0b1
5
5
  SHA512:
6
- metadata.gz: 4b1924dab7efceb32eea3a684e6a42a294d746ed98f8bd30096640e100e4c54419d8f3dfad1af5492a60d19ae1cb77594cb4c28a0f3e8e3f0451a4176e05adeb
7
- data.tar.gz: ca5405c1da48f22add8cb924df2df458d7ab3470b2e017f87ac4d17dde677fe0794ae03751ab172fd41eac6eb623ebc7108c0c5405d63b63b77c45f8f70dcb9d
6
+ metadata.gz: 87ad778ca5d8be90001921f492cce90271a82a64529c26dbfb305fb578eacea0f2214d45d14ecab7731623fe5d600de3a213d7669e0347fb9427ed48ae4d7fc0
7
+ data.tar.gz: 01d1ec1ee5757269cf47c222472d7f63194189246c0d877a6bd15262bf06473e1642705d62e9baf3cdd056eb2a5255d042ebf630e0dda2652d4ebbe23f65407e
data/README.md CHANGED
@@ -1,35 +1,61 @@
1
- Declarative HTML scraper
1
+ # Scrapouille
2
2
 
3
- # Usage
3
+ Scrapouille is a declarative XPath driven HTML scraper with an interactive mode as a bonus
4
+
5
+ Why XPath? XPath is powerful enough to extract any data from an HTML document (see http://www.w3schools.com/xpath/xpath_axes.asp)
6
+
7
+ Scrapouille runs XPath queries using the **nokogiri** gem
8
+
9
+ ### Install
10
+
11
+ gem install 'scrapouille'
4
12
 
5
13
  ### Test
6
14
 
7
15
  rake
8
16
 
9
- ### Scrap
17
+ # Usage
18
+
19
+ ### Interactive mode
20
+
21
+ From the command line you can interact with a remote web page as if it were local
22
+
23
+ $ scrapouille http://tennis.com/player.html # launch scrapouille on the command line with a provided URI
24
+ > //div[@class='player-name']/h1/child::text() # You will get a prompt. Enter an XPath query
25
+ Richard Gasquest # Get the result string
26
+
27
+ **Behind the scenes - during the session - the remote web page is stored in a `Tempfile` for fast XPath interaction**
28
+
29
+ You can also directly interact with a local file
30
+
31
+ $ scrapouille /Users/simon/web/player.html # launch scrapouille on the command line with a provided filepath
32
+ > //div[@class='player-name']/h1/child::text() # enter your xpath query
33
+ Richard Gasquest # Get the result String
34
+
35
+ ### Scraping programmatically
10
36
 
11
37
  Define a scraper
12
38
 
13
- ```ruby
14
- scraper = Scrapouille.new do
15
- scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
16
- scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
17
- scrap 'rank', at: "//div[@class='position']/text()" do |c|
18
- Integer(c.sub('#', ''))
19
- end
20
- end
21
- ```
39
+ ```ruby
40
+ scraper = Scrapouille.new do
41
+ scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
42
+ scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
43
+ scrap 'rank', at: "//div[@class='position']/text()" do |c|
44
+ Integer(c.sub('#', ''))
45
+ end
46
+ end
47
+ ```
22
48
 
23
- Use you scraper instance on an URI (as defined by `open-uri`: filepath, http, ...)
49
+ Use the scraper instance on a URI (as defined by `open-uri`: filepath, http, ...)
24
50
 
25
- ```ruby
26
- results = scraper.scrap!('http://tennis-player.com/richard-gasquet')
27
- results['fullname'] # => 'Richard Gasquest'
28
- ```
51
+ ```ruby
52
+ results = scraper.scrap!('http://tennis-player.com/richard-gasquet')
53
+ results['fullname'] # => 'Richard Gasquest'
54
+ ```
29
55
 
30
- You can test your xpath expression with a local HTML filepath
56
+ You can also run your scraper using a local HTML filepath for testing purposes
31
57
 
32
- ```ruby
33
- scraper.scrap!(File.join('..', 'player.html'))
34
- ```
58
+ ```ruby
59
+ scraper.scrap!(File.join('..', 'player.html'))
60
+ ```
35
61
 
data/Rakefile CHANGED
@@ -4,7 +4,6 @@ require 'rake/testtask'
4
4
 
5
5
  Rake::TestTask.new do |t|
6
6
  t.libs << 'test'
7
- t.test_files = FileList['test/**/*_spec.rb']
8
7
  end
9
8
 
10
9
  task default: :test
data/bin/scrapouille ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scrapouille'
4
+ require 'readline'
5
+ require 'open-uri'
6
+ require 'tempfile'
7
+
8
+ abort 'Missing uri parameter' if ARGV.empty?
9
+
10
+ uri = URI(ARGV.shift)
11
+
12
+ if uri.scheme && uri.scheme.start_with?('http')
13
+ file = Tempfile.new('scrapouille-')
14
+ puts 'Fetching remote content ...'
15
+ file.write(open(uri).read)
16
+ uri = file.path
17
+ puts "Tempfile created at #{file.path}"
18
+ else
19
+ path = uri.to_s
20
+ abort "Cannot find file '#{path}'" unless File.exists?(path)
21
+ end
22
+
23
+ while provided_xpath = Readline.readline("> ", true)
24
+ scraper = Scrapouille.new
25
+ to_eval = "scrap :item, at: \"#{provided_xpath}\""
26
+ scraper.instance_eval(to_eval)
27
+ begin
28
+ puts scraper.scrap!(uri.to_s)[:item]
29
+ rescue => e
30
+ puts e.message
31
+ end
32
+ end
33
+
34
+ if file
35
+ puts "Closing and deleting temp file"
36
+ file.close
37
+ file.unlink
38
+ end
data/scrapouille.gemspec CHANGED
@@ -2,10 +2,10 @@
2
2
 
3
3
  Gem::Specification.new do |spec|
4
4
  spec.name = "scrapouille"
5
- spec.version = "0.0.2"
5
+ spec.version = "0.0.3"
6
6
  spec.authors = ["simcap"]
7
- spec.summary = %q{Simpe declarative HTML scrapper}
8
- spec.description = %q{Simpe declarative HTML scrapper}
7
+ spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
8
+ spec.description = %q{Interactive and declarative XPath driven HTML scraper}
9
9
  spec.homepage = "https://github.com/simcap/scrapouille"
10
10
 
11
11
  spec.files = `git ls-files -z`.split("\x0")
@@ -17,4 +17,5 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  spec.add_development_dependency "bundler", "~> 1.6"
19
19
  spec.add_development_dependency "rake"
20
+ spec.add_development_dependency "minitest"
20
21
  end
data/test/helper.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  $LOAD_PATH << File.join(__dir__, '../lib')
2
+ require 'scrapouille'
2
3
 
3
4
  require 'minitest/autorun'
4
- require 'scrapouille'
@@ -1,6 +1,6 @@
1
1
  require 'helper'
2
2
 
3
- class ScrapingTest < MiniTest::Unit::TestCase
3
+ class TestScraping < MiniTest::Unit::TestCase
4
4
 
5
5
  def test_one_player_scrapping
6
6
  scraper = Scrapouille.new do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapouille
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - simcap
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-17 00:00:00.000000000 Z
11
+ date: 2014-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -52,9 +52,24 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
- description: Simpe declarative HTML scrapper
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Interactive and declarative XPath driven HTML scraper
56
70
  email:
57
- executables: []
71
+ executables:
72
+ - scrapouille
58
73
  extensions: []
59
74
  extra_rdoc_files: []
60
75
  files:
@@ -64,11 +79,12 @@ files:
64
79
  - LICENSE.txt
65
80
  - README.md
66
81
  - Rakefile
82
+ - bin/scrapouille
67
83
  - lib/scrapouille.rb
68
84
  - scrapouille.gemspec
69
85
  - test/fixtures/tennis-player.html
70
86
  - test/helper.rb
71
- - test/scraping_spec.rb
87
+ - test/test_scraping.rb
72
88
  homepage: https://github.com/simcap/scrapouille
73
89
  licenses: []
74
90
  metadata: {}
@@ -91,8 +107,8 @@ rubyforge_project:
91
107
  rubygems_version: 2.2.2
92
108
  signing_key:
93
109
  specification_version: 4
94
- summary: Simpe declarative HTML scrapper
110
+ summary: Interactive and declarative XPath driven HTML scraper
95
111
  test_files:
96
112
  - test/fixtures/tennis-player.html
97
113
  - test/helper.rb
98
- - test/scraping_spec.rb
114
+ - test/test_scraping.rb