scrapouille 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +47 -21
- data/Rakefile +0 -1
- data/bin/scrapouille +38 -0
- data/scrapouille.gemspec +4 -3
- data/test/helper.rb +1 -1
- data/test/{scraping_spec.rb → test_scraping.rb} +1 -1
- metadata +23 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9aa9621a90f3ebb7e73aa8d9ea6e46826d78bc7f
|
4
|
+
data.tar.gz: 95f49609599be7100ff1b3ebd05c90ae6364a0b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87ad778ca5d8be90001921f492cce90271a82a64529c26dbfb305fb578eacea0f2214d45d14ecab7731623fe5d600de3a213d7669e0347fb9427ed48ae4d7fc0
|
7
|
+
data.tar.gz: 01d1ec1ee5757269cf47c222472d7f63194189246c0d877a6bd15262bf06473e1642705d62e9baf3cdd056eb2a5255d042ebf630e0dda2652d4ebbe23f65407e
|
data/README.md
CHANGED
@@ -1,35 +1,61 @@
|
|
1
|
-
|
1
|
+
# Scrapouille
|
2
2
|
|
3
|
-
|
3
|
+
Scrapouille is a declarative XPath driven HTML scraper with an interactive mode as a bonus
|
4
|
+
|
5
|
+
Why XPath? XPath is powerful enough to get any data from an HTML document (see http://www.w3schools.com/xpath/xpath_axes.asp)
|
6
|
+
|
7
|
+
Scrapouille runs XPath queries using the **nokogiri** gem
|
8
|
+
|
9
|
+
### Install
|
10
|
+
|
11
|
+
gem install 'scrapouille'
|
4
12
|
|
5
13
|
### Test
|
6
14
|
|
7
15
|
rake
|
8
16
|
|
9
|
-
|
17
|
+
# Usage
|
18
|
+
|
19
|
+
### Interactive mode
|
20
|
+
|
21
|
+
From the command line you can interact with a remote web page as if it were local
|
22
|
+
|
23
|
+
$ scrapouille http://tennis.com/player.html # launch scrapouille on the command line with a provided URI
|
24
|
+
> //div[@class='player-name']/h1/child::text() # You will get a prompt. Enter an XPath query
|
25
|
+
Richard Gasquest # Get the result string
|
26
|
+
|
27
|
+
**Behind the scenes - during the session - the remote web page is stored in a `Tempfile` for fast XPath interaction**
|
28
|
+
|
29
|
+
You can also directly interact with a local file
|
30
|
+
|
31
|
+
$ scrapouille /Users/simon/web/player.html # launch scrapouille on the command line with a provided filepath
|
32
|
+
> //div[@class='player-name']/h1/child::text() # enter your xpath query
|
33
|
+
Richard Gasquest # Get the result String
|
34
|
+
|
35
|
+
### Scraping programmatically
|
10
36
|
|
11
37
|
Define a scraper
|
12
38
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
39
|
+
```ruby
|
40
|
+
scraper = Scrapouille.new do
|
41
|
+
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
42
|
+
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
43
|
+
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
44
|
+
Integer(c.sub('#', ''))
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
22
48
|
|
23
|
-
Use
|
49
|
+
Use the scraper instance on a URI (as defined by `open-uri`: filepath, http, ...)
|
24
50
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
51
|
+
```ruby
|
52
|
+
results = scraper.scrap!('http://tennis-player.com/richard-gasquet')
|
53
|
+
results['fullname'] # => 'Richard Gasquest'
|
54
|
+
```
|
29
55
|
|
30
|
-
You can
|
56
|
+
You can also run your scraper using a local HTML filepath for testing purposes
|
31
57
|
|
32
|
-
|
33
|
-
|
34
|
-
|
58
|
+
```ruby
|
59
|
+
scraper.scrap!(File.join('..', 'player.html'))
|
60
|
+
```
|
35
61
|
|
data/Rakefile
CHANGED
data/bin/scrapouille
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scrapouille'
|
4
|
+
require 'readline'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
abort 'Missing uri parameter' if ARGV.empty?
|
9
|
+
|
10
|
+
uri = URI(ARGV.shift)
|
11
|
+
|
12
|
+
if uri.scheme && uri.scheme.start_with?('http')
|
13
|
+
file = Tempfile.new('scrapouille-')
|
14
|
+
puts 'Fetching remote content ...'
|
15
|
+
file.write(open(uri).read)
|
16
|
+
uri = file.path
|
17
|
+
puts "Tempfile created at #{file.path}"
|
18
|
+
else
|
19
|
+
path = uri.to_s
|
20
|
+
abort "Cannot find file '#{path}'" unless File.exists?(path)
|
21
|
+
end
|
22
|
+
|
23
|
+
while provided_xpath = Readline.readline("> ", true)
|
24
|
+
scraper = Scrapouille.new
|
25
|
+
to_eval = "scrap :item, at: \"#{provided_xpath}\""
|
26
|
+
scraper.instance_eval(to_eval)
|
27
|
+
begin
|
28
|
+
puts scraper.scrap!(uri.to_s)[:item]
|
29
|
+
rescue => e
|
30
|
+
puts e.message
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
if file
|
35
|
+
puts "Closing and deleting temp file"
|
36
|
+
file.close
|
37
|
+
file.unlink
|
38
|
+
end
|
data/scrapouille.gemspec
CHANGED
@@ -2,10 +2,10 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "scrapouille"
|
5
|
-
spec.version = "0.0.
|
5
|
+
spec.version = "0.0.3"
|
6
6
|
spec.authors = ["simcap"]
|
7
|
-
spec.summary = %q{
|
8
|
-
spec.description = %q{
|
7
|
+
spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
|
8
|
+
spec.description = %q{Interactive and declarative XPath driven HTML scraper}
|
9
9
|
spec.homepage = "https://github.com/simcap/scrapouille"
|
10
10
|
|
11
11
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -17,4 +17,5 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.add_development_dependency "bundler", "~> 1.6"
|
19
19
|
spec.add_development_dependency "rake"
|
20
|
+
spec.add_development_dependency "minitest"
|
20
21
|
end
|
data/test/helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapouille
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- simcap
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,9 +52,24 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
-
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: Interactive and declarative XPath driven HTML scraper
|
56
70
|
email:
|
57
|
-
executables:
|
71
|
+
executables:
|
72
|
+
- scrapouille
|
58
73
|
extensions: []
|
59
74
|
extra_rdoc_files: []
|
60
75
|
files:
|
@@ -64,11 +79,12 @@ files:
|
|
64
79
|
- LICENSE.txt
|
65
80
|
- README.md
|
66
81
|
- Rakefile
|
82
|
+
- bin/scrapouille
|
67
83
|
- lib/scrapouille.rb
|
68
84
|
- scrapouille.gemspec
|
69
85
|
- test/fixtures/tennis-player.html
|
70
86
|
- test/helper.rb
|
71
|
-
- test/
|
87
|
+
- test/test_scraping.rb
|
72
88
|
homepage: https://github.com/simcap/scrapouille
|
73
89
|
licenses: []
|
74
90
|
metadata: {}
|
@@ -91,8 +107,8 @@ rubyforge_project:
|
|
91
107
|
rubygems_version: 2.2.2
|
92
108
|
signing_key:
|
93
109
|
specification_version: 4
|
94
|
-
summary:
|
110
|
+
summary: Interactive and declarative XPath driven HTML scraper
|
95
111
|
test_files:
|
96
112
|
- test/fixtures/tennis-player.html
|
97
113
|
- test/helper.rb
|
98
|
-
- test/
|
114
|
+
- test/test_scraping.rb
|