scrapouille 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -21
- data/Rakefile +0 -1
- data/bin/scrapouille +38 -0
- data/scrapouille.gemspec +4 -3
- data/test/helper.rb +1 -1
- data/test/{scraping_spec.rb → test_scraping.rb} +1 -1
- metadata +23 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9aa9621a90f3ebb7e73aa8d9ea6e46826d78bc7f
|
4
|
+
data.tar.gz: 95f49609599be7100ff1b3ebd05c90ae6364a0b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87ad778ca5d8be90001921f492cce90271a82a64529c26dbfb305fb578eacea0f2214d45d14ecab7731623fe5d600de3a213d7669e0347fb9427ed48ae4d7fc0
|
7
|
+
data.tar.gz: 01d1ec1ee5757269cf47c222472d7f63194189246c0d877a6bd15262bf06473e1642705d62e9baf3cdd056eb2a5255d042ebf630e0dda2652d4ebbe23f65407e
|
data/README.md
CHANGED
@@ -1,35 +1,61 @@
|
|
1
|
-
|
1
|
+
# Scrapouille
|
2
2
|
|
3
|
-
|
3
|
+
Scrapouille is a declarative XPath driven HTML scraper with an interactive mode as a bonus
|
4
|
+
|
5
|
+
Why XPath? XPath is powerful enough to get any data from an HTML document (see http://www.w3schools.com/xpath/xpath_axes.asp)
|
6
|
+
|
7
|
+
Scrapouille runs XPath queries using the **nokogiri** gem
|
8
|
+
|
9
|
+
### Install
|
10
|
+
|
11
|
+
gem install 'scrapouille'
|
4
12
|
|
5
13
|
### Test
|
6
14
|
|
7
15
|
rake
|
8
16
|
|
9
|
-
|
17
|
+
# Usage
|
18
|
+
|
19
|
+
### Interactive mode
|
20
|
+
|
21
|
+
From the command line you can interact with a remote web page as if it were local
|
22
|
+
|
23
|
+
$ scrapouille http://tennis.com/player.html # launch scrapouille on the command line with a provided URI
|
24
|
+
> //div[@class='player-name']/h1/child::text() # You will get a prompt. Enter an XPath query
|
25
|
+
Richard Gasquet # Get the result string
|
26
|
+
|
27
|
+
**Behind the scenes, during the session, the remote web page is stored in a `Tempfile` for fast XPath interaction**
|
28
|
+
|
29
|
+
You can also directly interact with a local file
|
30
|
+
|
31
|
+
$ scrapouille /Users/simon/web/player.html # launch scrapouille on the command line with a provided filepath
|
32
|
+
> //div[@class='player-name']/h1/child::text() # enter your xpath query
|
33
|
+
Richard Gasquet # Get the result string
|
34
|
+
|
35
|
+
### Scraping programmatically
|
10
36
|
|
11
37
|
Define a scraper
|
12
38
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
39
|
+
```ruby
|
40
|
+
scraper = Scrapouille.new do
|
41
|
+
scrap 'fullname', at: "//div[@class='player-name']/h1/child::text()"
|
42
|
+
scrap 'image_url', at: "//div[@id='basic']//img/attribute::src"
|
43
|
+
scrap 'rank', at: "//div[@class='position']/text()" do |c|
|
44
|
+
Integer(c.sub('#', ''))
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
22
48
|
|
23
|
-
Use
|
49
|
+
Use the scraper instance on a URI (as defined by `open-uri`: filepath, http, ...)
|
24
50
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
51
|
+
```ruby
|
52
|
+
results = scraper.scrap!('http://tennis-player.com/richard-gasquet')
|
53
|
+
results['fullname'] # => 'Richard Gasquet'
|
54
|
+
```
|
29
55
|
|
30
|
-
You can
|
56
|
+
You can also run your scraper using a local HTML filepath for testing purposes
|
31
57
|
|
32
|
-
|
33
|
-
|
34
|
-
|
58
|
+
```ruby
|
59
|
+
scraper.scrap!(File.join('..', 'player.html'))
|
60
|
+
```
|
35
61
|
|
data/Rakefile
CHANGED
data/bin/scrapouille
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scrapouille'
|
4
|
+
require 'readline'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
abort 'Missing uri parameter' if ARGV.empty?
|
9
|
+
|
10
|
+
uri = URI(ARGV.shift)
|
11
|
+
|
12
|
+
if uri.scheme && uri.scheme.start_with?('http')
|
13
|
+
file = Tempfile.new('scrapouille-')
|
14
|
+
puts 'Fetching remote content ...'
|
15
|
+
file.write(open(uri).read)
|
16
|
+
uri = file.path
|
17
|
+
puts "Tempfile created at #{file.path}"
|
18
|
+
else
|
19
|
+
path = uri.to_s
|
20
|
+
abort "Cannot find file '#{path}'" unless File.exists?(path)
|
21
|
+
end
|
22
|
+
|
23
|
+
while provided_xpath = Readline.readline("> ", true)
|
24
|
+
scraper = Scrapouille.new
|
25
|
+
to_eval = "scrap :item, at: \"#{provided_xpath}\""
|
26
|
+
scraper.instance_eval(to_eval)
|
27
|
+
begin
|
28
|
+
puts scraper.scrap!(uri.to_s)[:item]
|
29
|
+
rescue => e
|
30
|
+
puts e.message
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
if file
|
35
|
+
puts "Closing and deleting temp file"
|
36
|
+
file.close
|
37
|
+
file.unlink
|
38
|
+
end
|
data/scrapouille.gemspec
CHANGED
@@ -2,10 +2,10 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |spec|
|
4
4
|
spec.name = "scrapouille"
|
5
|
-
spec.version = "0.0.
|
5
|
+
spec.version = "0.0.3"
|
6
6
|
spec.authors = ["simcap"]
|
7
|
-
spec.summary = %q{
|
8
|
-
spec.description = %q{
|
7
|
+
spec.summary = %q{Interactive and declarative XPath driven HTML scraper}
|
8
|
+
spec.description = %q{Interactive and declarative XPath driven HTML scraper}
|
9
9
|
spec.homepage = "https://github.com/simcap/scrapouille"
|
10
10
|
|
11
11
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -17,4 +17,5 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.add_development_dependency "bundler", "~> 1.6"
|
19
19
|
spec.add_development_dependency "rake"
|
20
|
+
spec.add_development_dependency "minitest"
|
20
21
|
end
|
data/test/helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapouille
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- simcap
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,9 +52,24 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
-
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: Interactive and declarative XPath driven HTML scraper
|
56
70
|
email:
|
57
|
-
executables:
|
71
|
+
executables:
|
72
|
+
- scrapouille
|
58
73
|
extensions: []
|
59
74
|
extra_rdoc_files: []
|
60
75
|
files:
|
@@ -64,11 +79,12 @@ files:
|
|
64
79
|
- LICENSE.txt
|
65
80
|
- README.md
|
66
81
|
- Rakefile
|
82
|
+
- bin/scrapouille
|
67
83
|
- lib/scrapouille.rb
|
68
84
|
- scrapouille.gemspec
|
69
85
|
- test/fixtures/tennis-player.html
|
70
86
|
- test/helper.rb
|
71
|
-
- test/
|
87
|
+
- test/test_scraping.rb
|
72
88
|
homepage: https://github.com/simcap/scrapouille
|
73
89
|
licenses: []
|
74
90
|
metadata: {}
|
@@ -91,8 +107,8 @@ rubyforge_project:
|
|
91
107
|
rubygems_version: 2.2.2
|
92
108
|
signing_key:
|
93
109
|
specification_version: 4
|
94
|
-
summary:
|
110
|
+
summary: Interactive and declarative XPath driven HTML scraper
|
95
111
|
test_files:
|
96
112
|
- test/fixtures/tennis-player.html
|
97
113
|
- test/helper.rb
|
98
|
-
- test/
|
114
|
+
- test/test_scraping.rb
|