scrapula 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/.simplecov +1 -0
- data/CHANGELOG.md +15 -0
- data/CONTRIBUTING.md +0 -0
- data/Gemfile +24 -0
- data/Gemfile.lock +127 -0
- data/Guardfile +12 -0
- data/LICENSE +21 -0
- data/README.md +108 -0
- data/ROADMAP.md +42 -0
- data/Rakefile +30 -0
- data/examples/block_syntax.rb +20 -0
- data/examples/find_nodes.rb +6 -0
- data/examples/get_first_and_scrape_later.rb +13 -0
- data/examples/metas.rb +32 -0
- data/examples/more_api.rb +17 -0
- data/examples/nested_results.rb +14 -0
- data/examples/one_liners.rb +9 -0
- data/examples/posting_data.rb +7 -0
- data/examples/s.rb +24 -0
- data/examples/validation.rb +40 -0
- data/lib/scrapula.rb +47 -0
- data/lib/scrapula/_old_scraper.rb +110 -0
- data/lib/scrapula/agent.rb +8 -0
- data/lib/scrapula/data.rb +18 -0
- data/lib/scrapula/page.rb +109 -0
- data/lib/scrapula/page/meta.rb +74 -0
- data/lib/scrapula/request.rb +44 -0
- data/lib/scrapula/s.rb +21 -0
- data/lib/scrapula/scraper.rb +56 -0
- data/lib/scrapula/version.rb +3 -0
- data/scrapula.gemspec +36 -0
- data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
- data/spec/lib/scrapula/agent_spec.rb +6 -0
- data/spec/lib/scrapula/data_spec.rb +19 -0
- data/spec/lib/scrapula/page/meta_spec.rb +89 -0
- data/spec/lib/scrapula/page_spec.rb +136 -0
- data/spec/lib/scrapula/request_spec.rb +91 -0
- data/spec/lib/scrapula/s_spec.rb +44 -0
- data/spec/lib/scrapula/scraper_spec.rb +205 -0
- data/spec/lib/scrapula_spec.rb +141 -0
- data/spec/spec_helper.rb +26 -0
- metadata +118 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 3d346e08f162bc19ef64bc931986df984fc56aa2
+  data.tar.gz: e64475c7d6cc3cfe5075981e5152931e23423458
+SHA512:
+  metadata.gz: cf881c26643ea5a11fc1a6f2f1a651f9f2321f9a046483586e63366a1dd280246c4ca29e04b74c6f8ddb97655ed4893270b11c7bc95fbd1ed30ddddab85b3900
+  data.tar.gz: f7951f02b12b0affea85431b2a30d5a9b656e97534b89c8ec411be8f5d68cf090f2edf5d4ac8313ef3f2aa9fdfdc8a6f1efb3537043b54fff8b2cc11379b7de4
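For context, the SHA1 and SHA512 digests above cover the `metadata.gz` and `data.tar.gz` members packed inside the `.gem` archive. A minimal, hypothetical sketch of re-checking the SHA512 values (not part of the package; it assumes the archive has already been fetched and unpacked, e.g. `gem fetch scrapula -v 0.6.3 && tar -xf scrapula-0.6.3.gem`):

```ruby
require 'digest'

# Hypothetical verification sketch, assuming metadata.gz and data.tar.gz were
# extracted from scrapula-0.6.3.gem into the current directory.
expected_sha512 = {
  'metadata.gz' => 'cf881c26643ea5a11fc1a6f2f1a651f9f2321f9a046483586e63366a1dd280246c4ca29e04b74c6f8ddb97655ed4893270b11c7bc95fbd1ed30ddddab85b3900',
  'data.tar.gz' => 'f7951f02b12b0affea85431b2a30d5a9b656e97534b89c8ec411be8f5d68cf090f2edf5d4ac8313ef3f2aa9fdfdc8a6f1efb3537043b54fff8b2cc11379b7de4'
}

expected_sha512.each do |member, expected|
  actual = Digest::SHA512.file(member).hexdigest
  puts "#{member}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```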
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
+--color --require spec_helper
data/.simplecov
ADDED
@@ -0,0 +1 @@
+SimpleCov.start
data/CHANGELOG.md
ADDED
@@ -0,0 +1,15 @@
+# CHANGELOG
+
+## 0.6.3 (2015-09-17) Juan A. Martín Lucas <scrapula@jaml.site>
+
+* Published to RubyGems.org.
+* Prepare the `scrapula.gemspec` for publishing to RubyGems.org.
+* Improve the project description in the README.
+
+## 0.6.2 (2015-09-12) Juan A. Martín Lucas <scrapula@jaml.site>
+
+* "S" API shortcuts (e.g: `S.g` => `Scrapula.get`)
+
+## 0.6.1 (2015-09-10) Juan A. Martín Lucas <scrapula@jaml.site>
+
+* Implement `Scraper#respond_to?` tests (for specifying its expected behaviour)
data/CONTRIBUTING.md
ADDED
File without changes
data/Gemfile
ADDED
@@ -0,0 +1,24 @@
+source 'http://rubygems.org'
+
+gemspec
+
+group :development do
+  gem 'rake'
+
+  gem 'guard'
+  gem 'guard-bundler'
+  gem 'guard-rspec'
+end
+
+group :test do
+  gem 'rspec'
+
+  gem 'webmock'
+  gem 'vcr'
+
+  gem 'simplecov'
+end
+
+group :development, :test do
+  gem 'byebug'
+end
data/Gemfile.lock
ADDED
@@ -0,0 +1,127 @@
+PATH
+  remote: .
+  specs:
+    scrapula (0.6.3)
+      mechanize (~> 2.7, >= 2.7.3)
+
+GEM
+  remote: http://rubygems.org/
+  specs:
+    addressable (2.3.8)
+    byebug (5.0.0)
+      columnize (= 0.9.0)
+    celluloid (0.16.0)
+      timers (~> 4.0.0)
+    coderay (1.1.0)
+    columnize (0.9.0)
+    crack (0.4.2)
+      safe_yaml (~> 1.0.0)
+    diff-lcs (1.2.5)
+    docile (1.1.5)
+    domain_name (0.5.24)
+      unf (>= 0.0.5, < 1.0.0)
+    ffi (1.9.8)
+    formatador (0.2.5)
+    guard (2.12.5)
+      formatador (>= 0.2.4)
+      listen (~> 2.7)
+      lumberjack (~> 1.0)
+      nenv (~> 0.1)
+      notiffany (~> 0.0)
+      pry (>= 0.9.12)
+      shellany (~> 0.0)
+      thor (>= 0.18.1)
+    guard-bundler (2.1.0)
+      bundler (~> 1.0)
+      guard (~> 2.2)
+      guard-compat (~> 1.1)
+    guard-compat (1.2.1)
+    guard-rspec (4.5.0)
+      guard (~> 2.1)
+      guard-compat (~> 1.1)
+      rspec (>= 2.99.0, < 4.0)
+    hitimes (1.2.2)
+    http-cookie (1.0.2)
+      domain_name (~> 0.5)
+    json (1.8.2)
+    listen (2.10.0)
+      celluloid (~> 0.16.0)
+      rb-fsevent (>= 0.9.3)
+      rb-inotify (>= 0.9)
+    lumberjack (1.0.9)
+    mechanize (2.7.3)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (~> 2.0)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.4)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    method_source (0.8.2)
+    mime-types (2.6.2)
+    mini_portile (0.6.2)
+    nenv (0.2.0)
+    net-http-digest_auth (1.4)
+    net-http-persistent (2.9.4)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    notiffany (0.0.6)
+      nenv (~> 0.1)
+      shellany (~> 0.0)
+    ntlm-http (0.1.1)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    rake (10.4.2)
+    rb-fsevent (0.9.4)
+    rb-inotify (0.9.5)
+      ffi (>= 0.5.0)
+    rspec (3.2.0)
+      rspec-core (~> 3.2.0)
+      rspec-expectations (~> 3.2.0)
+      rspec-mocks (~> 3.2.0)
+    rspec-core (3.2.3)
+      rspec-support (~> 3.2.0)
+    rspec-expectations (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-mocks (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-support (3.2.2)
+    safe_yaml (1.0.4)
+    shellany (0.0.1)
+    simplecov (0.10.0)
+      docile (~> 1.1.0)
+      json (~> 1.8)
+      simplecov-html (~> 0.10.0)
+    simplecov-html (0.10.0)
+    slop (3.6.0)
+    thor (0.19.1)
+    timers (4.0.1)
+      hitimes
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.1)
+    vcr (2.9.2)
+    webmock (1.17.4)
+      addressable (>= 2.2.7)
+      crack (>= 0.3.2)
+    webrobots (0.1.1)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  byebug
+  guard
+  guard-bundler
+  guard-rspec
+  rake
+  rspec
+  scrapula!
+  simplecov
+  vcr
+  webmock
data/Guardfile
ADDED
@@ -0,0 +1,12 @@
+guard :bundler do
+  watch 'Gemfile'
+end
+
+guard :rspec, cmd: 'rspec --format progress --color -r ./spec/spec_helper.rb', all_on_start: true, all_after_pass: true do
+
+  watch 'spec/spec_helper.rb'
+
+  watch(%r{^lib/(.+)\.rb$}) {|m| "spec/lib/#{m[1]}_spec.rb"}
+  watch(%r{^spec/.+_spec\.rb$})
+
+end
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2012-2015 Juan A. Martín Lucas
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,108 @@
+[
+
+Scrapula
+========
+Scrapula is a library for scraping web pages that simplifies some of the
+common actions involved.
+
+It has a very simple API that can be used in several ways and contexts, and
+a shorter one that makes it easier to process pages when characters are
+scarce, such as in irb / pry sessions or quick and dirty scripts.
+
+Requirements
+============
+It uses [Mechanize](http://mechanize.rubyforge.org/) and [Nokogiri](http://nokogiri.org) to obtain and extract the information, and [RSpec](https://www.relishapp.com/rspec) for testing.
+
+Configuration
+=============
+If you want to show the output of some steps:
+
+```ruby
+Scrapula.verbose = true
+```
+
+API
+===
+
+Perform requests:
+
+```ruby
+page = Scrapula.get 'example.net' #=> Scrapula::Page object
+
+page = Scrapula.post 'example.net', { q: 'a query' } #=> Scrapula::Page object
+```
+
+Extract information from the page:
+
+```ruby
+# Using a CSS selector (all elements)
+page.search! 'a'
+
+# Using a CSS selector (first element)
+page.at! 'h1'
+
+# Using XPath (first element)
+page.at! '//'
+```
+
+Perform a GET request:
+
+```ruby
+Scrapula.get 'example.net'
+```
+
+S interface
+-----------
+This API is not loaded by default, so you have to require it explicitly:
+```ruby
+require 'scrapula/s'
+```
+
+It provides a method and a shortcut for each HTTP verb:
+
+```ruby
+S.get 'example.net'
+S.g 'example.net'
+
+S.post 'example.net'
+S.p 'example.net'
+
+S.put 'example.net'
+S.u 'example.net'
+
+S.patch 'example.net'
+S.a 'example.net'
+
+S.delete 'example.net'
+S.d 'example.net'
+
+S.head 'example.net'
+S.h 'example.net'
+```
+
+Additionally, GET requests can be performed through the shortest invocation:
+
+```ruby
+S 'example.net'
+```
+
+Examples
+--------
+
+There are more examples in the `examples` folder.
+
+Changelog
+=========
+
+You can read about previous changes in `CHANGELOG.md`.
+
+Contributing
+============
+
+Authors
+=======
+Juan A. Martín Lucas (https://github.com/j-a-m-l)
+
+License
+=======
+This project is licensed under the MIT license. See [LICENSE]() for details.
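Putting the README's API together, a minimal usage sketch (hedged: `example.net` and the selectors are placeholders taken from the README above, not tested against a real site):

```ruby
require 'scrapula'
require 'scrapula/s' # optional shortcut interface, as documented above

# Placeholder URL and selectors, following the README's own examples.
page = Scrapula.get 'example.net'   #=> Scrapula::Page object

heading = page.at! 'h1'     # first element matching a CSS selector
links   = page.search! 'a'  # all elements matching a CSS selector

# The same GET request through the "S" shortcuts:
S.get 'example.net'
S.g   'example.net'
```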
data/ROADMAP.md
ADDED
@@ -0,0 +1,42 @@
+# ROADMAP
+
+This is a proposal and should not be interpreted literally. If you want to collaborate on a feature before its planned version, go ahead.
+
+## 3.0.0
+
+* Agents other than Mechanize
+* Uncommon HTTP methods: OPTIONS, CONNECT, TRACE
+
+## 2.0.0
+
+* Page | Scraping schema definition
+* Page | Scraping schema validation
+* Scraping operations: `regex!`
+* Scraping operations: `int!`
+* Scraping operations: `decimal!`
+* Scraping operations: `number!`
+* Scraping operations: `date!`
+* Scraping operations: `time!`
+* Scraping operations: `datetime!`
+* Non-standard behaviour of some known non-standard metas, like og:image
+
+## 1.0.0
+
+* Documented with TODO
+
+## 0.9.0
+
+* Nested block syntax
+* Scraping operations: `attribute!`
+* Scraping operations: `inner_html!` ?
+* Scraping operations: `xhtml!` ?
+* Scraping operations: `title!` ?
+
+## 0.8.0
+
+* Add pending HTTP methods: POST, PUT, PATCH, DELETE and HEAD
+* Request parameters
+
+## 0.7.0
+
+* Timeouts
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
+require 'rspec/core/rake_task'
+
+require './lib/scrapula/version'
+name = 'scrapula'
+version = Scrapula::VERSION
+
+task :default => :coverage
+
+desc "Run specs!"
+RSpec::Core::RakeTask.new do |t|
+  t.rspec_opts = ['--color', '--debug']
+  t.pattern = "./spec/**/*_spec.rb"
+end
+
+# Alias
+task :test => :spec
+
+desc "Happy coverage"
+task :coverage do
+  ENV['COVERAGE'] = 'true'
+  Rake::Task[:spec].execute
+end
+
+task :build do
+  system "gem build #{name}.gemspec"
+end
+
+task :install => :build do
+  system "gem install #{name}-#{version}"
+end
data/examples/block_syntax.rb
ADDED
@@ -0,0 +1,20 @@
+require_relative '../lib/scrapula'
+
+page = Scrapula.get 'reddit.com'
+
+hs1 = page.scrape do
+  h1 'h1'
+  h2 'h2'
+  h3 'h3'
+  h4 'h4'
+  h5 'h5'
+end
+
+# Alternative way
+hs2 = page.scrape
+# TODO
+txt!({ h1: 'h1', h2: 'h2', h3: 'h3', h4: 'h4', h5: 'h5' })
+end
+
+puts hs1
+puts hs2
data/examples/get_first_and_scrape_later.rb
ADDED
@@ -0,0 +1,13 @@
+require_relative '../lib/scrapula'
+
+query = 'ruby scraping'
+page = Scrapula.get 'stackoverflow.com/search', q: query
+
+# Extract the number of results (with newline characters and spaces)
+number_of_results = page.text! '.results-header h2'
+
+# Apply the operation (method) strip to the result
+first_result = page.txt! '.search-result[data-position="1"] .result-link', [:strip]
+
+puts "Stack Overflow returns #{number_of_results.strip} for \"#{query}\""
+puts "Stack Overflow returns \"#{first_result}\" as first result for \"#{query}\""