scrapula 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +3 -0
  3. data/.rspec +1 -0
  4. data/.simplecov +1 -0
  5. data/CHANGELOG.md +15 -0
  6. data/CONTRIBUTING.md +0 -0
  7. data/Gemfile +24 -0
  8. data/Gemfile.lock +127 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE +21 -0
  11. data/README.md +108 -0
  12. data/ROADMAP.md +42 -0
  13. data/Rakefile +30 -0
  14. data/examples/block_syntax.rb +20 -0
  15. data/examples/find_nodes.rb +6 -0
  16. data/examples/get_first_and_scrape_later.rb +13 -0
  17. data/examples/metas.rb +32 -0
  18. data/examples/more_api.rb +17 -0
  19. data/examples/nested_results.rb +14 -0
  20. data/examples/one_liners.rb +9 -0
  21. data/examples/posting_data.rb +7 -0
  22. data/examples/s.rb +24 -0
  23. data/examples/validation.rb +40 -0
  24. data/lib/scrapula.rb +47 -0
  25. data/lib/scrapula/_old_scraper.rb +110 -0
  26. data/lib/scrapula/agent.rb +8 -0
  27. data/lib/scrapula/data.rb +18 -0
  28. data/lib/scrapula/page.rb +109 -0
  29. data/lib/scrapula/page/meta.rb +74 -0
  30. data/lib/scrapula/request.rb +44 -0
  31. data/lib/scrapula/s.rb +21 -0
  32. data/lib/scrapula/scraper.rb +56 -0
  33. data/lib/scrapula/version.rb +3 -0
  34. data/scrapula.gemspec +36 -0
  35. data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
  36. data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
  37. data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
  38. data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
  39. data/spec/lib/scrapula/agent_spec.rb +6 -0
  40. data/spec/lib/scrapula/data_spec.rb +19 -0
  41. data/spec/lib/scrapula/page/meta_spec.rb +89 -0
  42. data/spec/lib/scrapula/page_spec.rb +136 -0
  43. data/spec/lib/scrapula/request_spec.rb +91 -0
  44. data/spec/lib/scrapula/s_spec.rb +44 -0
  45. data/spec/lib/scrapula/scraper_spec.rb +205 -0
  46. data/spec/lib/scrapula_spec.rb +141 -0
  47. data/spec/spec_helper.rb +26 -0
  48. metadata +118 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3d346e08f162bc19ef64bc931986df984fc56aa2
4
+ data.tar.gz: e64475c7d6cc3cfe5075981e5152931e23423458
5
+ SHA512:
6
+ metadata.gz: cf881c26643ea5a11fc1a6f2f1a651f9f2321f9a046483586e63366a1dd280246c4ca29e04b74c6f8ddb97655ed4893270b11c7bc95fbd1ed30ddddab85b3900
7
+ data.tar.gz: f7951f02b12b0affea85431b2a30d5a9b656e97534b89c8ec411be8f5d68cf090f2edf5d4ac8313ef3f2aa9fdfdc8a6f1efb3537043b54fff8b2cc11379b7de4
@@ -0,0 +1,3 @@
1
+ coverage/
2
+ tmp/
3
+ *.gem
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color --require spec_helper
@@ -0,0 +1 @@
1
+ SimpleCov.start
@@ -0,0 +1,15 @@
1
+ # CHANGELOG
2
+
3
+ ## 0.6.3 (2015-09-17) Juan A. Martín Lucas <scrapula@jaml.site>
4
+
5
+ * Published to RubyGems.org.
6
+ * Prepare the `scrapula.gemspec` for publishing to RubyGems.org.
7
+ * Improve the project description in the README.
8
+
9
+ ## 0.6.2 (2015-09-12) Juan A. Martín Lucas <scrapula@jaml.site>
10
+
11
+ * "S" API shortcuts (e.g: `S.g` => `Scrapula.get`)
12
+
13
+ ## 0.6.1 (2015-09-10) Juan A. Martín Lucas <scrapula@jaml.site>
14
+
15
+ * Implement `Scraper#respond_to?` tests (for specifying its expected behaviour)
File without changes
data/Gemfile ADDED
@@ -0,0 +1,24 @@
1
+ source 'http://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ group :development do
6
+ gem 'rake'
7
+
8
+ gem 'guard'
9
+ gem 'guard-bundler'
10
+ gem 'guard-rspec'
11
+ end
12
+
13
+ group :test do
14
+ gem 'rspec'
15
+
16
+ gem 'webmock'
17
+ gem 'vcr'
18
+
19
+ gem 'simplecov'
20
+ end
21
+
22
+ group :development, :test do
23
+ gem 'byebug'
24
+ end
@@ -0,0 +1,127 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scrapula (0.6.3)
5
+ mechanize (~> 2.7, >= 2.7.3)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ addressable (2.3.8)
11
+ byebug (5.0.0)
12
+ columnize (= 0.9.0)
13
+ celluloid (0.16.0)
14
+ timers (~> 4.0.0)
15
+ coderay (1.1.0)
16
+ columnize (0.9.0)
17
+ crack (0.4.2)
18
+ safe_yaml (~> 1.0.0)
19
+ diff-lcs (1.2.5)
20
+ docile (1.1.5)
21
+ domain_name (0.5.24)
22
+ unf (>= 0.0.5, < 1.0.0)
23
+ ffi (1.9.8)
24
+ formatador (0.2.5)
25
+ guard (2.12.5)
26
+ formatador (>= 0.2.4)
27
+ listen (~> 2.7)
28
+ lumberjack (~> 1.0)
29
+ nenv (~> 0.1)
30
+ notiffany (~> 0.0)
31
+ pry (>= 0.9.12)
32
+ shellany (~> 0.0)
33
+ thor (>= 0.18.1)
34
+ guard-bundler (2.1.0)
35
+ bundler (~> 1.0)
36
+ guard (~> 2.2)
37
+ guard-compat (~> 1.1)
38
+ guard-compat (1.2.1)
39
+ guard-rspec (4.5.0)
40
+ guard (~> 2.1)
41
+ guard-compat (~> 1.1)
42
+ rspec (>= 2.99.0, < 4.0)
43
+ hitimes (1.2.2)
44
+ http-cookie (1.0.2)
45
+ domain_name (~> 0.5)
46
+ json (1.8.2)
47
+ listen (2.10.0)
48
+ celluloid (~> 0.16.0)
49
+ rb-fsevent (>= 0.9.3)
50
+ rb-inotify (>= 0.9)
51
+ lumberjack (1.0.9)
52
+ mechanize (2.7.3)
53
+ domain_name (~> 0.5, >= 0.5.1)
54
+ http-cookie (~> 1.0)
55
+ mime-types (~> 2.0)
56
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
57
+ net-http-persistent (~> 2.5, >= 2.5.2)
58
+ nokogiri (~> 1.4)
59
+ ntlm-http (~> 0.1, >= 0.1.1)
60
+ webrobots (>= 0.0.9, < 0.2)
61
+ method_source (0.8.2)
62
+ mime-types (2.6.2)
63
+ mini_portile (0.6.2)
64
+ nenv (0.2.0)
65
+ net-http-digest_auth (1.4)
66
+ net-http-persistent (2.9.4)
67
+ nokogiri (1.6.6.2)
68
+ mini_portile (~> 0.6.0)
69
+ notiffany (0.0.6)
70
+ nenv (~> 0.1)
71
+ shellany (~> 0.0)
72
+ ntlm-http (0.1.1)
73
+ pry (0.10.1)
74
+ coderay (~> 1.1.0)
75
+ method_source (~> 0.8.1)
76
+ slop (~> 3.4)
77
+ rake (10.4.2)
78
+ rb-fsevent (0.9.4)
79
+ rb-inotify (0.9.5)
80
+ ffi (>= 0.5.0)
81
+ rspec (3.2.0)
82
+ rspec-core (~> 3.2.0)
83
+ rspec-expectations (~> 3.2.0)
84
+ rspec-mocks (~> 3.2.0)
85
+ rspec-core (3.2.3)
86
+ rspec-support (~> 3.2.0)
87
+ rspec-expectations (3.2.1)
88
+ diff-lcs (>= 1.2.0, < 2.0)
89
+ rspec-support (~> 3.2.0)
90
+ rspec-mocks (3.2.1)
91
+ diff-lcs (>= 1.2.0, < 2.0)
92
+ rspec-support (~> 3.2.0)
93
+ rspec-support (3.2.2)
94
+ safe_yaml (1.0.4)
95
+ shellany (0.0.1)
96
+ simplecov (0.10.0)
97
+ docile (~> 1.1.0)
98
+ json (~> 1.8)
99
+ simplecov-html (~> 0.10.0)
100
+ simplecov-html (0.10.0)
101
+ slop (3.6.0)
102
+ thor (0.19.1)
103
+ timers (4.0.1)
104
+ hitimes
105
+ unf (0.1.4)
106
+ unf_ext
107
+ unf_ext (0.0.7.1)
108
+ vcr (2.9.2)
109
+ webmock (1.17.4)
110
+ addressable (>= 2.2.7)
111
+ crack (>= 0.3.2)
112
+ webrobots (0.1.1)
113
+
114
+ PLATFORMS
115
+ ruby
116
+
117
+ DEPENDENCIES
118
+ byebug
119
+ guard
120
+ guard-bundler
121
+ guard-rspec
122
+ rake
123
+ rspec
124
+ scrapula!
125
+ simplecov
126
+ vcr
127
+ webmock
@@ -0,0 +1,12 @@
1
+ guard :bundler do
2
+ watch 'Gemfile'
3
+ end
4
+
5
+ guard :rspec, cmd: 'rspec --format progress --color -r ./spec/spec_helper.rb', all_on_start: true, all_after_pass: true do
6
+
7
+ watch 'spec/spec_helper.rb'
8
+
9
+ watch(%r{^lib/(.+)\.rb$}) {|m| "spec/lib/#{m[1]}_spec.rb"}
10
+ watch(%r{^spec/.+_spec\.rb$})
11
+
12
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2012-2015 Juan A. Martín Lucas
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ ![Build Status](TODO)
2
+
3
+ Scrapula
4
+ ========
5
+ Scrapula is a library for scraping web pages that simplifies some of the
6
+ common actions that are involved.
7
+
8
+ It has a very simple API that can be used in several ways and contexts, and
9
+ another, shorter one that makes processing pages easier when characters are
10
+ scarce, such as in irb / pry sessions or quick and dirty scripts.
11
+
12
+ Requirements
13
+ ============
14
+ It uses [Mechanize](http://mechanize.rubyforge.org/) and [Nokogiri](http://nokogiri.org) to obtain and extract the information and [RSpec](https://www.relishapp.com/rspec) for testing.
15
+
16
+ Configuration
17
+ =============
18
+ If you want to show the output of some steps:
19
+
20
+ ```ruby
21
+ Scrapula.verbose = true
22
+ ```
23
+
24
+ API
25
+ ===
26
+
27
+ Perform requests:
28
+
29
+ ```ruby
30
+ page = Scrapula.get 'example.net' #=> Scrapula::Page object
31
+
32
+ page = Scrapula.post 'example.net', { q: 'a query' } #=> Scrapula::Page object
33
+ ```
34
+
35
+ Extract information from the page:
36
+
37
+ ```ruby
38
+ # Using a CSS selector (all elements)
39
+ page.search! 'a'
40
+
41
+ # Using a CSS selector (first element)
42
+ page.at! 'h1'
43
+
44
+ # Using XPath (first element)
45
+ page.at! '//'
46
+ ```
47
+
48
+ Perform a GET request:
49
+
50
+ ```ruby
51
+ Scrapula.get 'example.net'
52
+ ```
53
+
54
+ S interface
55
+ -----------
56
+ This API is not required by default, so it is up to you to use it:
57
+ ```ruby
58
+ require 'scrapula/s'
59
+ ```
60
+
61
+ It provides a method and its shortcut for each HTTP verb:
62
+
63
+ ```ruby
64
+ S.get 'example.net'
65
+ S.g 'example.net'
66
+
67
+ S.post 'example.net'
68
+ S.p 'example.net'
69
+
70
+ S.put 'example.net'
71
+ S.u 'example.net'
72
+
73
+ S.patch 'example.net'
74
+ S.a 'example.net'
75
+
76
+ S.delete 'example.net'
77
+ S.d 'example.net'
78
+
79
+ S.head 'example.net'
80
+ S.h 'example.net'
81
+ ```
82
+
83
+ Additionally, GET requests can be performed through the shortest invocation:
84
+
85
+ ```ruby
86
+ S 'example.net'
87
+ ```
88
+
89
+ Examples
90
+ --------
91
+
92
+ There are more examples in the `examples` folder.
93
+
94
+ Changelog
95
+ =========
96
+
97
+ You can read previous changes in `CHANGELOG.md`
98
+
99
+ Contributing
100
+ ============
101
+
102
+ Authors
103
+ =======
104
+ Juan A. Martín Lucas (https://github.com/j-a-m-l)
105
+
106
+ License
107
+ =======
108
+ This project is licensed under the MIT license. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,42 @@
1
+ # ROADMAP
2
+
3
+ This is a proposal and should not be interpreted literally. If you want to contribute to a feature ahead of its planned version, go ahead.
4
+
5
+ ## 3.0.0
6
+
7
+ * Other agents than Mechanize
8
+ * Uncommon HTTP methods: OPTIONS, CONNECT, TRACE
9
+
10
+ ## 2.0.0
11
+
12
+ * Page | Scraping schema definition
13
+ * Page | Scraping schema validation
14
+ * Scraping operations: `regex!`
15
+ * Scraping operations: `int!`
16
+ * Scraping operations: `decimal!`
17
+ * Scraping operations: `number!`
18
+ * Scraping operations: `date!`
19
+ * Scraping operations: `time!`
20
+ * Scraping operations: `datetime!`
21
+ * Non-standard behaviour of some known non-standard metas, like og:image
22
+
23
+ ## 1.0.0
24
+
25
+ * Documented with TODO
26
+
27
+ ## 0.9.0
28
+
29
+ * Nested block syntax
30
+ * Scraping operations: `attribute!`
31
+ * Scraping operations: `inner_html!` ?
32
+ * Scraping operations: `xhtml!` ?
33
+ * Scraping operations: `title!` ?
34
+
35
+ ## 0.8.0
36
+
37
+ * Add pending HTTP methods: POST, PUT, PATCH, DELETE and HEAD
38
+ * Request parameters
39
+
40
+ ## 0.7.0
41
+
42
+ * Timeouts
@@ -0,0 +1,30 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ require './lib/scrapula/version'
4
+ name = 'scrapula'
5
+ version = Scrapula::VERSION
6
+
7
+ task :default => :coverage
8
+
9
+ desc "Run specs!"
10
+ RSpec::Core::RakeTask.new do |t|
11
+ t.rspec_opts = ['--color', '--debug']
12
+ t.pattern = "./spec/**/*_spec.rb"
13
+ end
14
+
15
+ # Alias
16
+ task :test => :spec
17
+
18
+ desc "Happy coverage"
19
+ task :coverage do
20
+ ENV['COVERAGE'] = 'true'
21
+ Rake::Task[:spec].execute
22
+ end
23
+
24
+ task :build do
25
+ system "gem build #{name}.gemspec"
26
+ end
27
+
28
+ task :install => :build do
29
+ system "gem install #{name}-#{version}"
30
+ end
@@ -0,0 +1,20 @@
1
+ require_relative '../lib/scrapula'
2
+
3
+ page = Scrapula.get 'reddit.com'
4
+
5
+ hs1 = page.scrape do
6
+ h1 'h1'
7
+ h2 'h2'
8
+ h3 'h3'
9
+ h4 'h4'
10
+ h5 'h5'
11
+ end
12
+
13
+ # Alternative way
14
+ hs2 = page.scrape
15
+ # TODO
16
+ txt!({ h1: 'h1', h2: 'h2', h3: 'h3', h4: 'h4', h5: 'h5' })
17
+ end
18
+
19
+ puts hs1
20
+ puts hs2
@@ -0,0 +1,6 @@
1
+ require_relative '../lib/scrapula'
2
+
3
+ page = Scrapula.get 'reddit.com'
4
+ h1 = page.at! 'h1'
5
+
6
+ puts h1
@@ -0,0 +1,13 @@
1
+ require_relative '../lib/scrapula'
2
+
3
+ query = 'ruby scraping'
4
+ page = Scrapula.get 'stackoverflow.com/search', q: query
5
+
6
+ # Extract the number of results (with newline characters and spaces)
7
+ number_of_results = page.text! '.results-header h2'
8
+
9
+ # Apply the operation (method) strip to the result
10
+ first_result = page.txt! '.search-result[data-position="1"] .result-link', [:strip]
11
+
12
+ puts "Stack Overflow returns #{number_of_results.strip} for \"#{query}\""
13
+ puts "Stack Overflow returns \"#{first_result}\" as first result for \"#{query}\""