grubby 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +127 -0
- data/Rakefile +26 -0
- data/grubby.gemspec +35 -0
- data/lib/grubby.rb +169 -0
- data/lib/grubby/core_ext/string.rb +12 -0
- data/lib/grubby/core_ext/uri.rb +12 -0
- data/lib/grubby/json_parser.rb +45 -0
- data/lib/grubby/json_scraper.rb +13 -0
- data/lib/grubby/log.rb +5 -0
- data/lib/grubby/mechanize/download.rb +8 -0
- data/lib/grubby/mechanize/fetch_with_retry.rb +39 -0
- data/lib/grubby/mechanize/file.rb +8 -0
- data/lib/grubby/mechanize/link.rb +20 -0
- data/lib/grubby/mechanize/page.rb +17 -0
- data/lib/grubby/nokogiri/searchable.rb +27 -0
- data/lib/grubby/page_scraper.rb +13 -0
- data/lib/grubby/scraper.rb +99 -0
- data/lib/grubby/version.rb +3 -0
- metadata +220 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 88c8ecc06ffba254ee9e9de3d42f868c0692b244
|
4
|
+
data.tar.gz: 8cd3445f33c9f7db05550947d686293ceee620c1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d9dc7435763425d54d82f4930935c913cd67eb15a1250c58c7ccf3b0e419eeb6ceb87289ed3c22386faa96e3ae3584a9f39cf8e671af8b56b52bb3e2c8257e4d
|
7
|
+
data.tar.gz: 2c5c96993c8a673274a4acc34c3f82a8719fbea508c1dffb5ce8cff1d4a13cd90c9fcbf8a3c743500a31f08419c687c24adf5f2aee464a8a4f1935b2b302b184
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2017 Jonathan Hefner
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# grubby
|
2
|
+
|
3
|
+
[Fail-fast] web scraping. *grubby* adds a layer of utility and
|
4
|
+
error-checking atop the marvelous [Mechanize gem]. See API summary
|
5
|
+
below, or browse the [full documentation].
|
6
|
+
|
7
|
+
[Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
|
8
|
+
[Mechanize gem]: https://rubygems.org/gems/mechanize
|
9
|
+
[full documentation]: http://www.rubydoc.info/gems/grubby/
|
10
|
+
|
11
|
+
|
12
|
+
## Examples
|
13
|
+
|
14
|
+
The following example scrapes the [Hacker News] front page:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
require "grubby"
|
18
|
+
|
19
|
+
class HackerNews < Grubby::PageScraper
|
20
|
+
|
21
|
+
scrapes(:items) do
|
22
|
+
page.search!(".athing").map{|item| HackerNewsItem.new(item) }
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
class HackerNewsItem < Grubby::Scraper
|
28
|
+
|
29
|
+
scrapes(:title) { @row1.at!(".storylink").text }
|
30
|
+
scrapes(:submitter) { @row2.at!(".hnuser").text }
|
31
|
+
scrapes(:story_uri) { URI.join(@base_uri, @row1.at!(".storylink")["href"]) }
|
32
|
+
scrapes(:comments_uri) { URI.join(@base_uri, @row2.at!(".age a")["href"]) }
|
33
|
+
|
34
|
+
def initialize(source)
|
35
|
+
@row1 = source
|
36
|
+
@row2 = source.next_sibling
|
37
|
+
@base_uri = source.document.url
|
38
|
+
super
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
grubby = Grubby.new
|
44
|
+
|
45
|
+
# The following line will raise an exception if anything goes wrong
|
46
|
+
# during the scraping process. For example, if the structure of the
|
47
|
+
# HTML does not match expectations, either due to a bad assumption or
|
48
|
+
# due to a site-wide change, the script will terminate immediately with
|
49
|
+
# a relevant error message. This prevents bad values from propagating
|
50
|
+
# and causing hard-to-trace errors.
|
51
|
+
hn = HackerNews.new(grubby.get("https://news.ycombinator.com/news"))
|
52
|
+
|
53
|
+
puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
54
|
+
```
|
55
|
+
|
56
|
+
[Hacker News]: https://news.ycombinator.com/news
|
57
|
+
|
58
|
+
|
59
|
+
## Core API
|
60
|
+
|
61
|
+
- [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
|
62
|
+
- [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
63
|
+
- [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
|
64
|
+
- [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
65
|
+
- [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
66
|
+
- [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
|
67
|
+
- [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
|
68
|
+
- [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
|
69
|
+
- [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
|
70
|
+
- [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
71
|
+
- [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
72
|
+
- [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
73
|
+
- [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
74
|
+
- [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
75
|
+
- Nokogiri::XML::Searchable
|
76
|
+
- [#at!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:at%21)
|
77
|
+
- [#search!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:search%21)
|
78
|
+
- Mechanize::Page
|
79
|
+
- [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
80
|
+
- [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
81
|
+
- Mechanize::Page::Link
|
82
|
+
- [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link:to_absolute_uri)
|
83
|
+
|
84
|
+
|
85
|
+
## Supplemental API
|
86
|
+
|
87
|
+
*grubby* uses several gems which extend core Ruby objects with
|
88
|
+
convenience methods. When you import *grubby* you automatically make
|
89
|
+
these methods available. See each gem below for its specific API
|
90
|
+
documentation:
|
91
|
+
|
92
|
+
- [Active Support](https://rubygems.org/gems/activesupport)
|
93
|
+
([docs](http://www.rubydoc.info/gems/activesupport/))
|
94
|
+
- [casual_support](https://rubygems.org/gems/casual_support)
|
95
|
+
([docs](http://www.rubydoc.info/gems/casual_support/))
|
96
|
+
- [gorge](https://rubygems.org/gems/gorge)
|
97
|
+
([docs](http://www.rubydoc.info/gems/gorge/))
|
98
|
+
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
99
|
+
([docs](http://www.rubydoc.info/gems/mini_sanity/))
|
100
|
+
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
101
|
+
([docs](http://www.rubydoc.info/gems/pleasant_path/))
|
102
|
+
|
103
|
+
|
104
|
+
## Installation
|
105
|
+
|
106
|
+
Install from [Ruby Gems](https://rubygems.org/gems/grubby):
|
107
|
+
|
108
|
+
```bash
|
109
|
+
$ gem install grubby
|
110
|
+
```
|
111
|
+
|
112
|
+
Then require in your Ruby script:
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
require "grubby"
|
116
|
+
```
|
117
|
+
|
118
|
+
|
119
|
+
## Contributing
|
120
|
+
|
121
|
+
Run `rake test` to run the tests. You can also run `rake irb` for an
|
122
|
+
interactive prompt that pre-loads the project code.
|
123
|
+
|
124
|
+
|
125
|
+
## License
|
126
|
+
|
127
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/testtask"
|
3
|
+
require "yard"
|
4
|
+
|
5
|
+
|
6
|
+
YARD::Rake::YardocTask.new(:doc) do |t|
|
7
|
+
end
|
8
|
+
|
9
|
+
desc "Launch IRB with this gem pre-loaded"
|
10
|
+
task :irb do
|
11
|
+
# HACK because lib/grubby/version is prematurely loaded by bundler/gem_tasks
|
12
|
+
Object.send(:remove_const, :Grubby)
|
13
|
+
|
14
|
+
require "grubby"
|
15
|
+
require "irb"
|
16
|
+
ARGV.clear
|
17
|
+
IRB.start
|
18
|
+
end
|
19
|
+
|
20
|
+
Rake::TestTask.new(:test) do |t|
|
21
|
+
t.libs << "test"
|
22
|
+
t.libs << "lib"
|
23
|
+
t.test_files = FileList["test/**/*_test.rb"]
|
24
|
+
end
|
25
|
+
|
26
|
+
task :default => :test
|
data/grubby.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "grubby/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "grubby"
|
8
|
+
spec.version = Grubby::VERSION
|
9
|
+
spec.authors = ["Jonathan Hefner"]
|
10
|
+
spec.email = ["jonathan.hefner@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{Fail-fast web scraping}
|
13
|
+
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
17
|
+
f.match(%r{^(test|spec|features)/})
|
18
|
+
end
|
19
|
+
spec.bindir = "exe"
|
20
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
21
|
+
spec.require_paths = ["lib"]
|
22
|
+
|
23
|
+
spec.add_runtime_dependency "activesupport", "~> 5.0"
|
24
|
+
spec.add_runtime_dependency "casual_support", "~> 3.0"
|
25
|
+
spec.add_runtime_dependency "dumb_delimited", "~> 1.0"
|
26
|
+
spec.add_runtime_dependency "gorge", "~> 1.0"
|
27
|
+
spec.add_runtime_dependency "mechanize", "~> 2.7"
|
28
|
+
spec.add_runtime_dependency "mini_sanity", "~> 1.0"
|
29
|
+
spec.add_runtime_dependency "pleasant_path", "~> 1.1"
|
30
|
+
|
31
|
+
spec.add_development_dependency "bundler", "~> 1.15"
|
32
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
33
|
+
spec.add_development_dependency "minitest", "~> 5.0"
|
34
|
+
spec.add_development_dependency "yard", "~> 0.9"
|
35
|
+
end
|
data/lib/grubby.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
require "active_support/all"
|
2
|
+
require "casual_support"
|
3
|
+
require "dumb_delimited"
|
4
|
+
require "gorge"
|
5
|
+
require "mechanize"
|
6
|
+
require "mini_sanity"
|
7
|
+
require "pleasant_path"
|
8
|
+
|
9
|
+
require_relative "grubby/log"
|
10
|
+
|
11
|
+
require_relative "grubby/core_ext/string"
|
12
|
+
require_relative "grubby/core_ext/uri"
|
13
|
+
require_relative "grubby/mechanize/fetch_with_retry"
|
14
|
+
require_relative "grubby/mechanize/download"
|
15
|
+
require_relative "grubby/mechanize/file"
|
16
|
+
require_relative "grubby/mechanize/link"
|
17
|
+
require_relative "grubby/mechanize/page"
|
18
|
+
require_relative "grubby/nokogiri/searchable"
|
19
|
+
|
20
|
+
|
21
|
+
class Grubby < Mechanize
|
22
|
+
|
23
|
+
# @return [Integer, Float, Range<Integer>, Range<Float>]
|
24
|
+
# The enforced minimum amount of time to wait between requests, in
|
25
|
+
# seconds. If the value is a Range, a random number within the
|
26
|
+
# Range is chosen for each request.
|
27
|
+
attr_accessor :time_between_requests
|
28
|
+
|
29
|
+
# @param singleton_journal [Pathname, String]
|
30
|
+
# Optional journal file to persist the list of resources processed
|
31
|
+
# by {singleton}. Useful to ensure only-once processing across
|
32
|
+
# multiple program runs.
|
33
|
+
def initialize(singleton_journal = nil)
|
34
|
+
super()
|
35
|
+
|
36
|
+
# Prevent "memory leaks", and prevent mistakenly blank urls from
|
37
|
+
# resolving. (Blank urls resolve as a path relative to the last
|
38
|
+
# history entry. Without this setting, an erroneous `agent.get("")`
|
39
|
+
# could sometimes successfully fetch a page.)
|
40
|
+
self.max_history = 0
|
41
|
+
|
42
|
+
# Prevent files of unforeseen content type from being buffered into
|
43
|
+
# memory by default, in case they are very large. However, increase
|
44
|
+
# the threshold for what is considered "large", to prevent
|
45
|
+
# unnecessary writes to disk.
|
46
|
+
#
|
47
|
+
# References:
|
48
|
+
# - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
|
49
|
+
# - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
|
50
|
+
# - http://docs.seattlerb.org/mechanize/Mechanize/File.html
|
51
|
+
self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
|
52
|
+
self.pluggable_parser.default = Mechanize::Download
|
53
|
+
self.pluggable_parser["text/plain"] = Mechanize::File
|
54
|
+
self.pluggable_parser["application/json"] = Grubby::JsonParser
|
55
|
+
|
56
|
+
# Set up configurable rate limiting, and choose a reasonable default
|
57
|
+
# rate limit.
|
58
|
+
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
59
|
+
self.time_between_requests = 1.0
|
60
|
+
|
61
|
+
@journal = singleton_journal ?
|
62
|
+
singleton_journal.to_pathname.touch_file : Pathname::NULL
|
63
|
+
@seen = SingletonKey.parse_file(@journal).
|
64
|
+
group_by(&:purpose).transform_values{|sks| sks.map(&:key).index_to{ true } }
|
65
|
+
end
|
66
|
+
|
67
|
+
# Calls +#get+ with each of +mirror_uris+ until a successful
|
68
|
+
# ("200 OK") response is received, and returns that +#get+ result.
|
69
|
+
# Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
|
70
|
+
# the last mirror.
|
71
|
+
#
|
72
|
+
# @param mirror_uris [Array<String>]
|
73
|
+
# @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
74
|
+
# @raise [Mechanize::ResponseCodeError]
|
75
|
+
# if all +mirror_uris+ fail
|
76
|
+
def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
|
77
|
+
i = 0
|
78
|
+
begin
|
79
|
+
get(mirror_uris[i], parameters, referer, headers)
|
80
|
+
rescue Mechanize::ResponseCodeError => e
|
81
|
+
i += 1
|
82
|
+
if i >= mirror_uris.length
|
83
|
+
raise
|
84
|
+
else
|
85
|
+
$log.info("Mirror failed with response code #{e.response_code}: #{mirror_uris[i - 1]}")
|
86
|
+
$log.debug("Trying next mirror: #{mirror_uris[i]}")
|
87
|
+
retry
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Ensures only-once processing of the resource indicated by +target+
|
93
|
+
# for the specified +purpose+. A list of previously-processed
|
94
|
+
# resource URIs and content hashes is maintained in the Grubby
|
95
|
+
# instance. The given block is called with the fetched resource only
|
96
|
+
# if the resource's URI and the resource's content hash have not been
|
97
|
+
# previously processed under the specified +purpose+.
|
98
|
+
#
|
99
|
+
# @param target [URI, String, Mechanize::Page::Link, #to_absolute_uri]
|
100
|
+
# designates the resource to fetch
|
101
|
+
# @param purpose [String]
|
102
|
+
# the purpose of processing the resource
|
103
|
+
# @yield [resource]
|
104
|
+
# processes the resource
|
105
|
+
# @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
106
|
+
# the fetched resource
|
107
|
+
# @return [Boolean]
|
108
|
+
# whether the given block was called
|
109
|
+
# @raise [Mechanize::ResponseCodeError]
|
110
|
+
# if fetching the resource results in error (see +Mechanize#get+)
|
111
|
+
def singleton(target, purpose = "")
|
112
|
+
series = []
|
113
|
+
|
114
|
+
original_url = target.to_absolute_uri
|
115
|
+
return if skip_singleton?(purpose, original_url.to_s, series)
|
116
|
+
|
117
|
+
url = normalize_url(original_url)
|
118
|
+
return if skip_singleton?(purpose, url.to_s, series)
|
119
|
+
|
120
|
+
$log.info("Fetching #{url}")
|
121
|
+
resource = get(url)
|
122
|
+
skip = skip_singleton?(purpose, resource.uri.to_s, series) |
|
123
|
+
skip_singleton?(purpose, "content hash: #{resource.content_hash}", series)
|
124
|
+
|
125
|
+
yield resource unless skip
|
126
|
+
|
127
|
+
series.map{|k| SingletonKey.new(purpose, k) }.append_to_file(@journal)
|
128
|
+
|
129
|
+
!skip
|
130
|
+
end
|
131
|
+
|
132
|
+
|
133
|
+
private
|
134
|
+
|
135
|
+
SingletonKey = DumbDelimited[:purpose, :key]
|
136
|
+
|
137
|
+
def skip_singleton?(purpose, key, series)
|
138
|
+
return false if series.include?(key)
|
139
|
+
series << key
|
140
|
+
already = (@seen[purpose.to_s] ||= {}).displace(key, true)
|
141
|
+
$log.info("Skipping #{series.first} (already seen #{series.last})") if already
|
142
|
+
already
|
143
|
+
end
|
144
|
+
|
145
|
+
def normalize_url(url)
|
146
|
+
url = url.dup
|
147
|
+
$log.warn("Discarding fragment in URL: #{url}") if url.fragment
|
148
|
+
url.fragment = nil
|
149
|
+
url.path = url.path.chomp("/")
|
150
|
+
url
|
151
|
+
end
|
152
|
+
|
153
|
+
def sleep_between_requests
|
154
|
+
@last_request_at ||= 0.0
|
155
|
+
delay_duration = @time_between_requests.is_a?(Range) ?
|
156
|
+
rand(@time_between_requests) : @time_between_requests
|
157
|
+
sleep_duration = @last_request_at + delay_duration - Time.now.to_f
|
158
|
+
sleep(sleep_duration) if sleep_duration > 0
|
159
|
+
@last_request_at = Time.now.to_f
|
160
|
+
end
|
161
|
+
|
162
|
+
end
|
163
|
+
|
164
|
+
|
165
|
+
require_relative "grubby/version"
|
166
|
+
require_relative "grubby/json_parser"
|
167
|
+
require_relative "grubby/scraper"
|
168
|
+
require_relative "grubby/page_scraper"
|
169
|
+
require_relative "grubby/json_scraper"
|
@@ -0,0 +1,12 @@
|
|
1
|
+
class String
|
2
|
+
|
3
|
+
# Constructs a URI from the String. Raises an exception if the String
|
4
|
+
# does not denote an absolute URI.
|
5
|
+
#
|
6
|
+
# @return [URI]
|
7
|
+
# @raise [RuntimeError] if the String does not denote an absolute URI
|
8
|
+
def to_absolute_uri
|
9
|
+
URI(self).to_absolute_uri
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
class Grubby::JsonParser < Mechanize::File
|
2
|
+
|
3
|
+
# Returns the options to use when parsing JSON. The returned options
|
4
|
+
# Hash is not +dup+ed and can be modified directly. Any modifications
|
5
|
+
# will be applied to all future parsing.
|
6
|
+
#
|
7
|
+
# For information about available options, see
|
8
|
+
# {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
|
9
|
+
# +JSON.parse+}.
|
10
|
+
#
|
11
|
+
# @return [Hash]
|
12
|
+
def self.json_parse_options
|
13
|
+
@json_parse_options ||= {
|
14
|
+
max_nesting: false,
|
15
|
+
allow_nan: false,
|
16
|
+
symbolize_names: false,
|
17
|
+
create_additions: false,
|
18
|
+
object_class: Hash,
|
19
|
+
array_class: Array,
|
20
|
+
}
|
21
|
+
end
|
22
|
+
|
23
|
+
# Sets the options to use when parsing JSON. The entire options Hash
|
24
|
+
# is replaced, and the new value will be applied to all future
|
25
|
+
# parsing. To set options individually, see {json_parse_options}.
|
26
|
+
#
|
27
|
+
# For information about available options, see
|
28
|
+
# {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
|
29
|
+
# +JSON.parse+}.
|
30
|
+
#
|
31
|
+
# @param options [Hash]
|
32
|
+
def self.json_parse_options=(options)
|
33
|
+
@json_parse_options = options
|
34
|
+
end
|
35
|
+
|
36
|
+
# @return [Hash, Array]
|
37
|
+
# The parsed JSON data.
|
38
|
+
attr_reader :json
|
39
|
+
|
40
|
+
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
41
|
+
@json = body && JSON.parse(body, self.class.json_parse_options)
|
42
|
+
super
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class Grubby::JsonScraper < Grubby::Scraper
|
2
|
+
|
3
|
+
# @return [Hash, Array]
|
4
|
+
# The parsed JSON data being scraped.
|
5
|
+
attr_reader :json
|
6
|
+
|
7
|
+
# @param source [Grubby::JsonParser]
|
8
|
+
def initialize(source)
|
9
|
+
@json = source.assert_kind_of!(Grubby::JsonParser).json
|
10
|
+
super
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
data/lib/grubby/log.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# This monkey patch attempts to fix the insidious "too many connection
|
2
|
+
# resets" bug described here: https://github.com/sparklemotion/mechanize/issues/123
|
3
|
+
#
|
4
|
+
# The code is taken and modified from this helpful blog article:
|
5
|
+
# http://scottwb.com/blog/2013/11/09/defeating-the-infamous-mechanize-too-many-connection-resets-bug/
|
6
|
+
class Mechanize::HTTP::Agent
|
7
|
+
|
8
|
+
MAX_CONNECTION_RESET_RETRIES = 9
|
9
|
+
IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
|
10
|
+
|
11
|
+
# Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
|
12
|
+
# connection resets" error is encountered, this method shuts down the
|
13
|
+
# persistent HTTP connection, and then retries the request (up to
|
14
|
+
# {MAX_CONNECTION_RESET_RETRIES} times).
|
15
|
+
def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
|
16
|
+
retry_count = 0
|
17
|
+
begin
|
18
|
+
fetch_without_retry(uri, http_method, headers, params, referer, redirects)
|
19
|
+
rescue Net::HTTP::Persistent::Error => e
|
20
|
+
# raise if different type of error
|
21
|
+
raise unless e.message.include?("too many connection resets")
|
22
|
+
# raise if non-idempotent http method
|
23
|
+
raise unless IDEMPOTENT_HTTP_METHODS.include?(http_method)
|
24
|
+
# raise if we've tried too many times
|
25
|
+
raise if retry_count >= MAX_CONNECTION_RESET_RETRIES
|
26
|
+
|
27
|
+
# otherwise, shutdown the persistent HTTP connection and try again
|
28
|
+
retry_count += 1
|
29
|
+
$log.warn("Possible connection reset bug. Retry(#{retry_count}) #{http_method.to_s.upcase} #{uri}")
|
30
|
+
self.http.shutdown
|
31
|
+
sleep(retry_count) # incremental backoff in case problem is with server
|
32
|
+
retry
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
alias_method :fetch_without_retry, :fetch
|
37
|
+
alias_method :fetch, :fetch_with_retry
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
class Mechanize::Page::Link
|
2
|
+
|
3
|
+
# Returns the URI represented by the Link, in absolute form. If the
|
4
|
+
# href attribute of the Link is expressed in relative form, the URI of
|
5
|
+
# the Link's Page is used to convert to absolute form.
|
6
|
+
#
|
7
|
+
# @return [URI]
|
8
|
+
def to_absolute_uri
|
9
|
+
# Via the W3 spec: "If the a element has no href attribute, then the
|
10
|
+
# element represents a placeholder for where a link might otherwise
|
11
|
+
# have been placed, if it had been relevant, consisting of just the
|
12
|
+
# element's contents."[1] So, we assume a link with no href
|
13
|
+
# attribute (i.e. `uri == nil`) should be treated the same as an
|
14
|
+
# intra-page link.
|
15
|
+
#
|
16
|
+
# [1]: https://www.w3.org/TR/2016/REC-html51-20161101/textlevel-semantics.html#the-a-element
|
17
|
+
URI.join(self.page.uri, self.uri || "#").to_absolute_uri
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Mechanize::Page
|
2
|
+
|
3
|
+
# @!method search!(*queries)
|
4
|
+
# See {::Nokogiri::XML::Searchable#search!}.
|
5
|
+
#
|
6
|
+
# @param queries [Array<String>]
|
7
|
+
# @return [Array<Nokogiri::XML::Element>]
|
8
|
+
def_delegators :parser, :search!
|
9
|
+
|
10
|
+
# @!method at!(*queries)
|
11
|
+
# See {::Nokogiri::XML::Searchable#at!}.
|
12
|
+
#
|
13
|
+
# @param queries [Array<String>]
|
14
|
+
# @return [Nokogiri::XML::Element]
|
15
|
+
def_delegators :parser, :at!
|
16
|
+
|
17
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Nokogiri::XML::Searchable
|
2
|
+
|
3
|
+
# Searches the node using the given XPath or CSS queries, and returns
|
4
|
+
# the results. Raises an exception if there are no results. See also
|
5
|
+
# +#search+.
|
6
|
+
#
|
7
|
+
# @param queries [Array<String>]
|
8
|
+
# @return [Array<Nokogiri::XML::Element>]
|
9
|
+
# @raise [RuntimeError] if queries yield no results
|
10
|
+
def search!(*queries)
|
11
|
+
results = search(*queries)
|
12
|
+
raise "No elements matching #{queries.map(&:inspect).join(" OR ")}" if results.empty?
|
13
|
+
results
|
14
|
+
end
|
15
|
+
|
16
|
+
# Searches the node using the given XPath or CSS queries, and returns
|
17
|
+
# only the first result. Raises an exception if there are no results.
|
18
|
+
# See also +#at+.
|
19
|
+
#
|
20
|
+
# @param queries [Array<String>]
|
21
|
+
# @return [Nokogiri::XML::Element]
|
22
|
+
# @raise [RuntimeError] if queries yield no results
|
23
|
+
def at!(*queries)
|
24
|
+
search!(*queries).first
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class Grubby::PageScraper < Grubby::Scraper
|
2
|
+
|
3
|
+
# @return [Mechanize::Page]
|
4
|
+
# The Page being scraped.
|
5
|
+
attr_reader :page
|
6
|
+
|
7
|
+
# @param source [Mechanize::Page]
|
8
|
+
def initialize(source)
|
9
|
+
@page = source.assert_kind_of!(Mechanize::Page)
|
10
|
+
super
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
class Grubby::Scraper
|
2
|
+
|
3
|
+
class Error < RuntimeError
|
4
|
+
end
|
5
|
+
|
6
|
+
# Defines an attribute reader method named by +field+. During
|
7
|
+
# +initialize+, the given block is called, and the attribute is set to
|
8
|
+
# the block's return value. By default, if the block's return value
|
9
|
+
# is nil, an exception will be raised. To prevent this behavior, set
|
10
|
+
# +optional+ to true.
|
11
|
+
#
|
12
|
+
# @param field [Symbol, String]
|
13
|
+
# name of the scraped value
|
14
|
+
# @param optional [Boolean]
|
15
|
+
# whether to permit a nil scraped value
|
16
|
+
# @yield []
|
17
|
+
# scrapes the value
|
18
|
+
# @yieldreturn [Object]
|
19
|
+
# scraped value
|
20
|
+
def self.scrapes(field, optional: false, &block)
|
21
|
+
field = field.to_sym
|
22
|
+
self.fields << field
|
23
|
+
|
24
|
+
define_method(field) do
|
25
|
+
return @scraped[field] if @scraped.key?(field)
|
26
|
+
|
27
|
+
unless @errors.key?(field)
|
28
|
+
begin
|
29
|
+
value = instance_eval(&block)
|
30
|
+
if value.nil?
|
31
|
+
raise "`#{field}` cannot be nil" unless optional
|
32
|
+
$log.debug("Scraped nil value for #{self.class}##{field}")
|
33
|
+
end
|
34
|
+
@scraped[field] = value
|
35
|
+
rescue RuntimeError => e
|
36
|
+
@errors[field] = e
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
raise "`#{field}` raised a #{@errors[field].class}" if @errors.key?(field)
|
41
|
+
|
42
|
+
@scraped[field]
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# @return [Array<Symbol>]
|
47
|
+
# The names of all scraped values, as defined by {scrapes}.
|
48
|
+
def self.fields
|
49
|
+
@fields ||= []
|
50
|
+
end
|
51
|
+
|
52
|
+
# @return [Object]
|
53
|
+
# The source being scraped. Typically a Mechanize pluggable parser
|
54
|
+
# such as +Mechanize::Page+.
|
55
|
+
attr_reader :source
|
56
|
+
|
57
|
+
# @param source
|
58
|
+
# @raise [Grubby::Scraper::Error]
|
59
|
+
# if any scraped values result in error
|
60
|
+
def initialize(source)
|
61
|
+
@source = source
|
62
|
+
@scraped = {}
|
63
|
+
@errors = {}
|
64
|
+
|
65
|
+
self.class.fields.each do |field|
|
66
|
+
begin
|
67
|
+
self.send(field)
|
68
|
+
rescue RuntimeError
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
unless @errors.empty?
|
73
|
+
listing = @errors.map do |field, error|
|
74
|
+
error_class = " (#{error.class})" unless error.class == RuntimeError
|
75
|
+
error_trace = error.backtrace.join("\n").indent(2)
|
76
|
+
"* #{field} -- #{error.message}#{error_class}\n#{error_trace}"
|
77
|
+
end
|
78
|
+
raise Error.new("Failed to scrape the following fields:\n#{listing.join("\n")}")
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# Returns the scraped value named by +field+.
|
83
|
+
#
|
84
|
+
# @param field [Symbol, String]
|
85
|
+
# @return [Object]
|
86
|
+
# @raise [KeyError]
|
87
|
+
# if +field+ is not a valid name
|
88
|
+
def [](field)
|
89
|
+
@scraped.fetch(field.to_sym)
|
90
|
+
end
|
91
|
+
|
92
|
+
# Returns all scraped values as a Hash.
|
93
|
+
#
|
94
|
+
# @return [Hash<Symbol, Object>]
|
95
|
+
def to_h
|
96
|
+
@scraped.dup
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
metadata
ADDED
@@ -0,0 +1,220 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: grubby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jonathan Hefner
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-09-05 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '5.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '5.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: casual_support
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: dumb_delimited
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: gorge
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mechanize
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '2.7'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '2.7'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: mini_sanity
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pleasant_path
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.1'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.1'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: bundler
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.15'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.15'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: rake
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '10.0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '10.0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: minitest
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '5.0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '5.0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: yard
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0.9'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0.9'
|
167
|
+
description:
|
168
|
+
email:
|
169
|
+
- jonathan.hefner@gmail.com
|
170
|
+
executables: []
|
171
|
+
extensions: []
|
172
|
+
extra_rdoc_files: []
|
173
|
+
files:
|
174
|
+
- ".gitignore"
|
175
|
+
- ".travis.yml"
|
176
|
+
- Gemfile
|
177
|
+
- LICENSE.txt
|
178
|
+
- README.md
|
179
|
+
- Rakefile
|
180
|
+
- grubby.gemspec
|
181
|
+
- lib/grubby.rb
|
182
|
+
- lib/grubby/core_ext/string.rb
|
183
|
+
- lib/grubby/core_ext/uri.rb
|
184
|
+
- lib/grubby/json_parser.rb
|
185
|
+
- lib/grubby/json_scraper.rb
|
186
|
+
- lib/grubby/log.rb
|
187
|
+
- lib/grubby/mechanize/download.rb
|
188
|
+
- lib/grubby/mechanize/fetch_with_retry.rb
|
189
|
+
- lib/grubby/mechanize/file.rb
|
190
|
+
- lib/grubby/mechanize/link.rb
|
191
|
+
- lib/grubby/mechanize/page.rb
|
192
|
+
- lib/grubby/nokogiri/searchable.rb
|
193
|
+
- lib/grubby/page_scraper.rb
|
194
|
+
- lib/grubby/scraper.rb
|
195
|
+
- lib/grubby/version.rb
|
196
|
+
homepage: https://github.com/jonathanhefner/grubby
|
197
|
+
licenses:
|
198
|
+
- MIT
|
199
|
+
metadata: {}
|
200
|
+
post_install_message:
|
201
|
+
rdoc_options: []
|
202
|
+
require_paths:
|
203
|
+
- lib
|
204
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
210
|
+
requirements:
|
211
|
+
- - ">="
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '0'
|
214
|
+
requirements: []
|
215
|
+
rubyforge_project:
|
216
|
+
rubygems_version: 2.6.13
|
217
|
+
signing_key:
|
218
|
+
specification_version: 4
|
219
|
+
summary: Fail-fast web scraping
|
220
|
+
test_files: []
|