grubby 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile +2 -2
- data/README.md +86 -90
- data/grubby.gemspec +6 -9
- data/lib/grubby/core_ext/uri.rb +2 -0
- data/lib/grubby/json_parser.rb +2 -0
- data/lib/grubby/json_scraper.rb +4 -1
- data/lib/grubby/mechanize/fetch_with_retry.rb +2 -2
- data/lib/grubby/mechanize/file.rb +2 -0
- data/lib/grubby/page_scraper.rb +4 -1
- data/lib/grubby/scraper.rb +10 -10
- data/lib/grubby/version.rb +1 -1
- data/lib/grubby.rb +33 -16
- metadata +16 -53
- data/.gitignore +0 -10
- data/.travis.yml +0 -8
- data/gemfiles/activesupport-6.0.gemfile +0 -3
- data/lib/grubby/log.rb +0 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8c66a9a4e4d35bdf31269bf55d42641aaef58d3309d67e4521706c4acc10b618
|
|
4
|
+
data.tar.gz: 7d097cef49f6e4fe22c914e236afcfee97b079a7731be3a1ef897382bc0882d8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 428f5c6b4bf4e8fe53b8499b21f7e9963dbdbce987906c56dcc3167597510822b0ae9288beb5ccc82fdf6e627d950663de4a5e06a58e70da9f63d01aa53bafd9
|
|
7
|
+
data.tar.gz: 241ef2f45729fb3cb3da8f5b9def7e925b546b34281db070385643733d4061dda07474b53eef8f92602d24f75658f223785deb71aa84da266a84b35b1804ded0
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
## 3.0.0
|
|
2
|
+
|
|
3
|
+
* [BREAKING] Drop support for Ruby < 3.4
|
|
4
|
+
* [BREAKING] Remove Active Support, `casual_support`, `mini_sanity`, and
|
|
5
|
+
`pleasant_path` as runtime dependencies
|
|
6
|
+
* Add and require those gems directly if your code relies on their
|
|
7
|
+
extension methods
|
|
8
|
+
* [BREAKING] Replace `$log` with `Grubby.logger`
|
|
9
|
+
* Use `Grubby.logger = ...` to customize logging
|
|
10
|
+
|
|
11
|
+
|
|
1
12
|
## 2.0.0
|
|
2
13
|
|
|
3
14
|
* [BREAKING] Drop support for Active Support < 6.0
|
data/Gemfile
CHANGED
data/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
# grubby
|
|
1
|
+
# grubby
|
|
2
2
|
|
|
3
|
-
[Fail-fast] web scraping.
|
|
4
|
-
error-checking atop the marvelous [Mechanize gem]. See API listing
|
|
5
|
-
below, or browse the [full documentation].
|
|
3
|
+
[Fail-fast][] web scraping. `grubby` adds a layer of utility and
|
|
4
|
+
error-checking atop the marvelous [Mechanize gem][]. See API listing
|
|
5
|
+
below, or browse the [full documentation][].
|
|
6
6
|
|
|
7
7
|
[Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
|
|
8
8
|
[Mechanize gem]: https://rubygems.org/gems/mechanize
|
|
@@ -23,7 +23,7 @@ class HackerNews < Grubby::PageScraper
|
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
class Item < Grubby::Scraper
|
|
26
|
-
scrapes(:story_link){ source.at!("a
|
|
26
|
+
scrapes(:story_link){ source.at!(".titleline > a") }
|
|
27
27
|
|
|
28
28
|
scrapes(:story_url){ expand_url(story_link["href"]) }
|
|
29
29
|
|
|
@@ -66,7 +66,7 @@ end
|
|
|
66
66
|
```
|
|
67
67
|
|
|
68
68
|
Hacker News also offers a [JSON API](https://github.com/HackerNews/API),
|
|
69
|
-
which may be more robust for scraping purposes.
|
|
69
|
+
which may be more robust for scraping purposes. `grubby` can scrape
|
|
70
70
|
JSON just as well:
|
|
71
71
|
|
|
72
72
|
```ruby
|
|
@@ -113,103 +113,99 @@ end
|
|
|
113
113
|
|
|
114
114
|
## Core API
|
|
115
115
|
|
|
116
|
-
- [Grubby](https://www.rubydoc.info/gems/grubby/Grubby)
|
|
117
|
-
- [
|
|
118
|
-
- [
|
|
119
|
-
- [
|
|
120
|
-
- [
|
|
121
|
-
- [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
|
122
|
-
- [
|
|
123
|
-
- [
|
|
124
|
-
- [
|
|
125
|
-
- [
|
|
126
|
-
- [
|
|
127
|
-
- [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
|
128
|
-
- [
|
|
129
|
-
- [
|
|
130
|
-
- [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
|
131
|
-
- [
|
|
132
|
-
- [
|
|
133
|
-
- Mechanize::File
|
|
134
|
-
- [
|
|
135
|
-
- [
|
|
136
|
-
- Mechanize::Page
|
|
137
|
-
- [
|
|
138
|
-
- [
|
|
139
|
-
- Mechanize::Page::Link
|
|
140
|
-
- [
|
|
141
|
-
- URI
|
|
142
|
-
- [
|
|
143
|
-
- [
|
|
116
|
+
- [`Grubby`](https://www.rubydoc.info/gems/grubby/Grubby)
|
|
117
|
+
- [`#fulfill`](https://www.rubydoc.info/gems/grubby/Grubby:fulfill)
|
|
118
|
+
- [`#get_mirrored`](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
|
119
|
+
- [`#ok?`](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
|
|
120
|
+
- [`#time_between_requests`](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
|
121
|
+
- [`Scraper`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
|
122
|
+
- [`.each`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
|
|
123
|
+
- [`.scrape`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
|
|
124
|
+
- [`.scrapes`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
|
|
125
|
+
- [`#[]`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
|
|
126
|
+
- [`#to_h`](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
|
127
|
+
- [`PageScraper`](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
|
128
|
+
- [`.scrape_file`](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
|
|
129
|
+
- [`#page`](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
|
130
|
+
- [`JsonScraper`](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
|
131
|
+
- [`.scrape_file`](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
|
132
|
+
- [`#json`](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
|
133
|
+
- `Mechanize::File`
|
|
134
|
+
- [`#save_to`](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
|
135
|
+
- [`#save_to!`](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
|
136
|
+
- `Mechanize::Page`
|
|
137
|
+
- [`#at!`](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
|
138
|
+
- [`#search!`](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
|
139
|
+
- `Mechanize::Page::Link`
|
|
140
|
+
- [`#to_absolute_uri`](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
|
|
141
|
+
- `URI`
|
|
142
|
+
- [`#basename`](https://www.rubydoc.info/gems/grubby/URI:basename)
|
|
143
|
+
- [`#query_param`](https://www.rubydoc.info/gems/grubby/URI:query_param)
|
|
144
144
|
|
|
145
145
|
|
|
146
146
|
## Auxiliary API
|
|
147
147
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
148
|
+
`grubby` loads a few gems that extend Ruby objects with utility methods.
|
|
149
|
+
Some of those methods are listed below. See each gem's documentation
|
|
150
|
+
for a complete API listing.
|
|
151
|
+
|
|
152
|
+
- [`gorge`](https://rubygems.org/gems/gorge)
|
|
153
|
+
([docs](https://www.rubydoc.info/gems/gorge/))
|
|
154
|
+
- [`Pathname#file_crc32`](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
|
155
|
+
- [`Pathname#file_md5`](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
|
156
|
+
- [`Pathname#file_sha1`](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
|
157
|
+
- [`ryoba`](https://rubygems.org/gems/ryoba)
|
|
158
|
+
([docs](https://www.rubydoc.info/gems/ryoba/))
|
|
159
|
+
- [`Nokogiri::XML::Node#matches!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
|
160
|
+
- [`Nokogiri::XML::Node#text!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
|
161
|
+
- [`Nokogiri::XML::Node#uri`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
|
162
|
+
- [`Nokogiri::XML::Searchable#ancestor!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
|
163
|
+
- [`Nokogiri::XML::Searchable#ancestors!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
|
164
|
+
- [`Nokogiri::XML::Searchable#at!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
|
165
|
+
- [`Nokogiri::XML::Searchable#search!`](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
## Recommended Gems
|
|
169
|
+
|
|
170
|
+
The following gems will extend Ruby objects with utility methods that
|
|
171
|
+
can be useful when web scraping. Example methods are listed; see each
|
|
172
|
+
gem's documentation for a complete API listing.
|
|
151
173
|
|
|
152
174
|
- [Active Support](https://rubygems.org/gems/activesupport)
|
|
153
175
|
([docs](https://www.rubydoc.info/gems/activesupport/))
|
|
154
|
-
- [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
|
155
|
-
- [
|
|
156
|
-
- [
|
|
157
|
-
- [
|
|
158
|
-
- [String#
|
|
159
|
-
- [
|
|
176
|
+
- [`Enumerable#index_by`](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
|
177
|
+
- [`Enumerable#index_with`](https://www.rubydoc.info/gems/activesupport/Enumerable:index_with)
|
|
178
|
+
- [`File.atomic_write`](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
|
|
179
|
+
- [`Object#presence`](https://www.rubydoc.info/gems/activesupport/Object:presence)
|
|
180
|
+
- [`String#blank?`](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
|
|
181
|
+
- [`String#squish`](https://www.rubydoc.info/gems/activesupport/String:squish)
|
|
182
|
+
- [`casual_support`](https://rubygems.org/gems/casual_support)
|
|
160
183
|
([docs](https://www.rubydoc.info/gems/casual_support/))
|
|
161
|
-
- [
|
|
162
|
-
- [String#
|
|
163
|
-
- [String#
|
|
164
|
-
- [String#
|
|
165
|
-
- [String#
|
|
166
|
-
|
|
167
|
-
- [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms)
|
|
168
|
-
- [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd)
|
|
169
|
-
- [gorge](https://rubygems.org/gems/gorge)
|
|
170
|
-
([docs](https://www.rubydoc.info/gems/gorge/))
|
|
171
|
-
- [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
|
172
|
-
- [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
|
173
|
-
- [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
|
174
|
-
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
|
184
|
+
- [`String#after`](https://www.rubydoc.info/gems/casual_support/String:after)
|
|
185
|
+
- [`String#after_last`](https://www.rubydoc.info/gems/casual_support/String:after_last)
|
|
186
|
+
- [`String#before`](https://www.rubydoc.info/gems/casual_support/String:before)
|
|
187
|
+
- [`String#before_last`](https://www.rubydoc.info/gems/casual_support/String:before_last)
|
|
188
|
+
- [`String#between`](https://www.rubydoc.info/gems/casual_support/String:between)
|
|
189
|
+
- [`mini_sanity`](https://rubygems.org/gems/mini_sanity)
|
|
175
190
|
([docs](https://www.rubydoc.info/gems/mini_sanity/))
|
|
176
|
-
- [Enumerator#result
|
|
177
|
-
- [Enumerator#results
|
|
178
|
-
- [Object#assert
|
|
179
|
-
- [Object#refute
|
|
180
|
-
- [String#match
|
|
181
|
-
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
|
191
|
+
- [`Enumerator#result!`](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21)
|
|
192
|
+
- [`Enumerator#results!`](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21)
|
|
193
|
+
- [`Object#assert!`](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21)
|
|
194
|
+
- [`Object#refute!`](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21)
|
|
195
|
+
- [`String#match!`](https://www.rubydoc.info/gems/mini_sanity/String:match%21)
|
|
196
|
+
- [`pleasant_path`](https://rubygems.org/gems/pleasant_path)
|
|
182
197
|
([docs](https://www.rubydoc.info/gems/pleasant_path/))
|
|
183
|
-
- [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
|
|
184
|
-
- [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
|
|
185
|
-
- [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
|
186
|
-
- [Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
|
|
187
|
-
- [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
|
188
|
-
- [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
|
189
|
-
- [ryoba](https://rubygems.org/gems/ryoba)
|
|
190
|
-
([docs](https://www.rubydoc.info/gems/ryoba/))
|
|
191
|
-
- [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
|
192
|
-
- [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
|
193
|
-
- [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
|
194
|
-
- [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
|
195
|
-
- [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
|
196
|
-
- [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
|
197
|
-
- [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
|
198
|
+
- [`Pathname#available_name`](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
|
|
199
|
+
- [`Pathname#existence`](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
|
|
200
|
+
- [`Pathname#make_dirname`](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
|
201
|
+
- [`Pathname#move_as`](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
|
|
202
|
+
- [`Pathname#rename_basename`](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
|
203
|
+
- [`Pathname#rename_extname`](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
|
198
204
|
|
|
199
205
|
|
|
200
206
|
## Installation
|
|
201
207
|
|
|
202
|
-
Install the [gem](https://rubygems.org/gems/grubby)
|
|
203
|
-
|
|
204
|
-
```bash
|
|
205
|
-
$ gem install grubby
|
|
206
|
-
```
|
|
207
|
-
|
|
208
|
-
Then require in your Ruby code:
|
|
209
|
-
|
|
210
|
-
```ruby
|
|
211
|
-
require "grubby"
|
|
212
|
-
```
|
|
208
|
+
Install the [`grubby` gem](https://rubygems.org/gems/grubby).
|
|
213
209
|
|
|
214
210
|
|
|
215
211
|
## Contributing
|
|
@@ -219,4 +215,4 @@ Run `rake test` to run the tests.
|
|
|
219
215
|
|
|
220
216
|
## License
|
|
221
217
|
|
|
222
|
-
[MIT License](
|
|
218
|
+
[MIT License](LICENSE.txt)
|
data/grubby.gemspec
CHANGED
|
@@ -9,26 +9,23 @@ Gem::Specification.new do |spec|
|
|
|
9
9
|
spec.summary = %q{Fail-fast web scraping}
|
|
10
10
|
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
|
11
11
|
spec.license = "MIT"
|
|
12
|
-
spec.required_ruby_version = ">=
|
|
12
|
+
spec.required_ruby_version = ">= 3.4"
|
|
13
13
|
|
|
14
|
-
spec.metadata["homepage_uri"] = spec.homepage
|
|
15
14
|
spec.metadata["source_code_uri"] = spec.homepage
|
|
16
15
|
spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md"
|
|
17
16
|
|
|
18
17
|
# Specify which files should be added to the gem when it is released.
|
|
19
18
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
20
19
|
spec.files = Dir.chdir(__dir__) do
|
|
21
|
-
`git ls-files -z`.split("\x0").reject { |f| f.
|
|
20
|
+
`git ls-files -z`.split("\x0").reject { |f| f.start_with?("test/", ".git") }
|
|
22
21
|
end
|
|
23
22
|
spec.bindir = "exe"
|
|
24
23
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
25
24
|
spec.require_paths = ["lib"]
|
|
26
25
|
|
|
27
|
-
spec.add_dependency "
|
|
28
|
-
spec.add_dependency "
|
|
29
|
-
spec.add_dependency "gorge", "
|
|
26
|
+
spec.add_dependency "cgi"
|
|
27
|
+
spec.add_dependency "csv"
|
|
28
|
+
spec.add_dependency "gorge", ">= 1.0"
|
|
30
29
|
spec.add_dependency "mechanize", "~> 2.7"
|
|
31
|
-
spec.add_dependency "
|
|
32
|
-
spec.add_dependency "pleasant_path", "~> 2.0"
|
|
33
|
-
spec.add_dependency "ryoba", "~> 1.0"
|
|
30
|
+
spec.add_dependency "ryoba", ">= 1.0"
|
|
34
31
|
end
|
data/lib/grubby/core_ext/uri.rb
CHANGED
data/lib/grubby/json_parser.rb
CHANGED
data/lib/grubby/json_scraper.rb
CHANGED
|
@@ -9,7 +9,10 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
|
9
9
|
# @raise [Grubby::Scraper::Error]
|
|
10
10
|
# if any {Scraper.scrapes} blocks fail
|
|
11
11
|
def initialize(source)
|
|
12
|
-
|
|
12
|
+
unless source.is_a?(Grubby::JsonParser)
|
|
13
|
+
raise ArgumentError, "source must be a Grubby::JsonParser object"
|
|
14
|
+
end
|
|
15
|
+
@json = source.json
|
|
13
16
|
super
|
|
14
17
|
end
|
|
15
18
|
|
|
data/lib/grubby/mechanize/fetch_with_retry.rb
CHANGED
|
@@ -25,9 +25,9 @@ class Mechanize::HTTP::Agent
|
|
|
25
25
|
|
|
26
26
|
# otherwise, shutdown the persistent HTTP connection and try again
|
|
27
27
|
retry_count += 1
|
|
28
|
-
|
|
28
|
+
Grubby.logger.warn("#{e.message} (#{e.class}). Retry in #{retry_count} seconds.")
|
|
29
29
|
sleep(retry_count) # incremental backoff to allow server to self-correct
|
|
30
|
-
|
|
30
|
+
Grubby.logger.warn("Retry #{http_method.to_s.upcase} #{uri}")
|
|
31
31
|
retry
|
|
32
32
|
end
|
|
33
33
|
end
|
data/lib/grubby/page_scraper.rb
CHANGED
|
@@ -9,7 +9,10 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
|
9
9
|
# @raise [Grubby::Scraper::Error]
|
|
10
10
|
# if any {Scraper.scrapes} blocks fail
|
|
11
11
|
def initialize(source)
|
|
12
|
-
|
|
12
|
+
unless source.is_a?(Mechanize::Page)
|
|
13
|
+
raise ArgumentError, "source must be a Mechanize::Page object"
|
|
14
|
+
end
|
|
15
|
+
@page = source
|
|
13
16
|
super
|
|
14
17
|
end
|
|
15
18
|
|
data/lib/grubby/scraper.rb
CHANGED
|
@@ -83,7 +83,7 @@ class Grubby::Scraper
|
|
|
83
83
|
@scraped[field] = instance_eval(&block)
|
|
84
84
|
if @scraped[field].nil?
|
|
85
85
|
raise FieldValueRequiredError.new(field) unless options[:optional]
|
|
86
|
-
|
|
86
|
+
Grubby.logger.debug("#{self.class}##{field} is nil")
|
|
87
87
|
end
|
|
88
88
|
end
|
|
89
89
|
rescue RuntimeError, IndexError => e
|
|
@@ -258,13 +258,6 @@ class Grubby::Scraper
|
|
|
258
258
|
end
|
|
259
259
|
|
|
260
260
|
class Error < RuntimeError
|
|
261
|
-
# @!visibility private
|
|
262
|
-
BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
|
|
263
|
-
cleaner.add_silencer do |line|
|
|
264
|
-
line.include?(__dir__) && line.include?("scraper.rb:")
|
|
265
|
-
end
|
|
266
|
-
end
|
|
267
|
-
|
|
268
261
|
# The Scraper that raised this Error.
|
|
269
262
|
#
|
|
270
263
|
# @return [Grubby::Scraper]
|
|
@@ -278,13 +271,20 @@ class Grubby::Scraper
|
|
|
278
271
|
reject{|field, error| error.is_a?(FieldScrapeFailedError) }.
|
|
279
272
|
map do |field, error|
|
|
280
273
|
"* `#{field}` (#{error.class})\n" +
|
|
281
|
-
error.message.
|
|
282
|
-
|
|
274
|
+
error.message.gsub(/^/, " ") + "\n\n" +
|
|
275
|
+
clean_backtrace(error.backtrace).join("\n").gsub(/^/, " ") + "\n"
|
|
283
276
|
end.
|
|
284
277
|
join("\n")
|
|
285
278
|
|
|
286
279
|
super("Failed to scrape the following fields:\n#{listing}")
|
|
287
280
|
end
|
|
281
|
+
|
|
282
|
+
private
|
|
283
|
+
def clean_backtrace(backtrace)
|
|
284
|
+
backtrace.reject do |line|
|
|
285
|
+
line.start_with?(__dir__) && line.include?("scraper.rb:")
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
288
|
end
|
|
289
289
|
|
|
290
290
|
# @!visibility private
|
data/lib/grubby/version.rb
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
GRUBBY_VERSION = "2.0.0"
|
|
1
|
+
GRUBBY_VERSION = "3.0.0"
|
data/lib/grubby.rb
CHANGED
|
@@ -1,13 +1,10 @@
|
|
|
1
|
-
require "
|
|
2
|
-
|
|
1
|
+
require "pathname"
|
|
2
|
+
|
|
3
3
|
require "gorge"
|
|
4
4
|
require "mechanize"
|
|
5
|
-
require "mini_sanity"
|
|
6
|
-
require "pleasant_path"
|
|
7
5
|
require "ryoba"
|
|
8
6
|
|
|
9
7
|
require_relative "grubby/version"
|
|
10
|
-
require_relative "grubby/log"
|
|
11
8
|
|
|
12
9
|
require_relative "grubby/core_ext/string"
|
|
13
10
|
require_relative "grubby/core_ext/uri"
|
|
@@ -23,6 +20,21 @@ class Grubby < Mechanize
|
|
|
23
20
|
|
|
24
21
|
VERSION = GRUBBY_VERSION
|
|
25
22
|
|
|
23
|
+
class << self
|
|
24
|
+
# Logger used by Grubby.
|
|
25
|
+
#
|
|
26
|
+
# @return [Logger]
|
|
27
|
+
def logger
|
|
28
|
+
@logger ||= Logger.new($stderr).tap do |logger|
|
|
29
|
+
logger.formatter = -> (severity, time, progname, msg) do
|
|
30
|
+
"[#{time.strftime "%Y-%m-%d %H:%M:%S"}] #{severity} #{msg}\n"
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
attr_writer :logger
|
|
36
|
+
end
|
|
37
|
+
|
|
26
38
|
# The minimum amount of time enforced between requests, in seconds.
|
|
27
39
|
# If the value is a Range, a random number within the Range is chosen
|
|
28
40
|
# for each request.
|
|
@@ -81,13 +93,18 @@ class Grubby < Mechanize
|
|
|
81
93
|
# @param path [Pathname, String, nil]
|
|
82
94
|
# @return [Pathname]
|
|
83
95
|
def journal=(path)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
96
|
+
if path
|
|
97
|
+
@journal = Pathname(path)
|
|
98
|
+
@journal.dirname.mkpath
|
|
99
|
+
require "csv"
|
|
100
|
+
@fulfilled = CSV.open(@journal, "a+") do |csv|
|
|
101
|
+
csv.each.map{|row| FulfilledEntry.new(*row) }.to_set
|
|
90
102
|
end
|
|
103
|
+
else
|
|
104
|
+
@journal = nil
|
|
105
|
+
@fulfilled = Set.new
|
|
106
|
+
end
|
|
107
|
+
|
|
91
108
|
@journal
|
|
92
109
|
end
|
|
93
110
|
|
|
@@ -138,8 +155,8 @@ class Grubby < Mechanize
|
|
|
138
155
|
if i >= mirror_uris.length
|
|
139
156
|
raise
|
|
140
157
|
else
|
|
141
|
-
|
|
142
|
-
|
|
158
|
+
Grubby.logger.debug("Mirror failed (code #{e.response_code}): #{mirror_uris[i - 1]}")
|
|
159
|
+
Grubby.logger.debug("Try mirror: #{mirror_uris[i]}")
|
|
143
160
|
retry
|
|
144
161
|
end
|
|
145
162
|
end
|
|
@@ -199,7 +216,7 @@ class Grubby < Mechanize
|
|
|
199
216
|
normalized_uri = normalize_uri(uri)
|
|
200
217
|
return unless add_fulfilled(normalized_uri, purpose, series)
|
|
201
218
|
|
|
202
|
-
|
|
219
|
+
Grubby.logger.info("Fetch #{normalized_uri}")
|
|
203
220
|
resource = get(normalized_uri)
|
|
204
221
|
unprocessed = add_fulfilled(resource.uri, purpose, series) &
|
|
205
222
|
add_fulfilled("content hash: #{resource.content_hash}", purpose, series)
|
|
@@ -224,7 +241,7 @@ class Grubby < Mechanize
|
|
|
224
241
|
if (series.uniq!) || @fulfilled.add?(series.last)
|
|
225
242
|
true
|
|
226
243
|
else
|
|
227
|
-
|
|
244
|
+
Grubby.logger.info("Skip #{series.first.target}" \
|
|
228
245
|
" (seen#{" #{series.last.target}" unless series.length == 1})")
|
|
229
246
|
false
|
|
230
247
|
end
|
|
@@ -232,7 +249,7 @@ class Grubby < Mechanize
|
|
|
232
249
|
|
|
233
250
|
def normalize_uri(uri)
|
|
234
251
|
uri = uri.dup
|
|
235
|
-
|
|
252
|
+
Grubby.logger.warn("Ignore ##{uri.fragment} in #{uri}") if uri.fragment
|
|
236
253
|
uri.fragment = nil
|
|
237
254
|
uri.path = uri.path.chomp("/")
|
|
238
255
|
uri
|
metadata
CHANGED
|
@@ -1,55 +1,54 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: grubby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.0.0
|
|
4
|
+
version: 3.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jonathan Hefner
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
13
|
+
name: cgi
|
|
15
14
|
requirement: !ruby/object:Gem::Requirement
|
|
16
15
|
requirements:
|
|
17
16
|
- - ">="
|
|
18
17
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '
|
|
18
|
+
version: '0'
|
|
20
19
|
type: :runtime
|
|
21
20
|
prerelease: false
|
|
22
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
22
|
requirements:
|
|
24
23
|
- - ">="
|
|
25
24
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '
|
|
25
|
+
version: '0'
|
|
27
26
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
27
|
+
name: csv
|
|
29
28
|
requirement: !ruby/object:Gem::Requirement
|
|
30
29
|
requirements:
|
|
31
|
-
- - "
|
|
30
|
+
- - ">="
|
|
32
31
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
32
|
+
version: '0'
|
|
34
33
|
type: :runtime
|
|
35
34
|
prerelease: false
|
|
36
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
36
|
requirements:
|
|
38
|
-
- - "
|
|
37
|
+
- - ">="
|
|
39
38
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
39
|
+
version: '0'
|
|
41
40
|
- !ruby/object:Gem::Dependency
|
|
42
41
|
name: gorge
|
|
43
42
|
requirement: !ruby/object:Gem::Requirement
|
|
44
43
|
requirements:
|
|
45
|
-
- - "
|
|
44
|
+
- - ">="
|
|
46
45
|
- !ruby/object:Gem::Version
|
|
47
46
|
version: '1.0'
|
|
48
47
|
type: :runtime
|
|
49
48
|
prerelease: false
|
|
50
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
50
|
requirements:
|
|
52
|
-
- - "
|
|
51
|
+
- - ">="
|
|
53
52
|
- !ruby/object:Gem::Version
|
|
54
53
|
version: '1.0'
|
|
55
54
|
- !ruby/object:Gem::Dependency
|
|
@@ -66,70 +65,37 @@ dependencies:
|
|
|
66
65
|
- - "~>"
|
|
67
66
|
- !ruby/object:Gem::Version
|
|
68
67
|
version: '2.7'
|
|
69
|
-
- !ruby/object:Gem::Dependency
|
|
70
|
-
name: mini_sanity
|
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
|
72
|
-
requirements:
|
|
73
|
-
- - "~>"
|
|
74
|
-
- !ruby/object:Gem::Version
|
|
75
|
-
version: '2.0'
|
|
76
|
-
type: :runtime
|
|
77
|
-
prerelease: false
|
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
-
requirements:
|
|
80
|
-
- - "~>"
|
|
81
|
-
- !ruby/object:Gem::Version
|
|
82
|
-
version: '2.0'
|
|
83
|
-
- !ruby/object:Gem::Dependency
|
|
84
|
-
name: pleasant_path
|
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
|
86
|
-
requirements:
|
|
87
|
-
- - "~>"
|
|
88
|
-
- !ruby/object:Gem::Version
|
|
89
|
-
version: '2.0'
|
|
90
|
-
type: :runtime
|
|
91
|
-
prerelease: false
|
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
-
requirements:
|
|
94
|
-
- - "~>"
|
|
95
|
-
- !ruby/object:Gem::Version
|
|
96
|
-
version: '2.0'
|
|
97
68
|
- !ruby/object:Gem::Dependency
|
|
98
69
|
name: ryoba
|
|
99
70
|
requirement: !ruby/object:Gem::Requirement
|
|
100
71
|
requirements:
|
|
101
|
-
- - "
|
|
72
|
+
- - ">="
|
|
102
73
|
- !ruby/object:Gem::Version
|
|
103
74
|
version: '1.0'
|
|
104
75
|
type: :runtime
|
|
105
76
|
prerelease: false
|
|
106
77
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
78
|
requirements:
|
|
108
|
-
- - "
|
|
79
|
+
- - ">="
|
|
109
80
|
- !ruby/object:Gem::Version
|
|
110
81
|
version: '1.0'
|
|
111
|
-
description:
|
|
112
82
|
email:
|
|
113
83
|
- jonathan@hefner.pro
|
|
114
84
|
executables: []
|
|
115
85
|
extensions: []
|
|
116
86
|
extra_rdoc_files: []
|
|
117
87
|
files:
|
|
118
|
-
- ".gitignore"
|
|
119
|
-
- ".travis.yml"
|
|
120
88
|
- CHANGELOG.md
|
|
121
89
|
- Gemfile
|
|
122
90
|
- LICENSE.txt
|
|
123
91
|
- README.md
|
|
124
92
|
- Rakefile
|
|
125
|
-
- gemfiles/activesupport-6.0.gemfile
|
|
126
93
|
- grubby.gemspec
|
|
127
94
|
- lib/grubby.rb
|
|
128
95
|
- lib/grubby/core_ext/string.rb
|
|
129
96
|
- lib/grubby/core_ext/uri.rb
|
|
130
97
|
- lib/grubby/json_parser.rb
|
|
131
98
|
- lib/grubby/json_scraper.rb
|
|
132
|
-
- lib/grubby/log.rb
|
|
133
99
|
- lib/grubby/mechanize/download.rb
|
|
134
100
|
- lib/grubby/mechanize/fetch_with_retry.rb
|
|
135
101
|
- lib/grubby/mechanize/file.rb
|
|
@@ -143,10 +109,8 @@ homepage: https://github.com/jonathanhefner/grubby
|
|
|
143
109
|
licenses:
|
|
144
110
|
- MIT
|
|
145
111
|
metadata:
|
|
146
|
-
homepage_uri: https://github.com/jonathanhefner/grubby
|
|
147
112
|
source_code_uri: https://github.com/jonathanhefner/grubby
|
|
148
113
|
changelog_uri: https://github.com/jonathanhefner/grubby/blob/master/CHANGELOG.md
|
|
149
|
-
post_install_message:
|
|
150
114
|
rdoc_options: []
|
|
151
115
|
require_paths:
|
|
152
116
|
- lib
|
|
@@ -154,15 +118,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
154
118
|
requirements:
|
|
155
119
|
- - ">="
|
|
156
120
|
- !ruby/object:Gem::Version
|
|
157
|
-
version: '
|
|
121
|
+
version: '3.4'
|
|
158
122
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
159
123
|
requirements:
|
|
160
124
|
- - ">="
|
|
161
125
|
- !ruby/object:Gem::Version
|
|
162
126
|
version: '0'
|
|
163
127
|
requirements: []
|
|
164
|
-
rubygems_version:
|
|
165
|
-
signing_key:
|
|
128
|
+
rubygems_version: 4.0.10
|
|
166
129
|
specification_version: 4
|
|
167
130
|
summary: Fail-fast web scraping
|
|
168
131
|
test_files: []
|
data/.gitignore
DELETED
data/.travis.yml
DELETED