grubby 1.2.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +6 -3
- data/CHANGELOG.md +12 -0
- data/Gemfile +3 -0
- data/README.md +140 -92
- data/Rakefile +0 -13
- data/gemfiles/activesupport-6.0.gemfile +3 -0
- data/grubby.gemspec +17 -18
- data/lib/grubby.rb +64 -46
- data/lib/grubby/core_ext/uri.rb +12 -11
- data/lib/grubby/json_parser.rb +1 -27
- data/lib/grubby/json_scraper.rb +6 -2
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -2
- data/lib/grubby/mechanize/link.rb +9 -6
- data/lib/grubby/mechanize/page.rb +4 -2
- data/lib/grubby/mechanize/parser.rb +9 -9
- data/lib/grubby/page_scraper.rb +6 -2
- data/lib/grubby/scraper.rb +86 -60
- data/lib/grubby/version.rb +1 -1
- metadata +17 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e313c9ba144ee119b31eb6b7ec5fef721df811c8d579f532e5aa5de5a8d65198
|
4
|
+
data.tar.gz: 07f06e01378301c37ca0177a29f95e72f3cf549b65c3e1c9896c9749a9cd857d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea948a4c90d2d9ef0e1cd527adc3ef89cb0379ad98751ffbf671b5cf2210e6e700b7856983e1378a500d9db842d9411cf20275f005ea4a2e2eba824a9c929ee3
|
7
|
+
data.tar.gz: 7a7985f0d5127d6c7e25f9d39a489c460cd76f05219358072dce618667edcefd033740335fbb4b6c8cfa216f0ed4f4d3cfab239af7d33f7d92ec939508a6ea20
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 2.0.0
|
2
|
+
|
3
|
+
* [BREAKING] Drop support for Active Support < 6.0
|
4
|
+
* [BREAKING] Require casual_support ~> 4.0
|
5
|
+
* [BREAKING] Require mini_sanity ~> 2.0
|
6
|
+
* [BREAKING] Require pleasant_path ~> 2.0
|
7
|
+
* [BREAKING] Remove `JsonParser.json_parse_options`
|
8
|
+
* Use `::JSON.load_default_options` instead
|
9
|
+
* [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
|
10
|
+
* [BREAKING] Change `Grubby#fulfill` to return block's result
|
11
|
+
|
12
|
+
|
1
13
|
## 1.2.1
|
2
14
|
|
3
15
|
* Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,162 +1,211 @@
|
|
1
|
-
# grubby
|
1
|
+
# grubby [![Build Status](https://travis-ci.org/jonathanhefner/grubby.svg?branch=master)](https://travis-ci.org/jonathanhefner/grubby)
|
2
2
|
|
3
3
|
[Fail-fast] web scraping. *grubby* adds a layer of utility and
|
4
|
-
error-checking atop the marvelous [Mechanize gem]. See API
|
4
|
+
error-checking atop the marvelous [Mechanize gem]. See API listing
|
5
5
|
below, or browse the [full documentation].
|
6
6
|
|
7
7
|
[Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
|
8
8
|
[Mechanize gem]: https://rubygems.org/gems/mechanize
|
9
|
-
[full documentation]:
|
9
|
+
[full documentation]: https://www.rubydoc.info/gems/grubby/
|
10
10
|
|
11
11
|
|
12
12
|
## Examples
|
13
13
|
|
14
|
-
The following
|
14
|
+
The following code scrapes stories from the [Hacker News](
|
15
|
+
https://news.ycombinator.com/news) front page:
|
15
16
|
|
16
17
|
```ruby
|
17
18
|
require "grubby"
|
18
19
|
|
19
20
|
class HackerNews < Grubby::PageScraper
|
20
21
|
scrapes(:items) do
|
21
|
-
page.search!(".athing").map{|
|
22
|
+
page.search!(".athing").map{|element| Item.new(element) }
|
22
23
|
end
|
23
24
|
|
24
25
|
class Item < Grubby::Scraper
|
25
26
|
scrapes(:story_link){ source.at!("a.storylink") }
|
26
|
-
|
27
|
+
|
28
|
+
scrapes(:story_url){ expand_url(story_link["href"]) }
|
29
|
+
|
27
30
|
scrapes(:title){ story_link.text }
|
31
|
+
|
32
|
+
scrapes(:comments_link, optional: true) do
|
33
|
+
source.next_sibling.search!(".subtext a").find do |link|
|
34
|
+
link.text.match?(/comment|discuss/)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
scrapes(:comments_url, if: :comments_link) do
|
39
|
+
expand_url(comments_link["href"])
|
40
|
+
end
|
41
|
+
|
42
|
+
scrapes(:comment_count, if: :comments_link) do
|
43
|
+
comments_link.text.to_i
|
44
|
+
end
|
45
|
+
|
46
|
+
def expand_url(url)
|
47
|
+
url.include?("://") ? url : source.document.uri.merge(url).to_s
|
48
|
+
end
|
28
49
|
end
|
29
50
|
end
|
30
51
|
|
31
52
|
# The following line will raise an exception if anything goes wrong
|
32
53
|
# during the scraping process. For example, if the structure of the
|
33
|
-
# HTML does not match expectations
|
34
|
-
#
|
35
|
-
#
|
36
|
-
# hard-to-trace errors.
|
54
|
+
# HTML does not match expectations due to a site change, the script will
|
55
|
+
# terminate immediately with a helpful error message. This prevents bad
|
56
|
+
# data from propagating and causing hard-to-trace errors.
|
37
57
|
hn = HackerNews.scrape("https://news.ycombinator.com/news")
|
38
58
|
|
39
59
|
# Your processing logic goes here:
|
40
60
|
hn.items.take(10).each do |item|
|
41
61
|
puts "* #{item.title}"
|
42
|
-
puts " #{item.
|
62
|
+
puts " #{item.story_url}"
|
63
|
+
puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
|
43
64
|
puts
|
44
65
|
end
|
45
66
|
```
|
46
67
|
|
47
|
-
|
68
|
+
Hacker News also offers a [JSON API](https://github.com/HackerNews/API),
|
69
|
+
which may be more robust for scraping purposes. *grubby* can scrape
|
70
|
+
JSON just as well:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
require "grubby"
|
74
|
+
|
75
|
+
class HackerNews < Grubby::JsonScraper
|
76
|
+
scrapes(:items) do
|
77
|
+
# API returns array of top 500 item IDs, so limit as necessary
|
78
|
+
json.take(10).map do |item_id|
|
79
|
+
Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json")
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
class Item < Grubby::JsonScraper
|
84
|
+
scrapes(:story_url){ json["url"] || hn_url }
|
85
|
+
|
86
|
+
scrapes(:title){ json["title"] }
|
87
|
+
|
88
|
+
scrapes(:comments_url, optional: true) do
|
89
|
+
hn_url if json["descendants"]
|
90
|
+
end
|
91
|
+
|
92
|
+
scrapes(:comment_count, optional: true) do
|
93
|
+
json["descendants"]&.to_i
|
94
|
+
end
|
95
|
+
|
96
|
+
def hn_url
|
97
|
+
"https://news.ycombinator.com/item?id=#{json["id"]}"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json")
|
103
|
+
|
104
|
+
# Your processing logic goes here:
|
105
|
+
hn.items.each do |item|
|
106
|
+
puts "* #{item.title}"
|
107
|
+
puts " #{item.story_url}"
|
108
|
+
puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
|
109
|
+
puts
|
110
|
+
end
|
111
|
+
```
|
48
112
|
|
49
113
|
|
50
114
|
## Core API
|
51
115
|
|
52
|
-
- [Grubby](
|
53
|
-
- [#
|
54
|
-
- [#
|
55
|
-
- [#
|
56
|
-
- [#time_between_requests](
|
57
|
-
- [Scraper](
|
58
|
-
- [.each](
|
59
|
-
- [.
|
60
|
-
- [.
|
61
|
-
- [
|
62
|
-
- [#
|
63
|
-
|
64
|
-
- [
|
65
|
-
- [
|
66
|
-
|
67
|
-
- [
|
68
|
-
- [
|
69
|
-
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
70
|
-
- [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
71
|
-
- Mechanize::Download
|
72
|
-
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
73
|
-
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
116
|
+
- [Grubby](https://www.rubydoc.info/gems/grubby/Grubby)
|
117
|
+
- [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill)
|
118
|
+
- [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
119
|
+
- [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
|
120
|
+
- [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
121
|
+
- [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
122
|
+
- [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
|
123
|
+
- [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
|
124
|
+
- [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
|
125
|
+
- [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
|
126
|
+
- [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
127
|
+
- [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
128
|
+
- [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
|
129
|
+
- [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
130
|
+
- [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
131
|
+
- [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
132
|
+
- [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
74
133
|
- Mechanize::File
|
75
|
-
- [#save_to](
|
76
|
-
- [#save_to!](
|
134
|
+
- [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
135
|
+
- [#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
77
136
|
- Mechanize::Page
|
78
|
-
- [#at!](
|
79
|
-
- [#search!](
|
137
|
+
- [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
138
|
+
- [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
80
139
|
- Mechanize::Page::Link
|
81
|
-
- [#to_absolute_uri](
|
140
|
+
- [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
|
82
141
|
- URI
|
83
142
|
- [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
|
84
143
|
- [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
|
85
144
|
|
86
145
|
|
87
|
-
##
|
146
|
+
## Auxiliary API
|
88
147
|
|
89
|
-
*grubby*
|
90
|
-
|
91
|
-
|
92
|
-
**a few** of the methods each provides. See each gem's documentation
|
93
|
-
for a complete API listing.
|
148
|
+
*grubby* loads several gems that extend Ruby objects with utility
|
149
|
+
methods. Some of those methods are listed below. See each gem's
|
150
|
+
documentation for a complete API listing.
|
94
151
|
|
95
152
|
- [Active Support](https://rubygems.org/gems/activesupport)
|
96
|
-
([docs](
|
153
|
+
([docs](https://www.rubydoc.info/gems/activesupport/))
|
97
154
|
- [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
98
155
|
- [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
|
99
|
-
- [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
|
100
156
|
- [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
|
101
157
|
- [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
|
102
158
|
- [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
|
103
159
|
- [casual_support](https://rubygems.org/gems/casual_support)
|
104
|
-
([docs](
|
105
|
-
- [Enumerable#index_to](
|
106
|
-
- [String#after](
|
107
|
-
- [String#after_last](
|
108
|
-
- [String#before](
|
109
|
-
- [String#before_last](
|
110
|
-
- [String#between](
|
111
|
-
- [Time#to_hms](
|
112
|
-
- [Time#to_ymd](
|
160
|
+
([docs](https://www.rubydoc.info/gems/casual_support/))
|
161
|
+
- [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
|
162
|
+
- [String#after](https://www.rubydoc.info/gems/casual_support/String:after)
|
163
|
+
- [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last)
|
164
|
+
- [String#before](https://www.rubydoc.info/gems/casual_support/String:before)
|
165
|
+
- [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last)
|
166
|
+
- [String#between](https://www.rubydoc.info/gems/casual_support/String:between)
|
167
|
+
- [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms)
|
168
|
+
- [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd)
|
113
169
|
- [gorge](https://rubygems.org/gems/gorge)
|
114
|
-
([docs](
|
115
|
-
- [Pathname#file_crc32](
|
116
|
-
- [Pathname#file_md5](
|
117
|
-
- [Pathname#file_sha1](
|
118
|
-
- [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
|
119
|
-
- [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
|
120
|
-
- [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
|
170
|
+
([docs](https://www.rubydoc.info/gems/gorge/))
|
171
|
+
- [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
172
|
+
- [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
173
|
+
- [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
121
174
|
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
122
|
-
([docs](
|
123
|
-
- [
|
124
|
-
- [
|
125
|
-
- [Object#
|
126
|
-
- [Object#
|
127
|
-
- [
|
128
|
-
- [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
|
129
|
-
- [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
|
175
|
+
([docs](https://www.rubydoc.info/gems/mini_sanity/))
|
176
|
+
- [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21)
|
177
|
+
- [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21)
|
178
|
+
- [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21)
|
179
|
+
- [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21)
|
180
|
+
- [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21)
|
130
181
|
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
131
|
-
([docs](
|
132
|
-
- [Pathname#available_name](
|
133
|
-
- [Pathname#
|
134
|
-
- [Pathname#
|
135
|
-
- [Pathname#
|
136
|
-
- [Pathname#
|
137
|
-
- [Pathname#
|
138
|
-
- [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
139
|
-
- [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
182
|
+
([docs](https://www.rubydoc.info/gems/pleasant_path/))
|
183
|
+
- [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
|
184
|
+
- [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
|
185
|
+
- [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
186
|
+
- [Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
|
187
|
+
- [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
188
|
+
- [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
140
189
|
- [ryoba](https://rubygems.org/gems/ryoba)
|
141
|
-
([docs](
|
142
|
-
- [Nokogiri::XML::Node#matches!](
|
143
|
-
- [Nokogiri::XML::Node#text!](
|
144
|
-
- [Nokogiri::XML::Node#uri](
|
145
|
-
- [Nokogiri::XML::Searchable#ancestor!](
|
146
|
-
- [Nokogiri::XML::Searchable#ancestors!](
|
147
|
-
- [Nokogiri::XML::Searchable#at!](
|
148
|
-
- [Nokogiri::XML::Searchable#search!](
|
190
|
+
([docs](https://www.rubydoc.info/gems/ryoba/))
|
191
|
+
- [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
192
|
+
- [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
193
|
+
- [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
194
|
+
- [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
195
|
+
- [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
196
|
+
- [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
197
|
+
- [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
149
198
|
|
150
199
|
|
151
200
|
## Installation
|
152
201
|
|
153
|
-
Install
|
202
|
+
Install the [gem](https://rubygems.org/gems/grubby):
|
154
203
|
|
155
204
|
```bash
|
156
205
|
$ gem install grubby
|
157
206
|
```
|
158
207
|
|
159
|
-
Then require in your Ruby
|
208
|
+
Then require in your Ruby code:
|
160
209
|
|
161
210
|
```ruby
|
162
211
|
require "grubby"
|
@@ -165,8 +214,7 @@ require "grubby"
|
|
165
214
|
|
166
215
|
## Contributing
|
167
216
|
|
168
|
-
Run `rake test` to run the tests.
|
169
|
-
interactive prompt that pre-loads the project code.
|
217
|
+
Run `rake test` to run the tests.
|
170
218
|
|
171
219
|
|
172
220
|
## License
|
data/Rakefile
CHANGED
@@ -1,18 +1,5 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
2
|
require "rake/testtask"
|
3
|
-
require "yard"
|
4
|
-
|
5
|
-
|
6
|
-
YARD::Rake::YardocTask.new(:doc) do |t|
|
7
|
-
end
|
8
|
-
|
9
|
-
desc "Launch IRB with this gem pre-loaded"
|
10
|
-
task :irb do
|
11
|
-
require "grubby"
|
12
|
-
require "irb"
|
13
|
-
ARGV.clear
|
14
|
-
IRB.start
|
15
|
-
end
|
16
3
|
|
17
4
|
Rake::TestTask.new(:test) do |t|
|
18
5
|
t.libs << "test"
|
data/grubby.gemspec
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require "grubby/version"
|
1
|
+
require_relative "lib/grubby/version"
|
5
2
|
|
6
3
|
Gem::Specification.new do |spec|
|
7
4
|
spec.name = "grubby"
|
@@ -12,24 +9,26 @@ Gem::Specification.new do |spec|
|
|
12
9
|
spec.summary = %q{Fail-fast web scraping}
|
13
10
|
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
14
11
|
spec.license = "MIT"
|
12
|
+
spec.required_ruby_version = ">= 2.6"
|
15
13
|
|
16
|
-
spec.
|
17
|
-
|
14
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
15
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
16
|
+
spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md"
|
17
|
+
|
18
|
+
# Specify which files should be added to the gem when it is released.
|
19
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
20
|
+
spec.files = Dir.chdir(__dir__) do
|
21
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
22
|
end
|
19
23
|
spec.bindir = "exe"
|
20
24
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
21
25
|
spec.require_paths = ["lib"]
|
22
26
|
|
23
|
-
spec.
|
24
|
-
spec.
|
25
|
-
spec.
|
26
|
-
spec.
|
27
|
-
spec.
|
28
|
-
spec.
|
29
|
-
spec.
|
30
|
-
|
31
|
-
spec.add_development_dependency "bundler", "~> 1.15"
|
32
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
33
|
-
spec.add_development_dependency "minitest", "~> 5.0"
|
34
|
-
spec.add_development_dependency "yard", "~> 0.9"
|
27
|
+
spec.add_dependency "activesupport", ">= 6.0"
|
28
|
+
spec.add_dependency "casual_support", "~> 4.0"
|
29
|
+
spec.add_dependency "gorge", "~> 1.0"
|
30
|
+
spec.add_dependency "mechanize", "~> 2.7"
|
31
|
+
spec.add_dependency "mini_sanity", "~> 2.0"
|
32
|
+
spec.add_dependency "pleasant_path", "~> 2.0"
|
33
|
+
spec.add_dependency "ryoba", "~> 1.0"
|
35
34
|
end
|
data/lib/grubby.rb
CHANGED
@@ -23,22 +23,22 @@ class Grubby < Mechanize
|
|
23
23
|
|
24
24
|
VERSION = GRUBBY_VERSION
|
25
25
|
|
26
|
-
# The
|
27
|
-
#
|
28
|
-
#
|
26
|
+
# The minimum amount of time enforced between requests, in seconds.
|
27
|
+
# If the value is a Range, a random number within the Range is chosen
|
28
|
+
# for each request.
|
29
29
|
#
|
30
30
|
# @return [Integer, Float, Range<Integer>, Range<Float>]
|
31
31
|
attr_accessor :time_between_requests
|
32
32
|
|
33
33
|
# Journal file used to ensure only-once processing of resources by
|
34
|
-
# {
|
34
|
+
# {fulfill} across multiple program runs.
|
35
35
|
#
|
36
36
|
# @return [Pathname, nil]
|
37
37
|
attr_reader :journal
|
38
38
|
|
39
39
|
# @param journal [Pathname, String]
|
40
40
|
# Optional journal file used to ensure only-once processing of
|
41
|
-
# resources by {
|
41
|
+
# resources by {fulfill} across multiple program runs
|
42
42
|
def initialize(journal = nil)
|
43
43
|
super()
|
44
44
|
|
@@ -74,26 +74,27 @@ class Grubby < Mechanize
|
|
74
74
|
end
|
75
75
|
|
76
76
|
# Sets the journal file used to ensure only-once processing of
|
77
|
-
# resources by {
|
77
|
+
# resources by {fulfill} across multiple program runs. Setting the
|
78
78
|
# journal file will clear the in-memory list of previously-processed
|
79
79
|
# resources, and, if the journal file exists, load the list from file.
|
80
80
|
#
|
81
81
|
# @param path [Pathname, String, nil]
|
82
82
|
# @return [Pathname]
|
83
83
|
def journal=(path)
|
84
|
-
@journal = path&.to_pathname&.
|
85
|
-
@
|
84
|
+
@journal = path&.to_pathname&.make_file
|
85
|
+
@fulfilled = if @journal
|
86
86
|
require "csv"
|
87
|
-
CSV.read(@journal).map{|row|
|
87
|
+
CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
|
88
88
|
else
|
89
89
|
Set.new
|
90
90
|
end
|
91
91
|
@journal
|
92
92
|
end
|
93
93
|
|
94
|
-
# Calls +#head+ and returns true if
|
95
|
-
#
|
96
|
-
# do not
|
94
|
+
# Calls +#head+ and returns true if a response code "200" is received,
|
95
|
+
# false otherwise. Unlike +#head+, error response codes (e.g. "404",
|
96
|
+
# "500") do not result in a +Mechanize::ResponseCodeError+ being
|
97
|
+
# raised.
|
97
98
|
#
|
98
99
|
# @param uri [URI, String]
|
99
100
|
# @return [Boolean]
|
@@ -106,7 +107,7 @@ class Grubby < Mechanize
|
|
106
107
|
end
|
107
108
|
|
108
109
|
# Calls +#get+ with each of +mirror_uris+ until a successful
|
109
|
-
# ("200 OK") response is
|
110
|
+
# ("200 OK") response is received, and returns that +#get+ result.
|
110
111
|
# Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
|
111
112
|
# the last mirror.
|
112
113
|
#
|
@@ -114,13 +115,13 @@ class Grubby < Mechanize
|
|
114
115
|
# grubby = Grubby.new
|
115
116
|
#
|
116
117
|
# urls = [
|
117
|
-
# "
|
118
|
-
# "
|
119
|
-
# "
|
120
|
-
# "
|
118
|
+
# "https://httpstat.us/404",
|
119
|
+
# "https://httpstat.us/500",
|
120
|
+
# "https://httpstat.us/200?foo",
|
121
|
+
# "https://httpstat.us/200?bar",
|
121
122
|
# ]
|
122
123
|
#
|
123
|
-
# grubby.get_mirrored(urls).uri # == URI("
|
124
|
+
# grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
|
124
125
|
#
|
125
126
|
# grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
|
126
127
|
#
|
@@ -145,70 +146,87 @@ class Grubby < Mechanize
|
|
145
146
|
end
|
146
147
|
|
147
148
|
# Ensures only-once processing of the resource indicated by +uri+ for
|
148
|
-
# the specified +purpose+.
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
152
|
-
#
|
149
|
+
# the specified +purpose+. The given block is executed and the result
|
150
|
+
# is returned if and only if the Grubby instance has not recorded a
|
151
|
+
# previous call to +fulfill+ for the same resource and purpose.
|
152
|
+
#
|
153
|
+
# Note that the resource is identified by both its URI and its content
|
154
|
+
# hash. The latter prevents superfluous and rearranged URI query
|
155
|
+
# string parameters from interfering with only-once processing.
|
156
|
+
#
|
157
|
+
# If {journal} is set, and if the block does not raise an exception,
|
158
|
+
# the resource and purpose are logged to the journal file. This
|
159
|
+
# enables only-once processing across multiple program runs. It also
|
160
|
+
# provides a means to resume batch processing after an unexpected
|
161
|
+
# termination.
|
153
162
|
#
|
154
163
|
# @example
|
155
164
|
# grubby = Grubby.new
|
156
165
|
#
|
157
|
-
# grubby.
|
158
|
-
#
|
166
|
+
# grubby.fulfill("https://example.com/posts") do |page|
|
167
|
+
# "first time"
|
168
|
+
# end
|
169
|
+
# # == "first time"
|
170
|
+
#
|
171
|
+
# grubby.fulfill("https://example.com/posts") do |page|
|
172
|
+
# "already seen" # not evaluated
|
159
173
|
# end
|
174
|
+
# # == nil
|
160
175
|
#
|
161
|
-
# grubby.
|
162
|
-
#
|
176
|
+
# grubby.fulfill("https://example.com/posts?page=1") do |page|
|
177
|
+
# "already seen content hash" # not evaluated
|
163
178
|
# end
|
179
|
+
# # == nil
|
164
180
|
#
|
165
|
-
# grubby.
|
166
|
-
#
|
181
|
+
# grubby.fulfill("https://example.com/posts", "again!") do |page|
|
182
|
+
# "already seen, but new purpose"
|
167
183
|
# end
|
184
|
+
# # == "already seen, but new purpose"
|
168
185
|
#
|
169
186
|
# @param uri [URI, String]
|
170
187
|
# @param purpose [String]
|
171
|
-
# @yield [resource]
|
172
188
|
# @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
173
|
-
# @
|
174
|
-
#
|
189
|
+
# @yieldreturn [Object]
|
190
|
+
# @return [Object, nil]
|
175
191
|
# @raise [Mechanize::ResponseCodeError]
|
176
192
|
# if fetching the resource results in error (see +Mechanize#get+)
|
177
|
-
def
|
193
|
+
def fulfill(uri, purpose = "")
|
178
194
|
series = []
|
179
195
|
|
180
196
|
uri = uri.to_absolute_uri
|
181
|
-
return
|
197
|
+
return unless add_fulfilled(uri, purpose, series)
|
182
198
|
|
183
199
|
normalized_uri = normalize_uri(uri)
|
184
|
-
return
|
200
|
+
return unless add_fulfilled(normalized_uri, purpose, series)
|
185
201
|
|
186
202
|
$log.info("Fetch #{normalized_uri}")
|
187
203
|
resource = get(normalized_uri)
|
188
|
-
|
189
|
-
|
204
|
+
unprocessed = add_fulfilled(resource.uri, purpose, series) &
|
205
|
+
add_fulfilled("content hash: #{resource.content_hash}", purpose, series)
|
190
206
|
|
191
|
-
yield resource
|
207
|
+
result = yield resource if unprocessed
|
192
208
|
|
193
209
|
CSV.open(journal, "a") do |csv|
|
194
|
-
series.each{|
|
210
|
+
series.each{|entry| csv << entry }
|
195
211
|
end if journal
|
196
212
|
|
197
|
-
|
213
|
+
result
|
198
214
|
end
|
199
215
|
|
200
216
|
|
201
217
|
private
|
202
218
|
|
203
219
|
# @!visibility private
|
204
|
-
|
220
|
+
FulfilledEntry = Struct.new(:purpose, :target)
|
205
221
|
|
206
|
-
def
|
207
|
-
series <<
|
208
|
-
if series.uniq
|
209
|
-
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
210
|
-
$log.info("Skip #{series.first.target} (#{seen_info})")
|
222
|
+
def add_fulfilled(target, purpose, series)
|
223
|
+
series << FulfilledEntry.new(purpose, target.to_s)
|
224
|
+
if (series.uniq!) || @fulfilled.add?(series.last)
|
211
225
|
true
|
226
|
+
else
|
227
|
+
$log.info("Skip #{series.first.target}" \
|
228
|
+
" (seen#{" #{series.last.target}" unless series.length == 1})")
|
229
|
+
false
|
212
230
|
end
|
213
231
|
end
|
214
232
|
|