grubby 1.2.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +6 -3
- data/CHANGELOG.md +12 -0
- data/Gemfile +3 -0
- data/README.md +140 -92
- data/Rakefile +0 -13
- data/gemfiles/activesupport-6.0.gemfile +3 -0
- data/grubby.gemspec +17 -18
- data/lib/grubby.rb +64 -46
- data/lib/grubby/core_ext/uri.rb +12 -11
- data/lib/grubby/json_parser.rb +1 -27
- data/lib/grubby/json_scraper.rb +6 -2
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -2
- data/lib/grubby/mechanize/link.rb +9 -6
- data/lib/grubby/mechanize/page.rb +4 -2
- data/lib/grubby/mechanize/parser.rb +9 -9
- data/lib/grubby/page_scraper.rb +6 -2
- data/lib/grubby/scraper.rb +86 -60
- data/lib/grubby/version.rb +1 -1
- metadata +17 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e313c9ba144ee119b31eb6b7ec5fef721df811c8d579f532e5aa5de5a8d65198
|
4
|
+
data.tar.gz: 07f06e01378301c37ca0177a29f95e72f3cf549b65c3e1c9896c9749a9cd857d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea948a4c90d2d9ef0e1cd527adc3ef89cb0379ad98751ffbf671b5cf2210e6e700b7856983e1378a500d9db842d9411cf20275f005ea4a2e2eba824a9c929ee3
|
7
|
+
data.tar.gz: 7a7985f0d5127d6c7e25f9d39a489c460cd76f05219358072dce618667edcefd033740335fbb4b6c8cfa216f0ed4f4d3cfab239af7d33f7d92ec939508a6ea20
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 2.0.0
|
2
|
+
|
3
|
+
* [BREAKING] Drop support for Active Support < 6.0
|
4
|
+
* [BREAKING] Require casual_support ~> 4.0
|
5
|
+
* [BREAKING] Require mini_sanity ~> 2.0
|
6
|
+
* [BREAKING] Require pleasant_path ~> 2.0
|
7
|
+
* [BREAKING] Remove `JsonParser.json_parse_options`
|
8
|
+
* Use `::JSON.load_default_options` instead
|
9
|
+
* [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
|
10
|
+
* [BREAKING] Change `Grubby#fulfill` to return block's result
|
11
|
+
|
12
|
+
|
1
13
|
## 1.2.1
|
2
14
|
|
3
15
|
* Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,162 +1,211 @@
|
|
1
|
-
# grubby
|
1
|
+
# grubby [![Build Status](https://travis-ci.org/jonathanhefner/grubby.svg?branch=master)](https://travis-ci.org/jonathanhefner/grubby)
|
2
2
|
|
3
3
|
[Fail-fast] web scraping. *grubby* adds a layer of utility and
|
4
|
-
error-checking atop the marvelous [Mechanize gem]. See API
|
4
|
+
error-checking atop the marvelous [Mechanize gem]. See API listing
|
5
5
|
below, or browse the [full documentation].
|
6
6
|
|
7
7
|
[Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
|
8
8
|
[Mechanize gem]: https://rubygems.org/gems/mechanize
|
9
|
-
[full documentation]:
|
9
|
+
[full documentation]: https://www.rubydoc.info/gems/grubby/
|
10
10
|
|
11
11
|
|
12
12
|
## Examples
|
13
13
|
|
14
|
-
The following
|
14
|
+
The following code scrapes stories from the [Hacker News](
|
15
|
+
https://news.ycombinator.com/news) front page:
|
15
16
|
|
16
17
|
```ruby
|
17
18
|
require "grubby"
|
18
19
|
|
19
20
|
class HackerNews < Grubby::PageScraper
|
20
21
|
scrapes(:items) do
|
21
|
-
page.search!(".athing").map{|
|
22
|
+
page.search!(".athing").map{|element| Item.new(element) }
|
22
23
|
end
|
23
24
|
|
24
25
|
class Item < Grubby::Scraper
|
25
26
|
scrapes(:story_link){ source.at!("a.storylink") }
|
26
|
-
|
27
|
+
|
28
|
+
scrapes(:story_url){ expand_url(story_link["href"]) }
|
29
|
+
|
27
30
|
scrapes(:title){ story_link.text }
|
31
|
+
|
32
|
+
scrapes(:comments_link, optional: true) do
|
33
|
+
source.next_sibling.search!(".subtext a").find do |link|
|
34
|
+
link.text.match?(/comment|discuss/)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
scrapes(:comments_url, if: :comments_link) do
|
39
|
+
expand_url(comments_link["href"])
|
40
|
+
end
|
41
|
+
|
42
|
+
scrapes(:comment_count, if: :comments_link) do
|
43
|
+
comments_link.text.to_i
|
44
|
+
end
|
45
|
+
|
46
|
+
def expand_url(url)
|
47
|
+
url.include?("://") ? url : source.document.uri.merge(url).to_s
|
48
|
+
end
|
28
49
|
end
|
29
50
|
end
|
30
51
|
|
31
52
|
# The following line will raise an exception if anything goes wrong
|
32
53
|
# during the scraping process. For example, if the structure of the
|
33
|
-
# HTML does not match expectations
|
34
|
-
#
|
35
|
-
#
|
36
|
-
# hard-to-trace errors.
|
54
|
+
# HTML does not match expectations due to a site change, the script will
|
55
|
+
# terminate immediately with a helpful error message. This prevents bad
|
56
|
+
# data from propagating and causing hard-to-trace errors.
|
37
57
|
hn = HackerNews.scrape("https://news.ycombinator.com/news")
|
38
58
|
|
39
59
|
# Your processing logic goes here:
|
40
60
|
hn.items.take(10).each do |item|
|
41
61
|
puts "* #{item.title}"
|
42
|
-
puts " #{item.
|
62
|
+
puts " #{item.story_url}"
|
63
|
+
puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
|
43
64
|
puts
|
44
65
|
end
|
45
66
|
```
|
46
67
|
|
47
|
-
|
68
|
+
Hacker News also offers a [JSON API](https://github.com/HackerNews/API),
|
69
|
+
which may be more robust for scraping purposes. *grubby* can scrape
|
70
|
+
JSON just as well:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
require "grubby"
|
74
|
+
|
75
|
+
class HackerNews < Grubby::JsonScraper
|
76
|
+
scrapes(:items) do
|
77
|
+
# API returns array of top 500 item IDs, so limit as necessary
|
78
|
+
json.take(10).map do |item_id|
|
79
|
+
Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json")
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
class Item < Grubby::JsonScraper
|
84
|
+
scrapes(:story_url){ json["url"] || hn_url }
|
85
|
+
|
86
|
+
scrapes(:title){ json["title"] }
|
87
|
+
|
88
|
+
scrapes(:comments_url, optional: true) do
|
89
|
+
hn_url if json["descendants"]
|
90
|
+
end
|
91
|
+
|
92
|
+
scrapes(:comment_count, optional: true) do
|
93
|
+
json["descendants"]&.to_i
|
94
|
+
end
|
95
|
+
|
96
|
+
def hn_url
|
97
|
+
"https://news.ycombinator.com/item?id=#{json["id"]}"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json")
|
103
|
+
|
104
|
+
# Your processing logic goes here:
|
105
|
+
hn.items.each do |item|
|
106
|
+
puts "* #{item.title}"
|
107
|
+
puts " #{item.story_url}"
|
108
|
+
puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
|
109
|
+
puts
|
110
|
+
end
|
111
|
+
```
|
48
112
|
|
49
113
|
|
50
114
|
## Core API
|
51
115
|
|
52
|
-
- [Grubby](
|
53
|
-
- [#
|
54
|
-
- [#
|
55
|
-
- [#
|
56
|
-
- [#time_between_requests](
|
57
|
-
- [Scraper](
|
58
|
-
- [.each](
|
59
|
-
- [.
|
60
|
-
- [.
|
61
|
-
- [
|
62
|
-
- [#
|
63
|
-
|
64
|
-
- [
|
65
|
-
- [
|
66
|
-
|
67
|
-
- [
|
68
|
-
- [
|
69
|
-
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
70
|
-
- [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
71
|
-
- Mechanize::Download
|
72
|
-
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
73
|
-
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
116
|
+
- [Grubby](https://www.rubydoc.info/gems/grubby/Grubby)
|
117
|
+
- [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill)
|
118
|
+
- [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
119
|
+
- [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
|
120
|
+
- [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
121
|
+
- [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
122
|
+
- [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
|
123
|
+
- [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
|
124
|
+
- [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
|
125
|
+
- [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
|
126
|
+
- [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
127
|
+
- [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
128
|
+
- [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
|
129
|
+
- [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
130
|
+
- [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
131
|
+
- [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
132
|
+
- [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
74
133
|
- Mechanize::File
|
75
|
-
- [#save_to](
|
76
|
-
- [#save_to!](
|
134
|
+
- [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
135
|
+
- [#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
77
136
|
- Mechanize::Page
|
78
|
-
- [#at!](
|
79
|
-
- [#search!](
|
137
|
+
- [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
138
|
+
- [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
80
139
|
- Mechanize::Page::Link
|
81
|
-
- [#to_absolute_uri](
|
140
|
+
- [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link:to_absolute_uri)
|
82
141
|
- URI
|
83
142
|
- [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
|
84
143
|
- [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
|
85
144
|
|
86
145
|
|
87
|
-
##
|
146
|
+
## Auxiliary API
|
88
147
|
|
89
|
-
*grubby*
|
90
|
-
|
91
|
-
|
92
|
-
**a few** of the methods each provides. See each gem's documentation
|
93
|
-
for a complete API listing.
|
148
|
+
*grubby* loads several gems that extend Ruby objects with utility
|
149
|
+
methods. Some of those methods are listed below. See each gem's
|
150
|
+
documentation for a complete API listing.
|
94
151
|
|
95
152
|
- [Active Support](https://rubygems.org/gems/activesupport)
|
96
|
-
([docs](
|
153
|
+
([docs](https://www.rubydoc.info/gems/activesupport/))
|
97
154
|
- [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
98
155
|
- [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
|
99
|
-
- [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
|
100
156
|
- [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
|
101
157
|
- [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
|
102
158
|
- [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
|
103
159
|
- [casual_support](https://rubygems.org/gems/casual_support)
|
104
|
-
([docs](
|
105
|
-
- [Enumerable#index_to](
|
106
|
-
- [String#after](
|
107
|
-
- [String#after_last](
|
108
|
-
- [String#before](
|
109
|
-
- [String#before_last](
|
110
|
-
- [String#between](
|
111
|
-
- [Time#to_hms](
|
112
|
-
- [Time#to_ymd](
|
160
|
+
([docs](https://www.rubydoc.info/gems/casual_support/))
|
161
|
+
- [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
|
162
|
+
- [String#after](https://www.rubydoc.info/gems/casual_support/String:after)
|
163
|
+
- [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last)
|
164
|
+
- [String#before](https://www.rubydoc.info/gems/casual_support/String:before)
|
165
|
+
- [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last)
|
166
|
+
- [String#between](https://www.rubydoc.info/gems/casual_support/String:between)
|
167
|
+
- [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms)
|
168
|
+
- [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd)
|
113
169
|
- [gorge](https://rubygems.org/gems/gorge)
|
114
|
-
([docs](
|
115
|
-
- [Pathname#file_crc32](
|
116
|
-
- [Pathname#file_md5](
|
117
|
-
- [Pathname#file_sha1](
|
118
|
-
- [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
|
119
|
-
- [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
|
120
|
-
- [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
|
170
|
+
([docs](https://www.rubydoc.info/gems/gorge/))
|
171
|
+
- [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
172
|
+
- [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
173
|
+
- [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
121
174
|
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
122
|
-
([docs](
|
123
|
-
- [
|
124
|
-
- [
|
125
|
-
- [Object#
|
126
|
-
- [Object#
|
127
|
-
- [
|
128
|
-
- [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
|
129
|
-
- [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
|
175
|
+
([docs](https://www.rubydoc.info/gems/mini_sanity/))
|
176
|
+
- [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21)
|
177
|
+
- [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21)
|
178
|
+
- [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21)
|
179
|
+
- [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21)
|
180
|
+
- [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21)
|
130
181
|
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
131
|
-
([docs](
|
132
|
-
- [Pathname#available_name](
|
133
|
-
- [Pathname#
|
134
|
-
- [Pathname#
|
135
|
-
- [Pathname#
|
136
|
-
- [Pathname#
|
137
|
-
- [Pathname#
|
138
|
-
- [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
139
|
-
- [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
182
|
+
([docs](https://www.rubydoc.info/gems/pleasant_path/))
|
183
|
+
- [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
|
184
|
+
- [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
|
185
|
+
- [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
186
|
+
- [Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
|
187
|
+
- [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
188
|
+
- [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
140
189
|
- [ryoba](https://rubygems.org/gems/ryoba)
|
141
|
-
([docs](
|
142
|
-
- [Nokogiri::XML::Node#matches!](
|
143
|
-
- [Nokogiri::XML::Node#text!](
|
144
|
-
- [Nokogiri::XML::Node#uri](
|
145
|
-
- [Nokogiri::XML::Searchable#ancestor!](
|
146
|
-
- [Nokogiri::XML::Searchable#ancestors!](
|
147
|
-
- [Nokogiri::XML::Searchable#at!](
|
148
|
-
- [Nokogiri::XML::Searchable#search!](
|
190
|
+
([docs](https://www.rubydoc.info/gems/ryoba/))
|
191
|
+
- [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
192
|
+
- [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
193
|
+
- [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
194
|
+
- [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
195
|
+
- [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
196
|
+
- [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
197
|
+
- [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
149
198
|
|
150
199
|
|
151
200
|
## Installation
|
152
201
|
|
153
|
-
Install
|
202
|
+
Install the [gem](https://rubygems.org/gems/grubby):
|
154
203
|
|
155
204
|
```bash
|
156
205
|
$ gem install grubby
|
157
206
|
```
|
158
207
|
|
159
|
-
Then require in your Ruby
|
208
|
+
Then require in your Ruby code:
|
160
209
|
|
161
210
|
```ruby
|
162
211
|
require "grubby"
|
@@ -165,8 +214,7 @@ require "grubby"
|
|
165
214
|
|
166
215
|
## Contributing
|
167
216
|
|
168
|
-
Run `rake test` to run the tests.
|
169
|
-
interactive prompt that pre-loads the project code.
|
217
|
+
Run `rake test` to run the tests.
|
170
218
|
|
171
219
|
|
172
220
|
## License
|
data/Rakefile
CHANGED
@@ -1,18 +1,5 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
2
|
require "rake/testtask"
|
3
|
-
require "yard"
|
4
|
-
|
5
|
-
|
6
|
-
YARD::Rake::YardocTask.new(:doc) do |t|
|
7
|
-
end
|
8
|
-
|
9
|
-
desc "Launch IRB with this gem pre-loaded"
|
10
|
-
task :irb do
|
11
|
-
require "grubby"
|
12
|
-
require "irb"
|
13
|
-
ARGV.clear
|
14
|
-
IRB.start
|
15
|
-
end
|
16
3
|
|
17
4
|
Rake::TestTask.new(:test) do |t|
|
18
5
|
t.libs << "test"
|
data/grubby.gemspec
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require "grubby/version"
|
1
|
+
require_relative "lib/grubby/version"
|
5
2
|
|
6
3
|
Gem::Specification.new do |spec|
|
7
4
|
spec.name = "grubby"
|
@@ -12,24 +9,26 @@ Gem::Specification.new do |spec|
|
|
12
9
|
spec.summary = %q{Fail-fast web scraping}
|
13
10
|
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
14
11
|
spec.license = "MIT"
|
12
|
+
spec.required_ruby_version = ">= 2.6"
|
15
13
|
|
16
|
-
spec.
|
17
|
-
|
14
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
15
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
16
|
+
spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md"
|
17
|
+
|
18
|
+
# Specify which files should be added to the gem when it is released.
|
19
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
20
|
+
spec.files = Dir.chdir(__dir__) do
|
21
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
22
|
end
|
19
23
|
spec.bindir = "exe"
|
20
24
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
21
25
|
spec.require_paths = ["lib"]
|
22
26
|
|
23
|
-
spec.
|
24
|
-
spec.
|
25
|
-
spec.
|
26
|
-
spec.
|
27
|
-
spec.
|
28
|
-
spec.
|
29
|
-
spec.
|
30
|
-
|
31
|
-
spec.add_development_dependency "bundler", "~> 1.15"
|
32
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
33
|
-
spec.add_development_dependency "minitest", "~> 5.0"
|
34
|
-
spec.add_development_dependency "yard", "~> 0.9"
|
27
|
+
spec.add_dependency "activesupport", ">= 6.0"
|
28
|
+
spec.add_dependency "casual_support", "~> 4.0"
|
29
|
+
spec.add_dependency "gorge", "~> 1.0"
|
30
|
+
spec.add_dependency "mechanize", "~> 2.7"
|
31
|
+
spec.add_dependency "mini_sanity", "~> 2.0"
|
32
|
+
spec.add_dependency "pleasant_path", "~> 2.0"
|
33
|
+
spec.add_dependency "ryoba", "~> 1.0"
|
35
34
|
end
|
data/lib/grubby.rb
CHANGED
@@ -23,22 +23,22 @@ class Grubby < Mechanize
|
|
23
23
|
|
24
24
|
VERSION = GRUBBY_VERSION
|
25
25
|
|
26
|
-
# The
|
27
|
-
#
|
28
|
-
#
|
26
|
+
# The minimum amount of time enforced between requests, in seconds.
|
27
|
+
# If the value is a Range, a random number within the Range is chosen
|
28
|
+
# for each request.
|
29
29
|
#
|
30
30
|
# @return [Integer, Float, Range<Integer>, Range<Float>]
|
31
31
|
attr_accessor :time_between_requests
|
32
32
|
|
33
33
|
# Journal file used to ensure only-once processing of resources by
|
34
|
-
# {
|
34
|
+
# {fulfill} across multiple program runs.
|
35
35
|
#
|
36
36
|
# @return [Pathname, nil]
|
37
37
|
attr_reader :journal
|
38
38
|
|
39
39
|
# @param journal [Pathname, String]
|
40
40
|
# Optional journal file used to ensure only-once processing of
|
41
|
-
# resources by {
|
41
|
+
# resources by {fulfill} across multiple program runs
|
42
42
|
def initialize(journal = nil)
|
43
43
|
super()
|
44
44
|
|
@@ -74,26 +74,27 @@ class Grubby < Mechanize
|
|
74
74
|
end
|
75
75
|
|
76
76
|
# Sets the journal file used to ensure only-once processing of
|
77
|
-
# resources by {
|
77
|
+
# resources by {fulfill} across multiple program runs. Setting the
|
78
78
|
# journal file will clear the in-memory list of previously-processed
|
79
79
|
# resources, and, if the journal file exists, load the list from file.
|
80
80
|
#
|
81
81
|
# @param path [Pathname, String, nil]
|
82
82
|
# @return [Pathname]
|
83
83
|
def journal=(path)
|
84
|
-
@journal = path&.to_pathname&.
|
85
|
-
@
|
84
|
+
@journal = path&.to_pathname&.make_file
|
85
|
+
@fulfilled = if @journal
|
86
86
|
require "csv"
|
87
|
-
CSV.read(@journal).map{|row|
|
87
|
+
CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
|
88
88
|
else
|
89
89
|
Set.new
|
90
90
|
end
|
91
91
|
@journal
|
92
92
|
end
|
93
93
|
|
94
|
-
# Calls +#head+ and returns true if
|
95
|
-
#
|
96
|
-
# do not
|
94
|
+
# Calls +#head+ and returns true if a response code "200" is received,
|
95
|
+
# false otherwise. Unlike +#head+, error response codes (e.g. "404",
|
96
|
+
# "500") do not result in a +Mechanize::ResponseCodeError+ being
|
97
|
+
# raised.
|
97
98
|
#
|
98
99
|
# @param uri [URI, String]
|
99
100
|
# @return [Boolean]
|
@@ -106,7 +107,7 @@ class Grubby < Mechanize
|
|
106
107
|
end
|
107
108
|
|
108
109
|
# Calls +#get+ with each of +mirror_uris+ until a successful
|
109
|
-
# ("200 OK") response is
|
110
|
+
# ("200 OK") response is received, and returns that +#get+ result.
|
110
111
|
# Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
|
111
112
|
# the last mirror.
|
112
113
|
#
|
@@ -114,13 +115,13 @@ class Grubby < Mechanize
|
|
114
115
|
# grubby = Grubby.new
|
115
116
|
#
|
116
117
|
# urls = [
|
117
|
-
# "
|
118
|
-
# "
|
119
|
-
# "
|
120
|
-
# "
|
118
|
+
# "https://httpstat.us/404",
|
119
|
+
# "https://httpstat.us/500",
|
120
|
+
# "https://httpstat.us/200?foo",
|
121
|
+
# "https://httpstat.us/200?bar",
|
121
122
|
# ]
|
122
123
|
#
|
123
|
-
# grubby.get_mirrored(urls).uri # == URI("
|
124
|
+
# grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
|
124
125
|
#
|
125
126
|
# grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
|
126
127
|
#
|
@@ -145,70 +146,87 @@ class Grubby < Mechanize
|
|
145
146
|
end
|
146
147
|
|
147
148
|
# Ensures only-once processing of the resource indicated by +uri+ for
|
148
|
-
# the specified +purpose+.
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
152
|
-
#
|
149
|
+
# the specified +purpose+. The given block is executed and the result
|
150
|
+
# is returned if and only if the Grubby instance has not recorded a
|
151
|
+
# previous call to +fulfill+ for the same resource and purpose.
|
152
|
+
#
|
153
|
+
# Note that the resource is identified by both its URI and its content
|
154
|
+
# hash. The latter prevents superfluous and rearranged URI query
|
155
|
+
# string parameters from interfering with only-once processing.
|
156
|
+
#
|
157
|
+
# If {journal} is set, and if the block does not raise an exception,
|
158
|
+
# the resource and purpose are logged to the journal file. This
|
159
|
+
# enables only-once processing across multiple program runs. It also
|
160
|
+
# provides a means to resume batch processing after an unexpected
|
161
|
+
# termination.
|
153
162
|
#
|
154
163
|
# @example
|
155
164
|
# grubby = Grubby.new
|
156
165
|
#
|
157
|
-
# grubby.
|
158
|
-
#
|
166
|
+
# grubby.fulfill("https://example.com/posts") do |page|
|
167
|
+
# "first time"
|
168
|
+
# end
|
169
|
+
# # == "first time"
|
170
|
+
#
|
171
|
+
# grubby.fulfill("https://example.com/posts") do |page|
|
172
|
+
# "already seen" # not evaluated
|
159
173
|
# end
|
174
|
+
# # == nil
|
160
175
|
#
|
161
|
-
# grubby.
|
162
|
-
#
|
176
|
+
# grubby.fulfill("https://example.com/posts?page=1") do |page|
|
177
|
+
# "already seen content hash" # not evaluated
|
163
178
|
# end
|
179
|
+
# # == nil
|
164
180
|
#
|
165
|
-
# grubby.
|
166
|
-
#
|
181
|
+
# grubby.fulfill("https://example.com/posts", "again!") do |page|
|
182
|
+
# "already seen, but new purpose"
|
167
183
|
# end
|
184
|
+
# # == "already seen, but new purpose"
|
168
185
|
#
|
169
186
|
# @param uri [URI, String]
|
170
187
|
# @param purpose [String]
|
171
|
-
# @yield [resource]
|
172
188
|
# @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
173
|
-
# @
|
174
|
-
#
|
189
|
+
# @yieldreturn [Object]
|
190
|
+
# @return [Object, nil]
|
175
191
|
# @raise [Mechanize::ResponseCodeError]
|
176
192
|
# if fetching the resource results in error (see +Mechanize#get+)
|
177
|
-
def
|
193
|
+
def fulfill(uri, purpose = "")
|
178
194
|
series = []
|
179
195
|
|
180
196
|
uri = uri.to_absolute_uri
|
181
|
-
return
|
197
|
+
return unless add_fulfilled(uri, purpose, series)
|
182
198
|
|
183
199
|
normalized_uri = normalize_uri(uri)
|
184
|
-
return
|
200
|
+
return unless add_fulfilled(normalized_uri, purpose, series)
|
185
201
|
|
186
202
|
$log.info("Fetch #{normalized_uri}")
|
187
203
|
resource = get(normalized_uri)
|
188
|
-
|
189
|
-
|
204
|
+
unprocessed = add_fulfilled(resource.uri, purpose, series) &
|
205
|
+
add_fulfilled("content hash: #{resource.content_hash}", purpose, series)
|
190
206
|
|
191
|
-
yield resource
|
207
|
+
result = yield resource if unprocessed
|
192
208
|
|
193
209
|
CSV.open(journal, "a") do |csv|
|
194
|
-
series.each{|
|
210
|
+
series.each{|entry| csv << entry }
|
195
211
|
end if journal
|
196
212
|
|
197
|
-
|
213
|
+
result
|
198
214
|
end
|
199
215
|
|
200
216
|
|
201
217
|
private
|
202
218
|
|
203
219
|
# @!visibility private
|
204
|
-
|
220
|
+
FulfilledEntry = Struct.new(:purpose, :target)
|
205
221
|
|
206
|
-
def
|
207
|
-
series <<
|
208
|
-
if series.uniq
|
209
|
-
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
210
|
-
$log.info("Skip #{series.first.target} (#{seen_info})")
|
222
|
+
def add_fulfilled(target, purpose, series)
|
223
|
+
series << FulfilledEntry.new(purpose, target.to_s)
|
224
|
+
if (series.uniq!) || @fulfilled.add?(series.last)
|
211
225
|
true
|
226
|
+
else
|
227
|
+
$log.info("Skip #{series.first.target}" \
|
228
|
+
" (seen#{" #{series.last.target}" unless series.length == 1})")
|
229
|
+
false
|
212
230
|
end
|
213
231
|
end
|
214
232
|
|