grubby 1.2.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
4
- data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
3
+ metadata.gz: e313c9ba144ee119b31eb6b7ec5fef721df811c8d579f532e5aa5de5a8d65198
4
+ data.tar.gz: 07f06e01378301c37ca0177a29f95e72f3cf549b65c3e1c9896c9749a9cd857d
5
5
  SHA512:
6
- metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
7
- data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
6
+ metadata.gz: ea948a4c90d2d9ef0e1cd527adc3ef89cb0379ad98751ffbf671b5cf2210e6e700b7856983e1378a500d9db842d9411cf20275f005ea4a2e2eba824a9c929ee3
7
+ data.tar.gz: 7a7985f0d5127d6c7e25f9d39a489c460cd76f05219358072dce618667edcefd033740335fbb4b6c8cfa216f0ed4f4d3cfab239af7d33f7d92ec939508a6ea20
data/.gitignore CHANGED
@@ -4,6 +4,7 @@
4
4
  /_yardoc/
5
5
  /coverage/
6
6
  /doc/
7
+ /gemfiles/*.lock
7
8
  /pkg/
8
9
  /spec/reports/
9
10
  /tmp/
data/.travis.yml CHANGED
@@ -1,5 +1,8 @@
1
- sudo: false
2
1
  language: ruby
2
+
3
3
  rvm:
4
- - 2.2.5
5
- before_install: gem install bundler -v 1.15.1
4
+ - 2.6
5
+ - 2.7
6
+
7
+ gemfile:
8
+ - gemfiles/activesupport-6.0.gemfile
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 2.0.0
2
+
3
+ * [BREAKING] Drop support for Active Support < 6.0
4
+ * [BREAKING] Require casual_support ~> 4.0
5
+ * [BREAKING] Require mini_sanity ~> 2.0
6
+ * [BREAKING] Require pleasant_path ~> 2.0
7
+ * [BREAKING] Remove `JsonParser.json_parse_options`
8
+ * Use `::JSON.load_default_options` instead
9
+ * [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
10
+ * [BREAKING] Change `Grubby#fulfill` to return block's result
11
+
12
+
1
13
  ## 1.2.1
2
14
 
3
15
  * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
data/Gemfile CHANGED
@@ -2,3 +2,6 @@ source "https://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in grubby.gemspec
4
4
  gemspec
5
+
6
+ gem "rake", "~> 12.0"
7
+ gem "minitest", "~> 5.0"
data/README.md CHANGED
@@ -1,162 +1,211 @@
1
- # grubby
1
+ # grubby [![Build Status](https://travis-ci.org/jonathanhefner/grubby.svg?branch=master)](https://travis-ci.org/jonathanhefner/grubby)
2
2
 
3
3
  [Fail-fast] web scraping. *grubby* adds a layer of utility and
4
- error-checking atop the marvelous [Mechanize gem]. See API summary
4
+ error-checking atop the marvelous [Mechanize gem]. See API listing
5
5
  below, or browse the [full documentation].
6
6
 
7
7
  [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
8
8
  [Mechanize gem]: https://rubygems.org/gems/mechanize
9
- [full documentation]: http://www.rubydoc.info/gems/grubby/
9
+ [full documentation]: https://www.rubydoc.info/gems/grubby/
10
10
 
11
11
 
12
12
  ## Examples
13
13
 
14
- The following example scrapes stories from the [Hacker News] front page:
14
+ The following code scrapes stories from the [Hacker News](
15
+ https://news.ycombinator.com/news) front page:
15
16
 
16
17
  ```ruby
17
18
  require "grubby"
18
19
 
19
20
  class HackerNews < Grubby::PageScraper
20
21
  scrapes(:items) do
21
- page.search!(".athing").map{|el| Item.new(el) }
22
+ page.search!(".athing").map{|element| Item.new(element) }
22
23
  end
23
24
 
24
25
  class Item < Grubby::Scraper
25
26
  scrapes(:story_link){ source.at!("a.storylink") }
26
- scrapes(:story_uri){ story_link.uri }
27
+
28
+ scrapes(:story_url){ expand_url(story_link["href"]) }
29
+
27
30
  scrapes(:title){ story_link.text }
31
+
32
+ scrapes(:comments_link, optional: true) do
33
+ source.next_sibling.search!(".subtext a").find do |link|
34
+ link.text.match?(/comment|discuss/)
35
+ end
36
+ end
37
+
38
+ scrapes(:comments_url, if: :comments_link) do
39
+ expand_url(comments_link["href"])
40
+ end
41
+
42
+ scrapes(:comment_count, if: :comments_link) do
43
+ comments_link.text.to_i
44
+ end
45
+
46
+ def expand_url(url)
47
+ url.include?("://") ? url : source.document.uri.merge(url).to_s
48
+ end
28
49
  end
29
50
  end
30
51
 
31
52
  # The following line will raise an exception if anything goes wrong
32
53
  # during the scraping process. For example, if the structure of the
33
- # HTML does not match expectations, either due to incorrect assumptions
34
- # or a site change, the script will terminate immediately with a helpful
35
- # error message. This prevents bad data from propagating and causing
36
- # hard-to-trace errors.
54
+ # HTML does not match expectations due to a site change, the script will
55
+ # terminate immediately with a helpful error message. This prevents bad
56
+ # data from propagating and causing hard-to-trace errors.
37
57
  hn = HackerNews.scrape("https://news.ycombinator.com/news")
38
58
 
39
59
  # Your processing logic goes here:
40
60
  hn.items.take(10).each do |item|
41
61
  puts "* #{item.title}"
42
- puts " #{item.story_uri}"
62
+ puts " #{item.story_url}"
63
+ puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
43
64
  puts
44
65
  end
45
66
  ```
46
67
 
47
- [Hacker News]: https://news.ycombinator.com/news
68
+ Hacker News also offers a [JSON API](https://github.com/HackerNews/API),
69
+ which may be more robust for scraping purposes. *grubby* can scrape
70
+ JSON just as well:
71
+
72
+ ```ruby
73
+ require "grubby"
74
+
75
+ class HackerNews < Grubby::JsonScraper
76
+ scrapes(:items) do
77
+ # API returns array of top 500 item IDs, so limit as necessary
78
+ json.take(10).map do |item_id|
79
+ Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json")
80
+ end
81
+ end
82
+
83
+ class Item < Grubby::JsonScraper
84
+ scrapes(:story_url){ json["url"] || hn_url }
85
+
86
+ scrapes(:title){ json["title"] }
87
+
88
+ scrapes(:comments_url, optional: true) do
89
+ hn_url if json["descendants"]
90
+ end
91
+
92
+ scrapes(:comment_count, optional: true) do
93
+ json["descendants"]&.to_i
94
+ end
95
+
96
+ def hn_url
97
+ "https://news.ycombinator.com/item?id=#{json["id"]}"
98
+ end
99
+ end
100
+ end
101
+
102
+ hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json")
103
+
104
+ # Your processing logic goes here:
105
+ hn.items.each do |item|
106
+ puts "* #{item.title}"
107
+ puts " #{item.story_url}"
108
+ puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
109
+ puts
110
+ end
111
+ ```
48
112
 
49
113
 
50
114
  ## Core API
51
115
 
52
- - [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
53
- - [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
54
- - [#ok?](http://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
55
- - [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
56
- - [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
57
- - [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
58
- - [.each](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
59
- - [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
60
- - [.scrape](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
61
- - [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
62
- - [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
63
- - [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
64
- - [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
65
- - [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
66
- - [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
67
- - [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
68
- - [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
69
- - [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
70
- - [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
71
- - Mechanize::Download
72
- - [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
73
- - [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
116
+ - [Grubby](https://www.rubydoc.info/gems/grubby/Grubby)
117
+ - [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill)
118
+ - [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
119
+ - [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
120
+ - [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
121
+ - [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
122
+ - [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
123
+ - [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
124
+ - [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
125
+ - [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
126
+ - [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
127
+ - [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
128
+ - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
129
+ - [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
130
+ - [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
131
+ - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
132
+ - [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
74
133
  - Mechanize::File
75
- - [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
76
- - [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
134
+ - [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
135
+ - [#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
77
136
  - Mechanize::Page
78
- - [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
79
- - [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
137
+ - [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
138
+ - [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
80
139
  - Mechanize::Page::Link
81
- - [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
140
+ - [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
82
141
  - URI
83
142
  - [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
84
143
  - [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
85
144
 
86
145
 
87
- ## Supplemental API
146
+ ## Auxiliary API
88
147
 
89
- *grubby* includes several gems which extend Ruby objects with
90
- convenience methods. When you load *grubby* you automatically make
91
- these methods available. The included gems are listed below, along with
92
- **a few** of the methods each provides. See each gem's documentation
93
- for a complete API listing.
148
+ *grubby* loads several gems that extend Ruby objects with utility
149
+ methods. Some of those methods are listed below. See each gem's
150
+ documentation for a complete API listing.
94
151
 
95
152
  - [Active Support](https://rubygems.org/gems/activesupport)
96
- ([docs](http://www.rubydoc.info/gems/activesupport/))
153
+ ([docs](https://www.rubydoc.info/gems/activesupport/))
97
154
  - [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
98
155
  - [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
99
- - [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
100
156
  - [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
101
157
  - [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
102
158
  - [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
103
159
  - [casual_support](https://rubygems.org/gems/casual_support)
104
- ([docs](http://www.rubydoc.info/gems/casual_support/))
105
- - [Enumerable#index_to](http://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
106
- - [String#after](http://www.rubydoc.info/gems/casual_support/String:after)
107
- - [String#after_last](http://www.rubydoc.info/gems/casual_support/String:after_last)
108
- - [String#before](http://www.rubydoc.info/gems/casual_support/String:before)
109
- - [String#before_last](http://www.rubydoc.info/gems/casual_support/String:before_last)
110
- - [String#between](http://www.rubydoc.info/gems/casual_support/String:between)
111
- - [Time#to_hms](http://www.rubydoc.info/gems/casual_support/Time:to_hms)
112
- - [Time#to_ymd](http://www.rubydoc.info/gems/casual_support/Time:to_ymd)
160
+ ([docs](https://www.rubydoc.info/gems/casual_support/))
161
+ - [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
162
+ - [String#after](https://www.rubydoc.info/gems/casual_support/String:after)
163
+ - [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last)
164
+ - [String#before](https://www.rubydoc.info/gems/casual_support/String:before)
165
+ - [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last)
166
+ - [String#between](https://www.rubydoc.info/gems/casual_support/String:between)
167
+ - [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms)
168
+ - [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd)
113
169
  - [gorge](https://rubygems.org/gems/gorge)
114
- ([docs](http://www.rubydoc.info/gems/gorge/))
115
- - [Pathname#file_crc32](http://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
116
- - [Pathname#file_md5](http://www.rubydoc.info/gems/gorge/Pathname:file_md5)
117
- - [Pathname#file_sha1](http://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
118
- - [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
119
- - [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
120
- - [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
170
+ ([docs](https://www.rubydoc.info/gems/gorge/))
171
+ - [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
172
+ - [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
173
+ - [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
121
174
  - [mini_sanity](https://rubygems.org/gems/mini_sanity)
122
- ([docs](http://www.rubydoc.info/gems/mini_sanity/))
123
- - [Array#assert_length!](http://www.rubydoc.info/gems/mini_sanity/Array:assert_length%21)
124
- - [Enumerable#refute_empty!](http://www.rubydoc.info/gems/mini_sanity/Enumerable:refute_empty%21)
125
- - [Object#assert_equal!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_equal%21)
126
- - [Object#assert_in!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_in%21)
127
- - [Object#refute_nil!](http://www.rubydoc.info/gems/mini_sanity/Object:refute_nil%21)
128
- - [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
129
- - [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
175
+ ([docs](https://www.rubydoc.info/gems/mini_sanity/))
176
+ - [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21)
177
+ - [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21)
178
+ - [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21)
179
+ - [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21)
180
+ - [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21)
130
181
  - [pleasant_path](https://rubygems.org/gems/pleasant_path)
131
- ([docs](http://www.rubydoc.info/gems/pleasant_path/))
132
- - [Pathname#available_name](http://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
133
- - [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
134
- - [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
135
- - [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
136
- - [Pathname#make_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_file)
137
- - [Pathname#move_as](http://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
138
- - [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
139
- - [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
182
+ ([docs](https://www.rubydoc.info/gems/pleasant_path/))
183
+ - [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
184
+ - [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
185
+ - [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
186
+ - [Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
187
+ - [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
188
+ - [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
140
189
  - [ryoba](https://rubygems.org/gems/ryoba)
141
- ([docs](http://www.rubydoc.info/gems/ryoba/))
142
- - [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
143
- - [Nokogiri::XML::Node#text!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
144
- - [Nokogiri::XML::Node#uri](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
145
- - [Nokogiri::XML::Searchable#ancestor!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
146
- - [Nokogiri::XML::Searchable#ancestors!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
147
- - [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
148
- - [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
190
+ ([docs](https://www.rubydoc.info/gems/ryoba/))
191
+ - [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
192
+ - [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
193
+ - [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
194
+ - [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
195
+ - [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
196
+ - [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
197
+ - [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
149
198
 
150
199
 
151
200
  ## Installation
152
201
 
153
- Install from [Ruby Gems](https://rubygems.org/gems/grubby):
202
+ Install the [gem](https://rubygems.org/gems/grubby):
154
203
 
155
204
  ```bash
156
205
  $ gem install grubby
157
206
  ```
158
207
 
159
- Then require in your Ruby script:
208
+ Then require in your Ruby code:
160
209
 
161
210
  ```ruby
162
211
  require "grubby"
@@ -165,8 +214,7 @@ require "grubby"
165
214
 
166
215
  ## Contributing
167
216
 
168
- Run `rake test` to run the tests. You can also run `rake irb` for an
169
- interactive prompt that pre-loads the project code.
217
+ Run `rake test` to run the tests.
170
218
 
171
219
 
172
220
  ## License
data/Rakefile CHANGED
@@ -1,18 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
2
  require "rake/testtask"
3
- require "yard"
4
-
5
-
6
- YARD::Rake::YardocTask.new(:doc) do |t|
7
- end
8
-
9
- desc "Launch IRB with this gem pre-loaded"
10
- task :irb do
11
- require "grubby"
12
- require "irb"
13
- ARGV.clear
14
- IRB.start
15
- end
16
3
 
17
4
  Rake::TestTask.new(:test) do |t|
18
5
  t.libs << "test"
data/gemfiles/activesupport-6.0.gemfile ADDED
@@ -0,0 +1,3 @@
1
+ eval_gemfile "../Gemfile"
2
+
3
+ gem "activesupport", "~> 6.0.0"
data/grubby.gemspec CHANGED
@@ -1,7 +1,4 @@
1
- # coding: utf-8
2
- lib = File.expand_path("../lib", __FILE__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "grubby/version"
1
+ require_relative "lib/grubby/version"
5
2
 
6
3
  Gem::Specification.new do |spec|
7
4
  spec.name = "grubby"
@@ -12,24 +9,26 @@ Gem::Specification.new do |spec|
12
9
  spec.summary = %q{Fail-fast web scraping}
13
10
  spec.homepage = "https://github.com/jonathanhefner/grubby"
14
11
  spec.license = "MIT"
12
+ spec.required_ruby_version = ">= 2.6"
15
13
 
16
- spec.files = `git ls-files -z`.split("\x0").reject do |f|
17
- f.match(%r{^(test|spec|features)/})
14
+ spec.metadata["homepage_uri"] = spec.homepage
15
+ spec.metadata["source_code_uri"] = spec.homepage
16
+ spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md"
17
+
18
+ # Specify which files should be added to the gem when it is released.
19
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20
+ spec.files = Dir.chdir(__dir__) do
21
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
22
  end
19
23
  spec.bindir = "exe"
20
24
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
21
25
  spec.require_paths = ["lib"]
22
26
 
23
- spec.add_runtime_dependency "activesupport", ">= 5.0"
24
- spec.add_runtime_dependency "casual_support", "~> 3.0"
25
- spec.add_runtime_dependency "gorge", "~> 1.0"
26
- spec.add_runtime_dependency "mechanize", "~> 2.7"
27
- spec.add_runtime_dependency "mini_sanity", "~> 1.0"
28
- spec.add_runtime_dependency "pleasant_path", "~> 1.1"
29
- spec.add_runtime_dependency "ryoba", "~> 1.0"
30
-
31
- spec.add_development_dependency "bundler", "~> 1.15"
32
- spec.add_development_dependency "rake", "~> 10.0"
33
- spec.add_development_dependency "minitest", "~> 5.0"
34
- spec.add_development_dependency "yard", "~> 0.9"
27
+ spec.add_dependency "activesupport", ">= 6.0"
28
+ spec.add_dependency "casual_support", "~> 4.0"
29
+ spec.add_dependency "gorge", "~> 1.0"
30
+ spec.add_dependency "mechanize", "~> 2.7"
31
+ spec.add_dependency "mini_sanity", "~> 2.0"
32
+ spec.add_dependency "pleasant_path", "~> 2.0"
33
+ spec.add_dependency "ryoba", "~> 1.0"
35
34
  end
data/lib/grubby.rb CHANGED
@@ -23,22 +23,22 @@ class Grubby < Mechanize
23
23
 
24
24
  VERSION = GRUBBY_VERSION
25
25
 
26
- # The enforced minimum amount of time to wait between requests, in
27
- # seconds. If the value is a Range, a random number within the Range
28
- # is chosen for each request.
26
+ # The minimum amount of time enforced between requests, in seconds.
27
+ # If the value is a Range, a random number within the Range is chosen
28
+ # for each request.
29
29
  #
30
30
  # @return [Integer, Float, Range<Integer>, Range<Float>]
31
31
  attr_accessor :time_between_requests
32
32
 
33
33
  # Journal file used to ensure only-once processing of resources by
34
- # {singleton} across multiple program runs.
34
+ # {fulfill} across multiple program runs.
35
35
  #
36
36
  # @return [Pathname, nil]
37
37
  attr_reader :journal
38
38
 
39
39
  # @param journal [Pathname, String]
40
40
  # Optional journal file used to ensure only-once processing of
41
- # resources by {singleton} across multiple program runs.
41
+ # resources by {fulfill} across multiple program runs
42
42
  def initialize(journal = nil)
43
43
  super()
44
44
 
@@ -74,26 +74,27 @@ class Grubby < Mechanize
74
74
  end
75
75
 
76
76
  # Sets the journal file used to ensure only-once processing of
77
- # resources by {singleton} across multiple program runs. Setting the
77
+ # resources by {fulfill} across multiple program runs. Setting the
78
78
  # journal file will clear the in-memory list of previously-processed
79
79
  # resources, and, if the journal file exists, load the list from file.
80
80
  #
81
81
  # @param path [Pathname, String, nil]
82
82
  # @return [Pathname]
83
83
  def journal=(path)
84
- @journal = path&.to_pathname&.touch_file
85
- @seen = if @journal
84
+ @journal = path&.to_pathname&.make_file
85
+ @fulfilled = if @journal
86
86
  require "csv"
87
- CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
87
+ CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
88
88
  else
89
89
  Set.new
90
90
  end
91
91
  @journal
92
92
  end
93
93
 
94
- # Calls +#head+ and returns true if the result has response code
95
- # "200". Unlike +#head+, error response codes (e.g. "404", "500")
96
- # do not cause a +Mechanize::ResponseCodeError+ to be raised.
94
+ # Calls +#head+ and returns true if a response code "200" is received,
95
+ # false otherwise. Unlike +#head+, error response codes (e.g. "404",
96
+ # "500") do not result in a +Mechanize::ResponseCodeError+ being
97
+ # raised.
97
98
  #
98
99
  # @param uri [URI, String]
99
100
  # @return [Boolean]
@@ -106,7 +107,7 @@ class Grubby < Mechanize
106
107
  end
107
108
 
108
109
  # Calls +#get+ with each of +mirror_uris+ until a successful
109
- # ("200 OK") response is recieved, and returns that +#get+ result.
110
+ # ("200 OK") response is received, and returns that +#get+ result.
110
111
  # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
111
112
  # the last mirror.
112
113
  #
@@ -114,13 +115,13 @@ class Grubby < Mechanize
114
115
  # grubby = Grubby.new
115
116
  #
116
117
  # urls = [
117
- # "http://httpstat.us/404",
118
- # "http://httpstat.us/500",
119
- # "http://httpstat.us/200#foo",
120
- # "http://httpstat.us/200#bar",
118
+ # "https://httpstat.us/404",
119
+ # "https://httpstat.us/500",
120
+ # "https://httpstat.us/200?foo",
121
+ # "https://httpstat.us/200?bar",
121
122
  # ]
122
123
  #
123
- # grubby.get_mirrored(urls).uri # == URI("http://httpstat.us/200#foo")
124
+ # grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
124
125
  #
125
126
  # grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
126
127
  #
@@ -145,70 +146,87 @@ class Grubby < Mechanize
145
146
  end
146
147
 
147
148
  # Ensures only-once processing of the resource indicated by +uri+ for
148
- # the specified +purpose+. A list of previously-processed resource
149
- # URIs and content hashes is maintained in the Grubby instance. The
150
- # given block is called with the fetched resource only if the
151
- # resource's URI and the resource's content hash have not been
152
- # previously processed under the specified +purpose+.
149
+ # the specified +purpose+. The given block is executed and the result
150
+ # is returned if and only if the Grubby instance has not recorded a
151
+ # previous call to +fulfill+ for the same resource and purpose.
152
+ #
153
+ # Note that the resource is identified by both its URI and its content
154
+ # hash. The latter prevents superfluous and rearranged URI query
155
+ # string parameters from interfering with only-once processing.
156
+ #
157
+ # If {journal} is set, and if the block does not raise an exception,
158
+ # the resource and purpose are logged to the journal file. This
159
+ # enables only-once processing across multiple program runs. It also
160
+ # provides a means to resume batch processing after an unexpected
161
+ # termination.
153
162
  #
154
163
  # @example
155
164
  # grubby = Grubby.new
156
165
  #
157
- # grubby.singleton("https://example.com/foo") do |page|
158
- # # will be executed (first time "/foo")
166
+ # grubby.fulfill("https://example.com/posts") do |page|
167
+ # "first time"
168
+ # end
169
+ # # == "first time"
170
+ #
171
+ # grubby.fulfill("https://example.com/posts") do |page|
172
+ # "already seen" # not evaluated
159
173
  # end
174
+ # # == nil
160
175
  #
161
- # grubby.singleton("https://example.com/foo#bar") do |page|
162
- # # will be skipped (already seen "/foo")
176
+ # grubby.fulfill("https://example.com/posts?page=1") do |page|
177
+ # "already seen content hash" # not evaluated
163
178
  # end
179
+ # # == nil
164
180
  #
165
- # grubby.singleton("https://example.com/foo", "again!") do |page|
166
- # # will be executed (new purpose for "/foo")
181
+ # grubby.fulfill("https://example.com/posts", "again!") do |page|
182
+ # "already seen, but new purpose"
167
183
  # end
184
+ # # == "already seen, but new purpose"
168
185
  #
169
186
  # @param uri [URI, String]
170
187
  # @param purpose [String]
171
- # @yield [resource]
172
188
  # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
173
- # @return [Boolean]
174
- # whether the given block was called
189
+ # @yieldreturn [Object]
190
+ # @return [Object, nil]
175
191
  # @raise [Mechanize::ResponseCodeError]
176
192
  # if fetching the resource results in error (see +Mechanize#get+)
177
- def singleton(uri, purpose = "")
193
+ def fulfill(uri, purpose = "")
178
194
  series = []
179
195
 
180
196
  uri = uri.to_absolute_uri
181
- return if try_skip_singleton(uri, purpose, series)
197
+ return unless add_fulfilled(uri, purpose, series)
182
198
 
183
199
  normalized_uri = normalize_uri(uri)
184
- return if try_skip_singleton(normalized_uri, purpose, series)
200
+ return unless add_fulfilled(normalized_uri, purpose, series)
185
201
 
186
202
  $log.info("Fetch #{normalized_uri}")
187
203
  resource = get(normalized_uri)
188
- skip = try_skip_singleton(resource.uri, purpose, series) |
189
- try_skip_singleton("content hash: #{resource.content_hash}", purpose, series)
204
+ unprocessed = add_fulfilled(resource.uri, purpose, series) &
205
+ add_fulfilled("content hash: #{resource.content_hash}", purpose, series)
190
206
 
191
- yield resource unless skip
207
+ result = yield resource if unprocessed
192
208
 
193
209
  CSV.open(journal, "a") do |csv|
194
- series.each{|singleton_key| csv << singleton_key }
210
+ series.each{|entry| csv << entry }
195
211
  end if journal
196
212
 
197
- !skip
213
+ result
198
214
  end
199
215
 
200
216
 
201
217
  private
202
218
 
203
219
  # @!visibility private
204
- SingletonKey = Struct.new(:purpose, :target)
220
+ FulfilledEntry = Struct.new(:purpose, :target)
205
221
 
206
- def try_skip_singleton(target, purpose, series)
207
- series << SingletonKey.new(purpose, target.to_s)
208
- if series.uniq!.nil? && !@seen.add?(series.last)
209
- seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
210
- $log.info("Skip #{series.first.target} (#{seen_info})")
222
+ def add_fulfilled(target, purpose, series)
223
+ series << FulfilledEntry.new(purpose, target.to_s)
224
+ if (series.uniq!) || @fulfilled.add?(series.last)
211
225
  true
226
+ else
227
+ $log.info("Skip #{series.first.target}" \
228
+ " (seen#{" #{series.last.target}" unless series.length == 1})")
229
+ false
212
230
  end
213
231
  end
214
232