grubby 1.2.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
4
- data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
3
+ metadata.gz: e313c9ba144ee119b31eb6b7ec5fef721df811c8d579f532e5aa5de5a8d65198
4
+ data.tar.gz: 07f06e01378301c37ca0177a29f95e72f3cf549b65c3e1c9896c9749a9cd857d
5
5
  SHA512:
6
- metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
7
- data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
6
+ metadata.gz: ea948a4c90d2d9ef0e1cd527adc3ef89cb0379ad98751ffbf671b5cf2210e6e700b7856983e1378a500d9db842d9411cf20275f005ea4a2e2eba824a9c929ee3
7
+ data.tar.gz: 7a7985f0d5127d6c7e25f9d39a489c460cd76f05219358072dce618667edcefd033740335fbb4b6c8cfa216f0ed4f4d3cfab239af7d33f7d92ec939508a6ea20
data/.gitignore CHANGED
@@ -4,6 +4,7 @@
4
4
  /_yardoc/
5
5
  /coverage/
6
6
  /doc/
7
+ /gemfiles/*.lock
7
8
  /pkg/
8
9
  /spec/reports/
9
10
  /tmp/
data/.travis.yml CHANGED
@@ -1,5 +1,8 @@
1
- sudo: false
2
1
  language: ruby
2
+
3
3
  rvm:
4
- - 2.2.5
5
- before_install: gem install bundler -v 1.15.1
4
+ - 2.6
5
+ - 2.7
6
+
7
+ gemfile:
8
+ - gemfiles/activesupport-6.0.gemfile
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 2.0.0
2
+
3
+ * [BREAKING] Drop support for Active Support < 6.0
4
+ * [BREAKING] Require casual_support ~> 4.0
5
+ * [BREAKING] Require mini_sanity ~> 2.0
6
+ * [BREAKING] Require pleasant_path ~> 2.0
7
+ * [BREAKING] Remove `JsonParser.json_parse_options`
8
+ * Use `::JSON.load_default_options` instead
9
+ * [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
10
+ * [BREAKING] Change `Grubby#fulfill` to return block's result
11
+
12
+
1
13
  ## 1.2.1
2
14
 
3
15
  * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
data/Gemfile CHANGED
@@ -2,3 +2,6 @@ source "https://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in grubby.gemspec
4
4
  gemspec
5
+
6
+ gem "rake", "~> 12.0"
7
+ gem "minitest", "~> 5.0"
data/README.md CHANGED
@@ -1,162 +1,211 @@
1
- # grubby
1
+ # grubby [![Build Status](https://travis-ci.org/jonathanhefner/grubby.svg?branch=master)](https://travis-ci.org/jonathanhefner/grubby)
2
2
 
3
3
  [Fail-fast] web scraping. *grubby* adds a layer of utility and
4
- error-checking atop the marvelous [Mechanize gem]. See API summary
4
+ error-checking atop the marvelous [Mechanize gem]. See API listing
5
5
  below, or browse the [full documentation].
6
6
 
7
7
  [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
8
8
  [Mechanize gem]: https://rubygems.org/gems/mechanize
9
- [full documentation]: http://www.rubydoc.info/gems/grubby/
9
+ [full documentation]: https://www.rubydoc.info/gems/grubby/
10
10
 
11
11
 
12
12
  ## Examples
13
13
 
14
- The following example scrapes stories from the [Hacker News] front page:
14
+ The following code scrapes stories from the [Hacker News](
15
+ https://news.ycombinator.com/news) front page:
15
16
 
16
17
  ```ruby
17
18
  require "grubby"
18
19
 
19
20
  class HackerNews < Grubby::PageScraper
20
21
  scrapes(:items) do
21
- page.search!(".athing").map{|el| Item.new(el) }
22
+ page.search!(".athing").map{|element| Item.new(element) }
22
23
  end
23
24
 
24
25
  class Item < Grubby::Scraper
25
26
  scrapes(:story_link){ source.at!("a.storylink") }
26
- scrapes(:story_uri){ story_link.uri }
27
+
28
+ scrapes(:story_url){ expand_url(story_link["href"]) }
29
+
27
30
  scrapes(:title){ story_link.text }
31
+
32
+ scrapes(:comments_link, optional: true) do
33
+ source.next_sibling.search!(".subtext a").find do |link|
34
+ link.text.match?(/comment|discuss/)
35
+ end
36
+ end
37
+
38
+ scrapes(:comments_url, if: :comments_link) do
39
+ expand_url(comments_link["href"])
40
+ end
41
+
42
+ scrapes(:comment_count, if: :comments_link) do
43
+ comments_link.text.to_i
44
+ end
45
+
46
+ def expand_url(url)
47
+ url.include?("://") ? url : source.document.uri.merge(url).to_s
48
+ end
28
49
  end
29
50
  end
30
51
 
31
52
  # The following line will raise an exception if anything goes wrong
32
53
  # during the scraping process. For example, if the structure of the
33
- # HTML does not match expectations, either due to incorrect assumptions
34
- # or a site change, the script will terminate immediately with a helpful
35
- # error message. This prevents bad data from propagating and causing
36
- # hard-to-trace errors.
54
+ # HTML does not match expectations due to a site change, the script will
55
+ # terminate immediately with a helpful error message. This prevents bad
56
+ # data from propagating and causing hard-to-trace errors.
37
57
  hn = HackerNews.scrape("https://news.ycombinator.com/news")
38
58
 
39
59
  # Your processing logic goes here:
40
60
  hn.items.take(10).each do |item|
41
61
  puts "* #{item.title}"
42
- puts " #{item.story_uri}"
62
+ puts " #{item.story_url}"
63
+ puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
43
64
  puts
44
65
  end
45
66
  ```
46
67
 
47
- [Hacker News]: https://news.ycombinator.com/news
68
+ Hacker News also offers a [JSON API](https://github.com/HackerNews/API),
69
+ which may be more robust for scraping purposes. *grubby* can scrape
70
+ JSON just as well:
71
+
72
+ ```ruby
73
+ require "grubby"
74
+
75
+ class HackerNews < Grubby::JsonScraper
76
+ scrapes(:items) do
77
+ # API returns array of top 500 item IDs, so limit as necessary
78
+ json.take(10).map do |item_id|
79
+ Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json")
80
+ end
81
+ end
82
+
83
+ class Item < Grubby::JsonScraper
84
+ scrapes(:story_url){ json["url"] || hn_url }
85
+
86
+ scrapes(:title){ json["title"] }
87
+
88
+ scrapes(:comments_url, optional: true) do
89
+ hn_url if json["descendants"]
90
+ end
91
+
92
+ scrapes(:comment_count, optional: true) do
93
+ json["descendants"]&.to_i
94
+ end
95
+
96
+ def hn_url
97
+ "https://news.ycombinator.com/item?id=#{json["id"]}"
98
+ end
99
+ end
100
+ end
101
+
102
+ hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json")
103
+
104
+ # Your processing logic goes here:
105
+ hn.items.each do |item|
106
+ puts "* #{item.title}"
107
+ puts " #{item.story_url}"
108
+ puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url
109
+ puts
110
+ end
111
+ ```
48
112
 
49
113
 
50
114
  ## Core API
51
115
 
52
- - [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
53
- - [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
54
- - [#ok?](http://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
55
- - [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
56
- - [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
57
- - [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
58
- - [.each](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
59
- - [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
60
- - [.scrape](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
61
- - [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
62
- - [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
63
- - [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
64
- - [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
65
- - [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
66
- - [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
67
- - [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
68
- - [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
69
- - [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
70
- - [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
71
- - Mechanize::Download
72
- - [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
73
- - [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
116
+ - [Grubby](https://www.rubydoc.info/gems/grubby/Grubby)
117
+ - [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill)
118
+ - [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
119
+ - [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
120
+ - [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
121
+ - [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper)
122
+ - [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
123
+ - [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
124
+ - [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
125
+ - [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
126
+ - [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
127
+ - [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
128
+ - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
129
+ - [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
130
+ - [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
131
+ - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
132
+ - [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
74
133
  - Mechanize::File
75
- - [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
76
- - [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
134
+ - [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
135
+ - [#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
77
136
  - Mechanize::Page
78
- - [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
79
- - [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
137
+ - [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
138
+ - [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
80
139
  - Mechanize::Page::Link
81
- - [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
140
+ - [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
82
141
  - URI
83
142
  - [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
84
143
  - [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
85
144
 
86
145
 
87
- ## Supplemental API
146
+ ## Auxiliary API
88
147
 
89
- *grubby* includes several gems which extend Ruby objects with
90
- convenience methods. When you load *grubby* you automatically make
91
- these methods available. The included gems are listed below, along with
92
- **a few** of the methods each provides. See each gem's documentation
93
- for a complete API listing.
148
+ *grubby* loads several gems that extend Ruby objects with utility
149
+ methods. Some of those methods are listed below. See each gem's
150
+ documentation for a complete API listing.
94
151
 
95
152
  - [Active Support](https://rubygems.org/gems/activesupport)
96
- ([docs](http://www.rubydoc.info/gems/activesupport/))
153
+ ([docs](https://www.rubydoc.info/gems/activesupport/))
97
154
  - [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
98
155
  - [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
99
- - [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
100
156
  - [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
101
157
  - [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
102
158
  - [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
103
159
  - [casual_support](https://rubygems.org/gems/casual_support)
104
- ([docs](http://www.rubydoc.info/gems/casual_support/))
105
- - [Enumerable#index_to](http://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
106
- - [String#after](http://www.rubydoc.info/gems/casual_support/String:after)
107
- - [String#after_last](http://www.rubydoc.info/gems/casual_support/String:after_last)
108
- - [String#before](http://www.rubydoc.info/gems/casual_support/String:before)
109
- - [String#before_last](http://www.rubydoc.info/gems/casual_support/String:before_last)
110
- - [String#between](http://www.rubydoc.info/gems/casual_support/String:between)
111
- - [Time#to_hms](http://www.rubydoc.info/gems/casual_support/Time:to_hms)
112
- - [Time#to_ymd](http://www.rubydoc.info/gems/casual_support/Time:to_ymd)
160
+ ([docs](https://www.rubydoc.info/gems/casual_support/))
161
+ - [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
162
+ - [String#after](https://www.rubydoc.info/gems/casual_support/String:after)
163
+ - [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last)
164
+ - [String#before](https://www.rubydoc.info/gems/casual_support/String:before)
165
+ - [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last)
166
+ - [String#between](https://www.rubydoc.info/gems/casual_support/String:between)
167
+ - [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms)
168
+ - [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd)
113
169
  - [gorge](https://rubygems.org/gems/gorge)
114
- ([docs](http://www.rubydoc.info/gems/gorge/))
115
- - [Pathname#file_crc32](http://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
116
- - [Pathname#file_md5](http://www.rubydoc.info/gems/gorge/Pathname:file_md5)
117
- - [Pathname#file_sha1](http://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
118
- - [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
119
- - [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
120
- - [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
170
+ ([docs](https://www.rubydoc.info/gems/gorge/))
171
+ - [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
172
+ - [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5)
173
+ - [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
121
174
  - [mini_sanity](https://rubygems.org/gems/mini_sanity)
122
- ([docs](http://www.rubydoc.info/gems/mini_sanity/))
123
- - [Array#assert_length!](http://www.rubydoc.info/gems/mini_sanity/Array:assert_length%21)
124
- - [Enumerable#refute_empty!](http://www.rubydoc.info/gems/mini_sanity/Enumerable:refute_empty%21)
125
- - [Object#assert_equal!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_equal%21)
126
- - [Object#assert_in!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_in%21)
127
- - [Object#refute_nil!](http://www.rubydoc.info/gems/mini_sanity/Object:refute_nil%21)
128
- - [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
129
- - [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
175
+ ([docs](https://www.rubydoc.info/gems/mini_sanity/))
176
+ - [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21)
177
+ - [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21)
178
+ - [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21)
179
+ - [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21)
180
+ - [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21)
130
181
  - [pleasant_path](https://rubygems.org/gems/pleasant_path)
131
- ([docs](http://www.rubydoc.info/gems/pleasant_path/))
132
- - [Pathname#available_name](http://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
133
- - [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
134
- - [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
135
- - [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
136
- - [Pathname#make_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_file)
137
- - [Pathname#move_as](http://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
138
- - [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
139
- - [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
182
+ ([docs](https://www.rubydoc.info/gems/pleasant_path/))
183
+ - [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
184
+ - [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence)
185
+ - [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
186
+ - [Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
187
+ - [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
188
+ - [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
140
189
  - [ryoba](https://rubygems.org/gems/ryoba)
141
- ([docs](http://www.rubydoc.info/gems/ryoba/))
142
- - [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
143
- - [Nokogiri::XML::Node#text!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
144
- - [Nokogiri::XML::Node#uri](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
145
- - [Nokogiri::XML::Searchable#ancestor!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
146
- - [Nokogiri::XML::Searchable#ancestors!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
147
- - [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
148
- - [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
190
+ ([docs](https://www.rubydoc.info/gems/ryoba/))
191
+ - [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
192
+ - [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
193
+ - [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
194
+ - [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
195
+ - [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
196
+ - [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
197
+ - [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
149
198
 
150
199
 
151
200
  ## Installation
152
201
 
153
- Install from [Ruby Gems](https://rubygems.org/gems/grubby):
202
+ Install the [gem](https://rubygems.org/gems/grubby):
154
203
 
155
204
  ```bash
156
205
  $ gem install grubby
157
206
  ```
158
207
 
159
- Then require in your Ruby script:
208
+ Then require in your Ruby code:
160
209
 
161
210
  ```ruby
162
211
  require "grubby"
@@ -165,8 +214,7 @@ require "grubby"
165
214
 
166
215
  ## Contributing
167
216
 
168
- Run `rake test` to run the tests. You can also run `rake irb` for an
169
- interactive prompt that pre-loads the project code.
217
+ Run `rake test` to run the tests.
170
218
 
171
219
 
172
220
  ## License
data/Rakefile CHANGED
@@ -1,18 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
2
  require "rake/testtask"
3
- require "yard"
4
-
5
-
6
- YARD::Rake::YardocTask.new(:doc) do |t|
7
- end
8
-
9
- desc "Launch IRB with this gem pre-loaded"
10
- task :irb do
11
- require "grubby"
12
- require "irb"
13
- ARGV.clear
14
- IRB.start
15
- end
16
3
 
17
4
  Rake::TestTask.new(:test) do |t|
18
5
  t.libs << "test"
data/gemfiles/activesupport-6.0.gemfile ADDED
@@ -0,0 +1,3 @@
1
+ eval_gemfile "../Gemfile"
2
+
3
+ gem "activesupport", "~> 6.0.0"
data/grubby.gemspec CHANGED
@@ -1,7 +1,4 @@
1
- # coding: utf-8
2
- lib = File.expand_path("../lib", __FILE__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require "grubby/version"
1
+ require_relative "lib/grubby/version"
5
2
 
6
3
  Gem::Specification.new do |spec|
7
4
  spec.name = "grubby"
@@ -12,24 +9,26 @@ Gem::Specification.new do |spec|
12
9
  spec.summary = %q{Fail-fast web scraping}
13
10
  spec.homepage = "https://github.com/jonathanhefner/grubby"
14
11
  spec.license = "MIT"
12
+ spec.required_ruby_version = ">= 2.6"
15
13
 
16
- spec.files = `git ls-files -z`.split("\x0").reject do |f|
17
- f.match(%r{^(test|spec|features)/})
14
+ spec.metadata["homepage_uri"] = spec.homepage
15
+ spec.metadata["source_code_uri"] = spec.homepage
16
+ spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md"
17
+
18
+ # Specify which files should be added to the gem when it is released.
19
+ # `git ls-files -z` lists the files that have been added to git; only those files are packaged into the gem.
20
+ spec.files = Dir.chdir(__dir__) do
21
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
22
  end
19
23
  spec.bindir = "exe"
20
24
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
21
25
  spec.require_paths = ["lib"]
22
26
 
23
- spec.add_runtime_dependency "activesupport", ">= 5.0"
24
- spec.add_runtime_dependency "casual_support", "~> 3.0"
25
- spec.add_runtime_dependency "gorge", "~> 1.0"
26
- spec.add_runtime_dependency "mechanize", "~> 2.7"
27
- spec.add_runtime_dependency "mini_sanity", "~> 1.0"
28
- spec.add_runtime_dependency "pleasant_path", "~> 1.1"
29
- spec.add_runtime_dependency "ryoba", "~> 1.0"
30
-
31
- spec.add_development_dependency "bundler", "~> 1.15"
32
- spec.add_development_dependency "rake", "~> 10.0"
33
- spec.add_development_dependency "minitest", "~> 5.0"
34
- spec.add_development_dependency "yard", "~> 0.9"
27
+ spec.add_dependency "activesupport", ">= 6.0"
28
+ spec.add_dependency "casual_support", "~> 4.0"
29
+ spec.add_dependency "gorge", "~> 1.0"
30
+ spec.add_dependency "mechanize", "~> 2.7"
31
+ spec.add_dependency "mini_sanity", "~> 2.0"
32
+ spec.add_dependency "pleasant_path", "~> 2.0"
33
+ spec.add_dependency "ryoba", "~> 1.0"
35
34
  end
data/lib/grubby.rb CHANGED
@@ -23,22 +23,22 @@ class Grubby < Mechanize
23
23
 
24
24
  VERSION = GRUBBY_VERSION
25
25
 
26
- # The enforced minimum amount of time to wait between requests, in
27
- # seconds. If the value is a Range, a random number within the Range
28
- # is chosen for each request.
26
+ # The minimum amount of time enforced between requests, in seconds.
27
+ # If the value is a Range, a random number within the Range is chosen
28
+ # for each request.
29
29
  #
30
30
  # @return [Integer, Float, Range<Integer>, Range<Float>]
31
31
  attr_accessor :time_between_requests
32
32
 
33
33
  # Journal file used to ensure only-once processing of resources by
34
- # {singleton} across multiple program runs.
34
+ # {fulfill} across multiple program runs.
35
35
  #
36
36
  # @return [Pathname, nil]
37
37
  attr_reader :journal
38
38
 
39
39
  # @param journal [Pathname, String]
40
40
  # Optional journal file used to ensure only-once processing of
41
- # resources by {singleton} across multiple program runs.
41
+ # resources by {fulfill} across multiple program runs.
42
42
  def initialize(journal = nil)
43
43
  super()
44
44
 
@@ -74,26 +74,27 @@ class Grubby < Mechanize
74
74
  end
75
75
 
76
76
  # Sets the journal file used to ensure only-once processing of
77
- # resources by {singleton} across multiple program runs. Setting the
77
+ # resources by {fulfill} across multiple program runs. Setting the
78
78
  # journal file will clear the in-memory list of previously-processed
79
79
  # resources, and, if the journal file exists, load the list from file.
80
80
  #
81
81
  # @param path [Pathname, String, nil]
82
82
  # @return [Pathname]
83
83
  def journal=(path)
84
- @journal = path&.to_pathname&.touch_file
85
- @seen = if @journal
84
+ @journal = path&.to_pathname&.make_file
85
+ @fulfilled = if @journal
86
86
  require "csv"
87
- CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
87
+ CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
88
88
  else
89
89
  Set.new
90
90
  end
91
91
  @journal
92
92
  end
93
93
 
94
- # Calls +#head+ and returns true if the result has response code
95
- # "200". Unlike +#head+, error response codes (e.g. "404", "500")
96
- # do not cause a +Mechanize::ResponseCodeError+ to be raised.
94
+ # Calls +#head+ and returns true if a response code "200" is received,
95
+ # false otherwise. Unlike +#head+, error response codes (e.g. "404",
96
+ # "500") do not result in a +Mechanize::ResponseCodeError+ being
97
+ # raised.
97
98
  #
98
99
  # @param uri [URI, String]
99
100
  # @return [Boolean]
@@ -106,7 +107,7 @@ class Grubby < Mechanize
106
107
  end
107
108
 
108
109
  # Calls +#get+ with each of +mirror_uris+ until a successful
109
- # ("200 OK") response is recieved, and returns that +#get+ result.
110
+ # ("200 OK") response is received, and returns that +#get+ result.
110
111
  # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
111
112
  # the last mirror.
112
113
  #
@@ -114,13 +115,13 @@ class Grubby < Mechanize
114
115
  # grubby = Grubby.new
115
116
  #
116
117
  # urls = [
117
- # "http://httpstat.us/404",
118
- # "http://httpstat.us/500",
119
- # "http://httpstat.us/200#foo",
120
- # "http://httpstat.us/200#bar",
118
+ # "https://httpstat.us/404",
119
+ # "https://httpstat.us/500",
120
+ # "https://httpstat.us/200?foo",
121
+ # "https://httpstat.us/200?bar",
121
122
  # ]
122
123
  #
123
- # grubby.get_mirrored(urls).uri # == URI("http://httpstat.us/200#foo")
124
+ # grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
124
125
  #
125
126
  # grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
126
127
  #
@@ -145,70 +146,87 @@ class Grubby < Mechanize
145
146
  end
146
147
 
147
148
  # Ensures only-once processing of the resource indicated by +uri+ for
148
- # the specified +purpose+. A list of previously-processed resource
149
- # URIs and content hashes is maintained in the Grubby instance. The
150
- # given block is called with the fetched resource only if the
151
- # resource's URI and the resource's content hash have not been
152
- # previously processed under the specified +purpose+.
149
+ # the specified +purpose+. The given block is executed and the result
150
+ # is returned if and only if the Grubby instance has not recorded a
151
+ # previous call to +fulfill+ for the same resource and purpose.
152
+ #
153
+ # Note that the resource is identified by both its URI and its content
154
+ # hash. The latter prevents superfluous and rearranged URI query
155
+ # string parameters from interfering with only-once processing.
156
+ #
157
+ # If {journal} is set, and if the block does not raise an exception,
158
+ # the resource and purpose are logged to the journal file. This
159
+ # enables only-once processing across multiple program runs. It also
160
+ # provides a means to resume batch processing after an unexpected
161
+ # termination.
153
162
  #
154
163
  # @example
155
164
  # grubby = Grubby.new
156
165
  #
157
- # grubby.singleton("https://example.com/foo") do |page|
158
- # # will be executed (first time "/foo")
166
+ # grubby.fulfill("https://example.com/posts") do |page|
167
+ # "first time"
168
+ # end
169
+ # # == "first time"
170
+ #
171
+ # grubby.fulfill("https://example.com/posts") do |page|
172
+ # "already seen" # not evaluated
159
173
  # end
174
+ # # == nil
160
175
  #
161
- # grubby.singleton("https://example.com/foo#bar") do |page|
162
- # # will be skipped (already seen "/foo")
176
+ # grubby.fulfill("https://example.com/posts?page=1") do |page|
177
+ # "already seen content hash" # not evaluated
163
178
  # end
179
+ # # == nil
164
180
  #
165
- # grubby.singleton("https://example.com/foo", "again!") do |page|
166
- # # will be executed (new purpose for "/foo")
181
+ # grubby.fulfill("https://example.com/posts", "again!") do |page|
182
+ # "already seen, but new purpose"
167
183
  # end
184
+ # # == "already seen, but new purpose"
168
185
  #
169
186
  # @param uri [URI, String]
170
187
  # @param purpose [String]
171
- # @yield [resource]
172
188
  # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
173
- # @return [Boolean]
174
- # whether the given block was called
189
+ # @yieldreturn [Object]
190
+ # @return [Object, nil]
175
191
  # @raise [Mechanize::ResponseCodeError]
176
192
  # if fetching the resource results in error (see +Mechanize#get+)
177
- def singleton(uri, purpose = "")
193
+ def fulfill(uri, purpose = "")
178
194
  series = []
179
195
 
180
196
  uri = uri.to_absolute_uri
181
- return if try_skip_singleton(uri, purpose, series)
197
+ return unless add_fulfilled(uri, purpose, series)
182
198
 
183
199
  normalized_uri = normalize_uri(uri)
184
- return if try_skip_singleton(normalized_uri, purpose, series)
200
+ return unless add_fulfilled(normalized_uri, purpose, series)
185
201
 
186
202
  $log.info("Fetch #{normalized_uri}")
187
203
  resource = get(normalized_uri)
188
- skip = try_skip_singleton(resource.uri, purpose, series) |
189
- try_skip_singleton("content hash: #{resource.content_hash}", purpose, series)
204
+ unprocessed = add_fulfilled(resource.uri, purpose, series) &
205
+ add_fulfilled("content hash: #{resource.content_hash}", purpose, series)
190
206
 
191
- yield resource unless skip
207
+ result = yield resource if unprocessed
192
208
 
193
209
  CSV.open(journal, "a") do |csv|
194
- series.each{|singleton_key| csv << singleton_key }
210
+ series.each{|entry| csv << entry }
195
211
  end if journal
196
212
 
197
- !skip
213
+ result
198
214
  end
199
215
 
200
216
 
201
217
  private
202
218
 
203
219
  # @!visibility private
204
- SingletonKey = Struct.new(:purpose, :target)
220
+ FulfilledEntry = Struct.new(:purpose, :target)
205
221
 
206
- def try_skip_singleton(target, purpose, series)
207
- series << SingletonKey.new(purpose, target.to_s)
208
- if series.uniq!.nil? && !@seen.add?(series.last)
209
- seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
210
- $log.info("Skip #{series.first.target} (#{seen_info})")
222
+ def add_fulfilled(target, purpose, series)
223
+ series << FulfilledEntry.new(purpose, target.to_s)
224
+ if (series.uniq!) || @fulfilled.add?(series.last)
211
225
  true
226
+ else
227
+ $log.info("Skip #{series.first.target}" \
228
+ " (seen#{" #{series.last.target}" unless series.length == 1})")
229
+ false
212
230
  end
213
231
  end
214
232