grubby 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -11
- data/README.md +24 -28
- data/grubby.gemspec +1 -2
- data/lib/grubby.rb +72 -25
- data/lib/grubby/core_ext/string.rb +2 -1
- data/lib/grubby/core_ext/uri.rb +4 -3
- data/lib/grubby/json_parser.rb +1 -1
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -1
- data/lib/grubby/mechanize/page.rb +7 -3
- data/lib/grubby/page_scraper.rb +1 -1
- data/lib/grubby/scraper.rb +165 -25
- data/lib/grubby/version.rb +1 -1
- metadata +5 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 84d759cf7187c8502b42e9d7828f59f126bb87af8da524e9d8e6f6ad8a64f467
+  data.tar.gz: bf26cca3991fca00e573f51f28a1c457e063e4f419986971f1429f051f2e3155
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 38b8f7818be985da5c48484b8a3f42a40401b4890e46da93c2565c546654a660537cf15303e1106bdca201d1ea8e7ff90e13ab13dcb652997b0acc9becc01b48
+  data.tar.gz: e3c8b063d275ebf49dc50c5a70fa82cb0f9e517f17cc9e3735557a2fe998d5ea82a3ea0932ad8a6ecec630f4f66c8d62443c76497f6e98e4f202a72df988095e
data/CHANGELOG.md
CHANGED
@@ -1,15 +1,28 @@
+## 1.2.0
+
+* Add `Grubby#journal=`
+* Add `$grubby` global default `Grubby` instance
+* Add `Scraper.scrape`
+* Add `Scraper.each`
+* Support `:if` and `:unless` options for `Scraper.scrapes`
+* Fix fail-fast behavior of inherited scraper fields
+* Fix `JsonParser` on empty response body
+* Loosen Active Support version constraint
+
+
 ## 1.1.0
-
-*
-*
-
-
-*
-*
-*
-
-
-*
+
+* Add `Grubby#ok?`
+* Add `PageScraper.scrape_file` and `JsonScraper.scrape_file`
+* Add `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
+  which are inherited by `Mechanize::Download` and `Mechanize::File`
+* Add `URI#basename`
+* Add `URI#query_param`
+* Add utility methods from [ryoba](https://rubygems.org/gems/ryoba)
+* Add `Scraper::Error#scraper` and `Scraper#errors` for interactive
+  debugging with e.g. `byebug`
+* Improve log messages and error formatting
+* Fix compatibility with net-http-persistent gem v3.0
 
 
 ## 1.0.0
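Taken together, the headline 1.2.0 additions (`$grubby` and `Scraper.scrape`) shorten the typical entry point. A minimal sketch of how they combine; the scraper class, URL, and CSS selector below are assumptions for illustration, not part of the gem:

```ruby
require "grubby"

# Hypothetical scraper used only for illustration.
class ExamplePostScraper < Grubby::PageScraper
  scrapes(:title) { page.at!("h1").text }
end

# 1.2.0 defines a global default agent ($grubby), so Scraper.scrape can be
# called without constructing a Grubby instance first.
post = ExamplePostScraper.scrape("https://example.com/posts/1")
puts post.title
```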
data/README.md
CHANGED
@@ -11,7 +11,7 @@ below, or browse the [full documentation].
 
 ## Examples
 
-The following example scrapes the [Hacker News] front page:
+The following example scrapes stories from the [Hacker News] front page:
 
 ```ruby
 require "grubby"
@@ -19,38 +19,31 @@ require "grubby"
 class HackerNews < Grubby::PageScraper
 
   scrapes(:items) do
-    page.search!(".athing").map{|
+    page.search!(".athing").map{|el| Item.new(el) }
   end
 
-
-
-
-
-    scrapes(:title) { @row1.at!(".storylink").text }
-    scrapes(:submitter) { @row2.at!(".hnuser").text }
-    scrapes(:story_uri) { URI.join(@base_uri, @row1.at!(".storylink")["href"]) }
-    scrapes(:comments_uri) { URI.join(@base_uri, @row2.at!(".age a")["href"]) }
-
-    def initialize(source)
-      @row1 = source
-      @row2 = source.next_sibling
-      @base_uri = source.document.url
-      super
+  class Item < Grubby::Scraper
+    scrapes(:story_link){ source.at!("a.storylink") }
+    scrapes(:story_uri) { story_link.uri }
+    scrapes(:title) { story_link.text }
   end
 
 end
 
-grubby = Grubby.new
-
 # The following line will raise an exception if anything goes wrong
 # during the scraping process. For example, if the structure of the
-# HTML does not match expectations, either due to
-#
-#
-#
-hn = HackerNews.
-
-
+# HTML does not match expectations, either due to incorrect assumptions
+# or a site change, the script will terminate immediately with a helpful
+# error message. This prevents bad data from propagating and causing
+# hard-to-trace errors.
+hn = HackerNews.scrape("https://news.ycombinator.com/news")
+
+# Your processing logic goes here:
+hn.items.take(10).each do |item|
+  puts "* #{item.title}"
+  puts "  #{item.story_uri}"
+  puts
+end
 ```
 
 [Hacker News]: https://news.ycombinator.com/news
@@ -64,7 +57,9 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
   - [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
   - [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
 - [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
+  - [.each](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
   - [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
+  - [.scrape](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
   - [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
   - [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
   - [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
@@ -136,14 +131,14 @@ for a complete API listing.
   - [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
 - [pleasant_path](https://rubygems.org/gems/pleasant_path)
   ([docs](http://www.rubydoc.info/gems/pleasant_path/))
+  - [Pathname#available_name](http://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
   - [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
-  - [Pathname#dirs_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs_r)
   - [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
-  - [Pathname#files_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:files_r)
   - [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
+  - [Pathname#make_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_file)
+  - [Pathname#move_as](http://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
   - [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
   - [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
-  - [Pathname#touch_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:touch_file)
 - [ryoba](https://rubygems.org/gems/ryoba)
   ([docs](http://www.rubydoc.info/gems/ryoba/))
   - [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
@@ -154,6 +149,7 @@ for a complete API listing.
   - [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
   - [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
 
+
 ## Installation
 
 Install from [Ruby Gems](https://rubygems.org/gems/grubby):
data/grubby.gemspec
CHANGED
@@ -20,9 +20,8 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_runtime_dependency "activesupport", "
+  spec.add_runtime_dependency "activesupport", ">= 5.0"
   spec.add_runtime_dependency "casual_support", "~> 3.0"
-  spec.add_runtime_dependency "dumb_delimited", "~> 1.0"
   spec.add_runtime_dependency "gorge", "~> 1.0"
   spec.add_runtime_dependency "mechanize", "~> 2.7"
   spec.add_runtime_dependency "mini_sanity", "~> 1.0"
data/lib/grubby.rb
CHANGED
@@ -1,6 +1,5 @@
 require "active_support/all"
 require "casual_support"
-require "dumb_delimited"
 require "gorge"
 require "mechanize"
 require "mini_sanity"
@@ -32,7 +31,7 @@ class Grubby < Mechanize
   attr_accessor :time_between_requests
 
   # Journal file used to ensure only-once processing of resources by
-  # {singleton} across multiple program runs.
+  # {singleton} across multiple program runs.
   #
   # @return [Pathname, nil]
   attr_reader :journal
@@ -68,20 +67,37 @@ class Grubby < Mechanize
     self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
     self.time_between_requests = 1.0
 
-
-
+    self.journal = journal
+  end
+
+  # Sets the journal file used to ensure only-once processing of
+  # resources by {singleton} across multiple program runs. Setting the
+  # journal file will clear the in-memory list of previously-processed
+  # resources, and, if the journal file exists, load the list from file.
+  #
+  # @param path [Pathname, String, nil]
+  # @return [Pathname]
+  def journal=(path)
+    @journal = path&.to_pathname&.touch_file
+    @seen = if @journal
+      require "csv"
+      CSV.read(@journal).map{|row| SingletonKey.new(*row) }.index_to{ true }
+    else
+      {}
+    end
+    @journal
   end
 
   # Calls +#head+ and returns true if the result has response code
   # "200". Unlike +#head+, error response codes (e.g. "404", "500")
   # do not cause a +Mechanize::ResponseCodeError+ to be raised.
   #
-  # @param uri [String]
+  # @param uri [URI, String]
   # @return [Boolean]
   def ok?(uri, query_params = {}, headers = {})
     begin
       head(uri, query_params, headers).code == "200"
-    rescue Mechanize::ResponseCodeError
+    rescue Mechanize::ResponseCodeError
       false
     end
   end
@@ -91,7 +107,21 @@ class Grubby < Mechanize
   # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
   # the last mirror.
   #
-  # @
+  # @example
+  #   grubby = Grubby.new
+  #
+  #   urls = [
+  #     "http://httpstat.us/404",
+  #     "http://httpstat.us/500",
+  #     "http://httpstat.us/200#foo",
+  #     "http://httpstat.us/200#bar",
+  #   ]
+  #
+  #   grubby.get_mirrored(urls).uri  # == URI("http://httpstat.us/200#foo")
+  #
+  #   grubby.get_mirrored(urls.take(2))  # raise Mechanize::ResponseCodeError
+  #
+  # @param mirror_uris [Array<URI>, Array<String>]
   # @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
   # @raise [Mechanize::ResponseCodeError]
   #   if all +mirror_uris+ fail
@@ -111,32 +141,43 @@ class Grubby < Mechanize
     end
   end
 
-  # Ensures only-once processing of the resource indicated by +
-  #
-  #
-  #
-  #
+  # Ensures only-once processing of the resource indicated by +uri+ for
+  # the specified +purpose+. A list of previously-processed resource
+  # URIs and content hashes is maintained in the Grubby instance. The
+  # given block is called with the fetched resource only if the
+  # resource's URI and the resource's content hash have not been
   # previously processed under the specified +purpose+.
   #
-  # @
-  #
+  # @example
+  #   grubby = Grubby.new
+  #
+  #   grubby.singleton("https://example.com/foo") do |page|
+  #     # will be executed (first time "/foo")
+  #   end
+  #
+  #   grubby.singleton("https://example.com/foo#bar") do |page|
+  #     # will be skipped (already seen "/foo")
+  #   end
+  #
+  #   grubby.singleton("https://example.com/foo", "again!") do |page|
+  #     # will be executed (new purpose for "/foo")
+  #   end
+  #
+  # @param uri [URI, String]
   # @param purpose [String]
-  #   the purpose of processing the resource
   # @yield [resource]
-  #   processes the resource
   # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
-  #   the fetched resource
   # @return [Boolean]
   #   whether the given block was called
   # @raise [Mechanize::ResponseCodeError]
   #   if fetching the resource results in error (see +Mechanize#get+)
-  def singleton(
+  def singleton(uri, purpose = "")
     series = []
 
-
-    return if try_skip_singleton(
+    uri = uri.to_absolute_uri
+    return if try_skip_singleton(uri, purpose, series)
 
-    normalized_uri = normalize_uri(
+    normalized_uri = normalize_uri(uri)
     return if try_skip_singleton(normalized_uri, purpose, series)
 
     $log.info("Fetch #{normalized_uri}")
@@ -146,7 +187,9 @@ class Grubby < Mechanize
 
     yield resource unless skip
 
-
+    CSV.open(journal, "a") do |csv|
+      series.each{|singleton_key| csv << singleton_key }
+    end if journal
 
     !skip
   end
@@ -154,7 +197,8 @@ class Grubby < Mechanize
 
   private
 
-
+  # @!visibility private
+  SingletonKey = Struct.new(:purpose, :target)
 
   def try_skip_singleton(target, purpose, series)
     series << SingletonKey.new(purpose, target.to_s)
@@ -175,8 +219,8 @@ class Grubby < Mechanize
 
   def sleep_between_requests
     @last_request_at ||= 0.0
-    delay_duration =
-      rand(
+    delay_duration = time_between_requests.is_a?(Range) ?
+      rand(time_between_requests) : time_between_requests
     sleep_duration = @last_request_at + delay_duration - Time.now.to_f
     sleep(sleep_duration) if sleep_duration > 0
     @last_request_at = Time.now.to_f
@@ -189,3 +233,6 @@ require_relative "grubby/json_parser"
 require_relative "grubby/scraper"
 require_relative "grubby/page_scraper"
 require_relative "grubby/json_scraper"
+
+
+$grubby = Grubby.new
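The new `journal=` setter and the Range-aware request delay shown above can be combined as follows. A hedged sketch; the journal path and URL are assumptions for illustration:

```ruby
require "grubby"

grubby = Grubby.new

# Persist the list of processed resources to a CSV journal so that
# singleton() also skips them on later runs (the path is hypothetical).
grubby.journal = "processed.csv"

# A Range delay is sampled per request by sleep_between_requests.
grubby.time_between_requests = 1.0..3.0

grubby.singleton("https://example.com/feed") do |page|
  # Runs at most once per URI / content hash for this purpose.
  puts page.title
end
```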
data/lib/grubby/core_ext/string.rb
CHANGED
@@ -4,7 +4,8 @@ class String
   # does not denote an absolute URI.
   #
   # @return [URI]
-  # @raise [RuntimeError]
+  # @raise [RuntimeError]
+  #   if the String does not denote an absolute URI
   def to_absolute_uri
     URI(self).to_absolute_uri
   end
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -9,7 +9,7 @@ module URI
   #
   # @return [String]
   def basename
-    self.path == "/" ? "" : File.basename(self.path)
+    self.path == "/" ? "" : ::File.basename(self.path)
   end
 
   # Returns the value of the specified param in the URI's +query+.
@@ -21,7 +21,7 @@ module URI
   #   occurrence of that param in the query string.
   #
   # @example
-  #   URI("http://example.com/?foo=a").query_param("foo")
+  #   URI("http://example.com/?foo=a").query_param("foo")  # == "a"
   #
   #   URI("http://example.com/?foo=a&foo=b").query_param("foo")  # == "b"
   #   URI("http://example.com/?foo=a&foo=b").query_param("foo[]")  # == nil
@@ -43,7 +43,8 @@ module URI
   # Raises an exception if the URI is not +absolute?+.
   #
   # @return [self]
-  # @raise [RuntimeError]
+  # @raise [RuntimeError]
+  #   if the URI is not +absolute?+
   def to_absolute_uri
     raise "URI is not absolute: #{self}" unless self.absolute?
     self
data/lib/grubby/json_parser.rb
CHANGED
@@ -39,7 +39,7 @@ class Grubby::JsonParser < Mechanize::File
   attr_reader :json
 
   def initialize(uri = nil, response = nil, body = nil, code = nil)
-    @json = body && JSON.parse(body, self.class.json_parse_options)
+    @json = body.presence && JSON.parse(body, self.class.json_parse_options)
     super
   end
 
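The switch to `body.presence` means an empty response body now yields `json == nil` instead of raising a parse error. A minimal sketch of the difference, assuming the parser can be constructed directly with a nil response (as the `scrape_file` helpers do for `Mechanize::Page`):

```ruby
require "grubby"

# Empty body: String#presence turns "" into nil, so JSON.parse is skipped.
empty = Grubby::JsonParser.new(URI("http://example.com/empty"), nil, "", "200")
empty.json  # => nil

# A non-empty body is parsed as before.
full = Grubby::JsonParser.new(URI("http://example.com/data"), nil, '{"ok":true}', "200")
full.json   # => {"ok"=>true}
```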
data/lib/grubby/mechanize/page.rb
CHANGED
@@ -1,17 +1,21 @@
 class Mechanize::Page
 
   # @!method search!(*queries)
-  #   See
+  #   See Ryoba's +Nokogiri::XML::Searchable#search!+.
   #
   #   @param queries [Array<String>]
-  #   @return [
+  #   @return [Nokogiri::XML::NodeSet]
+  #   @raise [Ryoba::Error]
+  #     if all queries yield no results
   def_delegators :parser, :search!
 
   # @!method at!(*queries)
-  #   See
+  #   See Ryoba's +Nokogiri::XML::Searchable#at!+.
   #
   #   @param queries [Array<String>]
   #   @return [Nokogiri::XML::Element]
+  #   @raise [Ryoba::Error]
+  #     if all queries yield no results
   def_delegators :parser, :at!
 
 end
data/lib/grubby/page_scraper.rb
CHANGED
@@ -24,7 +24,7 @@ class Grubby::PageScraper < Grubby::Scraper
   # @param path [String]
   # @param agent [Mechanize]
   # @return [Grubby::PageScraper]
-  def self.scrape_file(path, agent =
+  def self.scrape_file(path, agent = $grubby)
     uri = URI.join("file:///", File.expand_path(path))
     body = File.read(path)
     self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
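With the agent argument defaulting to `$grubby`, locally saved HTML can be scraped without wiring up an agent first. A brief sketch; the file path, class name, and selector are assumptions for illustration:

```ruby
require "grubby"

# Hypothetical scraper for a page saved to disk.
class SavedPageScraper < Grubby::PageScraper
  scrapes(:heading) { page.at!("h1").text }
end

# scrape_file reads the file, wraps it in a Mechanize::Page, and hands it
# to the scraper, using the global $grubby agent by default.
scraper = SavedPageScraper.scrape_file("saved_page.html")
puts scraper.heading
```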
data/lib/grubby/scraper.rb
CHANGED
@@ -2,61 +2,200 @@ class Grubby::Scraper
 
   # Defines an attribute reader method named by +field+. During
   # +initialize+, the given block is called, and the attribute is set to
-  # the block's return value.
-  #
-  #
+  # the block's return value.
+  #
+  # By default, if the block's return value is nil, an exception will be
+  # raised. To prevent this behavior, specify +optional: true+.
+  #
+  # The block may also be evaluated conditionally, based on another
+  # method's return value, using the +:if+ or +:unless+ options.
+  #
+  # @example
+  #   class GreetingScraper < Grubby::Scraper
+  #     scrapes(:salutation) do
+  #       source[/\A(hello|good morning)\b/i]
+  #     end
+  #
+  #     scrapes(:recipient, optional: true) do
+  #       source[/\A#{salutation} ([a-z ]+)/i, 1]
+  #     end
+  #   end
+  #
+  #   scraper = GreetingScraper.new("Hello World!")
+  #   scraper.salutation  # == "Hello"
+  #   scraper.recipient  # == "World"
+  #
+  #   scraper = GreetingScraper.new("Good morning!")
+  #   scraper.salutation  # == "Good morning"
+  #   scraper.recipient  # == nil
+  #
+  #   scraper = GreetingScraper.new("Hey!")  # raises Grubby::Scraper::Error
+  #
+  # @example
+  #   class EmbeddedUrlScraper < Grubby::Scraper
+  #     scrapes(:url, optional: true){ source[%r"\bhttps?://\S+"] }
+  #
+  #     scrapes(:domain, if: :url){ url[%r"://([^/]+)/", 1] }
+  #   end
+  #
+  #   scraper = EmbeddedUrlScraper.new("visit https://example.com/foo for details")
+  #   scraper.url  # == "https://example.com/foo"
+  #   scraper.domain  # == "example.com"
+  #
+  #   scraper = EmbeddedUrlScraper.new("visit our website for details")
+  #   scraper.url  # == nil
+  #   scraper.domain  # == nil
   #
   # @param field [Symbol, String]
-  #
-  # @
-  #
+  # @param options [Hash]
+  # @option options :optional [Boolean]
+  # @option options :if [Symbol]
+  # @option options :unless [Symbol]
   # @yield []
-  #   scrapes the value
   # @yieldreturn [Object]
-  #
-  def self.scrapes(field,
+  # @return [void]
+  def self.scrapes(field, **options, &block)
     field = field.to_sym
     self.fields << field
 
     define_method(field) do
       raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
-      return @scraped[field] if @scraped.key?(field)
 
-
+      if !@scraped.key?(field) && !@errors.key?(field)
         begin
-
-
-
-
+          skip = (options[:if] && !self.send(options[:if])) ||
+            (options[:unless] && self.send(options[:unless]))
+
+          if skip
+            @scraped[field] = nil
+          else
+            @scraped[field] = instance_eval(&block)
+            if @scraped[field].nil?
+              raise FieldValueRequiredError.new(field) unless options[:optional]
+              $log.debug("#{self.class}##{field} is nil")
+            end
           end
-          @scraped[field] = value
         rescue RuntimeError, IndexError => e
           @errors[field] = e
         end
       end
 
-
-
-
+      if @errors.key?(field)
+        raise FieldScrapeFailedError.new(field, @errors[field])
+      else
+        @scraped[field]
+      end
     end
   end
 
-  #
+  # Fields defined by {scrapes}.
   #
   # @return [Array<Symbol>]
   def self.fields
-    @fields ||= []
+    @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
+  end
+
+  # Instantiates the Scraper class with the resource specified by +url+.
+  # This method acts as a default factory method, and provides a
+  # standard interface for specialized overrides.
+  #
+  # @example Default factory method
+  #   class PostPageScraper < Grubby::PageScraper
+  #     # ...
+  #   end
+  #
+  #   PostPageScraper.scrape("https://example.com/posts/42")
+  #     # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
+  #
+  # @example Specialized factory method
+  #   class PostApiScraper < Grubby::JsonScraper
+  #     # ...
+  #
+  #     def self.scrapes(url, agent = $grubby)
+  #       api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
+  #       super(api_url, agent)
+  #     end
+  #   end
+  #
+  #   PostApiScraper.scrape("https://example.com/posts/42")
+  #     # == PostApiScraper.new($grubby.get("https://api.example.com/posts/42.json"))
+  #
+  # @param url [String, URI]
+  # @param agent [Mechanize]
+  # @return [Grubby::Scraper]
+  def self.scrape(url, agent = $grubby)
+    self.new(agent.get(url))
+  end
+
+  # Iterates a series of pages, starting at +start_url+. For each page,
+  # the Scraper class is instantiated and passed to the given block.
+  # Subsequent pages in the series are determined by invoking
+  # +next_method+ on each previous scraper instance.
+  #
+  # Iteration stops when the +next_method+ method returns nil. If the
+  # +next_method+ method returns a String or URI, that value will be
+  # treated as the URL of the next page. Otherwise that value will be
+  # treated as the page itself.
+  #
+  # @example
+  #   class PostsIndexScraper < Grubby::PageScraper
+  #     scrapes(:page_param){ page.uri.query_param("page") }
+  #
+  #     def next
+  #       page.link_with(text: "Next >")&.click
+  #     end
+  #   end
+  #
+  #   PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
+  #     scraper.page_param  # == "1", "2", "3", ...
+  #   end
+  #
+  # @example
+  #   class PostsIndexScraper < Grubby::PageScraper
+  #     scrapes(:page_param){ page.uri.query_param("page") }
+  #
+  #     scrapes(:next_uri, optional: true) do
+  #       page.link_with(text: "Next >")&.to_absolute_uri
+  #     end
+  #   end
+  #
+  #   PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
+  #     scraper.page_param  # == "1", "2", "3", ...
+  #   end
+  #
+  # @param start_url [String, URI]
+  # @param agent [Mechanize]
+  # @param next_method [Symbol]
+  # @yield [scraper]
+  # @yieldparam scraper [Grubby::Scraper]
+  # @return [void]
+  # @raise [NoMethodError]
+  #   if Scraper class does not implement +next_method+
+  def self.each(start_url, agent = $grubby, next_method: :next)
+    unless self.method_defined?(next_method)
+      raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
+    end
+
+    return to_enum(:each, start_url, agent, next_method: next_method) unless block_given?
+
+    current = start_url
+    while current
+      current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
+      scraper = self.new(current)
+      yield scraper
+      current = scraper.send(next_method)
+    end
   end
 
-  # The
+  # The object being scraped. Typically a Mechanize pluggable parser
   # such as +Mechanize::Page+.
   #
   # @return [Object]
   attr_reader :source
 
-  #
-  # {
-  #   be empty.
+  # Collected errors raised during {initialize} by blocks passed to
+  # {scrapes}, indexed by field name. If {initialize} did not raise
+  # +Grubby::Scraper::Error+, this Hash will be empty.
   #
   # @return [Hash<Symbol, StandardError>]
   attr_reader :errors
@@ -123,6 +262,7 @@ class Grubby::Scraper
     end
   end
 
+  # @!visibility private
   class FieldScrapeFailedError < RuntimeError
     def initialize(field, field_error)
       super("`#{field}` raised #{field_error.class}")
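Because `Scraper.each` returns an Enumerator when no block is given, pagination composes with the usual Enumerable methods. A hedged sketch along the lines of the doc comments above; the URL, selector, and link text are assumptions for illustration:

```ruby
require "grubby"

# Hypothetical paginated index scraper.
class ExampleIndexScraper < Grubby::PageScraper
  scrapes(:headlines) { page.search!(".headline").map(&:text) }

  # Returning nil ends iteration; returning a page continues with it.
  def next
    page.link_with(text: "Next")&.click
  end
end

# With no block, .each returns an Enumerator, so the first three pages can
# be consumed lazily and flattened into a single list.
headlines = ExampleIndexScraper
  .each("https://example.com/index?page=1")
  .take(3)
  .flat_map(&:headlines)
```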
data/lib/grubby/version.rb
CHANGED
@@ -1 +1 @@
-GRUBBY_VERSION = "1.
+GRUBBY_VERSION = "1.2.0"
metadata
CHANGED
@@ -1,27 +1,27 @@
 --- !ruby/object:Gem::Specification
 name: grubby
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.2.0
 platform: ruby
 authors:
 - Jonathan Hefner
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2019-07-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
     - !ruby/object:Gem::Version
       version: '5.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
     - !ruby/object:Gem::Version
       version: '5.0'
 - !ruby/object:Gem::Dependency
@@ -38,20 +38,6 @@ dependencies:
     - - "~>"
     - !ruby/object:Gem::Version
       version: '3.0'
-- !ruby/object:Gem::Dependency
-  name: dumb_delimited
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-    - !ruby/object:Gem::Version
-      version: '1.0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-    - !ruby/object:Gem::Version
-      version: '1.0'
 - !ruby/object:Gem::Dependency
   name: gorge
   requirement: !ruby/object:Gem::Requirement
@@ -227,8 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-
-rubygems_version: 2.7.6
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
 summary: Fail-fast web scraping