grubby 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -11
- data/README.md +24 -28
- data/grubby.gemspec +1 -2
- data/lib/grubby.rb +72 -25
- data/lib/grubby/core_ext/string.rb +2 -1
- data/lib/grubby/core_ext/uri.rb +4 -3
- data/lib/grubby/json_parser.rb +1 -1
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -1
- data/lib/grubby/mechanize/page.rb +7 -3
- data/lib/grubby/page_scraper.rb +1 -1
- data/lib/grubby/scraper.rb +165 -25
- data/lib/grubby/version.rb +1 -1
- metadata +5 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84d759cf7187c8502b42e9d7828f59f126bb87af8da524e9d8e6f6ad8a64f467
|
4
|
+
data.tar.gz: bf26cca3991fca00e573f51f28a1c457e063e4f419986971f1429f051f2e3155
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38b8f7818be985da5c48484b8a3f42a40401b4890e46da93c2565c546654a660537cf15303e1106bdca201d1ea8e7ff90e13ab13dcb652997b0acc9becc01b48
|
7
|
+
data.tar.gz: e3c8b063d275ebf49dc50c5a70fa82cb0f9e517f17cc9e3735557a2fe998d5ea82a3ea0932ad8a6ecec630f4f66c8d62443c76497f6e98e4f202a72df988095e
|
data/CHANGELOG.md
CHANGED
@@ -1,15 +1,28 @@
|
|
1
|
+
## 1.2.0
|
2
|
+
|
3
|
+
* Add `Grubby#journal=`
|
4
|
+
* Add `$grubby` global default `Grubby` instance
|
5
|
+
* Add `Scraper.scrape`
|
6
|
+
* Add `Scraper.each`
|
7
|
+
* Support `:if` and `:unless` options for `Scraper.scrapes`
|
8
|
+
* Fix fail-fast behavior of inherited scraper fields
|
9
|
+
* Fix `JsonParser` on empty response body
|
10
|
+
* Loosen Active Support version constraint
|
11
|
+
|
12
|
+
|
1
13
|
## 1.1.0
|
2
|
-
|
3
|
-
*
|
4
|
-
*
|
5
|
-
|
6
|
-
|
7
|
-
*
|
8
|
-
*
|
9
|
-
*
|
10
|
-
|
11
|
-
|
12
|
-
*
|
14
|
+
|
15
|
+
* Add `Grubby#ok?`
|
16
|
+
* Add `PageScraper.scrape_file` and `JsonScraper.scrape_file`
|
17
|
+
* Add `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
|
18
|
+
which are inherited by `Mechanize::Download` and `Mechanize::File`
|
19
|
+
* Add `URI#basename`
|
20
|
+
* Add `URI#query_param`
|
21
|
+
* Add utility methods from [ryoba](https://rubygems.org/gems/ryoba)
|
22
|
+
* Add `Scraper::Error#scraper` and `Scraper#errors` for interactive
|
23
|
+
debugging with e.g. `byebug`
|
24
|
+
* Improve log messages and error formatting
|
25
|
+
* Fix compatibility with net-http-persistent gem v3.0
|
13
26
|
|
14
27
|
|
15
28
|
## 1.0.0
|
data/README.md
CHANGED
@@ -11,7 +11,7 @@ below, or browse the [full documentation].
|
|
11
11
|
|
12
12
|
## Examples
|
13
13
|
|
14
|
-
The following example scrapes the [Hacker News] front page:
|
14
|
+
The following example scrapes stories from the [Hacker News] front page:
|
15
15
|
|
16
16
|
```ruby
|
17
17
|
require "grubby"
|
@@ -19,38 +19,31 @@ require "grubby"
|
|
19
19
|
class HackerNews < Grubby::PageScraper
|
20
20
|
|
21
21
|
scrapes(:items) do
|
22
|
-
page.search!(".athing").map{|
|
22
|
+
page.search!(".athing").map{|el| Item.new(el) }
|
23
23
|
end
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
scrapes(:title) { @row1.at!(".storylink").text }
|
30
|
-
scrapes(:submitter) { @row2.at!(".hnuser").text }
|
31
|
-
scrapes(:story_uri) { URI.join(@base_uri, @row1.at!(".storylink")["href"]) }
|
32
|
-
scrapes(:comments_uri) { URI.join(@base_uri, @row2.at!(".age a")["href"]) }
|
33
|
-
|
34
|
-
def initialize(source)
|
35
|
-
@row1 = source
|
36
|
-
@row2 = source.next_sibling
|
37
|
-
@base_uri = source.document.url
|
38
|
-
super
|
25
|
+
class Item < Grubby::Scraper
|
26
|
+
scrapes(:story_link){ source.at!("a.storylink") }
|
27
|
+
scrapes(:story_uri) { story_link.uri }
|
28
|
+
scrapes(:title) { story_link.text }
|
39
29
|
end
|
40
30
|
|
41
31
|
end
|
42
32
|
|
43
|
-
grubby = Grubby.new
|
44
|
-
|
45
33
|
# The following line will raise an exception if anything goes wrong
|
46
34
|
# during the scraping process. For example, if the structure of the
|
47
|
-
# HTML does not match expectations, either due to
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
hn = HackerNews.
|
52
|
-
|
53
|
-
|
35
|
+
# HTML does not match expectations, either due to incorrect assumptions
|
36
|
+
# or a site change, the script will terminate immediately with a helpful
|
37
|
+
# error message. This prevents bad data from propagating and causing
|
38
|
+
# hard-to-trace errors.
|
39
|
+
hn = HackerNews.scrape("https://news.ycombinator.com/news")
|
40
|
+
|
41
|
+
# Your processing logic goes here:
|
42
|
+
hn.items.take(10).each do |item|
|
43
|
+
puts "* #{item.title}"
|
44
|
+
puts " #{item.story_uri}"
|
45
|
+
puts
|
46
|
+
end
|
54
47
|
```
|
55
48
|
|
56
49
|
[Hacker News]: https://news.ycombinator.com/news
|
@@ -64,7 +57,9 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
|
64
57
|
- [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
|
65
58
|
- [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
66
59
|
- [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
60
|
+
- [.each](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.each)
|
67
61
|
- [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
|
62
|
+
- [.scrape](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape)
|
68
63
|
- [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
|
69
64
|
- [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
|
70
65
|
- [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
|
@@ -136,14 +131,14 @@ for a complete API listing.
|
|
136
131
|
- [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
|
137
132
|
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
138
133
|
([docs](http://www.rubydoc.info/gems/pleasant_path/))
|
134
|
+
- [Pathname#available_name](http://www.rubydoc.info/gems/pleasant_path/Pathname:available_name)
|
139
135
|
- [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
|
140
|
-
- [Pathname#dirs_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs_r)
|
141
136
|
- [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
|
142
|
-
- [Pathname#files_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:files_r)
|
143
137
|
- [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
138
|
+
- [Pathname#make_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_file)
|
139
|
+
- [Pathname#move_as](http://www.rubydoc.info/gems/pleasant_path/Pathname:move_as)
|
144
140
|
- [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
145
141
|
- [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
146
|
-
- [Pathname#touch_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:touch_file)
|
147
142
|
- [ryoba](https://rubygems.org/gems/ryoba)
|
148
143
|
([docs](http://www.rubydoc.info/gems/ryoba/))
|
149
144
|
- [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
@@ -154,6 +149,7 @@ for a complete API listing.
|
|
154
149
|
- [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
155
150
|
- [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
156
151
|
|
152
|
+
|
157
153
|
## Installation
|
158
154
|
|
159
155
|
Install from [Ruby Gems](https://rubygems.org/gems/grubby):
|
data/grubby.gemspec
CHANGED
@@ -20,9 +20,8 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
21
21
|
spec.require_paths = ["lib"]
|
22
22
|
|
23
|
-
spec.add_runtime_dependency "activesupport", "
|
23
|
+
spec.add_runtime_dependency "activesupport", ">= 5.0"
|
24
24
|
spec.add_runtime_dependency "casual_support", "~> 3.0"
|
25
|
-
spec.add_runtime_dependency "dumb_delimited", "~> 1.0"
|
26
25
|
spec.add_runtime_dependency "gorge", "~> 1.0"
|
27
26
|
spec.add_runtime_dependency "mechanize", "~> 2.7"
|
28
27
|
spec.add_runtime_dependency "mini_sanity", "~> 1.0"
|
data/lib/grubby.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require "active_support/all"
|
2
2
|
require "casual_support"
|
3
|
-
require "dumb_delimited"
|
4
3
|
require "gorge"
|
5
4
|
require "mechanize"
|
6
5
|
require "mini_sanity"
|
@@ -32,7 +31,7 @@ class Grubby < Mechanize
|
|
32
31
|
attr_accessor :time_between_requests
|
33
32
|
|
34
33
|
# Journal file used to ensure only-once processing of resources by
|
35
|
-
# {singleton} across multiple program runs.
|
34
|
+
# {singleton} across multiple program runs.
|
36
35
|
#
|
37
36
|
# @return [Pathname, nil]
|
38
37
|
attr_reader :journal
|
@@ -68,20 +67,37 @@ class Grubby < Mechanize
|
|
68
67
|
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
69
68
|
self.time_between_requests = 1.0
|
70
69
|
|
71
|
-
|
72
|
-
|
70
|
+
self.journal = journal
|
71
|
+
end
|
72
|
+
|
73
|
+
# Sets the journal file used to ensure only-once processing of
|
74
|
+
# resources by {singleton} across multiple program runs. Setting the
|
75
|
+
# journal file will clear the in-memory list of previously-processed
|
76
|
+
# resources, and, if the journal file exists, load the list from file.
|
77
|
+
#
|
78
|
+
# @param path [Pathname, String, nil]
|
79
|
+
# @return [Pathname, nil]
|
80
|
+
def journal=(path)
|
81
|
+
@journal = path&.to_pathname&.touch_file
|
82
|
+
@seen = if @journal
|
83
|
+
require "csv"
|
84
|
+
CSV.read(@journal).map{|row| SingletonKey.new(*row) }.index_to{ true }
|
85
|
+
else
|
86
|
+
{}
|
87
|
+
end
|
88
|
+
@journal
|
73
89
|
end
|
74
90
|
|
75
91
|
# Calls +#head+ and returns true if the result has response code
|
76
92
|
# "200". Unlike +#head+, error response codes (e.g. "404", "500")
|
77
93
|
# do not cause a +Mechanize::ResponseCodeError+ to be raised.
|
78
94
|
#
|
79
|
-
# @param uri [String]
|
95
|
+
# @param uri [URI, String]
|
80
96
|
# @return [Boolean]
|
81
97
|
def ok?(uri, query_params = {}, headers = {})
|
82
98
|
begin
|
83
99
|
head(uri, query_params, headers).code == "200"
|
84
|
-
rescue Mechanize::ResponseCodeError
|
100
|
+
rescue Mechanize::ResponseCodeError
|
85
101
|
false
|
86
102
|
end
|
87
103
|
end
|
@@ -91,7 +107,21 @@ class Grubby < Mechanize
|
|
91
107
|
# Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
|
92
108
|
# the last mirror.
|
93
109
|
#
|
94
|
-
# @
|
110
|
+
# @example
|
111
|
+
# grubby = Grubby.new
|
112
|
+
#
|
113
|
+
# urls = [
|
114
|
+
# "http://httpstat.us/404",
|
115
|
+
# "http://httpstat.us/500",
|
116
|
+
# "http://httpstat.us/200#foo",
|
117
|
+
# "http://httpstat.us/200#bar",
|
118
|
+
# ]
|
119
|
+
#
|
120
|
+
# grubby.get_mirrored(urls).uri # == URI("http://httpstat.us/200#foo")
|
121
|
+
#
|
122
|
+
# grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
|
123
|
+
#
|
124
|
+
# @param mirror_uris [Array<URI>, Array<String>]
|
95
125
|
# @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
96
126
|
# @raise [Mechanize::ResponseCodeError]
|
97
127
|
# if all +mirror_uris+ fail
|
@@ -111,32 +141,43 @@ class Grubby < Mechanize
|
|
111
141
|
end
|
112
142
|
end
|
113
143
|
|
114
|
-
# Ensures only-once processing of the resource indicated by +
|
115
|
-
#
|
116
|
-
#
|
117
|
-
#
|
118
|
-
#
|
144
|
+
# Ensures only-once processing of the resource indicated by +uri+ for
|
145
|
+
# the specified +purpose+. A list of previously-processed resource
|
146
|
+
# URIs and content hashes is maintained in the Grubby instance. The
|
147
|
+
# given block is called with the fetched resource only if the
|
148
|
+
# resource's URI and the resource's content hash have not been
|
119
149
|
# previously processed under the specified +purpose+.
|
120
150
|
#
|
121
|
-
# @
|
122
|
-
#
|
151
|
+
# @example
|
152
|
+
# grubby = Grubby.new
|
153
|
+
#
|
154
|
+
# grubby.singleton("https://example.com/foo") do |page|
|
155
|
+
# # will be executed (first time "/foo")
|
156
|
+
# end
|
157
|
+
#
|
158
|
+
# grubby.singleton("https://example.com/foo#bar") do |page|
|
159
|
+
# # will be skipped (already seen "/foo")
|
160
|
+
# end
|
161
|
+
#
|
162
|
+
# grubby.singleton("https://example.com/foo", "again!") do |page|
|
163
|
+
# # will be executed (new purpose for "/foo")
|
164
|
+
# end
|
165
|
+
#
|
166
|
+
# @param uri [URI, String]
|
123
167
|
# @param purpose [String]
|
124
|
-
# the purpose of processing the resource
|
125
168
|
# @yield [resource]
|
126
|
-
# processes the resource
|
127
169
|
# @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
|
128
|
-
# the fetched resource
|
129
170
|
# @return [Boolean]
|
130
171
|
# whether the given block was called
|
131
172
|
# @raise [Mechanize::ResponseCodeError]
|
132
173
|
# if fetching the resource results in error (see +Mechanize#get+)
|
133
|
-
def singleton(
|
174
|
+
def singleton(uri, purpose = "")
|
134
175
|
series = []
|
135
176
|
|
136
|
-
|
137
|
-
return if try_skip_singleton(
|
177
|
+
uri = uri.to_absolute_uri
|
178
|
+
return if try_skip_singleton(uri, purpose, series)
|
138
179
|
|
139
|
-
normalized_uri = normalize_uri(
|
180
|
+
normalized_uri = normalize_uri(uri)
|
140
181
|
return if try_skip_singleton(normalized_uri, purpose, series)
|
141
182
|
|
142
183
|
$log.info("Fetch #{normalized_uri}")
|
@@ -146,7 +187,9 @@ class Grubby < Mechanize
|
|
146
187
|
|
147
188
|
yield resource unless skip
|
148
189
|
|
149
|
-
|
190
|
+
CSV.open(journal, "a") do |csv|
|
191
|
+
series.each{|singleton_key| csv << singleton_key }
|
192
|
+
end if journal
|
150
193
|
|
151
194
|
!skip
|
152
195
|
end
|
@@ -154,7 +197,8 @@ class Grubby < Mechanize
|
|
154
197
|
|
155
198
|
private
|
156
199
|
|
157
|
-
|
200
|
+
# @!visibility private
|
201
|
+
SingletonKey = Struct.new(:purpose, :target)
|
158
202
|
|
159
203
|
def try_skip_singleton(target, purpose, series)
|
160
204
|
series << SingletonKey.new(purpose, target.to_s)
|
@@ -175,8 +219,8 @@ class Grubby < Mechanize
|
|
175
219
|
|
176
220
|
def sleep_between_requests
|
177
221
|
@last_request_at ||= 0.0
|
178
|
-
delay_duration =
|
179
|
-
rand(
|
222
|
+
delay_duration = time_between_requests.is_a?(Range) ?
|
223
|
+
rand(time_between_requests) : time_between_requests
|
180
224
|
sleep_duration = @last_request_at + delay_duration - Time.now.to_f
|
181
225
|
sleep(sleep_duration) if sleep_duration > 0
|
182
226
|
@last_request_at = Time.now.to_f
|
@@ -189,3 +233,6 @@ require_relative "grubby/json_parser"
|
|
189
233
|
require_relative "grubby/scraper"
|
190
234
|
require_relative "grubby/page_scraper"
|
191
235
|
require_relative "grubby/json_scraper"
|
236
|
+
|
237
|
+
|
238
|
+
$grubby = Grubby.new
|
@@ -4,7 +4,8 @@ class String
|
|
4
4
|
# does not denote an absolute URI.
|
5
5
|
#
|
6
6
|
# @return [URI]
|
7
|
-
# @raise [RuntimeError]
|
7
|
+
# @raise [RuntimeError]
|
8
|
+
# if the String does not denote an absolute URI
|
8
9
|
def to_absolute_uri
|
9
10
|
URI(self).to_absolute_uri
|
10
11
|
end
|
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -9,7 +9,7 @@ module URI
|
|
9
9
|
#
|
10
10
|
# @return [String]
|
11
11
|
def basename
|
12
|
-
self.path == "/" ? "" : File.basename(self.path)
|
12
|
+
self.path == "/" ? "" : ::File.basename(self.path)
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns the value of the specified param in the URI's +query+.
|
@@ -21,7 +21,7 @@ module URI
|
|
21
21
|
# occurrence of that param in the query string.
|
22
22
|
#
|
23
23
|
# @example
|
24
|
-
# URI("http://example.com/?foo=a").query_param("foo")
|
24
|
+
# URI("http://example.com/?foo=a").query_param("foo") # == "a"
|
25
25
|
#
|
26
26
|
# URI("http://example.com/?foo=a&foo=b").query_param("foo") # == "b"
|
27
27
|
# URI("http://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
|
@@ -43,7 +43,8 @@ module URI
|
|
43
43
|
# Raises an exception if the URI is not +absolute?+.
|
44
44
|
#
|
45
45
|
# @return [self]
|
46
|
-
# @raise [RuntimeError]
|
46
|
+
# @raise [RuntimeError]
|
47
|
+
# if the URI is not +absolute?+
|
47
48
|
def to_absolute_uri
|
48
49
|
raise "URI is not absolute: #{self}" unless self.absolute?
|
49
50
|
self
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -39,7 +39,7 @@ class Grubby::JsonParser < Mechanize::File
|
|
39
39
|
attr_reader :json
|
40
40
|
|
41
41
|
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
42
|
-
@json = body && JSON.parse(body, self.class.json_parse_options)
|
42
|
+
@json = body.presence && JSON.parse(body, self.class.json_parse_options)
|
43
43
|
super
|
44
44
|
end
|
45
45
|
|
@@ -1,17 +1,21 @@
|
|
1
1
|
class Mechanize::Page
|
2
2
|
|
3
3
|
# @!method search!(*queries)
|
4
|
-
# See
|
4
|
+
# See Ryoba's +Nokogiri::XML::Searchable#search!+.
|
5
5
|
#
|
6
6
|
# @param queries [Array<String>]
|
7
|
-
# @return [
|
7
|
+
# @return [Nokogiri::XML::NodeSet]
|
8
|
+
# @raise [Ryoba::Error]
|
9
|
+
# if all queries yield no results
|
8
10
|
def_delegators :parser, :search!
|
9
11
|
|
10
12
|
# @!method at!(*queries)
|
11
|
-
# See
|
13
|
+
# See Ryoba's +Nokogiri::XML::Searchable#at!+.
|
12
14
|
#
|
13
15
|
# @param queries [Array<String>]
|
14
16
|
# @return [Nokogiri::XML::Element]
|
17
|
+
# @raise [Ryoba::Error]
|
18
|
+
# if all queries yield no results
|
15
19
|
def_delegators :parser, :at!
|
16
20
|
|
17
21
|
end
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -24,7 +24,7 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
24
24
|
# @param path [String]
|
25
25
|
# @param agent [Mechanize]
|
26
26
|
# @return [Grubby::PageScraper]
|
27
|
-
def self.scrape_file(path, agent =
|
27
|
+
def self.scrape_file(path, agent = $grubby)
|
28
28
|
uri = URI.join("file:///", File.expand_path(path))
|
29
29
|
body = File.read(path)
|
30
30
|
self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
|
data/lib/grubby/scraper.rb
CHANGED
@@ -2,61 +2,200 @@ class Grubby::Scraper
|
|
2
2
|
|
3
3
|
# Defines an attribute reader method named by +field+. During
|
4
4
|
# +initialize+, the given block is called, and the attribute is set to
|
5
|
-
# the block's return value.
|
6
|
-
#
|
7
|
-
#
|
5
|
+
# the block's return value.
|
6
|
+
#
|
7
|
+
# By default, if the block's return value is nil, an exception will be
|
8
|
+
# raised. To prevent this behavior, specify +optional: true+.
|
9
|
+
#
|
10
|
+
# The block may also be evaluated conditionally, based on another
|
11
|
+
# method's return value, using the +:if+ or +:unless+ options.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# class GreetingScraper < Grubby::Scraper
|
15
|
+
# scrapes(:salutation) do
|
16
|
+
# source[/\A(hello|good morning)\b/i]
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# scrapes(:recipient, optional: true) do
|
20
|
+
# source[/\A#{salutation} ([a-z ]+)/i, 1]
|
21
|
+
# end
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# scraper = GreetingScraper.new("Hello World!")
|
25
|
+
# scraper.salutation # == "Hello"
|
26
|
+
# scraper.recipient # == "World"
|
27
|
+
#
|
28
|
+
# scraper = GreetingScraper.new("Good morning!")
|
29
|
+
# scraper.salutation # == "Good morning"
|
30
|
+
# scraper.recipient # == nil
|
31
|
+
#
|
32
|
+
# scraper = GreetingScraper.new("Hey!") # raises Grubby::Scraper::Error
|
33
|
+
#
|
34
|
+
# @example
|
35
|
+
# class EmbeddedUrlScraper < Grubby::Scraper
|
36
|
+
# scrapes(:url, optional: true){ source[%r"\bhttps?://\S+"] }
|
37
|
+
#
|
38
|
+
# scrapes(:domain, if: :url){ url[%r"://([^/]+)/", 1] }
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# scraper = EmbeddedUrlScraper.new("visit https://example.com/foo for details")
|
42
|
+
# scraper.url # == "https://example.com/foo"
|
43
|
+
# scraper.domain # == "example.com"
|
44
|
+
#
|
45
|
+
# scraper = EmbeddedUrlScraper.new("visit our website for details")
|
46
|
+
# scraper.url # == nil
|
47
|
+
# scraper.domain # == nil
|
8
48
|
#
|
9
49
|
# @param field [Symbol, String]
|
10
|
-
#
|
11
|
-
# @
|
12
|
-
#
|
50
|
+
# @param options [Hash]
|
51
|
+
# @option options :optional [Boolean]
|
52
|
+
# @option options :if [Symbol]
|
53
|
+
# @option options :unless [Symbol]
|
13
54
|
# @yield []
|
14
|
-
# scrapes the value
|
15
55
|
# @yieldreturn [Object]
|
16
|
-
#
|
17
|
-
def self.scrapes(field,
|
56
|
+
# @return [void]
|
57
|
+
def self.scrapes(field, **options, &block)
|
18
58
|
field = field.to_sym
|
19
59
|
self.fields << field
|
20
60
|
|
21
61
|
define_method(field) do
|
22
62
|
raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
|
23
|
-
return @scraped[field] if @scraped.key?(field)
|
24
63
|
|
25
|
-
|
64
|
+
if !@scraped.key?(field) && !@errors.key?(field)
|
26
65
|
begin
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
66
|
+
skip = (options[:if] && !self.send(options[:if])) ||
|
67
|
+
(options[:unless] && self.send(options[:unless]))
|
68
|
+
|
69
|
+
if skip
|
70
|
+
@scraped[field] = nil
|
71
|
+
else
|
72
|
+
@scraped[field] = instance_eval(&block)
|
73
|
+
if @scraped[field].nil?
|
74
|
+
raise FieldValueRequiredError.new(field) unless options[:optional]
|
75
|
+
$log.debug("#{self.class}##{field} is nil")
|
76
|
+
end
|
31
77
|
end
|
32
|
-
@scraped[field] = value
|
33
78
|
rescue RuntimeError, IndexError => e
|
34
79
|
@errors[field] = e
|
35
80
|
end
|
36
81
|
end
|
37
82
|
|
38
|
-
|
39
|
-
|
40
|
-
|
83
|
+
if @errors.key?(field)
|
84
|
+
raise FieldScrapeFailedError.new(field, @errors[field])
|
85
|
+
else
|
86
|
+
@scraped[field]
|
87
|
+
end
|
41
88
|
end
|
42
89
|
end
|
43
90
|
|
44
|
-
#
|
91
|
+
# Fields defined by {scrapes}.
|
45
92
|
#
|
46
93
|
# @return [Array<Symbol>]
|
47
94
|
def self.fields
|
48
|
-
@fields ||= []
|
95
|
+
@fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
|
96
|
+
end
|
97
|
+
|
98
|
+
# Instantiates the Scraper class with the resource specified by +url+.
|
99
|
+
# This method acts as a default factory method, and provides a
|
100
|
+
# standard interface for specialized overrides.
|
101
|
+
#
|
102
|
+
# @example Default factory method
|
103
|
+
# class PostPageScraper < Grubby::PageScraper
|
104
|
+
# # ...
|
105
|
+
# end
|
106
|
+
#
|
107
|
+
# PostPageScraper.scrape("https://example.com/posts/42")
|
108
|
+
# # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
|
109
|
+
#
|
110
|
+
# @example Specialized factory method
|
111
|
+
# class PostApiScraper < Grubby::JsonScraper
|
112
|
+
# # ...
|
113
|
+
#
|
114
|
+
# def self.scrape(url, agent = $grubby)
|
115
|
+
# api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
|
116
|
+
# super(api_url, agent)
|
117
|
+
# end
|
118
|
+
# end
|
119
|
+
#
|
120
|
+
# PostApiScraper.scrape("https://example.com/posts/42")
|
121
|
+
# # == PostApiScraper.new($grubby.get("https://api.example.com/posts/42.json"))
|
122
|
+
#
|
123
|
+
# @param url [String, URI]
|
124
|
+
# @param agent [Mechanize]
|
125
|
+
# @return [Grubby::Scraper]
|
126
|
+
def self.scrape(url, agent = $grubby)
|
127
|
+
self.new(agent.get(url))
|
128
|
+
end
|
129
|
+
|
130
|
+
# Iterates a series of pages, starting at +start_url+. For each page,
|
131
|
+
# the Scraper class is instantiated and passed to the given block.
|
132
|
+
# Subsequent pages in the series are determined by invoking
|
133
|
+
# +next_method+ on each previous scraper instance.
|
134
|
+
#
|
135
|
+
# Iteration stops when the +next_method+ method returns nil. If the
|
136
|
+
# +next_method+ method returns a String or URI, that value will be
|
137
|
+
# treated as the URL of the next page. Otherwise that value will be
|
138
|
+
# treated as the page itself.
|
139
|
+
#
|
140
|
+
# @example
|
141
|
+
# class PostsIndexScraper < Grubby::PageScraper
|
142
|
+
# scrapes(:page_param){ page.uri.query_param("page") }
|
143
|
+
#
|
144
|
+
# def next
|
145
|
+
# page.link_with(text: "Next >")&.click
|
146
|
+
# end
|
147
|
+
# end
|
148
|
+
#
|
149
|
+
# PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
|
150
|
+
# scraper.page_param # == "1", "2", "3", ...
|
151
|
+
# end
|
152
|
+
#
|
153
|
+
# @example
|
154
|
+
# class PostsIndexScraper < Grubby::PageScraper
|
155
|
+
# scrapes(:page_param){ page.uri.query_param("page") }
|
156
|
+
#
|
157
|
+
# scrapes(:next_uri, optional: true) do
|
158
|
+
# page.link_with(text: "Next >")&.to_absolute_uri
|
159
|
+
# end
|
160
|
+
# end
|
161
|
+
#
|
162
|
+
# PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
|
163
|
+
# scraper.page_param # == "1", "2", "3", ...
|
164
|
+
# end
|
165
|
+
#
|
166
|
+
# @param start_url [String, URI]
|
167
|
+
# @param agent [Mechanize]
|
168
|
+
# @param next_method [Symbol]
|
169
|
+
# @yield [scraper]
|
170
|
+
# @yieldparam scraper [Grubby::Scraper]
|
171
|
+
# @return [void]
|
172
|
+
# @raise [NoMethodError]
|
173
|
+
# if Scraper class does not implement +next_method+
|
174
|
+
def self.each(start_url, agent = $grubby, next_method: :next)
|
175
|
+
unless self.method_defined?(next_method)
|
176
|
+
raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
|
177
|
+
end
|
178
|
+
|
179
|
+
return to_enum(:each, start_url, agent, next_method: next_method) unless block_given?
|
180
|
+
|
181
|
+
current = start_url
|
182
|
+
while current
|
183
|
+
current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
|
184
|
+
scraper = self.new(current)
|
185
|
+
yield scraper
|
186
|
+
current = scraper.send(next_method)
|
187
|
+
end
|
49
188
|
end
|
50
189
|
|
51
|
-
# The
|
190
|
+
# The object being scraped. Typically a Mechanize pluggable parser
|
52
191
|
# such as +Mechanize::Page+.
|
53
192
|
#
|
54
193
|
# @return [Object]
|
55
194
|
attr_reader :source
|
56
195
|
|
57
|
-
#
|
58
|
-
# {
|
59
|
-
# be empty.
|
196
|
+
# Collected errors raised during {initialize} by blocks passed to
|
197
|
+
# {scrapes}, indexed by field name. If {initialize} did not raise
|
198
|
+
# +Grubby::Scraper::Error+, this Hash will be empty.
|
60
199
|
#
|
61
200
|
# @return [Hash<Symbol, StandardError>]
|
62
201
|
attr_reader :errors
|
@@ -123,6 +262,7 @@ class Grubby::Scraper
|
|
123
262
|
end
|
124
263
|
end
|
125
264
|
|
265
|
+
# @!visibility private
|
126
266
|
class FieldScrapeFailedError < RuntimeError
|
127
267
|
def initialize(field, field_error)
|
128
268
|
super("`#{field}` raised #{field_error.class}")
|
data/lib/grubby/version.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
GRUBBY_VERSION = "1.1.0"
|
1
|
+
GRUBBY_VERSION = "1.2.0"
|
metadata
CHANGED
@@ -1,27 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grubby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hefner
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '5.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '5.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '3.0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: dumb_delimited
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '1.0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '1.0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: gorge
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -227,8 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
227
213
|
- !ruby/object:Gem::Version
|
228
214
|
version: '0'
|
229
215
|
requirements: []
|
230
|
-
|
231
|
-
rubygems_version: 2.7.6
|
216
|
+
rubygems_version: 3.0.1
|
232
217
|
signing_key:
|
233
218
|
specification_version: 4
|
234
219
|
summary: Fail-fast web scraping
|