grubby 1.2.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +6 -3
- data/CHANGELOG.md +12 -0
- data/Gemfile +3 -0
- data/README.md +140 -92
- data/Rakefile +0 -13
- data/gemfiles/activesupport-6.0.gemfile +3 -0
- data/grubby.gemspec +17 -18
- data/lib/grubby.rb +64 -46
- data/lib/grubby/core_ext/uri.rb +12 -11
- data/lib/grubby/json_parser.rb +1 -27
- data/lib/grubby/json_scraper.rb +6 -2
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -2
- data/lib/grubby/mechanize/link.rb +9 -6
- data/lib/grubby/mechanize/page.rb +4 -2
- data/lib/grubby/mechanize/parser.rb +9 -9
- data/lib/grubby/page_scraper.rb +6 -2
- data/lib/grubby/scraper.rb +86 -60
- data/lib/grubby/version.rb +1 -1
- metadata +17 -69
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -3,9 +3,9 @@ module URI
|
|
3
3
|
# Returns the basename of the URI's +path+, a la +File.basename+.
|
4
4
|
#
|
5
5
|
# @example
|
6
|
-
# URI("
|
7
|
-
# URI("
|
8
|
-
# URI("
|
6
|
+
# URI("https://example.com/foo/bar").basename # == "bar"
|
7
|
+
# URI("https://example.com/foo").basename # == "foo"
|
8
|
+
# URI("https://example.com/").basename # == ""
|
9
9
|
#
|
10
10
|
# @return [String]
|
11
11
|
def basename
|
@@ -20,16 +20,16 @@ module URI
|
|
20
20
|
# Otherwise, only the last occurrence is returned.
|
21
21
|
#
|
22
22
|
# @example
|
23
|
-
# URI("
|
23
|
+
# URI("https://example.com/?foo=a").query_param("foo") # == "a"
|
24
24
|
#
|
25
|
-
# URI("
|
26
|
-
# URI("
|
25
|
+
# URI("https://example.com/?foo=a&foo=b").query_param("foo") # == "b"
|
26
|
+
# URI("https://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
|
27
27
|
#
|
28
|
-
# URI("
|
29
|
-
# URI("
|
28
|
+
# URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
|
29
|
+
# URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
|
30
30
|
#
|
31
|
-
# URI("
|
32
|
-
# URI("
|
31
|
+
# URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
|
32
|
+
# URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
|
33
33
|
#
|
34
34
|
# @param name [String]
|
35
35
|
# @return [String, Array<String>, nil]
|
@@ -38,7 +38,8 @@ module URI
|
|
38
38
|
(values.nil? || name.include?("[]")) ? values : values.last
|
39
39
|
end
|
40
40
|
|
41
|
-
# Raises an exception if the URI is not +absolute?+.
|
41
|
+
# Raises an exception if the URI is not +absolute?+. Otherwise,
|
42
|
+
# returns the URI.
|
42
43
|
#
|
43
44
|
# @return [self]
|
44
45
|
# @raise [RuntimeError]
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -1,31 +1,5 @@
|
|
1
1
|
class Grubby::JsonParser < Mechanize::File
|
2
2
|
|
3
|
-
# Returns the options to use when parsing JSON. The returned options
|
4
|
-
# Hash is not +dup+ed and can be modified directly. Any modifications
|
5
|
-
# will be applied to all future parsing.
|
6
|
-
#
|
7
|
-
# For information about available options, see
|
8
|
-
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
9
|
-
# +JSON.parse+}.
|
10
|
-
#
|
11
|
-
# @return [Hash]
|
12
|
-
def self.json_parse_options
|
13
|
-
@json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
|
14
|
-
end
|
15
|
-
|
16
|
-
# Sets the options to use when parsing JSON. The entire options Hash
|
17
|
-
# is replaced, and the new value will be applied to all future
|
18
|
-
# parsing. To set options individually, see {json_parse_options}.
|
19
|
-
#
|
20
|
-
# For information about available options, see
|
21
|
-
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
22
|
-
# +JSON.parse+}.
|
23
|
-
#
|
24
|
-
# @param options [Hash]
|
25
|
-
def self.json_parse_options=(options)
|
26
|
-
@json_parse_options = options
|
27
|
-
end
|
28
|
-
|
29
3
|
# The parsed JSON data.
|
30
4
|
#
|
31
5
|
# @return [Hash, Array]
|
@@ -37,7 +11,7 @@ class Grubby::JsonParser < Mechanize::File
|
|
37
11
|
attr_accessor :mech
|
38
12
|
|
39
13
|
def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
|
40
|
-
@json =
|
14
|
+
@json = JSON.load(body, nil, create_additions: false)
|
41
15
|
@mech = mech
|
42
16
|
super(uri, response, body, code)
|
43
17
|
end
|
data/lib/grubby/json_scraper.rb
CHANGED
@@ -6,8 +6,10 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
6
6
|
attr_reader :json
|
7
7
|
|
8
8
|
# @param source [Grubby::JsonParser]
|
9
|
+
# @raise [Grubby::Scraper::Error]
|
10
|
+
# if any {Scraper.scrapes} blocks fail
|
9
11
|
def initialize(source)
|
10
|
-
@json = source.
|
12
|
+
@json = source.assert!(Grubby::JsonParser).json
|
11
13
|
super
|
12
14
|
end
|
13
15
|
|
@@ -19,11 +21,13 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
19
21
|
# # ...
|
20
22
|
# end
|
21
23
|
#
|
22
|
-
# MyScraper.scrape_file("path/to/local_file.json")
|
24
|
+
# MyScraper.scrape_file("path/to/local_file.json") # === MyScraper
|
23
25
|
#
|
24
26
|
# @param path [String]
|
25
27
|
# @param agent [Mechanize]
|
26
28
|
# @return [Grubby::JsonScraper]
|
29
|
+
# @raise [Grubby::Scraper::Error]
|
30
|
+
# if any {Scraper.scrapes} blocks fail
|
27
31
|
def self.scrape_file(path, agent = $grubby)
|
28
32
|
self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
|
29
33
|
end
|
data/lib/grubby/mechanize/file.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
+
# @!visibility private
|
1
2
|
class Mechanize::File
|
2
3
|
|
3
|
-
# @!visibility private
|
4
4
|
def self.read_local(path)
|
5
5
|
uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
|
6
6
|
self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
|
7
7
|
end
|
8
8
|
|
9
|
-
# @!visibility private
|
10
9
|
def content_hash
|
11
10
|
@content_hash ||= self.body.to_s.sha1
|
12
11
|
end
|
data/lib/grubby/mechanize/link.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
class Mechanize::Page::Link
|
2
2
|
|
3
3
|
# Returns the URI represented by the Link, in absolute form. If the
|
4
|
-
# href attribute of the Link is expressed in relative form, the URI
|
5
|
-
#
|
4
|
+
# href attribute of the Link is expressed in relative form, the URI is
|
5
|
+
# converted to absolute form using the Link's +page.uri+. Raises an
|
6
|
+
# exception if the URI cannot be converted to absolute form.
|
6
7
|
#
|
7
8
|
# @return [URI]
|
9
|
+
# @raise [RuntimeError]
|
10
|
+
# if the URI cannot be converted to absolute form
|
8
11
|
def to_absolute_uri
|
9
|
-
# Via the W3 spec: "If the a element has no href attribute, then
|
10
|
-
# element represents a placeholder for where a link might
|
11
|
-
# have been placed, if it had been relevant, consisting of
|
12
|
-
# element's contents."
|
12
|
+
# Via the W3 spec[1]: "If the a element has no href attribute, then
|
13
|
+
# the element represents a placeholder for where a link might
|
14
|
+
# otherwise have been placed, if it had been relevant, consisting of
|
15
|
+
# just the element's contents." So, we assume a link with no href
|
13
16
|
# attribute (i.e. `uri == nil`) should be treated the same as an
|
14
17
|
# intra-page link.
|
15
18
|
#
|
data/lib/grubby/mechanize/page.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
class Mechanize::Page
|
2
2
|
|
3
3
|
# @!method search!(*queries)
|
4
|
-
# See
|
4
|
+
# See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21
|
5
|
+
# +Nokogiri::XML::Searchable#search!+}.
|
5
6
|
#
|
6
7
|
# @param queries [Array<String>]
|
7
8
|
# @return [Nokogiri::XML::NodeSet]
|
@@ -10,7 +11,8 @@ class Mechanize::Page
|
|
10
11
|
def_delegators :parser, :search!
|
11
12
|
|
12
13
|
# @!method at!(*queries)
|
13
|
-
# See
|
14
|
+
# See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21
|
15
|
+
# +Nokogiri::XML::Searchable#at!+}.
|
14
16
|
#
|
15
17
|
# @param queries [Array<String>]
|
16
18
|
# @return [Nokogiri::XML::Element]
|
data/lib/grubby/mechanize/parser.rb
CHANGED
@@ -2,15 +2,15 @@ require "fileutils"
|
|
2
2
|
|
3
3
|
module Mechanize::Parser
|
4
4
|
|
5
|
-
# Saves the payload to a specified directory,
|
5
|
+
# Saves the payload to a specified directory, using the default
|
6
6
|
# filename suggested by the server. If a file with that name already
|
7
7
|
# exists, this method will try to find a free filename by appending
|
8
|
-
# numbers to the
|
8
|
+
# numbers to the default filename. Returns the full path of the saved
|
9
9
|
# file.
|
10
10
|
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
11
|
+
# @note This method expects a +#save!+ method to be defined by the
|
12
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
13
|
+
# and +Mechanize::Download#save!+.
|
14
14
|
#
|
15
15
|
# @param directory [String]
|
16
16
|
# @return [String]
|
@@ -23,14 +23,14 @@ module Mechanize::Parser
|
|
23
23
|
path
|
24
24
|
end
|
25
25
|
|
26
|
-
# Saves the payload to a specified directory,
|
26
|
+
# Saves the payload to a specified directory, using the default
|
27
27
|
# filename suggested by the server. If a file with that name already
|
28
28
|
# exists, that file will be overwritten. Returns the full path of the
|
29
29
|
# saved file.
|
30
30
|
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
31
|
+
# @note This method expects a +#save!+ method to be defined by the
|
32
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
33
|
+
# and +Mechanize::Download#save!+.
|
34
34
|
#
|
35
35
|
# @param directory [String]
|
36
36
|
# @return [String]
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -6,8 +6,10 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
6
6
|
attr_reader :page
|
7
7
|
|
8
8
|
# @param source [Mechanize::Page]
|
9
|
+
# @raise [Grubby::Scraper::Error]
|
10
|
+
# if any {Scraper.scrapes} blocks fail
|
9
11
|
def initialize(source)
|
10
|
-
@page = source.
|
12
|
+
@page = source.assert!(Mechanize::Page)
|
11
13
|
super
|
12
14
|
end
|
13
15
|
|
@@ -19,11 +21,13 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
19
21
|
# # ...
|
20
22
|
# end
|
21
23
|
#
|
22
|
-
# MyScraper.scrape_file("path/to/local_file.html")
|
24
|
+
# MyScraper.scrape_file("path/to/local_file.html") # === MyScraper
|
23
25
|
#
|
24
26
|
# @param path [String]
|
25
27
|
# @param agent [Mechanize]
|
26
28
|
# @return [Grubby::PageScraper]
|
29
|
+
# @raise [Grubby::Scraper::Error]
|
30
|
+
# if any {Scraper.scrapes} blocks fail
|
27
31
|
def self.scrape_file(path, agent = $grubby)
|
28
32
|
self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
|
29
33
|
end
|
data/lib/grubby/scraper.rb
CHANGED
@@ -1,57 +1,68 @@
|
|
1
1
|
class Grubby::Scraper
|
2
2
|
|
3
3
|
# Defines an attribute reader method named by +field+. During
|
4
|
-
#
|
4
|
+
# {initialize}, the given block is called, and the attribute is set to
|
5
5
|
# the block's return value.
|
6
6
|
#
|
7
|
-
# By default, if the block's return value is nil
|
8
|
-
#
|
7
|
+
# By default, raises an exception if the block's return value is nil.
|
8
|
+
# To prevent this behavior, set the +:optional+ option to true.
|
9
|
+
# Alternatively, the block can be conditionally evaluated, based on
|
10
|
+
# another method's return value, using the +:if+ or +:unless+ options.
|
9
11
|
#
|
10
|
-
#
|
11
|
-
# method's return value, using the +:if+ or +:unless+ options.
|
12
|
-
#
|
13
|
-
# @example
|
12
|
+
# @example Default behavior
|
14
13
|
# class GreetingScraper < Grubby::Scraper
|
15
|
-
# scrapes(:
|
16
|
-
# source[
|
14
|
+
# scrapes(:name) do
|
15
|
+
# source[/Hello (\w+)/, 1]
|
17
16
|
# end
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# scraper = GreetingScraper.new("Hello World!")
|
20
|
+
# scraper.name # == "World"
|
21
|
+
#
|
22
|
+
# scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
|
18
23
|
#
|
19
|
-
#
|
20
|
-
#
|
24
|
+
# @example Optional scraped value
|
25
|
+
# class GreetingScraper < Grubby::Scraper
|
26
|
+
# scrapes(:name, optional: true) do
|
27
|
+
# source[/Hello (\w+)/, 1]
|
21
28
|
# end
|
22
29
|
# end
|
23
30
|
#
|
24
31
|
# scraper = GreetingScraper.new("Hello World!")
|
25
|
-
# scraper.
|
26
|
-
# scraper.recipient # == "World"
|
32
|
+
# scraper.name # == "World"
|
27
33
|
#
|
28
|
-
# scraper = GreetingScraper.new("
|
29
|
-
# scraper.
|
30
|
-
# scraper.recipient # == nil
|
34
|
+
# scraper = GreetingScraper.new("Hello!")
|
35
|
+
# scraper.name # == nil
|
31
36
|
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
+
# @example Conditional scraped value
|
38
|
+
# class GreetingScraper < Grubby::Scraper
|
39
|
+
# def hello?
|
40
|
+
# source.start_with?("Hello ")
|
41
|
+
# end
|
37
42
|
#
|
38
|
-
# scrapes(:
|
43
|
+
# scrapes(:name, if: :hello?) do
|
44
|
+
# source[/Hello (\w+)/, 1]
|
45
|
+
# end
|
39
46
|
# end
|
40
47
|
#
|
41
|
-
# scraper =
|
42
|
-
# scraper.
|
43
|
-
#
|
48
|
+
# scraper = GreetingScraper.new("Hello World!")
|
49
|
+
# scraper.name # == "World"
|
50
|
+
#
|
51
|
+
# scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
|
44
52
|
#
|
45
|
-
# scraper =
|
46
|
-
# scraper.
|
47
|
-
# scraper.domain # == nil
|
53
|
+
# scraper = GreetingScraper.new("How are you?")
|
54
|
+
# scraper.name # == nil
|
48
55
|
#
|
49
56
|
# @param field [Symbol, String]
|
50
57
|
# @param options [Hash]
|
51
|
-
# @option options :optional [Boolean]
|
52
|
-
#
|
53
|
-
# @option options :
|
54
|
-
#
|
58
|
+
# @option options :optional [Boolean] (false)
|
59
|
+
# Whether the block should be allowed to return a nil value
|
60
|
+
# @option options :if [Symbol] (nil)
|
61
|
+
# Name of predicate method that determines if the block should be
|
62
|
+
# evaluated
|
63
|
+
# @option options :unless [Symbol] (nil)
|
64
|
+
# Name of predicate method that determines if the block should not
|
65
|
+
# be evaluated
|
55
66
|
# @yieldreturn [Object]
|
56
67
|
# @return [void]
|
57
68
|
def self.scrapes(field, **options, &block)
|
@@ -88,16 +99,16 @@ class Grubby::Scraper
|
|
88
99
|
end
|
89
100
|
end
|
90
101
|
|
91
|
-
# Fields defined
|
102
|
+
# Fields defined via {scrapes}.
|
92
103
|
#
|
93
104
|
# @return [Array<Symbol>]
|
94
105
|
def self.fields
|
95
106
|
@fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
|
96
107
|
end
|
97
108
|
|
98
|
-
# Instantiates the Scraper class with the resource
|
109
|
+
# Instantiates the Scraper class with the resource indicated by +url+.
|
99
110
|
# This method acts as a default factory method, and provides a
|
100
|
-
# standard interface for
|
111
|
+
# standard interface for overrides.
|
101
112
|
#
|
102
113
|
# @example Default factory method
|
103
114
|
# class PostPageScraper < Grubby::PageScraper
|
@@ -107,12 +118,12 @@ class Grubby::Scraper
|
|
107
118
|
# PostPageScraper.scrape("https://example.com/posts/42")
|
108
119
|
# # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
|
109
120
|
#
|
110
|
-
# @example
|
121
|
+
# @example Override factory method
|
111
122
|
# class PostApiScraper < Grubby::JsonScraper
|
112
123
|
# # ...
|
113
124
|
#
|
114
|
-
# def self.
|
115
|
-
# api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
|
125
|
+
# def self.scrape(url, agent = $grubby)
|
126
|
+
# api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
|
116
127
|
# super(api_url, agent)
|
117
128
|
# end
|
118
129
|
# end
|
@@ -123,54 +134,65 @@ class Grubby::Scraper
|
|
123
134
|
# @param url [String, URI]
|
124
135
|
# @param agent [Mechanize]
|
125
136
|
# @return [Grubby::Scraper]
|
137
|
+
# @raise [Grubby::Scraper::Error]
|
138
|
+
# if any {Scraper.scrapes} blocks fail
|
126
139
|
def self.scrape(url, agent = $grubby)
|
127
140
|
self.new(agent.get(url))
|
128
141
|
end
|
129
142
|
|
130
143
|
# Iterates a series of pages, starting at +start+. The Scraper class
|
131
|
-
# is instantiated with each page, and each instance is passed
|
132
|
-
# given block. Subsequent pages in the series are determined
|
133
|
-
# invoking the +next_method+ method on each
|
144
|
+
# is instantiated with each page, and each Scraper instance is passed
|
145
|
+
# to the given block. Subsequent pages in the series are determined
|
146
|
+
# by invoking the +next_method+ method on each Scraper instance.
|
134
147
|
#
|
135
|
-
# Iteration stops when the +next_method+ method returns
|
148
|
+
# Iteration stops when the +next_method+ method returns falsy. If the
|
136
149
|
# +next_method+ method returns a String or URI, that value will be
|
137
150
|
# treated as the URL of the next page. Otherwise that value will be
|
138
151
|
# treated as the page itself.
|
139
152
|
#
|
140
|
-
# @example
|
153
|
+
# @example Iterate from page object
|
141
154
|
# class PostsIndexScraper < Grubby::PageScraper
|
142
|
-
# scrapes(:page_param){ page.uri.query_param("page") }
|
143
|
-
#
|
144
155
|
# def next
|
145
156
|
# page.link_with(text: "Next >")&.click
|
146
157
|
# end
|
147
158
|
# end
|
148
159
|
#
|
149
160
|
# PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
|
150
|
-
# scraper.
|
161
|
+
# scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
|
151
162
|
# end
|
152
163
|
#
|
153
|
-
# @example
|
164
|
+
# @example Iterate from URI
|
154
165
|
# class PostsIndexScraper < Grubby::PageScraper
|
155
|
-
#
|
166
|
+
# def next
|
167
|
+
# page.link_with(text: "Next >")&.to_absolute_uri
|
168
|
+
# end
|
169
|
+
# end
|
156
170
|
#
|
171
|
+
# PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
|
172
|
+
# scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
|
173
|
+
# end
|
174
|
+
#
|
175
|
+
# @example Specifying the iteration method
|
176
|
+
# class PostsIndexScraper < Grubby::PageScraper
|
157
177
|
# scrapes(:next_uri, optional: true) do
|
158
178
|
# page.link_with(text: "Next >")&.to_absolute_uri
|
159
179
|
# end
|
160
180
|
# end
|
161
181
|
#
|
162
182
|
# PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
|
163
|
-
# scraper.
|
183
|
+
# scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
|
164
184
|
# end
|
165
185
|
#
|
166
186
|
# @param start [String, URI, Mechanize::Page, Mechanize::File]
|
167
187
|
# @param agent [Mechanize]
|
168
188
|
# @param next_method [Symbol]
|
169
|
-
# @yield [scraper]
|
170
189
|
# @yieldparam scraper [Grubby::Scraper]
|
171
190
|
# @return [void]
|
172
191
|
# @raise [NoMethodError]
|
173
|
-
# if Scraper class does not
|
192
|
+
# if the Scraper class does not define the method indicated by
|
193
|
+
# +next_method+
|
194
|
+
# @raise [Grubby::Scraper::Error]
|
195
|
+
# if any {Scraper.scrapes} blocks fail
|
174
196
|
def self.each(start, agent = $grubby, next_method: :next)
|
175
197
|
unless self.method_defined?(next_method)
|
176
198
|
raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
|
@@ -187,22 +209,22 @@ class Grubby::Scraper
|
|
187
209
|
end
|
188
210
|
end
|
189
211
|
|
190
|
-
# The object being scraped. Typically a Mechanize
|
191
|
-
# such as +Mechanize::Page+.
|
212
|
+
# The object being scraped. Typically an instance of a Mechanize
|
213
|
+
# pluggable parser such as +Mechanize::Page+.
|
192
214
|
#
|
193
215
|
# @return [Object]
|
194
216
|
attr_reader :source
|
195
217
|
|
196
|
-
# Collected errors raised during {initialize} by
|
197
|
-
#
|
198
|
-
# +Grubby::Scraper::Error
|
218
|
+
# Collected errors raised during {initialize} by {Scraper.scrapes}
|
219
|
+
# blocks, indexed by field name. This Hash will be empty if
|
220
|
+
# {initialize} did not raise a +Grubby::Scraper::Error+.
|
199
221
|
#
|
200
|
-
# @return [Hash
|
222
|
+
# @return [Hash{Symbol => StandardError}]
|
201
223
|
attr_reader :errors
|
202
224
|
|
203
225
|
# @param source
|
204
226
|
# @raise [Grubby::Scraper::Error]
|
205
|
-
# if any
|
227
|
+
# if any {Scraper.scrapes} blocks fail
|
206
228
|
def initialize(source)
|
207
229
|
@source = source
|
208
230
|
@scraped = {}
|
@@ -230,22 +252,25 @@ class Grubby::Scraper
|
|
230
252
|
|
231
253
|
# Returns all scraped values as a Hash.
|
232
254
|
#
|
233
|
-
# @return [Hash
|
255
|
+
# @return [Hash{Symbol => Object}]
|
234
256
|
def to_h
|
235
257
|
@scraped.dup
|
236
258
|
end
|
237
259
|
|
238
260
|
class Error < RuntimeError
|
261
|
+
# @!visibility private
|
239
262
|
BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
|
240
263
|
cleaner.add_silencer do |line|
|
241
264
|
line.include?(__dir__) && line.include?("scraper.rb:")
|
242
265
|
end
|
243
266
|
end
|
244
267
|
|
268
|
+
# The Scraper that raised this Error.
|
269
|
+
#
|
245
270
|
# @return [Grubby::Scraper]
|
246
|
-
# The Scraper that raised this error.
|
247
271
|
attr_accessor :scraper
|
248
272
|
|
273
|
+
# @!visibility private
|
249
274
|
def initialize(scraper)
|
250
275
|
self.scraper = scraper
|
251
276
|
|
@@ -269,6 +294,7 @@ class Grubby::Scraper
|
|
269
294
|
end
|
270
295
|
end
|
271
296
|
|
297
|
+
# @!visibility private
|
272
298
|
class FieldValueRequiredError < RuntimeError
|
273
299
|
def initialize(field)
|
274
300
|
super("`#{field}` is nil but is not marked as optional")
|