grubby 1.2.1 → 2.0.0
This diff shows the changes between publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +6 -3
- data/CHANGELOG.md +12 -0
- data/Gemfile +3 -0
- data/README.md +140 -92
- data/Rakefile +0 -13
- data/gemfiles/activesupport-6.0.gemfile +3 -0
- data/grubby.gemspec +17 -18
- data/lib/grubby.rb +64 -46
- data/lib/grubby/core_ext/uri.rb +12 -11
- data/lib/grubby/json_parser.rb +1 -27
- data/lib/grubby/json_scraper.rb +6 -2
- data/lib/grubby/mechanize/download.rb +1 -1
- data/lib/grubby/mechanize/file.rb +1 -2
- data/lib/grubby/mechanize/link.rb +9 -6
- data/lib/grubby/mechanize/page.rb +4 -2
- data/lib/grubby/mechanize/parser.rb +9 -9
- data/lib/grubby/page_scraper.rb +6 -2
- data/lib/grubby/scraper.rb +86 -60
- data/lib/grubby/version.rb +1 -1
- metadata +17 -69
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -3,9 +3,9 @@ module URI
   # Returns the basename of the URI's +path+, a la +File.basename+.
   #
   # @example
-  #   URI("
-  #   URI("
-  #   URI("
+  #   URI("https://example.com/foo/bar").basename  # == "bar"
+  #   URI("https://example.com/foo").basename      # == "foo"
+  #   URI("https://example.com/").basename         # == ""
   #
   # @return [String]
   def basename
@@ -20,16 +20,16 @@ module URI
   # Otherwise, only the last occurrence is returned.
   #
   # @example
-  #   URI("
+  #   URI("https://example.com/?foo=a").query_param("foo")  # == "a"
   #
-  #   URI("
-  #   URI("
+  #   URI("https://example.com/?foo=a&foo=b").query_param("foo")    # == "b"
+  #   URI("https://example.com/?foo=a&foo=b").query_param("foo[]")  # == nil
   #
-  #   URI("
-  #   URI("
+  #   URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo")    # == nil
+  #   URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]")  # == ["a", "b"]
   #
-  #   URI("
-  #   URI("
+  #   URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]")     # == nil
+  #   URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]")  # == ["a"]
   #
   # @param name [String]
   # @return [String, Array<String>, nil]
@@ -38,7 +38,8 @@ module URI
     (values.nil? || name.include?("[]")) ? values : values.last
   end

-  # Raises an exception if the URI is not +absolute?+.
+  # Raises an exception if the URI is not +absolute?+. Otherwise,
+  # returns the URI.
   #
   # @return [self]
   # @raise [RuntimeError]
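The new doc examples above double as a usage sketch for grubby's URI helpers. A minimal, hedged example based on those examples (the URL is illustrative; grubby must be installed and loaded):

  require "grubby"

  uri = URI("https://example.com/foo/bar?foo=a&foo=b")
  uri.basename            # == "bar"
  uri.query_param("foo")  # == "b"   (for non-"[]" params, the last occurrence wins)
  uri.query_param("x")    # == nil   (param not present)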
data/lib/grubby/json_parser.rb
CHANGED
@@ -1,31 +1,5 @@
 class Grubby::JsonParser < Mechanize::File

-  # Returns the options to use when parsing JSON. The returned options
-  # Hash is not +dup+ed and can be modified directly. Any modifications
-  # will be applied to all future parsing.
-  #
-  # For information about available options, see
-  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
-  # +JSON.parse+}.
-  #
-  # @return [Hash]
-  def self.json_parse_options
-    @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
-  end
-
-  # Sets the options to use when parsing JSON. The entire options Hash
-  # is replaced, and the new value will be applied to all future
-  # parsing. To set options individually, see {json_parse_options}.
-  #
-  # For information about available options, see
-  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
-  # +JSON.parse+}.
-  #
-  # @param options [Hash]
-  def self.json_parse_options=(options)
-    @json_parse_options = options
-  end
-
   # The parsed JSON data.
   #
   # @return [Hash, Array]
@@ -37,7 +11,7 @@ class Grubby::JsonParser < Mechanize::File
   attr_accessor :mech

   def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
-    @json =
+    @json = JSON.load(body, nil, create_additions: false)
     @mech = mech
     super(uri, response, body, code)
   end
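With the `json_parse_options` accessors removed, JSON parsing is no longer configurable per application; every response body is now parsed with `create_additions: false`. A standalone sketch of the equivalent stdlib call (the sample JSON is illustrative):

  require "json"

  body = '{"posts": [{"id": 42, "title": "Hello"}]}'
  data = JSON.load(body, nil, create_additions: false)  # the same call the new initializer makes
  data["posts"].first["id"]  # == 42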
data/lib/grubby/json_scraper.rb
CHANGED
@@ -6,8 +6,10 @@ class Grubby::JsonScraper < Grubby::Scraper
   attr_reader :json

   # @param source [Grubby::JsonParser]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
-    @json = source.
+    @json = source.assert!(Grubby::JsonParser).json
     super
   end

@@ -19,11 +21,13 @@ class Grubby::JsonScraper < Grubby::Scraper
   #     # ...
   #   end
   #
-  #   MyScraper.scrape_file("path/to/local_file.json")
+  #   MyScraper.scrape_file("path/to/local_file.json")  # === MyScraper
   #
   # @param path [String]
   # @param agent [Mechanize]
   # @return [Grubby::JsonScraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape_file(path, agent = $grubby)
     self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
   end
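A sketch of a JsonScraper subclass exercising the behavior documented above; the class and field names are hypothetical, but `scrapes`, `json`, and `scrape_file` come straight from the diff:

  class PostScraper < Grubby::JsonScraper
    scrapes(:id){ json["id"] }                        # raises Grubby::Scraper::Error if nil
    scrapes(:title, optional: true){ json["title"] }  # nil is allowed
  end

  scraper = PostScraper.scrape_file("path/to/local_file.json")
  scraper.id
  scraper.to_h  # all scraped values as a Hash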
data/lib/grubby/mechanize/file.rb
CHANGED
@@ -1,12 +1,11 @@
+# @!visibility private
 class Mechanize::File

-  # @!visibility private
   def self.read_local(path)
     uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
     self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
   end

-  # @!visibility private
   def content_hash
     @content_hash ||= self.body.to_s.sha1
   end
data/lib/grubby/mechanize/link.rb
CHANGED
@@ -1,15 +1,18 @@
 class Mechanize::Page::Link

   # Returns the URI represented by the Link, in absolute form. If the
-  # href attribute of the Link is expressed in relative form, the URI
-  #
+  # href attribute of the Link is expressed in relative form, the URI is
+  # converted to absolute form using the Link's +page.uri+. Raises an
+  # exception if the URI cannot be converted to absolute form.
   #
   # @return [URI]
+  # @raise [RuntimeError]
+  #   if the URI cannot be converted to absolute form
   def to_absolute_uri
-    # Via the W3 spec: "If the a element has no href attribute, then
-    # element represents a placeholder for where a link might
-    # have been placed, if it had been relevant, consisting of
-    # element's contents."
+    # Via the W3 spec[1]: "If the a element has no href attribute, then
+    # the element represents a placeholder for where a link might
+    # otherwise have been placed, if it had been relevant, consisting of
+    # just the element's contents." So, we assume a link with no href
     # attribute (i.e. `uri == nil`) should be treated the same as an
     # intra-page link.
     #
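`to_absolute_uri` is what the pagination examples in scraper.rb rely on. A hedged sketch of direct use (URL and link text are illustrative):

  page = $grubby.get("https://example.com/posts?page=1")
  next_link = page.link_with(text: "Next >")
  next_link&.to_absolute_uri  # absolute URI of the next page, or raises if it cannot be absolutized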
data/lib/grubby/mechanize/page.rb
CHANGED
@@ -1,7 +1,8 @@
 class Mechanize::Page

   # @!method search!(*queries)
-  #   See
+  #   See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21
+  #   +Nokogiri::XML::Searchable#search!+}.
   #
   #   @param queries [Array<String>]
   #   @return [Nokogiri::XML::NodeSet]
@@ -10,7 +11,8 @@ class Mechanize::Page
   def_delegators :parser, :search!

   # @!method at!(*queries)
-  #   See
+  #   See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21
+  #   +Nokogiri::XML::Searchable#at!+}.
   #
   #   @param queries [Array<String>]
   #   @return [Nokogiri::XML::Element]
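`search!` and `at!` are delegated to ryoba's Nokogiri extensions, so a missing element raises instead of silently returning nil. A short sketch (selectors and URL are illustrative):

  page = $grubby.get("https://example.com/posts/42")
  page.at!("h1").text      # raises if no <h1> is found, rather than returning nil
  page.search!("article")  # raises if the result would be an empty NodeSet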
data/lib/grubby/mechanize/parser.rb
CHANGED
@@ -2,15 +2,15 @@ require "fileutils"

 module Mechanize::Parser

-  # Saves the payload to a specified directory,
+  # Saves the payload to a specified directory, using the default
   # filename suggested by the server. If a file with that name already
   # exists, this method will try to find a free filename by appending
-  # numbers to the
+  # numbers to the default filename. Returns the full path of the saved
   # file.
   #
-  #
-  #
-  #
+  # @note This method expects a +#save!+ method to be defined by the
+  #   class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
+  #   and +Mechanize::Download#save!+.
   #
   # @param directory [String]
   # @return [String]
@@ -23,14 +23,14 @@ module Mechanize::Parser
     path
   end

-  # Saves the payload to a specified directory,
+  # Saves the payload to a specified directory, using the default
   # filename suggested by the server. If a file with that name already
   # exists, that file will be overwritten. Returns the full path of the
   # saved file.
   #
-  #
-  #
-  #
+  # @note This method expects a +#save!+ method to be defined by the
+  #   class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
+  #   and +Mechanize::Download#save!+.
   #
   # @param directory [String]
   # @return [String]
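The two doc comments above describe grubby's directory-saving helpers for `Mechanize::File` and `Mechanize::Download`. The diff shows only the documentation, not the method definitions, so the names below (`save_to`, `save_to!`) are assumed from grubby's README rather than from this diff; the URL is illustrative:

  download = $grubby.get("https://example.com/files/report.csv")  # a Mechanize::Download
  download.save_to("downloads")   # assumed name; appends numbers to the default filename if it exists
  download.save_to!("downloads")  # assumed name; overwrites and returns the full path of the saved file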
data/lib/grubby/page_scraper.rb
CHANGED
@@ -6,8 +6,10 @@ class Grubby::PageScraper < Grubby::Scraper
   attr_reader :page

   # @param source [Mechanize::Page]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
-    @page = source.
+    @page = source.assert!(Mechanize::Page)
     super
   end

@@ -19,11 +21,13 @@ class Grubby::PageScraper < Grubby::Scraper
   #     # ...
   #   end
   #
-  #   MyScraper.scrape_file("path/to/local_file.html")
+  #   MyScraper.scrape_file("path/to/local_file.html")  # === MyScraper
   #
   # @param path [String]
   # @param agent [Mechanize]
   # @return [Grubby::PageScraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape_file(path, agent = $grubby)
     self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
   end
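A minimal PageScraper sketch combining the behavior documented above with ryoba's `at!`; the class name and selector are hypothetical:

  class PostPageScraper < Grubby::PageScraper
    scrapes(:title){ page.at!("h1").text }
  end

  scraper = PostPageScraper.scrape_file("path/to/local_file.html")  # raises Grubby::Scraper::Error if any block fails
  scraper.title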
data/lib/grubby/scraper.rb
CHANGED
@@ -1,57 +1,68 @@
 class Grubby::Scraper

   # Defines an attribute reader method named by +field+. During
-  #
+  # {initialize}, the given block is called, and the attribute is set to
   # the block's return value.
   #
-  # By default, if the block's return value is nil
-  #
+  # By default, raises an exception if the block's return value is nil.
+  # To prevent this behavior, set the +:optional+ option to true.
+  # Alternatively, the block can be conditionally evaluated, based on
+  # another method's return value, using the +:if+ or +:unless+ options.
   #
-  #
-  # method's return value, using the +:if+ or +:unless+ options.
-  #
-  # @example
+  # @example Default behavior
   #   class GreetingScraper < Grubby::Scraper
-  #     scrapes(:
-  #       source[
+  #     scrapes(:name) do
+  #       source[/Hello (\w+)/, 1]
   #     end
+  #   end
+  #
+  #   scraper = GreetingScraper.new("Hello World!")
+  #   scraper.name  # == "World"
+  #
+  #   scraper = GreetingScraper.new("Hello!")  # raises Grubby::Scraper::Error
   #
-  #
-  #
+  # @example Optional scraped value
+  #   class GreetingScraper < Grubby::Scraper
+  #     scrapes(:name, optional: true) do
+  #       source[/Hello (\w+)/, 1]
   #     end
   #   end
   #
   #   scraper = GreetingScraper.new("Hello World!")
-  #   scraper.
-  #   scraper.recipient  # == "World"
+  #   scraper.name  # == "World"
   #
-  #   scraper = GreetingScraper.new("
-  #   scraper.
-  #   scraper.recipient  # == nil
+  #   scraper = GreetingScraper.new("Hello!")
+  #   scraper.name  # == nil
   #
-  #
-  #
-  #
-  #
-  #
+  # @example Conditional scraped value
+  #   class GreetingScraper < Grubby::Scraper
+  #     def hello?
+  #       source.start_with?("Hello ")
+  #     end
   #
-  #     scrapes(:
+  #     scrapes(:name, if: :hello?) do
+  #       source[/Hello (\w+)/, 1]
+  #     end
   #   end
   #
-  #   scraper =
-  #   scraper.
-  #
+  #   scraper = GreetingScraper.new("Hello World!")
+  #   scraper.name  # == "World"
+  #
+  #   scraper = GreetingScraper.new("Hello!")  # raises Grubby::Scraper::Error
   #
-  #   scraper =
-  #   scraper.
-  #   scraper.domain  # == nil
+  #   scraper = GreetingScraper.new("How are you?")
+  #   scraper.name  # == nil
   #
   # @param field [Symbol, String]
   # @param options [Hash]
-  # @option options :optional [Boolean]
-  #
-  # @option options :
-  #
+  # @option options :optional [Boolean] (false)
+  #   Whether the block should be allowed to return a nil value
+  # @option options :if [Symbol] (nil)
+  #   Name of predicate method that determines if the block should be
+  #   evaluated
+  # @option options :unless [Symbol] (nil)
+  #   Name of predicate method that determines if the block should not
+  #   be evaluated
   # @yieldreturn [Object]
   # @return [void]
   def self.scrapes(field, **options, &block)
@@ -88,16 +99,16 @@ class Grubby::Scraper
     end
   end

-  # Fields defined
+  # Fields defined via {scrapes}.
   #
   # @return [Array<Symbol>]
   def self.fields
     @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
   end

-  # Instantiates the Scraper class with the resource
+  # Instantiates the Scraper class with the resource indicated by +url+.
   # This method acts as a default factory method, and provides a
-  # standard interface for
+  # standard interface for overrides.
   #
   # @example Default factory method
   #   class PostPageScraper < Grubby::PageScraper
@@ -107,12 +118,12 @@ class Grubby::Scraper
   #   PostPageScraper.scrape("https://example.com/posts/42")
   #     # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
   #
-  # @example
+  # @example Override factory method
   #   class PostApiScraper < Grubby::JsonScraper
   #     # ...
   #
-  #     def self.
-  #       api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
+  #     def self.scrape(url, agent = $grubby)
+  #       api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
   #       super(api_url, agent)
   #     end
   #   end
@@ -123,54 +134,65 @@ class Grubby::Scraper
   # @param url [String, URI]
   # @param agent [Mechanize]
   # @return [Grubby::Scraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape(url, agent = $grubby)
     self.new(agent.get(url))
   end

   # Iterates a series of pages, starting at +start+. The Scraper class
-  # is instantiated with each page, and each instance is passed
-  # given block. Subsequent pages in the series are determined
-  # invoking the +next_method+ method on each
+  # is instantiated with each page, and each Scraper instance is passed
+  # to the given block. Subsequent pages in the series are determined
+  # by invoking the +next_method+ method on each Scraper instance.
   #
-  # Iteration stops when the +next_method+ method returns
+  # Iteration stops when the +next_method+ method returns falsy. If the
   # +next_method+ method returns a String or URI, that value will be
   # treated as the URL of the next page. Otherwise that value will be
   # treated as the page itself.
   #
-  # @example
+  # @example Iterate from page object
   #   class PostsIndexScraper < Grubby::PageScraper
-  #     scrapes(:page_param){ page.uri.query_param("page") }
-  #
   #     def next
   #       page.link_with(text: "Next >")&.click
   #     end
   #   end
   #
   #   PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
-  #     scraper.
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
   #   end
   #
-  # @example
+  # @example Iterate from URI
   #   class PostsIndexScraper < Grubby::PageScraper
-  #
+  #     def next
+  #       page.link_with(text: "Next >")&.to_absolute_uri
+  #     end
+  #   end
   #
+  #   PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
+  #   end
+  #
+  # @example Specifying the iteration method
+  #   class PostsIndexScraper < Grubby::PageScraper
   #     scrapes(:next_uri, optional: true) do
   #       page.link_with(text: "Next >")&.to_absolute_uri
   #     end
   #   end
   #
   #   PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
-  #     scraper.
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
   #   end
   #
   # @param start [String, URI, Mechanize::Page, Mechanize::File]
   # @param agent [Mechanize]
   # @param next_method [Symbol]
-  # @yield [scraper]
   # @yieldparam scraper [Grubby::Scraper]
   # @return [void]
   # @raise [NoMethodError]
-  #   if Scraper class does not
+  #   if the Scraper class does not define the method indicated by
+  #   +next_method+
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.each(start, agent = $grubby, next_method: :next)
     unless self.method_defined?(next_method)
       raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
@@ -187,22 +209,22 @@ class Grubby::Scraper
     end
   end

-  # The object being scraped. Typically a Mechanize
-  # such as +Mechanize::Page+.
+  # The object being scraped. Typically an instance of a Mechanize
+  # pluggable parser such as +Mechanize::Page+.
   #
   # @return [Object]
   attr_reader :source

-  # Collected errors raised during {initialize} by
-  #
-  # +Grubby::Scraper::Error
+  # Collected errors raised during {initialize} by {Scraper.scrapes}
+  # blocks, indexed by field name. This Hash will be empty if
+  # {initialize} did not raise a +Grubby::Scraper::Error+.
   #
-  # @return [Hash
+  # @return [Hash{Symbol => StandardError}]
   attr_reader :errors

   # @param source
   # @raise [Grubby::Scraper::Error]
-  #   if any
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
     @source = source
     @scraped = {}
@@ -230,22 +252,25 @@ class Grubby::Scraper

   # Returns all scraped values as a Hash.
   #
-  # @return [Hash
+  # @return [Hash{Symbol => Object}]
   def to_h
     @scraped.dup
   end

   class Error < RuntimeError
+    # @!visibility private
     BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
       cleaner.add_silencer do |line|
         line.include?(__dir__) && line.include?("scraper.rb:")
       end
     end

+    # The Scraper that raised this Error.
+    #
     # @return [Grubby::Scraper]
-    #   The Scraper that raised this error.
     attr_accessor :scraper

+    # @!visibility private
     def initialize(scraper)
       self.scraper = scraper

@@ -269,6 +294,7 @@ class Grubby::Scraper
     end
   end

+  # @!visibility private
   class FieldValueRequiredError < RuntimeError
     def initialize(field)
       super("`#{field}` is nil but is not marked as optional")