grubby 1.2.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,9 +3,9 @@ module URI
3
3
  # Returns the basename of the URI's +path+, a la +File.basename+.
4
4
  #
5
5
  # @example
6
- # URI("http://example.com/foo/bar").basename # == "bar"
7
- # URI("http://example.com/foo").basename # == "foo"
8
- # URI("http://example.com/").basename # == ""
6
+ # URI("https://example.com/foo/bar").basename # == "bar"
7
+ # URI("https://example.com/foo").basename # == "foo"
8
+ # URI("https://example.com/").basename # == ""
9
9
  #
10
10
  # @return [String]
11
11
  def basename
@@ -20,16 +20,16 @@ module URI
20
20
  # Otherwise, only the last occurrence is returned.
21
21
  #
22
22
  # @example
23
- # URI("http://example.com/?foo=a").query_param("foo") # == "a"
23
+ # URI("https://example.com/?foo=a").query_param("foo") # == "a"
24
24
  #
25
- # URI("http://example.com/?foo=a&foo=b").query_param("foo") # == "b"
26
- # URI("http://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
25
+ # URI("https://example.com/?foo=a&foo=b").query_param("foo") # == "b"
26
+ # URI("https://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
27
27
  #
28
- # URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
29
- # URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
28
+ # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
29
+ # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
30
30
  #
31
- # URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
32
- # URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
31
+ # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
32
+ # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
33
33
  #
34
34
  # @param name [String]
35
35
  # @return [String, Array<String>, nil]
@@ -38,7 +38,8 @@ module URI
38
38
  (values.nil? || name.include?("[]")) ? values : values.last
39
39
  end
40
40
 
41
- # Raises an exception if the URI is not +absolute?+.
41
+ # Raises an exception if the URI is not +absolute?+. Otherwise,
42
+ # returns the URI.
42
43
  #
43
44
  # @return [self]
44
45
  # @raise [RuntimeError]
@@ -1,31 +1,5 @@
1
1
  class Grubby::JsonParser < Mechanize::File
2
2
 
3
- # Returns the options to use when parsing JSON. The returned options
4
- # Hash is not +dup+ed and can be modified directly. Any modifications
5
- # will be applied to all future parsing.
6
- #
7
- # For information about available options, see
8
- # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
9
- # +JSON.parse+}.
10
- #
11
- # @return [Hash]
12
- def self.json_parse_options
13
- @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
14
- end
15
-
16
- # Sets the options to use when parsing JSON. The entire options Hash
17
- # is replaced, and the new value will be applied to all future
18
- # parsing. To set options individually, see {json_parse_options}.
19
- #
20
- # For information about available options, see
21
- # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
22
- # +JSON.parse+}.
23
- #
24
- # @param options [Hash]
25
- def self.json_parse_options=(options)
26
- @json_parse_options = options
27
- end
28
-
29
3
  # The parsed JSON data.
30
4
  #
31
5
  # @return [Hash, Array]
@@ -37,7 +11,7 @@ class Grubby::JsonParser < Mechanize::File
37
11
  attr_accessor :mech
38
12
 
39
13
  def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
40
- @json = body.presence && JSON.parse(body, self.class.json_parse_options)
14
+ @json = JSON.load(body, nil, create_additions: false)
41
15
  @mech = mech
42
16
  super(uri, response, body, code)
43
17
  end
@@ -6,8 +6,10 @@ class Grubby::JsonScraper < Grubby::Scraper
6
6
  attr_reader :json
7
7
 
8
8
  # @param source [Grubby::JsonParser]
9
+ # @raise [Grubby::Scraper::Error]
10
+ # if any {Scraper.scrapes} blocks fail
9
11
  def initialize(source)
10
- @json = source.assert_kind_of!(Grubby::JsonParser).json
12
+ @json = source.assert!(Grubby::JsonParser).json
11
13
  super
12
14
  end
13
15
 
@@ -19,11 +21,13 @@ class Grubby::JsonScraper < Grubby::Scraper
19
21
  # # ...
20
22
  # end
21
23
  #
22
- # MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
24
+ # MyScraper.scrape_file("path/to/local_file.json") # === MyScraper
23
25
  #
24
26
  # @param path [String]
25
27
  # @param agent [Mechanize]
26
28
  # @return [Grubby::JsonScraper]
29
+ # @raise [Grubby::Scraper::Error]
30
+ # if any {Scraper.scrapes} blocks fail
27
31
  def self.scrape_file(path, agent = $grubby)
28
32
  self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
29
33
  end
@@ -1,6 +1,6 @@
1
+ # @!visibility private
1
2
  class Mechanize::Download
2
3
 
3
- # @!visibility private
4
4
  def content_hash
5
5
  @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
6
6
  end
@@ -1,12 +1,11 @@
1
+ # @!visibility private
1
2
  class Mechanize::File
2
3
 
3
- # @!visibility private
4
4
  def self.read_local(path)
5
5
  uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
6
6
  self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
7
7
  end
8
8
 
9
- # @!visibility private
10
9
  def content_hash
11
10
  @content_hash ||= self.body.to_s.sha1
12
11
  end
@@ -1,15 +1,18 @@
1
1
  class Mechanize::Page::Link
2
2
 
3
3
  # Returns the URI represented by the Link, in absolute form. If the
4
- # href attribute of the Link is expressed in relative form, the URI of
5
- # the Link's Page is used to convert to absolute form.
4
+ # href attribute of the Link is expressed in relative form, the URI is
5
+ # converted to absolute form using the Link's +page.uri+. Raises an
6
+ # exception if the URI cannot be converted to absolute form.
6
7
  #
7
8
  # @return [URI]
9
+ # @raise [RuntimeError]
10
+ # if the URI cannot be converted to absolute form
8
11
  def to_absolute_uri
9
- # Via the W3 spec: "If the a element has no href attribute, then the
10
- # element represents a placeholder for where a link might otherwise
11
- # have been placed, if it had been relevant, consisting of just the
12
- # element's contents."[1] So, we assume a link with no href
12
+ # Via the W3 spec[1]: "If the a element has no href attribute, then
13
+ # the element represents a placeholder for where a link might
14
+ # otherwise have been placed, if it had been relevant, consisting of
15
+ # just the element's contents." So, we assume a link with no href
13
16
  # attribute (i.e. `uri == nil`) should be treated the same as an
14
17
  # intra-page link.
15
18
  #
@@ -1,7 +1,8 @@
1
1
  class Mechanize::Page
2
2
 
3
3
  # @!method search!(*queries)
4
- # See Ryoba's +Nokogiri::XML::Searchable#search!+.
4
+ # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21
5
+ # +Nokogiri::XML::Searchable#search!+}.
5
6
  #
6
7
  # @param queries [Array<String>]
7
8
  # @return [Nokogiri::XML::NodeSet]
@@ -10,7 +11,8 @@ class Mechanize::Page
10
11
  def_delegators :parser, :search!
11
12
 
12
13
  # @!method at!(*queries)
13
- # See Ryoba's +Nokogiri::XML::Searchable#at!+.
14
+ # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21
15
+ # +Nokogiri::XML::Searchable#at!+}.
14
16
  #
15
17
  # @param queries [Array<String>]
16
18
  # @return [Nokogiri::XML::Element]
@@ -2,15 +2,15 @@ require "fileutils"
2
2
 
3
3
  module Mechanize::Parser
4
4
 
5
- # Saves the payload to a specified directory, but using the default
5
+ # Saves the payload to a specified directory, using the default
6
6
  # filename suggested by the server. If a file with that name already
7
7
  # exists, this method will try to find a free filename by appending
8
- # numbers to the original name. Returns the full path of the saved
8
+ # numbers to the default filename. Returns the full path of the saved
9
9
  # file.
10
10
  #
11
- # NOTE: this method expects a +#save!+ method to be defined by the
12
- # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
13
- # and +Mechanize::Download#save!+.
11
+ # @note This method expects a +#save!+ method to be defined by the
12
+ # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
13
+ # and +Mechanize::Download#save!+.
14
14
  #
15
15
  # @param directory [String]
16
16
  # @return [String]
@@ -23,14 +23,14 @@ module Mechanize::Parser
23
23
  path
24
24
  end
25
25
 
26
- # Saves the payload to a specified directory, but using the default
26
+ # Saves the payload to a specified directory, using the default
27
27
  # filename suggested by the server. If a file with that name already
28
28
  # exists, that file will be overwritten. Returns the full path of the
29
29
  # saved file.
30
30
  #
31
- # NOTE: this method expects a +#save!+ method to be defined by the
32
- # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
33
- # and +Mechanize::Download#save!+.
31
+ # @note This method expects a +#save!+ method to be defined by the
32
+ # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
33
+ # and +Mechanize::Download#save!+.
34
34
  #
35
35
  # @param directory [String]
36
36
  # @return [String]
@@ -6,8 +6,10 @@ class Grubby::PageScraper < Grubby::Scraper
6
6
  attr_reader :page
7
7
 
8
8
  # @param source [Mechanize::Page]
9
+ # @raise [Grubby::Scraper::Error]
10
+ # if any {Scraper.scrapes} blocks fail
9
11
  def initialize(source)
10
- @page = source.assert_kind_of!(Mechanize::Page)
12
+ @page = source.assert!(Mechanize::Page)
11
13
  super
12
14
  end
13
15
 
@@ -19,11 +21,13 @@ class Grubby::PageScraper < Grubby::Scraper
19
21
  # # ...
20
22
  # end
21
23
  #
22
- # MyScraper.scrape_file("path/to/local_file.html").class # == MyScraper
24
+ # MyScraper.scrape_file("path/to/local_file.html") # === MyScraper
23
25
  #
24
26
  # @param path [String]
25
27
  # @param agent [Mechanize]
26
28
  # @return [Grubby::PageScraper]
29
+ # @raise [Grubby::Scraper::Error]
30
+ # if any {Scraper.scrapes} blocks fail
27
31
  def self.scrape_file(path, agent = $grubby)
28
32
  self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
29
33
  end
@@ -1,57 +1,68 @@
1
1
  class Grubby::Scraper
2
2
 
3
3
  # Defines an attribute reader method named by +field+. During
4
- # +initialize+, the given block is called, and the attribute is set to
4
+ # {initialize}, the given block is called, and the attribute is set to
5
5
  # the block's return value.
6
6
  #
7
- # By default, if the block's return value is nil, an exception will be
8
- # raised. To prevent this behavior, specify +optional: true+.
7
+ # By default, raises an exception if the block's return value is nil.
8
+ # To prevent this behavior, set the +:optional+ option to true.
9
+ # Alternatively, the block can be conditionally evaluated, based on
10
+ # another method's return value, using the +:if+ or +:unless+ options.
9
11
  #
10
- # The block may also be evaluated conditionally, based on another
11
- # method's return value, using the +:if+ or +:unless+ options.
12
- #
13
- # @example
12
+ # @example Default behavior
14
13
  # class GreetingScraper < Grubby::Scraper
15
- # scrapes(:salutation) do
16
- # source[/\A(hello|good morning)\b/i]
14
+ # scrapes(:name) do
15
+ # source[/Hello (\w+)/, 1]
17
16
  # end
17
+ # end
18
+ #
19
+ # scraper = GreetingScraper.new("Hello World!")
20
+ # scraper.name # == "World"
21
+ #
22
+ # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
18
23
  #
19
- # scrapes(:recipient, optional: true) do
20
- # source[/\A#{salutation} ([a-z ]+)/i, 1]
24
+ # @example Optional scraped value
25
+ # class GreetingScraper < Grubby::Scraper
26
+ # scrapes(:name, optional: true) do
27
+ # source[/Hello (\w+)/, 1]
21
28
  # end
22
29
  # end
23
30
  #
24
31
  # scraper = GreetingScraper.new("Hello World!")
25
- # scraper.salutation # == "Hello"
26
- # scraper.recipient # == "World"
32
+ # scraper.name # == "World"
27
33
  #
28
- # scraper = GreetingScraper.new("Good morning!")
29
- # scraper.salutation # == "Good morning"
30
- # scraper.recipient # == nil
34
+ # scraper = GreetingScraper.new("Hello!")
35
+ # scraper.name # == nil
31
36
  #
32
- # scraper = GreetingScraper.new("Hey!") # raises Grubby::Scraper::Error
33
- #
34
- # @example
35
- # class EmbeddedUrlScraper < Grubby::Scraper
36
- # scrapes(:url, optional: true){ source[%r"\bhttps?://\S+"] }
37
+ # @example Conditional scraped value
38
+ # class GreetingScraper < Grubby::Scraper
39
+ # def hello?
40
+ # source.start_with?("Hello")
41
+ # end
37
42
  #
38
- # scrapes(:domain, if: :url){ url[%r"://([^/]+)/", 1] }
43
+ # scrapes(:name, if: :hello?) do
44
+ # source[/Hello (\w+)/, 1]
45
+ # end
39
46
  # end
40
47
  #
41
- # scraper = EmbeddedUrlScraper.new("visit https://example.com/foo for details")
42
- # scraper.url # == "https://example.com/foo"
43
- # scraper.domain # == "example.com"
48
+ # scraper = GreetingScraper.new("Hello World!")
49
+ # scraper.name # == "World"
50
+ #
51
+ # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
44
52
  #
45
- # scraper = EmbeddedUrlScraper.new("visit our website for details")
46
- # scraper.url # == nil
47
- # scraper.domain # == nil
53
+ # scraper = GreetingScraper.new("How are you?")
54
+ # scraper.name # == nil
48
55
  #
49
56
  # @param field [Symbol, String]
50
57
  # @param options [Hash]
51
- # @option options :optional [Boolean]
52
- # @option options :if [Symbol]
53
- # @option options :unless [Symbol]
54
- # @yield []
58
+ # @option options :optional [Boolean] (false)
59
+ # Whether the block should be allowed to return a nil value
60
+ # @option options :if [Symbol] (nil)
61
+ # Name of predicate method that determines if the block should be
62
+ # evaluated
63
+ # @option options :unless [Symbol] (nil)
64
+ # Name of predicate method that determines if the block should not
65
+ # be evaluated
55
66
  # @yieldreturn [Object]
56
67
  # @return [void]
57
68
  def self.scrapes(field, **options, &block)
@@ -88,16 +99,16 @@ class Grubby::Scraper
88
99
  end
89
100
  end
90
101
 
91
- # Fields defined by {scrapes}.
102
+ # Fields defined via {scrapes}.
92
103
  #
93
104
  # @return [Array<Symbol>]
94
105
  def self.fields
95
106
  @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
96
107
  end
97
108
 
98
- # Instantiates the Scraper class with the resource specified by +url+.
109
+ # Instantiates the Scraper class with the resource indicated by +url+.
99
110
  # This method acts as a default factory method, and provides a
100
- # standard interface for specialized overrides.
111
+ # standard interface for overrides.
101
112
  #
102
113
  # @example Default factory method
103
114
  # class PostPageScraper < Grubby::PageScraper
@@ -107,12 +118,12 @@ class Grubby::Scraper
107
118
  # PostPageScraper.scrape("https://example.com/posts/42")
108
119
  # # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
109
120
  #
110
- # @example Specialized factory method
121
+ # @example Override factory method
111
122
  # class PostApiScraper < Grubby::JsonScraper
112
123
  # # ...
113
124
  #
114
- # def self.scrapes(url, agent = $grubby)
115
- # api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
125
+ # def self.scrape(url, agent = $grubby)
126
+ # api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
116
127
  # super(api_url, agent)
117
128
  # end
118
129
  # end
@@ -123,54 +134,65 @@ class Grubby::Scraper
123
134
  # @param url [String, URI]
124
135
  # @param agent [Mechanize]
125
136
  # @return [Grubby::Scraper]
137
+ # @raise [Grubby::Scraper::Error]
138
+ # if any {Scraper.scrapes} blocks fail
126
139
  def self.scrape(url, agent = $grubby)
127
140
  self.new(agent.get(url))
128
141
  end
129
142
 
130
143
  # Iterates a series of pages, starting at +start+. The Scraper class
131
- # is instantiated with each page, and each instance is passed to the
132
- # given block. Subsequent pages in the series are determined by
133
- # invoking the +next_method+ method on each previous scraper instance.
144
+ # is instantiated with each page, and each Scraper instance is passed
145
+ # to the given block. Subsequent pages in the series are determined
146
+ # by invoking the +next_method+ method on each Scraper instance.
134
147
  #
135
- # Iteration stops when the +next_method+ method returns nil. If the
148
+ # Iteration stops when the +next_method+ method returns a falsy value. If the
136
149
  # +next_method+ method returns a String or URI, that value will be
137
150
  # treated as the URL of the next page. Otherwise that value will be
138
151
  # treated as the page itself.
139
152
  #
140
- # @example
153
+ # @example Iterate from page object
141
154
  # class PostsIndexScraper < Grubby::PageScraper
142
- # scrapes(:page_param){ page.uri.query_param("page") }
143
- #
144
155
  # def next
145
156
  # page.link_with(text: "Next >")&.click
146
157
  # end
147
158
  # end
148
159
  #
149
160
  # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
150
- # scraper.page_param # == "1", "2", "3", ...
161
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
151
162
  # end
152
163
  #
153
- # @example
164
+ # @example Iterate from URI
154
165
  # class PostsIndexScraper < Grubby::PageScraper
155
- # scrapes(:page_param){ page.uri.query_param("page") }
166
+ # def next
167
+ # page.link_with(text: "Next >")&.to_absolute_uri
168
+ # end
169
+ # end
156
170
  #
171
+ # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
172
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
173
+ # end
174
+ #
175
+ # @example Specifying the iteration method
176
+ # class PostsIndexScraper < Grubby::PageScraper
157
177
  # scrapes(:next_uri, optional: true) do
158
178
  # page.link_with(text: "Next >")&.to_absolute_uri
159
179
  # end
160
180
  # end
161
181
  #
162
182
  # PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
163
- # scraper.page_param # == "1", "2", "3", ...
183
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
164
184
  # end
165
185
  #
166
186
  # @param start [String, URI, Mechanize::Page, Mechanize::File]
167
187
  # @param agent [Mechanize]
168
188
  # @param next_method [Symbol]
169
- # @yield [scraper]
170
189
  # @yieldparam scraper [Grubby::Scraper]
171
190
  # @return [void]
172
191
  # @raise [NoMethodError]
173
- # if Scraper class does not implement +next_method+
192
+ # if the Scraper class does not define the method indicated by
193
+ # +next_method+
194
+ # @raise [Grubby::Scraper::Error]
195
+ # if any {Scraper.scrapes} blocks fail
174
196
  def self.each(start, agent = $grubby, next_method: :next)
175
197
  unless self.method_defined?(next_method)
176
198
  raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
@@ -187,22 +209,22 @@ class Grubby::Scraper
187
209
  end
188
210
  end
189
211
 
190
- # The object being scraped. Typically a Mechanize pluggable parser
191
- # such as +Mechanize::Page+.
212
+ # The object being scraped. Typically an instance of a Mechanize
213
+ # pluggable parser such as +Mechanize::Page+.
192
214
  #
193
215
  # @return [Object]
194
216
  attr_reader :source
195
217
 
196
- # Collected errors raised during {initialize} by blocks passed to
197
- # {scrapes}, indexed by field name. If {initialize} did not raise
198
- # +Grubby::Scraper::Error+, this Hash will be empty.
218
+ # Collected errors raised during {initialize} by {Scraper.scrapes}
219
+ # blocks, indexed by field name. This Hash will be empty if
220
+ # {initialize} did not raise a +Grubby::Scraper::Error+.
199
221
  #
200
- # @return [Hash<Symbol, StandardError>]
222
+ # @return [Hash{Symbol => StandardError}]
201
223
  attr_reader :errors
202
224
 
203
225
  # @param source
204
226
  # @raise [Grubby::Scraper::Error]
205
- # if any scraped values result in error
227
+ # if any {Scraper.scrapes} blocks fail
206
228
  def initialize(source)
207
229
  @source = source
208
230
  @scraped = {}
@@ -230,22 +252,25 @@ class Grubby::Scraper
230
252
 
231
253
  # Returns all scraped values as a Hash.
232
254
  #
233
- # @return [Hash<Symbol, Object>]
255
+ # @return [Hash{Symbol => Object}]
234
256
  def to_h
235
257
  @scraped.dup
236
258
  end
237
259
 
238
260
  class Error < RuntimeError
261
+ # @!visibility private
239
262
  BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
240
263
  cleaner.add_silencer do |line|
241
264
  line.include?(__dir__) && line.include?("scraper.rb:")
242
265
  end
243
266
  end
244
267
 
268
+ # The Scraper that raised this Error.
269
+ #
245
270
  # @return [Grubby::Scraper]
246
- # The Scraper that raised this error.
247
271
  attr_accessor :scraper
248
272
 
273
+ # @!visibility private
249
274
  def initialize(scraper)
250
275
  self.scraper = scraper
251
276
 
@@ -269,6 +294,7 @@ class Grubby::Scraper
269
294
  end
270
295
  end
271
296
 
297
+ # @!visibility private
272
298
  class FieldValueRequiredError < RuntimeError
273
299
  def initialize(field)
274
300
  super("`#{field}` is nil but is not marked as optional")