grubby 1.2.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,9 +3,9 @@ module URI
3
3
  # Returns the basename of the URI's +path+, a la +File.basename+.
4
4
  #
5
5
  # @example
6
- # URI("http://example.com/foo/bar").basename # == "bar"
7
- # URI("http://example.com/foo").basename # == "foo"
8
- # URI("http://example.com/").basename # == ""
6
+ # URI("https://example.com/foo/bar").basename # == "bar"
7
+ # URI("https://example.com/foo").basename # == "foo"
8
+ # URI("https://example.com/").basename # == ""
9
9
  #
10
10
  # @return [String]
11
11
  def basename
@@ -20,16 +20,16 @@ module URI
20
20
  # Otherwise, only the last occurrence is returned.
21
21
  #
22
22
  # @example
23
- # URI("http://example.com/?foo=a").query_param("foo") # == "a"
23
+ # URI("https://example.com/?foo=a").query_param("foo") # == "a"
24
24
  #
25
- # URI("http://example.com/?foo=a&foo=b").query_param("foo") # == "b"
26
- # URI("http://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
25
+ # URI("https://example.com/?foo=a&foo=b").query_param("foo") # == "b"
26
+ # URI("https://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
27
27
  #
28
- # URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
29
- # URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
28
+ # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
29
+ # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
30
30
  #
31
- # URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
32
- # URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
31
+ # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
32
+ # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
33
33
  #
34
34
  # @param name [String]
35
35
  # @return [String, Array<String>, nil]
@@ -38,7 +38,8 @@ module URI
38
38
  (values.nil? || name.include?("[]")) ? values : values.last
39
39
  end
40
40
 
41
- # Raises an exception if the URI is not +absolute?+.
41
+ # Raises an exception if the URI is not +absolute?+. Otherwise,
42
+ # returns the URI.
42
43
  #
43
44
  # @return [self]
44
45
  # @raise [RuntimeError]
@@ -1,31 +1,5 @@
1
1
  class Grubby::JsonParser < Mechanize::File
2
2
 
3
- # Returns the options to use when parsing JSON. The returned options
4
- # Hash is not +dup+ed and can be modified directly. Any modifications
5
- # will be applied to all future parsing.
6
- #
7
- # For information about available options, see
8
- # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
9
- # +JSON.parse+}.
10
- #
11
- # @return [Hash]
12
- def self.json_parse_options
13
- @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
14
- end
15
-
16
- # Sets the options to use when parsing JSON. The entire options Hash
17
- # is replaced, and the new value will be applied to all future
18
- # parsing. To set options individually, see {json_parse_options}.
19
- #
20
- # For information about available options, see
21
- # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
22
- # +JSON.parse+}.
23
- #
24
- # @param options [Hash]
25
- def self.json_parse_options=(options)
26
- @json_parse_options = options
27
- end
28
-
29
3
  # The parsed JSON data.
30
4
  #
31
5
  # @return [Hash, Array]
@@ -37,7 +11,7 @@ class Grubby::JsonParser < Mechanize::File
37
11
  attr_accessor :mech
38
12
 
39
13
  def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
40
- @json = body.presence && JSON.parse(body, self.class.json_parse_options)
14
+ @json = JSON.load(body, nil, create_additions: false)
41
15
  @mech = mech
42
16
  super(uri, response, body, code)
43
17
  end
@@ -6,8 +6,10 @@ class Grubby::JsonScraper < Grubby::Scraper
6
6
  attr_reader :json
7
7
 
8
8
  # @param source [Grubby::JsonParser]
9
+ # @raise [Grubby::Scraper::Error]
10
+ # if any {Scraper.scrapes} blocks fail
9
11
  def initialize(source)
10
- @json = source.assert_kind_of!(Grubby::JsonParser).json
12
+ @json = source.assert!(Grubby::JsonParser).json
11
13
  super
12
14
  end
13
15
 
@@ -19,11 +21,13 @@ class Grubby::JsonScraper < Grubby::Scraper
19
21
  # # ...
20
22
  # end
21
23
  #
22
- # MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
24
+ # MyScraper.scrape_file("path/to/local_file.json") # === MyScraper
23
25
  #
24
26
  # @param path [String]
25
27
  # @param agent [Mechanize]
26
28
  # @return [Grubby::JsonScraper]
29
+ # @raise [Grubby::Scraper::Error]
30
+ # if any {Scraper.scrapes} blocks fail
27
31
  def self.scrape_file(path, agent = $grubby)
28
32
  self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
29
33
  end
@@ -1,6 +1,6 @@
1
+ # @!visibility private
1
2
  class Mechanize::Download
2
3
 
3
- # @!visibility private
4
4
  def content_hash
5
5
  @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
6
6
  end
@@ -1,12 +1,11 @@
1
+ # @!visibility private
1
2
  class Mechanize::File
2
3
 
3
- # @!visibility private
4
4
  def self.read_local(path)
5
5
  uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
6
6
  self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
7
7
  end
8
8
 
9
- # @!visibility private
10
9
  def content_hash
11
10
  @content_hash ||= self.body.to_s.sha1
12
11
  end
@@ -1,15 +1,18 @@
1
1
  class Mechanize::Page::Link
2
2
 
3
3
  # Returns the URI represented by the Link, in absolute form. If the
4
- # href attribute of the Link is expressed in relative form, the URI of
5
- # the Link's Page is used to convert to absolute form.
4
+ # href attribute of the Link is expressed in relative form, the URI is
5
+ # converted to absolute form using the Link's +page.uri+. Raises an
6
+ # exception if the URI cannot be converted to absolute form.
6
7
  #
7
8
  # @return [URI]
9
+ # @raise [RuntimeError]
10
+ # if the URI cannot be converted to absolute form
8
11
  def to_absolute_uri
9
- # Via the W3 spec: "If the a element has no href attribute, then the
10
- # element represents a placeholder for where a link might otherwise
11
- # have been placed, if it had been relevant, consisting of just the
12
- # element's contents."[1] So, we assume a link with no href
12
+ # Via the W3 spec[1]: "If the a element has no href attribute, then
13
+ # the element represents a placeholder for where a link might
14
+ # otherwise have been placed, if it had been relevant, consisting of
15
+ # just the element's contents." So, we assume a link with no href
13
16
  # attribute (i.e. `uri == nil`) should be treated the same as an
14
17
  # intra-page link.
15
18
  #
@@ -1,7 +1,8 @@
1
1
  class Mechanize::Page
2
2
 
3
3
  # @!method search!(*queries)
4
- # See Ryoba's +Nokogiri::XML::Searchable#search!+.
4
+ # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21
5
+ # +Nokogiri::XML::Searchable#search!+}.
5
6
  #
6
7
  # @param queries [Array<String>]
7
8
  # @return [Nokogiri::XML::NodeSet]
@@ -10,7 +11,8 @@ class Mechanize::Page
10
11
  def_delegators :parser, :search!
11
12
 
12
13
  # @!method at!(*queries)
13
- # See Ryoba's +Nokogiri::XML::Searchable#at!+.
14
+ # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21
15
+ # +Nokogiri::XML::Searchable#at!+}.
14
16
  #
15
17
  # @param queries [Array<String>]
16
18
  # @return [Nokogiri::XML::Element]
@@ -2,15 +2,15 @@ require "fileutils"
2
2
 
3
3
  module Mechanize::Parser
4
4
 
5
- # Saves the payload to a specified directory, but using the default
5
+ # Saves the payload to a specified directory, using the default
6
6
  # filename suggested by the server. If a file with that name already
7
7
  # exists, this method will try to find a free filename by appending
8
- # numbers to the original name. Returns the full path of the saved
8
+ # numbers to the default filename. Returns the full path of the saved
9
9
  # file.
10
10
  #
11
- # NOTE: this method expects a +#save!+ method to be defined by the
12
- # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
13
- # and +Mechanize::Download#save!+.
11
+ # @note This method expects a +#save!+ method to be defined by the
12
+ # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
13
+ # and +Mechanize::Download#save!+.
14
14
  #
15
15
  # @param directory [String]
16
16
  # @return [String]
@@ -23,14 +23,14 @@ module Mechanize::Parser
23
23
  path
24
24
  end
25
25
 
26
- # Saves the payload to a specified directory, but using the default
26
+ # Saves the payload to a specified directory, using the default
27
27
  # filename suggested by the server. If a file with that name already
28
28
  # exists, that file will be overwritten. Returns the full path of the
29
29
  # saved file.
30
30
  #
31
- # NOTE: this method expects a +#save!+ method to be defined by the
32
- # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
33
- # and +Mechanize::Download#save!+.
31
+ # @note This method expects a +#save!+ method to be defined by the
32
+ # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
33
+ # and +Mechanize::Download#save!+.
34
34
  #
35
35
  # @param directory [String]
36
36
  # @return [String]
@@ -6,8 +6,10 @@ class Grubby::PageScraper < Grubby::Scraper
6
6
  attr_reader :page
7
7
 
8
8
  # @param source [Mechanize::Page]
9
+ # @raise [Grubby::Scraper::Error]
10
+ # if any {Scraper.scrapes} blocks fail
9
11
  def initialize(source)
10
- @page = source.assert_kind_of!(Mechanize::Page)
12
+ @page = source.assert!(Mechanize::Page)
11
13
  super
12
14
  end
13
15
 
@@ -19,11 +21,13 @@ class Grubby::PageScraper < Grubby::Scraper
19
21
  # # ...
20
22
  # end
21
23
  #
22
- # MyScraper.scrape_file("path/to/local_file.html").class # == MyScraper
24
+ # MyScraper.scrape_file("path/to/local_file.html") # === MyScraper
23
25
  #
24
26
  # @param path [String]
25
27
  # @param agent [Mechanize]
26
28
  # @return [Grubby::PageScraper]
29
+ # @raise [Grubby::Scraper::Error]
30
+ # if any {Scraper.scrapes} blocks fail
27
31
  def self.scrape_file(path, agent = $grubby)
28
32
  self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
29
33
  end
@@ -1,57 +1,68 @@
1
1
  class Grubby::Scraper
2
2
 
3
3
  # Defines an attribute reader method named by +field+. During
4
- # +initialize+, the given block is called, and the attribute is set to
4
+ # {initialize}, the given block is called, and the attribute is set to
5
5
  # the block's return value.
6
6
  #
7
- # By default, if the block's return value is nil, an exception will be
8
- # raised. To prevent this behavior, specify +optional: true+.
7
+ # By default, raises an exception if the block's return value is nil.
8
+ # To prevent this behavior, set the +:optional+ option to true.
9
+ # Alternatively, the block can be conditionally evaluated, based on
10
+ # another method's return value, using the +:if+ or +:unless+ options.
9
11
  #
10
- # The block may also be evaluated conditionally, based on another
11
- # method's return value, using the +:if+ or +:unless+ options.
12
- #
13
- # @example
12
+ # @example Default behavior
14
13
  # class GreetingScraper < Grubby::Scraper
15
- # scrapes(:salutation) do
16
- # source[/\A(hello|good morning)\b/i]
14
+ # scrapes(:name) do
15
+ # source[/Hello (\w+)/, 1]
17
16
  # end
17
+ # end
18
+ #
19
+ # scraper = GreetingScraper.new("Hello World!")
20
+ # scraper.name # == "World"
21
+ #
22
+ # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
18
23
  #
19
- # scrapes(:recipient, optional: true) do
20
- # source[/\A#{salutation} ([a-z ]+)/i, 1]
24
+ # @example Optional scraped value
25
+ # class GreetingScraper < Grubby::Scraper
26
+ # scrapes(:name, optional: true) do
27
+ # source[/Hello (\w+)/, 1]
21
28
  # end
22
29
  # end
23
30
  #
24
31
  # scraper = GreetingScraper.new("Hello World!")
25
- # scraper.salutation # == "Hello"
26
- # scraper.recipient # == "World"
32
+ # scraper.name # == "World"
27
33
  #
28
- # scraper = GreetingScraper.new("Good morning!")
29
- # scraper.salutation # == "Good morning"
30
- # scraper.recipient # == nil
34
+ # scraper = GreetingScraper.new("Hello!")
35
+ # scraper.name # == nil
31
36
  #
32
- # scraper = GreetingScraper.new("Hey!") # raises Grubby::Scraper::Error
33
- #
34
- # @example
35
- # class EmbeddedUrlScraper < Grubby::Scraper
36
- # scrapes(:url, optional: true){ source[%r"\bhttps?://\S+"] }
37
+ # @example Conditional scraped value
38
+ # class GreetingScraper < Grubby::Scraper
39
+ # def hello?
40
+ # source.start_with?("Hello")
41
+ # end
37
42
  #
38
- # scrapes(:domain, if: :url){ url[%r"://([^/]+)/", 1] }
43
+ # scrapes(:name, if: :hello?) do
44
+ # source[/Hello (\w+)/, 1]
45
+ # end
39
46
  # end
40
47
  #
41
- # scraper = EmbeddedUrlScraper.new("visit https://example.com/foo for details")
42
- # scraper.url # == "https://example.com/foo"
43
- # scraper.domain # == "example.com"
48
+ # scraper = GreetingScraper.new("Hello World!")
49
+ # scraper.name # == "World"
50
+ #
51
+ # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error
44
52
  #
45
- # scraper = EmbeddedUrlScraper.new("visit our website for details")
46
- # scraper.url # == nil
47
- # scraper.domain # == nil
53
+ # scraper = GreetingScraper.new("How are you?")
54
+ # scraper.name # == nil
48
55
  #
49
56
  # @param field [Symbol, String]
50
57
  # @param options [Hash]
51
- # @option options :optional [Boolean]
52
- # @option options :if [Symbol]
53
- # @option options :unless [Symbol]
54
- # @yield []
58
+ # @option options :optional [Boolean] (false)
59
+ # Whether the block should be allowed to return a nil value
60
+ # @option options :if [Symbol] (nil)
61
+ # Name of predicate method that determines if the block should be
62
+ # evaluated
63
+ # @option options :unless [Symbol] (nil)
64
+ # Name of predicate method that determines if the block should not
65
+ # be evaluated
55
66
  # @yieldreturn [Object]
56
67
  # @return [void]
57
68
  def self.scrapes(field, **options, &block)
@@ -88,16 +99,16 @@ class Grubby::Scraper
88
99
  end
89
100
  end
90
101
 
91
- # Fields defined by {scrapes}.
102
+ # Fields defined via {scrapes}.
92
103
  #
93
104
  # @return [Array<Symbol>]
94
105
  def self.fields
95
106
  @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
96
107
  end
97
108
 
98
- # Instantiates the Scraper class with the resource specified by +url+.
109
+ # Instantiates the Scraper class with the resource indicated by +url+.
99
110
  # This method acts as a default factory method, and provides a
100
- # standard interface for specialized overrides.
111
+ # standard interface for overrides.
101
112
  #
102
113
  # @example Default factory method
103
114
  # class PostPageScraper < Grubby::PageScraper
@@ -107,12 +118,12 @@ class Grubby::Scraper
107
118
  # PostPageScraper.scrape("https://example.com/posts/42")
108
119
  # # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
109
120
  #
110
- # @example Specialized factory method
121
+ # @example Override factory method
111
122
  # class PostApiScraper < Grubby::JsonScraper
112
123
  # # ...
113
124
  #
114
- # def self.scrapes(url, agent = $grubby)
115
- # api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
125
+ # def self.scrape(url, agent = $grubby)
126
+ # api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
116
127
  # super(api_url, agent)
117
128
  # end
118
129
  # end
@@ -123,54 +134,65 @@ class Grubby::Scraper
123
134
  # @param url [String, URI]
124
135
  # @param agent [Mechanize]
125
136
  # @return [Grubby::Scraper]
137
+ # @raise [Grubby::Scraper::Error]
138
+ # if any {Scraper.scrapes} blocks fail
126
139
  def self.scrape(url, agent = $grubby)
127
140
  self.new(agent.get(url))
128
141
  end
129
142
 
130
143
  # Iterates a series of pages, starting at +start+. The Scraper class
131
- # is instantiated with each page, and each instance is passed to the
132
- # given block. Subsequent pages in the series are determined by
133
- # invoking the +next_method+ method on each previous scraper instance.
144
+ # is instantiated with each page, and each Scraper instance is passed
145
+ # to the given block. Subsequent pages in the series are determined
146
+ # by invoking the +next_method+ method on each Scraper instance.
134
147
  #
135
- # Iteration stops when the +next_method+ method returns nil. If the
148
+ # Iteration stops when the +next_method+ method returns a falsy value. If the
136
149
  # +next_method+ method returns a String or URI, that value will be
137
150
  # treated as the URL of the next page. Otherwise that value will be
138
151
  # treated as the page itself.
139
152
  #
140
- # @example
153
+ # @example Iterate from page object
141
154
  # class PostsIndexScraper < Grubby::PageScraper
142
- # scrapes(:page_param){ page.uri.query_param("page") }
143
- #
144
155
  # def next
145
156
  # page.link_with(text: "Next >")&.click
146
157
  # end
147
158
  # end
148
159
  #
149
160
  # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
150
- # scraper.page_param # == "1", "2", "3", ...
161
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
151
162
  # end
152
163
  #
153
- # @example
164
+ # @example Iterate from URI
154
165
  # class PostsIndexScraper < Grubby::PageScraper
155
- # scrapes(:page_param){ page.uri.query_param("page") }
166
+ # def next
167
+ # page.link_with(text: "Next >")&.to_absolute_uri
168
+ # end
169
+ # end
156
170
  #
171
+ # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
172
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
173
+ # end
174
+ #
175
+ # @example Specifying the iteration method
176
+ # class PostsIndexScraper < Grubby::PageScraper
157
177
  # scrapes(:next_uri, optional: true) do
158
178
  # page.link_with(text: "Next >")&.to_absolute_uri
159
179
  # end
160
180
  # end
161
181
  #
162
182
  # PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
163
- # scraper.page_param # == "1", "2", "3", ...
183
+ # scraper.page.uri.query # == "page=1", "page=2", "page=3", ...
164
184
  # end
165
185
  #
166
186
  # @param start [String, URI, Mechanize::Page, Mechanize::File]
167
187
  # @param agent [Mechanize]
168
188
  # @param next_method [Symbol]
169
- # @yield [scraper]
170
189
  # @yieldparam scraper [Grubby::Scraper]
171
190
  # @return [void]
172
191
  # @raise [NoMethodError]
173
- # if Scraper class does not implement +next_method+
192
+ # if the Scraper class does not define the method indicated by
193
+ # +next_method+
194
+ # @raise [Grubby::Scraper::Error]
195
+ # if any {Scraper.scrapes} blocks fail
174
196
  def self.each(start, agent = $grubby, next_method: :next)
175
197
  unless self.method_defined?(next_method)
176
198
  raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
@@ -187,22 +209,22 @@ class Grubby::Scraper
187
209
  end
188
210
  end
189
211
 
190
- # The object being scraped. Typically a Mechanize pluggable parser
191
- # such as +Mechanize::Page+.
212
+ # The object being scraped. Typically an instance of a Mechanize
213
+ # pluggable parser such as +Mechanize::Page+.
192
214
  #
193
215
  # @return [Object]
194
216
  attr_reader :source
195
217
 
196
- # Collected errors raised during {initialize} by blocks passed to
197
- # {scrapes}, indexed by field name. If {initialize} did not raise
198
- # +Grubby::Scraper::Error+, this Hash will be empty.
218
+ # Collected errors raised during {initialize} by {Scraper.scrapes}
219
+ # blocks, indexed by field name. This Hash will be empty if
220
+ # {initialize} did not raise a +Grubby::Scraper::Error+.
199
221
  #
200
- # @return [Hash<Symbol, StandardError>]
222
+ # @return [Hash{Symbol => StandardError}]
201
223
  attr_reader :errors
202
224
 
203
225
  # @param source
204
226
  # @raise [Grubby::Scraper::Error]
205
- # if any scraped values result in error
227
+ # if any {Scraper.scrapes} blocks fail
206
228
  def initialize(source)
207
229
  @source = source
208
230
  @scraped = {}
@@ -230,22 +252,25 @@ class Grubby::Scraper
230
252
 
231
253
  # Returns all scraped values as a Hash.
232
254
  #
233
- # @return [Hash<Symbol, Object>]
255
+ # @return [Hash{Symbol => Object}]
234
256
  def to_h
235
257
  @scraped.dup
236
258
  end
237
259
 
238
260
  class Error < RuntimeError
261
+ # @!visibility private
239
262
  BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
240
263
  cleaner.add_silencer do |line|
241
264
  line.include?(__dir__) && line.include?("scraper.rb:")
242
265
  end
243
266
  end
244
267
 
268
+ # The Scraper that raised this Error.
269
+ #
245
270
  # @return [Grubby::Scraper]
246
- # The Scraper that raised this error.
247
271
  attr_accessor :scraper
248
272
 
273
+ # @!visibility private
249
274
  def initialize(scraper)
250
275
  self.scraper = scraper
251
276
 
@@ -269,6 +294,7 @@ class Grubby::Scraper
269
294
  end
270
295
  end
271
296
 
297
+ # @!visibility private
272
298
  class FieldValueRequiredError < RuntimeError
273
299
  def initialize(field)
274
300
  super("`#{field}` is nil but is not marked as optional")