RubyGems - grubby - Versions diffs - 1.2.1 → 2.0.0 - Mend

grubby 1.2.1 → 2.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22) [hide / show]

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.travis.yml +6 -3
data/CHANGELOG.md +12 -0
data/Gemfile +3 -0
data/README.md +140 -92
data/Rakefile +0 -13
data/gemfiles/activesupport-6.0.gemfile +3 -0
data/grubby.gemspec +17 -18
data/lib/grubby.rb +64 -46
data/lib/grubby/core_ext/uri.rb +12 -11
data/lib/grubby/json_parser.rb +1 -27
data/lib/grubby/json_scraper.rb +6 -2
data/lib/grubby/mechanize/download.rb +1 -1
data/lib/grubby/mechanize/file.rb +1 -2
data/lib/grubby/mechanize/link.rb +9 -6
data/lib/grubby/mechanize/page.rb +4 -2
data/lib/grubby/mechanize/parser.rb +9 -9
data/lib/grubby/page_scraper.rb +6 -2
data/lib/grubby/scraper.rb +86 -60
data/lib/grubby/version.rb +1 -1
metadata +17 -69

data/lib/grubby/core_ext/uri.rb CHANGED

@@ -3,9 +3,9 @@ module URI
   # Returns the basename of the URI's +path+, a la +File.basename+.
   #
   # @example
-  #   URI("http://example.com/foo/bar").basename  # == "bar"
-  #   URI("http://example.com/foo").basename      # == "foo"
-  #   URI("http://example.com/").basename         # == ""
+  #   URI("https://example.com/foo/bar").basename  # == "bar"
+  #   URI("https://example.com/foo").basename      # == "foo"
+  #   URI("https://example.com/").basename         # == ""
   #
   # @return [String]
   def basename
@@ -20,16 +20,16 @@ module URI
   # Otherwise, only the last occurrence is returned.
   #
   # @example
-  #   URI("http://example.com/?foo=a").query_param("foo")  # == "a"
+  #   URI("https://example.com/?foo=a").query_param("foo")  # == "a"
   #
-  #   URI("http://example.com/?foo=a&foo=b").query_param("foo")    # == "b"
-  #   URI("http://example.com/?foo=a&foo=b").query_param("foo[]")  # == nil
+  #   URI("https://example.com/?foo=a&foo=b").query_param("foo")    # == "b"
+  #   URI("https://example.com/?foo=a&foo=b").query_param("foo[]")  # == nil
   #
-  #   URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo")    # == nil
-  #   URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo[]")  # == ["a", "b"]
+  #   URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo")    # == nil
+  #   URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]")  # == ["a", "b"]
   #
-  #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]")     # == nil
-  #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]")  # == ["a"]
+  #   URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]")     # == nil
+  #   URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]")  # == ["a"]
   #
   # @param name [String]
   # @return [String, Array<String>, nil]
@@ -38,7 +38,8 @@ module URI
     (values.nil? || name.include?("[]")) ? values : values.last
   end
-  # Raises an exception if the URI is not +absolute?+.
+  # Raises an exception if the URI is not +absolute?+.  Otherwise,
+  # returns the URI.
   #
   # @return [self]
   # @raise [RuntimeError]

data/lib/grubby/json_parser.rb CHANGED

@@ -1,31 +1,5 @@
 class Grubby::JsonParser < Mechanize::File
-  # Returns the options to use when parsing JSON.  The returned options
-  # Hash is not +dup+ed and can be modified directly.  Any modifications
-  # will be applied to all future parsing.
-  #
-  # For information about available options, see
-  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
-  # +JSON.parse+}.
-  #
-  # @return [Hash]
-  def self.json_parse_options
-    @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
-  end
-  # Sets the options to use when parsing JSON.  The entire options Hash
-  # is replaced, and the new value will be applied to all future
-  # parsing.  To set options individually, see {json_parse_options}.
-  #
-  # For information about available options, see
-  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
-  # +JSON.parse+}.
-  #
-  # @param options [Hash]
-  def self.json_parse_options=(options)
-    @json_parse_options = options
-  end
   # The parsed JSON data.
   #
   # @return [Hash, Array]
@@ -37,7 +11,7 @@ class Grubby::JsonParser < Mechanize::File
   attr_accessor :mech
   def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
-    @json = body.presence && JSON.parse(body, self.class.json_parse_options)
+    @json = JSON.load(body, nil, create_additions: false)
     @mech = mech
     super(uri, response, body, code)
   end

data/lib/grubby/json_scraper.rb CHANGED

@@ -6,8 +6,10 @@ class Grubby::JsonScraper < Grubby::Scraper
   attr_reader :json
   # @param source [Grubby::JsonParser]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
-    @json = source.assert_kind_of!(Grubby::JsonParser).json
+    @json = source.assert!(Grubby::JsonParser).json
     super
   end
@@ -19,11 +21,13 @@ class Grubby::JsonScraper < Grubby::Scraper
   #     # ...
   #   end
   #
-  #   MyScraper.scrape_file("path/to/local_file.json").class  # == MyScraper
+  #   MyScraper.scrape_file("path/to/local_file.json")  # === MyScraper
   #
   # @param path [String]
   # @param agent [Mechanize]
   # @return [Grubby::JsonScraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape_file(path, agent = $grubby)
     self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
   end

data/lib/grubby/mechanize/download.rb CHANGED

@@ -1,6 +1,6 @@
+# @!visibility private
 class Mechanize::Download
-  # @!visibility private
   def content_hash
     @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
   end

data/lib/grubby/mechanize/file.rb CHANGED

@@ -1,12 +1,11 @@
+# @!visibility private
 class Mechanize::File
-  # @!visibility private
   def self.read_local(path)
     uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
     self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
   end
-  # @!visibility private
   def content_hash
     @content_hash ||= self.body.to_s.sha1
   end

data/lib/grubby/mechanize/link.rb CHANGED

@@ -1,15 +1,18 @@
 class Mechanize::Page::Link
   # Returns the URI represented by the Link, in absolute form.  If the
-  # href attribute of the Link is expressed in relative form, the URI of
-  # the Link's Page is used to convert to absolute form.
+  # href attribute of the Link is expressed in relative form, the URI is
+  # converted to absolute form using the Link's +page.uri+.  Raises an
+  # exception if the URI cannot be converted to absolute form.
   #
   # @return [URI]
+  # @raise [RuntimeError]
+  #   if the URI cannot be converted to absolute form
   def to_absolute_uri
-    # Via the W3 spec: "If the a element has no href attribute, then the
-    # element represents a placeholder for where a link might otherwise
-    # have been placed, if it had been relevant, consisting of just the
-    # element's contents."[1]  So, we assume a link with no href
+    # Via the W3 spec[1]: "If the a element has no href attribute, then
+    # the element represents a placeholder for where a link might
+    # otherwise have been placed, if it had been relevant, consisting of
+    # just the element's contents."  So, we assume a link with no href
     # attribute (i.e. `uri == nil`) should be treated the same as an
     # intra-page link.
     #

data/lib/grubby/mechanize/page.rb CHANGED

@@ -1,7 +1,8 @@
 class Mechanize::Page
   # @!method search!(*queries)
-  # See Ryoba's +Nokogiri::XML::Searchable#search!+.
+  # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21
+  # +Nokogiri::XML::Searchable#search!+}.
   #
   # @param queries [Array<String>]
   # @return [Nokogiri::XML::NodeSet]
@@ -10,7 +11,8 @@ class Mechanize::Page
   def_delegators :parser, :search!
   # @!method at!(*queries)
-  # See Ryoba's +Nokogiri::XML::Searchable#at!+.
+  # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21
+  # +Nokogiri::XML::Searchable#at!+}.
   #
   # @param queries [Array<String>]
   # @return [Nokogiri::XML::Element]

data/lib/grubby/mechanize/parser.rb CHANGED

@@ -2,15 +2,15 @@ require "fileutils"
 module Mechanize::Parser
-  # Saves the payload to a specified directory, but using the default
+  # Saves the payload to a specified directory, using the default
   # filename suggested by the server.  If a file with that name already
   # exists, this method will try to find a free filename by appending
-  # numbers to the original name.  Returns the full path of the saved
+  # numbers to the default filename.  Returns the full path of the saved
   # file.
   #
-  # NOTE: this method expects a +#save!+ method to be defined by the
-  # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
-  # and +Mechanize::Download#save!+.
+  # @note This method expects a +#save!+ method to be defined by the
+  #   class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
+  #   and +Mechanize::Download#save!+.
   #
   # @param directory [String]
   # @return [String]
@@ -23,14 +23,14 @@ module Mechanize::Parser
     path
   end
-  # Saves the payload to a specified directory, but using the default
+  # Saves the payload to a specified directory, using the default
   # filename suggested by the server.  If a file with that name already
   # exists, that file will be overwritten.  Returns the full path of the
   # saved file.
   #
-  # NOTE: this method expects a +#save!+ method to be defined by the
-  # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
-  # and +Mechanize::Download#save!+.
+  # @note This method expects a +#save!+ method to be defined by the
+  #   class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
+  #   and +Mechanize::Download#save!+.
   #
   # @param directory [String]
   # @return [String]

data/lib/grubby/page_scraper.rb CHANGED

@@ -6,8 +6,10 @@ class Grubby::PageScraper < Grubby::Scraper
   attr_reader :page
   # @param source [Mechanize::Page]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
-    @page = source.assert_kind_of!(Mechanize::Page)
+    @page = source.assert!(Mechanize::Page)
     super
   end
@@ -19,11 +21,13 @@ class Grubby::PageScraper < Grubby::Scraper
   #     # ...
   #   end
   #
-  #   MyScraper.scrape_file("path/to/local_file.html").class  # == MyScraper
+  #   MyScraper.scrape_file("path/to/local_file.html")  # === MyScraper
   #
   # @param path [String]
   # @param agent [Mechanize]
   # @return [Grubby::PageScraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape_file(path, agent = $grubby)
     self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
   end

data/lib/grubby/scraper.rb CHANGED

@@ -1,57 +1,68 @@
 class Grubby::Scraper
   # Defines an attribute reader method named by +field+.  During
-  # +initialize+, the given block is called, and the attribute is set to
+  # {initialize}, the given block is called, and the attribute is set to
   # the block's return value.
   #
-  # By default, if the block's return value is nil, an exception will be
-  # raised.  To prevent this behavior, specify +optional: true+.
+  # By default, raises an exception if the block's return value is nil.
+  # To prevent this behavior, set the +:optional+ option to true.
+  # Alternatively, the block can be conditionally evaluated, based on
+  # another method's return value, using the +:if+ or +:unless+ options.
   #
-  # The block may also be evaluated conditionally, based on another
-  # method's return value, using the +:if+ or +:unless+ options.
-  #
-  # @example
+  # @example Default behavior
   #   class GreetingScraper < Grubby::Scraper
-  #     scrapes(:salutation) do
-  #       source[/\A(hello|good morning)\b/i]
+  #     scrapes(:name) do
+  #       source[/Hello (\w+)/, 1]
   #     end
+  #   end
+  #
+  #   scraper = GreetingScraper.new("Hello World!")
+  #   scraper.name  # == "World"
+  #
+  #   scraper = GreetingScraper.new("Hello!")  # raises Grubby::Scraper::Error
   #
-  #     scrapes(:recipient, optional: true) do
-  #       source[/\A#{salutation} ([a-z ]+)/i, 1]
+  # @example Optional scraped value
+  #   class GreetingScraper < Grubby::Scraper
+  #     scrapes(:name, optional: true) do
+  #       source[/Hello (\w+)/, 1]
   #     end
   #   end
   #
   #   scraper = GreetingScraper.new("Hello World!")
-  #   scraper.salutation  # == "Hello"
-  #   scraper.recipient   # == "World"
+  #   scraper.name  # == "World"
   #
-  #   scraper = GreetingScraper.new("Good morning!")
-  #   scraper.salutation  # == "Good morning"
-  #   scraper.recipient   # == nil
+  #   scraper = GreetingScraper.new("Hello!")
+  #   scraper.name  # == nil
   #
-  #   scraper = GreetingScraper.new("Hey!")  # raises Grubby::Scraper::Error
-  #
-  # @example
-  #   class EmbeddedUrlScraper < Grubby::Scraper
-  #     scrapes(:url, optional: true){ source[%r"\bhttps?://\S+"] }
+  # @example Conditional scraped value
+  #   class GreetingScraper < Grubby::Scraper
+  #     def hello?
+  #       source.start_with?("Hello ")
+  #     end
   #
-  #     scrapes(:domain, if: :url){ url[%r"://([^/]+)/", 1] }
+  #     scrapes(:name, if: :hello?) do
+  #       source[/Hello (\w+)/, 1]
+  #     end
   #   end
   #
-  #   scraper = EmbeddedUrlScraper.new("visit https://example.com/foo for details")
-  #   scraper.url     # == "https://example.com/foo"
-  #   scraper.domain  # == "example.com"
+  #   scraper = GreetingScraper.new("Hello World!")
+  #   scraper.name  # == "World"
+  #
+  #   scraper = GreetingScraper.new("Hello!")  # raises Grubby::Scraper::Error
   #
-  #   scraper = EmbeddedUrlScraper.new("visit our website for details")
-  #   scraper.url     # == nil
-  #   scraper.domain  # == nil
+  #   scraper = GreetingScraper.new("How are you?")
+  #   scraper.name  # == nil
   #
   # @param field [Symbol, String]
   # @param options [Hash]
-  # @option options :optional [Boolean]
-  # @option options :if [Symbol]
-  # @option options :unless [Symbol]
-  # @yield []
+  # @option options :optional [Boolean] (false)
+  #   Whether the block should be allowed to return a nil value
+  # @option options :if [Symbol] (nil)
+  #   Name of predicate method that determines if the block should be
+  #   evaluated
+  # @option options :unless [Symbol] (nil)
+  #   Name of predicate method that determines if the block should not
+  #   be evaluated
   # @yieldreturn [Object]
   # @return [void]
   def self.scrapes(field, **options, &block)
@@ -88,16 +99,16 @@ class Grubby::Scraper
     end
   end
-  # Fields defined by {scrapes}.
+  # Fields defined via {scrapes}.
   #
   # @return [Array<Symbol>]
   def self.fields
     @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup
   end
-  # Instantiates the Scraper class with the resource specified by +url+.
+  # Instantiates the Scraper class with the resource indicated by +url+.
   # This method acts as a default factory method, and provides a
-  # standard interface for specialized overrides.
+  # standard interface for overrides.
   #
   # @example Default factory method
   #   class PostPageScraper < Grubby::PageScraper
@@ -107,12 +118,12 @@ class Grubby::Scraper
   #   PostPageScraper.scrape("https://example.com/posts/42")
   #     # == PostPageScraper.new($grubby.get("https://example.com/posts/42"))
   #
-  # @example Specialized factory method
+  # @example Override factory method
   #   class PostApiScraper < Grubby::JsonScraper
   #     # ...
   #
-  #     def self.scrapes(url, agent = $grubby)
-  #       api_url = url.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
+  #     def self.scrape(url, agent = $grubby)
+  #       api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json')
   #       super(api_url, agent)
   #     end
   #   end
@@ -123,54 +134,65 @@ class Grubby::Scraper
   # @param url [String, URI]
   # @param agent [Mechanize]
   # @return [Grubby::Scraper]
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.scrape(url, agent = $grubby)
     self.new(agent.get(url))
   end
   # Iterates a series of pages, starting at +start+.  The Scraper class
-  # is instantiated with each page, and each instance is passed to the
-  # given block.  Subsequent pages in the series are determined by
-  # invoking the +next_method+ method on each previous scraper instance.
+  # is instantiated with each page, and each Scraper instance is passed
+  # to the given block.  Subsequent pages in the series are determined
+  # by invoking the +next_method+ method on each Scraper instance.
   #
-  # Iteration stops when the +next_method+ method returns nil.  If the
+  # Iteration stops when the +next_method+ method returns falsy.  If the
   # +next_method+ method returns a String or URI, that value will be
   # treated as the URL of the next page.  Otherwise that value will be
   # treated as the page itself.
   #
-  # @example
+  # @example Iterate from page object
   #   class PostsIndexScraper < Grubby::PageScraper
-  #     scrapes(:page_param){ page.uri.query_param("page") }
-  #
   #     def next
   #       page.link_with(text: "Next >")&.click
   #     end
   #   end
   #
   #   PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
-  #     scraper.page_param  # == "1", "2", "3", ...
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
   #   end
   #
-  # @example
+  # @example Iterate from URI
   #   class PostsIndexScraper < Grubby::PageScraper
-  #     scrapes(:page_param){ page.uri.query_param("page") }
+  #     def next
+  #       page.link_with(text: "Next >")&.to_absolute_uri
+  #     end
+  #   end
   #
+  #   PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper|
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
+  #   end
+  #
+  # @example Specifying the iteration method
+  #   class PostsIndexScraper < Grubby::PageScraper
   #     scrapes(:next_uri, optional: true) do
   #       page.link_with(text: "Next >")&.to_absolute_uri
   #     end
   #   end
   #
   #   PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper|
-  #     scraper.page_param  # == "1", "2", "3", ...
+  #     scraper.page.uri.query  # == "page=1", "page=2", "page=3", ...
   #   end
   #
   # @param start [String, URI, Mechanize::Page, Mechanize::File]
   # @param agent [Mechanize]
   # @param next_method [Symbol]
-  # @yield [scraper]
   # @yieldparam scraper [Grubby::Scraper]
   # @return [void]
   # @raise [NoMethodError]
-  #   if Scraper class does not implement +next_method+
+  #   if the Scraper class does not define the method indicated by
+  #   +next_method+
+  # @raise [Grubby::Scraper::Error]
+  #   if any {Scraper.scrapes} blocks fail
   def self.each(start, agent = $grubby, next_method: :next)
     unless self.method_defined?(next_method)
       raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
@@ -187,22 +209,22 @@ class Grubby::Scraper
     end
   end
-  # The object being scraped.  Typically a Mechanize pluggable parser
-  # such as +Mechanize::Page+.
+  # The object being scraped.  Typically an instance of a Mechanize
+  # pluggable parser such as +Mechanize::Page+.
   #
   # @return [Object]
   attr_reader :source
-  # Collected errors raised during {initialize} by blocks passed to
-  # {scrapes}, indexed by field name.  If {initialize} did not raise
-  # +Grubby::Scraper::Error+, this Hash will be empty.
+  # Collected errors raised during {initialize} by {Scraper.scrapes}
+  # blocks, indexed by field name.  This Hash will be empty if
+  # {initialize} did not raise a +Grubby::Scraper::Error+.
   #
-  # @return [Hash<Symbol, StandardError>]
+  # @return [Hash{Symbol => StandardError}]
   attr_reader :errors
   # @param source
   # @raise [Grubby::Scraper::Error]
-  #   if any scraped values result in error
+  #   if any {Scraper.scrapes} blocks fail
   def initialize(source)
     @source = source
     @scraped = {}
@@ -230,22 +252,25 @@ class Grubby::Scraper
   # Returns all scraped values as a Hash.
   #
-  # @return [Hash<Symbol, Object>]
+  # @return [Hash{Symbol => Object}]
   def to_h
     @scraped.dup
   end
   class Error < RuntimeError
+    # @!visibility private
     BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
       cleaner.add_silencer do |line|
         line.include?(__dir__) && line.include?("scraper.rb:")
       end
     end
+    # The Scraper that raised this Error.
+    #
     # @return [Grubby::Scraper]
-    #   The Scraper that raised this error.
     attr_accessor :scraper
+    # @!visibility private
     def initialize(scraper)
       self.scraper = scraper
@@ -269,6 +294,7 @@ class Grubby::Scraper
     end
   end
+  # @!visibility private
   class FieldValueRequiredError < RuntimeError
     def initialize(field)
       super("`#{field}` is nil but is not marked as optional")