RubyGems - grubby - Versions diffs - 1.2.0 → 1.2.1 - Mend

grubby 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 84d759cf7187c8502b42e9d7828f59f126bb87af8da524e9d8e6f6ad8a64f467
-  data.tar.gz: bf26cca3991fca00e573f51f28a1c457e063e4f419986971f1429f051f2e3155
+  metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
+  data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
 SHA512:
-  metadata.gz: 38b8f7818be985da5c48484b8a3f42a40401b4890e46da93c2565c546654a660537cf15303e1106bdca201d1ea8e7ff90e13ab13dcb652997b0acc9becc01b48
-  data.tar.gz: e3c8b063d275ebf49dc50c5a70fa82cb0f9e517f17cc9e3735557a2fe998d5ea82a3ea0932ad8a6ecec630f4f66c8d62443c76497f6e98e4f202a72df988095e
+  metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
+  data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1

data/CHANGELOG.md CHANGED

@@ -1,3 +1,15 @@
+## 1.2.1
+* Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
+* Ensure time spent fetching a response does not count toward the time
+  to sleep between requests
+* Prevent sleep between requests when following a redirect
+* Prevent duplicates in `Scraper.fields`
+* Fix `URI#query_param` when query is nil
+* Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
+  contains characters that need to be URI-encoded
 ## 1.2.0
 * Add `Grubby#journal=`

data/README.md CHANGED

@@ -17,17 +17,15 @@ The following example scrapes stories from the [Hacker News] front page:
 require "grubby"
 class HackerNews < Grubby::PageScraper
   scrapes(:items) do
     page.search!(".athing").map{|el| Item.new(el) }
   end
   class Item < Grubby::Scraper
     scrapes(:story_link){ source.at!("a.storylink") }
-    scrapes(:story_uri) { story_link.uri }
-    scrapes(:title) { story_link.text }
+    scrapes(:story_uri){ story_link.uri }
+    scrapes(:title){ story_link.text }
   end
 end
 # The following line will raise an exception if anything goes wrong

data/grubby.gemspec CHANGED

@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
   spec.name          = "grubby"
   spec.version       = GRUBBY_VERSION
   spec.authors       = ["Jonathan Hefner"]
-  spec.email         = ["jonathan.hefner@gmail.com"]
+  spec.email         = ["jonathan@hefner.pro"]
   spec.summary       = %q{Fail-fast web scraping}
   spec.homepage      = "https://github.com/jonathanhefner/grubby"

data/lib/grubby.rb CHANGED

@@ -65,6 +65,9 @@ class Grubby < Mechanize
     # Set up configurable rate limiting, and choose a reasonable default
     # rate limit.
     self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
+    self.post_connect_hooks << Proc.new do |agent, uri, response, body|
+      self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
+    end
     self.time_between_requests = 1.0
     self.journal = journal
@@ -81,9 +84,9 @@ class Grubby < Mechanize
     @journal = path&.to_pathname&.touch_file
     @seen = if @journal
         require "csv"
-        CSV.read(@journal).map{|row| SingletonKey.new(*row) }.index_to{ true }
+        CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
       else
-        {}
+        Set.new
       end
     @journal
   end
@@ -202,7 +205,7 @@ class Grubby < Mechanize
   def try_skip_singleton(target, purpose, series)
     series << SingletonKey.new(purpose, target.to_s)
-    if series.uniq!.nil? && @seen.displace(series.last, true)
+    if series.uniq!.nil? && !@seen.add?(series.last)
       seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
       $log.info("Skip #{series.first.target} (#{seen_info})")
       true
@@ -223,7 +226,10 @@ class Grubby < Mechanize
       rand(time_between_requests) : time_between_requests
     sleep_duration = @last_request_at + delay_duration - Time.now.to_f
     sleep(sleep_duration) if sleep_duration > 0
-    @last_request_at = Time.now.to_f
+  end
+  def mark_last_request_time(time)
+    @last_request_at = time.to_f
   end
 end

data/lib/grubby/core_ext/uri.rb CHANGED

@@ -12,13 +12,12 @@ module URI
     self.path == "/" ? "" : ::File.basename(self.path)
   end
-  # Returns the value of the specified param in the URI's +query+.
-  # The specified param name must be exactly as it appears in the query
-  # string, and support for complex nested values is limited.  (See
-  # +CGI.parse+ for parsing behavior.)  If the param name includes a
-  # +"[]"+, the result will be an array of all occurrences of that param
-  # in the query string.  Otherwise, the result will be the last
-  # occurrence of that param in the query string.
+  # Returns the value of the specified query param in the URI's query
+  # string.  The specified +name+ must be *exactly* as it appears in the
+  # query string, and support for complex nested values is limited.
+  # (See +CGI.parse+ for parsing behavior.)  If +name+ contains +"[]"+,
+  # all occurrences of the query param are returned as an Array.
+  # Otherwise, only the last occurrence is returned.
   #
   # @example
   #   URI("http://example.com/?foo=a").query_param("foo")  # == "a"
@@ -32,11 +31,10 @@ module URI
   #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]")     # == nil
   #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]")  # == ["a"]
   #
-  # @return [String, nil]
-  # @return [Array<String>, nil]
-  #   if +name+ contains +"[]"+
+  # @param name [String]
+  # @return [String, Array<String>, nil]
   def query_param(name)
-    values = CGI.parse(self.query)[name.to_s]
+    values = CGI.parse(self.query)[name] if self.query
     (values.nil? || name.include?("[]")) ? values : values.last
   end

data/lib/grubby/json_parser.rb CHANGED

@@ -5,19 +5,12 @@ class Grubby::JsonParser < Mechanize::File
   # will be applied to all future parsing.
   #
   # For information about available options, see
-  # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
   # +JSON.parse+}.
   #
   # @return [Hash]
   def self.json_parse_options
-    @json_parse_options ||= {
-      max_nesting: false,
-      allow_nan: false,
-      symbolize_names: false,
-      create_additions: false,
-      object_class: Hash,
-      array_class: Array,
-    }
+    @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
   end
   # Sets the options to use when parsing JSON.  The entire options Hash
@@ -25,7 +18,7 @@ class Grubby::JsonParser < Mechanize::File
   # parsing.  To set options individually, see {json_parse_options}.
   #
   # For information about available options, see
-  # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+  # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
   # +JSON.parse+}.
   #
   # @param options [Hash]
@@ -38,9 +31,15 @@ class Grubby::JsonParser < Mechanize::File
   # @return [Hash, Array]
   attr_reader :json
-  def initialize(uri = nil, response = nil, body = nil, code = nil)
+  # The Mechanize agent used to make the request.
+  #
+  # @return [Mechanize, nil]
+  attr_accessor :mech
+  def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
     @json = body.presence && JSON.parse(body, self.class.json_parse_options)
-    super
+    @mech = mech
+    super(uri, response, body, code)
   end
 end

data/lib/grubby/json_scraper.rb CHANGED

@@ -22,11 +22,10 @@ class Grubby::JsonScraper < Grubby::Scraper
   #   MyScraper.scrape_file("path/to/local_file.json").class  # == MyScraper
   #
   # @param path [String]
+  # @param agent [Mechanize]
   # @return [Grubby::JsonScraper]
-  def self.scrape_file(path)
-    uri = URI.join("file:///", File.expand_path(path))
-    body = File.read(path)
-    self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
+  def self.scrape_file(path, agent = $grubby)
+    self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
   end
 end

data/lib/grubby/mechanize/file.rb CHANGED

@@ -1,5 +1,11 @@
 class Mechanize::File
+  # @!visibility private
+  def self.read_local(path)
+    uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
+    self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
+  end
   # @!visibility private
   def content_hash
     @content_hash ||= self.body.to_s.sha1

data/lib/grubby/page_scraper.rb CHANGED

@@ -25,9 +25,7 @@ class Grubby::PageScraper < Grubby::Scraper
   # @param agent [Mechanize]
   # @return [Grubby::PageScraper]
   def self.scrape_file(path, agent = $grubby)
-    uri = URI.join("file:///", File.expand_path(path))
-    body = File.read(path)
-    self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
+    self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
   end
 end

data/lib/grubby/scraper.rb CHANGED

@@ -56,7 +56,7 @@ class Grubby::Scraper
   # @return [void]
   def self.scrapes(field, **options, &block)
     field = field.to_sym
-    self.fields << field
+    (self.fields << field).uniq!
     define_method(field) do
       raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
@@ -127,10 +127,10 @@ class Grubby::Scraper
     self.new(agent.get(url))
   end
-  # Iterates a series of pages, starting at +start_url+.  For each page,
-  # the Scraper class is instantiated and passed to the given block.
-  # Subsequent pages in the series are determined by invoking
-  # +next_method+ on each previous scraper instance.
+  # Iterates a series of pages, starting at +start+.  The Scraper class
+  # is instantiated with each page, and each instance is passed to the
+  # given block.  Subsequent pages in the series are determined by
+  # invoking the +next_method+ method on each previous scraper instance.
   #
   # Iteration stops when the +next_method+ method returns nil.  If the
   # +next_method+ method returns a String or URI, that value will be
@@ -163,7 +163,7 @@ class Grubby::Scraper
   #     scraper.page_param  # == "1", "2", "3", ...
   #   end
   #
-  # @param start_url [String, URI]
+  # @param start [String, URI, Mechanize::Page, Mechanize::File]
   # @param agent [Mechanize]
   # @param next_method [Symbol]
   # @yield [scraper]
@@ -171,14 +171,14 @@ class Grubby::Scraper
   # @return [void]
   # @raise [NoMethodError]
   #   if Scraper class does not implement +next_method+
-  def self.each(start_url, agent = $grubby, next_method: :next)
+  def self.each(start, agent = $grubby, next_method: :next)
     unless self.method_defined?(next_method)
       raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
     end
-    return to_enum(:each, start_url, agent, next_method: next_method) unless block_given?
+    return to_enum(:each, start, agent, next_method: next_method) unless block_given?
-    current = start_url
+    current = start
     while current
       current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
       scraper = self.new(current)

data/lib/grubby/version.rb CHANGED

@@ -1 +1 @@
-GRUBBY_VERSION = "1.2.0"
+GRUBBY_VERSION = "1.2.1"

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grubby
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Jonathan Hefner
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-06 00:00:00.000000000 Z
+date: 2019-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -166,7 +166,7 @@ dependencies:
         version: '0.9'
 description:
 email:
-- jonathan.hefner@gmail.com
+- jonathan@hefner.pro
 executables: []
 extensions: []
 extra_rdoc_files: []