grubby 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 84d759cf7187c8502b42e9d7828f59f126bb87af8da524e9d8e6f6ad8a64f467
-   data.tar.gz: bf26cca3991fca00e573f51f28a1c457e063e4f419986971f1429f051f2e3155
+   metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
+   data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
  SHA512:
-   metadata.gz: 38b8f7818be985da5c48484b8a3f42a40401b4890e46da93c2565c546654a660537cf15303e1106bdca201d1ea8e7ff90e13ab13dcb652997b0acc9becc01b48
-   data.tar.gz: e3c8b063d275ebf49dc50c5a70fa82cb0f9e517f17cc9e3735557a2fe998d5ea82a3ea0932ad8a6ecec630f4f66c8d62443c76497f6e98e4f202a72df988095e
+   metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
+   data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## 1.2.1
+
+ * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
+ * Ensure time spent fetching a response does not count toward the time
+   to sleep between requests
+ * Prevent sleep between requests when following a redirect
+ * Prevent duplicates in `Scraper.fields`
+ * Fix `URI#query_param` when query is nil
+ * Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
+   contains characters that need to be URI-encoded
+
+
  ## 1.2.0

  * Add `Grubby#journal=`
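The bullets above map one-to-one onto the hunks below. As a quick orientation, here is a minimal sketch of the first bullet, the new `JsonParser#mech` attribute (the file name and agent variable are illustrative, not from the gem):

```ruby
require "grubby"

grubby = Grubby.new
scraper = Grubby::JsonScraper.scrape_file("data.json", grubby)  # illustrative local file

# The parser now remembers the agent it belongs to, just as Mechanize::Page does,
# so code written against page.mech can treat HTML and JSON sources uniformly.
scraper.source.mech  # => grubby
```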
data/README.md CHANGED
@@ -17,17 +17,15 @@ The following example scrapes stories from the [Hacker News] front page:
  require "grubby"

  class HackerNews < Grubby::PageScraper
-
    scrapes(:items) do
      page.search!(".athing").map{|el| Item.new(el) }
    end

    class Item < Grubby::Scraper
      scrapes(:story_link){ source.at!("a.storylink") }
-     scrapes(:story_uri) { story_link.uri }
-     scrapes(:title) { story_link.text }
+     scrapes(:story_uri){ story_link.uri }
+     scrapes(:title){ story_link.text }
    end
-
  end

  # The following line will raise an exception if anything goes wrong
data/grubby.gemspec CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
    spec.name = "grubby"
    spec.version = GRUBBY_VERSION
    spec.authors = ["Jonathan Hefner"]
-   spec.email = ["jonathan.hefner@gmail.com"]
+   spec.email = ["jonathan@hefner.pro"]

    spec.summary = %q{Fail-fast web scraping}
    spec.homepage = "https://github.com/jonathanhefner/grubby"
data/lib/grubby.rb CHANGED
@@ -65,6 +65,9 @@ class Grubby < Mechanize
      # Set up configurable rate limiting, and choose a reasonable default
      # rate limit.
      self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
+     self.post_connect_hooks << Proc.new do |agent, uri, response, body|
+       self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
+     end
      self.time_between_requests = 1.0

      self.journal = journal
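The new `post_connect_hooks` entry timestamps the moment each response finishes (skipping 3xx responses, so redirects neither sleep nor reset the clock), while the existing `pre_connect_hooks` entry sleeps before the next request. A sketch of the resulting behavior, with illustrative URLs:

```ruby
grubby = Grubby.new
grubby.time_between_requests = 2.0   # the ternary in sleep_between_requests below
                                     # suggests a Range (e.g. 1.0..3.0) picks a random delay

grubby.get("https://example.com/a")  # post_connect hook records when the response arrived
grubby.get("https://example.com/b")  # pre_connect hook sleeps only for whatever is left of
                                     # the 2.0s measured from the end of the previous
                                     # response, so fetch time itself is not double-counted
```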
@@ -81,9 +84,9 @@ class Grubby < Mechanize
      @journal = path&.to_pathname&.touch_file
      @seen = if @journal
        require "csv"
-       CSV.read(@journal).map{|row| SingletonKey.new(*row) }.index_to{ true }
+       CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
      else
-       {}
+       Set.new
      end
      @journal
    end
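The journal bookkeeping now uses a plain `Set` instead of a Hash with placeholder values. This works because `SingletonKey` instances compare by value; a sketch of the semantics the new code relies on, assuming `SingletonKey` is a Struct-style value object:

```ruby
require "set"

SingletonKey = Struct.new(:purpose, :target)  # assumption: a Struct-style value object

seen = [SingletonKey.new("", "https://example.com/a")].to_set
seen.include?(SingletonKey.new("", "https://example.com/a"))  # => true (Structs compare by value)
seen.add?(SingletonKey.new("", "https://example.com/a"))      # => nil (already present)
seen.add?(SingletonKey.new("", "https://example.com/b"))      # => the set (newly added)
```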
@@ -202,7 +205,7 @@ class Grubby < Mechanize

    def try_skip_singleton(target, purpose, series)
      series << SingletonKey.new(purpose, target.to_s)
-     if series.uniq!.nil? && @seen.displace(series.last, true)
+     if series.uniq!.nil? && !@seen.add?(series.last)
        seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
        $log.info("Skip #{series.first.target} (#{seen_info})")
        true
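The rewritten condition leans on two stdlib return-value conventions: `Array#uniq!` returns nil when it removed nothing, and `Set#add?` returns nil when the element was already present. So the branch fires only for a key that is new to `series` but already in `@seen`:

```ruby
require "set"

series = []
series << :a
series.uniq!.nil?   # => true  (nothing removed; :a is new to the series)
series << :a
series.uniq!.nil?   # => false (the duplicate :a was just removed)

seen = Set[:a]
!seen.add?(:a)      # => true  (add? returns nil for an element already in the set)
!seen.add?(:b)      # => false (:b was newly added)
```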
@@ -223,7 +226,10 @@ class Grubby < Mechanize
        rand(time_between_requests) : time_between_requests
      sleep_duration = @last_request_at + delay_duration - Time.now.to_f
      sleep(sleep_duration) if sleep_duration > 0
-     @last_request_at = Time.now.to_f
+   end
+
+   def mark_last_request_time(time)
+     @last_request_at = time.to_f
    end

  end
data/lib/grubby/core_ext/uri.rb CHANGED
@@ -12,13 +12,12 @@ module URI
      self.path == "/" ? "" : ::File.basename(self.path)
    end

-   # Returns the value of the specified param in the URI's +query+.
-   # The specified param name must be exactly as it appears in the query
-   # string, and support for complex nested values is limited. (See
-   # +CGI.parse+ for parsing behavior.) If the param name includes a
-   # +"[]"+, the result will be an array of all occurrences of that param
-   # in the query string. Otherwise, the result will be the last
-   # occurrence of that param in the query string.
+   # Returns the value of the specified query param in the URI's query
+   # string. The specified +name+ must be *exactly* as it appears in the
+   # query string, and support for complex nested values is limited.
+   # (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+,
+   # all occurrences of the query param are returned as an Array.
+   # Otherwise, only the last occurrence is returned.
    #
    # @example
    #   URI("http://example.com/?foo=a").query_param("foo")  # == "a"
@@ -32,11 +31,10 @@ module URI
    #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]")     # == nil
    #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]")  # == ["a"]
    #
-   # @return [String, nil]
-   # @return [Array<String>, nil]
-   #   if +name+ contains +"[]"+
+   # @param name [String]
+   # @return [String, Array<String>, nil]
    def query_param(name)
-     values = CGI.parse(self.query)[name.to_s]
+     values = CGI.parse(self.query)[name] if self.query
      (values.nil? || name.include?("[]")) ? values : values.last
    end
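With the `if self.query` guard, a URI that has no query string yields `values = nil`, and the method returns nil instead of raising. A few calls in the spirit of the @example block above; the last one is the newly fixed case:

```ruby
require "grubby"

URI("http://example.com/?foo=a").query_param("foo")        # == "a"
URI("http://example.com/?foo=a&foo=b").query_param("foo")  # == "b" (last occurrence wins)
URI("http://example.com/").query_param("foo")              # == nil (previously raised,
                                                           # since CGI.parse(nil) fails)
```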
data/lib/grubby/json_parser.rb CHANGED
@@ -5,19 +5,12 @@ class Grubby::JsonParser < Mechanize::File
    # will be applied to all future parsing.
    #
    # For information about available options, see
-   # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+   # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
    # +JSON.parse+}.
    #
    # @return [Hash]
    def self.json_parse_options
-     @json_parse_options ||= {
-       max_nesting: false,
-       allow_nan: false,
-       symbolize_names: false,
-       create_additions: false,
-       object_class: Hash,
-       array_class: Array,
-     }
+     @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
    end
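`JSON.load_default_options` is the options Hash the stdlib uses for `JSON.load` (notably `max_nesting: false` and `create_additions: true`), so merging in `create_additions: false` keeps the permissive parsing defaults while disabling the object-deserialization hook. Because the method memoizes and returns the Hash itself, options can be tweaked in place, or replaced wholesale via the writer documented just below:

```ruby
require "grubby"

# Tweak a single option in place; it applies to all future parsing.
Grubby::JsonParser.json_parse_options[:symbolize_names] = true

# Or replace the entire Hash via the writer.
Grubby::JsonParser.json_parse_options = JSON.load_default_options.merge(
  create_additions: false,
  symbolize_names: true,
)
```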
@@ -25,7 +18,7 @@ class Grubby::JsonParser < Mechanize::File
    # parsing. To set options individually, see {json_parse_options}.
    #
    # For information about available options, see
-   # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+   # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
    # +JSON.parse+}.
    #
    # @param options [Hash]
@@ -38,9 +31,15 @@ class Grubby::JsonParser < Mechanize::File
    # @return [Hash, Array]
    attr_reader :json

-   def initialize(uri = nil, response = nil, body = nil, code = nil)
+   # The Mechanize agent used to make the request.
+   #
+   # @return [Mechanize, nil]
+   attr_accessor :mech
+
+   def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
      @json = body.presence && JSON.parse(body, self.class.json_parse_options)
-     super
+     @mech = mech
+     super(uri, response, body, code)
    end

  end
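The fifth constructor argument mirrors `Mechanize::Page.new`, whose final parameter is the agent; `super` is now passed the original four arguments explicitly because `Mechanize::File` knows nothing about agents. A direct-construction sketch with an inline body:

```ruby
require "grubby"

agent = Grubby.new
body = '{"story":{"title":"Example"}}'
parser = Grubby::JsonParser.new(URI("http://example.com/story.json"), nil, body, "200", agent)

parser.json  # => {"story"=>{"title"=>"Example"}}
parser.mech  # => agent
```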
data/lib/grubby/json_scraper.rb CHANGED
@@ -22,11 +22,10 @@ class Grubby::JsonScraper < Grubby::Scraper
    #   MyScraper.scrape_file("path/to/local_file.json").class  # == MyScraper
    #
    # @param path [String]
+   # @param agent [Mechanize]
    # @return [Grubby::JsonScraper]
-   def self.scrape_file(path)
-     uri = URI.join("file:///", File.expand_path(path))
-     body = File.read(path)
-     self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
+   def self.scrape_file(path, agent = $grubby)
+     self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
    end

  end
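Routing through `read_local` (defined on `Mechanize::File` below) is what fixes the changelog's URI-encoding bullet: local paths with spaces or `#` no longer produce invalid file:// URIs. A usage sketch; `MyScraper` and the path are illustrative:

```ruby
class MyScraper < Grubby::JsonScraper
  scrapes(:title){ json["title"] }  # assumes JsonScraper exposes the parsed body as `json`
end

scraper = MyScraper.scrape_file("downloads/results #1.json")  # "#" and " " need encoding
scraper.title
scraper.source.mech  # => $grubby (the default agent) unless another agent was passed in
```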
data/lib/grubby/mechanize/file.rb CHANGED
@@ -1,5 +1,11 @@
  class Mechanize::File

+   # @!visibility private
+   def self.read_local(path)
+     uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
+     self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
+   end
+
    # @!visibility private
    def content_hash
      @content_hash ||= self.body.to_s.sha1
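`CGI.escape` is applied per path component, so the directory separators survive while characters that are illegal in a URI do not. A sketch of the escaping step in isolation (the path is hypothetical; note that `CGI.escape` encodes spaces as `+` and `#` as `%23`):

```ruby
require "cgi"

path = "/tmp/report #1.json"  # hypothetical local file; "#" would start a URI fragment
uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
uri_path  # => "/tmp/report+%231.json"
```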
data/lib/grubby/page_scraper.rb CHANGED
@@ -25,9 +25,7 @@ class Grubby::PageScraper < Grubby::Scraper
    # @param agent [Mechanize]
    # @return [Grubby::PageScraper]
    def self.scrape_file(path, agent = $grubby)
-     uri = URI.join("file:///", File.expand_path(path))
-     body = File.read(path)
-     self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
+     self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
    end

  end
data/lib/grubby/scraper.rb CHANGED
@@ -56,7 +56,7 @@ class Grubby::Scraper
    # @return [void]
    def self.scrapes(field, **options, &block)
      field = field.to_sym
-     self.fields << field
+     (self.fields << field).uniq!

      define_method(field) do
        raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
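`Scraper.fields` is the class-level list of everything declared with `scrapes`; appending and then calling `uniq!` means re-declaring a field redefines its method without duplicating its entry, per the changelog. A sketch:

```ruby
class ItemScraper < Grubby::Scraper
  scrapes(:title){ source.at!("h1").text }
  scrapes(:title){ source.at!("h1").text.strip }  # redefinition: the later block wins
end

ItemScraper.fields  # => [:title]  (was [:title, :title] before 1.2.1)
```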
@@ -127,10 +127,10 @@ class Grubby::Scraper
      self.new(agent.get(url))
    end

-   # Iterates a series of pages, starting at +start_url+. For each page,
-   # the Scraper class is instantiated and passed to the given block.
-   # Subsequent pages in the series are determined by invoking
-   # +next_method+ on each previous scraper instance.
+   # Iterates a series of pages, starting at +start+. The Scraper class
+   # is instantiated with each page, and each instance is passed to the
+   # given block. Subsequent pages in the series are determined by
+   # invoking the +next_method+ method on each previous scraper instance.
    #
    # Iteration stops when the +next_method+ method returns nil. If the
    # +next_method+ method returns a String or URI, that value will be
@@ -163,7 +163,7 @@ class Grubby::Scraper
    #     scraper.page_param  # == "1", "2", "3", ...
    #   end
    #
-   # @param start_url [String, URI]
+   # @param start [String, URI, Mechanize::Page, Mechanize::File]
    # @param agent [Mechanize]
    # @param next_method [Symbol]
    # @yield [scraper]
@@ -171,14 +171,14 @@ class Grubby::Scraper
    # @return [void]
    # @raise [NoMethodError]
    #   if Scraper class does not implement +next_method+
-   def self.each(start_url, agent = $grubby, next_method: :next)
+   def self.each(start, agent = $grubby, next_method: :next)
      unless self.method_defined?(next_method)
        raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
      end

-     return to_enum(:each, start_url, agent, next_method: next_method) unless block_given?
+     return to_enum(:each, start, agent, next_method: next_method) unless block_given?

-     current = start_url
+     current = start
      while current
        current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
        scraper = self.new(current)
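Renaming `start_url` to `start` goes with the widened `@param` type above: iteration can now begin from an already-fetched page as well as from a String or URI, since only those two types are passed through `agent.get`. A usage sketch in the spirit of the doc comment's example (`PostsIndexScraper` and its `next` implementation are illustrative):

```ruby
class PostsIndexScraper < Grubby::PageScraper
  scrapes(:posts){ page.search!(".post") }

  def next
    page.link_with(text: "Next")&.click  # nil stops the iteration
  end
end

grubby = Grubby.new

# Start from a URL string...
PostsIndexScraper.each("https://example.com/posts", grubby){|scraper| scraper.posts }

# ...or from a page that has already been fetched.
start_page = grubby.get("https://example.com/posts?page=3")
PostsIndexScraper.each(start_page, grubby){|scraper| scraper.posts }
```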
data/lib/grubby/version.rb CHANGED
@@ -1 +1 @@
- GRUBBY_VERSION = "1.2.0"
+ GRUBBY_VERSION = "1.2.1"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: grubby
  version: !ruby/object:Gem::Version
-   version: 1.2.0
+   version: 1.2.1
  platform: ruby
  authors:
  - Jonathan Hefner
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-07-06 00:00:00.000000000 Z
+ date: 2019-08-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: activesupport
@@ -166,7 +166,7 @@ dependencies:
          version: '0.9'
  description:
  email:
- - jonathan.hefner@gmail.com
+ - jonathan@hefner.pro
  executables: []
  extensions: []
  extra_rdoc_files: []