grubby 1.2.0 → 1.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 84d759cf7187c8502b42e9d7828f59f126bb87af8da524e9d8e6f6ad8a64f467
-   data.tar.gz: bf26cca3991fca00e573f51f28a1c457e063e4f419986971f1429f051f2e3155
+   metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
+   data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
  SHA512:
-   metadata.gz: 38b8f7818be985da5c48484b8a3f42a40401b4890e46da93c2565c546654a660537cf15303e1106bdca201d1ea8e7ff90e13ab13dcb652997b0acc9becc01b48
-   data.tar.gz: e3c8b063d275ebf49dc50c5a70fa82cb0f9e517f17cc9e3735557a2fe998d5ea82a3ea0932ad8a6ecec630f4f66c8d62443c76497f6e98e4f202a72df988095e
+   metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
+   data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## 1.2.1
+
+ * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
+ * Ensure time spent fetching a response does not count toward the time
+   to sleep between requests
+ * Prevent sleep between requests when following a redirect
+ * Prevent duplicates in `Scraper.fields`
+ * Fix `URI#query_param` when query is nil
+ * Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
+   contains characters that need to be URI-encoded
+
+
  ## 1.2.0

  * Add `Grubby#journal=`
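
The first changelog entry above can be pictured with a short sketch. This assumes the response is parsed as a `Grubby::JsonParser` (the URL is hypothetical):

    grubby = Grubby.new
    json = grubby.get("http://example.com/data.json")  # => Grubby::JsonParser
    json.mech  # => the Grubby agent, mirroring Mechanize::Page#mech
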
data/README.md CHANGED
@@ -17,17 +17,15 @@ The following example scrapes stories from the [Hacker News] front page:
  require "grubby"

  class HackerNews < Grubby::PageScraper
-
    scrapes(:items) do
      page.search!(".athing").map{|el| Item.new(el) }
    end

    class Item < Grubby::Scraper
      scrapes(:story_link){ source.at!("a.storylink") }
-     scrapes(:story_uri) { story_link.uri }
-     scrapes(:title) { story_link.text }
+     scrapes(:story_uri){ story_link.uri }
+     scrapes(:title){ story_link.text }
    end
-
  end

  # The following line will raise an exception if anything goes wrong
data/grubby.gemspec CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
  spec.name = "grubby"
  spec.version = GRUBBY_VERSION
  spec.authors = ["Jonathan Hefner"]
- spec.email = ["jonathan.hefner@gmail.com"]
+ spec.email = ["jonathan@hefner.pro"]

  spec.summary = %q{Fail-fast web scraping}
  spec.homepage = "https://github.com/jonathanhefner/grubby"
data/lib/grubby.rb CHANGED
@@ -65,6 +65,9 @@ class Grubby < Mechanize
    # Set up configurable rate limiting, and choose a reasonable default
    # rate limit.
    self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
+   self.post_connect_hooks << Proc.new do |agent, uri, response, body|
+     self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
+   end
    self.time_between_requests = 1.0

    self.journal = journal
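
The new post-connect hook records when a response finished arriving rather than when the next request starts, so time spent fetching no longer counts toward the delay; and because it passes nil for 3xx responses, no delay applies before a redirect is followed. A minimal sketch of configuring the rate limit (values are illustrative; judging by the `rand(...)` call in `sleep_between_requests` further below, a Range appears to be accepted in addition to a number):

    grubby = Grubby.new
    grubby.time_between_requests = 2.0        # fixed two-second delay
    grubby.time_between_requests = 1.0..3.0   # or a random delay drawn from a Range
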
@@ -81,9 +84,9 @@ class Grubby < Mechanize
    @journal = path&.to_pathname&.touch_file
    @seen = if @journal
      require "csv"
-     CSV.read(@journal).map{|row| SingletonKey.new(*row) }.index_to{ true }
+     CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
    else
-     {}
+     Set.new
    end
    @journal
  end
@@ -202,7 +205,7 @@ class Grubby < Mechanize
  def try_skip_singleton(target, purpose, series)
    series << SingletonKey.new(purpose, target.to_s)
-   if series.uniq!.nil? && @seen.displace(series.last, true)
+   if series.uniq!.nil? && !@seen.add?(series.last)
      seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
      $log.info("Skip #{series.first.target} (#{seen_info})")
      true
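
`Set#add?` returns nil when the element is already in the set, so `!@seen.add?(series.last)` both registers the key and reports whether it was already seen, replacing the Hash-based `displace` call now that `@seen` is a Set. In plain Ruby:

    require "set"
    seen = Set.new
    !seen.add?("key")  # => false  (first occurrence: key is added)
    !seen.add?("key")  # => true   (already present: skip as seen)
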
@@ -223,7 +226,10 @@ class Grubby < Mechanize
      rand(time_between_requests) : time_between_requests
    sleep_duration = @last_request_at + delay_duration - Time.now.to_f
    sleep(sleep_duration) if sleep_duration > 0
-   @last_request_at = Time.now.to_f
+ end
+
+ def mark_last_request_time(time)
+   @last_request_at = time.to_f
  end

end
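
Note how this fits with the post-connect hook added above: the hook passes `Time.now` for normal responses but nil for 3xx responses, and `nil.to_f` is `0.0`, so after a redirect the computed `sleep_duration` comes out negative and the sleep is skipped. Checking that arithmetic in plain Ruby:

    nil.to_f                                    # => 0.0
    last_request_at = nil.to_f                  # a redirect was just received
    sleep_duration = last_request_at + 1.0 - Time.now.to_f
    sleep_duration > 0                          # => false, so no sleep occurs
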
data/lib/grubby/core_ext/uri.rb CHANGED
@@ -12,13 +12,12 @@ module URI
    self.path == "/" ? "" : ::File.basename(self.path)
  end

- # Returns the value of the specified param in the URI's +query+.
- # The specified param name must be exactly as it appears in the query
- # string, and support for complex nested values is limited. (See
- # +CGI.parse+ for parsing behavior.) If the param name includes a
- # +"[]"+, the result will be an array of all occurrences of that param
- # in the query string. Otherwise, the result will be the last
- # occurrence of that param in the query string.
+ # Returns the value of the specified query param in the URI's query
+ # string. The specified +name+ must be *exactly* as it appears in the
+ # query string, and support for complex nested values is limited.
+ # (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+,
+ # all occurrences of the query param are returned as an Array.
+ # Otherwise, only the last occurrence is returned.
  #
  # @example
  #   URI("http://example.com/?foo=a").query_param("foo") # == "a"
@@ -32,11 +31,10 @@ module URI
  #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
  #   URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
  #
- # @return [String, nil]
- # @return [Array<String>, nil]
- #   if +name+ contains +"[]"+
+ # @param name [String]
+ # @return [String, Array<String>, nil]
  def query_param(name)
-   values = CGI.parse(self.query)[name.to_s]
+   values = CGI.parse(self.query)[name] if self.query
    (values.nil? || name.include?("[]")) ? values : values.last
  end

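The added guard matters because `CGI.parse` cannot handle a nil query string; previously a URI with no query raised an error, whereas now `values` is simply nil. For example:

    URI("http://example.com/").query_param("foo")        # => nil (query is nil; previously raised)
    URI("http://example.com/?foo=a").query_param("foo")  # => "a"
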
data/lib/grubby/json_parser.rb CHANGED
@@ -5,19 +5,12 @@ class Grubby::JsonParser < Mechanize::File
  # will be applied to all future parsing.
  #
  # For information about available options, see
- # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+ # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
  # +JSON.parse+}.
  #
  # @return [Hash]
  def self.json_parse_options
-   @json_parse_options ||= {
-     max_nesting: false,
-     allow_nan: false,
-     symbolize_names: false,
-     create_additions: false,
-     object_class: Hash,
-     array_class: Array,
-   }
+   @json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
  end

  # Sets the options to use when parsing JSON. The entire options Hash
@@ -25,7 +18,7 @@ class Grubby::JsonParser < Mechanize::File
  # parsing. To set options individually, see {json_parse_options}.
  #
  # For information about available options, see
- # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+ # {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
  # +JSON.parse+}.
  #
  # @param options [Hash]
@@ -38,9 +31,15 @@ class Grubby::JsonParser < Mechanize::File
  # @return [Hash, Array]
  attr_reader :json

- def initialize(uri = nil, response = nil, body = nil, code = nil)
+ # The Mechanize agent used to make the request.
+ #
+ # @return [Mechanize, nil]
+ attr_accessor :mech
+
+ def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
    @json = body.presence && JSON.parse(body, self.class.json_parse_options)
-   super
+   @mech = mech
+   super(uri, response, body, code)
  end

end
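
Two details worth noting in this file. First, `json_parse_options` now derives its defaults from the JSON library instead of duplicating them, overriding only `create_additions` to keep custom-type deserialization disabled. Roughly (exact defaults vary by JSON gem version):

    require "json"
    JSON.load_default_options
      # e.g. {:max_nesting=>false, :allow_nan=>true, :allow_blank=>true, :create_additions=>true}
    JSON.load_default_options.merge(create_additions: false)
      # the same options with create_additions forced off

Second, `initialize` must now call `super` with explicit arguments: a bare `super` would forward the new `mech` parameter to `Mechanize::File#initialize`, which does not accept it.
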
data/lib/grubby/json_scraper.rb CHANGED
@@ -22,11 +22,10 @@ class Grubby::JsonScraper < Grubby::Scraper
  #   MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
  #
  # @param path [String]
+ # @param agent [Mechanize]
  # @return [Grubby::JsonScraper]
- def self.scrape_file(path)
-   uri = URI.join("file:///", File.expand_path(path))
-   body = File.read(path)
-   self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
+ def self.scrape_file(path, agent = $grubby)
+   self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
  end

end
data/lib/grubby/mechanize/file.rb CHANGED
@@ -1,5 +1,11 @@
  class Mechanize::File

+   # @!visibility private
+   def self.read_local(path)
+     uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
+     self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
+   end
+
    # @!visibility private
    def content_hash
      @content_hash ||= self.body.to_s.sha1
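
`read_local` escapes each path component so that characters like spaces and `#` produce a valid file:// URI, which is the fix behind the `scrape_file` changes in this release. A rough illustration of the escaping step, using a hypothetical path:

    require "cgi"
    path = "/data/spec files/page #1.html"
    path.gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
      # => "/data/spec+files/page+%231.html"
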
data/lib/grubby/page_scraper.rb CHANGED
@@ -25,9 +25,7 @@ class Grubby::PageScraper < Grubby::Scraper
  # @param agent [Mechanize]
  # @return [Grubby::PageScraper]
  def self.scrape_file(path, agent = $grubby)
-   uri = URI.join("file:///", File.expand_path(path))
-   body = File.read(path)
-   self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
+   self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
  end

end
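
A sketch of the resulting behavior, assuming a hypothetical `MyScraper < Grubby::PageScraper` and a local file whose path contains a space:

    scraper = MyScraper.scrape_file("path/to/local file.html")
    scraper.page.uri.scheme  # => "file" (a valid URI despite the space)
    scraper.page.mech        # => $grubby, or the agent passed as the second argument
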
data/lib/grubby/scraper.rb CHANGED
@@ -56,7 +56,7 @@ class Grubby::Scraper
  # @return [void]
  def self.scrapes(field, **options, &block)
    field = field.to_sym
-   self.fields << field
+   (self.fields << field).uniq!

    define_method(field) do
      raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
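
The `uniq!` prevents `fields` from accumulating duplicates when the same field is declared more than once, e.g. when a definition is overridden. A hypothetical illustration:

    class MyScraper < Grubby::PageScraper
      scrapes(:title){ page.title }
      scrapes(:title){ page.title.strip }  # redeclares the same field
    end

    MyScraper.fields  # previously [:title, :title]; now [:title]
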
@@ -127,10 +127,10 @@ class Grubby::Scraper
    self.new(agent.get(url))
  end

- # Iterates a series of pages, starting at +start_url+. For each page,
- # the Scraper class is instantiated and passed to the given block.
- # Subsequent pages in the series are determined by invoking
- # +next_method+ on each previous scraper instance.
+ # Iterates a series of pages, starting at +start+. The Scraper class
+ # is instantiated with each page, and each instance is passed to the
+ # given block. Subsequent pages in the series are determined by
+ # invoking the +next_method+ method on each previous scraper instance.
  #
  # Iteration stops when the +next_method+ method returns nil. If the
  # +next_method+ method returns a String or URI, that value will be
@@ -163,7 +163,7 @@ class Grubby::Scraper
  #   scraper.page_param # == "1", "2", "3", ...
  # end
  #
- # @param start_url [String, URI]
+ # @param start [String, URI, Mechanize::Page, Mechanize::File]
  # @param agent [Mechanize]
  # @param next_method [Symbol]
  # @yield [scraper]
@@ -171,14 +171,14 @@ class Grubby::Scraper
  # @return [void]
  # @raise [NoMethodError]
  #   if Scraper class does not implement +next_method+
- def self.each(start_url, agent = $grubby, next_method: :next)
+ def self.each(start, agent = $grubby, next_method: :next)
    unless self.method_defined?(next_method)
      raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
    end

-   return to_enum(:each, start_url, agent, next_method: next_method) unless block_given?
+   return to_enum(:each, start, agent, next_method: next_method) unless block_given?

-   current = start_url
+   current = start
    while current
      current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
      scraper = self.new(current)
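
The rename from `start_url` to `start` reflects that iteration can now begin from an already-fetched page object as well as from a URL. A usage sketch, assuming a hypothetical scraper class that defines `next`:

    # Start from a URL string...
    MyListScraper.each("http://example.com/results?page=1") do |scraper|
      puts scraper.items.length
    end

    # ...or from a page that has already been fetched
    MyListScraper.each($grubby.get("http://example.com/results?page=1")) do |scraper|
      puts scraper.items.length
    end
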
data/lib/grubby/version.rb CHANGED
@@ -1 +1 @@
- GRUBBY_VERSION = "1.2.0"
+ GRUBBY_VERSION = "1.2.1"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: grubby
  version: !ruby/object:Gem::Version
-   version: 1.2.0
+   version: 1.2.1
  platform: ruby
  authors:
  - Jonathan Hefner
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-07-06 00:00:00.000000000 Z
+ date: 2019-08-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: activesupport
@@ -166,7 +166,7 @@ dependencies:
      version: '0.9'
  description:
  email:
- - jonathan.hefner@gmail.com
+ - jonathan@hefner.pro
  executables: []
  extensions: []
  extra_rdoc_files: []