grubby 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +2 -4
- data/grubby.gemspec +1 -1
- data/lib/grubby.rb +10 -4
- data/lib/grubby/core_ext/uri.rb +9 -11
- data/lib/grubby/json_parser.rb +11 -12
- data/lib/grubby/json_scraper.rb +3 -4
- data/lib/grubby/mechanize/file.rb +6 -0
- data/lib/grubby/page_scraper.rb +1 -3
- data/lib/grubby/scraper.rb +9 -9
- data/lib/grubby/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
|
4
|
+
data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
|
7
|
+
data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 1.2.1
|
2
|
+
|
3
|
+
* Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
|
4
|
+
* Ensure time spent fetching a response does not count toward the time
|
5
|
+
to sleep between requests
|
6
|
+
* Prevent sleep between requests when following a redirect
|
7
|
+
* Prevent duplicates in `Scraper.fields`
|
8
|
+
* Fix `URI#query_param` when query is nil
|
9
|
+
* Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
|
10
|
+
contains characters that need to be URI-encoded
|
11
|
+
|
12
|
+
|
1
13
|
## 1.2.0
|
2
14
|
|
3
15
|
* Add `Grubby#journal=`
|
data/README.md
CHANGED
@@ -17,17 +17,15 @@ The following example scrapes stories from the [Hacker News] front page:
|
|
17
17
|
require "grubby"
|
18
18
|
|
19
19
|
class HackerNews < Grubby::PageScraper
|
20
|
-
|
21
20
|
scrapes(:items) do
|
22
21
|
page.search!(".athing").map{|el| Item.new(el) }
|
23
22
|
end
|
24
23
|
|
25
24
|
class Item < Grubby::Scraper
|
26
25
|
scrapes(:story_link){ source.at!("a.storylink") }
|
27
|
-
scrapes(:story_uri)
|
28
|
-
scrapes(:title)
|
26
|
+
scrapes(:story_uri){ story_link.uri }
|
27
|
+
scrapes(:title){ story_link.text }
|
29
28
|
end
|
30
|
-
|
31
29
|
end
|
32
30
|
|
33
31
|
# The following line will raise an exception if anything goes wrong
|
data/grubby.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = "grubby"
|
8
8
|
spec.version = GRUBBY_VERSION
|
9
9
|
spec.authors = ["Jonathan Hefner"]
|
10
|
-
spec.email = ["jonathan
|
10
|
+
spec.email = ["jonathan@hefner.pro"]
|
11
11
|
|
12
12
|
spec.summary = %q{Fail-fast web scraping}
|
13
13
|
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
data/lib/grubby.rb
CHANGED
@@ -65,6 +65,9 @@ class Grubby < Mechanize
|
|
65
65
|
# Set up configurable rate limiting, and choose a reasonable default
|
66
66
|
# rate limit.
|
67
67
|
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
68
|
+
self.post_connect_hooks << Proc.new do |agent, uri, response, body|
|
69
|
+
self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
|
70
|
+
end
|
68
71
|
self.time_between_requests = 1.0
|
69
72
|
|
70
73
|
self.journal = journal
|
@@ -81,9 +84,9 @@ class Grubby < Mechanize
|
|
81
84
|
@journal = path&.to_pathname&.touch_file
|
82
85
|
@seen = if @journal
|
83
86
|
require "csv"
|
84
|
-
CSV.read(@journal).map{|row| SingletonKey.new(*row) }.
|
87
|
+
CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
|
85
88
|
else
|
86
|
-
|
89
|
+
Set.new
|
87
90
|
end
|
88
91
|
@journal
|
89
92
|
end
|
@@ -202,7 +205,7 @@ class Grubby < Mechanize
|
|
202
205
|
|
203
206
|
def try_skip_singleton(target, purpose, series)
|
204
207
|
series << SingletonKey.new(purpose, target.to_s)
|
205
|
-
if series.uniq!.nil? &&
|
208
|
+
if series.uniq!.nil? && !@seen.add?(series.last)
|
206
209
|
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
207
210
|
$log.info("Skip #{series.first.target} (#{seen_info})")
|
208
211
|
true
|
@@ -223,7 +226,10 @@ class Grubby < Mechanize
|
|
223
226
|
rand(time_between_requests) : time_between_requests
|
224
227
|
sleep_duration = @last_request_at + delay_duration - Time.now.to_f
|
225
228
|
sleep(sleep_duration) if sleep_duration > 0
|
226
|
-
|
229
|
+
end
|
230
|
+
|
231
|
+
def mark_last_request_time(time)
|
232
|
+
@last_request_at = time.to_f
|
227
233
|
end
|
228
234
|
|
229
235
|
end
|
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -12,13 +12,12 @@ module URI
|
|
12
12
|
self.path == "/" ? "" : ::File.basename(self.path)
|
13
13
|
end
|
14
14
|
|
15
|
-
# Returns the value of the specified param in the URI's
|
16
|
-
# The specified
|
17
|
-
# string, and support for complex nested values is limited.
|
18
|
-
# +CGI.parse+ for parsing behavior.) If
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# occurrence of that param in the query string.
|
15
|
+
# Returns the value of the specified query param in the URI's query
|
16
|
+
# string. The specified +name+ must be *exactly* as it appears in the
|
17
|
+
# query string, and support for complex nested values is limited.
|
18
|
+
# (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+,
|
19
|
+
# all occurrences of the query param are returned as an Array.
|
20
|
+
# Otherwise, only the last occurrence is returned.
|
22
21
|
#
|
23
22
|
# @example
|
24
23
|
# URI("http://example.com/?foo=a").query_param("foo") # == "a"
|
@@ -32,11 +31,10 @@ module URI
|
|
32
31
|
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
|
33
32
|
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
|
34
33
|
#
|
35
|
-
# @
|
36
|
-
# @return [Array<String>, nil]
|
37
|
-
# if +name+ contains +"[]"+
|
34
|
+
# @param name [String]
|
35
|
+
# @return [String, Array<String>, nil]
|
38
36
|
def query_param(name)
|
39
|
-
values = CGI.parse(self.query)[name.
|
37
|
+
values = CGI.parse(self.query)[name] if self.query
|
40
38
|
(values.nil? || name.include?("[]")) ? values : values.last
|
41
39
|
end
|
42
40
|
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -5,19 +5,12 @@ class Grubby::JsonParser < Mechanize::File
|
|
5
5
|
# will be applied to all future parsing.
|
6
6
|
#
|
7
7
|
# For information about available options, see
|
8
|
-
# {
|
8
|
+
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
9
9
|
# +JSON.parse+}.
|
10
10
|
#
|
11
11
|
# @return [Hash]
|
12
12
|
def self.json_parse_options
|
13
|
-
@json_parse_options ||=
|
14
|
-
max_nesting: false,
|
15
|
-
allow_nan: false,
|
16
|
-
symbolize_names: false,
|
17
|
-
create_additions: false,
|
18
|
-
object_class: Hash,
|
19
|
-
array_class: Array,
|
20
|
-
}
|
13
|
+
@json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
|
21
14
|
end
|
22
15
|
|
23
16
|
# Sets the options to use when parsing JSON. The entire options Hash
|
@@ -25,7 +18,7 @@ class Grubby::JsonParser < Mechanize::File
|
|
25
18
|
# parsing. To set options individually, see {json_parse_options}.
|
26
19
|
#
|
27
20
|
# For information about available options, see
|
28
|
-
# {
|
21
|
+
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
29
22
|
# +JSON.parse+}.
|
30
23
|
#
|
31
24
|
# @param options [Hash]
|
@@ -38,9 +31,15 @@ class Grubby::JsonParser < Mechanize::File
|
|
38
31
|
# @return [Hash, Array]
|
39
32
|
attr_reader :json
|
40
33
|
|
41
|
-
|
34
|
+
# The Mechanize agent used to make the request.
|
35
|
+
#
|
36
|
+
# @return [Mechanize, nil]
|
37
|
+
attr_accessor :mech
|
38
|
+
|
39
|
+
def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
|
42
40
|
@json = body.presence && JSON.parse(body, self.class.json_parse_options)
|
43
|
-
|
41
|
+
@mech = mech
|
42
|
+
super(uri, response, body, code)
|
44
43
|
end
|
45
44
|
|
46
45
|
end
|
data/lib/grubby/json_scraper.rb
CHANGED
@@ -22,11 +22,10 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
22
22
|
# MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
|
23
23
|
#
|
24
24
|
# @param path [String]
|
25
|
+
# @param agent [Mechanize]
|
25
26
|
# @return [Grubby::JsonScraper]
|
26
|
-
def self.scrape_file(path)
|
27
|
-
|
28
|
-
body = File.read(path)
|
29
|
-
self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
|
27
|
+
def self.scrape_file(path, agent = $grubby)
|
28
|
+
self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
|
30
29
|
end
|
31
30
|
|
32
31
|
end
|
data/lib/grubby/mechanize/file.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
class Mechanize::File
|
2
2
|
|
3
|
+
# @!visibility private
|
4
|
+
def self.read_local(path)
|
5
|
+
uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
|
6
|
+
self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
|
7
|
+
end
|
8
|
+
|
3
9
|
# @!visibility private
|
4
10
|
def content_hash
|
5
11
|
@content_hash ||= self.body.to_s.sha1
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -25,9 +25,7 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
25
25
|
# @param agent [Mechanize]
|
26
26
|
# @return [Grubby::PageScraper]
|
27
27
|
def self.scrape_file(path, agent = $grubby)
|
28
|
-
|
29
|
-
body = File.read(path)
|
30
|
-
self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
|
28
|
+
self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
|
31
29
|
end
|
32
30
|
|
33
31
|
end
|
data/lib/grubby/scraper.rb
CHANGED
@@ -56,7 +56,7 @@ class Grubby::Scraper
|
|
56
56
|
# @return [void]
|
57
57
|
def self.scrapes(field, **options, &block)
|
58
58
|
field = field.to_sym
|
59
|
-
self.fields << field
|
59
|
+
(self.fields << field).uniq!
|
60
60
|
|
61
61
|
define_method(field) do
|
62
62
|
raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
|
@@ -127,10 +127,10 @@ class Grubby::Scraper
|
|
127
127
|
self.new(agent.get(url))
|
128
128
|
end
|
129
129
|
|
130
|
-
# Iterates a series of pages, starting at +
|
131
|
-
#
|
132
|
-
# Subsequent pages in the series are determined by
|
133
|
-
# +next_method+ on each previous scraper instance.
|
130
|
+
# Iterates a series of pages, starting at +start+. The Scraper class
|
131
|
+
# is instantiated with each page, and each instance is passed to the
|
132
|
+
# given block. Subsequent pages in the series are determined by
|
133
|
+
# invoking the +next_method+ method on each previous scraper instance.
|
134
134
|
#
|
135
135
|
# Iteration stops when the +next_method+ method returns nil. If the
|
136
136
|
# +next_method+ method returns a String or URI, that value will be
|
@@ -163,7 +163,7 @@ class Grubby::Scraper
|
|
163
163
|
# scraper.page_param # == "1", "2", "3", ...
|
164
164
|
# end
|
165
165
|
#
|
166
|
-
# @param
|
166
|
+
# @param start [String, URI, Mechanize::Page, Mechanize::File]
|
167
167
|
# @param agent [Mechanize]
|
168
168
|
# @param next_method [Symbol]
|
169
169
|
# @yield [scraper]
|
@@ -171,14 +171,14 @@ class Grubby::Scraper
|
|
171
171
|
# @return [void]
|
172
172
|
# @raise [NoMethodError]
|
173
173
|
# if Scraper class does not implement +next_method+
|
174
|
-
def self.each(
|
174
|
+
def self.each(start, agent = $grubby, next_method: :next)
|
175
175
|
unless self.method_defined?(next_method)
|
176
176
|
raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
|
177
177
|
end
|
178
178
|
|
179
|
-
return to_enum(:each,
|
179
|
+
return to_enum(:each, start, agent, next_method: next_method) unless block_given?
|
180
180
|
|
181
|
-
current =
|
181
|
+
current = start
|
182
182
|
while current
|
183
183
|
current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
|
184
184
|
scraper = self.new(current)
|
data/lib/grubby/version.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
GRUBBY_VERSION = "1.2.0"
|
1
|
+
GRUBBY_VERSION = "1.2.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grubby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hefner
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -166,7 +166,7 @@ dependencies:
|
|
166
166
|
version: '0.9'
|
167
167
|
description:
|
168
168
|
email:
|
169
|
-
- jonathan
|
169
|
+
- jonathan@hefner.pro
|
170
170
|
executables: []
|
171
171
|
extensions: []
|
172
172
|
extra_rdoc_files: []
|