grubby 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +2 -4
- data/grubby.gemspec +1 -1
- data/lib/grubby.rb +10 -4
- data/lib/grubby/core_ext/uri.rb +9 -11
- data/lib/grubby/json_parser.rb +11 -12
- data/lib/grubby/json_scraper.rb +3 -4
- data/lib/grubby/mechanize/file.rb +6 -0
- data/lib/grubby/page_scraper.rb +1 -3
- data/lib/grubby/scraper.rb +9 -9
- data/lib/grubby/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91cb5fb76be040dc0a6b86c7dd5513e7dfa79327e68b6f15da6ed41df1492740
|
4
|
+
data.tar.gz: d96e1a83f6ebc93c09403bc66ee3251132bbdabeb40379aa081dbece2c978b98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e10fa8ae3b183fa600a26af1ff87e0e340e63cfdeec9369c1f9987ace143591b9c33b1edfed980b841ffea5806f96332b1b32e117551b714dcd3b66cff5a8da
|
7
|
+
data.tar.gz: 63985a6d1d39a1ac224eb1aca676f3266b911059e7ab5e838a535dd14e6249d2bbc1d41b59a35101e17983930ebd7ab258a6ce39375a300bcf1725a0e79b72c1
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 1.2.1
|
2
|
+
|
3
|
+
* Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
|
4
|
+
* Ensure time spent fetching a response does not count toward the time
|
5
|
+
to sleep between requests
|
6
|
+
* Prevent sleep between requests when following a redirect
|
7
|
+
* Prevent duplicates in `Scraper.fields`
|
8
|
+
* Fix `URI#query_param` when query is nil
|
9
|
+
* Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
|
10
|
+
contains characters that need to be URI-encoded
|
11
|
+
|
12
|
+
|
1
13
|
## 1.2.0
|
2
14
|
|
3
15
|
* Add `Grubby#journal=`
|
data/README.md
CHANGED
@@ -17,17 +17,15 @@ The following example scrapes stories from the [Hacker News] front page:
|
|
17
17
|
require "grubby"
|
18
18
|
|
19
19
|
class HackerNews < Grubby::PageScraper
|
20
|
-
|
21
20
|
scrapes(:items) do
|
22
21
|
page.search!(".athing").map{|el| Item.new(el) }
|
23
22
|
end
|
24
23
|
|
25
24
|
class Item < Grubby::Scraper
|
26
25
|
scrapes(:story_link){ source.at!("a.storylink") }
|
27
|
-
scrapes(:story_uri)
|
28
|
-
scrapes(:title)
|
26
|
+
scrapes(:story_uri){ story_link.uri }
|
27
|
+
scrapes(:title){ story_link.text }
|
29
28
|
end
|
30
|
-
|
31
29
|
end
|
32
30
|
|
33
31
|
# The following line will raise an exception if anything goes wrong
|
data/grubby.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = "grubby"
|
8
8
|
spec.version = GRUBBY_VERSION
|
9
9
|
spec.authors = ["Jonathan Hefner"]
|
10
|
-
spec.email = ["jonathan
|
10
|
+
spec.email = ["jonathan@hefner.pro"]
|
11
11
|
|
12
12
|
spec.summary = %q{Fail-fast web scraping}
|
13
13
|
spec.homepage = "https://github.com/jonathanhefner/grubby"
|
data/lib/grubby.rb
CHANGED
@@ -65,6 +65,9 @@ class Grubby < Mechanize
|
|
65
65
|
# Set up configurable rate limiting, and choose a reasonable default
|
66
66
|
# rate limit.
|
67
67
|
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
68
|
+
self.post_connect_hooks << Proc.new do |agent, uri, response, body|
|
69
|
+
self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
|
70
|
+
end
|
68
71
|
self.time_between_requests = 1.0
|
69
72
|
|
70
73
|
self.journal = journal
|
@@ -81,9 +84,9 @@ class Grubby < Mechanize
|
|
81
84
|
@journal = path&.to_pathname&.touch_file
|
82
85
|
@seen = if @journal
|
83
86
|
require "csv"
|
84
|
-
CSV.read(@journal).map{|row| SingletonKey.new(*row) }.
|
87
|
+
CSV.read(@journal).map{|row| SingletonKey.new(*row) }.to_set
|
85
88
|
else
|
86
|
-
|
89
|
+
Set.new
|
87
90
|
end
|
88
91
|
@journal
|
89
92
|
end
|
@@ -202,7 +205,7 @@ class Grubby < Mechanize
|
|
202
205
|
|
203
206
|
def try_skip_singleton(target, purpose, series)
|
204
207
|
series << SingletonKey.new(purpose, target.to_s)
|
205
|
-
if series.uniq!.nil? &&
|
208
|
+
if series.uniq!.nil? && !@seen.add?(series.last)
|
206
209
|
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
207
210
|
$log.info("Skip #{series.first.target} (#{seen_info})")
|
208
211
|
true
|
@@ -223,7 +226,10 @@ class Grubby < Mechanize
|
|
223
226
|
rand(time_between_requests) : time_between_requests
|
224
227
|
sleep_duration = @last_request_at + delay_duration - Time.now.to_f
|
225
228
|
sleep(sleep_duration) if sleep_duration > 0
|
226
|
-
|
229
|
+
end
|
230
|
+
|
231
|
+
def mark_last_request_time(time)
|
232
|
+
@last_request_at = time.to_f
|
227
233
|
end
|
228
234
|
|
229
235
|
end
|
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -12,13 +12,12 @@ module URI
|
|
12
12
|
self.path == "/" ? "" : ::File.basename(self.path)
|
13
13
|
end
|
14
14
|
|
15
|
-
# Returns the value of the specified param in the URI's
|
16
|
-
# The specified
|
17
|
-
# string, and support for complex nested values is limited.
|
18
|
-
# +CGI.parse+ for parsing behavior.) If
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# occurrence of that param in the query string.
|
15
|
+
# Returns the value of the specified query param in the URI's query
|
16
|
+
# string. The specified +name+ must be *exactly* as it appears in the
|
17
|
+
# query string, and support for complex nested values is limited.
|
18
|
+
# (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+,
|
19
|
+
# all occurrences of the query param are returned as an Array.
|
20
|
+
# Otherwise, only the last occurrence is returned.
|
22
21
|
#
|
23
22
|
# @example
|
24
23
|
# URI("http://example.com/?foo=a").query_param("foo") # == "a"
|
@@ -32,11 +31,10 @@ module URI
|
|
32
31
|
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
|
33
32
|
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
|
34
33
|
#
|
35
|
-
# @
|
36
|
-
# @return [Array<String>, nil]
|
37
|
-
# if +name+ contains +"[]"+
|
34
|
+
# @param name [String]
|
35
|
+
# @return [String, Array<String>, nil]
|
38
36
|
def query_param(name)
|
39
|
-
values = CGI.parse(self.query)[name.
|
37
|
+
values = CGI.parse(self.query)[name] if self.query
|
40
38
|
(values.nil? || name.include?("[]")) ? values : values.last
|
41
39
|
end
|
42
40
|
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -5,19 +5,12 @@ class Grubby::JsonParser < Mechanize::File
|
|
5
5
|
# will be applied to all future parsing.
|
6
6
|
#
|
7
7
|
# For information about available options, see
|
8
|
-
# {
|
8
|
+
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
9
9
|
# +JSON.parse+}.
|
10
10
|
#
|
11
11
|
# @return [Hash]
|
12
12
|
def self.json_parse_options
|
13
|
-
@json_parse_options ||=
|
14
|
-
max_nesting: false,
|
15
|
-
allow_nan: false,
|
16
|
-
symbolize_names: false,
|
17
|
-
create_additions: false,
|
18
|
-
object_class: Hash,
|
19
|
-
array_class: Array,
|
20
|
-
}
|
13
|
+
@json_parse_options ||= JSON.load_default_options.merge(create_additions: false)
|
21
14
|
end
|
22
15
|
|
23
16
|
# Sets the options to use when parsing JSON. The entire options Hash
|
@@ -25,7 +18,7 @@ class Grubby::JsonParser < Mechanize::File
|
|
25
18
|
# parsing. To set options individually, see {json_parse_options}.
|
26
19
|
#
|
27
20
|
# For information about available options, see
|
28
|
-
# {
|
21
|
+
# {https://docs.ruby-lang.org/en/trunk/JSON.html#method-i-parse
|
29
22
|
# +JSON.parse+}.
|
30
23
|
#
|
31
24
|
# @param options [Hash]
|
@@ -38,9 +31,15 @@ class Grubby::JsonParser < Mechanize::File
|
|
38
31
|
# @return [Hash, Array]
|
39
32
|
attr_reader :json
|
40
33
|
|
41
|
-
|
34
|
+
# The Mechanize agent used to make the request.
|
35
|
+
#
|
36
|
+
# @return [Mechanize, nil]
|
37
|
+
attr_accessor :mech
|
38
|
+
|
39
|
+
def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
|
42
40
|
@json = body.presence && JSON.parse(body, self.class.json_parse_options)
|
43
|
-
|
41
|
+
@mech = mech
|
42
|
+
super(uri, response, body, code)
|
44
43
|
end
|
45
44
|
|
46
45
|
end
|
data/lib/grubby/json_scraper.rb
CHANGED
@@ -22,11 +22,10 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
22
22
|
# MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
|
23
23
|
#
|
24
24
|
# @param path [String]
|
25
|
+
# @param agent [Mechanize]
|
25
26
|
# @return [Grubby::JsonScraper]
|
26
|
-
def self.scrape_file(path)
|
27
|
-
|
28
|
-
body = File.read(path)
|
29
|
-
self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
|
27
|
+
def self.scrape_file(path, agent = $grubby)
|
28
|
+
self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
|
30
29
|
end
|
31
30
|
|
32
31
|
end
|
data/lib/grubby/mechanize/file.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
class Mechanize::File
|
2
2
|
|
3
|
+
# @!visibility private
|
4
|
+
def self.read_local(path)
|
5
|
+
uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) }
|
6
|
+
self.new(URI::File.build(path: uri_path), nil, File.read(path), "200")
|
7
|
+
end
|
8
|
+
|
3
9
|
# @!visibility private
|
4
10
|
def content_hash
|
5
11
|
@content_hash ||= self.body.to_s.sha1
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -25,9 +25,7 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
25
25
|
# @param agent [Mechanize]
|
26
26
|
# @return [Grubby::PageScraper]
|
27
27
|
def self.scrape_file(path, agent = $grubby)
|
28
|
-
|
29
|
-
body = File.read(path)
|
30
|
-
self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
|
28
|
+
self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent })
|
31
29
|
end
|
32
30
|
|
33
31
|
end
|
data/lib/grubby/scraper.rb
CHANGED
@@ -56,7 +56,7 @@ class Grubby::Scraper
|
|
56
56
|
# @return [void]
|
57
57
|
def self.scrapes(field, **options, &block)
|
58
58
|
field = field.to_sym
|
59
|
-
self.fields << field
|
59
|
+
(self.fields << field).uniq!
|
60
60
|
|
61
61
|
define_method(field) do
|
62
62
|
raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
|
@@ -127,10 +127,10 @@ class Grubby::Scraper
|
|
127
127
|
self.new(agent.get(url))
|
128
128
|
end
|
129
129
|
|
130
|
-
# Iterates a series of pages, starting at +
|
131
|
-
#
|
132
|
-
# Subsequent pages in the series are determined by
|
133
|
-
# +next_method+ on each previous scraper instance.
|
130
|
+
# Iterates a series of pages, starting at +start+. The Scraper class
|
131
|
+
# is instantiated with each page, and each instance is passed to the
|
132
|
+
# given block. Subsequent pages in the series are determined by
|
133
|
+
# invoking the +next_method+ method on each previous scraper instance.
|
134
134
|
#
|
135
135
|
# Iteration stops when the +next_method+ method returns nil. If the
|
136
136
|
# +next_method+ method returns a String or URI, that value will be
|
@@ -163,7 +163,7 @@ class Grubby::Scraper
|
|
163
163
|
# scraper.page_param # == "1", "2", "3", ...
|
164
164
|
# end
|
165
165
|
#
|
166
|
-
# @param
|
166
|
+
# @param start [String, URI, Mechanize::Page, Mechanize::File]
|
167
167
|
# @param agent [Mechanize]
|
168
168
|
# @param next_method [Symbol]
|
169
169
|
# @yield [scraper]
|
@@ -171,14 +171,14 @@ class Grubby::Scraper
|
|
171
171
|
# @return [void]
|
172
172
|
# @raise [NoMethodError]
|
173
173
|
# if Scraper class does not implement +next_method+
|
174
|
-
def self.each(
|
174
|
+
def self.each(start, agent = $grubby, next_method: :next)
|
175
175
|
unless self.method_defined?(next_method)
|
176
176
|
raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`"
|
177
177
|
end
|
178
178
|
|
179
|
-
return to_enum(:each,
|
179
|
+
return to_enum(:each, start, agent, next_method: next_method) unless block_given?
|
180
180
|
|
181
|
-
current =
|
181
|
+
current = start
|
182
182
|
while current
|
183
183
|
current = agent.get(current) if current.is_a?(String) || current.is_a?(URI)
|
184
184
|
scraper = self.new(current)
|
data/lib/grubby/version.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
GRUBBY_VERSION = "1.2.0"
|
1
|
+
GRUBBY_VERSION = "1.2.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grubby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hefner
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -166,7 +166,7 @@ dependencies:
|
|
166
166
|
version: '0.9'
|
167
167
|
description:
|
168
168
|
email:
|
169
|
-
- jonathan
|
169
|
+
- jonathan@hefner.pro
|
170
170
|
executables: []
|
171
171
|
extensions: []
|
172
172
|
extra_rdoc_files: []
|