grubby 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG.md +17 -0
- data/README.md +61 -8
- data/Rakefile +0 -3
- data/grubby.gemspec +2 -1
- data/lib/grubby.rb +60 -38
- data/lib/grubby/core_ext/string.rb +0 -0
- data/lib/grubby/core_ext/uri.rb +40 -0
- data/lib/grubby/json_parser.rb +2 -1
- data/lib/grubby/json_scraper.rb +20 -1
- data/lib/grubby/log.rb +0 -0
- data/lib/grubby/mechanize/download.rb +0 -0
- data/lib/grubby/mechanize/fetch_with_retry.rb +5 -6
- data/lib/grubby/mechanize/file.rb +0 -0
- data/lib/grubby/mechanize/link.rb +0 -0
- data/lib/grubby/mechanize/page.rb +0 -0
- data/lib/grubby/mechanize/parser.rb +46 -0
- data/lib/grubby/page_scraper.rb +21 -1
- data/lib/grubby/scraper.rb +59 -20
- data/lib/grubby/version.rb +1 -3
- metadata +19 -4
- data/lib/grubby/nokogiri/searchable.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7528791ce5da4ca182e8258cf5bc8920345470ee76ee50de44cf89adac7ffec6
|
4
|
+
data.tar.gz: 3b3dad255ae1841583abb2c61345fbefffb268231906968619000b95005044be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 295c2957f708d86b596a4c062fcdf31d9c5083d26d15989de31feb174316ee17430e19d1746ad0f389d599560cc419e740835b1bfba6b0f57627e633f1a0ecf1
|
7
|
+
data.tar.gz: e8bc4ecb3ce277436be91ee4e8cf9c187c1f0bbf5ee170bc7a4e3f221f94d678e0e24d2f7dd878427c7b2b0e0b2fe1baa485556bdf21cd89e05a7ff222a9dc53
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
## 1.1.0
|
2
|
+
* Added `Grubby#ok?`.
|
3
|
+
* Added `Grubby::PageScraper.scrape_file` and `Grubby::JsonScraper.scrape_file`.
|
4
|
+
* Added `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
|
5
|
+
which are inherited by `Mechanize::Download` and `Mechanize::File`.
|
6
|
+
* Added `URI#basename`.
|
7
|
+
* Added `URI#query_param`.
|
8
|
+
* Added utility methods from [ryoba](https://rubygems.org/gems/ryoba).
|
9
|
+
* Added `Grubby::Scraper::Error#scraper` and `Grubby::Scraper#errors`
|
10
|
+
for interactive debugging with e.g. byebug.
|
11
|
+
* Improved log messages and error formatting.
|
12
|
+
* Fixed compatibility with net-http-persistent gem v3.0.
|
13
|
+
|
14
|
+
|
15
|
+
## 1.0.0
|
16
|
+
|
17
|
+
* Initial release
|
data/README.md
CHANGED
@@ -60,6 +60,7 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
|
60
60
|
|
61
61
|
- [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
|
62
62
|
- [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
63
|
+
- [#ok?](http://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
|
63
64
|
- [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
|
64
65
|
- [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
65
66
|
- [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
@@ -69,37 +70,89 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
|
69
70
|
- [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
|
70
71
|
- [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
71
72
|
- [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
73
|
+
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
|
72
74
|
- [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
73
75
|
- [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
76
|
+
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
74
77
|
- [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
75
|
-
-
|
76
|
-
- [#
|
77
|
-
- [#
|
78
|
+
- Mechanize::Download
|
79
|
+
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
80
|
+
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
81
|
+
- Mechanize::File
|
82
|
+
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
83
|
+
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
78
84
|
- Mechanize::Page
|
79
85
|
- [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
80
86
|
- [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
81
87
|
- Mechanize::Page::Link
|
82
88
|
- [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
|
89
|
+
- URI
|
90
|
+
- [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
|
91
|
+
- [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
|
83
92
|
|
84
93
|
|
85
94
|
## Supplemental API
|
86
95
|
|
87
|
-
*grubby*
|
88
|
-
convenience methods. When you
|
89
|
-
these methods available.
|
90
|
-
documentation
|
96
|
+
*grubby* includes several gems which extend Ruby objects with
|
97
|
+
convenience methods. When you load *grubby* you automatically make
|
98
|
+
these methods available. The included gems are listed below, along with
|
99
|
+
**a few** of the methods each provides. See each gem's documentation
|
100
|
+
for a complete API listing.
|
91
101
|
|
92
102
|
- [Active Support](https://rubygems.org/gems/activesupport)
|
93
103
|
([docs](http://www.rubydoc.info/gems/activesupport/))
|
104
|
+
- [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
105
|
+
- [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
|
106
|
+
- [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
|
107
|
+
- [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
|
108
|
+
- [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
|
109
|
+
- [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
|
94
110
|
- [casual_support](https://rubygems.org/gems/casual_support)
|
95
111
|
([docs](http://www.rubydoc.info/gems/casual_support/))
|
112
|
+
- [Enumerable#index_to](http://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
|
113
|
+
- [String#after](http://www.rubydoc.info/gems/casual_support/String:after)
|
114
|
+
- [String#after_last](http://www.rubydoc.info/gems/casual_support/String:after_last)
|
115
|
+
- [String#before](http://www.rubydoc.info/gems/casual_support/String:before)
|
116
|
+
- [String#before_last](http://www.rubydoc.info/gems/casual_support/String:before_last)
|
117
|
+
- [String#between](http://www.rubydoc.info/gems/casual_support/String:between)
|
118
|
+
- [Time#to_hms](http://www.rubydoc.info/gems/casual_support/Time:to_hms)
|
119
|
+
- [Time#to_ymd](http://www.rubydoc.info/gems/casual_support/Time:to_ymd)
|
96
120
|
- [gorge](https://rubygems.org/gems/gorge)
|
97
121
|
([docs](http://www.rubydoc.info/gems/gorge/))
|
122
|
+
- [Pathname#file_crc32](http://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
123
|
+
- [Pathname#file_md5](http://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
124
|
+
- [Pathname#file_sha1](http://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
125
|
+
- [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
|
126
|
+
- [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
|
127
|
+
- [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
|
98
128
|
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
99
129
|
([docs](http://www.rubydoc.info/gems/mini_sanity/))
|
130
|
+
- [Array#assert_length!](http://www.rubydoc.info/gems/mini_sanity/Array:assert_length%21)
|
131
|
+
- [Enumerable#refute_empty!](http://www.rubydoc.info/gems/mini_sanity/Enumerable:refute_empty%21)
|
132
|
+
- [Object#assert_equal!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_equal%21)
|
133
|
+
- [Object#assert_in!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_in%21)
|
134
|
+
- [Object#refute_nil!](http://www.rubydoc.info/gems/mini_sanity/Object:refute_nil%21)
|
135
|
+
- [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
|
136
|
+
- [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
|
100
137
|
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
101
138
|
([docs](http://www.rubydoc.info/gems/pleasant_path/))
|
102
|
-
|
139
|
+
- [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
|
140
|
+
- [Pathname#dirs_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs_r)
|
141
|
+
- [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
|
142
|
+
- [Pathname#files_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:files_r)
|
143
|
+
- [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
144
|
+
- [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
145
|
+
- [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
146
|
+
- [Pathname#touch_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:touch_file)
|
147
|
+
- [ryoba](https://rubygems.org/gems/ryoba)
|
148
|
+
([docs](http://www.rubydoc.info/gems/ryoba/))
|
149
|
+
- [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
150
|
+
- [Nokogiri::XML::Node#text!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
151
|
+
- [Nokogiri::XML::Node#uri](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
152
|
+
- [Nokogiri::XML::Searchable#ancestor!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
153
|
+
- [Nokogiri::XML::Searchable#ancestors!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
154
|
+
- [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
155
|
+
- [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
103
156
|
|
104
157
|
## Installation
|
105
158
|
|
data/Rakefile
CHANGED
data/grubby.gemspec
CHANGED
@@ -5,7 +5,7 @@ require "grubby/version"
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "grubby"
|
8
|
-
spec.version =
|
8
|
+
spec.version = GRUBBY_VERSION
|
9
9
|
spec.authors = ["Jonathan Hefner"]
|
10
10
|
spec.email = ["jonathan.hefner@gmail.com"]
|
11
11
|
|
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_runtime_dependency "mechanize", "~> 2.7"
|
28
28
|
spec.add_runtime_dependency "mini_sanity", "~> 1.0"
|
29
29
|
spec.add_runtime_dependency "pleasant_path", "~> 1.1"
|
30
|
+
spec.add_runtime_dependency "ryoba", "~> 1.0"
|
30
31
|
|
31
32
|
spec.add_development_dependency "bundler", "~> 1.15"
|
32
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
data/lib/grubby.rb
CHANGED
@@ -5,7 +5,9 @@ require "gorge"
|
|
5
5
|
require "mechanize"
|
6
6
|
require "mini_sanity"
|
7
7
|
require "pleasant_path"
|
8
|
+
require "ryoba"
|
8
9
|
|
10
|
+
require_relative "grubby/version"
|
9
11
|
require_relative "grubby/log"
|
10
12
|
|
11
13
|
require_relative "grubby/core_ext/string"
|
@@ -15,22 +17,30 @@ require_relative "grubby/mechanize/download"
|
|
15
17
|
require_relative "grubby/mechanize/file"
|
16
18
|
require_relative "grubby/mechanize/link"
|
17
19
|
require_relative "grubby/mechanize/page"
|
18
|
-
require_relative "grubby/
|
20
|
+
require_relative "grubby/mechanize/parser"
|
19
21
|
|
20
22
|
|
21
23
|
class Grubby < Mechanize
|
22
24
|
|
25
|
+
VERSION = GRUBBY_VERSION
|
26
|
+
|
27
|
+
# The enforced minimum amount of time to wait between requests, in
|
28
|
+
# seconds. If the value is a Range, a random number within the Range
|
29
|
+
# is chosen for each request.
|
30
|
+
#
|
23
31
|
# @return [Integer, Float, Range<Integer>, Range<Float>]
|
24
|
-
# The enforced minimum amount of time to wait between requests, in
|
25
|
-
# seconds. If the value is a Range, a random number within the
|
26
|
-
# Range is chosen for each request.
|
27
32
|
attr_accessor :time_between_requests
|
28
33
|
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
|
34
|
+
# Journal file used to ensure only-once processing of resources by
|
35
|
+
# {singleton} across multiple program runs. Set via {initialize}.
|
36
|
+
#
|
37
|
+
# @return [Pathname, nil]
|
38
|
+
attr_reader :journal
|
39
|
+
|
40
|
+
# @param journal [Pathname, String]
|
41
|
+
# Optional journal file used to ensure only-once processing of
|
42
|
+
# resources by {singleton} across multiple program runs.
|
43
|
+
def initialize(journal = nil)
|
34
44
|
super()
|
35
45
|
|
36
46
|
# Prevent "memory leaks", and prevent mistakenly blank urls from
|
@@ -58,10 +68,22 @@ class Grubby < Mechanize
|
|
58
68
|
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
59
69
|
self.time_between_requests = 1.0
|
60
70
|
|
61
|
-
@journal =
|
62
|
-
|
63
|
-
|
64
|
-
|
71
|
+
@journal = journal.try(&:to_pathname).try(&:touch_file)
|
72
|
+
@seen = @journal ? SingletonKey.parse_file(@journal).index_to{ true } : {}
|
73
|
+
end
|
74
|
+
|
75
|
+
# Calls +#head+ and returns true if the result has response code
|
76
|
+
# "200". Unlike +#head+, error response codes (e.g. "404", "500")
|
77
|
+
# do not cause a +Mechanize::ResponseCodeError+ to be raised.
|
78
|
+
#
|
79
|
+
# @param uri [String]
|
80
|
+
# @return [Boolean]
|
81
|
+
def ok?(uri, query_params = {}, headers = {})
|
82
|
+
begin
|
83
|
+
head(uri, query_params, headers).code == "200"
|
84
|
+
rescue Mechanize::ResponseCodeError => e
|
85
|
+
false
|
86
|
+
end
|
65
87
|
end
|
66
88
|
|
67
89
|
# Calls +#get+ with each of +mirror_uris+ until a successful
|
@@ -82,8 +104,8 @@ class Grubby < Mechanize
|
|
82
104
|
if i >= mirror_uris.length
|
83
105
|
raise
|
84
106
|
else
|
85
|
-
$log.
|
86
|
-
$log.debug("
|
107
|
+
$log.debug("Mirror failed (code #{e.response_code}): #{mirror_uris[i - 1]}")
|
108
|
+
$log.debug("Try mirror: #{mirror_uris[i]}")
|
87
109
|
retry
|
88
110
|
end
|
89
111
|
end
|
@@ -111,20 +133,20 @@ class Grubby < Mechanize
|
|
111
133
|
def singleton(target, purpose = "")
|
112
134
|
series = []
|
113
135
|
|
114
|
-
|
115
|
-
return if
|
136
|
+
original_uri = target.to_absolute_uri
|
137
|
+
return if try_skip_singleton(original_uri, purpose, series)
|
116
138
|
|
117
|
-
|
118
|
-
return if
|
139
|
+
normalized_uri = normalize_uri(original_uri)
|
140
|
+
return if try_skip_singleton(normalized_uri, purpose, series)
|
119
141
|
|
120
|
-
$log.info("
|
121
|
-
resource = get(
|
122
|
-
skip =
|
123
|
-
|
142
|
+
$log.info("Fetch #{normalized_uri}")
|
143
|
+
resource = get(normalized_uri)
|
144
|
+
skip = try_skip_singleton(resource.uri, purpose, series) |
|
145
|
+
try_skip_singleton("content hash: #{resource.content_hash}", purpose, series)
|
124
146
|
|
125
147
|
yield resource unless skip
|
126
148
|
|
127
|
-
series.
|
149
|
+
series.append_to_file(@journal) if @journal
|
128
150
|
|
129
151
|
!skip
|
130
152
|
end
|
@@ -132,22 +154,23 @@ class Grubby < Mechanize
|
|
132
154
|
|
133
155
|
private
|
134
156
|
|
135
|
-
SingletonKey = DumbDelimited[:purpose, :
|
157
|
+
SingletonKey = DumbDelimited[:purpose, :target]
|
136
158
|
|
137
|
-
def
|
138
|
-
|
139
|
-
series
|
140
|
-
|
141
|
-
|
142
|
-
|
159
|
+
def try_skip_singleton(target, purpose, series)
|
160
|
+
series << SingletonKey.new(purpose, target.to_s)
|
161
|
+
if series.uniq!.nil? && @seen.displace(series.last, true)
|
162
|
+
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
163
|
+
$log.info("Skip #{series.first.target} (#{seen_info})")
|
164
|
+
true
|
165
|
+
end
|
143
166
|
end
|
144
167
|
|
145
|
-
def
|
146
|
-
|
147
|
-
$log.warn("
|
148
|
-
|
149
|
-
|
150
|
-
|
168
|
+
def normalize_uri(uri)
|
169
|
+
uri = uri.dup
|
170
|
+
$log.warn("Ignore ##{uri.fragment} in #{uri}") if uri.fragment
|
171
|
+
uri.fragment = nil
|
172
|
+
uri.path = uri.path.chomp("/")
|
173
|
+
uri
|
151
174
|
end
|
152
175
|
|
153
176
|
def sleep_between_requests
|
@@ -162,7 +185,6 @@ class Grubby < Mechanize
|
|
162
185
|
end
|
163
186
|
|
164
187
|
|
165
|
-
require_relative "grubby/version"
|
166
188
|
require_relative "grubby/json_parser"
|
167
189
|
require_relative "grubby/scraper"
|
168
190
|
require_relative "grubby/page_scraper"
|
File without changes
|
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -1,5 +1,45 @@
|
|
1
1
|
module URI
|
2
2
|
|
3
|
+
# Returns the basename of the URI's +path+, a la +File.basename+.
|
4
|
+
#
|
5
|
+
# @example
|
6
|
+
# URI("http://example.com/foo/bar").basename # == "bar"
|
7
|
+
# URI("http://example.com/foo").basename # == "foo"
|
8
|
+
# URI("http://example.com/").basename # == ""
|
9
|
+
#
|
10
|
+
# @return [String]
|
11
|
+
def basename
|
12
|
+
self.path == "/" ? "" : File.basename(self.path)
|
13
|
+
end
|
14
|
+
|
15
|
+
# Returns the value of the specified param in the URI's +query+.
|
16
|
+
# The specified param name must be exactly as it appears in the query
|
17
|
+
# string, and support for complex nested values is limited. (See
|
18
|
+
# +CGI.parse+ for parsing behavior.) If the param name includes a
|
19
|
+
# +"[]"+, the result will be an array of all occurrences of that param
|
20
|
+
# in the query string. Otherwise, the result will be the last
|
21
|
+
# occurrence of that param in the query string.
|
22
|
+
#
|
23
|
+
# @example
|
24
|
+
# URI("http://example.com/?foo=a").query_param("foo") # == "a"
|
25
|
+
#
|
26
|
+
# URI("http://example.com/?foo=a&foo=b").query_param("foo") # == "b"
|
27
|
+
# URI("http://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
|
28
|
+
#
|
29
|
+
# URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
|
30
|
+
# URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
|
31
|
+
#
|
32
|
+
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
|
33
|
+
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
|
34
|
+
#
|
35
|
+
# @return [String, nil]
|
36
|
+
# @return [Array<String>, nil]
|
37
|
+
# if +name+ contains +"[]"+
|
38
|
+
def query_param(name)
|
39
|
+
values = CGI.parse(self.query)[name.to_s]
|
40
|
+
(values.nil? || name.include?("[]")) ? values : values.last
|
41
|
+
end
|
42
|
+
|
3
43
|
# Raises an exception if the URI is not +absolute?+.
|
4
44
|
#
|
5
45
|
# @return [self]
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -33,8 +33,9 @@ class Grubby::JsonParser < Mechanize::File
|
|
33
33
|
@json_parse_options = options
|
34
34
|
end
|
35
35
|
|
36
|
+
# The parsed JSON data.
|
37
|
+
#
|
36
38
|
# @return [Hash, Array]
|
37
|
-
# The parsed JSON data.
|
38
39
|
attr_reader :json
|
39
40
|
|
40
41
|
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
data/lib/grubby/json_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
class Grubby::JsonScraper < Grubby::Scraper
|
2
2
|
|
3
|
+
# The parsed JSON data being scraped.
|
4
|
+
#
|
3
5
|
# @return [Hash, Array]
|
4
|
-
# The parsed JSON data being scraped.
|
5
6
|
attr_reader :json
|
6
7
|
|
7
8
|
# @param source [Grubby::JsonParser]
|
@@ -10,4 +11,22 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
10
11
|
super
|
11
12
|
end
|
12
13
|
|
14
|
+
# Scrapes a locally-stored file. This method is intended for use with
|
15
|
+
# subclasses of +Grubby::JsonScraper+.
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# class MyScraper < Grubby::JsonScraper
|
19
|
+
# # ...
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
|
23
|
+
#
|
24
|
+
# @param path [String]
|
25
|
+
# @return [Grubby::JsonScraper]
|
26
|
+
def self.scrape_file(path)
|
27
|
+
uri = URI.join("file:///", File.expand_path(path))
|
28
|
+
body = File.read(path)
|
29
|
+
self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
|
30
|
+
end
|
31
|
+
|
13
32
|
end
|
data/lib/grubby/log.rb
CHANGED
File without changes
|
File without changes
|
@@ -9,9 +9,8 @@ class Mechanize::HTTP::Agent
|
|
9
9
|
IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
|
10
10
|
|
11
11
|
# Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
|
12
|
-
# connection resets" error is encountered, this method
|
13
|
-
#
|
14
|
-
# {MAX_CONNECTION_RESET_RETRIES} times).
|
12
|
+
# connection resets" error is encountered, this method retries the
|
13
|
+
# request (up to {MAX_CONNECTION_RESET_RETRIES} times).
|
15
14
|
def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
|
16
15
|
retry_count = 0
|
17
16
|
begin
|
@@ -26,9 +25,9 @@ class Mechanize::HTTP::Agent
|
|
26
25
|
|
27
26
|
# otherwise, shut down the persistent HTTP connection and try again
|
28
27
|
retry_count += 1
|
29
|
-
$log.warn("
|
30
|
-
self
|
31
|
-
|
28
|
+
$log.warn("#{e.message} (#{e.class}). Retry in #{retry_count} seconds.")
|
29
|
+
sleep(retry_count) # incremental backoff to allow server to self-correct
|
30
|
+
$log.warn("Retry #{http_method.to_s.upcase} #{uri}")
|
32
31
|
retry
|
33
32
|
end
|
34
33
|
end
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
|
3
|
+
module Mechanize::Parser
|
4
|
+
|
5
|
+
# Saves the payload to a specified directory, but using the default
|
6
|
+
# filename suggested by the server. If a file with that name already
|
7
|
+
# exists, this method will try to find a free filename by appending
|
8
|
+
# numbers to the original name. Returns the full path of the saved
|
9
|
+
# file.
|
10
|
+
#
|
11
|
+
# NOTE: this method expects a +#save!+ method to be defined by the
|
12
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
13
|
+
# and +Mechanize::Download#save!+.
|
14
|
+
#
|
15
|
+
# @param directory [String]
|
16
|
+
# @return [String]
|
17
|
+
def save_to(directory)
|
18
|
+
raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!)
|
19
|
+
|
20
|
+
FileUtils.mkdir_p(directory)
|
21
|
+
path = find_free_name(File.join(directory, @filename))
|
22
|
+
save!(path)
|
23
|
+
path
|
24
|
+
end
|
25
|
+
|
26
|
+
# Saves the payload to a specified directory, but using the default
|
27
|
+
# filename suggested by the server. If a file with that name already
|
28
|
+
# exists, that file will be overwritten. Returns the full path of the
|
29
|
+
# saved file.
|
30
|
+
#
|
31
|
+
# NOTE: this method expects a +#save!+ method to be defined by the
|
32
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
33
|
+
# and +Mechanize::Download#save!+.
|
34
|
+
#
|
35
|
+
# @param directory [String]
|
36
|
+
# @return [String]
|
37
|
+
def save_to!(directory)
|
38
|
+
raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!)
|
39
|
+
|
40
|
+
FileUtils.mkdir_p(directory)
|
41
|
+
path = File.join(directory, @filename)
|
42
|
+
save!(path)
|
43
|
+
path
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
class Grubby::PageScraper < Grubby::Scraper
|
2
2
|
|
3
|
+
# The Page being scraped.
|
4
|
+
#
|
3
5
|
# @return [Mechanize::Page]
|
4
|
-
# The Page being scraped.
|
5
6
|
attr_reader :page
|
6
7
|
|
7
8
|
# @param source [Mechanize::Page]
|
@@ -10,4 +11,23 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
10
11
|
super
|
11
12
|
end
|
12
13
|
|
14
|
+
# Scrapes a locally-stored file. This method is intended for use with
|
15
|
+
# subclasses of +Grubby::PageScraper+.
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# class MyScraper < Grubby::PageScraper
|
19
|
+
# # ...
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# MyScraper.scrape_file("path/to/local_file.html").class # == MyScraper
|
23
|
+
#
|
24
|
+
# @param path [String]
|
25
|
+
# @param agent [Mechanize]
|
26
|
+
# @return [Grubby::PageScraper]
|
27
|
+
def self.scrape_file(path, agent = Grubby.new)
|
28
|
+
uri = URI.join("file:///", File.expand_path(path))
|
29
|
+
body = File.read(path)
|
30
|
+
self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
|
31
|
+
end
|
32
|
+
|
13
33
|
end
|
data/lib/grubby/scraper.rb
CHANGED
@@ -1,8 +1,5 @@
|
|
1
1
|
class Grubby::Scraper
|
2
2
|
|
3
|
-
class Error < RuntimeError
|
4
|
-
end
|
5
|
-
|
6
3
|
# Defines an attribute reader method named by +field+. During
|
7
4
|
# +initialize+, the given block is called, and the attribute is set to
|
8
5
|
# the block's return value. By default, if the block's return value
|
@@ -22,38 +19,48 @@ class Grubby::Scraper
|
|
22
19
|
self.fields << field
|
23
20
|
|
24
21
|
define_method(field) do
|
22
|
+
raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
|
25
23
|
return @scraped[field] if @scraped.key?(field)
|
26
24
|
|
27
|
-
unless @errors
|
25
|
+
unless @errors[field]
|
28
26
|
begin
|
29
27
|
value = instance_eval(&block)
|
30
28
|
if value.nil?
|
31
|
-
raise
|
32
|
-
$log.debug("
|
29
|
+
raise FieldValueRequiredError.new(field) unless optional
|
30
|
+
$log.debug("#{self.class}##{field} is nil")
|
33
31
|
end
|
34
32
|
@scraped[field] = value
|
35
|
-
rescue RuntimeError => e
|
33
|
+
rescue RuntimeError, IndexError => e
|
36
34
|
@errors[field] = e
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
40
|
-
raise
|
38
|
+
raise FieldScrapeFailedError.new(field, @errors[field]) if @errors[field]
|
41
39
|
|
42
40
|
@scraped[field]
|
43
41
|
end
|
44
42
|
end
|
45
43
|
|
44
|
+
# The names of all scraped values, as defined by {scrapes}.
|
45
|
+
#
|
46
46
|
# @return [Array<Symbol>]
|
47
|
-
# The names of all scraped values, as defined by {scrapes}.
|
48
47
|
def self.fields
|
49
48
|
@fields ||= []
|
50
49
|
end
|
51
50
|
|
51
|
+
# The source being scraped. Typically a Mechanize pluggable parser
|
52
|
+
# such as +Mechanize::Page+.
|
53
|
+
#
|
52
54
|
# @return [Object]
|
53
|
-
# The source being scraped. Typically a Mechanize pluggable parser
|
54
|
-
# such as +Mechanize::Page+.
|
55
55
|
attr_reader :source
|
56
56
|
|
57
|
+
# Hash of errors raised by blocks passed to {scrapes}. If
|
58
|
+
# {initialize} does not raise +Grubby::Scraper::Error+, this Hash will
|
59
|
+
# be empty.
|
60
|
+
#
|
61
|
+
# @return [Hash<Symbol, StandardError>]
|
62
|
+
attr_reader :errors
|
63
|
+
|
57
64
|
# @param source
|
58
65
|
# @raise [Grubby::Scraper::Error]
|
59
66
|
# if any scraped values result in error
|
@@ -65,18 +72,11 @@ class Grubby::Scraper
|
|
65
72
|
self.class.fields.each do |field|
|
66
73
|
begin
|
67
74
|
self.send(field)
|
68
|
-
rescue
|
75
|
+
rescue FieldScrapeFailedError
|
69
76
|
end
|
70
77
|
end
|
71
78
|
|
72
|
-
unless @errors.empty?
|
73
|
-
listing = @errors.map do |field, error|
|
74
|
-
error_class = " (#{error.class})" unless error.class == RuntimeError
|
75
|
-
error_trace = error.backtrace.join("\n").indent(2)
|
76
|
-
"* #{field} -- #{error.message}#{error_class}\n#{error_trace}"
|
77
|
-
end
|
78
|
-
raise Error.new("Failed to scrape the following fields:\n#{listing.join("\n")}")
|
79
|
-
end
|
79
|
+
raise Error.new(self) unless @errors.empty?
|
80
80
|
end
|
81
81
|
|
82
82
|
# Returns the scraped value named by +field+.
|
@@ -96,4 +96,43 @@ class Grubby::Scraper
|
|
96
96
|
@scraped.dup
|
97
97
|
end
|
98
98
|
|
99
|
+
class Error < RuntimeError
|
100
|
+
BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
|
101
|
+
cleaner.add_silencer do |line|
|
102
|
+
line.include?(__dir__) && line.include?("scraper.rb:")
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# @return [Grubby::Scraper]
|
107
|
+
# The Scraper that raised this error.
|
108
|
+
attr_accessor :scraper
|
109
|
+
|
110
|
+
def initialize(scraper)
|
111
|
+
self.scraper = scraper
|
112
|
+
|
113
|
+
listing = scraper.errors.
|
114
|
+
reject{|field, error| error.is_a?(FieldScrapeFailedError) }.
|
115
|
+
map do |field, error|
|
116
|
+
"* `#{field}` (#{error.class})\n" +
|
117
|
+
error.message.indent(2) + "\n\n" +
|
118
|
+
BACKTRACE_CLEANER.clean(error.backtrace).join("\n").indent(4) + "\n"
|
119
|
+
end.
|
120
|
+
join("\n")
|
121
|
+
|
122
|
+
super("Failed to scrape the following fields:\n#{listing}")
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
class FieldScrapeFailedError < RuntimeError
|
127
|
+
def initialize(field, field_error)
|
128
|
+
super("`#{field}` raised #{field_error.class}")
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
class FieldValueRequiredError < RuntimeError
|
133
|
+
def initialize(field)
|
134
|
+
super("`#{field}` is nil but is not marked as optional")
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
99
138
|
end
|
data/lib/grubby/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grubby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hefner
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.1'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: ryoba
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: bundler
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -173,6 +187,7 @@ extra_rdoc_files: []
|
|
173
187
|
files:
|
174
188
|
- ".gitignore"
|
175
189
|
- ".travis.yml"
|
190
|
+
- CHANGELOG.md
|
176
191
|
- Gemfile
|
177
192
|
- LICENSE.txt
|
178
193
|
- README.md
|
@@ -189,7 +204,7 @@ files:
|
|
189
204
|
- lib/grubby/mechanize/file.rb
|
190
205
|
- lib/grubby/mechanize/link.rb
|
191
206
|
- lib/grubby/mechanize/page.rb
|
192
|
-
- lib/grubby/
|
207
|
+
- lib/grubby/mechanize/parser.rb
|
193
208
|
- lib/grubby/page_scraper.rb
|
194
209
|
- lib/grubby/scraper.rb
|
195
210
|
- lib/grubby/version.rb
|
@@ -213,7 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
213
228
|
version: '0'
|
214
229
|
requirements: []
|
215
230
|
rubyforge_project:
|
216
|
-
rubygems_version: 2.6
|
231
|
+
rubygems_version: 2.7.6
|
217
232
|
signing_key:
|
218
233
|
specification_version: 4
|
219
234
|
summary: Fail-fast web scraping
|
@@ -1,27 +0,0 @@
|
|
1
|
-
module Nokogiri::XML::Searchable
|
2
|
-
|
3
|
-
# Searches the node using the given XPath or CSS queries, and returns
|
4
|
-
# the results. Raises an exception if there are no results. See also
|
5
|
-
# +#search+.
|
6
|
-
#
|
7
|
-
# @param queries [Array<String>]
|
8
|
-
# @return [Array<Nokogiri::XML::Element>]
|
9
|
-
# @raise [RuntimeError] if queries yield no results
|
10
|
-
def search!(*queries)
|
11
|
-
results = search(*queries)
|
12
|
-
raise "No elements matching #{queries.map(&:inspect).join(" OR ")}" if results.empty?
|
13
|
-
results
|
14
|
-
end
|
15
|
-
|
16
|
-
# Searches the node using the given XPath or CSS queries, and returns
|
17
|
-
# only the first result. Raises an exception if there are no results.
|
18
|
-
# See also +#at+.
|
19
|
-
#
|
20
|
-
# @param queries [Array<String>]
|
21
|
-
# @return [Nokogiri::XML::Element]
|
22
|
-
# @raise [RuntimeError] if queries yield no results
|
23
|
-
def at!(*queries)
|
24
|
-
search!(*queries).first
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|