grubby 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +17 -0
- data/README.md +61 -8
- data/Rakefile +0 -3
- data/grubby.gemspec +2 -1
- data/lib/grubby.rb +60 -38
- data/lib/grubby/core_ext/string.rb +0 -0
- data/lib/grubby/core_ext/uri.rb +40 -0
- data/lib/grubby/json_parser.rb +2 -1
- data/lib/grubby/json_scraper.rb +20 -1
- data/lib/grubby/log.rb +0 -0
- data/lib/grubby/mechanize/download.rb +0 -0
- data/lib/grubby/mechanize/fetch_with_retry.rb +5 -6
- data/lib/grubby/mechanize/file.rb +0 -0
- data/lib/grubby/mechanize/link.rb +0 -0
- data/lib/grubby/mechanize/page.rb +0 -0
- data/lib/grubby/mechanize/parser.rb +46 -0
- data/lib/grubby/page_scraper.rb +21 -1
- data/lib/grubby/scraper.rb +59 -20
- data/lib/grubby/version.rb +1 -3
- metadata +19 -4
- data/lib/grubby/nokogiri/searchable.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7528791ce5da4ca182e8258cf5bc8920345470ee76ee50de44cf89adac7ffec6
|
4
|
+
data.tar.gz: 3b3dad255ae1841583abb2c61345fbefffb268231906968619000b95005044be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 295c2957f708d86b596a4c062fcdf31d9c5083d26d15989de31feb174316ee17430e19d1746ad0f389d599560cc419e740835b1bfba6b0f57627e633f1a0ecf1
|
7
|
+
data.tar.gz: e8bc4ecb3ce277436be91ee4e8cf9c187c1f0bbf5ee170bc7a4e3f221f94d678e0e24d2f7dd878427c7b2b0e0b2fe1baa485556bdf21cd89e05a7ff222a9dc53
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
## 1.1.0
|
2
|
+
* Added `Grubby#ok?`.
|
3
|
+
* Added `Grubby::PageScraper.scrape_file` and `Grubby::JsonScraper.scrape_file`.
|
4
|
+
* Added `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
|
5
|
+
which are inherited by `Mechanize::Download` and `Mechanize::File`.
|
6
|
+
* Added `URI#basename`.
|
7
|
+
* Added `URI#query_param`.
|
8
|
+
* Added utility methods from [ryoba](https://rubygems.org/gems/ryoba).
|
9
|
+
* Added `Grubby::Scraper::Error#scraper` and `Grubby::Scraper#errors`
|
10
|
+
for interactive debugging with e.g. byebug.
|
11
|
+
* Improved log messages and error formatting.
|
12
|
+
* Fixed compatibility with net-http-persistent gem v3.0.
|
13
|
+
|
14
|
+
|
15
|
+
## 1.0.0
|
16
|
+
|
17
|
+
* Initial release
|
data/README.md
CHANGED
@@ -60,6 +60,7 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
|
60
60
|
|
61
61
|
- [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
|
62
62
|
- [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
|
63
|
+
- [#ok?](http://www.rubydoc.info/gems/grubby/Grubby:ok%3F)
|
63
64
|
- [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
|
64
65
|
- [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
|
65
66
|
- [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
|
@@ -69,37 +70,89 @@ puts hn.items.take(10).map(&:title) # your scraping logic goes here
|
|
69
70
|
- [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
|
70
71
|
- [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
|
71
72
|
- [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
|
73
|
+
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file)
|
72
74
|
- [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
|
73
75
|
- [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
|
76
|
+
- [.scrape_file](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file)
|
74
77
|
- [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
|
75
|
-
-
|
76
|
-
- [#
|
77
|
-
- [#
|
78
|
+
- Mechanize::Download
|
79
|
+
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
80
|
+
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
81
|
+
- Mechanize::File
|
82
|
+
- [#save_to](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to)
|
83
|
+
- [#save_to!](http://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21)
|
78
84
|
- Mechanize::Page
|
79
85
|
- [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
|
80
86
|
- [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
|
81
87
|
- Mechanize::Page::Link
|
82
88
|
- [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link:to_absolute_uri)
|
89
|
+
- URI
|
90
|
+
- [#basename](https://www.rubydoc.info/gems/grubby/URI:basename)
|
91
|
+
- [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param)
|
83
92
|
|
84
93
|
|
85
94
|
## Supplemental API
|
86
95
|
|
87
|
-
*grubby*
|
88
|
-
convenience methods. When you
|
89
|
-
these methods available.
|
90
|
-
documentation
|
96
|
+
*grubby* includes several gems which extend Ruby objects with
|
97
|
+
convenience methods. When you load *grubby* you automatically make
|
98
|
+
these methods available. The included gems are listed below, along with
|
99
|
+
**a few** of the methods each provides. See each gem's documentation
|
100
|
+
for a complete API listing.
|
91
101
|
|
92
102
|
- [Active Support](https://rubygems.org/gems/activesupport)
|
93
103
|
([docs](http://www.rubydoc.info/gems/activesupport/))
|
104
|
+
- [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by)
|
105
|
+
- [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write)
|
106
|
+
- [NilClass#try](https://www.rubydoc.info/gems/activesupport/NilClass:try)
|
107
|
+
- [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence)
|
108
|
+
- [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F)
|
109
|
+
- [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish)
|
94
110
|
- [casual_support](https://rubygems.org/gems/casual_support)
|
95
111
|
([docs](http://www.rubydoc.info/gems/casual_support/))
|
112
|
+
- [Enumerable#index_to](http://www.rubydoc.info/gems/casual_support/Enumerable:index_to)
|
113
|
+
- [String#after](http://www.rubydoc.info/gems/casual_support/String:after)
|
114
|
+
- [String#after_last](http://www.rubydoc.info/gems/casual_support/String:after_last)
|
115
|
+
- [String#before](http://www.rubydoc.info/gems/casual_support/String:before)
|
116
|
+
- [String#before_last](http://www.rubydoc.info/gems/casual_support/String:before_last)
|
117
|
+
- [String#between](http://www.rubydoc.info/gems/casual_support/String:between)
|
118
|
+
- [Time#to_hms](http://www.rubydoc.info/gems/casual_support/Time:to_hms)
|
119
|
+
- [Time#to_ymd](http://www.rubydoc.info/gems/casual_support/Time:to_ymd)
|
96
120
|
- [gorge](https://rubygems.org/gems/gorge)
|
97
121
|
([docs](http://www.rubydoc.info/gems/gorge/))
|
122
|
+
- [Pathname#file_crc32](http://www.rubydoc.info/gems/gorge/Pathname:file_crc32)
|
123
|
+
- [Pathname#file_md5](http://www.rubydoc.info/gems/gorge/Pathname:file_md5)
|
124
|
+
- [Pathname#file_sha1](http://www.rubydoc.info/gems/gorge/Pathname:file_sha1)
|
125
|
+
- [String#crc32](http://www.rubydoc.info/gems/gorge/String:crc32)
|
126
|
+
- [String#md5](http://www.rubydoc.info/gems/gorge/String:md5)
|
127
|
+
- [String#sha1](http://www.rubydoc.info/gems/gorge/String:sha1)
|
98
128
|
- [mini_sanity](https://rubygems.org/gems/mini_sanity)
|
99
129
|
([docs](http://www.rubydoc.info/gems/mini_sanity/))
|
130
|
+
- [Array#assert_length!](http://www.rubydoc.info/gems/mini_sanity/Array:assert_length%21)
|
131
|
+
- [Enumerable#refute_empty!](http://www.rubydoc.info/gems/mini_sanity/Enumerable:refute_empty%21)
|
132
|
+
- [Object#assert_equal!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_equal%21)
|
133
|
+
- [Object#assert_in!](http://www.rubydoc.info/gems/mini_sanity/Object:assert_in%21)
|
134
|
+
- [Object#refute_nil!](http://www.rubydoc.info/gems/mini_sanity/Object:refute_nil%21)
|
135
|
+
- [Pathname#assert_exist!](http://www.rubydoc.info/gems/mini_sanity/Pathname:assert_exist%21)
|
136
|
+
- [String#assert_match!](http://www.rubydoc.info/gems/mini_sanity/String:assert_match%21)
|
100
137
|
- [pleasant_path](https://rubygems.org/gems/pleasant_path)
|
101
138
|
([docs](http://www.rubydoc.info/gems/pleasant_path/))
|
102
|
-
|
139
|
+
- [Pathname#dirs](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs)
|
140
|
+
- [Pathname#dirs_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:dirs_r)
|
141
|
+
- [Pathname#files](http://www.rubydoc.info/gems/pleasant_path/Pathname:files)
|
142
|
+
- [Pathname#files_r](http://www.rubydoc.info/gems/pleasant_path/Pathname:files_r)
|
143
|
+
- [Pathname#make_dirname](http://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname)
|
144
|
+
- [Pathname#rename_basename](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename)
|
145
|
+
- [Pathname#rename_extname](http://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname)
|
146
|
+
- [Pathname#touch_file](http://www.rubydoc.info/gems/pleasant_path/Pathname:touch_file)
|
147
|
+
- [ryoba](https://rubygems.org/gems/ryoba)
|
148
|
+
([docs](http://www.rubydoc.info/gems/ryoba/))
|
149
|
+
- [Nokogiri::XML::Node#matches!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21)
|
150
|
+
- [Nokogiri::XML::Node#text!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21)
|
151
|
+
- [Nokogiri::XML::Node#uri](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri)
|
152
|
+
- [Nokogiri::XML::Searchable#ancestor!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21)
|
153
|
+
- [Nokogiri::XML::Searchable#ancestors!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21)
|
154
|
+
- [Nokogiri::XML::Searchable#at!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21)
|
155
|
+
- [Nokogiri::XML::Searchable#search!](http://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21)
|
103
156
|
|
104
157
|
## Installation
|
105
158
|
|
data/Rakefile
CHANGED
data/grubby.gemspec
CHANGED
@@ -5,7 +5,7 @@ require "grubby/version"
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "grubby"
|
8
|
-
spec.version =
|
8
|
+
spec.version = GRUBBY_VERSION
|
9
9
|
spec.authors = ["Jonathan Hefner"]
|
10
10
|
spec.email = ["jonathan.hefner@gmail.com"]
|
11
11
|
|
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_runtime_dependency "mechanize", "~> 2.7"
|
28
28
|
spec.add_runtime_dependency "mini_sanity", "~> 1.0"
|
29
29
|
spec.add_runtime_dependency "pleasant_path", "~> 1.1"
|
30
|
+
spec.add_runtime_dependency "ryoba", "~> 1.0"
|
30
31
|
|
31
32
|
spec.add_development_dependency "bundler", "~> 1.15"
|
32
33
|
spec.add_development_dependency "rake", "~> 10.0"
|
data/lib/grubby.rb
CHANGED
@@ -5,7 +5,9 @@ require "gorge"
|
|
5
5
|
require "mechanize"
|
6
6
|
require "mini_sanity"
|
7
7
|
require "pleasant_path"
|
8
|
+
require "ryoba"
|
8
9
|
|
10
|
+
require_relative "grubby/version"
|
9
11
|
require_relative "grubby/log"
|
10
12
|
|
11
13
|
require_relative "grubby/core_ext/string"
|
@@ -15,22 +17,30 @@ require_relative "grubby/mechanize/download"
|
|
15
17
|
require_relative "grubby/mechanize/file"
|
16
18
|
require_relative "grubby/mechanize/link"
|
17
19
|
require_relative "grubby/mechanize/page"
|
18
|
-
require_relative "grubby/
|
20
|
+
require_relative "grubby/mechanize/parser"
|
19
21
|
|
20
22
|
|
21
23
|
class Grubby < Mechanize
|
22
24
|
|
25
|
+
VERSION = GRUBBY_VERSION
|
26
|
+
|
27
|
+
# The enforced minimum amount of time to wait between requests, in
|
28
|
+
# seconds. If the value is a Range, a random number within the Range
|
29
|
+
# is chosen for each request.
|
30
|
+
#
|
23
31
|
# @return [Integer, Float, Range<Integer>, Range<Float>]
|
24
|
-
# The enforced minimum amount of time to wait between requests, in
|
25
|
-
# seconds. If the value is a Range, a random number within the
|
26
|
-
# Range is chosen for each request.
|
27
32
|
attr_accessor :time_between_requests
|
28
33
|
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
|
34
|
+
# Journal file used to ensure only-once processing of resources by
|
35
|
+
# {singleton} across multiple program runs. Set via {initialize}.
|
36
|
+
#
|
37
|
+
# @return [Pathname, nil]
|
38
|
+
attr_reader :journal
|
39
|
+
|
40
|
+
# @param journal [Pathname, String]
|
41
|
+
# Optional journal file used to ensure only-once processing of
|
42
|
+
# resources by {singleton} across multiple program runs.
|
43
|
+
def initialize(journal = nil)
|
34
44
|
super()
|
35
45
|
|
36
46
|
# Prevent "memory leaks", and prevent mistakenly blank urls from
|
@@ -58,10 +68,22 @@ class Grubby < Mechanize
|
|
58
68
|
self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
|
59
69
|
self.time_between_requests = 1.0
|
60
70
|
|
61
|
-
@journal =
|
62
|
-
|
63
|
-
|
64
|
-
|
71
|
+
@journal = journal.try(&:to_pathname).try(&:touch_file)
|
72
|
+
@seen = @journal ? SingletonKey.parse_file(@journal).index_to{ true } : {}
|
73
|
+
end
|
74
|
+
|
75
|
+
# Calls +#head+ and returns true if the result has response code
|
76
|
+
# "200". Unlike +#head+, error response codes (e.g. "404", "500")
|
77
|
+
# do not cause a +Mechanize::ResponseCodeError+ to be raised.
|
78
|
+
#
|
79
|
+
# @param uri [String]
|
80
|
+
# @return [Boolean]
|
81
|
+
def ok?(uri, query_params = {}, headers = {})
|
82
|
+
begin
|
83
|
+
head(uri, query_params, headers).code == "200"
|
84
|
+
rescue Mechanize::ResponseCodeError => e
|
85
|
+
false
|
86
|
+
end
|
65
87
|
end
|
66
88
|
|
67
89
|
# Calls +#get+ with each of +mirror_uris+ until a successful
|
@@ -82,8 +104,8 @@ class Grubby < Mechanize
|
|
82
104
|
if i >= mirror_uris.length
|
83
105
|
raise
|
84
106
|
else
|
85
|
-
$log.
|
86
|
-
$log.debug("
|
107
|
+
$log.debug("Mirror failed (code #{e.response_code}): #{mirror_uris[i - 1]}")
|
108
|
+
$log.debug("Try mirror: #{mirror_uris[i]}")
|
87
109
|
retry
|
88
110
|
end
|
89
111
|
end
|
@@ -111,20 +133,20 @@ class Grubby < Mechanize
|
|
111
133
|
def singleton(target, purpose = "")
|
112
134
|
series = []
|
113
135
|
|
114
|
-
|
115
|
-
return if
|
136
|
+
original_uri = target.to_absolute_uri
|
137
|
+
return if try_skip_singleton(original_uri, purpose, series)
|
116
138
|
|
117
|
-
|
118
|
-
return if
|
139
|
+
normalized_uri = normalize_uri(original_uri)
|
140
|
+
return if try_skip_singleton(normalized_uri, purpose, series)
|
119
141
|
|
120
|
-
$log.info("
|
121
|
-
resource = get(
|
122
|
-
skip =
|
123
|
-
|
142
|
+
$log.info("Fetch #{normalized_uri}")
|
143
|
+
resource = get(normalized_uri)
|
144
|
+
skip = try_skip_singleton(resource.uri, purpose, series) |
|
145
|
+
try_skip_singleton("content hash: #{resource.content_hash}", purpose, series)
|
124
146
|
|
125
147
|
yield resource unless skip
|
126
148
|
|
127
|
-
series.
|
149
|
+
series.append_to_file(@journal) if @journal
|
128
150
|
|
129
151
|
!skip
|
130
152
|
end
|
@@ -132,22 +154,23 @@ class Grubby < Mechanize
|
|
132
154
|
|
133
155
|
private
|
134
156
|
|
135
|
-
SingletonKey = DumbDelimited[:purpose, :
|
157
|
+
SingletonKey = DumbDelimited[:purpose, :target]
|
136
158
|
|
137
|
-
def
|
138
|
-
|
139
|
-
series
|
140
|
-
|
141
|
-
|
142
|
-
|
159
|
+
def try_skip_singleton(target, purpose, series)
|
160
|
+
series << SingletonKey.new(purpose, target.to_s)
|
161
|
+
if series.uniq!.nil? && @seen.displace(series.last, true)
|
162
|
+
seen_info = series.length > 1 ? "seen #{series.last.target}" : "seen"
|
163
|
+
$log.info("Skip #{series.first.target} (#{seen_info})")
|
164
|
+
true
|
165
|
+
end
|
143
166
|
end
|
144
167
|
|
145
|
-
def
|
146
|
-
|
147
|
-
$log.warn("
|
148
|
-
|
149
|
-
|
150
|
-
|
168
|
+
def normalize_uri(uri)
|
169
|
+
uri = uri.dup
|
170
|
+
$log.warn("Ignore ##{uri.fragment} in #{uri}") if uri.fragment
|
171
|
+
uri.fragment = nil
|
172
|
+
uri.path = uri.path.chomp("/")
|
173
|
+
uri
|
151
174
|
end
|
152
175
|
|
153
176
|
def sleep_between_requests
|
@@ -162,7 +185,6 @@ class Grubby < Mechanize
|
|
162
185
|
end
|
163
186
|
|
164
187
|
|
165
|
-
require_relative "grubby/version"
|
166
188
|
require_relative "grubby/json_parser"
|
167
189
|
require_relative "grubby/scraper"
|
168
190
|
require_relative "grubby/page_scraper"
|
File without changes
|
data/lib/grubby/core_ext/uri.rb
CHANGED
@@ -1,5 +1,45 @@
|
|
1
1
|
module URI
|
2
2
|
|
3
|
+
# Returns the basename of the URI's +path+, a la +File.basename+.
|
4
|
+
#
|
5
|
+
# @example
|
6
|
+
# URI("http://example.com/foo/bar").basename # == "bar"
|
7
|
+
# URI("http://example.com/foo").basename # == "foo"
|
8
|
+
# URI("http://example.com/").basename # == ""
|
9
|
+
#
|
10
|
+
# @return [String]
|
11
|
+
def basename
|
12
|
+
self.path == "/" ? "" : File.basename(self.path)
|
13
|
+
end
|
14
|
+
|
15
|
+
# Returns the value of the specified param in the URI's +query+.
|
16
|
+
# The specified param name must be exactly as it appears in the query
|
17
|
+
# string, and support for complex nested values is limited. (See
|
18
|
+
# +CGI.parse+ for parsing behavior.) If the param name includes a
|
19
|
+
# +"[]"+, the result will be an array of all occurrences of that param
|
20
|
+
# in the query string. Otherwise, the result will be the last
|
21
|
+
# occurrence of that param in the query string.
|
22
|
+
#
|
23
|
+
# @example
|
24
|
+
# URI("http://example.com/?foo=a").query_param("foo") # == "a"
|
25
|
+
#
|
26
|
+
# URI("http://example.com/?foo=a&foo=b").query_param("foo") # == "b"
|
27
|
+
# URI("http://example.com/?foo=a&foo=b").query_param("foo[]") # == nil
|
28
|
+
#
|
29
|
+
# URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil
|
30
|
+
# URI("http://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"]
|
31
|
+
#
|
32
|
+
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil
|
33
|
+
# URI("http://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"]
|
34
|
+
#
|
35
|
+
# @return [String, nil]
|
36
|
+
# @return [Array<String>, nil]
|
37
|
+
# if +name+ contains +"[]"+
|
38
|
+
def query_param(name)
|
39
|
+
values = CGI.parse(self.query)[name.to_s]
|
40
|
+
(values.nil? || name.include?("[]")) ? values : values.last
|
41
|
+
end
|
42
|
+
|
3
43
|
# Raises an exception if the URI is not +absolute?+.
|
4
44
|
#
|
5
45
|
# @return [self]
|
data/lib/grubby/json_parser.rb
CHANGED
@@ -33,8 +33,9 @@ class Grubby::JsonParser < Mechanize::File
|
|
33
33
|
@json_parse_options = options
|
34
34
|
end
|
35
35
|
|
36
|
+
# The parsed JSON data.
|
37
|
+
#
|
36
38
|
# @return [Hash, Array]
|
37
|
-
# The parsed JSON data.
|
38
39
|
attr_reader :json
|
39
40
|
|
40
41
|
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
data/lib/grubby/json_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
class Grubby::JsonScraper < Grubby::Scraper
|
2
2
|
|
3
|
+
# The parsed JSON data being scraped.
|
4
|
+
#
|
3
5
|
# @return [Hash, Array]
|
4
|
-
# The parsed JSON data being scraped.
|
5
6
|
attr_reader :json
|
6
7
|
|
7
8
|
# @param source [Grubby::JsonParser]
|
@@ -10,4 +11,22 @@ class Grubby::JsonScraper < Grubby::Scraper
|
|
10
11
|
super
|
11
12
|
end
|
12
13
|
|
14
|
+
# Scrapes a locally-stored file. This method is intended for use with
|
15
|
+
# subclasses of +Grubby::JsonScraper+.
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# class MyScraper < Grubby::JsonScraper
|
19
|
+
# # ...
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# MyScraper.scrape_file("path/to/local_file.json").class # == MyScraper
|
23
|
+
#
|
24
|
+
# @param path [String]
|
25
|
+
# @return [Grubby::JsonScraper]
|
26
|
+
def self.scrape_file(path)
|
27
|
+
uri = URI.join("file:///", File.expand_path(path))
|
28
|
+
body = File.read(path)
|
29
|
+
self.new(Grubby::JsonParser.new(uri, nil, body, "200"))
|
30
|
+
end
|
31
|
+
|
13
32
|
end
|
data/lib/grubby/log.rb
CHANGED
File without changes
|
File without changes
|
@@ -9,9 +9,8 @@ class Mechanize::HTTP::Agent
|
|
9
9
|
IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
|
10
10
|
|
11
11
|
# Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
|
12
|
-
# connection resets" error is encountered, this method
|
13
|
-
#
|
14
|
-
# {MAX_CONNECTION_RESET_RETRIES} times).
|
12
|
+
# connection resets" error is encountered, this method retries the
|
13
|
+
# request (up to {MAX_CONNECTION_RESET_RETRIES} times).
|
15
14
|
def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
|
16
15
|
retry_count = 0
|
17
16
|
begin
|
@@ -26,9 +25,9 @@ class Mechanize::HTTP::Agent
|
|
26
25
|
|
27
26
|
# otherwise, shutdown the persistent HTTP connection and try again
|
28
27
|
retry_count += 1
|
29
|
-
$log.warn("
|
30
|
-
self
|
31
|
-
|
28
|
+
$log.warn("#{e.message} (#{e.class}). Retry in #{retry_count} seconds.")
|
29
|
+
sleep(retry_count) # incremental backoff to allow server to self-correct
|
30
|
+
$log.warn("Retry #{http_method.to_s.upcase} #{uri}")
|
32
31
|
retry
|
33
32
|
end
|
34
33
|
end
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
|
3
|
+
module Mechanize::Parser
|
4
|
+
|
5
|
+
# Saves the payload to a specified directory, but using the default
|
6
|
+
# filename suggested by the server. If a file with that name already
|
7
|
+
# exists, this method will try to find a free filename by appending
|
8
|
+
# numbers to the original name. Returns the full path of the saved
|
9
|
+
# file.
|
10
|
+
#
|
11
|
+
# NOTE: this method expects a +#save!+ method to be defined by the
|
12
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
13
|
+
# and +Mechanize::Download#save!+.
|
14
|
+
#
|
15
|
+
# @param directory [String]
|
16
|
+
# @return [String]
|
17
|
+
def save_to(directory)
|
18
|
+
raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!)
|
19
|
+
|
20
|
+
FileUtils.mkdir_p(directory)
|
21
|
+
path = find_free_name(File.join(directory, @filename))
|
22
|
+
save!(path)
|
23
|
+
path
|
24
|
+
end
|
25
|
+
|
26
|
+
# Saves the payload to a specified directory, but using the default
|
27
|
+
# filename suggested by the server. If a file with that name already
|
28
|
+
# exists, that file will be overwritten. Returns the full path of the
|
29
|
+
# saved file.
|
30
|
+
#
|
31
|
+
# NOTE: this method expects a +#save!+ method to be defined by the
|
32
|
+
# class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+
|
33
|
+
# and +Mechanize::Download#save!+.
|
34
|
+
#
|
35
|
+
# @param directory [String]
|
36
|
+
# @return [String]
|
37
|
+
def save_to!(directory)
|
38
|
+
raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!)
|
39
|
+
|
40
|
+
FileUtils.mkdir_p(directory)
|
41
|
+
path = File.join(directory, @filename)
|
42
|
+
save!(path)
|
43
|
+
path
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
data/lib/grubby/page_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
class Grubby::PageScraper < Grubby::Scraper
|
2
2
|
|
3
|
+
# The Page being scraped.
|
4
|
+
#
|
3
5
|
# @return [Mechanize::Page]
|
4
|
-
# The Page being scraped.
|
5
6
|
attr_reader :page
|
6
7
|
|
7
8
|
# @param source [Mechanize::Page]
|
@@ -10,4 +11,23 @@ class Grubby::PageScraper < Grubby::Scraper
|
|
10
11
|
super
|
11
12
|
end
|
12
13
|
|
14
|
+
# Scrapes a locally-stored file. This method is intended for use with
|
15
|
+
# subclasses of +Grubby::PageScraper+.
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# class MyScraper < Grubby::PageScraper
|
19
|
+
# # ...
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# MyScraper.scrape_file("path/to/local_file.html").class # == MyScraper
|
23
|
+
#
|
24
|
+
# @param path [String]
|
25
|
+
# @param agent [Mechanize]
|
26
|
+
# @return [Grubby::PageScraper]
|
27
|
+
def self.scrape_file(path, agent = Grubby.new)
|
28
|
+
uri = URI.join("file:///", File.expand_path(path))
|
29
|
+
body = File.read(path)
|
30
|
+
self.new(Mechanize::Page.new(uri, nil, body, "200", agent))
|
31
|
+
end
|
32
|
+
|
13
33
|
end
|
data/lib/grubby/scraper.rb
CHANGED
@@ -1,8 +1,5 @@
|
|
1
1
|
class Grubby::Scraper
|
2
2
|
|
3
|
-
class Error < RuntimeError
|
4
|
-
end
|
5
|
-
|
6
3
|
# Defines an attribute reader method named by +field+. During
|
7
4
|
# +initialize+, the given block is called, and the attribute is set to
|
8
5
|
# the block's return value. By default, if the block's return value
|
@@ -22,38 +19,48 @@ class Grubby::Scraper
|
|
22
19
|
self.fields << field
|
23
20
|
|
24
21
|
define_method(field) do
|
22
|
+
raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped)
|
25
23
|
return @scraped[field] if @scraped.key?(field)
|
26
24
|
|
27
|
-
unless @errors
|
25
|
+
unless @errors[field]
|
28
26
|
begin
|
29
27
|
value = instance_eval(&block)
|
30
28
|
if value.nil?
|
31
|
-
raise
|
32
|
-
$log.debug("
|
29
|
+
raise FieldValueRequiredError.new(field) unless optional
|
30
|
+
$log.debug("#{self.class}##{field} is nil")
|
33
31
|
end
|
34
32
|
@scraped[field] = value
|
35
|
-
rescue RuntimeError => e
|
33
|
+
rescue RuntimeError, IndexError => e
|
36
34
|
@errors[field] = e
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
40
|
-
raise
|
38
|
+
raise FieldScrapeFailedError.new(field, @errors[field]) if @errors[field]
|
41
39
|
|
42
40
|
@scraped[field]
|
43
41
|
end
|
44
42
|
end
|
45
43
|
|
44
|
+
# The names of all scraped values, as defined by {scrapes}.
|
45
|
+
#
|
46
46
|
# @return [Array<Symbol>]
|
47
|
-
# The names of all scraped values, as defined by {scrapes}.
|
48
47
|
def self.fields
|
49
48
|
@fields ||= []
|
50
49
|
end
|
51
50
|
|
51
|
+
# The source being scraped. Typically a Mechanize pluggable parser
|
52
|
+
# such as +Mechanize::Page+.
|
53
|
+
#
|
52
54
|
# @return [Object]
|
53
|
-
# The source being scraped. Typically a Mechanize pluggable parser
|
54
|
-
# such as +Mechanize::Page+.
|
55
55
|
attr_reader :source
|
56
56
|
|
57
|
+
# Hash of errors raised by blocks passed to {scrapes}. If
|
58
|
+
# {initialize} does not raise +Grubby::Scraper::Error+, this Hash will
|
59
|
+
# be empty.
|
60
|
+
#
|
61
|
+
# @return [Hash<Symbol, StandardError>]
|
62
|
+
attr_reader :errors
|
63
|
+
|
57
64
|
# @param source
|
58
65
|
# @raise [Grubby::Scraper::Error]
|
59
66
|
# if any scraped values result in error
|
@@ -65,18 +72,11 @@ class Grubby::Scraper
|
|
65
72
|
self.class.fields.each do |field|
|
66
73
|
begin
|
67
74
|
self.send(field)
|
68
|
-
rescue
|
75
|
+
rescue FieldScrapeFailedError
|
69
76
|
end
|
70
77
|
end
|
71
78
|
|
72
|
-
unless @errors.empty?
|
73
|
-
listing = @errors.map do |field, error|
|
74
|
-
error_class = " (#{error.class})" unless error.class == RuntimeError
|
75
|
-
error_trace = error.backtrace.join("\n").indent(2)
|
76
|
-
"* #{field} -- #{error.message}#{error_class}\n#{error_trace}"
|
77
|
-
end
|
78
|
-
raise Error.new("Failed to scrape the following fields:\n#{listing.join("\n")}")
|
79
|
-
end
|
79
|
+
raise Error.new(self) unless @errors.empty?
|
80
80
|
end
|
81
81
|
|
82
82
|
# Returns the scraped value named by +field+.
|
@@ -96,4 +96,43 @@ class Grubby::Scraper
|
|
96
96
|
@scraped.dup
|
97
97
|
end
|
98
98
|
|
99
|
+
class Error < RuntimeError
|
100
|
+
BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner|
|
101
|
+
cleaner.add_silencer do |line|
|
102
|
+
line.include?(__dir__) && line.include?("scraper.rb:")
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# @return [Grubby::Scraper]
|
107
|
+
# The Scraper that raised this error.
|
108
|
+
attr_accessor :scraper
|
109
|
+
|
110
|
+
def initialize(scraper)
|
111
|
+
self.scraper = scraper
|
112
|
+
|
113
|
+
listing = scraper.errors.
|
114
|
+
reject{|field, error| error.is_a?(FieldScrapeFailedError) }.
|
115
|
+
map do |field, error|
|
116
|
+
"* `#{field}` (#{error.class})\n" +
|
117
|
+
error.message.indent(2) + "\n\n" +
|
118
|
+
BACKTRACE_CLEANER.clean(error.backtrace).join("\n").indent(4) + "\n"
|
119
|
+
end.
|
120
|
+
join("\n")
|
121
|
+
|
122
|
+
super("Failed to scrape the following fields:\n#{listing}")
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
class FieldScrapeFailedError < RuntimeError
|
127
|
+
def initialize(field, field_error)
|
128
|
+
super("`#{field}` raised #{field_error.class}")
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
class FieldValueRequiredError < RuntimeError
|
133
|
+
def initialize(field)
|
134
|
+
super("`#{field}` is nil but is not marked as optional")
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
99
138
|
end
|
data/lib/grubby/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grubby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hefner
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.1'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: ryoba
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: bundler
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -173,6 +187,7 @@ extra_rdoc_files: []
|
|
173
187
|
files:
|
174
188
|
- ".gitignore"
|
175
189
|
- ".travis.yml"
|
190
|
+
- CHANGELOG.md
|
176
191
|
- Gemfile
|
177
192
|
- LICENSE.txt
|
178
193
|
- README.md
|
@@ -189,7 +204,7 @@ files:
|
|
189
204
|
- lib/grubby/mechanize/file.rb
|
190
205
|
- lib/grubby/mechanize/link.rb
|
191
206
|
- lib/grubby/mechanize/page.rb
|
192
|
-
- lib/grubby/
|
207
|
+
- lib/grubby/mechanize/parser.rb
|
193
208
|
- lib/grubby/page_scraper.rb
|
194
209
|
- lib/grubby/scraper.rb
|
195
210
|
- lib/grubby/version.rb
|
@@ -213,7 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
213
228
|
version: '0'
|
214
229
|
requirements: []
|
215
230
|
rubyforge_project:
|
216
|
-
rubygems_version: 2.6
|
231
|
+
rubygems_version: 2.7.6
|
217
232
|
signing_key:
|
218
233
|
specification_version: 4
|
219
234
|
summary: Fail-fast web scraping
|
@@ -1,27 +0,0 @@
|
|
1
|
-
module Nokogiri::XML::Searchable
|
2
|
-
|
3
|
-
# Searches the node using the given XPath or CSS queries, and returns
|
4
|
-
# the results. Raises an exception if there are no results. See also
|
5
|
-
# +#search+.
|
6
|
-
#
|
7
|
-
# @param queries [Array<String>]
|
8
|
-
# @return [Array<Nokogiri::XML::Element>]
|
9
|
-
# @raise [RuntimeError] if queries yield no results
|
10
|
-
def search!(*queries)
|
11
|
-
results = search(*queries)
|
12
|
-
raise "No elements matching #{queries.map(&:inspect).join(" OR ")}" if results.empty?
|
13
|
-
results
|
14
|
-
end
|
15
|
-
|
16
|
-
# Searches the node using the given XPath or CSS queries, and returns
|
17
|
-
# only the first result. Raises an exception if there are no results.
|
18
|
-
# See also +#at+.
|
19
|
-
#
|
20
|
-
# @param queries [Array<String>]
|
21
|
-
# @return [Nokogiri::XML::Element]
|
22
|
-
# @raise [RuntimeError] if queries yield no results
|
23
|
-
def at!(*queries)
|
24
|
-
search!(*queries).first
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|