grubby 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 88c8ecc06ffba254ee9e9de3d42f868c0692b244
+   data.tar.gz: 8cd3445f33c9f7db05550947d686293ceee620c1
+ SHA512:
+   metadata.gz: d9dc7435763425d54d82f4930935c913cd67eb15a1250c58c7ccf3b0e419eeb6ceb87289ed3c22386faa96e3ae3584a9f39cf8e671af8b56b52bb3e2c8257e4d
+   data.tar.gz: 2c5c96993c8a673274a4acc34c3f82a8719fbea508c1dffb5ce8cff1d4a13cd90c9fcbf8a3c743500a31f08419c687c24adf5f2aee464a8a4f1935b2b302b184
data/.gitignore ADDED
@@ -0,0 +1,9 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
+ sudo: false
+ language: ruby
+ rvm:
+   - 2.2.5
+ before_install: gem install bundler -v 1.15.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "https://rubygems.org"
+
+ # Specify your gem's dependencies in grubby.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2017 Jonathan Hefner
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,127 @@
+ # grubby
+
+ [Fail-fast] web scraping. *grubby* adds a layer of utility and
+ error-checking atop the marvelous [Mechanize gem]. See API summary
+ below, or browse the [full documentation].
+
+ [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
+ [Mechanize gem]: https://rubygems.org/gems/mechanize
+ [full documentation]: http://www.rubydoc.info/gems/grubby/
+
+
+ ## Examples
+
+ The following example scrapes the [Hacker News] front page:
+
+ ```ruby
+ require "grubby"
+
+ class HackerNews < Grubby::PageScraper
+
+   scrapes(:items) do
+     page.search!(".athing").map{|item| HackerNewsItem.new(item) }
+   end
+
+ end
+
+ class HackerNewsItem < Grubby::Scraper
+
+   scrapes(:title) { @row1.at!(".storylink").text }
+   scrapes(:submitter) { @row2.at!(".hnuser").text }
+   scrapes(:story_uri) { URI.join(@base_uri, @row1.at!(".storylink")["href"]) }
+   scrapes(:comments_uri) { URI.join(@base_uri, @row2.at!(".age a")["href"]) }
+
+   def initialize(source)
+     @row1 = source
+     @row2 = source.next_sibling
+     @base_uri = source.document.url
+     super
+   end
+
+ end
+
+ grubby = Grubby.new
+
+ # The following line will raise an exception if anything goes wrong
+ # during the scraping process. For example, if the structure of the
+ # HTML does not match expectations, either due to a bad assumption or
+ # due to a site-wide change, the script will terminate immediately with
+ # a relevant error message. This prevents bad values from propagating
+ # and causing hard-to-trace errors.
+ hn = HackerNews.new(grubby.get("https://news.ycombinator.com/news"))
+
+ puts hn.items.take(10).map(&:title) # your scraping logic goes here
+ ```
+
+ [Hacker News]: https://news.ycombinator.com/news
+
+
+ ## Core API
+
+ - [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
+   - [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
+   - [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
+   - [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
+ - [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
+   - [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
+   - [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
+   - [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
+   - [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
+   - [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
+ - [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
+   - [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
+ - [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
+   - [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
+ - Nokogiri::XML::Searchable
+   - [#at!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:at%21)
+   - [#search!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:search%21)
+ - Mechanize::Page
+   - [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
+   - [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
+ - Mechanize::Page::Link
+   - [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
+
+
+ ## Supplemental API
+
+ *grubby* uses several gems which extend core Ruby objects with
+ convenience methods. When you require *grubby* you automatically make
+ these methods available. See each gem below for its specific API
+ documentation:
+
+ - [Active Support](https://rubygems.org/gems/activesupport)
+   ([docs](http://www.rubydoc.info/gems/activesupport/))
+ - [casual_support](https://rubygems.org/gems/casual_support)
+   ([docs](http://www.rubydoc.info/gems/casual_support/))
+ - [gorge](https://rubygems.org/gems/gorge)
+   ([docs](http://www.rubydoc.info/gems/gorge/))
+ - [mini_sanity](https://rubygems.org/gems/mini_sanity)
+   ([docs](http://www.rubydoc.info/gems/mini_sanity/))
+ - [pleasant_path](https://rubygems.org/gems/pleasant_path)
+   ([docs](http://www.rubydoc.info/gems/pleasant_path/))
+
+
+ ## Installation
+
+ Install from [RubyGems](https://rubygems.org/gems/grubby):
+
+ ```bash
+ $ gem install grubby
+ ```
+
+ Then require in your Ruby script:
+
+ ```ruby
+ require "grubby"
+ ```
+
+
+ ## Contributing
+
+ Run `rake test` to run the tests. You can also run `rake irb` for an
+ interactive prompt that pre-loads the project code.
+
+
+ ## License
+
+ [MIT License](https://opensource.org/licenses/MIT)
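
Editorial note: the "Supplemental API" section of the README above lists gems whose core-object extensions ship with *grubby*. A minimal sketch using only extension methods that also appear in this gem's own source (`indent` from Active Support in scraper.rb, `sha1` from gorge in mechanize/file.rb, `to_pathname` from pleasant_path in lib/grubby.rb, `assert_kind_of!` from mini_sanity in page_scraper.rb); the literal values are illustrative:

```ruby
require "grubby"

# Active Support: indent every line of a string by two spaces.
"scrape failed".indent(2)             # => "  scrape failed"

# gorge: SHA-1 hex digest of a string.
"hello world".sha1                    # => SHA-1 hex digest string

# pleasant_path: convert a String path into a Pathname.
"tmp/journal.txt".to_pathname         # => Pathname for "tmp/journal.txt"

# mini_sanity: return the receiver, or raise if it is not the expected class.
"not a page".assert_kind_of!(String)  # => "not a page"
```
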
data/Rakefile ADDED
@@ -0,0 +1,26 @@
+ require "bundler/gem_tasks"
+ require "rake/testtask"
+ require "yard"
+
+
+ YARD::Rake::YardocTask.new(:doc) do |t|
+ end
+
+ desc "Launch IRB with this gem pre-loaded"
+ task :irb do
+   # HACK because lib/grubby/version is prematurely loaded by bundler/gem_tasks
+   Object.send(:remove_const, :Grubby)
+
+   require "grubby"
+   require "irb"
+   ARGV.clear
+   IRB.start
+ end
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << "test"
+   t.libs << "lib"
+   t.test_files = FileList["test/**/*_test.rb"]
+ end
+
+ task :default => :test
data/grubby.gemspec ADDED
@@ -0,0 +1,35 @@
+ # coding: utf-8
+ lib = File.expand_path("../lib", __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require "grubby/version"
+
+ Gem::Specification.new do |spec|
+   spec.name = "grubby"
+   spec.version = Grubby::VERSION
+   spec.authors = ["Jonathan Hefner"]
+   spec.email = ["jonathan.hefner@gmail.com"]
+
+   spec.summary = %q{Fail-fast web scraping}
+   spec.homepage = "https://github.com/jonathanhefner/grubby"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0").reject do |f|
+     f.match(%r{^(test|spec|features)/})
+   end
+   spec.bindir = "exe"
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_runtime_dependency "activesupport", "~> 5.0"
+   spec.add_runtime_dependency "casual_support", "~> 3.0"
+   spec.add_runtime_dependency "dumb_delimited", "~> 1.0"
+   spec.add_runtime_dependency "gorge", "~> 1.0"
+   spec.add_runtime_dependency "mechanize", "~> 2.7"
+   spec.add_runtime_dependency "mini_sanity", "~> 1.0"
+   spec.add_runtime_dependency "pleasant_path", "~> 1.1"
+
+   spec.add_development_dependency "bundler", "~> 1.15"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "minitest", "~> 5.0"
+   spec.add_development_dependency "yard", "~> 0.9"
+ end
data/lib/grubby.rb ADDED
@@ -0,0 +1,169 @@
+ require "active_support/all"
+ require "casual_support"
+ require "dumb_delimited"
+ require "gorge"
+ require "mechanize"
+ require "mini_sanity"
+ require "pleasant_path"
+
+ require_relative "grubby/log"
+
+ require_relative "grubby/core_ext/string"
+ require_relative "grubby/core_ext/uri"
+ require_relative "grubby/mechanize/fetch_with_retry"
+ require_relative "grubby/mechanize/download"
+ require_relative "grubby/mechanize/file"
+ require_relative "grubby/mechanize/link"
+ require_relative "grubby/mechanize/page"
+ require_relative "grubby/nokogiri/searchable"
+
+
+ class Grubby < Mechanize
+
+   # @return [Integer, Float, Range<Integer>, Range<Float>]
+   #   The enforced minimum amount of time to wait between requests, in
+   #   seconds. If the value is a Range, a random number within the
+   #   Range is chosen for each request.
+   attr_accessor :time_between_requests
+
+   # @param singleton_journal [Pathname, String]
+   #   Optional journal file to persist the list of resources processed
+   #   by {singleton}. Useful to ensure only-once processing across
+   #   multiple program runs.
+   def initialize(singleton_journal = nil)
+     super()
+
+     # Prevent "memory leaks", and prevent mistakenly blank urls from
+     # resolving. (Blank urls resolve as a path relative to the last
+     # history entry. Without this setting, an erroneous `agent.get("")`
+     # could sometimes successfully fetch a page.)
+     self.max_history = 0
+
+     # Prevent files of unforeseen content type from being buffered into
+     # memory by default, in case they are very large. However, increase
+     # the threshold for what is considered "large", to prevent
+     # unnecessary writes to disk.
+     #
+     # References:
+     #   - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
+     #   - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
+     #   - http://docs.seattlerb.org/mechanize/Mechanize/File.html
+     self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
+     self.pluggable_parser.default = Mechanize::Download
+     self.pluggable_parser["text/plain"] = Mechanize::File
+     self.pluggable_parser["application/json"] = Grubby::JsonParser
+
+     # Set up configurable rate limiting, and choose a reasonable default
+     # rate limit.
+     self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
+     self.time_between_requests = 1.0
+
+     @journal = singleton_journal ?
+       singleton_journal.to_pathname.touch_file : Pathname::NULL
+     @seen = SingletonKey.parse_file(@journal).
+       group_by(&:purpose).transform_values{|sks| sks.map(&:key).index_to{ true } }
+   end
+
+   # Calls +#get+ with each of +mirror_uris+ until a successful
+   # ("200 OK") response is received, and returns that +#get+ result.
+   # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
+   # the last mirror.
+   #
+   # @param mirror_uris [Array<String>]
+   # @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
+   # @raise [Mechanize::ResponseCodeError]
+   #   if all +mirror_uris+ fail
+   def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
+     i = 0
+     begin
+       get(mirror_uris[i], parameters, referer, headers)
+     rescue Mechanize::ResponseCodeError => e
+       i += 1
+       if i >= mirror_uris.length
+         raise
+       else
+         $log.info("Mirror failed with response code #{e.response_code}: #{mirror_uris[i - 1]}")
+         $log.debug("Trying next mirror: #{mirror_uris[i]}")
+         retry
+       end
+     end
+   end
+
+   # Ensures only-once processing of the resource indicated by +target+
+   # for the specified +purpose+. A list of previously-processed
+   # resource URIs and content hashes is maintained in the Grubby
+   # instance. The given block is called with the fetched resource only
+   # if the resource's URI and the resource's content hash have not been
+   # previously processed under the specified +purpose+.
+   #
+   # @param target [URI, String, Mechanize::Page::Link, #to_absolute_uri]
+   #   designates the resource to fetch
+   # @param purpose [String]
+   #   the purpose of processing the resource
+   # @yield [resource]
+   #   processes the resource
+   # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
+   #   the fetched resource
+   # @return [Boolean]
+   #   whether the given block was called
+   # @raise [Mechanize::ResponseCodeError]
+   #   if fetching the resource results in error (see +Mechanize#get+)
+   def singleton(target, purpose = "")
+     series = []
+
+     original_url = target.to_absolute_uri
+     return if skip_singleton?(purpose, original_url.to_s, series)
+
+     url = normalize_url(original_url)
+     return if skip_singleton?(purpose, url.to_s, series)
+
+     $log.info("Fetching #{url}")
+     resource = get(url)
+     skip = skip_singleton?(purpose, resource.uri.to_s, series) |
+       skip_singleton?(purpose, "content hash: #{resource.content_hash}", series)
+
+     yield resource unless skip
+
+     series.map{|k| SingletonKey.new(purpose, k) }.append_to_file(@journal)
+
+     !skip
+   end
+
+
+   private
+
+   SingletonKey = DumbDelimited[:purpose, :key]
+
+   def skip_singleton?(purpose, key, series)
+     return false if series.include?(key)
+     series << key
+     already = (@seen[purpose.to_s] ||= {}).displace(key, true)
+     $log.info("Skipping #{series.first} (already seen #{series.last})") if already
+     already
+   end
+
+   def normalize_url(url)
+     url = url.dup
+     $log.warn("Discarding fragment in URL: #{url}") if url.fragment
+     url.fragment = nil
+     url.path = url.path.chomp("/")
+     url
+   end
+
+   def sleep_between_requests
+     @last_request_at ||= 0.0
+     delay_duration = @time_between_requests.is_a?(Range) ?
+       rand(@time_between_requests) : @time_between_requests
+     sleep_duration = @last_request_at + delay_duration - Time.now.to_f
+     sleep(sleep_duration) if sleep_duration > 0
+     @last_request_at = Time.now.to_f
+   end
+
+ end
+
+
+ require_relative "grubby/version"
+ require_relative "grubby/json_parser"
+ require_relative "grubby/scraper"
+ require_relative "grubby/page_scraper"
+ require_relative "grubby/json_scraper"
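
Editorial note: a usage sketch for the public methods documented above (`time_between_requests`, `get_mirrored`, `singleton`); the mirror URLs, journal filename, and link filter are illustrative:

```ruby
require "grubby"

# Optional journal file persists singleton-processing state across runs.
grubby = Grubby.new("processed.txt")

# Rate limit: wait a random 2-4 seconds between requests.
grubby.time_between_requests = 2.0..4.0

# Try each mirror in order until one returns "200 OK".
page = grubby.get_mirrored([
  "https://mirror1.example.com/index.html",
  "https://mirror2.example.com/index.html",
])

# Process each linked file at most once, keyed by URI and content hash.
page.links_with(href: /\.pdf\z/).each do |link|
  grubby.singleton(link, "download") do |resource|
    resource.save(File.basename(resource.uri.path))
  end
end
```
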
data/lib/grubby/core_ext/string.rb ADDED
@@ -0,0 +1,12 @@
+ class String
+
+   # Constructs a URI from the String. Raises an exception if the String
+   # does not denote an absolute URI.
+   #
+   # @return [URI]
+   # @raise [RuntimeError] if the String does not denote an absolute URI
+   def to_absolute_uri
+     URI(self).to_absolute_uri
+   end
+
+ end
data/lib/grubby/core_ext/uri.rb ADDED
@@ -0,0 +1,12 @@
+ module URI
+
+   # Raises an exception if the URI is not +absolute?+.
+   #
+   # @return [self]
+   # @raise [RuntimeError] if the URI is not +absolute?+
+   def to_absolute_uri
+     raise "URI is not absolute: #{self}" unless self.absolute?
+     self
+   end
+
+ end
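
Editorial note: the two core_ext files above add `to_absolute_uri` to String and URI; a quick sketch of the fail-fast behavior (URIs are illustrative):

```ruby
require "grubby"

"https://example.com/feed".to_absolute_uri       # => URI::HTTPS for https://example.com/feed
URI("https://example.com/feed").to_absolute_uri  # => same URI, returned as-is

"/relative/path".to_absolute_uri  # raises RuntimeError: URI is not absolute: /relative/path
```
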
data/lib/grubby/json_parser.rb ADDED
@@ -0,0 +1,45 @@
+ class Grubby::JsonParser < Mechanize::File
+
+   # Returns the options to use when parsing JSON. The returned options
+   # Hash is not +dup+ed and can be modified directly. Any modifications
+   # will be applied to all future parsing.
+   #
+   # For information about available options, see
+   # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+   # +JSON.parse+}.
+   #
+   # @return [Hash]
+   def self.json_parse_options
+     @json_parse_options ||= {
+       max_nesting: false,
+       allow_nan: false,
+       symbolize_names: false,
+       create_additions: false,
+       object_class: Hash,
+       array_class: Array,
+     }
+   end
+
+   # Sets the options to use when parsing JSON. The entire options Hash
+   # is replaced, and the new value will be applied to all future
+   # parsing. To set options individually, see {json_parse_options}.
+   #
+   # For information about available options, see
+   # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
+   # +JSON.parse+}.
+   #
+   # @param options [Hash]
+   def self.json_parse_options=(options)
+     @json_parse_options = options
+   end
+
+   # @return [Hash, Array]
+   #   The parsed JSON data.
+   attr_reader :json
+
+   def initialize(uri = nil, response = nil, body = nil, code = nil)
+     @json = body && JSON.parse(body, self.class.json_parse_options)
+     super
+   end
+
+ end
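
Editorial note: because `json_parse_options` returns the live Hash (not a copy), individual options can be toggled in place, or the whole Hash can be replaced via the setter; a sketch:

```ruby
require "grubby"

# Symbolize keys for all subsequently parsed JSON responses.
Grubby::JsonParser.json_parse_options[:symbolize_names] = true

# Or replace the options wholesale.
Grubby::JsonParser.json_parse_options = { max_nesting: false, allow_nan: false }
```
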
data/lib/grubby/json_scraper.rb ADDED
@@ -0,0 +1,13 @@
+ class Grubby::JsonScraper < Grubby::Scraper
+
+   # @return [Hash, Array]
+   #   The parsed JSON data being scraped.
+   attr_reader :json
+
+   # @param source [Grubby::JsonParser]
+   def initialize(source)
+     @json = source.assert_kind_of!(Grubby::JsonParser).json
+     super
+   end
+
+ end
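
Editorial note: a minimal JsonScraper sketch. Grubby registers Grubby::JsonParser for the "application/json" content type (see lib/grubby.rb above), so `grubby.get` on a JSON endpoint yields a source this class accepts; the URL and JSON keys are illustrative:

```ruby
require "grubby"

class RepoScraper < Grubby::JsonScraper
  scrapes(:name)  { json["name"] }
  scrapes(:stars) { json["stargazers_count"] }
  # nil values are tolerated only for optional fields.
  scrapes(:license, optional: true) { json["license"] && json["license"]["name"] }
end

repo = RepoScraper.new(Grubby.new.get("https://api.github.com/repos/jonathanhefner/grubby"))
repo.name   # scraped value, or an exception was raised during initialize
repo.to_h   # => Hash of all scraped fields
```
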
data/lib/grubby/log.rb ADDED
@@ -0,0 +1,5 @@
+ $log ||= Logger.new($stderr).tap do |logger|
+   logger.formatter = ->(severity, datetime, progname, msg) do
+     "[#{datetime.to_ymd} #{datetime.to_hms}] #{severity} #{msg}\n"
+   end
+ end
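
Editorial note: because log.rb assigns with `||=`, a program can supply its own `$log` before requiring grubby and the default STDERR logger is skipped; a sketch (the log filename is illustrative):

```ruby
require "logger"

# Assign $log before `require "grubby"` to override the default logger.
$log = Logger.new("scrape.log")
$log.level = Logger::INFO

require "grubby"
$log.info("using a custom logger")
```
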
data/lib/grubby/mechanize/download.rb ADDED
@@ -0,0 +1,8 @@
+ class Mechanize::Download
+
+   # private
+   def content_hash
+     @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
+   end
+
+ end
data/lib/grubby/mechanize/fetch_with_retry.rb ADDED
@@ -0,0 +1,39 @@
+ # This monkey patch attempts to fix the insidious "too many connection
+ # resets" bug described here: https://github.com/sparklemotion/mechanize/issues/123
+ #
+ # The code is taken and modified from this helpful blog article:
+ # http://scottwb.com/blog/2013/11/09/defeating-the-infamous-mechanize-too-many-connection-resets-bug/
+ class Mechanize::HTTP::Agent
+
+   MAX_CONNECTION_RESET_RETRIES = 9
+   IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
+
+   # Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
+   # connection resets" error is encountered, this method shuts down the
+   # persistent HTTP connection, and then retries the request (up to
+   # {MAX_CONNECTION_RESET_RETRIES} times).
+   def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
+     retry_count = 0
+     begin
+       fetch_without_retry(uri, http_method, headers, params, referer, redirects)
+     rescue Net::HTTP::Persistent::Error => e
+       # raise if different type of error
+       raise unless e.message.include?("too many connection resets")
+       # raise if non-idempotent http method
+       raise unless IDEMPOTENT_HTTP_METHODS.include?(http_method)
+       # raise if we've tried too many times
+       raise if retry_count >= MAX_CONNECTION_RESET_RETRIES
+
+       # otherwise, shut down the persistent HTTP connection and try again
+       retry_count += 1
+       $log.warn("Possible connection reset bug. Retry(#{retry_count}) #{http_method.to_s.upcase} #{uri}")
+       self.http.shutdown
+       sleep(retry_count) # incremental backoff in case problem is with server
+       retry
+     end
+   end
+
+   alias_method :fetch_without_retry, :fetch
+   alias_method :fetch, :fetch_with_retry
+
+ end
data/lib/grubby/mechanize/file.rb ADDED
@@ -0,0 +1,8 @@
+ class Mechanize::File
+
+   # private
+   def content_hash
+     @content_hash ||= self.body.to_s.sha1
+   end
+
+ end
data/lib/grubby/mechanize/link.rb ADDED
@@ -0,0 +1,20 @@
+ class Mechanize::Page::Link
+
+   # Returns the URI represented by the Link, in absolute form. If the
+   # href attribute of the Link is expressed in relative form, the URI of
+   # the Link's Page is used to convert to absolute form.
+   #
+   # @return [URI]
+   def to_absolute_uri
+     # Via the W3 spec: "If the a element has no href attribute, then the
+     # element represents a placeholder for where a link might otherwise
+     # have been placed, if it had been relevant, consisting of just the
+     # element's contents."[1] So, we assume a link with no href
+     # attribute (i.e. `uri == nil`) should be treated the same as an
+     # intra-page link.
+     #
+     # [1]: https://www.w3.org/TR/2016/REC-html51-20161101/textlevel-semantics.html#the-a-element
+     URI.join(self.page.uri, self.uri || "#").to_absolute_uri
+   end
+
+ end
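
Editorial note: a sketch of how `to_absolute_uri` resolves relative hrefs against the page URI; the page URL and link are illustrative:

```ruby
require "grubby"

page = Grubby.new.get("https://example.com/articles/")
link = page.links.first        # assume its href is the relative path "../about"
link.uri                       # => relative URI "../about"
link.to_absolute_uri           # => absolute URI "https://example.com/about"
```
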
data/lib/grubby/mechanize/page.rb ADDED
@@ -0,0 +1,17 @@
+ class Mechanize::Page
+
+   # @!method search!(*queries)
+   #   See {::Nokogiri::XML::Searchable#search!}.
+   #
+   #   @param queries [Array<String>]
+   #   @return [Array<Nokogiri::XML::Element>]
+   def_delegators :parser, :search!
+
+   # @!method at!(*queries)
+   #   See {::Nokogiri::XML::Searchable#at!}.
+   #
+   #   @param queries [Array<String>]
+   #   @return [Nokogiri::XML::Element]
+   def_delegators :parser, :at!
+
+ end
data/lib/grubby/nokogiri/searchable.rb ADDED
@@ -0,0 +1,27 @@
+ module Nokogiri::XML::Searchable
+
+   # Searches the node using the given XPath or CSS queries, and returns
+   # the results. Raises an exception if there are no results. See also
+   # +#search+.
+   #
+   # @param queries [Array<String>]
+   # @return [Array<Nokogiri::XML::Element>]
+   # @raise [RuntimeError] if queries yield no results
+   def search!(*queries)
+     results = search(*queries)
+     raise "No elements matching #{queries.map(&:inspect).join(" OR ")}" if results.empty?
+     results
+   end
+
+   # Searches the node using the given XPath or CSS queries, and returns
+   # only the first result. Raises an exception if there are no results.
+   # See also +#at+.
+   #
+   # @param queries [Array<String>]
+   # @return [Nokogiri::XML::Element]
+   # @raise [RuntimeError] if queries yield no results
+   def at!(*queries)
+     search!(*queries).first
+   end
+
+ end
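
Editorial note: `search!` and `at!` mirror Nokogiri's `search` and `at` but raise when nothing matches, which is how the scrapers above fail fast; a sketch with illustrative URL and selectors:

```ruby
require "grubby"

page = Grubby.new.get("https://example.com/")

page.search("p.missing")   # => empty node set (plain Nokogiri behavior)
page.search!("p.missing")  # raises RuntimeError: No elements matching "p.missing"
page.at!("h1").text        # first matching element, or an exception if none
```
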
data/lib/grubby/page_scraper.rb ADDED
@@ -0,0 +1,13 @@
+ class Grubby::PageScraper < Grubby::Scraper
+
+   # @return [Mechanize::Page]
+   #   The Page being scraped.
+   attr_reader :page
+
+   # @param source [Mechanize::Page]
+   def initialize(source)
+     @page = source.assert_kind_of!(Mechanize::Page)
+     super
+   end
+
+ end
data/lib/grubby/scraper.rb ADDED
@@ -0,0 +1,99 @@
+ class Grubby::Scraper
+
+   class Error < RuntimeError
+   end
+
+   # Defines an attribute reader method named by +field+. During
+   # +initialize+, the given block is called, and the attribute is set to
+   # the block's return value. By default, if the block's return value
+   # is nil, an exception will be raised. To prevent this behavior, set
+   # +optional+ to true.
+   #
+   # @param field [Symbol, String]
+   #   name of the scraped value
+   # @param optional [Boolean]
+   #   whether to permit a nil scraped value
+   # @yield []
+   #   scrapes the value
+   # @yieldreturn [Object]
+   #   scraped value
+   def self.scrapes(field, optional: false, &block)
+     field = field.to_sym
+     self.fields << field
+
+     define_method(field) do
+       return @scraped[field] if @scraped.key?(field)
+
+       unless @errors.key?(field)
+         begin
+           value = instance_eval(&block)
+           if value.nil?
+             raise "`#{field}` cannot be nil" unless optional
+             $log.debug("Scraped nil value for #{self.class}##{field}")
+           end
+           @scraped[field] = value
+         rescue RuntimeError => e
+           @errors[field] = e
+         end
+       end
+
+       raise "`#{field}` raised a #{@errors[field].class}" if @errors.key?(field)
+
+       @scraped[field]
+     end
+   end
+
+   # @return [Array<Symbol>]
+   #   The names of all scraped values, as defined by {scrapes}.
+   def self.fields
+     @fields ||= []
+   end
+
+   # @return [Object]
+   #   The source being scraped. Typically a Mechanize pluggable parser
+   #   such as +Mechanize::Page+.
+   attr_reader :source
+
+   # @param source
+   # @raise [Grubby::Scraper::Error]
+   #   if any scraped values result in error
+   def initialize(source)
+     @source = source
+     @scraped = {}
+     @errors = {}
+
+     self.class.fields.each do |field|
+       begin
+         self.send(field)
+       rescue RuntimeError
+       end
+     end
+
+     unless @errors.empty?
+       listing = @errors.map do |field, error|
+         error_class = " (#{error.class})" unless error.class == RuntimeError
+         error_trace = error.backtrace.join("\n").indent(2)
+         "* #{field} -- #{error.message}#{error_class}\n#{error_trace}"
+       end
+       raise Error.new("Failed to scrape the following fields:\n#{listing.join("\n")}")
+     end
+   end
+
+   # Returns the scraped value named by +field+.
+   #
+   # @param field [Symbol, String]
+   # @return [Object]
+   # @raise [RuntimeError]
+   #   if +field+ is not a valid name
+   def [](field)
+     @scraped.fetch(field.to_sym)
+   end
+
+   # Returns all scraped values as a Hash.
+   #
+   # @return [Hash<Symbol, Object>]
+   def to_h
+     @scraped.dup
+   end
+
+ end
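
Editorial note: a sketch showing `.scrapes` with an optional field plus the `.fields`, `#[]`, and `#to_h` accessors documented above; the URL and CSS selectors are illustrative:

```ruby
require "grubby"

class ProductScraper < Grubby::PageScraper
  scrapes(:name)  { page.at!(".product-name").text.strip }
  scrapes(:price) { page.at!(".price").text }
  # A nil value is tolerated only because this field is marked optional.
  scrapes(:sku, optional: true) { page.at(".sku") && page.at(".sku").text }
end

product = ProductScraper.new(Grubby.new.get("https://example.com/products/1"))
ProductScraper.fields   # => [:name, :price, :sku]
product[:price]         # same value as product.price
product.to_h            # => Hash of all scraped fields
```
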
data/lib/grubby/version.rb ADDED
@@ -0,0 +1,3 @@
+ class Grubby
+   VERSION = "1.0.0"
+ end
metadata ADDED
@@ -0,0 +1,220 @@
+ --- !ruby/object:Gem::Specification
+ name: grubby
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: ruby
+ authors:
+ - Jonathan Hefner
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2017-09-05 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: activesupport
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+ - !ruby/object:Gem::Dependency
+   name: casual_support
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: dumb_delimited
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   name: gorge
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.7'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.7'
+ - !ruby/object:Gem::Dependency
+   name: mini_sanity
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   name: pleasant_path
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.1'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.15'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.15'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.0'
+ - !ruby/object:Gem::Dependency
+   name: yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.9'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.9'
+ description:
+ email:
+ - jonathan.hefner@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".travis.yml"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - grubby.gemspec
+ - lib/grubby.rb
+ - lib/grubby/core_ext/string.rb
+ - lib/grubby/core_ext/uri.rb
+ - lib/grubby/json_parser.rb
+ - lib/grubby/json_scraper.rb
+ - lib/grubby/log.rb
+ - lib/grubby/mechanize/download.rb
+ - lib/grubby/mechanize/fetch_with_retry.rb
+ - lib/grubby/mechanize/file.rb
+ - lib/grubby/mechanize/link.rb
+ - lib/grubby/mechanize/page.rb
+ - lib/grubby/nokogiri/searchable.rb
+ - lib/grubby/page_scraper.rb
+ - lib/grubby/scraper.rb
+ - lib/grubby/version.rb
+ homepage: https://github.com/jonathanhefner/grubby
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.6.13
+ signing_key:
+ specification_version: 4
+ summary: Fail-fast web scraping
+ test_files: []