grubby 1.0.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 88c8ecc06ffba254ee9e9de3d42f868c0692b244
4
+ data.tar.gz: 8cd3445f33c9f7db05550947d686293ceee620c1
5
+ SHA512:
6
+ metadata.gz: d9dc7435763425d54d82f4930935c913cd67eb15a1250c58c7ccf3b0e419eeb6ceb87289ed3c22386faa96e3ae3584a9f39cf8e671af8b56b52bb3e2c8257e4d
7
+ data.tar.gz: 2c5c96993c8a673274a4acc34c3f82a8719fbea508c1dffb5ce8cff1d4a13cd90c9fcbf8a3c743500a31f08419c687c24adf5f2aee464a8a4f1935b2b302b184
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.2.5
5
+ before_install: gem install bundler -v 1.15.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in grubby.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Jonathan Hefner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # grubby
2
+
3
+ [Fail-fast] web scraping. *grubby* adds a layer of utility and
4
+ error-checking atop the marvelous [Mechanize gem]. See API summary
5
+ below, or browse the [full documentation].
6
+
7
+ [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast
8
+ [Mechanize gem]: https://rubygems.org/gems/mechanize
9
+ [full documentation]: http://www.rubydoc.info/gems/grubby/
10
+
11
+
12
+ ## Examples
13
+
14
+ The following example scrapes the [Hacker News] front page:
15
+
16
+ ```ruby
17
+ require "grubby"
18
+
19
+ class HackerNews < Grubby::PageScraper
20
+
21
+ scrapes(:items) do
22
+ page.search!(".athing").map{|item| HackerNewsItem.new(item) }
23
+ end
24
+
25
+ end
26
+
27
+ class HackerNewsItem < Grubby::Scraper
28
+
29
+ scrapes(:title) { @row1.at!(".storylink").text }
30
+ scrapes(:submitter) { @row2.at!(".hnuser").text }
31
+ scrapes(:story_uri) { URI.join(@base_uri, @row1.at!(".storylink")["href"]) }
32
+ scrapes(:comments_uri) { URI.join(@base_uri, @row2.at!(".age a")["href"]) }
33
+
34
+ def initialize(source)
35
+ @row1 = source
36
+ @row2 = source.next_sibling
37
+ @base_uri = source.document.url
38
+ super
39
+ end
40
+
41
+ end
42
+
43
+ grubby = Grubby.new
44
+
45
+ # The following line will raise an exception if anything goes wrong
46
+ # during the scraping process. For example, if the structure of the
47
+ # HTML does not match expectations, either due to a bad assumption or
48
+ # due to a site-wide change, the script will terminate immediately with
49
+ # a relevant error message. This prevents bad values from propagating
50
+ # and causing hard-to-trace errors.
51
+ hn = HackerNews.new(grubby.get("https://news.ycombinator.com/news"))
52
+
53
+ puts hn.items.take(10).map(&:title) # your scraping logic goes here
54
+ ```
55
+
56
+ [Hacker News]: https://news.ycombinator.com/news
57
+
58
+
59
+ ## Core API
60
+
61
+ - [Grubby](http://www.rubydoc.info/gems/grubby/Grubby)
62
+ - [#get_mirrored](http://www.rubydoc.info/gems/grubby/Grubby:get_mirrored)
63
+ - [#singleton](http://www.rubydoc.info/gems/grubby/Grubby:singleton)
64
+ - [#time_between_requests](http://www.rubydoc.info/gems/grubby/Grubby:time_between_requests)
65
+ - [Scraper](http://www.rubydoc.info/gems/grubby/Grubby/Scraper)
66
+ - [.fields](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.fields)
67
+ - [.scrapes](http://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes)
68
+ - [#[]](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:[])
69
+ - [#source](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:source)
70
+ - [#to_h](http://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h)
71
+ - [PageScraper](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper)
72
+ - [#page](http://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page)
73
+ - [JsonScraper](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper)
74
+ - [#json](http://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json)
75
+ - Nokogiri::XML::Searchable
76
+ - [#at!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:at%21)
77
+ - [#search!](http://www.rubydoc.info/gems/grubby/Nokogiri/XML/Searchable:search%21)
78
+ - Mechanize::Page
79
+ - [#at!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21)
80
+ - [#search!](http://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21)
81
+ - Mechanize::Page::Link
82
+ - [#to_absolute_uri](http://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri)
83
+
84
+
85
+ ## Supplemental API
86
+
87
+ *grubby* uses several gems which extend core Ruby objects with
88
+ convenience methods. When you require *grubby*, you automatically make
89
+ these methods available. See each gem below for its specific API
90
+ documentation:
91
+
92
+ - [Active Support](https://rubygems.org/gems/activesupport)
93
+ ([docs](http://www.rubydoc.info/gems/activesupport/))
94
+ - [casual_support](https://rubygems.org/gems/casual_support)
95
+ ([docs](http://www.rubydoc.info/gems/casual_support/))
96
+ - [gorge](https://rubygems.org/gems/gorge)
97
+ ([docs](http://www.rubydoc.info/gems/gorge/))
98
+ - [mini_sanity](https://rubygems.org/gems/mini_sanity)
99
+ ([docs](http://www.rubydoc.info/gems/mini_sanity/))
100
+ - [pleasant_path](https://rubygems.org/gems/pleasant_path)
101
+ ([docs](http://www.rubydoc.info/gems/pleasant_path/))
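+
+ For example, once *grubby* is required, extension methods like these become
+ available directly (an illustrative sketch; the values are arbitrary):
+
+ ```ruby
+ require "grubby"
+
+ "  fail   fast  ".squish            # Active Support: => "fail fast"
+ Time.now.to_ymd                     # casual_support: => e.g. "2017-09-05"
+ "hello world".sha1                  # gorge: => SHA-1 hex digest string
+ "data.txt".assert_kind_of!(String)  # mini_sanity: returns "data.txt" (or raises)
+ "data.txt".to_pathname              # pleasant_path: => Pathname instance
+ ```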
102
+
103
+
104
+ ## Installation
105
+
106
+ Install from [Ruby Gems](https://rubygems.org/gems/grubby):
107
+
108
+ ```bash
109
+ $ gem install grubby
110
+ ```
111
+
112
+ Then require in your Ruby script:
113
+
114
+ ```ruby
115
+ require "grubby"
116
+ ```
117
+
118
+
119
+ ## Contributing
120
+
121
+ Run `rake test` to run the tests. You can also run `rake irb` for an
122
+ interactive prompt that pre-loads the project code.
123
+
124
+
125
+ ## License
126
+
127
+ [MIT License](https://opensource.org/licenses/MIT)
data/Rakefile ADDED
@@ -0,0 +1,26 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+ require "yard"
4
+
5
+
6
+ YARD::Rake::YardocTask.new(:doc) do |t|
7
+ end
8
+
9
+ desc "Launch IRB with this gem pre-loaded"
10
+ task :irb do
11
+ # HACK because lib/grubby/version is prematurely loaded by bundler/gem_tasks
12
+ Object.send(:remove_const, :Grubby)
13
+
14
+ require "grubby"
15
+ require "irb"
16
+ ARGV.clear
17
+ IRB.start
18
+ end
19
+
20
+ Rake::TestTask.new(:test) do |t|
21
+ t.libs << "test"
22
+ t.libs << "lib"
23
+ t.test_files = FileList["test/**/*_test.rb"]
24
+ end
25
+
26
+ task :default => :test
data/grubby.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "grubby/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "grubby"
8
+ spec.version = Grubby::VERSION
9
+ spec.authors = ["Jonathan Hefner"]
10
+ spec.email = ["jonathan.hefner@gmail.com"]
11
+
12
+ spec.summary = %q{Fail-fast web scraping}
13
+ spec.homepage = "https://github.com/jonathanhefner/grubby"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
17
+ f.match(%r{^(test|spec|features)/})
18
+ end
19
+ spec.bindir = "exe"
20
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
21
+ spec.require_paths = ["lib"]
22
+
23
+ spec.add_runtime_dependency "activesupport", "~> 5.0"
24
+ spec.add_runtime_dependency "casual_support", "~> 3.0"
25
+ spec.add_runtime_dependency "dumb_delimited", "~> 1.0"
26
+ spec.add_runtime_dependency "gorge", "~> 1.0"
27
+ spec.add_runtime_dependency "mechanize", "~> 2.7"
28
+ spec.add_runtime_dependency "mini_sanity", "~> 1.0"
29
+ spec.add_runtime_dependency "pleasant_path", "~> 1.1"
30
+
31
+ spec.add_development_dependency "bundler", "~> 1.15"
32
+ spec.add_development_dependency "rake", "~> 10.0"
33
+ spec.add_development_dependency "minitest", "~> 5.0"
34
+ spec.add_development_dependency "yard", "~> 0.9"
35
+ end
data/lib/grubby.rb ADDED
@@ -0,0 +1,169 @@
1
+ require "active_support/all"
2
+ require "casual_support"
3
+ require "dumb_delimited"
4
+ require "gorge"
5
+ require "mechanize"
6
+ require "mini_sanity"
7
+ require "pleasant_path"
8
+
9
+ require_relative "grubby/log"
10
+
11
+ require_relative "grubby/core_ext/string"
12
+ require_relative "grubby/core_ext/uri"
13
+ require_relative "grubby/mechanize/fetch_with_retry"
14
+ require_relative "grubby/mechanize/download"
15
+ require_relative "grubby/mechanize/file"
16
+ require_relative "grubby/mechanize/link"
17
+ require_relative "grubby/mechanize/page"
18
+ require_relative "grubby/nokogiri/searchable"
19
+
20
+
21
+ class Grubby < Mechanize
22
+
23
+ # @return [Integer, Float, Range<Integer>, Range<Float>]
24
+ # The enforced minimum amount of time to wait between requests, in
25
+ # seconds. If the value is a Range, a random number within the
26
+ # Range is chosen for each request.
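+ #
+ # @example Rate limiting (illustrative values)
+ #   grubby = Grubby.new
+ #   grubby.time_between_requests = 2.0       # always wait 2 seconds between requests
+ #   grubby.time_between_requests = 1.0..3.0  # wait a random 1 to 3 seconds each time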
27
+ attr_accessor :time_between_requests
28
+
29
+ # @param singleton_journal [Pathname, String]
30
+ # Optional journal file to persist the list of resources processed
31
+ # by {singleton}. Useful to ensure only-once processing across
32
+ # multiple program runs.
33
+ def initialize(singleton_journal = nil)
34
+ super()
35
+
36
+ # Prevent "memory leaks", and prevent mistakenly blank urls from
37
+ # resolving. (Blank urls resolve as a path relative to the last
38
+ # history entry. Without this setting, an erroneous `agent.get("")`
39
+ # could sometimes successfully fetch a page.)
40
+ self.max_history = 0
41
+
42
+ # Prevent files of unforeseen content type from being buffered into
43
+ # memory by default, in case they are very large. However, increase
44
+ # the threshold for what is considered "large", to prevent
45
+ # unnecessary writes to disk.
46
+ #
47
+ # References:
48
+ # - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
49
+ # - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
50
+ # - http://docs.seattlerb.org/mechanize/Mechanize/File.html
51
+ self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
52
+ self.pluggable_parser.default = Mechanize::Download
53
+ self.pluggable_parser["text/plain"] = Mechanize::File
54
+ self.pluggable_parser["application/json"] = Grubby::JsonParser
55
+
56
+ # Set up configurable rate limiting, and choose a reasonable default
57
+ # rate limit.
58
+ self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
59
+ self.time_between_requests = 1.0
60
+
61
+ @journal = singleton_journal ?
62
+ singleton_journal.to_pathname.touch_file : Pathname::NULL
63
+ @seen = SingletonKey.parse_file(@journal).
64
+ group_by(&:purpose).transform_values{|sks| sks.map(&:key).index_to{ true } }
65
+ end
66
+
67
+ # Calls +#get+ with each of +mirror_uris+ until a successful
68
+ # ("200 OK") response is recieved, and returns that +#get+ result.
69
+ # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
70
+ # the last mirror.
71
+ #
72
+ # @param mirror_uris [Array<String>]
73
+ # @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
74
+ # @raise [Mechanize::ResponseCodeError]
75
+ # if all +mirror_uris+ fail
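+ #
+ # @example
+ #   # Illustrative sketch; the mirror URLs are hypothetical
+ #   grubby.get_mirrored([
+ #     "https://mirror-one.example.com/dataset.csv",
+ #     "https://mirror-two.example.com/dataset.csv",
+ #   ])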
76
+ def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
77
+ i = 0
78
+ begin
79
+ get(mirror_uris[i], parameters, referer, headers)
80
+ rescue Mechanize::ResponseCodeError => e
81
+ i += 1
82
+ if i >= mirror_uris.length
83
+ raise
84
+ else
85
+ $log.info("Mirror failed with response code #{e.response_code}: #{mirror_uris[i - 1]}")
86
+ $log.debug("Trying next mirror: #{mirror_uris[i]}")
87
+ retry
88
+ end
89
+ end
90
+ end
91
+
92
+ # Ensures only-once processing of the resource indicated by +target+
93
+ # for the specified +purpose+. A list of previously-processed
94
+ # resource URIs and content hashes is maintained in the Grubby
95
+ # instance. The given block is called with the fetched resource only
96
+ # if the resource's URI and the resource's content hash have not been
97
+ # previously processed under the specified +purpose+.
98
+ #
99
+ # @param target [URI, String, Mechanize::Page::Link, #to_absolute_uri]
100
+ # designates the resource to fetch
101
+ # @param purpose [String]
102
+ # the purpose of processing the resource
103
+ # @yield [resource]
104
+ # processes the resource
105
+ # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
106
+ # the fetched resource
107
+ # @return [Boolean]
108
+ # whether the given block was called
109
+ # @raise [Mechanize::ResponseCodeError]
110
+ # if fetching the resource results in error (see +Mechanize#get+)
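+ #
+ # @example
+ #   # Illustrative sketch; the journal file and URL are hypothetical
+ #   grubby = Grubby.new("processed.journal")
+ #   grubby.singleton("https://example.com/reports/2017-09.pdf", "download") do |resource|
+ #     resource.save("2017-09.pdf")
+ #   end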
111
+ def singleton(target, purpose = "")
112
+ series = []
113
+
114
+ original_url = target.to_absolute_uri
115
+ return if skip_singleton?(purpose, original_url.to_s, series)
116
+
117
+ url = normalize_url(original_url)
118
+ return if skip_singleton?(purpose, url.to_s, series)
119
+
120
+ $log.info("Fetching #{url}")
121
+ resource = get(url)
122
+ skip = skip_singleton?(purpose, resource.uri.to_s, series) |
123
+ skip_singleton?(purpose, "content hash: #{resource.content_hash}", series)
124
+
125
+ yield resource unless skip
126
+
127
+ series.map{|k| SingletonKey.new(purpose, k) }.append_to_file(@journal)
128
+
129
+ !skip
130
+ end
131
+
132
+
133
+ private
134
+
135
+ SingletonKey = DumbDelimited[:purpose, :key]
136
+
137
+ def skip_singleton?(purpose, key, series)
138
+ return false if series.include?(key)
139
+ series << key
140
+ already = (@seen[purpose.to_s] ||= {}).displace(key, true)
141
+ $log.info("Skipping #{series.first} (already seen #{series.last})") if already
142
+ already
143
+ end
144
+
145
+ def normalize_url(url)
146
+ url = url.dup
147
+ $log.warn("Discarding fragment in URL: #{url}") if url.fragment
148
+ url.fragment = nil
149
+ url.path = url.path.chomp("/")
150
+ url
151
+ end
152
+
153
+ def sleep_between_requests
154
+ @last_request_at ||= 0.0
155
+ delay_duration = @time_between_requests.is_a?(Range) ?
156
+ rand(@time_between_requests) : @time_between_requests
157
+ sleep_duration = @last_request_at + delay_duration - Time.now.to_f
158
+ sleep(sleep_duration) if sleep_duration > 0
159
+ @last_request_at = Time.now.to_f
160
+ end
161
+
162
+ end
163
+
164
+
165
+ require_relative "grubby/version"
166
+ require_relative "grubby/json_parser"
167
+ require_relative "grubby/scraper"
168
+ require_relative "grubby/page_scraper"
169
+ require_relative "grubby/json_scraper"
data/lib/grubby/core_ext/string.rb ADDED
@@ -0,0 +1,12 @@
1
+ class String
2
+
3
+ # Constructs a URI from the String. Raises an exception if the String
4
+ # does not denote an absolute URI.
5
+ #
6
+ # @return [URI]
7
+ # @raise [RuntimeError] if the String does not denote an absolute URI
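+ #
+ # @example
+ #   # Illustrative URLs
+ #   "https://example.com/results".to_absolute_uri  # => URI("https://example.com/results")
+ #   "/relative/path".to_absolute_uri               # raises RuntimeError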
8
+ def to_absolute_uri
9
+ URI(self).to_absolute_uri
10
+ end
11
+
12
+ end
data/lib/grubby/core_ext/uri.rb ADDED
@@ -0,0 +1,12 @@
1
+ module URI
2
+
3
+ # Raises an exception if the URI is not +absolute?+.
4
+ #
5
+ # @return [self]
6
+ # @raise [RuntimeError] if the URI is not +absolute?+
7
+ def to_absolute_uri
8
+ raise "URI is not absolute: #{self}" unless self.absolute?
9
+ self
10
+ end
11
+
12
+ end
data/lib/grubby/json_parser.rb ADDED
@@ -0,0 +1,45 @@
1
+ class Grubby::JsonParser < Mechanize::File
2
+
3
+ # Returns the options to use when parsing JSON. The returned options
4
+ # Hash is not +dup+ed and can be modified directly. Any modifications
5
+ # will be applied to all future parsing.
6
+ #
7
+ # For information about available options, see
8
+ # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
9
+ # +JSON.parse+}.
10
+ #
11
+ # @return [Hash]
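+ #
+ # @example
+ #   # Illustrative: symbolize keys in all JSON parsed from now on
+ #   Grubby::JsonParser.json_parse_options[:symbolize_names] = true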
12
+ def self.json_parse_options
13
+ @json_parse_options ||= {
14
+ max_nesting: false,
15
+ allow_nan: false,
16
+ symbolize_names: false,
17
+ create_additions: false,
18
+ object_class: Hash,
19
+ array_class: Array,
20
+ }
21
+ end
22
+
23
+ # Sets the options to use when parsing JSON. The entire options Hash
24
+ # is replaced, and the new value will be applied to all future
25
+ # parsing. To set options individually, see {json_parse_options}.
26
+ #
27
+ # For information about available options, see
28
+ # {http://ruby-doc.org/stdlib/libdoc/json/rdoc/JSON.html#method-i-parse
29
+ # +JSON.parse+}.
30
+ #
31
+ # @param options [Hash]
32
+ def self.json_parse_options=(options)
33
+ @json_parse_options = options
34
+ end
35
+
36
+ # @return [Hash, Array]
37
+ # The parsed JSON data.
38
+ attr_reader :json
39
+
40
+ def initialize(uri = nil, response = nil, body = nil, code = nil)
41
+ @json = body && JSON.parse(body, self.class.json_parse_options)
42
+ super
43
+ end
44
+
45
+ end
data/lib/grubby/json_scraper.rb ADDED
@@ -0,0 +1,13 @@
1
+ class Grubby::JsonScraper < Grubby::Scraper
2
+
3
+ # @return [Hash, Array]
4
+ # The parsed JSON data being scraped.
5
+ attr_reader :json
6
+
7
+ # @param source [Grubby::JsonParser]
8
+ def initialize(source)
9
+ @json = source.assert_kind_of!(Grubby::JsonParser).json
10
+ super
11
+ end
12
+
13
+ end
data/lib/grubby/log.rb ADDED
@@ -0,0 +1,5 @@
1
+ $log ||= Logger.new($stderr).tap do |logger|
2
+ logger.formatter = ->(severity, datetime, progname, msg) do
3
+ "[#{datetime.to_ymd} #{datetime.to_hms}] #{severity} #{msg}\n"
4
+ end
5
+ end
data/lib/grubby/mechanize/download.rb ADDED
@@ -0,0 +1,8 @@
1
+ class Mechanize::Download
2
+
3
+ # private
4
+ def content_hash
5
+ @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
6
+ end
7
+
8
+ end
data/lib/grubby/mechanize/fetch_with_retry.rb ADDED
@@ -0,0 +1,39 @@
1
+ # This monkey patch attempts to fix the insidious "too many connection
2
+ # resets" bug described here: https://github.com/sparklemotion/mechanize/issues/123
3
+ #
4
+ # The code is taken and modified from this helpful blog article:
5
+ # http://scottwb.com/blog/2013/11/09/defeating-the-infamous-mechanize-too-many-connection-resets-bug/
6
+ class Mechanize::HTTP::Agent
7
+
8
+ MAX_CONNECTION_RESET_RETRIES = 9
9
+ IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
10
+
11
+ # Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
12
+ # connection resets" error is encountered, this method shuts down the
13
+ # persistent HTTP connection, and then retries the request (up to
14
+ # {MAX_CONNECTION_RESET_RETRIES} times).
15
+ def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
16
+ retry_count = 0
17
+ begin
18
+ fetch_without_retry(uri, http_method, headers, params, referer, redirects)
19
+ rescue Net::HTTP::Persistent::Error => e
20
+ # raise if different type of error
21
+ raise unless e.message.include?("too many connection resets")
22
+ # raise if non-idempotent http method
23
+ raise unless IDEMPOTENT_HTTP_METHODS.include?(http_method)
24
+ # raise if we've tried too many times
25
+ raise if retry_count >= MAX_CONNECTION_RESET_RETRIES
26
+
27
+ # otherwise, shutdown the persistent HTTP connection and try again
28
+ retry_count += 1
29
+ $log.warn("Possible connection reset bug. Retry(#{retry_count}) #{http_method.to_s.upcase} #{uri}")
30
+ self.http.shutdown
31
+ sleep(retry_count) # incremental backoff in case problem is with server
32
+ retry
33
+ end
34
+ end
35
+
36
+ alias_method :fetch_without_retry, :fetch
37
+ alias_method :fetch, :fetch_with_retry
38
+
39
+ end
data/lib/grubby/mechanize/file.rb ADDED
@@ -0,0 +1,8 @@
1
+ class Mechanize::File
2
+
3
+ # private
4
+ def content_hash
5
+ @content_hash ||= self.body.to_s.sha1
6
+ end
7
+
8
+ end
data/lib/grubby/mechanize/link.rb ADDED
@@ -0,0 +1,20 @@
1
+ class Mechanize::Page::Link
2
+
3
+ # Returns the URI represented by the Link, in absolute form. If the
4
+ # href attribute of the Link is expressed in relative form, the URI of
5
+ # the Link's Page is used to convert to absolute form.
6
+ #
7
+ # @return [URI]
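+ #
+ # @example
+ #   # Illustrative: collect an absolute URI for every link on a page
+ #   page.links.map(&:to_absolute_uri)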
8
+ def to_absolute_uri
9
+ # Via the W3 spec: "If the a element has no href attribute, then the
10
+ # element represents a placeholder for where a link might otherwise
11
+ # have been placed, if it had been relevant, consisting of just the
12
+ # element's contents."[1] So, we assume a link with no href
13
+ # attribute (i.e. `uri == nil`) should be treated the same as an
14
+ # intra-page link.
15
+ #
16
+ # [1]: https://www.w3.org/TR/2016/REC-html51-20161101/textlevel-semantics.html#the-a-element
17
+ URI.join(self.page.uri, self.uri || "#").to_absolute_uri
18
+ end
19
+
20
+ end
data/lib/grubby/mechanize/page.rb ADDED
@@ -0,0 +1,17 @@
1
+ class Mechanize::Page
2
+
3
+ # @!method search!(*queries)
4
+ # See {::Nokogiri::XML::Searchable#search!}.
5
+ #
6
+ # @param queries [Array<String>]
7
+ # @return [Array<Nokogiri::XML::Element>]
8
+ def_delegators :parser, :search!
9
+
10
+ # @!method at!(*queries)
11
+ # See {::Nokogiri::XML::Searchable#at!}.
12
+ #
13
+ # @param queries [Array<String>]
14
+ # @return [Nokogiri::XML::Element]
15
+ def_delegators :parser, :at!
16
+
17
+ end
data/lib/grubby/nokogiri/searchable.rb ADDED
@@ -0,0 +1,27 @@
1
+ module Nokogiri::XML::Searchable
2
+
3
+ # Searches the node using the given XPath or CSS queries, and returns
4
+ # the results. Raises an exception if there are no results. See also
5
+ # +#search+.
6
+ #
7
+ # @param queries [Array<String>]
8
+ # @return [Array<Nokogiri::XML::Element>]
9
+ # @raise [RuntimeError] if queries yield no results
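+ #
+ # @example
+ #   # Illustrative; +doc+ is any Nokogiri document or node
+ #   doc.search!(".price").map(&:text)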
10
+ def search!(*queries)
11
+ results = search(*queries)
12
+ raise "No elements matching #{queries.map(&:inspect).join(" OR ")}" if results.empty?
13
+ results
14
+ end
15
+
16
+ # Searches the node using the given XPath or CSS queries, and returns
17
+ # only the first result. Raises an exception if there are no results.
18
+ # See also +#at+.
19
+ #
20
+ # @param queries [Array<String>]
21
+ # @return [Nokogiri::XML::Element]
22
+ # @raise [RuntimeError] if queries yield no results
23
+ def at!(*queries)
24
+ search!(*queries).first
25
+ end
26
+
27
+ end
data/lib/grubby/page_scraper.rb ADDED
@@ -0,0 +1,13 @@
1
+ class Grubby::PageScraper < Grubby::Scraper
2
+
3
+ # @return [Mechanize::Page]
4
+ # The Page being scraped.
5
+ attr_reader :page
6
+
7
+ # @param source [Mechanize::Page]
8
+ def initialize(source)
9
+ @page = source.assert_kind_of!(Mechanize::Page)
10
+ super
11
+ end
12
+
13
+ end
data/lib/grubby/scraper.rb ADDED
@@ -0,0 +1,99 @@
1
+ class Grubby::Scraper
2
+
3
+ class Error < RuntimeError
4
+ end
5
+
6
+ # Defines an attribute reader method named by +field+. During
7
+ # +initialize+, the given block is called, and the attribute is set to
8
+ # the block's return value. By default, if the block's return value
9
+ # is nil, an exception will be raised. To prevent this behavior, set
10
+ # +optional+ to true.
11
+ #
12
+ # @param field [Symbol, String]
13
+ # name of the scraped value
14
+ # @param optional [Boolean]
15
+ # whether to permit a nil scraped value
16
+ # @yield []
17
+ # scrapes the value
18
+ # @yieldreturn [Object]
19
+ # scraped value
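+ #
+ # @example
+ #   # Illustrative sketch of a required and an optional field
+ #   class ArticleScraper < Grubby::PageScraper
+ #     scrapes(:title) { page.at!("h1").text }
+ #     scrapes(:subtitle, optional: true) { page.at("h2").try(:text) }
+ #   end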
20
+ def self.scrapes(field, optional: false, &block)
21
+ field = field.to_sym
22
+ self.fields << field
23
+
24
+ define_method(field) do
25
+ return @scraped[field] if @scraped.key?(field)
26
+
27
+ unless @errors.key?(field)
28
+ begin
29
+ value = instance_eval(&block)
30
+ if value.nil?
31
+ raise "`#{field}` cannot be nil" unless optional
32
+ $log.debug("Scraped nil value for #{self.class}##{field}")
33
+ end
34
+ @scraped[field] = value
35
+ rescue RuntimeError => e
36
+ @errors[field] = e
37
+ end
38
+ end
39
+
40
+ raise "`#{field}` raised a #{@errors[field].class}" if @errors.key?(field)
41
+
42
+ @scraped[field]
43
+ end
44
+ end
45
+
46
+ # @return [Array<Symbol>]
47
+ # The names of all scraped values, as defined by {scrapes}.
48
+ def self.fields
49
+ @fields ||= []
50
+ end
51
+
52
+ # @return [Object]
53
+ # The source being scraped. Typically a Mechanize pluggable parser
54
+ # such as +Mechanize::Page+.
55
+ attr_reader :source
56
+
57
+ # @param source
58
+ # @raise [Grubby::Scraper::Error]
59
+ # if any scraped values result in error
60
+ def initialize(source)
61
+ @source = source
62
+ @scraped = {}
63
+ @errors = {}
64
+
65
+ self.class.fields.each do |field|
66
+ begin
67
+ self.send(field)
68
+ rescue RuntimeError
69
+ end
70
+ end
71
+
72
+ unless @errors.empty?
73
+ listing = @errors.map do |field, error|
74
+ error_class = " (#{error.class})" unless error.class == RuntimeError
75
+ error_trace = error.backtrace.join("\n").indent(2)
76
+ "* #{field} -- #{error.message}#{error_class}\n#{error_trace}"
77
+ end
78
+ raise Error.new("Failed to scrape the following fields:\n#{listing.join("\n")}")
79
+ end
80
+ end
81
+
82
+ # Returns the scraped value named by +field+.
83
+ #
84
+ # @param field [Symbol, String]
85
+ # @return [Object]
86
+ # @raise [RuntimeError]
87
+ # if +field+ is not a valid name
88
+ def [](field)
89
+ @scraped.fetch(field.to_sym)
90
+ end
91
+
92
+ # Returns all scraped values as a Hash.
93
+ #
94
+ # @return [Hash<Symbol, Object>]
95
+ def to_h
96
+ @scraped.dup
97
+ end
98
+
99
+ end
data/lib/grubby/version.rb ADDED
@@ -0,0 +1,3 @@
1
+ class Grubby
2
+ VERSION = "1.0.0"
3
+ end
metadata ADDED
@@ -0,0 +1,220 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: grubby
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Hefner
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-09-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: casual_support
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: dumb_delimited
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: gorge
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mechanize
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.7'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.7'
83
+ - !ruby/object:Gem::Dependency
84
+ name: mini_sanity
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pleasant_path
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.1'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.1'
111
+ - !ruby/object:Gem::Dependency
112
+ name: bundler
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.15'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.15'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '10.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '10.0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: minitest
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '5.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '5.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: yard
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.9'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '0.9'
167
+ description:
168
+ email:
169
+ - jonathan.hefner@gmail.com
170
+ executables: []
171
+ extensions: []
172
+ extra_rdoc_files: []
173
+ files:
174
+ - ".gitignore"
175
+ - ".travis.yml"
176
+ - Gemfile
177
+ - LICENSE.txt
178
+ - README.md
179
+ - Rakefile
180
+ - grubby.gemspec
181
+ - lib/grubby.rb
182
+ - lib/grubby/core_ext/string.rb
183
+ - lib/grubby/core_ext/uri.rb
184
+ - lib/grubby/json_parser.rb
185
+ - lib/grubby/json_scraper.rb
186
+ - lib/grubby/log.rb
187
+ - lib/grubby/mechanize/download.rb
188
+ - lib/grubby/mechanize/fetch_with_retry.rb
189
+ - lib/grubby/mechanize/file.rb
190
+ - lib/grubby/mechanize/link.rb
191
+ - lib/grubby/mechanize/page.rb
192
+ - lib/grubby/nokogiri/searchable.rb
193
+ - lib/grubby/page_scraper.rb
194
+ - lib/grubby/scraper.rb
195
+ - lib/grubby/version.rb
196
+ homepage: https://github.com/jonathanhefner/grubby
197
+ licenses:
198
+ - MIT
199
+ metadata: {}
200
+ post_install_message:
201
+ rdoc_options: []
202
+ require_paths:
203
+ - lib
204
+ required_ruby_version: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ required_rubygems_version: !ruby/object:Gem::Requirement
210
+ requirements:
211
+ - - ">="
212
+ - !ruby/object:Gem::Version
213
+ version: '0'
214
+ requirements: []
215
+ rubyforge_project:
216
+ rubygems_version: 2.6.13
217
+ signing_key:
218
+ specification_version: 4
219
+ summary: Fail-fast web scraping
220
+ test_files: []