fetch 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +21 -8
- data/Rakefile +1 -1
- data/fetch.gemspec +17 -14
- data/lib/fetch.rb +49 -1
- data/lib/fetch/async.rb +21 -0
- data/lib/fetch/backend.rb +2 -0
- data/lib/fetch/backend/base.rb +15 -0
- data/lib/fetch/backend/typhoeus.rb +43 -0
- data/lib/fetch/base.rb +64 -0
- data/lib/fetch/callbacks.rb +62 -0
- data/lib/fetch/configuration.rb +30 -0
- data/lib/fetch/module.rb +17 -0
- data/lib/fetch/request.rb +156 -0
- data/lib/fetch/simple.rb +21 -0
- data/lib/fetch/version.rb +1 -1
- data/test/callbacks_test.rb +97 -0
- data/test/fetch_test.rb +689 -2
- data/test/simple_test.rb +17 -0
- data/test/test_helper.rb +14 -2
- metadata +64 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0596dd2fcaa4c9e3be34f29b5346794cd98cfb1e
|
4
|
+
data.tar.gz: cf9479fe7e9c26b350efa01d64b5e880bdac6aa4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34308fe5ee23a51d4974823f61020f2db9bb6b77966acbd9c72e41170523b27be4bb559b556e71c15899b5cd6b215d905c2c1489f0f110ed78a93b988c643f3e
|
7
|
+
data.tar.gz: f2004793493fd8bdefab68bcc50178a6519a205e768f3e2e7bdddc6757dd85408b0ab94f1e961208a574c7cccbaee4bf21ce62c0fa26e00e8c5f01c9b0c26560
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,28 +1,41 @@
|
|
1
|
-
[](http://travis-ci.org/bogrobotten/fetch)
|
2
2
|
|
3
|
-
# Fetch
|
3
|
+
# Fetch
|
4
4
|
|
5
5
|

|
6
6
|
|
7
|
+
Fetch enables easy fetching of data from multiple web sources.
|
8
|
+
It was extracted from [Bogrobotten](http://www.bogrobotten.dk) where we use it
|
9
|
+
to fetch prices and other stuff from multiple merchants.
|
10
|
+
We use it for price comparison, but you can use it for anything that involves
|
11
|
+
fetching data from external sources.
|
12
|
+
|
13
|
+
Fetch uses the [Typhoeus](https://github.com/typhoeus/typhoeus) gem for fast
|
14
|
+
and reliable asynchronous fetches from multiple URLs.
|
15
|
+
|
7
16
|
## Installation
|
8
17
|
|
9
18
|
Add this line to your application's *Gemfile*:
|
10
19
|
|
11
|
-
|
20
|
+
```ruby
|
21
|
+
gem "fetch"
|
22
|
+
```
|
12
23
|
|
13
24
|
Then run:
|
14
25
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
$ gem install fetch
|
26
|
+
```bash
|
27
|
+
$ bundle
|
28
|
+
```
|
20
29
|
|
21
30
|
## Contributing
|
22
31
|
|
32
|
+
Contributions are much appreciated. To contribute:
|
33
|
+
|
23
34
|
1. Fork the project
|
24
35
|
2. Create a feature branch (`git checkout -b my-new-feature`)
|
25
36
|
3. Make your changes, including tests so it doesn't break in the future
|
26
37
|
4. Commit your changes (`git commit -am 'Add feature'`)
|
27
38
|
5. Push to the branch (`git push origin my-new-feature`)
|
28
39
|
6. Create new pull request
|
40
|
+
|
41
|
+
Please do not touch the version, as this will be updated by the owners when the gem is ready for a new release.
|
data/Rakefile
CHANGED
data/fetch.gemspec
CHANGED
@@ -3,20 +3,23 @@ lib = File.expand_path('../lib', __FILE__)
|
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'fetch/version'
|
5
5
|
|
6
|
-
Gem::Specification.new do |
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
# Gem specification for fetch: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name        = "fetch"
  spec.version     = Fetch::VERSION
  spec.authors     = ["Lasse Bunk"]
  spec.email       = ["lassebunk@gmail.com"]
  spec.summary     = "Coming"
  spec.description = "Coming"
  spec.homepage    = "https://github.com/lassebunk/fetch"
  spec.license     = "MIT"

  # Package every file tracked by git; files under test/ are also test files.
  spec.files         = `git ls-files`.split($/)
  spec.test_files    = spec.files.grep(%r{^test/})
  spec.require_paths = ["lib"]

  # Runtime dependency.
  spec.add_dependency "typhoeus", ">= 0.6.0"

  # Development-only dependencies, as name plus optional version requirement.
  [
    ["json"],
    ["minitest", ">= 5.4"],
    ["webmock", ">= 1.20"],
    ["rake"]
  ].each do |dependency|
    spec.add_development_dependency(*dependency)
  end
end
|
data/lib/fetch.rb
CHANGED
@@ -1 +1,49 @@
|
|
1
|
-
require "
|
1
|
+
require "typhoeus"
|
2
|
+
|
3
|
+
%w{
|
4
|
+
version
|
5
|
+
callbacks
|
6
|
+
base
|
7
|
+
request
|
8
|
+
async
|
9
|
+
simple
|
10
|
+
module
|
11
|
+
backend
|
12
|
+
configuration
|
13
|
+
}.each do |file|
|
14
|
+
require "fetch/#{file}"
|
15
|
+
end
|
16
|
+
|
17
|
+
# Top-level namespace of the gem. Also exposes the process-wide configuration.
module Fetch
  # Error describing an HTTP failure for a given URL.
  class HttpError < StandardError
    # The HTTP status code and the URL the error refers to.
    attr_reader :code, :url

    # code - the HTTP status code of the failed response
    # url  - the URL that was requested
    #
    # The formatted text is handed to StandardError so that +message+,
    # +to_s+ and +inspect+ all report the same description.
    def initialize(code, url)
      @code, @url = code, url
      super("HTTP Error #{code}: #{url}")
    end
  end

  class << self
    # Returns the configuration object, creating it on first use.
    def config
      @config ||= Configuration.new
    end

    # Yields the configuration object (+Fetch::Configuration+) to the block.
    #
    #   Fetch.configure do |config|
    #     config.user_agent = "Custom User Agent"
    #   end
    def configure
      yield config
    end

    # Returns a memoized +ModuleCache+ instance.
    #
    # NOTE(review): +ModuleCache+ is not defined in any of the files visible
    # here - confirm it exists before relying on this method.
    def module_cache
      @module_cache ||= ModuleCache.new
    end
  end
end
|
data/lib/fetch/async.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module Fetch
  # Mixin that turns the +request+ callbacks registered on a class into
  # Fetch::Request objects.
  module Async
    # Registers the +request+, +before_process+ and +after_process+ callbacks
    # on the including class, which must already respond to +define_callback+.
    def self.included(base)
      base.define_callback(:request, :before_process, :after_process)
    end

    # Builds one Request per registered +request+ callback.
    #
    # Each request is first wired to this object's +before_process+,
    # +after_process+, +failure+ and +error+ callbacks (only those that have
    # been registered), then handed to the +request+ callback, which runs in
    # the context of this object. Requests left without a URL are dropped.
    def requests
      built = self.class.callbacks[:request].map do |callback|
        request = Request.new

        request.before_process { before_process } if callback?(:before_process)
        request.after_process { after_process } if callback?(:after_process)
        request.failure { |code, url| failure(code, url) } if callback?(:failure)
        request.error { |e| error(e) } if callback?(:error)

        instance_exec(request, &callback)
        request
      end

      built.select { |request| request.url }
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Fetch
  module Backend
    # Backend that performs the fetch requests through a Typhoeus hydra.
    class Typhoeus < Base
      # Queues one Typhoeus request per fetch request and runs the hydra.
      #
      # The block is optional. When given, it is called without arguments
      # each time a request has been handled, whether it succeeded or failed.
      def run(&progress)
        hydra = ::Typhoeus::Hydra.new

        build_requests(&progress).each do |request|
          hydra.queue(request)
        end

        hydra.run
      end

      private

      # Translates every fetch request into a +::Typhoeus::Request+ whose
      # success/failure handlers forward to the fetch request's callbacks.
      #
      # NOTE(review): +requests+ is not defined in this class; presumably it
      # is provided by +Base+ - confirm.
      def build_requests(&progress)
        requests.map do |req|
          request = ::Typhoeus::Request.new(
            req.url,
            method: req.method,
            body: req.body_string,
            followlocation: req.follow_redirects,
            timeout: req.timeout,
            forbid_reuse: true,
            headers: req.headers
          )

          request.on_success do |res|
            req.process!(res.body, req.url, res.effective_url)
            # Progress reporting is optional; without a block there is nothing to call.
            progress.call if progress
          end

          request.on_failure do |res|
            req.failed!(res.code, req.url)
            progress.call if progress
          end

          request
        end
      end
    end
  end
end
|
data/lib/fetch/base.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Base class for fetch handlers, e.g. +ProductFetch+, +UserFetch+, etc.
module Fetch
  class Base
    include Callbacks

    # Class-level callbacks available to fetch handlers:
    #
    #   modules SomeModule, OtherModule   # the fetch modules to use
    #
    #   init do |klass|
    #     # return an instance of klass, or nil to fall back to klass.new
    #   end
    #
    #   before_fetch do
    #     # runs right before the requests are handed to the backend
    #   end
    #
    #   after_fetch do
    #     # runs once the backend has finished
    #   end
    #
    #   progress do |percent|
    #     # receives the progress as an integer percentage
    #   end
    define_callback :modules, :init, :before_fetch, :after_fetch, :progress

    # Collects the requests of every module that wants to fetch, hands them
    # to the configured backend and reports progress after each one.
    # Always returns +true+.
    def fetch
      active_modules = instantiate_modules.select { |mod| mod.fetch? }
      requests = active_modules.map { |mod| mod.requests }.flatten

      total = requests.size
      done = 0
      update_progress(total, done)

      before_fetch

      backend.new(requests).run do
        done += 1
        update_progress(total, done)
      end

      after_fetch

      true
    end

    private

    # Array of instantiated fetch modules. A module is built by the +init+
    # callback when that returns something truthy, otherwise with +new+.
    def instantiate_modules
      Array(modules).map { |klass| init(klass) || klass.new }
    end

    # Runs the +progress+ callback with a whole-number percentage derived
    # from +total+ and +done+; reports 100 when there is nothing to do.
    def update_progress(total, done)
      percentage =
        if total.zero?
          100
        else
          (done.to_f / total * 100).to_i
        end

      progress(percentage)
    end

    # The backend class taken from the global configuration.
    def backend
      Fetch.config.backend
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Fetch
  # Mixin providing named, class-level callbacks that are run on instances.
  module Callbacks
    # Adds the class-level API (+ClassMethods+) to the including class.
    def self.included(base)
      base.extend(ClassMethods)
    end

    private

    # True when at least one callback has been registered under +name+.
    def callback?(name)
      self.class.callbacks[name].any?
    end

    # Runs every callback registered under +name+ in the context of this
    # instance and returns the array of their results.
    #
    #   run_callbacks_for(:before_fetch)
    #   run_callbacks_for(:progress, 12) # 12 percent done
    def run_callbacks_for(name, *args)
      self.class.callbacks[name].map { |callback| instance_exec(*args, &callback) }
    end

    module ClassMethods
      # Registry mapping a callback name to the array of blocks to call.
      # Unknown names start out with an empty array.
      def callbacks
        @callbacks ||= Hash.new { |registry, key| registry[key] = [] }
      end

      # For each name, defines a class method that registers a callback
      # (from values and/or a block) and an instance method that runs the
      # registered callbacks and returns the last result.
      def define_callback(*names)
        names.each do |name|
          define_singleton_method(name) do |*values, &block|
            create_callback_for(name, *values, &block)
          end

          define_method(name) { |*args| run_callbacks_for(name, *args).last }
        end
      end

      # Gives each subclass its own copy of the callbacks registered so far.
      def inherited(base)
        super
        callbacks.each { |name, blocks| base.callbacks[name] = blocks.dup }
      end

      private

      # Registers a callback returning +values+ (when any value is truthy)
      # and/or the given block under +name+.
      def create_callback_for(name, *values, &block)
        add_callback(name) { values } if values.any?
        add_callback(name, &block) if block
      end

      # Appends +block+ to the callbacks registered under +name+.
      def add_callback(name, &block)
        callbacks[name] << block
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Fetch
  # Settings object with a reader and a writer for every key in +DEFAULTS+.
  class Configuration
    # Default value per option. A lambda is called to compute the default
    # the first time the option is read on a given configuration object.
    DEFAULTS = {
      user_agent: "Mozilla/5.0",
      timeout: 10,
      namespaces: ["fetch_sources"],
      raise_on_error: -> { defined?(Rails.env) && %w{development test}.include?(Rails.env) },
      backend: Backend::Typhoeus
    }

    DEFAULTS.each do |option, default|
      ivar = "@#{option}"

      # Reader: returns the stored value, or resolves and stores the default.
      define_method(option) do
        return instance_variable_get(ivar) if instance_variable_defined?(ivar)

        # Resolve into a fresh local. Assigning back to the captured
        # +default+ would replace the lambda for every later instance, and
        # handing out the DEFAULTS array itself would let one instance's
        # in-place changes leak into all others.
        resolved =
          case default
          when Proc  then default.call
          when Array then default.dup
          else default
          end

        instance_variable_set(ivar, resolved)
      end

      # Writer: stores +value+ for this configuration object.
      define_method("#{option}=") do |value|
        instance_variable_set(ivar, value)
      end
    end

    # Convenience method for defining a single namespace that contains fetch modules.
    def namespace=(value)
      self.namespaces = [value]
    end
  end
end
|
data/lib/fetch/module.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
module Fetch
  # A fetch module: combines the callback DSL with request building and adds
  # the +fetch_if+, +failure+ and +error+ callbacks.
  class Module
    include Callbacks
    include Async

    define_callback :fetch_if, :failure, :error

    # Whether or not the module should be used when fetching.
    # Without a +fetch_if+ callback the answer is always +true+; otherwise it
    # is the truthiness of what `fetch_if do ... end` returns.
    def fetch?
      if callback?(:fetch_if)
        !!fetch_if
      else
        true
      end
    end
  end
end
|
@@ -0,0 +1,156 @@
|
|
1
|
+
require "cgi"
|
2
|
+
|
3
|
+
module Fetch
  # Describes a single request (URL, method, body, headers, timeout) together
  # with the callbacks to run around its outcome.
  class Request
    # Initializes the request and sets properties to the values defined in
    # +options+.
    #
    #   request = Fetch::Request.new("http://www.google.com", timeout: 5)
    #   request.url      # => "http://www.google.com"
    #   request.timeout  # => 5
    #
    #   request = Fetch::Request.new(timeout: 5)
    #   request.url      # => nil
    #   request.timeout  # => 5
    def initialize(*args)
      options = args.pop if args.last.is_a?(Hash)

      self.url = args.first if args.any?

      # public_send: an option key may only reach the public setters.
      options.each { |key, value| public_send("#{key}=", value) } if options
    end

    # The URL to be requested.
    attr_accessor :url

    # Whether to follow redirects. Default: +true+
    def follow_redirects
      return @follow_redirects if defined?(@follow_redirects)
      @follow_redirects = true
    end

    # Sets whether to follow redirects.
    attr_writer :follow_redirects

    # The method to be used for the request. Default: +:get+
    def method
      @method || :get
    end

    # Sets the method to be used for the request.
    attr_writer :method

    # The post body to be sent with the request. Default: an empty hash.
    def body
      @body ||= {}
    end

    # Sets the post body to be sent with the request.
    attr_writer :body

    # The post body represented as a string.
    #
    # Key/value bodies are CGI-escaped and joined as "k=v&k2=v2". A body
    # that was assigned as a ready-made string (or anything else that cannot
    # be mapped over) is returned via +to_s+ unchanged.
    def body_string
      return body.to_s unless body.respond_to?(:map)

      body.map { |k, v| "#{CGI::escape(k.to_s)}=#{CGI::escape(v.to_s)}" }.join("&")
    end

    # The timeout for the request.
    # Default: Taken from +Fetch.config.timeout+
    def timeout
      return @timeout if defined?(@timeout)
      Fetch.config.timeout
    end

    # Sets the timeout for the request.
    attr_writer :timeout

    # The headers to be sent with the request. Defaults to a hash holding
    # only the configured user agent.
    def headers
      @headers ||= {
        "User-Agent" => Fetch.config.user_agent
      }
    end

    # Sets the headers to be sent with the request.
    attr_writer :headers

    # The user agent being sent with the request.
    def user_agent
      headers["User-Agent"]
    end

    # Sets the user agent to be sent with the request.
    def user_agent=(value)
      headers.merge! "User-Agent" => value
    end

    # Sets a callback to be run before each process.
    # Raises a RuntimeError when no block is given.
    def before_process(&block)
      raise "You must supply a block to #{self.class.name}#before_process" unless block
      @before_process_callback = block
    end

    # Runs the before process callback, if one is set.
    def before_process!
      @before_process_callback.call if @before_process_callback
    end

    # Sets the callback to be run when the request completes.
    # Raises a RuntimeError when no block is given.
    def process(&block)
      raise "You must supply a block to #{self.class.name}#process" unless block
      @process_callback = block
    end

    # Runs the before-process, process and after-process callbacks in order.
    # If any of them fails with an exception, the exception is sent to the
    # error callback (see +error!+) and the remaining callbacks are skipped.
    def process!(body, url, effective_url)
      before_process!
      @process_callback.call(body, url, effective_url) if @process_callback
      after_process!
    rescue => e
      error!(e)
    end

    # Sets a callback to be run after each process.
    # Raises a RuntimeError when no block is given.
    def after_process(&block)
      raise "You must supply a block to #{self.class.name}#after_process" unless block
      @after_process_callback = block
    end

    # Runs the after process callback, if one is set.
    def after_process!
      @after_process_callback.call if @after_process_callback
    end

    # Sets the callback to be run if a request fails.
    # Raises a RuntimeError when no block is given.
    def failure(&block)
      raise "You must supply a block to #{self.class.name}#failure" unless block
      @failure_callback = block
    end

    # Runs the failure callback with +code+ and +url+, if one is set.
    def failed!(code, url)
      @failure_callback.call(code, url) if @failure_callback
    end

    # Sets the callback to be run if the processing fails due to an exception.
    # Raises a RuntimeError when no block is given.
    def error(&block)
      raise "You must supply a block to #{self.class.name}#error" unless block
      @error_callback = block
    end

    # Runs the error callback. Raises the exception given in +exception+ if an
    # error callback isn't defined.
    def error!(exception)
      if @error_callback
        @error_callback.call(exception)
      else
        raise exception
      end
    end
  end
end
|