wayfarer-jruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +19 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +172 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer.rb +65 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +192 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +102 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/job_spec.rb +86 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +218 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +616 -0
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bloomfilter-rb"
|
4
|
+
|
5
|
+
module Wayfarer
  module Frontiers
    # A bloomfilter whose backing store is Redis.
    #
    # Every instance opens its own Redis connection and hands it to a
    # `BloomFilter::Redis` filter; {#free} clears the filter and closes the
    # connection again.
    #
    # @api private
    class RedisBloomfilter < MemoryBloomfilter
      # @param config [Configuration] read for `redis_opts` (passed to
      #   `Redis.new`) and `bloomfilter_opts` (passed to `BloomFilter::Redis`).
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        @filter = BloomFilter::Redis.new(config.bloomfilter_opts.merge(db: @conn))
        # NOTE(review): the parent initializer is not visible from here; if it
        # assigns @filter unconditionally it would replace the Redis-backed
        # filter created above. Confirm against MemoryBloomfilter#initialize.
        super(config)
      end

      # Marks every given URI as seen.
      # @override
      def cache(*uris)
        uris.each { |uri| @filter.insert(uri) }
      end

      # Whether the filter reports the URI as seen.
      # @override
      def cached?(uri)
        @filter.include?(uri)
      end

      # Clears the filter, closes the Redis connection and then runs the
      # parent's cleanup.
      # @override
      def free
        @filter.clear
        @conn.disconnect!
        super
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "redis"
|
4
|
+
|
5
|
+
module Wayfarer
  module Frontiers
    # A frontier that stores its current, staged and cached URIs in three
    # Redis sets. The set names are prefixed with the configuration's UUID.
    # @api private
    class RedisFrontier < Frontier
      # @param config [Configuration] read for `redis_opts`; handed on to the
      #   parent initializer.
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        super(config)
      end

      # @override
      # @return [Array<URI>] members of the current set, parsed into URIs.
      def current_uris
        redis_set_members(current_uris_key)
      end

      # @override
      # @return [Array<URI>] members of the staged set, parsed into URIs.
      def staged_uris
        redis_set_members(staged_uris_key)
      end

      # Adds the given URIs (as strings) to the staged set.
      # Does nothing when called without URIs.
      # @override
      def stage(*uris)
        redis_set_add(staged_uris_key, uris)
      end

      # Whether the URI is a member of the staged set.
      # @override
      def staged?(uri)
        @conn.sismember(staged_uris_key, uri.to_s)
      end

      # Adds the given URIs (as strings) to the cached set.
      # Does nothing when called without URIs.
      # @override
      def cache(*uris)
        redis_set_add(cached_uris_key, uris)
      end

      # Whether the URI is a member of the cached set.
      # @override
      def cached?(uri)
        @conn.sismember(cached_uris_key, uri.to_s)
      end

      # Deletes all three sets and closes the Redis connection.
      # @override
      def free
        owned_keys = [current_uris_key, staged_uris_key, cached_uris_key]
        owned_keys.each { |key| @conn.del(key) }

        @conn.disconnect!
      end

      private

      # Empties the staged set by deleting its key.
      def reset_staged_uris!
        @conn.del(staged_uris_key)
      end

      # Makes the staged set the current set by renaming its key.
      # Achieve: @current_uris = @staged_uris
      # @override
      def swap!
        @conn.rename(staged_uris_key, current_uris_key)
      end

      # Removes every cached URI from the staged set (set difference, stored
      # back into the staged set).
      def filter_staged_uris!
        @conn.sdiffstore(staged_uris_key, staged_uris_key, cached_uris_key)
      end

      def current_uris_key
        redis_key("current_uris")
      end

      def staged_uris_key
        redis_key("staged_uris")
      end

      def cached_uris_key
        redis_key("cached_uris")
      end

      # Builds a set name scoped to this crawl's configuration UUID.
      def redis_key(suffix)
        "#{@config.uuid}_#{suffix}"
      end

      # Reads a set and converts each member into a URI.
      def redis_set_members(key)
        @conn.smembers(key).map { |member| URI(member) }
      end

      # Adds the stringified URIs to a set, skipping the call when there are
      # none.
      def redis_set_add(key, uris)
        return unless uris.any?

        @conn.sadd(key, uris.map(&:to_s))
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "forwardable"
|
4
|
+
require "connection_pool"
|
5
|
+
|
6
|
+
module Wayfarer
  module HTTPAdapters
    # A connection pool that hands out HTTP adapters.
    #
    # Wraps a `ConnectionPool` sized and timed by the job's configuration.
    # Every message except `shutdown` is forwarded to the wrapped pool; use
    # {#free} to shut the pool down.
    # @private
    class AdapterPool
      extend Forwardable

      # @param job [Class] the job whose `config` sizes the pool and whose
      #   `setup_adapter` hook runs for every adapter the pool creates.
      def initialize(job)
        @job = job
        @config = job.config

        @pool = ConnectionPool.new(
          size: @config.connection_count,
          timeout: @config.connection_timeout,
          &method(:instantiate_adapter)
        )
      end

      # Shuts down all HTTP adapters by calling `free` on each of them.
      def free
        @pool.shutdown(&:free)
      end

      private

      # Builds one adapter (Selenium when configured, net-http otherwise) and
      # runs the job's `setup_adapter` hook with the adapter, its driver and
      # its browser. Driver and browser are nil for adapters that do not
      # expose them.
      def instantiate_adapter
        adapter = if @config.http_adapter == :selenium
                    HTTPAdapters::SeleniumAdapter.new(@config)
                  else
                    HTTPAdapters::NetHTTPAdapter.instance(@config)
                  end

        @job.run_hook(
          :setup_adapter,
          adapter,
          call_if_supported(adapter, :driver),
          call_if_supported(adapter, :browser)
        )

        adapter
      end

      # Calls +name+ on +receiver+ only when it responds to it; nil otherwise.
      # Plain-Ruby stand-in for ActiveSupport's Object#try, which this file
      # never required.
      def call_if_supported(receiver, name)
        receiver.public_send(name) if receiver.respond_to?(name)
      end

      # Forwards everything but `shutdown` to the wrapped pool.
      def method_missing(method, *argv, &proc)
        return super if method == :shutdown

        @pool.public_send(method, *argv, &proc)
      end

      def respond_to_missing?(method, private = false)
        return false if method == :shutdown

        @pool.respond_to?(method) || super
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "securerandom"
|
4
|
+
require "net/http"
|
5
|
+
require "net/http/persistent"
|
6
|
+
|
7
|
+
module Wayfarer
  module HTTPAdapters
    # A singleton adapter for net-http-persistent.
    # @api private
    class NetHTTPAdapter
      # Supported standard lib classes
      RECOGNIZED_URI_TYPES = [
        URI::HTTP,
        URI::HTTPS
      ].freeze

      MalformedURI = Class.new(StandardError)
      MalformedRedirectURI = Class.new(StandardError)
      MaximumRedirectCountReached = Class.new(StandardError)

      attr_accessor :request_header_overrides

      # Returns the shared adapter, creating it on first use. The
      # configuration passed on the first call wins; later arguments are
      # ignored.
      #
      # The instance is kept in a class-level instance variable rather than a
      # class variable so it is not shared with subclasses.
      #
      # TODO: Remove default parameter value
      # @return [NetHTTPAdapter]
      def self.instance(config = Wayfarer.config)
        @instance ||= new(config)
      end

      # @param config [Configuration] read for `max_http_redirects`.
      def initialize(config)
        @config = config
        @conn = Net::HTTP::Persistent.new("wayfarer-#{SecureRandom.uuid}")
      end

      # This is a singleton class. Use ::instance instead.
      private_class_method :new

      # Fetches a page, following redirects.
      # @param uri [URI::HTTP, URI::HTTPS] the URI to request.
      # @param redirects_followed [Integer] number of redirects already
      #   followed to get here; callers normally leave this at 0.
      # @return [Page]
      # @raise [MalformedURI] if the URI is not supported, or a SocketError
      #   occurs while requesting it.
      # @raise [MalformedRedirectURI] if a redirection URI is not supported,
      #   or the Location header is missing or unparsable.
      # @raise [MaximumRedirectCountReached] if too many redirections are
      #   encountered.
      def fetch(uri, redirects_followed = 0)
        unless RECOGNIZED_URI_TYPES.include?(uri.class)
          raise redirects_followed.positive? ? MalformedRedirectURI : MalformedURI
        end

        raise MaximumRedirectCountReached if redirects_followed > @config.max_http_redirects

        res = @conn.request(uri)

        if res.is_a?(Net::HTTPRedirection)
          return fetch(redirect_target(uri, res), redirects_followed + 1)
        end

        Page.new(
          uri: uri,
          status_code: res.code.to_i,
          body: res.body,
          headers: res.to_hash
        )
      rescue SocketError
        raise MalformedURI
      end

      # Shuts down all connections.
      def free
        @conn.shutdown
      end

      private

      # Resolves the Location header of a redirect response against the URI
      # that was requested, so relative redirects ("/login") are followed
      # instead of being rejected.
      # @raise [MalformedRedirectURI] if the header is missing, empty or not
      #   a parsable URI.
      def redirect_target(base_uri, res)
        location = res["location"]
        raise MalformedRedirectURI if location.nil? || location.empty?

        URI.join(base_uri.to_s, location)
      rescue URI::InvalidURIError
        raise MalformedRedirectURI
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "selenium-webdriver"
|
4
|
+
require "selenium/emulated_features"
|
5
|
+
require "capybara"
|
6
|
+
|
7
|
+
module Wayfarer
  module HTTPAdapters
    # An adapter for Selenium WebDrivers
    #
    # The WebDriver and the Capybara session wrapping it are both created
    # lazily and memoized; {#reload!} and {#free} discard both so that the
    # next access builds a fresh pair.
    # @api private
    class SeleniumAdapter
      # @param config [Configuration] read for `selenium_argv` and
      #   `window_size` when the driver is instantiated.
      def initialize(config = Wayfarer.config)
        @config = config
      end

      # Fetches a page by navigating the driver to the URI.
      # @return [Page] built from the driver's current URL, response code,
      #   page source and response headers.
      def fetch(uri)
        webdriver = driver
        webdriver.navigate.to(uri)

        Page.new(
          uri: webdriver.current_url,
          status_code: webdriver.response_code,
          body: webdriver.page_source,
          headers: webdriver.response_headers
        )
      end

      # Closes the driver.
      # The memoized Capybara session is dropped as well, because it wraps
      # the driver that was just closed.
      def reload!
        @driver&.close
        @driver = nil
        @browser = nil
      end

      # Quits the browser.
      # The memoized Capybara session is dropped as well, because it wraps
      # the driver that was just quit.
      def free
        @driver&.quit
        @driver = nil
        @browser = nil
      end

      # The WebDriver, instantiated on first access.
      # @return [Selenium::WebDriver::Driver]
      def driver
        @driver ||= instantiate_driver
      end

      # A Capybara session that wraps the {#driver}.
      # @see https://github.com/teamcapybara/capybara Capybara
      # @return [Capybara::Session]
      def browser
        @browser ||= instantiate_capybara_driver
      end

      private

      # Creates the WebDriver from `selenium_argv` and resizes its window to
      # `window_size`.
      def instantiate_driver
        driver = Selenium::WebDriver.for(*@config.selenium_argv)
        driver.manage.window.size = Selenium::WebDriver::Dimension.new(
          *@config.window_size
        )
        driver
      end

      # Builds a Capybara session around the existing WebDriver. Capybara is
      # not given an app; the already-running WebDriver is injected into the
      # Capybara driver, and that driver into the session, via instance
      # variables.
      # NOTE: this also sets the global Capybara.run_server and
      # Capybara.current_driver settings.
      def instantiate_capybara_driver
        Capybara.run_server = false
        Capybara.current_driver = :selenium

        capybara_driver = Capybara::Selenium::Driver.new(nil)
        capybara_driver.instance_variable_set(:@browser, driver)

        session = Capybara::Session.new(:selenium, nil)
        session.instance_variable_set(:@driver, capybara_driver)

        session
      end
    end
  end
end
|
data/lib/wayfarer/job.rb
ADDED
@@ -0,0 +1,192 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "forwardable"
|
4
|
+
require "hooks"
|
5
|
+
require "active_job"
|
6
|
+
|
7
|
+
# TODO: I only want deep_dup
|
8
|
+
require "active_support/all"
|
9
|
+
|
10
|
+
module Wayfarer
  # A {Job} is a class that has a {Routing::Router} with many {Routing::Rule}s
  # which are matched against a URI. Rules map URIs onto job instance methods.
  # Under the hood, jobs are instantiated within separate threads by a
  # {Processor}. Every instance gets its own thread. If a URI is matched, its
  # {Page} is retrieved, and made available to instance methods via {#page}.
  #
  # Jobs implement ActiveJob's Job API and are therefore compatible with a wide
  # range of job queues. To run a job immediately, call ::perform_now. To
  # enqueue a job, call ::perform_later.
  #
  # @see https://github.com/rails/rails/tree/master/activejob rails/activejob
  # @see http://edgeguides.rubyonrails.org/active_job_basics.html ActiveJob Basics
  class Job < ActiveJob::Base
    extend Forwardable

    include Hooks
    include Locals

    # @!group Callbacks

    # Callback that fires __once__ before any pages are retrieved.
    # @method before_crawl
    # @scope class
    define_hook :before_crawl

    # Callback that fires __once__ after all pages have been retrieved and
    # processing is done.
    # @method after_crawl
    # @scope class
    define_hook :after_crawl

    # Callback that fires when HTTP adapters are instantiated.
    # @method setup_adapter
    # @scope class
    # @yield [[HTTPAdapters::NetHTTPAdapter, HTTPAdapters::SeleniumAdapter], [Selenium::WebDriver::Driver, nil], [Capybara::Session, nil]]
    define_hooks :setup_adapter

    # @!endgroup

    class << self
      extend Forwardable

      # @!attribute [w] router
      attr_writer :router

      # @!attribute [w] config
      attr_writer :config

      # Returns a class copy.
      #
      # The copy gets its own router and configuration (shallow copies) and a
      # deep copy of the locals. Every local is replaced by what
      # {Locals.thread_safe_counterpart} returns for it, and is exposed
      # through a reader of the same name on both the copied class and its
      # instances.
      # @return [Class] the duplicated job class.
      def prepare
        duplicate = dup
        duplicate.router = router.dup
        duplicate.locals = locals.deep_dup
        duplicate.config = config.dup

        duplicate.locals.each do |(key, val)|
          duplicate.locals[key] = Locals.thread_safe_counterpart(val)
        end

        # The readers close over `duplicate`, so they always read the copy's
        # locals rather than the original class's.
        duplicate.locals.each do |(key, _)|
          duplicate.send(:define_method, key) { duplicate.locals[key] }
          duplicate.send(:define_singleton_method, key) do
            duplicate.locals[key]
          end
        end

        duplicate
      end

      # A configuration based off the global {Wayfarer.config}.
      # Cloned from the global configuration on first access.
      # @yield [Configuration]
      # @return [Configuration]
      def config
        @config ||= Wayfarer.config.clone
        yield(@config) if block_given?
        @config
      end

      # A router.
      # If a block is passed in, it is evaluated within the {Router}'s instance.
      # @return [Routing::Router]
      def router(&proc)
        @router ||= Routing::Router.new
        @router.instance_eval(&proc) if block_given?
        @router
      end

      alias route router
      alias routes router

      # Overshadows ActiveJob::Base's own logger
      delegate logger: :config
    end

    # @!attribute [r] staged_uris
    # @return [Array<String>, Array<URI>] URIs to stage for the next cycle.
    # @see #stage
    attr_reader :staged_uris

    # @!attribute [w] page
    attr_writer :page

    # @!attribute [rw] adapter
    attr_accessor :adapter

    # @!attribute [rw] params
    attr_accessor :params

    def initialize(*argv)
      @halts = false
      @staged_uris = []
      super(*argv)
    end

    # Whether this job will stop processing.
    def halts?
      @halts
    end

    # Performs this job by running a {Crawl} of this job's class over the
    # given URIs.
    # @note ActiveJob API
    # @override
    def perform(*uris)
      Crawl.new(self.class, *uris).execute
    end

    protected

    # All following instance methods are available within actions.

    # Sets a halting flag that signals the processor to stop its work.
    def halt
      @halts = true
    end

    # Adds URIs to process in the next cycle.
    # If a relative URI is given, the page's protocol and hostname get
    # prepended.
    # @param [String, URI, Array<String>, Array<URI>]
    def stage(*uris)
      @staged_uris.push(*uris.flatten)
    end

    # The {Page} representing the URI currently processed by an action.
    # When using the Selenium adapter, {Page#body} gets refreshed on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @return [Page]
    def page
      return @page unless self.class.config.http_adapter == :selenium

      # Only the body is re-read from the driver; URI, status code and
      # headers are carried over from the page that was originally fetched.
      Page.new(
        uri: @page.uri,
        status_code: @page.status_code,
        body: driver.page_source,
        headers: @page.headers
      )
    end

    # The parsed response body.
    # When using the Selenium adapter, this parses the body again on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @method doc
    # @see Page#doc
    delegate doc: :page

    # The Selenium WebDriver.
    # @method driver
    # @see HTTPAdapters::SeleniumAdapter#driver
    delegate driver: :adapter

    # A Capybara session that wraps the {#driver}.
    # @method browser
    # @see HTTPAdapters::SeleniumAdapter#browser
    delegate browser: :adapter

    # @method logger
    delegate logger: :"self.class"
  end
end
|