wayfarer 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Changelog.md +10 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +21 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +187 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/custom_rules.md +16 -0
- data/docs/routing/filetypes_rules.md +21 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/protocol_rules.md +17 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +211 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/custom_rule.rb +21 -0
- data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/protocol_rule.rb +21 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +114 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/lib/wayfarer.rb +68 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/integration/peeking_spec.rb +61 -0
- data/spec/job_spec.rb +122 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/custom_rule_spec.rb +26 -0
- data/spec/routing/filetypes_rule_spec.rb +40 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/protocol_rule_spec.rb +26 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +251 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +697 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/hash/indifferent_access"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  # Creates job instances, retrieves pages and, if a URI matches a route, calls
  # the routed action on the instance.
  class Dispatcher
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    # Result types that a {Processor} operates with.
    Mismatch = Struct.new(:uri)
    Halt = Struct.new(:uri, :action)
    Stage = Struct.new(:uris, :ret_val)
    Error = Struct.new(:exception)

    # @!attribute [r] adapter_pool
    # @return [AdapterPool]
    attr_reader :adapter_pool

    # @!attribute [r] job
    attr_reader :job

    # Stores the job and builds the adapter pool that pages are fetched with.
    #
    # @param job the job handed on to {HTTPAdapters::AdapterPool}.
    def initialize(job)
      @job = job
      @adapter_pool = HTTPAdapters::AdapterPool.new(job)
    end

    # `config` is read from the job, `logger` from the config.
    delegate config: :job
    delegate logger: :config

    # Routes `uri` through the job's router and, on a match, fetches the page
    # with a pooled adapter and calls the routed action on a fresh job instance.
    #
    # The action is handed a block for peeking: given another URI, the block
    # dispatches that URI and returns the `ret_val` of the outcome. The block
    # returns nil when this call is itself a peek, or when anything raises
    # inside of it.
    #
    # The errors rescued at the bottom are only logged; in that case the method
    # returns whatever the logger call returns.
    #
    # @param [Job] job
    # @param [URI] uri
    # @param [Boolean] is_peeking whether this call was triggered by a peek.
    # @return [Mismatch] if the router yields no action for the URI.
    # @return [Halt] if the job instance reports `halts?` after the action ran.
    # @return [Stage] otherwise, carrying the staged URIs and the action's
    #   return value.
    def dispatch(job, uri, is_peeking: false)
      action, params = job.router.route(uri)
      return Mismatch.new(uri) unless action

      params = ActiveSupport::HashWithIndifferentAccess.new(params)

      notify_observers!(DispatchedURI.new(action, uri))

      job_instance = job.new
      result = nil

      adapter_pool.with do |adapter|
        job_instance.page = adapter.fetch(uri)
        job_instance.adapter = adapter
        job_instance.params = params

        result = job_instance.public_send(action) do |peek_uri|
          begin
            if is_peeking
              # Peeks do not nest: a peeked action cannot peek further.
              nil
            else
              notify_observers!(Peeking.new(uri))
              dispatch(job, URI(peek_uri), is_peeking: true).ret_val
            end
          rescue
            # Best effort: a failed peek yields nil to the action.
            nil
          end
        end
      end

      return Halt.new(uri, action) if job_instance.halts?

      Stage.new(job_instance.staged_uris, result)
    # What follows are exceptions whose origin I don't care about at the moment
    # TODO: Better logging
    rescue Net::HTTP::Persistent::Error
      logger.warn("Net::HTTP::Persistent::Error @ #{uri}")
    rescue Errno::EHOSTUNREACH
      logger.warn("Host unreachable @ #{uri}")
    rescue Errno::ENETUNREACH
      logger.warn("No route to network present @ #{uri}")
    rescue Net::OpenTimeout, Net::ReadTimeout
      logger.warn("::Net timeout @ #{uri}")

    # SSL verification failed due to a missing certificate
    rescue OpenSSL::SSL::SSLError
      logger.warn("SSL verification failed @ #{uri}")

    # Ruby/zlib encountered a Z_DATA_ERROR.
    # Usually if a stream was prematurely freed.
    # Probably has to do with net-http-persistent?
    rescue Zlib::DataError
      logger.warn("Z_DATA_ERROR")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedURI, URI::InvalidURIError
      logger.info("[warn#{self}] Malformed URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedRedirectURI
      logger.info("Malformed redirect URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MaximumRedirectCountReached
      logger.info("Maximum redirect count reached @ #{uri}")
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wayfarer
  # Helpers that collect absolute URIs from a parsed document. The including
  # object has to provide `doc` (queried with `css` and `search`) and `uri`
  # (the base that references are joined onto).
  module Finders
    # Collects the expanded `href` values of all or targeted `<a>` tags.
    # @param [*Array<String>] filters CSS/XPath expressions.
    # @return [Array<String>] unique absolute URIs, as strings.
    def links(*filters)
      query("a", "href", *filters)
    end

    # Collects the expanded `href` values of all or targeted
    # `<link rel="stylesheet" ...>` tags.
    # @param [*Array<String>] filters CSS/XPath expressions.
    # @return [Array<String>] unique absolute URIs, as strings.
    def stylesheets(*filters)
      query("link[rel='stylesheet']", "href", *filters)
    end

    # Collects the expanded `src` values of all or targeted `<script>` tags.
    # TODO: Tests
    # @param [*Array<String>] filters CSS/XPath expressions.
    # @return [Array<String>] unique absolute URIs, as strings.
    def javascripts(*filters)
      query("script", "src", *filters)
    end

    alias scripts javascripts

    # Collects the expanded `src` values of all or targeted `<img>` tags.
    # TODO: Tests
    # @param [*Array<String>] filters CSS/XPath expressions.
    # @return [Array<String>] unique absolute URIs, as strings.
    def images(*filters)
      query("img", "src", *filters)
    end

    private

    # Selects nodes (optionally narrowed by `filters` first), joins the given
    # attribute of each node onto `uri` and returns the unique results as
    # strings. Attributes that cannot be joined are dropped silently.
    def query(selector, attr, *filters)
      scope = filters.any? ? doc.search(*filters) : doc

      expanded = scope.css(selector).map do |node|
        begin
          URI.join(uri, node.attr(attr))
        rescue
          # Unjoinable attribute (missing, malformed, ...): skip this node.
          nil
        end
      end

      expanded.compact.uniq.map(&:to_s)
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wayfarer
  module Frontiers
    # @abstract The common behaviour of all frontiers. Subclasses supply the
    #   storage operations; this class only provides {#cycle}.
    # @api private
    class Frontier
      attr_reader :config

      # @param config the configuration; {#cycle} reads `allow_circulation`
      #   from it.
      def initialize(config)
        @config = config
      end

      # Returns URIs to be scraped in the current cycle.
      # @note Usually an expensive operation!
      # @return [Array<URI>]
      def current_uris
        raise "Unimplemented"
      end

      # Returns staged URIs.
      # @return [Array<URI>]
      def staged_uris
        raise "Unimplemented"
      end

      # Stages URIs for processing in the next cycle.
      # @param [*Array<URI>, *Array<String>] uris
      def stage(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is staged.
      def staged?(_uri)
        raise "Unimplemented"
      end

      # Caches URIs so they don't get processed again.
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is cached.
      def cached?(_uri)
        raise "Unimplemented"
      end

      # Frees resources. A no-op unless overridden.
      def free
      end

      # Advances to the next cycle. Unless the configuration allows
      # circulation, the current URIs are cached first and the staged URIs are
      # filtered. If staged URIs remain, the swap and reset hooks run.
      #
      # @return [Boolean] false if there is nothing staged, true otherwise.
      def cycle
        unless config.allow_circulation
          cache(*current_uris) # TODO: Make it a template method
          filter_staged_uris!
        end

        return false unless staged_uris.any?

        swap!
        reset_staged_uris!

        true
      end

      protected

      # Hook called by {#cycle} after caching; does nothing by default.
      def filter_staged_uris!
      end

      # Hook called by {#cycle} once staged URIs are known to exist; does
      # nothing by default.
      def swap!
      end

      # Hook called by {#cycle} right after {#swap!}; does nothing by default.
      def reset_staged_uris!
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bloomfilter-rb"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Frontiers
    # A memory frontier whose cache is kept in a native bloomfilter.
    # @api private
    class MemoryBloomfilter < MemoryFrontier
      # @param config the configuration; `bloomfilter_opts` is passed to the
      #   filter's constructor.
      def initialize(config)
        super(config)
        @filter = BloomFilter::Native.new(config.bloomfilter_opts)
      end

      # Inserts every given URI into the filter.
      # @override
      def cache(*uris)
        uris.each do |uri|
          @filter.insert(uri)
        end
      end

      # Whether the filter reports the URI as included.
      # @override
      def cached?(uri)
        @filter.include?(uri)
      end

      # Clears the filter, then lets the parent free its own state.
      def free
        @filter.clear
        super
      end
    end
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
# Mirrors RUBY_PLATFORM (which is "java" on JRuby) so that the
# `JAVA_PLATFORM == "java"` checks below are true on JRuby only. The parallel
# gem is only loaded on other platforms, where it is used for mapping.
JAVA_PLATFORM = RUBY_PLATFORM
require "parallel" unless JAVA_PLATFORM == "java"
|
|
5
|
+
|
|
6
|
+
module Wayfarer
  module Frontiers
    # A naive in-memory frontier that keeps current, staged and cached URIs in
    # three sets.
    # @api private
    class MemoryFrontier < Frontier
      def initialize(config)
        @current_uris = Set.new
        @staged_uris = Set.new
        @cached_uris = Set.new
        super(config)
      end

      # Converts every current entry with `URI()`; mapped sequentially when
      # JAVA_PLATFORM is "java", via Parallel otherwise.
      # @override
      def current_uris
        return @current_uris.map { |entry| URI(entry) } if JAVA_PLATFORM == "java"

        Parallel.map(@current_uris) { |entry| URI(entry) }
      end

      # Returns the staged entries as an array, without converting them.
      # @override
      def staged_uris
        @staged_uris.to_a # These are assumed to be URIs already, so no map
      end

      # Adds the given URIs to the staged set exactly as passed.
      # @override
      def stage(*uris)
        @staged_uris = @staged_uris.union(uris)
      end

      # Looks up the string form of the URI in the staged set.
      # @override
      def staged?(uri)
        @staged_uris.include?(uri.to_s)
      end

      # Adds the string forms of the given URIs to the cached set.
      # @override
      def cache(*uris)
        strings = if JAVA_PLATFORM == "java"
                    uris.map(&:to_s)
                  else
                    Parallel.map(uris, &:to_s)
                  end

        @cached_uris |= strings
      end

      # Looks up the string form of the URI in the cached set.
      # @override
      def cached?(uri)
        @cached_uris.include?(uri.to_s)
      end

      # Drops all three sets. The frontier is unusable afterwards.
      # @override
      def free
        @cached_uris = nil
        @staged_uris = nil
        @current_uris = nil
      end

      private

      # Replaces the staged set with an empty one.
      def reset_staged_uris!
        @staged_uris = Set.new
      end

      # Makes the staged set the current one.
      def swap!
        @current_uris = @staged_uris
      end

      # Removes every staged entry that `cached?` reports as cached.
      def filter_staged_uris!
        @staged_uris.delete_if do |entry|
          cached?(entry)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "trie"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Frontiers
    # A memory frontier whose cache is kept in a trie.
    # @api private
    class MemoryTrieFrontier < MemoryFrontier
      def initialize(config)
        super(config)
        @trie = Trie.new
      end

      # Adds the string form of every given URI to the trie.
      # @override
      def cache(*uris)
        uris.each do |uri|
          @trie.add(uri.to_s)
        end
      end

      # NOTE(review): compares against @str_or_regexp, which this class never
      # assigns; this looks carried over from a routing rule. Confirm whether
      # anything calls it before relying on or removing it.
      def match!(uri)
        @str_or_regexp === uri.host
      end

      # Whether the trie holds the string form of the URI.
      # @override
      def cached?(uri)
        # RuboCop autocorrects `#has_key?` to `#key?` otherwise
        # rubocop:disable Style/PreferredHashMethods
        @trie.has_key?(uri.to_s)
        # rubocop:enable Style/PreferredHashMethods
      end

      # Drops the trie, then lets the parent free its own state.
      # @override
      def free
        @trie = nil
        super
      end
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "normalize_url"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Frontiers
    # Wraps a frontier's staging and caching methods so that every URI is
    # normalized before it reaches the underlying implementation (via `super`).
    # The object mixed into must provide `config`.
    # @api private
    module NormalizeURIs
      # Normalizes the URIs, then stages them.
      # @override
      def stage(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # Normalizes the URI, then checks whether it is staged.
      # @override
      def staged?(uri)
        super(normalize(uri))
      end

      # Normalizes the URIs, then caches them.
      # @override
      def cache(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # Normalizes the URI, then checks whether it is cached.
      # @override
      def cached?(uri)
        super(normalize(uri))
      end

      private

      # Runs the URI through NormalizeUrl with the configured options.
      def normalize(uri)
        NormalizeUrl.process(uri, config.normalize_uri_options)
      end

      def to_s
        "URI-normalizing #{super}"
      end

      # Nothing is handled here; unknown messages go up the ancestor chain.
      def method_missing(*argv, &proc)
        super(*argv, &proc)
      end

      # Defers to the ancestors, matching what `method_missing` above actually
      # handles. This used to ask `@frontier`, which is never assigned, so the
      # answer reflected whatever nil responds to.
      def respond_to_missing?(method, private = false)
        super
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bloomfilter-rb"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Frontiers
    # A bloomfilter frontier whose filter is stored in Redis.
    # @api private
    class RedisBloomfilter < MemoryBloomfilter
      # @param config the configuration; `redis_opts` is passed to the Redis
      #   client, `bloomfilter_opts` (plus the connection) to the filter.
      def initialize(config)
        # The parent constructor assigns its own in-memory filter to @filter,
        # so it has to run before the Redis-backed filter is put in place.
        super(config)
        @conn = Redis.new(config.redis_opts)
        @filter = BloomFilter::Redis.new(config.bloomfilter_opts.merge(db: @conn))
      end

      # Inserts every given URI into the filter.
      # @override
      def cache(*uris)
        uris.each { |uri| @filter.insert(uri) }
      end

      # Whether the filter reports the URI as included.
      # @override
      def cached?(uri)
        @filter.include?(uri)
      end

      # Lets the parent clear the filter and free its state, then closes the
      # Redis connection. Disconnecting comes last because the parent still
      # clears the filter through that connection.
      # @override
      def free
        super
        @conn.disconnect!
      end
    end
  end
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "redis"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Frontiers
    # A frontier that keeps current, staged and cached URIs in three Redis
    # sets whose keys are prefixed with the configuration's `uuid`.
    # @api private
    class RedisFrontier < Frontier
      # @param config the configuration; `redis_opts` is passed to the client.
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        super(config)
      end

      # Reads the current set and converts every member with `URI()`.
      # @override
      def current_uris
        members = @conn.smembers(current_uris_key)
        members.map { |member| URI(member) }
      end

      # Reads the staged set and converts every member with `URI()`.
      # @override
      def staged_uris
        members = @conn.smembers(staged_uris_key)
        members.map { |member| URI(member) }
      end

      # Adds the string forms of the URIs to the staged set, if any are given.
      # @override
      def stage(*uris)
        return unless uris.any?

        @conn.sadd(staged_uris_key, uris.map(&:to_s))
      end

      # Whether the string form of the URI is a member of the staged set.
      # @override
      def staged?(uri)
        @conn.sismember(staged_uris_key, uri.to_s)
      end

      # Adds the string forms of the URIs to the cached set, if any are given.
      # @override
      def cache(*uris)
        return unless uris.any?

        @conn.sadd(cached_uris_key, uris.map(&:to_s))
      end

      # Whether the string form of the URI is a member of the cached set.
      # @override
      def cached?(uri)
        @conn.sismember(cached_uris_key, uri.to_s)
      end

      # Deletes all three keys, then disconnects.
      # @override
      def free
        @conn.del(current_uris_key)
        @conn.del(staged_uris_key)
        @conn.del(cached_uris_key)

        @conn.disconnect!
      end

      private

      # Deletes the staged set.
      def reset_staged_uris!
        @conn.del(staged_uris_key)
      end

      # Renames the staged set to the current set.
      # @override
      def swap!
        # Achieve: @current_uris = @staged_uris
        @conn.rename(staged_uris_key, current_uris_key)
      end

      # Stores the difference of the staged and cached sets as the staged set.
      def filter_staged_uris!
        @conn.sdiffstore(staged_uris_key, staged_uris_key, cached_uris_key)
      end

      def current_uris_key
        "#{@config.uuid}_current_uris"
      end

      def staged_uris_key
        "#{@config.uuid}_staged_uris"
      end

      def cached_uris_key
        "#{@config.uuid}_cached_uris"
      end
    end
  end
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "forwardable"
|
|
4
|
+
require "connection_pool"
|
|
5
|
+
|
|
6
|
+
module Wayfarer
  module HTTPAdapters
    # A connection pool that hands out HTTP adapters. Apart from `shutdown`,
    # messages this class does not define are forwarded to the wrapped pool.
    # @private
    class AdapterPool
      extend Forwardable

      # Builds the pool with the size and timeout taken from the job's
      # configuration; adapters are created by {#instantiate_adapter}.
      #
      # @param job the job whose `config` and `run_hook` are used.
      def initialize(job)
        @job = job
        @config = job.config

        @pool = ConnectionPool.new(
          size: @config.connection_count,
          timeout: @config.connection_timeout,
          &method(:instantiate_adapter)
        )
      end

      # Shuts down all HTTP adapters
      def free
        @pool.shutdown(&:free)
      end

      private

      # Creates an adapter (Selenium if configured, net-http otherwise), runs
      # the job's `:setup_adapter` hook with it and returns it.
      def instantiate_adapter
        wants_selenium = @config.http_adapter == :selenium

        adapter = if wants_selenium
                    HTTPAdapters::SeleniumAdapter.new(@config)
                  else
                    HTTPAdapters::NetHTTPAdapter.instance(@config)
                  end

        driver = adapter.try(:driver)
        browser = adapter.try(:browser)
        @job.run_hook(:setup_adapter, adapter, driver, browser)

        adapter
      end

      # Forwards to the pool. `shutdown` is deliberately not forwarded, so it
      # raises as an unknown method.
      def method_missing(method, *argv, &proc)
        return super if method == :shutdown

        @pool.public_send(method, *argv, &proc)
      end

      def respond_to_missing?(method, private = false)
        method != :shutdown && (@pool.respond_to?(method) || super)
      end
    end
  end
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "net/http/persistent"
|
|
6
|
+
|
|
7
|
+
module Wayfarer
  module HTTPAdapters
    # A singleton adapter for net-http-persistent.
    # @api private
    class NetHTTPAdapter
      # Supported standard lib classes
      RECOGNIZED_URI_TYPES = [
        URI::HTTP,
        URI::HTTPS
      ].freeze

      MalformedURI = Class.new(StandardError)
      MalformedRedirectURI = Class.new(StandardError)
      MaximumRedirectCountReached = Class.new(StandardError)

      attr_accessor :request_header_overrides

      # Returns the shared instance, creating it on first use. The
      # configuration only takes effect on the call that creates the instance.
      # TODO: Remove default parameter value
      def self.instance(config = Wayfarer.config)
        @@instance ||= new(config)
      end

      def initialize(config)
        @config = config
        @conn = Net::HTTP::Persistent.new("wayfarer-#{SecureRandom.uuid}")
      end

      # This is a singleton class. Use ::instance instead.
      private_class_method :new

      # Fetches a page, following redirects. A `Location` header is resolved
      # against the URI that was requested, so relative redirect targets are
      # followed as well.
      #
      # @param [URI] uri
      # @param [Integer] redirects_followed number of redirects taken so far.
      # @return [Page]
      # @raise [MalformedURI] if the URI is not supported, or if a SocketError
      #   occurs while requesting.
      # @raise [MalformedRedirectURI] if a redirection URI is not supported, or
      #   if a redirect response carries no `Location` header.
      # @raise [MaximumRedirectCountReached] if too many redirections are
      #   encountered.
      def fetch(uri, redirects_followed = 0)
        unless RECOGNIZED_URI_TYPES.include?(uri.class)
          raise redirects_followed.positive? ? MalformedRedirectURI : MalformedURI
        end

        raise MaximumRedirectCountReached if redirects_followed > @config.max_http_redirects

        res = @conn.request(uri)

        if res.is_a?(Net::HTTPRedirection)
          location = res["location"]
          raise MalformedRedirectURI unless location

          # Join instead of parsing on its own: the header may be relative.
          return fetch(URI.join(uri, location), redirects_followed + 1)
        end

        Page.new(
          uri: uri,
          status_code: res.code.to_i,
          body: res.body,
          headers: res.to_hash
        )
      rescue SocketError
        raise MalformedURI
      end

      # Shuts down all connections.
      def free
        @conn.shutdown
      end
    end
  end
end
|