wayfarer-jruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +19 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +172 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer.rb +65 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +192 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +102 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/job_spec.rb +86 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +218 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +616 -0
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support/core_ext/hash/indifferent_access"
|
4
|
+
|
5
|
+
module Wayfarer
  # Routes a URI to a job action. When the job's router yields an action for
  # the URI, a fresh job instance is created, the page is fetched through an
  # adapter checked out of the pool, and the action is invoked on the instance.
  class Dispatcher
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    # Result types that a {Processor} operates with.
    Mismatch = Struct.new(:uri)
    Halt = Struct.new(:uri, :action)
    Stage = Struct.new(:uris, :ret_val)
    Error = Struct.new(:exception)

    # @!attribute [r] adapter_pool
    # @return [AdapterPool]
    attr_reader :adapter_pool

    # @!attribute [r] job
    attr_reader :job

    # @param job the job this dispatcher serves; it is also handed to the
    #   adapter pool.
    def initialize(job)
      @job = job
      @adapter_pool = HTTPAdapters::AdapterPool.new(job)
    end

    # `config` is read from the job, `logger` from the config.
    def_delegator :job, :config
    def_delegator :config, :logger

    # Dispatches a URI against the rules of the job's router. If a rule
    # matches, the page is retrieved and the action associated with the route
    # is called on a new job instance.
    #
    # The action receives a block that "peeks" at another URI by dispatching
    # it recursively and returning that dispatch's return value. Peeking is
    # not nested: while peeking, the block returns `nil` without dispatching.
    #
    # @param [Job] job
    # @param [URI] uri
    # @param [Boolean] is_peeking whether this call originates from a peek.
    # @return [Mismatch] if the router yields no action for the URI.
    # @return [Halt] if the job instance halts.
    # @return [Stage] otherwise, carrying the staged URIs and the action's
    #   return value.
    # @return the logger call's return value if one of the rescued network,
    #   SSL, zlib or URI errors occurs.
    def dispatch(job, uri, is_peeking: false)
      action, params = job.router.route(uri)
      return Mismatch.new(uri) unless action

      params = ActiveSupport::HashWithIndifferentAccess.new(params)

      notify_observers!(DispatchedURI.new(action, uri))

      job_instance = job.new
      result = nil

      adapter_pool.with do |adapter|
        job_instance.page = adapter.fetch(uri)
        job_instance.adapter = adapter
        job_instance.params = params

        result = job_instance.public_send(action) do |peek_uri|
          peek(job, uri, peek_uri) unless is_peeking
        end
      end

      job_instance.halts? ? Halt.new(uri, action) : Stage.new(job_instance.staged_uris, result)
    # What follows are exceptions whose origin I don't care about at the moment
    # TODO: Better logging
    rescue Net::HTTP::Persistent::Error
      logger.warn("Net::HTTP::Persistent::Error @ #{uri}")
    rescue Errno::EHOSTUNREACH
      logger.warn("Host unreachable @ #{uri}")
    rescue Errno::ENETUNREACH
      logger.warn("No route to network present @ #{uri}")
    rescue Net::OpenTimeout, Net::ReadTimeout
      logger.warn("::Net timeout @ #{uri}")

    # SSL verification failed due to a missing certificate
    rescue OpenSSL::SSL::SSLError
      logger.warn("SSL verification failed @ #{uri}")

    # Ruby/zlib encountered a Z_DATA_ERROR.
    # Usually if a stream was prematurely freed.
    # Probably has to do with net-http-persistent?
    rescue Zlib::DataError
      logger.warn("Z_DATA_ERROR")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedURI, URI::InvalidURIError
      logger.info("[warn#{self}] Malformed URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedRedirectURI
      logger.info("Malformed redirect URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MaximumRedirectCountReached
      logger.info("Maximum redirect count reached @ #{uri}")
    end

    private

    # Announces a peek originating from `uri`, dispatches `peek_uri` in
    # peeking mode and returns that dispatch's `ret_val`. Any StandardError
    # (including a result without `ret_val`) yields `nil`.
    def peek(job, uri, peek_uri)
      notify_observers!(Peeking.new(uri))
      dispatch(job, URI(peek_uri), is_peeking: true).ret_val
    rescue StandardError
      nil
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Finders
|
5
|
+
# Returns the expanded `href` attribute URIs from all or targeted `<a>` tags.
|
6
|
+
# @param [*Array<String>] filters CSS/XPath expressions.
|
7
|
+
# @return [Array<URI>]
|
8
|
+
def links(*filters)
|
9
|
+
query("a", "href", *filters)
|
10
|
+
end
|
11
|
+
|
12
|
+
# Returns the expanded `href` attribute URIs from all or targeted `<link rel="stylesheet" ...>` tags.
|
13
|
+
# @param [*Array<String>] filters CSS/XPath expressions.
|
14
|
+
# @return [Array<URI>]
|
15
|
+
def stylesheets(*filters)
|
16
|
+
query("link[rel='stylesheet']", "href", *filters)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Returns the expanded `src` attribute URIs from all or targeted `<script>` tags.
|
20
|
+
# TODO: Tests
|
21
|
+
# @param [*Array<String>] filters CSS/XPath expressions.
|
22
|
+
# @return [Array<URI>]
|
23
|
+
def javascripts(*filters)
|
24
|
+
query("script", "src", *filters)
|
25
|
+
end
|
26
|
+
|
27
|
+
alias scripts javascripts
|
28
|
+
|
29
|
+
# Returns the expanded `src` attribute URIs from all or targeted `<img>` tags.
|
30
|
+
# TODO: Tests
|
31
|
+
# @param [*Array<String>] filters CSS/XPath expressions.
|
32
|
+
# @return [Array<URI>]
|
33
|
+
def images(*filters)
|
34
|
+
query("img", "src", *filters)
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
# TODO: Lord have mercy
|
40
|
+
def query(selector, attr, *filters)
|
41
|
+
nodes = if filters.any?
|
42
|
+
doc.search(*filters).css(selector)
|
43
|
+
else
|
44
|
+
doc.css(selector)
|
45
|
+
end
|
46
|
+
|
47
|
+
links = nodes.map { |node|
|
48
|
+
begin
|
49
|
+
URI.join(uri, node.attr(attr))
|
50
|
+
rescue
|
51
|
+
nil
|
52
|
+
end
|
53
|
+
}
|
54
|
+
|
55
|
+
links
|
56
|
+
.find_all { |uri| uri.is_a?(URI) }
|
57
|
+
.uniq
|
58
|
+
.map(&:to_s)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
  module Frontiers
    # @abstract Defines the interface and the cycling behaviour shared by all
    #   frontiers. Subclasses implement the accessors and the `cycle` hooks.
    # @api private
    class Frontier
      attr_reader :config

      # @param config configuration object; `cycle` reads its
      #   `allow_circulation` flag.
      def initialize(config)
        @config = config
      end

      # Returns the URIs to be scraped in the current cycle.
      # @note Usually an expensive operation!
      # @return [Array<URI>]
      def current_uris
        raise "Unimplemented"
      end

      # Returns the URIs staged for the next cycle.
      # @return [Array<URI>]
      def staged_uris
        raise "Unimplemented"
      end

      # Stages URIs for processing in the next cycle.
      # @param [*Array<URI>, *Array<String>] uris
      def stage(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is staged.
      def staged?(_uri)
        raise "Unimplemented"
      end

      # Caches URIs so they don't get processed again.
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is cached.
      def cached?(_uri)
        raise "Unimplemented"
      end

      # Frees resources. No-op by default.
      def free; end

      # Advances the frontier by one cycle.
      #
      # Unless circulation is allowed, the current URIs are cached first and
      # the staged URIs are filtered. If nothing is staged afterwards, the
      # frontier is left as is; otherwise the staged URIs are swapped in and
      # the staging area is reset.
      #
      # @return [Boolean] `false` if there was nothing staged, `true` otherwise.
      def cycle
        unless config.allow_circulation
          processed = current_uris
          cache(*processed) # TODO: Make it a template method
          filter_staged_uris!
        end

        if staged_uris.none?
          false
        else
          swap!
          reset_staged_uris!
          true
        end
      end

      protected

      # Hook called by {#cycle} after caching the current URIs when
      # circulation is disallowed. No-op by default.
      def filter_staged_uris!; end

      # Hook called by {#cycle} when staged URIs exist. No-op by default.
      def swap!; end

      # Hook called by {#cycle} after {#swap!}. No-op by default.
      def reset_staged_uris!; end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bloomfilter-rb"
|
4
|
+
|
5
|
+
module Wayfarer
  module Frontiers
    # An in-memory frontier that tracks cached URIs in a bloomfilter instead
    # of the set kept by {MemoryFrontier}.
    # @api private
    class MemoryBloomfilter < MemoryFrontier
      # @param config configuration object; its `bloomfilter_opts` are passed
      #   to the filter.
      def initialize(config)
        @filter = BloomFilter::Native.new(config.bloomfilter_opts)
        super(config)
      end

      # Inserts the string form of every URI into the filter. URIs are
      # stringified, as in the sibling frontiers, so that a URI object and
      # its string form map to the same filter key.
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*uris)
        uris.each { |uri| @filter.insert(uri.to_s) }
      end

      # Whether the filter reports the URI's string form as present.
      # @override
      def cached?(uri)
        @filter.include?(uri.to_s)
      end

      # Clears the filter, then frees the inherited state.
      # @override
      def free
        @filter.clear
        super
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "set"
|
4
|
+
# Platform the interpreter runs on; "java" under JRuby. Derived from
# RUBY_PLATFORM instead of being assigned inside the condition below, which
# made the check always true and the Parallel code paths unreachable.
JAVA_PLATFORM = RUBY_PLATFORM unless defined?(JAVA_PLATFORM)

# Parallel is only used when not running on the JVM.
require "parallel" unless JAVA_PLATFORM == "java"

module Wayfarer
  module Frontiers
    # A naive in-memory frontier that keeps current, staged and cached URIs
    # in three sets.
    # @api private
    class MemoryFrontier < Frontier
      def initialize(config)
        @current_uris = Set.new
        @staged_uris = Set.new
        @cached_uris = Set.new
        super(config)
      end

      # Returns the current URIs, each converted with `URI()`.
      # @override
      # @return [Array<URI>]
      def current_uris
        return @current_uris.map { |uri| URI(uri) } if jvm?

        Parallel.map(@current_uris) { |uri| URI(uri) }
      end

      # @override
      def staged_uris
        @staged_uris.to_a # These are assumed to be URIs already, so no map
      end

      # Adds the URIs to the staged set exactly as given (no conversion).
      # @override
      def stage(*uris)
        @staged_uris |= uris
      end

      # Whether the URI's string form is in the staged set.
      # NOTE(review): {#stage} stores entries unconverted, so this only finds
      # URIs that were staged as strings - confirm against callers.
      # @override
      def staged?(uri)
        @staged_uris.include?(uri.to_s)
      end

      # Adds the string form of the URIs to the cached set.
      # @override
      def cache(*uris)
        strings = jvm? ? uris.map(&:to_s) : Parallel.map(uris, &:to_s)
        @cached_uris |= strings
      end

      # Whether the URI's string form is in the cached set.
      # @override
      def cached?(uri)
        @cached_uris.include?(uri.to_s)
      end

      # Drops all three sets.
      # @override
      def free
        @current_uris = @staged_uris = @cached_uris = nil
      end

      private

      # Whether the interpreter runs on the JVM, where Parallel is not loaded.
      def jvm?
        JAVA_PLATFORM == "java"
      end

      # Starts the next cycle with an empty staging set.
      def reset_staged_uris!
        @staged_uris = Set.new
      end

      # Promotes the staged set to the current set.
      def swap!
        @current_uris = @staged_uris
      end

      # Removes already cached URIs from the staged set.
      def filter_staged_uris!
        @staged_uris.delete_if { |uri| cached?(uri) }
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "trie"
|
4
|
+
|
5
|
+
module Wayfarer
  module Frontiers
    # An in-memory frontier that records cached URIs in a trie.
    # @api private
    class MemoryTrieFrontier < MemoryFrontier
      def initialize(config)
        @trie = Trie.new
        super(config)
      end

      # Adds the string form of every URI to the trie.
      # @override
      def cache(*uris)
        uris.each do |uri|
          @trie.add(uri.to_s)
        end
      end

      # NOTE(review): this method reads `@str_or_regexp`, which this class
      # never assigns, and compares it against the URI's host. It looks like
      # a leftover from a host rule rather than frontier behaviour - confirm
      # whether anything calls it before removing.
      def match!(uri)
        @str_or_regexp === uri.host
      end

      # Whether the URI's string form is a key of the trie.
      # @override
      def cached?(uri)
        key = uri.to_s
        # RuboCop autocorrects `#has_key?` to `#key?` otherwise
        # rubocop:disable Style/PreferredHashMethods
        @trie.has_key?(key)
        # rubocop:enable Style/PreferredHashMethods
      end

      # Drops the trie, then frees the inherited state.
      # @override
      def free
        @trie = nil
        super
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "normalize_url"
|
4
|
+
|
5
|
+
module Wayfarer
  module Frontiers
    # Mixin that normalizes URIs before handing them to the frontier methods
    # it overrides. It must sit in front of a frontier in the method lookup
    # chain (every override calls `super`), and the frontier must expose
    # `config` with `normalize_uri_options`.
    # @api private
    module NormalizeURIs
      # Stages the normalized form of the given URIs.
      # @override
      def stage(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # Whether the normalized form of the URI is staged.
      # @override
      def staged?(uri)
        super(normalize(uri))
      end

      # Caches the normalized form of the given URIs.
      # @override
      def cache(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # Whether the normalized form of the URI is cached.
      # @override
      def cached?(uri)
        super(normalize(uri))
      end

      # Public, like every other `to_s`, so that explicit `frontier.to_s`
      # calls work and not only string interpolation.
      # @return [String] the frontier's description, marked as normalizing.
      def to_s
        "URI-normalizing #{super}"
      end

      # The former `method_missing` / `respond_to_missing?` pair was dropped:
      # `method_missing` only called `super`, while `respond_to_missing?`
      # consulted `@frontier`, which this module never assigns, so it asked
      # `nil` and reported methods that could not actually be called.

      private

      # Normalizes a URI with the configured options.
      def normalize(uri)
        NormalizeUrl.process(uri, config.normalize_uri_options)
      end
    end
  end
end
|