wayfarer-jruby 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +19 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +172 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer.rb +65 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +192 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +102 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/job_spec.rb +86 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +218 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +616 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: URI Rules
|
4
|
+
---
|
5
|
+
|
6
|
+
# URI rules
|
7
|
+
|
8
|
+
A URI rule matches a URI by comparing it against a single, fixed string.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
route.uri "https://example.com"
|
13
|
+
end
|
14
|
+
{% endhighlight %}
|
15
|
+
|
16
|
+
Matches:
|
17
|
+
|
18
|
+
* Only `https://example.com`
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative "../lib/wayfarer"
|
2
|
+
|
3
|
+
# Example job: walks a GitHub repository's pages and gathers one record
# (id, title, author) per issue page it is routed to.
class CollectGithubIssues < Wayfarer::Job
  config.connection_count = 4
  config.logger.level = :fatal

  let(:records) { [] }

  routes do
    host "github.com" do
      path "/:user/:repo", to: :repository
      path "/:user/:repo/issues", to: :index
      path "/:user/:repo/issues/:id", to: :show
    end
  end

  # Prints every collected record once the crawl is over.
  after_crawl do
    # Save them somewhere?
    records.each { |record| puts record }
  end

  # Repository page: stages the links found in the repository navigation.
  def repository
    stage(navigation_links)
  end

  # Issue listing page: stages the listed issues and the next listing page.
  def index
    stage(issue_listing_links, next_page)
  end

  # Single issue page: calls `halt` once more than 30 records exist,
  # otherwise appends a record for this issue.
  def show
    if records.count > 30
      halt
    else
      record = {
        id: params[:id],
        title: issue_title,
        author: issue_author
      }
      records << record
    end
  end

  private

  # Stripped text of the elements matching `.js-issue-title`.
  def issue_title
    title_nodes = doc.css(".js-issue-title")
    title_nodes.text.strip
  end

  # Stripped text of the elements matching `.TableObject-item .author`.
  def issue_author
    author_nodes = doc.css(".TableObject-item .author")
    author_nodes.text.strip
  end

  def navigation_links
    page.links(".reponav-item")
  end

  def issue_listing_links
    page.links(".issues-listing")
  end

  def next_page
    page.links(".next_page")
  end
end
|
64
|
+
|
65
|
+
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require_relative "../lib/wayfarer"
|
2
|
+
|
3
|
+
# Example job: follows links on en.wikipedia.org through a Selenium-driven
# Chrome until a page body contains "Foobar".
class FindFoobarOnWikipedia < Wayfarer::Job
  config.http_adapter = :selenium
  config.selenium_argv = [:chrome]
  config.connection_count = 4

  let(:keywords) { [] }

  route.host "en.wikipedia.org", to: :article

  # Records the page's keywords and stages its links; when the body matches
  # /Foobar/ it saves a screenshot to /tmp/foobar.png and calls `halt` instead.
  def article
    unless page.body =~ /Foobar/
      keywords << page.keywords
      return stage(page.links)
    end

    driver.save_screenshot("/tmp/foobar.png")
    halt
  end
end
|
22
|
+
|
23
|
+
FindFoobarOnWikipedia.perform_now("https://en.wikipedia.org/wiki/Special:Random")
|
data/lib/wayfarer.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# rubocop:disable Style/Documentation
|
4
|
+
require "logger"
|
5
|
+
require "uri"
|
6
|
+
|
7
|
+
# Plumbing
|
8
|
+
require_relative "wayfarer/configuration"
|
9
|
+
|
10
|
+
# Routing
|
11
|
+
require_relative "wayfarer/routing/rule"
|
12
|
+
require_relative "wayfarer/routing/uri_rule"
|
13
|
+
require_relative "wayfarer/routing/host_rule"
|
14
|
+
require_relative "wayfarer/routing/path_rule"
|
15
|
+
require_relative "wayfarer/routing/query_rule"
|
16
|
+
require_relative "wayfarer/routing/router"
|
17
|
+
|
18
|
+
# Networking
|
19
|
+
require_relative "wayfarer/http_adapters/net_http_adapter"
|
20
|
+
require_relative "wayfarer/http_adapters/selenium_adapter"
|
21
|
+
require_relative "wayfarer/http_adapters/adapter_pool"
|
22
|
+
|
23
|
+
# Parsers
|
24
|
+
require_relative "wayfarer/parsers/xml_parser"
|
25
|
+
require_relative "wayfarer/parsers/json_parser"
|
26
|
+
|
27
|
+
# Frontiers
|
28
|
+
require_relative "wayfarer/frontiers/frontier"
|
29
|
+
require_relative "wayfarer/frontiers/memory_frontier"
|
30
|
+
require_relative "wayfarer/frontiers/redis_frontier"
|
31
|
+
require_relative "wayfarer/frontiers/normalize_uris"
|
32
|
+
|
33
|
+
unless RUBY_PLATFORM == "java"
|
34
|
+
require_relative "wayfarer/frontiers/memory_trie_frontier"
|
35
|
+
require_relative "wayfarer/frontiers/memory_bloomfilter"
|
36
|
+
require_relative "wayfarer/frontiers/redis_bloomfilter"
|
37
|
+
end
|
38
|
+
|
39
|
+
# Processing
|
40
|
+
require_relative "wayfarer/crawl"
|
41
|
+
require_relative "wayfarer/crawl_observer"
|
42
|
+
require_relative "wayfarer/locals"
|
43
|
+
require_relative "wayfarer/job"
|
44
|
+
require_relative "wayfarer/finders"
|
45
|
+
require_relative "wayfarer/page"
|
46
|
+
require_relative "wayfarer/dispatcher"
|
47
|
+
require_relative "wayfarer/processor"
|
48
|
+
|
49
|
+
module Wayfarer
  VERSION = "0.0.1"

  class << self
    # Library-wide logger, created on first use.
    # Writes to STDOUT at WARN level.
    # @return [Logger]
    def logger
      @logger ||= Logger.new(STDOUT).tap { |log| log.level = Logger::WARN }
    end

    # Library-wide configuration, created on first use.
    # @yield [Configuration] the configuration, when a block is given
    # @return [Configuration]
    def config
      (@config ||= Configuration.new).tap do |settings|
        yield(settings) if block_given?
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ostruct"
|
4
|
+
require "securerandom"
|
5
|
+
require "forwardable"
|
6
|
+
|
7
|
+
module Wayfarer
  # Settings container: an OpenStruct seeded with {DEFAULTS}, optionally
  # overridden per instance, and tagged with a random UUID.
  class Configuration < OpenStruct
    extend Forwardable

    DEFAULTS = {
      # Print full stacktraces?
      print_stacktraces: true,

      # Crash when encountering unhandled exceptions?
      reraise_exceptions: false,

      # Allow processing URIs multiple times?
      allow_circulation: false,

      # How many HTTP connections/Selenium drivers to use
      # 1:1 correspondence with spawned threads
      connection_count: 1,

      # Which HTTP adapter to use. Supported are :net_http and :selenium
      http_adapter: :net_http,

      # Which frontier to use.
      frontier: :memory,

      # How long a thread may hold an HTTP adapter.
      # Threads that exceed this limit fail with an exception.
      connection_timeout: Float::INFINITY,

      # How many 3xx redirects to follow. Has no effect when using Selenium
      max_http_redirects: 3,

      # Argument vector for instantiating Selenium drivers
      selenium_argv: [:firefox],

      # Argument vector for instantiating a Redis connection
      redis_opts: {
        host: "localhost",
        port: 6379
      }.freeze,

      # Size of browser windows
      window_size: [1024, 768],

      # Which Mustermann pattern type to use when matching URI paths
      # TODO: Mention in docs
      mustermann_type: :sinatra,

      # Options for instantiating Bloomfilters
      bloomfilter_opts: {
        size: 100,
        hashes: 2,
        seed: 1,
        bucket: 3,
        raise: false
      },

      # Whether to normalize URIs
      normalize_uris: true,

      # URI normalization options
      # See: https://github.com/rwz/normalize_url
      normalize_uri_options: {}
    }.freeze

    # @!attribute [r] uuid
    #   @return [String] random UUID assigned at construction
    attr_reader :uuid

    # @param overrides [Hash] values that take precedence over {DEFAULTS}
    def initialize(overrides = {})
      super(default_values.merge(overrides))
      @uuid = SecureRandom.uuid
    end

    # A copy of the library-wide logger, created on first use.
    def logger
      @logger ||= Wayfarer.logger.dup
    end

    # Sets every key listed in {DEFAULTS} back to its default value.
    # Keys that are not part of {DEFAULTS} are left untouched.
    def reset!
      default_values.each { |key, val| self[key] = val }
    end

    private

    # Builds a per-instance copy of {DEFAULTS}. Array and Hash values are
    # cloned so that in-place mutation through one configuration (for example
    # `config.selenium_argv << :chrome`) cannot leak into DEFAULTS or into
    # other configurations. `clone` keeps the frozen state, so `redis_opts`
    # stays frozen as before.
    def default_values
      DEFAULTS.each_with_object({}) do |(key, val), copy|
        copy[key] = case val
                    when Hash, Array then val.clone
                    else val
                    end
      end
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "observer"
|
4
|
+
require "normalize_url"
|
5
|
+
|
6
|
+
module Wayfarer
  # Ties a prepared job to a dispatcher, a processor and a frontier, and runs
  # the crawl while a temporary SIGINT handler is installed.
  class Crawl
    extend Forwardable
    include Observable

    # The prepared job.
    # @!attribute [r] job
    attr_reader :job

    # @!attribute [r] dispatcher
    attr_reader :dispatcher

    delegate config: :job
    delegate logger: :config

    # @param job the job to crawl with; `job.prepare` is what gets stored
    # @param uris the URIs the crawl starts from
    def initialize(job, *uris)
      @job = job.prepare
      @uris = uris
      @dispatcher = Dispatcher.new(@job)
      @processor = Processor.new(@job, frontier, @dispatcher)
    end

    # Runs the crawl: attaches a CrawlObserver, then runs the :before_crawl
    # hook, the processor and the :after_crawl hook in that order. The SIGINT
    # handler that was active beforehand is restored even if the crawl raises.
    def execute
      trap_signals

      CrawlObserver.new(@processor, @dispatcher, config.logger)

      @job.run_hook(:before_crawl)
      @processor.run(*@uris)
      @job.run_hook(:after_crawl)
    ensure
      untrap_signals
    end

    # A frontier with initially pre-staged URIs.
    # The implementation is chosen by `config.frontier`; unknown values fall
    # back to the in-memory frontier.
    # @return [Frontier]
    def frontier
      return @frontier if @frontier

      @frontier = case config.frontier
                  when :memory_trie
                    Frontiers::MemoryTrieFrontier.new(config)
                  when :redis
                    Frontiers::RedisFrontier.new(config)
                  when :memory_bloom
                    Frontiers::MemoryBloomfilter.new(config)
                  when :redis_bloom
                    Frontiers::RedisBloomfilter.new(config)
                  else
                    Frontiers::MemoryFrontier.new(config)
                  end

      @frontier.extend(Frontiers::NormalizeURIs) if config.normalize_uris

      @frontier.stage(*@uris) # TODO: Test

      @frontier
    end

    private

    # Installs a SIGINT handler for the duration of the crawl and remembers
    # the handler it replaced. `trap` returns either a callable or a String
    # such as "DEFAULT", so the previous handler is only chained to when it
    # actually responds to #call.
    def trap_signals
      @cached_sigint_handler = trap(:INT) do
        # NOTE(review): halt! is not defined in this class as shown here;
        # confirm where it is expected to come from.
        halt!
        previous = @cached_sigint_handler
        previous.call if previous.respond_to?(:call)
        exit(-1)
      end
      @sigint_trapped = true
    end

    # Puts back the SIGINT handler that was active before #trap_signals.
    # Restoring the saved handler itself (instead of installing a wrapper
    # block around it) keeps Ctrl-C working after the crawl when the previous
    # handler was the "DEFAULT" String. Does nothing if no handler was
    # installed by this crawl.
    def untrap_signals
      return unless @sigint_trapped

      trap(:INT, @cached_sigint_handler || "DEFAULT")
      @sigint_trapped = false
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
  # Observer that turns crawl events into log lines on the given logger.
  class CrawlObserver
    # Event value objects passed to #update by the observed objects.
    module Events
      FirstCycle     = Struct.new(:frontier)
      NewCycle       = Struct.new(:current_uris_count)
      DispatchedURI  = Struct.new(:action, :uri)
      CycleFinished  = Class.new
      Peeking        = Struct.new(:uri)
      AboutToCycle   = Struct.new(:staged_uris_count)
      MismatchedURI  = Struct.new(:uri)
      HaltInitiated  = Struct.new(:action, :uri)
      StagingURIs    = Struct.new(:staged_uris_count)
      UnhandledError = Struct.new(:exception)
    end

    # Mixin for Observable classes: marks the object as changed and notifies
    # its observers in a single call.
    module ObservableShortcuts
      def notify_observers!(*argv)
        changed
        notify_observers(*argv)
      end
    end

    extend Forwardable

    attr_reader :logger

    # @param observables objects responding to #add_observer; this observer
    #   registers itself with each of them
    # @param logger the logger that receives the messages (last argument)
    def initialize(*observables, logger)
      @logger = logger
      observables.each { |obsv| obsv.add_observer(self) }
    end

    # Observer callback: dispatches on the event's class and logs it.
    # Events of any other class are ignored.
    def update(event)
      case event
      when Events::FirstCycle     then first_cycle(event)
      when Events::NewCycle       then new_cycle(event)
      when Events::DispatchedURI  then dispatched_uri(event)
      when Events::CycleFinished  then cycle_finished
      when Events::Peeking        then peeking(event)
      when Events::AboutToCycle   then about_to_cycle(event)
      when Events::MismatchedURI  then mismatched_uri(event)
      when Events::HaltInitiated  then halt_initiated(event)
      when Events::StagingURIs    then staging_uris(event)
      when Events::UnhandledError then unhandled_error(event)
      end
    end

    private

    # Settings consulted by #unhandled_error. The class previously called
    # `config` without defining it, which raised NameError on the first
    # unhandled-error event.
    # NOTE(review): this reads the library-wide Wayfarer.config; per-job
    # overrides are not visible from here — confirm that is acceptable.
    def config
      Wayfarer.config
    end

    def first_cycle(event)
      logger.info("First cycle")
      logger.info("Frontier: #{event.frontier}")
    end

    def new_cycle(event)
      logger.info("Current cycle contains #{event.current_uris_count} URI(s)")
    end

    def dispatched_uri(event)
      logger.info("Dispatched to \##{event.action}: #{event.uri}")
    end

    def cycle_finished
      logger.info("No URIs left in current cycle")
    end

    def peeking(event)
      logger.info("Peeking into: #{event.uri}")
    end

    def about_to_cycle(event)
      logger.info("About to cycle. #{event.staged_uris_count} staged URI(s)")
    end

    def mismatched_uri(event)
      logger.debug("No matching route for: #{event.uri}")
    end

    def halt_initiated(event)
      logger.info("Halt initiated from \##{event.action} at: #{event.uri}")
    end

    def staging_uris(event)
      logger.info("Staging #{event.staged_uris_count} URI(s)")
    end

    # Logs the exception class at :fatal when exceptions are configured to be
    # re-raised and at :error otherwise; appends the backtrace when
    # `print_stacktraces` is set.
    def unhandled_error(event)
      level = config.reraise_exceptions ? :fatal : :error
      exception = event.exception

      if config.print_stacktraces
        # An exception that was never raised has a nil backtrace.
        backtrace = Array(exception.backtrace)

        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{exception.class.inspect}
          #{backtrace.map(&:to_s).join("\n* ")}
        LOGGER
      else
        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{exception.class.inspect}
        LOGGER
      end
    end
  end
end
|