wayfarer 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Changelog.md +10 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +21 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +187 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/custom_rules.md +16 -0
- data/docs/routing/filetypes_rules.md +21 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/protocol_rules.md +17 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +211 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/custom_rule.rb +21 -0
- data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/protocol_rule.rb +21 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +114 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/lib/wayfarer.rb +68 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/integration/peeking_spec.rb +61 -0
- data/spec/job_spec.rb +122 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/custom_rule_spec.rb +26 -0
- data/spec/routing/filetypes_rule_spec.rb +40 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/protocol_rule_spec.rb +26 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +251 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +697 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "selenium-webdriver"
|
|
4
|
+
require "selenium/emulated_features"
|
|
5
|
+
require "capybara"
|
|
6
|
+
|
|
7
|
+
module Wayfarer
  module HTTPAdapters
    # An adapter for Selenium WebDrivers.
    #
    # The WebDriver and the Capybara session wrapping it are both created
    # lazily and memoized; {#reload!} and {#free} discard both of them.
    # @api private
    class SeleniumAdapter
      # @param config [Object] configuration to read from; it must respond to
      #   `selenium_argv` and `window_size`. Defaults to {Wayfarer.config}.
      def initialize(config = Wayfarer.config)
        @config = config
      end

      # Fetches a page by navigating the WebDriver to the given location.
      # @param uri [String, URI] where to navigate to.
      # @return [Page] built from the driver's current URL, response code,
      #   page source and response headers.
      def fetch(uri)
        driver.navigate.to(uri)

        # NOTE(review): `response_code` and `response_headers` are presumably
        # supplied by "selenium/emulated_features" — confirm.
        Page.new(
          uri: @driver.current_url,
          status_code: @driver.response_code,
          body: @driver.page_source,
          headers: @driver.response_headers
        )
      end

      # Closes the driver.
      # The next call to {#driver} or {#browser} instantiates a fresh one.
      def reload!
        @driver&.close
        @driver = nil
        # The memoized Capybara session wraps the driver that was just closed,
        # so it has to be rebuilt as well.
        @browser = nil
      end

      # Quits the browser.
      # The next call to {#driver} or {#browser} instantiates a fresh one.
      def free
        @driver&.quit
        @driver = nil
        # See #reload!: never hand out a session bound to a dead driver.
        @browser = nil
      end

      # The WebDriver, instantiated on first access.
      # @return [Selenium::WebDriver::Driver]
      def driver
        @driver ||= instantiate_driver
      end

      # A Capybara session that wraps the {#driver}, built on first access.
      # @see https://github.com/teamcapybara/capybara Capybara
      # @return [Capybara::Session]
      def browser
        @browser ||= instantiate_capybara_driver
      end

      private

      # Builds a WebDriver from `config.selenium_argv` and resizes its window
      # to `config.window_size`.
      def instantiate_driver
        driver = Selenium::WebDriver.for(*@config.selenium_argv)
        driver.manage.window.size = Selenium::WebDriver::Dimension.new(
          *@config.window_size
        )
        driver
      end

      # Builds a Capybara session around the existing WebDriver.
      # Note that this mutates global Capybara settings.
      def instantiate_capybara_driver
        Capybara.run_server = false
        Capybara.current_driver = :selenium

        # The driver and session are created empty and then pointed at our own
        # WebDriver through their instance variables (presumably so Capybara
        # reuses this adapter's browser instead of launching another one).
        capybara_driver = Capybara::Selenium::Driver.new(nil)
        capybara_driver.instance_variable_set(:@browser, driver)

        session = Capybara::Session.new(:selenium, nil)
        session.instance_variable_set(:@driver, capybara_driver)

        session
      end
    end
  end
end
|
data/lib/wayfarer/job.rb
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "forwardable"
|
|
4
|
+
require "hooks"
|
|
5
|
+
require "active_job"
|
|
6
|
+
|
|
7
|
+
# TODO: I only want deep_dup
|
|
8
|
+
require "active_support/all"
|
|
9
|
+
|
|
10
|
+
module Wayfarer
  # A {Job} is a class that has a {Routing::Router} with many {Routing::Rule}s
  # which are matched against a URI. Rules map URIs onto job instance methods.
  # Under the hood, jobs are instantiated within separate threads by a
  # {Processor}. Every instance gets its own thread. If a URI is matched, its
  # {Page} is retrieved, and made available to instance methods via {#page}.
  #
  # Jobs implement ActiveJob's Job API and are therefore compatible with a wide
  # range of job queues. To run a job immediately, call ::perform_now. To
  # enqueue a job, call ::perform_later.
  #
  # @see https://github.com/rails/rails/tree/master/activejob rails/activejob
  # @see http://edgeguides.rubyonrails.org/active_job_basics.html ActiveJob Basics
  class Job < ActiveJob::Base
    extend Forwardable

    include Hooks
    include Locals

    # @!group Callbacks

    # Callback that fires __once__ before any pages are retrieved.
    # @method before_crawl
    # @scope class
    define_hook :before_crawl

    # Callback that fires __once__ after all pages have been retrieved and
    # processing is done.
    # @method after_crawl
    # @scope class
    define_hook :after_crawl

    # Callback that fires when HTTP adapters are instantiated.
    # @method setup_adapter
    # @scope class
    # @yield [[HTTPAdapters::NetHTTPAdapter, HTTPAdapters::SeleniumAdapter], [Selenium::WebDriver::Driver, nil], [Capybara::Selenium::Driver, nil]]
    define_hooks :setup_adapter

    # @!endgroup

    class << self
      extend Forwardable

      # @!attribute [w] router
      attr_writer :router

      # @!attribute [w] config
      attr_writer :config

      # Returns a class copy.
      #
      # The copy gets its own duplicates of the router, the configuration and
      # the locals. Every local is replaced by the value returned from
      # {Locals.thread_safe_counterpart}, and a reader named after each local
      # is defined on the copy and on its instances.
      # @return [Class] the duplicated job class.
      def prepare
        duplicate = dup
        duplicate.router = router.dup
        duplicate.locals = locals.deep_dup
        duplicate.config = config.dup

        duplicate.locals.each do |(key, val)|
          duplicate.locals[key] = Locals.thread_safe_counterpart(val)
        end

        duplicate.locals.each do |(key, _)|
          # The readers look the value up on every call instead of capturing
          # it, so they always reflect the current content of `locals`.
          duplicate.send(:define_method, key) { duplicate.locals[key] }
          duplicate.send(:define_singleton_method, key) do
            duplicate.locals[key]
          end
        end

        duplicate
      end

      # A configuration based off the global {Wayfarer.config}.
      # The global configuration is cloned on first access.
      # @yield [Configuration] the configuration, if a block is given.
      # @return [Configuration]
      def config
        @config ||= Wayfarer.config.clone
        yield(@config) if block_given?
        @config
      end

      # A router, created on first access.
      # If a block is passed in, it is evaluated within the {Router}'s instance.
      # @return [Routing::Router]
      def router(&proc)
        @router ||= Routing::Router.new
        @router.instance_eval(&proc) if block_given?
        @router
      end

      alias route router
      alias routes router

      # Overshadows ActiveJob::Base's own logger
      delegate logger: :config
    end

    # @!attribute [r] staged_uris
    # @return [Array<String>, Array<URI>] URIs to stage for the next cycle.
    # @see #stage
    attr_reader :staged_uris

    # @!attribute [rw] page
    attr_writer :page

    # @!attribute [rw] adapter
    attr_accessor :adapter

    # @!attribute [rw] params
    attr_accessor :params

    # Starts out not halting and without staged URIs; all arguments are
    # forwarded to the superclass.
    def initialize(*argv)
      @halts = false
      @staged_uris = []
      super(*argv)
    end

    # Whether this job will stop processing.
    # @return [true, false]
    def halts?
      @halts
    end

    # Performs this job by executing a {Crawl} for this job's class.
    # @note ActiveJob API
    # @override
    # @param uris [Array<String>, Array<URI>] the URIs handed to the crawl.
    def perform(*uris)
      Crawl.new(self.class, *uris).execute
    end

    protected

    # All following instance methods are available within actions.

    # Sets a halting flag that signals the processor to stop its work.
    def halt
      @halts = true
    end

    # Adds URIs to process in the next cycle.
    # If a relative path is given, an absolute URI is constructed from the
    # current {#page}'s URI.
    # @param uris [String, URI, Array<String>, Array<URI>]
    def stage(*uris)
      expanded = uris.flatten.map do |u|
        if (uri = URI(u)).absolute?
          uri
        else
          # URI#join would discard the path of page.uri.path
          current = page.uri.dup
          current.path = File.join(page.uri.path, uri.path)
          current
        end
      end

      # This method has somewhat become the guard keeper for invalid URIs that
      # would lead to exceptions otherwise down the line
      supported = expanded.select do |uri|
        HTTPAdapters::NetHTTPAdapter::RECOGNIZED_URI_TYPES.any? do |type|
          uri.is_a?(type)
        end
      end

      @staged_uris.push(*supported)
    end

    # The {Page} representing the URI currently processed by an action.
    # When using the Selenium adapter, {Page#body} gets refreshed on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @return [Page]
    def page
      return @page unless self.class.config.http_adapter == :selenium

      # Only the body is re-read from the driver; URI, status code and headers
      # are carried over from the page that was originally fetched.
      Page.new(
        uri: @page.uri,
        status_code: @page.status_code,
        body: driver.page_source,
        headers: @page.headers
      )
    end

    # The parsed response body.
    # When using the Selenium adapter, this parses the body again on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @method doc
    # @see Page#doc
    delegate doc: :page

    # The Selenium WebDriver.
    # @method driver
    # @see HTTPAdapters::SeleniumAdapter#driver
    delegate driver: :adapter

    # A Capybara driver that wraps the {#driver}.
    # @method browser
    # @see HTTPAdapters::SeleniumAdapter#browser
    delegate browser: :adapter

    # @method logger
    delegate logger: :"self.class"
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "concurrent"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  # Class-level, named values ("locals") for classes that include this module.
  # @api private
  module Locals
    # Maps a value onto a concurrent-ruby container holding the same content.
    # Arrays, hashes, booleans and integers are wrapped; any other value is
    # returned untouched.
    # @param value [Object]
    # @return [Concurrent::Array, Concurrent::Hash, Concurrent::AtomicBoolean,
    #   Concurrent::AtomicFixnum, Object]
    def self.thread_safe_counterpart(value)
      case value
      when Array then Concurrent::Array.new(value)
      when Hash then Concurrent::Hash[value]
      when TrueClass, FalseClass then Concurrent::AtomicBoolean.new(value)
      when Integer then Concurrent::AtomicFixnum.new(value)
      else value
      end
    end

    # Hook that adds {ClassMethods} to every class including this module.
    def self.included(base)
      base.extend(ClassMethods)
    end

    # Methods made available on the including class itself.
    module ClassMethods
      # @!attribute [w] locals
      #   Replaces the whole locals hash.
      attr_writer :locals

      # Stores the value returned by the given block under `key`.
      # @param key [Object] the name of the local.
      # @yieldreturn [Object] the value to store.
      # @raise [RuntimeError] if no block is given.
      # @return [Object] the stored value.
      def let(key)
        raise "#let called without a block" unless block_given?
        locals[key] = yield
      end

      # The locals defined so far; an empty hash is created on first access.
      # @return [Hash]
      def locals
        @locals ||= {}
      end
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ostruct"
|
|
4
|
+
require "forwardable"
|
|
5
|
+
require "mime/types"
|
|
6
|
+
require "mime-types"
|
|
7
|
+
|
|
8
|
+
require "pismo" unless RUBY_PLATFORM == "java"
|
|
9
|
+
|
|
10
|
+
module Wayfarer
  # The representation of fetched pages
  class Page
    extend Forwardable

    include Finders

    # @!attribute [r] uri
    # @return [URI] the URI of the page.
    attr_reader :uri

    # @!attribute [r] status_code
    # @return [Integer] the response status code.
    attr_reader :status_code

    # @!attribute [rw] body
    # @return [String] the response body.
    attr_accessor :body

    # @!attribute [r] headers
    # @return [Hash] the response headers.
    attr_reader :headers

    # @param attrs [Hash] the page attributes, read from the keys `:uri`,
    #   `:status_code`, `:body` and `:headers`. Missing keys yield `nil`.
    def initialize(attrs = {})
      @uri         = attrs[:uri]
      @status_code = attrs[:status_code]
      @body        = attrs[:body]
      @headers     = attrs[:headers]
    end

    # Returns a parsed representation of the fetched document depending on the
    # Content-Type field. The result is memoized.
    #
    # If there are no headers, no Content-Type field, or the content type is
    # not a registered MIME type, the body is treated as HTML.
    # @return [Object] the result of {Parsers::JSONParser.parse} if the
    #   Content-Type field's sub-type is "json".
    # @return [Nokogiri::XML::Document] if the Content-Type field's sub-type is "xml".
    # @return [Nokogiri::HTML::Document] otherwise.
    def doc
      return @doc if @doc

      # TODO: Tests
      @doc = case content_sub_type
             when "json"
               Parsers::JSONParser.parse(@body)
             when "xml"
               Parsers::XMLParser.parse_xml(@body)
             else
               Parsers::XMLParser.parse_html(@body)
             end
    end

    # Pismo is not supported on JRuby.
    unless RUBY_PLATFORM == "java"
      # `#images` is not delegated (presumably the included Finders module
      # provides it — TODO confirm)
      # `#body` is an attribute accessor defined above
      delegate (Pismo::Document::ATTRIBUTE_METHODS - %i[images body]) => :pismo
    end

    private

    # The MIME sub-type of the Content-Type header, e.g. "json" for
    # "application/json".
    # @return [String, nil] `nil` if the header is absent or names a MIME type
    #   that is not registered.
    def content_sub_type
      # Header values may be a list of strings or a bare string; Array()
      # covers both, as well as missing headers.
      content_type = Array(@headers && @headers["content-type"]).first
      return unless content_type

      # The lookup returns an empty list for unregistered types.
      MIME::Types[content_type].first&.sub_type
    end

    # Returns a Pismo document, built on first access.
    # @note Not available on JRuby.
    # @note Only succeeds when {#doc} returns a `Nokogiri::HTML::Document`.
    # @return [Pismo::Document]
    def pismo
      @pismo_doc ||= instantiate_pismo_document
    end

    # Allocates a Pismo document without running its initializer and fills in
    # the state it needs from this page.
    def instantiate_pismo_document
      doc = Pismo::Document.allocate
      doc.instance_variable_set(:@options, {})
      doc.instance_variable_set(:@url, uri)
      doc.instance_variable_set(:@html, body)
      doc.instance_variable_set(:@doc, self.doc)
      doc
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "oj" unless RUBY_PLATFORM == "java"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Parsers
    # A wrapper module for parsing JSON.
    # Uses the standard library's JSON on JRuby and Oj everywhere else.
    # @private
    module JSONParser
      module_function

      # Parses a JSON string.
      # @param [String] json_str the JSON string to parse.
      # @return [Object] whatever the underlying parser yields. `JSON.parse`
      #   returns hashes, arrays and scalars; presumably `Oj.load` does the
      #   same for plain JSON — TODO confirm.
      def parse(json_str)
        if RUBY_PLATFORM == "java"
          # Oj is not loaded on JRuby, so make sure the standard library's
          # parser is available before using it.
          require "json"
          JSON.parse(json_str)
        else
          # NOTE(review): Page#doc passes response bodies in here. Confirm
          # that Oj's default load mode is safe for untrusted JSON; otherwise
          # consider Oj.safe_load or strict mode.
          Oj.load(json_str)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Parsers
    # Thin wrapper around Nokogiri for turning HTML/XML markup into documents.
    # @private
    module XMLParser
      # Builds an XML document from markup.
      # @param [String] xml_str the XML markup.
      # @return [Nokogiri::XML::Document]
      def parse_xml(xml_str)
        Nokogiri.XML(xml_str)
      end

      # Builds an HTML document from markup.
      # @param [String] html_str the HTML markup.
      # @return [Nokogiri::HTML::Document]
      def parse_html(html_str)
        Nokogiri.HTML(html_str)
      end

      # Expose both parsers as module-level functions.
      module_function :parse_xml, :parse_html
    end
  end
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pp"
|
|
4
|
+
require "concurrent"
|
|
5
|
+
require "observer"
|
|
6
|
+
|
|
7
|
+
module Wayfarer
  # Runs jobs.
  #
  # Works through the frontier cycle by cycle, fanning the URIs of each cycle
  # out to a set of worker threads and reporting progress to its observers.
  class Processor
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    attr_reader :job

    delegate config: :job
    delegate logger: :config

    # @param job the job that URIs get dispatched to.
    # @param frontier the frontier supplying the URIs of every cycle.
    # @param dispatcher the dispatcher that processes a single URI.
    def initialize(job, frontier, dispatcher)
      @job        = job
      @frontier   = frontier
      @dispatcher = dispatcher
      @halted     = Concurrent::AtomicBoolean.new(false)
    end

    # Whether processing is done.
    # @return [true, false]
    def halted?
      @halted.value
    end

    # Sets a halt flag.
    def halt!
      @halted.make_true
    end

    # Runs the job.
    #
    # Cycles for as long as no halt was requested and the frontier reports
    # another cycle. When the loop is left for any reason, the halt flag is
    # set and the frontier and the adapter pool are freed.
    # @param [*Array<URI>, *Array<String>] _uris ignored by this method.
    def run(*_uris)
      notify_observers!(FirstCycle.new(@frontier))

      while @halted.false? && @frontier.cycle
        batch = @frontier.current_uris

        pending = Queue.new
        batch.each { |uri| pending.push(uri) }

        notify_observers!(NewCycle.new(batch.count))

        @threads = Array.new(config.connection_count) do
          Thread.new { drain(pending) }
        end

        @threads.each(&:join)

        notify_observers!(AboutToCycle.new(@frontier.staged_uris.count))
      end
    ensure
      halt!
      @frontier.free
      @dispatcher.adapter_pool.free
    end

    private

    # Body of a worker thread: takes URIs off the queue without blocking and
    # dispatches them until a halt is requested. Popping from the drained
    # queue raises a ThreadError, which is reported as the end of the cycle.
    def drain(queue)
      loop do
        uri = queue.pop(true)
        break if uri.nil? || @halted.true?

        outcome = @dispatcher.dispatch(@job, uri)
        handle_dispatch_result(outcome)
      end
    rescue ThreadError
      notify_observers!(CycleFinished.new)
    end

    # Routes a dispatch result to its handler; unknown results are ignored.
    def handle_dispatch_result(result)
      case result
      when Dispatcher::Mismatch
        handle_mismatch(result)
      when Dispatcher::Halt
        handle_halt(result)
      when Dispatcher::Stage
        handle_stage(result)
      when Dispatcher::Error
        handle_error(result)
      end
    end

    # Reports a URI that matched no route.
    def handle_mismatch(mismatch)
      event = MismatchedURI.new(mismatch.uri)
      notify_observers!(event)
    end

    # Reports the halt request and sets the halt flag.
    def handle_halt(halt)
      event = HaltInitiated.new(halt.action, halt.uri)
      notify_observers!(event)
      halt!
    end

    # Reports the staged URIs and hands them to the frontier, unless
    # processing has been halted in the meantime.
    def handle_stage(stage)
      notify_observers!(StagingURIs.new(stage.uris.count))
      return if halted?

      @frontier.stage(*stage.uris)
    end

    # Reports an exception that was raised while dispatching.
    def handle_error(error)
      event = UnhandledError.new(error.exception)
      notify_observers!(event)
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Routing
    # A rule whose matching logic is supplied by the caller, either as an
    # object responding to `call` or as a block.
    # @private
    class CustomRule < Rule
      # @param delegate_or_block [#call, nil] the matcher; if omitted, the
      #   given block is used as the matcher instead.
      # @param opts [Hash] forwarded to the superclass.
      # @param proc the block; forwarded to the superclass and, when no
      #   delegate is given, also used as the matcher.
      # @raise [ArgumentError] if neither a delegate nor a block is given.
      def initialize(delegate_or_block = nil, opts = {}, &proc)
        # The fallback lives in the body on purpose: inside a default-value
        # expression the block parameter is not in scope yet, so a bare
        # `proc` there is a call to Kernel#proc, which raises when invoked
        # without a block of its own on current Rubies.
        @delegate_or_block = delegate_or_block || proc

        unless @delegate_or_block
          raise ArgumentError, "CustomRule needs a callable or a block"
        end

        super(opts, &proc)
      end

      private

      # @return [true, false] the matcher's verdict for the URI, coerced to a
      #   boolean.
      def match!(uri)
        !!@delegate_or_block.call(uri)
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
require "uri"
|
|
3
|
+
|
|
4
|
+
module Wayfarer
  module Routing
    # A rule that matches URIs whose path ends in one of a set of file
    # extensions.
    # @private
    class FiletypesRule < Rule
      # @param types the file extensions to look for, without the leading dot.
      #   Each one is interpolated into a regular expression as-is.
      # @param opts [Hash] forwarded to the superclass.
      def initialize(types, opts = {}, &proc)
        @types = types
        super(opts, &proc)
      end

      private

      # @return [true, false] whether the URI's path ends in ".<type>" for any
      #   of the configured types.
      def match!(uri)
        @types.any? do |type|
          extension = /\.#{type}$/
          uri.path =~ extension
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wayfarer
  module Routing
    # A rule that matches URIs by their host.
    # @private
    class HostRule < Rule
      # @param str_or_regexp the pattern the host is compared against with
      #   `===` (presumably a String or a Regexp, going by the name).
      # @param opts [Hash] forwarded to the superclass.
      def initialize(str_or_regexp, opts = {}, &proc)
        @str_or_regexp = str_or_regexp
        super(opts, &proc)
      end

      # Case equality lets one comparison serve every kind of pattern.
      # @return the result of `pattern === host`.
      # rubocop:disable Style/CaseEquality
      def match!(uri)
        host = uri.host
        @str_or_regexp === host
      end
      # rubocop:enable Style/CaseEquality
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "mustermann"
|
|
4
|
+
|
|
5
|
+
module Wayfarer
  module Routing
    # A rule that matches URIs by their path, using either a Mustermann
    # pattern or any other matcher that is passed in.
    # @private
    class PathRule < Rule
      attr_reader :matcher

      # @param arg [String, Object] a String is compiled into a Mustermann
      #   pattern of the globally configured type; anything else is used as
      #   the matcher as-is.
      # @param opts [Hash] forwarded to the superclass.
      def initialize(arg, opts = {}, &proc)
        @matcher =
          if arg.is_a?(String)
            Mustermann.new(arg, type: Wayfarer.config.mustermann_type)
          else
            arg
          end

        super(opts, &proc)
      end

      # The parameters extracted from a URI.
      # @return [Hash] empty if the URI does not match. For Mustermann
      #   matchers, whatever the pattern extracts from the path; otherwise
      #   the match's captures keyed by their position as a String
      #   ("0", "1", ...).
      def params(uri)
        return {} unless match!(uri)
        return @matcher.params(uri.path) if @matcher.is_a?(Mustermann)

        captures = @matcher.match(full_path(uri)).captures

        captures.each_with_index.each_with_object({}) do |(capture, index), hash|
          hash[index.to_s] = capture
        end
      end

      private

      # Mustermann matchers only see the path; any other matcher is run
      # against path, query and fragment combined.
      # rubocop:disable Style/CaseEquality
      def match!(uri)
        return @matcher === uri.path if @matcher.is_a?(Mustermann)

        @matcher =~ full_path(uri)
      end
      # rubocop:enable Style/CaseEquality

      # Path, query and fragment joined into one string. The "?" and "#"
      # separators are always present, even if the URI has no query or
      # fragment.
      def full_path(uri)
        [uri.path, "?", uri.query, "#", uri.fragment].join
      end
    end
  end
end
|