wayfarer 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Changelog.md +10 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +21 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +187 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/custom_rules.md +16 -0
- data/docs/routing/filetypes_rules.md +21 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/protocol_rules.md +17 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +211 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/custom_rule.rb +21 -0
- data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/protocol_rule.rb +21 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +114 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/lib/wayfarer.rb +68 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/integration/peeking_spec.rb +61 -0
- data/spec/job_spec.rb +122 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/custom_rule_spec.rb +26 -0
- data/spec/routing/filetypes_rule_spec.rb +40 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/protocol_rule_spec.rb +26 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +251 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +697 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Custom rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Custom rules
|
|
7
|
+
|
|
8
|
+
Custom rules take either a block that gets yielded the URI, or an object that responds to `#call(uri)`. If the block or the delegate returns a truthy value, the rule matches.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.if -> (uri) { uri.host == uri.host.reverse }
|
|
13
|
+
end
|
|
14
|
+
{% endhighlight %}
|
|
15
|
+
|
|
16
|
+
* Matches only URIs with palindrome hosts
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Filetypes rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Filetypes rules
|
|
7
|
+
|
|
8
|
+
Filetypes rules match against the URI path's file extension.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.filetypes [:png, :jpg], to: :image
|
|
13
|
+
route.forbid.filetypes [:php, :js]
|
|
14
|
+
end
|
|
15
|
+
{% endhighlight %}
|
|
16
|
+
|
|
17
|
+
Matches:
|
|
18
|
+
|
|
19
|
+
* `http://example.com/foo.png`
|
|
20
|
+
* `http://example.com/foo.jpg`
|
|
21
|
+
* `https://example.com/qux/bar.jpg`
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Host rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Host rules
|
|
7
|
+
|
|
8
|
+
Host rules match against a host string or RegExp.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.host "example.com"
|
|
13
|
+
route.host /example/
|
|
14
|
+
end
|
|
15
|
+
{% endhighlight %}
|
|
16
|
+
|
|
17
|
+
Matches:
|
|
18
|
+
|
|
19
|
+
* All URIs hosted on `"example.com"`.
|
|
20
|
+
* All URIs whose host contains `"example"`.
|
|
21
|
+
|
|
22
|
+
<aside class="note">
|
|
23
|
+
<code>"www.host.net"</code> and <code>"host.net"</code> are not considered equal. You have to specify the exact host when using strings. Consider using <code>/host.net/</code> instead.
|
|
24
|
+
</aside>
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Path rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Path rules
|
|
7
|
+
|
|
8
|
+
Path rules match against the path of a URI. Both strings and RegExps are accepted, and path segment pattern matching and RegExp captures are supported.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.path "/:alpha/:beta", to: :foo
|
|
13
|
+
route.path /^foobar\/(.+)/, to: :bar
|
|
14
|
+
|
|
15
|
+
def foo
|
|
16
|
+
params[:alpha]
|
|
17
|
+
params[:beta]
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def bar
|
|
21
|
+
params["0"]
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
{% endhighlight %}
|
|
25
|
+
|
|
26
|
+
Matches:
|
|
27
|
+
|
|
28
|
+
* All URIs with path segments matching `/:alpha/:beta`, e.g. `https://example.com/foo/bar`
|
|
29
|
+
* All URIs starting with `"/foobar/"`.
|
|
30
|
+
|
|
31
|
+
<aside class="note">
|
|
32
|
+
<code>/:alpha/:beta</code> and <code>:alpha/:beta</code> are not considered equal. Note the opening slash.
|
|
33
|
+
</aside>
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Protocol rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Protocol rules
|
|
7
|
+
|
|
8
|
+
Protocol rules match the URI's protocol (scheme) against a symbol or string.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.protocol :https
|
|
13
|
+
end
|
|
14
|
+
{% endhighlight %}
|
|
15
|
+
|
|
16
|
+
* Matches `https://example.com`.
|
|
17
|
+
* Does not match `http://example.com`.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Query rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Query rules
|
|
7
|
+
|
|
8
|
+
Query rules impose constraints on key-value query parameters. Strings, integers, RegExps and ranges are supported.
|
|
9
|
+
|
|
10
|
+
## String constraints
|
|
11
|
+
|
|
12
|
+
{% highlight ruby %}
|
|
13
|
+
class DummyJob < Wayfarer::Job
|
|
14
|
+
route.query arg: "foo"
|
|
15
|
+
end
|
|
16
|
+
{% endhighlight %}
|
|
17
|
+
|
|
18
|
+
* Matches `https://example.com?arg=foo`.
|
|
19
|
+
|
|
20
|
+
## Integer constraints
|
|
21
|
+
|
|
22
|
+
{% highlight ruby %}
|
|
23
|
+
class DummyJob < Wayfarer::Job
|
|
24
|
+
route.query arg: 42
|
|
25
|
+
end
|
|
26
|
+
{% endhighlight %}
|
|
27
|
+
|
|
28
|
+
* Matches `https://example.com?arg=42`.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## RegExp constraints
|
|
33
|
+
|
|
34
|
+
{% highlight ruby %}
|
|
35
|
+
class DummyJob < Wayfarer::Job
|
|
36
|
+
route.query arg: /foo/
|
|
37
|
+
end
|
|
38
|
+
{% endhighlight %}
|
|
39
|
+
|
|
40
|
+
* Matches `https://example.com?arg=foo`.
|
|
41
|
+
* Matches `https://example.com?arg=foobar`.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Range constraints
|
|
46
|
+
|
|
47
|
+
{% highlight ruby %}
|
|
48
|
+
class DummyJob < Wayfarer::Job
|
|
49
|
+
route.query arg: 1..10
|
|
50
|
+
end
|
|
51
|
+
{% endhighlight %}
|
|
52
|
+
|
|
53
|
+
* Matches `https://example.com?arg=1`.
|
|
54
|
+
* Matches […]
|
|
55
|
+
* Matches `https://example.com?arg=10`.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Compound constraints
|
|
60
|
+
|
|
61
|
+
{% highlight ruby %}
|
|
62
|
+
class DummyJob < Wayfarer::Job
|
|
63
|
+
route.query foo: 1..5, bar: /baz/, qux: "zot", toto: 2
|
|
64
|
+
end
|
|
65
|
+
{% endhighlight %}
|
|
66
|
+
|
|
67
|
+
* Matches `https://example.com?foo=4&bar=bazqux&qux=zot&toto=2`.
|
|
68
|
+
|
|
69
|
+
---
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: Routes
|
|
4
|
+
categories: [Routing]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Routes
|
|
8
|
+
|
|
9
|
+
* Routes are filters for interesting URIs.
|
|
10
|
+
* Routes put constraints on URIs that should get processed.
|
|
11
|
+
* Routes map URIs to instance methods (actions).
|
|
12
|
+
* Routes are tree nodes and thus nestable.
|
|
13
|
+
|
|
14
|
+
Currently, the following rules are available:
|
|
15
|
+
|
|
16
|
+
* [URI rules](/routing/uri_rules.html) match URIs against a string.
|
|
17
|
+
* [Host rules](/routing/host_rules.html) match hosts against strings and RegExps.
|
|
18
|
+
* [Path rules](/routing/path_rules.html) match paths against pattern strings and RegExps. They support path segment capturing.
|
|
19
|
+
* [Query rules](/routing/query_rules.html) match key-value pairs of query parameters against strings, integers, RegExps and ranges.
|
|
20
|
+
|
|
21
|
+
Routes can be forbidden. URIs that match forbidden rules are never processed.
|
|
22
|
+
|
|
23
|
+
## Route declaration
|
|
24
|
+
|
|
25
|
+
### Declaration order matching
|
|
26
|
+
|
|
27
|
+
{% highlight ruby %}
|
|
28
|
+
class DummyJob < Wayfarer::Job
|
|
29
|
+
route.host "example.com", to: :foo
|
|
30
|
+
route.path "/foo", to: :bar
|
|
31
|
+
|
|
32
|
+
# Is equivalent to:
|
|
33
|
+
#
|
|
34
|
+
# routes do
|
|
35
|
+
# host "example.com", to: :foo
|
|
36
|
+
# path "/foo", to: :bar
|
|
37
|
+
# end
|
|
38
|
+
end
|
|
39
|
+
{% endhighlight %}
|
|
40
|
+
|
|
41
|
+
* Dispatches `https://example.com/foo` to `:foo`.
|
|
42
|
+
* Dispatches `https://example.com` to `:foo`.
|
|
43
|
+
* Dispatches `https://yahoo.com/foo` to `:bar`.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
### Nesting routes (child rules)
|
|
48
|
+
|
|
49
|
+
A route matches if it has a child rule that matches. This applies recursively.
|
|
50
|
+
|
|
51
|
+
{% highlight ruby %}
|
|
52
|
+
class DummyJob < Wayfarer::Job
|
|
53
|
+
route.host "example.com", to: :foo do
|
|
54
|
+
path "/foo"
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Is equivalent to:
|
|
58
|
+
#
|
|
59
|
+
# route.host "example.com", path: "/foo", to: :foo
|
|
60
|
+
# route.path "/foo", host: "example.com", to: :foo
|
|
61
|
+
end
|
|
62
|
+
{% endhighlight %}
|
|
63
|
+
|
|
64
|
+
* Dispatches `https://example.com/foo` to `:foo`.
|
|
65
|
+
* Does not dispatch `https://example.com`.
|
|
66
|
+
* Does not dispatch `https://yahoo.com/foo`.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
### Deepest routes override actions
|
|
71
|
+
|
|
72
|
+
{% highlight ruby %}
|
|
73
|
+
class DummyJob < Wayfarer::Job
|
|
74
|
+
route.host "example.com", to: :foo do
|
|
75
|
+
path "/foo", to: :bar
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
{% endhighlight %}
|
|
79
|
+
|
|
80
|
+
* Dispatches `https://example.com/foo` to `:bar`.
|
|
81
|
+
* Does not dispatch `https://example.com`.
|
|
82
|
+
* Does not dispatch `https://yahoo.com/foo`.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
### Forbidding rules
|
|
87
|
+
|
|
88
|
+
{% highlight ruby %}
|
|
89
|
+
class DummyJob < Wayfarer::Job
|
|
90
|
+
route.forbid.path "/foo"
|
|
91
|
+
route.host "example.com", to: :foo
|
|
92
|
+
end
|
|
93
|
+
{% endhighlight %}
|
|
94
|
+
|
|
95
|
+
* Dispatches `https://example.com` to `:foo`.
|
|
96
|
+
* Does not dispatch `https://example.com/foo`.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
---
|
|
2
|
+
layout: default
|
|
3
|
+
title: URI Rules
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# URI rules
|
|
7
|
+
|
|
8
|
+
URI rules match against a string.
|
|
9
|
+
|
|
10
|
+
{% highlight ruby %}
|
|
11
|
+
class DummyJob < Wayfarer::Job
|
|
12
|
+
route.uri "https://example.com"
|
|
13
|
+
end
|
|
14
|
+
{% endhighlight %}
|
|
15
|
+
|
|
16
|
+
Matches:
|
|
17
|
+
|
|
18
|
+
* Only `https://example.com`
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
require_relative "../lib/wayfarer"
|
|
2
|
+
|
|
3
|
+
# Example job: walks a GitHub repository's issue pages and collects one
# record (id, title, author) per issue, printing them once the crawl ends.
class CollectGithubIssues < Wayfarer::Job
  config.connection_count = 4
  config.logger.level = :fatal

  let(:records) { [] }

  routes do
    host "github.com" do
      path "/:user/:repo", to: :repository
      path "/:user/:repo/issues", to: :index
      path "/:user/:repo/issues/:id", to: :show
    end
  end

  after_crawl do
    # Save them somewhere?
    records.each { |record| puts record }
  end

  # Repository landing page: follow the repository navigation links.
  def repository
    stage(navigation_links)
  end

  # Issue listing: follow the listed issues and the pagination link.
  def index
    stage(issue_listing_links, next_page)
  end

  # Single issue: record it, or halt once more than 30 records were collected.
  def show
    return halt if records.count > 30

    record = {
      id: params[:id],
      title: issue_title,
      author: issue_author
    }
    records << record
  end

  private

  def issue_title
    title_nodes = doc.css(".js-issue-title")
    title_nodes.text.strip
  end

  def issue_author
    author_nodes = doc.css(".TableObject-item .author")
    author_nodes.text.strip
  end

  def navigation_links
    page.links(".reponav-item")
  end

  def issue_listing_links
    page.links(".issues-listing")
  end

  def next_page
    page.links(".next_page")
  end
end
|
|
64
|
+
|
|
65
|
+
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require_relative "../lib/wayfarer"
|
|
2
|
+
|
|
3
|
+
# Example job: hops from article to article on the English Wikipedia using
# Selenium-driven Chrome until a page body mentions "Foobar", then takes a
# screenshot and halts.
class FindFoobarOnWikipedia < Wayfarer::Job
  config.http_adapter = :selenium
  config.selenium_argv = [:chrome]
  config.connection_count = 4

  let(:keywords) { [] }

  route.host "en.wikipedia.org", to: :article

  # Keeps crawling while "Foobar" has not been found; otherwise captures the
  # page and stops the crawl.
  def article
    unless page.body =~ /Foobar/
      keywords << page.keywords
      return stage(page.links)
    end

    driver.save_screenshot("/tmp/foobar.png")
    halt
  end
end
|
|
22
|
+
|
|
23
|
+
FindFoobarOnWikipedia.perform_now("https://en.wikipedia.org/wiki/Special:Random")
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ostruct"
|
|
4
|
+
require "securerandom"
|
|
5
|
+
require "forwardable"
|
|
6
|
+
|
|
7
|
+
module Wayfarer
  # Settings object backed by OpenStruct. Every key of DEFAULTS is exposed as
  # a reader/writer pair (e.g. `config.connection_count = 4`).
  #
  # Each instance receives its own copies of the mutable default values, so
  # in-place changes such as `config.selenium_argv << :foo` never leak into
  # DEFAULTS or into other Configuration instances.
  class Configuration < OpenStruct
    extend Forwardable

    DEFAULTS = {
      # Print full stacktraces?
      print_stacktraces: true,

      # Crash when encountering unhandled exceptions?
      reraise_exceptions: false,

      # Allow processing URIs multiple times?
      allow_circulation: false,

      # How many HTTP connections/Selenium drivers to use
      # 1:1 correspondence with spawned threads
      connection_count: 1,

      # Which HTTP adapter to use. Supported are :net_http and :selenium
      http_adapter: :net_http,

      # Which frontier to use.
      frontier: :memory,

      # How long a thread may hold an HTTP adapter.
      # Threads that exceed this limit fail with an exception.
      connection_timeout: Float::INFINITY,

      # How many 3xx redirects to follow. Has no effect when using Selenium
      max_http_redirects: 3,

      # Argument vector for instantiating Selenium drivers
      selenium_argv: [:firefox],

      # Options for instantiating a Redis connection
      redis_opts: {
        host: "localhost",
        port: 6379
      }.freeze,

      # Size of browser windows
      window_size: [1024, 768],

      # Which Mustermann pattern type to use when matching URI paths
      # TODO: Mention in docs
      mustermann_type: :sinatra,

      # Options for instantiating Bloomfilters
      bloomfilter_opts: {
        size: 100,
        hashes: 2,
        seed: 1,
        bucket: 3,
        raise: false
      },

      # Whether to normalize URIs
      normalize_uris: true,

      # URI normalization options
      # See: https://github.com/rwz/normalize_url
      normalize_uri_options: {}
    }.freeze

    # @!attribute [r] uuid
    # @return [String] random UUID generated when the configuration is built
    attr_reader :uuid

    # Builds a fresh defaults hash. DEFAULTS is only shallowly frozen, so its
    # unfrozen array/hash values are duplicated here; frozen values (symbols,
    # numbers, booleans, the frozen redis_opts hash) are reused as they are.
    #
    # @return [Hash{Symbol => Object}]
    def self.default_values
      DEFAULTS.each_with_object({}) do |(key, val), copy|
        copy[key] = val.frozen? ? val : val.dup
      end
    end

    # @param overrides [Hash] values that take precedence over the defaults
    def initialize(overrides = {})
      super(self.class.default_values.merge(overrides))
      @uuid = SecureRandom.uuid
    end

    # Lazily duplicates the library-wide logger so that changes made through
    # this configuration do not touch `Wayfarer.logger` itself.
    def logger
      @logger ||= Wayfarer.logger.dup
    end

    # Sets every key present in DEFAULTS back to a pristine default value.
    # Keys that are not part of DEFAULTS are left untouched.
    def reset!
      self.class.default_values.each { |key, val| self[key] = val }
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "observer"
|
|
4
|
+
require "normalize_url"
|
|
5
|
+
|
|
6
|
+
module Wayfarer
  # Wires a prepared job to a dispatcher, a frontier and a processor, and
  # runs the crawl between the job's :before_crawl and :after_crawl hooks.
  class Crawl
    extend Forwardable
    include Observable

    # The prepared job.
    # @!attribute [r] job
    attr_reader :job

    # @!attribute [r] dispatcher
    attr_reader :dispatcher

    delegate config: :job
    delegate logger: :config

    # @param job the job to crawl with; `#prepare` is called on it and the
    #   result is used from then on
    # @param uris URIs the crawl starts from
    def initialize(job, *uris)
      @job = job.prepare
      @uris = uris
      @dispatcher = Dispatcher.new(@job)
      @processor = Processor.new(@job, frontier, @dispatcher)
    end

    # Runs the crawl. The SIGINT handler installed for the duration of the
    # crawl is always removed again, even when a hook or the processor raises.
    def execute
      trap_signals

      # The observer registers itself with the processor and dispatcher.
      CrawlObserver.new(@processor, @dispatcher, config.logger)

      @job.run_hook(:before_crawl)
      @processor.run(*@uris)
      @job.run_hook(:after_crawl)
    ensure
      untrap_signals
    end

    # A frontier with initially pre-staged URIs.
    # @return [Frontier]
    def frontier
      return @frontier if @frontier

      @frontier = case config.frontier
                  when :memory_trie
                    Frontiers::MemoryTrieFrontier.new(config)
                  when :redis
                    Frontiers::RedisFrontier.new(config)
                  when :memory_bloom
                    Frontiers::MemoryBloomfilter.new(config)
                  when :redis_bloom
                    Frontiers::RedisBloomfilter.new(config)
                  else
                    # Any other value (including :memory) uses the plain
                    # in-memory frontier.
                    Frontiers::MemoryFrontier.new(config)
                  end

      @frontier.extend(Frontiers::NormalizeURIs) if config.normalize_uris

      @frontier.stage(*@uris) # TODO: Test

      @frontier
    end

    private

    # Installs a SIGINT handler for the crawl and remembers the handler that
    # was active before, so it can be chained to and later restored.
    def trap_signals
      @cached_sigint_handler = trap(:INT) do
        # NOTE(review): `halt!` is not defined in this class as shown here;
        # confirm where it is expected to come from.
        halt!
        call_cached_sigint_handler
        exit(-1)
      end
    end

    # Restores the SIGINT handler that was active before #trap_signals.
    # The original handler is reinstalled as-is instead of being wrapped in
    # a new block: `trap` reports built-in handlers as strings such as
    # "DEFAULT", and wrapping those in a block would turn Ctrl-C into a
    # no-op after the crawl.
    def untrap_signals
      trap(:INT, @cached_sigint_handler || "DEFAULT")
    end

    # Invokes the previous SIGINT handler when it is callable. String
    # handlers ("DEFAULT", "IGNORE", ...) and nil are skipped.
    def call_cached_sigint_handler
      handler = @cached_sigint_handler
      handler.call if handler.respond_to?(:call)
    end
  end
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wayfarer
  # Observer that turns crawl events into log messages. It registers itself
  # with every observable handed to the constructor and receives events via
  # #update.
  class CrawlObserver
    # Event objects passed to observers.
    module Events
      FirstCycle = Struct.new(:frontier)
      NewCycle = Struct.new(:current_uris_count)
      DispatchedURI = Struct.new(:action, :uri)
      CycleFinished = Class.new
      Peeking = Struct.new(:uri)
      AboutToCycle = Struct.new(:staged_uris_count)
      MismatchedURI = Struct.new(:uri)
      HaltInitiated = Struct.new(:action, :uri)
      StagingURIs = Struct.new(:staged_uris_count)
      UnhandledError = Struct.new(:exception)
    end

    # Mixin for observables: marks the object as changed and notifies all
    # observers in a single call.
    module ObservableShortcuts
      def notify_observers!(*argv)
        changed
        notify_observers(*argv)
      end
    end

    extend Forwardable

    attr_reader :logger

    # @param observables objects responding to `#add_observer`
    # @param logger object responding to `#debug`, `#info`, `#error` and
    #   `#fatal`
    # @param config optional object responding to `#reraise_exceptions` and
    #   `#print_stacktraces`. When omitted, the values from
    #   Configuration::DEFAULTS are assumed (stack traces printed, errors
    #   logged at :error level).
    def initialize(*observables, logger, config: nil)
      @logger = logger
      @config = config
      observables.each { |obsv| obsv.add_observer(self) }
    end

    # Routes an event to its log handler. Unknown events are ignored.
    #
    # @param event one of the Events classes
    def update(event)
      case event
      when Events::FirstCycle then first_cycle(event)
      when Events::NewCycle then new_cycle(event)
      when Events::DispatchedURI then dispatched_uri(event)
      when Events::CycleFinished then cycle_finished
      when Events::Peeking then peeking(event)
      when Events::AboutToCycle then about_to_cycle(event)
      when Events::MismatchedURI then mismatched_uri(event)
      when Events::HaltInitiated then halt_initiated(event)
      when Events::StagingURIs then staging_uris(event)
      when Events::UnhandledError then unhandled_error(event)
      end
    end

    private

    def first_cycle(event)
      logger.info("First cycle")
      logger.info("Frontier: #{event.frontier}")
    end

    def new_cycle(event)
      logger.info("Current cycle contains #{event.current_uris_count} URI(s)")
    end

    def dispatched_uri(event)
      logger.info("Dispatched to \##{event.action}: #{event.uri}")
    end

    def cycle_finished
      logger.info("No URIs left in current cycle")
    end

    def peeking(event)
      logger.info("Peeking into: #{event.uri}")
    end

    def about_to_cycle(event)
      logger.info("About to cycle. #{event.staged_uris_count} staged URI(s)")
    end

    def mismatched_uri(event)
      logger.debug("No matching route for: #{event.uri}")
    end

    def halt_initiated(event)
      logger.info("Halt initiated from \##{event.action} at: #{event.uri}")
    end

    def staging_uris(event)
      logger.info("Staging #{event.staged_uris_count} URI(s)")
    end

    # Logs an exception raised inside an action. The level is :fatal when
    # exceptions are configured to be re-raised and :error otherwise; the
    # backtrace is appended only when stack traces are enabled.
    def unhandled_error(event)
      level = reraise_exceptions? ? :fatal : :error
      message = "Unhandled exception in an action: #{event.exception.class.inspect}\n"

      if print_stacktraces?
        # Exceptions that were never raised carry a nil backtrace.
        trace = Array(event.exception.backtrace).map(&:to_s).join("\n* ")
        message += "#{trace}\n"
      end

      logger.public_send(level, message)
    end

    # Falls back to the documented default (false) without a configuration.
    def reraise_exceptions?
      @config ? @config.reraise_exceptions : false
    end

    # Falls back to the documented default (true) without a configuration.
    def print_stacktraces?
      @config ? @config.print_stacktraces : true
    end
  end
end
|