wayfarer-jruby 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rbenv-gemsets +1 -0
- data/.rspec +3 -0
- data/.rubocop.yml +21 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/.yardopts +3 -0
- data/Gemfile +11 -0
- data/LICENSE +19 -0
- data/README.md +19 -0
- data/Rakefile +114 -0
- data/benchmark/frontiers.rb +143 -0
- data/bin/wayfarer +116 -0
- data/docs/.gitignore +2 -0
- data/docs/_config.yml +15 -0
- data/docs/_includes/base.html +7 -0
- data/docs/_includes/head.html +10 -0
- data/docs/_includes/navigation.html +172 -0
- data/docs/_layouts/default.html +42 -0
- data/docs/_sass/base.scss +439 -0
- data/docs/_sass/variables.scss +24 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
- data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
- data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
- data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
- data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
- data/docs/_sass/vendor/neat/_neat.scss +23 -0
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
- data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
- data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
- data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
- data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
- data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
- data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
- data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
- data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
- data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
- data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
- data/docs/_sass/vendor/pygments.scss +356 -0
- data/docs/automating_browsers/capybara.md +70 -0
- data/docs/css/screen.scss +7 -0
- data/docs/guides/callbacks.md +45 -0
- data/docs/guides/cli.md +52 -0
- data/docs/guides/configuration.md +184 -0
- data/docs/guides/error_handling.md +46 -0
- data/docs/guides/frontiers.md +93 -0
- data/docs/guides/halting.md +23 -0
- data/docs/guides/job_queues.md +26 -0
- data/docs/guides/locals.md +36 -0
- data/docs/guides/logging.md +22 -0
- data/docs/guides/page_objects.md +67 -0
- data/docs/guides/peeking.md +46 -0
- data/docs/guides/selenium_capybara.md +100 -0
- data/docs/guides/tutorial.md +452 -0
- data/docs/index.md +82 -0
- data/docs/js/navigation.js +11 -0
- data/docs/misc/contributing.md +20 -0
- data/docs/misc/testing.md +11 -0
- data/docs/recipes/authentication.md +23 -0
- data/docs/recipes/csv.md +29 -0
- data/docs/recipes/javascript.md +20 -0
- data/docs/recipes/multiple_uris.md +18 -0
- data/docs/recipes/screenshots.md +20 -0
- data/docs/routing/host_rules.md +24 -0
- data/docs/routing/path_rules.md +33 -0
- data/docs/routing/query_rules.md +69 -0
- data/docs/routing/routes.md +96 -0
- data/docs/routing/uri_rules.md +18 -0
- data/examples/collect_github_issues.rb +65 -0
- data/examples/find_foobar_on_wikipedia.rb +23 -0
- data/lib/wayfarer.rb +65 -0
- data/lib/wayfarer/configuration.rb +86 -0
- data/lib/wayfarer/crawl.rb +79 -0
- data/lib/wayfarer/crawl_observer.rb +103 -0
- data/lib/wayfarer/dispatcher.rb +104 -0
- data/lib/wayfarer/finders.rb +61 -0
- data/lib/wayfarer/frontiers/frontier.rb +79 -0
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
- data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
- data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
- data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
- data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
- data/lib/wayfarer/job.rb +192 -0
- data/lib/wayfarer/locals.rb +40 -0
- data/lib/wayfarer/page.rb +94 -0
- data/lib/wayfarer/parsers/json_parser.rb +20 -0
- data/lib/wayfarer/parsers/xml_parser.rb +27 -0
- data/lib/wayfarer/processor.rb +103 -0
- data/lib/wayfarer/routing/host_rule.rb +19 -0
- data/lib/wayfarer/routing/path_rule.rb +54 -0
- data/lib/wayfarer/routing/query_rule.rb +59 -0
- data/lib/wayfarer/routing/router.rb +71 -0
- data/lib/wayfarer/routing/rule.rb +102 -0
- data/lib/wayfarer/routing/uri_rule.rb +21 -0
- data/spec/configuration_spec.rb +26 -0
- data/spec/crawl_spec.rb +48 -0
- data/spec/finders_spec.rb +49 -0
- data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/memory_frontier_spec.rb +6 -0
- data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
- data/spec/frontiers/normalize_uris_spec.rb +59 -0
- data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
- data/spec/frontiers/redis_frontier_spec.rb +6 -0
- data/spec/http_adapters/adapter_pool_spec.rb +33 -0
- data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
- data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
- data/spec/integration/callbacks_spec.rb +42 -0
- data/spec/integration/locals_spec.rb +106 -0
- data/spec/job_spec.rb +86 -0
- data/spec/page_spec.rb +38 -0
- data/spec/parsers/json_parser_spec.rb +30 -0
- data/spec/parsers/xml_parser_spec.rb +24 -0
- data/spec/processor_spec.rb +31 -0
- data/spec/routing/host_rule_spec.rb +48 -0
- data/spec/routing/path_rule_spec.rb +66 -0
- data/spec/routing/query_rule_spec.rb +124 -0
- data/spec/routing/router_spec.rb +67 -0
- data/spec/routing/rule_spec.rb +218 -0
- data/spec/routing/uri_rule_spec.rb +24 -0
- data/spec/shared/frontier.rb +96 -0
- data/spec/spec_helpers.rb +62 -0
- data/spec/wayfarer_spec.rb +24 -0
- data/support/static/finders.html +38 -0
- data/support/static/graph/details/a.html +10 -0
- data/support/static/graph/details/b.html +10 -0
- data/support/static/graph/index.html +20 -0
- data/support/static/json/dummy.json +13 -0
- data/support/static/links/links.html +28 -0
- data/support/static/xml/dummy.xml +120 -0
- data/support/test_app.rb +45 -0
- data/wayfarer-jruby.gemspec +49 -0
- data/wayfarer.gemspec +53 -0
- metadata +616 -0
data/docs/guides/cli.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: CLI
|
4
|
+
---
|
5
|
+
|
6
|
+
# Command-line interface
|
7
|
+
Wayfarer ships with a small executable, `wayfarer`.
|
8
|
+
|
9
|
+
Job classes are loaded by naming convention, e.g. if you pass `./directory/foo_bar.rb` as the `FILE` parameter, that file is expected to define the class `FooBar`. You can leave off the `.rb` extension.
|
10
|
+
|
11
|
+
## `% wayfarer route FILE URI`
|
12
|
+
Loads the job defined in `FILE`, and prints the first matching route for `URI`.
|
13
|
+
|
14
|
+
## `% wayfarer enqueue FILE URI`
|
15
|
+
Loads and enqueues the job in `FILE`, starting from `URI`.
|
16
|
+
|
17
|
+
* `--log_level LEVEL`
|
18
|
+
Option. Which log messages to print.
|
19
|
+
|
20
|
+
* Default: `info`
|
21
|
+
* Recognized values: `unknown`, `debug`, `error`, `fatal`, `info`, `warn`
|
22
|
+
|
23
|
+
* `--queue_adapter ADAPTER`
|
24
|
+
Option. Which ActiveJob queue adapter to use (e.g. `sidekiq`, `resque`).
|
25
|
+
* Recognized values: strings, see [documentation](http://api.rubyonrails.org/)
|
26
|
+
|
27
|
+
* `--wait VALUE`
|
28
|
+
Option. Point in time when the enqueued job should be run.
|
29
|
+
|
30
|
+
1. If the value can be converted to an integer, it represents the seconds from now.
|
31
|
+
2. If the value can be parsed by `Time::parse`, the job gets scheduled at that point in time.
|
32
|
+
3. If the value is a human-readable time string that [Chronic](https://github.com/mojombo/chronic) can make sense of, the job is scheduled at that point in time.
|
33
|
+
|
34
|
+
__Examples:__
|
35
|
+
|
36
|
+
60 seconds from now:
|
37
|
+
|
38
|
+
```
|
39
|
+
% wayfarer enqueue ./foo_bar http://google.com --wait 60
|
40
|
+
```
|
41
|
+
|
42
|
+
6pm, today:
|
43
|
+
|
44
|
+
```
|
45
|
+
% wayfarer enqueue ./foo_bar http://google.com --wait 18:00
|
46
|
+
```
|
47
|
+
|
48
|
+
Tomorrow:
|
49
|
+
|
50
|
+
```
|
51
|
+
% wayfarer enqueue ./foo_bar http://google.com --wait tomorrow
|
52
|
+
```
|
@@ -0,0 +1,184 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Configuration
|
4
|
+
---
|
5
|
+
|
6
|
+
# Configuration
|
7
|
+
|
8
|
+
All job classes base their configuration off the global one.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
# Setting a key globally applies to all jobs ...
|
12
|
+
Wayfarer.config.key = :value
|
13
|
+
|
14
|
+
class DummyJob < Wayfarer::Job
|
15
|
+
# ... unless a job overrides it
|
16
|
+
config.key = :other_value
|
17
|
+
end
|
18
|
+
|
19
|
+
class DummyJob < Wayfarer::Job
|
20
|
+
# Have it yielded
|
21
|
+
config { |c| c.key = :other_value }
|
22
|
+
end
|
23
|
+
{% endhighlight %}
|
24
|
+
|
25
|
+
## Recognized keys and values
|
26
|
+
|
27
|
+
### `print_stacktraces`
|
28
|
+
* Default: `true`
|
29
|
+
* Recognized values: Booleans
|
30
|
+
|
31
|
+
Whether to print stacktraces when encountering unhandled exceptions in job actions. See [Error handling]({{base}}/guides/error_handling.html).
|
32
|
+
|
33
|
+
---
|
34
|
+
|
35
|
+
### `reraise_exceptions`
|
36
|
+
|
37
|
+
* Default: `false`
|
38
|
+
* Recognized values: Booleans
|
39
|
+
|
40
|
+
Whether to crash when encountering unhandled exceptions in job actions. See [Error handling]({{base}}/guides/error_handling.html).
|
41
|
+
|
42
|
+
---
|
43
|
+
|
44
|
+
### `allow_circulation`
|
45
|
+
|
46
|
+
* Default: `false`
|
47
|
+
* Recognized values: Booleans
|
48
|
+
|
49
|
+
Whether URIs may be visited twice.
|
50
|
+
|
51
|
+
<aside class="note">
|
52
|
+
Allowing circulation might cause your jobs to not terminate.
|
53
|
+
</aside>
|
54
|
+
|
55
|
+
---
|
56
|
+
|
57
|
+
### `normalize_uris`
|
58
|
+
|
59
|
+
* Default: `true`
|
60
|
+
* Recognized values: Booleans
|
61
|
+
|
62
|
+
Whether to strip fragments, reorder query keys, etc. when staging and caching URIs. Customizable with the `:normalize_uri_options` key. See [normalize_url](https://github.com/rwz/normalize_url).
|
63
|
+
|
64
|
+
---
|
65
|
+
|
66
|
+
### `normalize_uri_options`
|
67
|
+
|
68
|
+
* Default: `{}`
|
69
|
+
* Recognized values: See [normalize_url](https://github.com/rwz/normalize_url).
|
70
|
+
|
71
|
+
---
|
72
|
+
|
73
|
+
### `frontier`
|
74
|
+
* Default: `:memory`
|
75
|
+
* Recognized values: See [(Redis) frontiers](frontiers.html).
|
76
|
+
|
77
|
+
Which frontier to use.
|
78
|
+
|
79
|
+
<aside class="note">
|
80
|
+
Bloom filters may yield false positives. See the <a href="https://en.wikipedia.org/wiki/Bloom_filter">Wikipedia article</a>.
|
81
|
+
</aside>
|
82
|
+
|
83
|
+
---
|
84
|
+
|
85
|
+
### `connection_count`
|
86
|
+
|
87
|
+
* Default: `4`
|
88
|
+
* Recognized values: Integers
|
89
|
+
|
90
|
+
How many threads and HTTP adapters to use (1:1 correspondence).
|
91
|
+
|
92
|
+
---
|
93
|
+
|
94
|
+
### `http_adapter`
|
95
|
+
|
96
|
+
* Default: `:net_http`
|
97
|
+
* Recognized values: `:net_http`, `:selenium`
|
98
|
+
|
99
|
+
Which HTTP adapter to use. See [Selenium & Capybara](selenium_capybara.html).
|
100
|
+
|
101
|
+
---
|
102
|
+
|
103
|
+
### `connection_timeout`
|
104
|
+
|
105
|
+
* Default: `Float::INFINITY`
|
106
|
+
* Recognized values: Floats
|
107
|
+
|
108
|
+
Time in seconds that a job instance may hold an HTTP adapter. Instances that exceed this time limit raise an exception.
|
109
|
+
|
110
|
+
---
|
111
|
+
|
112
|
+
### `max_http_redirects`
|
113
|
+
|
114
|
+
* Default: `3`
|
115
|
+
* Recognized values: Integers
|
116
|
+
|
117
|
+
How many 3xx redirects to follow.
|
118
|
+
|
119
|
+
<aside class="note">
|
120
|
+
Has no effect when using the <code>:selenium</code> HTTP adapter.
|
121
|
+
</aside>
|
122
|
+
|
123
|
+
---
|
124
|
+
|
125
|
+
### `selenium_argv`
|
126
|
+
|
127
|
+
* Default: `[:firefox]`
|
128
|
+
* Recognized values: See [Selenium & Capybara](selenium_capybara.html)
|
129
|
+
|
130
|
+
Argument vector passed to [`Selenium::WebDriver::Driver::for`](http://www.rubydoc.info/gems/selenium-webdriver/Selenium/WebDriver/Driver#for-class_method).
|
131
|
+
|
132
|
+
---
|
133
|
+
|
134
|
+
### `redis_opts`
|
135
|
+
|
136
|
+
* Default: `{ host: "localhost", port: 6379 }`
|
137
|
+
* Recognized values: [See documentation](http://www.rubydoc.info/github/redis/redis-rb/Redis%3Ainitialize)
|
138
|
+
|
139
|
+
Options passed to [`Redis#initialize`](http://www.rubydoc.info/github/redis/redis-rb/Redis%3Ainitialize).
|
140
|
+
|
141
|
+
---
|
142
|
+
|
143
|
+
### `bloomfilter_opts`
|
144
|
+
|
145
|
+
* Default:
|
146
|
+
```
|
147
|
+
{
|
148
|
+
size: 100,
|
149
|
+
hashes: 2,
|
150
|
+
seed: 1,
|
151
|
+
bucket: 3,
|
152
|
+
raise: false
|
153
|
+
}
|
154
|
+
```
|
155
|
+
* Recognized values:
|
156
|
+
* `size`: Integers; number of buckets in a bloom filter
|
157
|
+
* `hashes`: Integers; number of hash functions
|
158
|
+
* `seed`: Integers; seed of hash functions
|
159
|
+
* `bucket`: Integers; number of bits in a bloom filter bucket
|
160
|
+
* `raise`: Booleans; whether to raise on bucket overflow
|
161
|
+
|
162
|
+
Options for [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb).
|
163
|
+
|
164
|
+
---
|
165
|
+
|
166
|
+
### `window_size`
|
167
|
+
|
168
|
+
* Default: `[1024, 768]`
|
169
|
+
* Recognized values: `[Integer, Integer]`
|
170
|
+
|
171
|
+
Dimensions of browser windows.
|
172
|
+
|
173
|
+
<aside class="note">
|
174
|
+
Only has an effect when using the <code>:selenium</code> HTTP adapter.
|
175
|
+
</aside>
|
176
|
+
|
177
|
+
---
|
178
|
+
|
179
|
+
### `mustermann_type`
|
180
|
+
|
181
|
+
* Default: `:sinatra`
|
182
|
+
* Recognized values: [See documentation](https://github.com/sinatra/mustermann)
|
183
|
+
|
184
|
+
Which [Mustermann](https://github.com/sinatra/mustermann) pattern type to use.
|
@@ -0,0 +1,46 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Error handling
|
4
|
+
---
|
5
|
+
|
6
|
+
# Error handling
|
7
|
+
By default, all exceptions raised within actions are swallowed and only their stacktraces printed to stderr. This behaviour can be changed with two configuration keys (see [Configuration]({{base}}/guides/configuration.html)):
|
8
|
+
|
9
|
+
1. `print_stacktraces`: Whether to print stacktraces (default: `true`)
|
10
|
+
2. `reraise_exceptions`: Whether to crash when encountering unhandled exceptions (default: `false`)
|
11
|
+
|
12
|
+
Here’s an example to illustrate the default behaviour:
|
13
|
+
|
14
|
+
{% highlight ruby %}
|
15
|
+
class DummyJob < Wayfarer::Job
|
16
|
+
def example
|
17
|
+
# Makes this instance fail, but processing goes on
|
18
|
+
# Prints the stacktrace to stderr
|
19
|
+
fail "It's okay, life goes on"
|
20
|
+
end
|
21
|
+
end
|
22
|
+
{% endhighlight %}
|
23
|
+
|
24
|
+
The following reraises all exceptions, stops processing and returns with a non-zero exit code:
|
25
|
+
|
26
|
+
{% highlight ruby %}
|
27
|
+
class DummyJob < Wayfarer::Job
|
28
|
+
config.reraise_exceptions = true
|
29
|
+
|
30
|
+
def example
|
31
|
+
fail "This makes the exception bubble up"
|
32
|
+
end
|
33
|
+
end
|
34
|
+
{% endhighlight %}
|
35
|
+
|
36
|
+
And if you don’t want to be bothered with exceptions at all:
|
37
|
+
|
38
|
+
{% highlight ruby %}
|
39
|
+
class DummyJob < Wayfarer::Job
|
40
|
+
config.print_stacktraces = false
|
41
|
+
|
42
|
+
def example
|
43
|
+
fail "No one will know about this ..."
|
44
|
+
end
|
45
|
+
end
|
46
|
+
{% endhighlight %}
|
@@ -0,0 +1,93 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Frontiers
|
4
|
+
---
|
5
|
+
|
6
|
+
# Frontiers
|
7
|
+
|
8
|
+
Frontiers keep track of three sets of URIs:
|
9
|
+
|
10
|
+
* Current URIs that are being processed
|
11
|
+
* Staged URIs that might be processed in the next cycle
|
12
|
+
* Cached URIs that have been processed
|
13
|
+
|
14
|
+
All frontiers expose the same behaviour.
|
15
|
+
|
16
|
+
<pre class="illustration">
|
17
|
+
┌──────────────────────────────────────────────────────────┐
|
18
|
+
│ STAGED │
|
19
|
+
│ {https://alpha.com, https://beta.com} │
|
20
|
+
└──────────────────────────────────────────────────────────┘
|
21
|
+
┌──────────────────────────────────────────────────────────┐
|
22
|
+
│ CURRENT │
|
23
|
+
│ {https://gamma.com} │
|
24
|
+
└──────────────────────────────────────────────────────────┘
|
25
|
+
┌──────────────────────────────────────────────────────────┐
|
26
|
+
│ CACHED │
|
27
|
+
│ {https://beta.com} │
|
28
|
+
└──────────────────────────────────────────────────────────┘
|
29
|
+
│
|
30
|
+
Cycle
|
31
|
+
│
|
32
|
+
▼
|
33
|
+
┌──────────────────────────────────────────────────────────┐
|
34
|
+
│ STAGED' │
|
35
|
+
│ {...} │
|
36
|
+
└──────────────────────────────────────────────────────────┘
|
37
|
+
┌──────────────────────────────────────────────────────────┐
|
38
|
+
│ CURRENT' = STAGED \ CACHED │
|
39
|
+
│ {https://alpha.com} │
|
40
|
+
└──────────────────────────────────────────────────────────┘
|
41
|
+
┌──────────────────────────────────────────────────────────┐
|
42
|
+
│ CACHED' = CACHED ∪ CURRENT │
|
43
|
+
│ {https://beta.com, https://gamma.com} │
|
44
|
+
└──────────────────────────────────────────────────────────┘
|
45
|
+
</pre>
|
46
|
+
|
47
|
+
## Available frontiers
|
48
|
+
Currently, there are 5 frontiers available:
|
49
|
+
|
50
|
+
2. `:memory` (default): Uses sets from the standard lib.
|
51
|
+
4. `:redis`: Uses Redis sets.
|
52
|
+
3. `:memory_bloom`: Uses a [Bloom filter](https://github.com/igrigorik/bloomfilter-rb).
|
53
|
+
5. `:redis_bloom`: Uses a Redis-backed Bloom filter.
|
54
|
+
1. `:memory_trie`: Uses a [trie](https://github.com/tyler/trie) and sets.
|
55
|
+
|
56
|
+
| Frontier | MRI support | JRuby support |
|
57
|
+
| --- | --- | --- |
|
58
|
+
| `:memory` | Yes | Yes
|
59
|
+
| `:redis` | Yes | Yes
|
60
|
+
| `:memory_bloom` | Yes | No
|
61
|
+
| `:redis_bloom` | Yes | No
|
62
|
+
| `:memory_trie` | Yes | No
|
63
|
+
|
64
|
+
## Setting the frontier
|
65
|
+
|
66
|
+
Set the `:frontier` configuration key:
|
67
|
+
|
68
|
+
{% highlight ruby %}
|
69
|
+
class DummyJob < Wayfarer::Job
|
70
|
+
config.frontier = :foobar
|
71
|
+
end
|
72
|
+
{% endhighlight %}
|
73
|
+
|
74
|
+
### Using a Redis frontier
|
75
|
+
|
76
|
+
Set the `:redis_opts` and `:frontier` configuration keys:
|
77
|
+
|
78
|
+
{% highlight ruby %}
|
79
|
+
class DummyJob < Wayfarer::Job
|
80
|
+
config.redis_opts = { port: 4242 }
|
81
|
+
config.frontier = :redis
|
82
|
+
end
|
83
|
+
{% endhighlight %}
|
84
|
+
|
85
|
+
### Setting bloomfilter parameters
|
86
|
+
|
87
|
+
Set the `:bloomfilter_opts` configuration key:
|
88
|
+
|
89
|
+
{% highlight ruby %}
|
90
|
+
class DummyJob < Wayfarer::Job
|
91
|
+
config.bloomfilter_opts = { ... }
|
92
|
+
end
|
93
|
+
{% endhighlight %}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Halting
|
4
|
+
---
|
5
|
+
|
6
|
+
# Halting
|
7
|
+
Processing can be stopped by calling `#halt` within actions.
|
8
|
+
|
9
|
+
Calling `#halt` does not abort the current action immediately. Instead, it sets a halting flag internally, and once the action returns, all threads will stop instead of processing further URIs.
|
10
|
+
|
11
|
+
Job instances run in separate threads. When a job signals that it wants to halt, all other threads will finish their current work, but will not process any further URIs. All instances have the chance to get their current work done.
|
12
|
+
|
13
|
+
{% highlight ruby %}
|
14
|
+
class DummyJob < Wayfarer::Job
|
15
|
+
def example
|
16
|
+
halt
|
17
|
+
puts "This will be printed!"
|
18
|
+
|
19
|
+
return halt
|
20
|
+
puts "This will not be printed!"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
{% endhighlight %}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Job queues
|
4
|
+
---
|
5
|
+
|
6
|
+
# Job queues
|
7
|
+
|
8
|
+
Thanks to [ActiveJob](http://edgeguides.rubyonrails.org/active_job_basics.html), jobs can be enqueued with various backends, e.g. Sidekiq or Resque:
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
# Overrides ActiveJob's global setting
|
13
|
+
self.queue_adapter = :resque
|
14
|
+
|
15
|
+
# Identifier for enqueued jobs
|
16
|
+
queue_as :dummy_job
|
17
|
+
|
18
|
+
# Alternatively, pass a block
|
19
|
+
queue_as do
|
20
|
+
[:first, :second].sample
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Alternatively, set the queue explicitly on call:
|
25
|
+
DummyJob.set(queue: :something_else).perform_later(*uris)
|
26
|
+
{% endhighlight %}
|
@@ -0,0 +1,36 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Locals
|
4
|
+
---
|
5
|
+
|
6
|
+
# Locals
|
7
|
+
|
8
|
+
Locals are Wayfarer's replacement for job instance variables. Both `let` and `let!` declare variables that are accessible within [callbacks]({{base}}/guides/callbacks.html) and actions.
|
9
|
+
|
10
|
+
Even though you might recognise them from RSpec, they have differing semantics: Values in `let` blocks will be replaced with thread-safe counterparts once the job is run. `let!` skips this. Both evaluate their block immediately.
|
11
|
+
|
12
|
+
| Standard lib | Counterpart |
|
13
|
+
| --- | --- |
|
14
|
+
| Booleans | [`Concurrent::AtomicBoolean`](http://ruby-concurrency.github.io/concurrent-ruby/Concurrent/AtomicBoolean.html) |
|
15
|
+
| `Fixnum` | [`Concurrent::AtomicFixnum`](http://ruby-concurrency.github.io/concurrent-ruby/Concurrent/AtomicFixnum.html) |
|
16
|
+
| `Hash` | [`Concurrent::Hash`](http://ruby-concurrency.github.io/concurrent-ruby/Concurrent/Hash.html) |
|
17
|
+
| `Array` | [`Concurrent::Array`](http://ruby-concurrency.github.io/concurrent-ruby/Concurrent/Array.html) |
|
18
|
+
| Everything else | Untouched |
|
19
|
+
|
20
|
+
{% highlight ruby %}
|
21
|
+
class DummyJob < Wayfarer::Job
|
22
|
+
let(:values) { [1, 2, 3] }
|
23
|
+
|
24
|
+
before_crawl do
|
25
|
+
values.reverse!
|
26
|
+
end
|
27
|
+
|
28
|
+
after_crawl do
|
29
|
+
values # => [3, 2, 1, 0]
|
30
|
+
end
|
31
|
+
|
32
|
+
def some_action
|
33
|
+
values << 0
|
34
|
+
end
|
35
|
+
end
|
36
|
+
{% endhighlight %}
|