wayfarer 0.4.6 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +27 -0
- data/.github/workflows/release.yaml +30 -0
- data/.github/workflows/tests.yaml +21 -0
- data/.gitignore +5 -1
- data/.rubocop.yml +36 -0
- data/.vale.ini +8 -0
- data/.yardopts +1 -3
- data/Dockerfile +6 -4
- data/Gemfile +24 -0
- data/Gemfile.lock +274 -164
- data/Rakefile +7 -51
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +23 -13
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/callbacks.md +25 -125
- data/docs/guides/cli.md +71 -0
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs.md +142 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +103 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +78 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +156 -0
- data/docs/guides/tasks.md +53 -9
- data/docs/guides/tutorial.md +66 -0
- data/docs/guides/user_agents.md +115 -0
- data/docs/index.md +17 -40
- data/lib/wayfarer/base.rb +125 -46
- data/lib/wayfarer/batch_completion.rb +60 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +85 -89
- data/lib/wayfarer/cli.rb +103 -0
- data/lib/wayfarer/gc.rb +18 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +59 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +22 -13
- data/lib/wayfarer/middleware/dispatch.rb +17 -4
- data/lib/wayfarer/middleware/normalize.rb +7 -14
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +31 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +14 -3
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +14 -7
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +23 -13
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +34 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +21 -0
- data/lib/wayfarer/redis/barrier.rb +26 -21
- data/lib/wayfarer/redis/counter.rb +18 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +166 -30
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -31
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +17 -1
- data/lib/wayfarer/routing/route.rb +66 -19
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +21 -30
- data/lib/wayfarer/task.rb +9 -17
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +72 -5
- data/mise.toml +2 -0
- data/mkdocs.yml +44 -8
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +9 -0
- data/rake/release.rake +23 -0
- data/rake/tests.rake +32 -0
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +46 -2
- data/spec/spec_helpers.rb +55 -51
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +66 -37
- data/spec/wayfarer/base_spec.rb +200 -0
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/wayfarer/handler_spec.rb +9 -0
- data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/wayfarer/integration/gc_spec.rb +40 -0
- data/spec/wayfarer/integration/handler_spec.rb +65 -0
- data/spec/wayfarer/integration/page_spec.rb +79 -0
- data/spec/wayfarer/integration/params_spec.rb +64 -0
- data/spec/wayfarer/integration/parsing_spec.rb +99 -0
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/wayfarer/integration/stage_spec.rb +58 -0
- data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
- data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/wayfarer/middleware/router_spec.rb +102 -0
- data/spec/wayfarer/middleware/stage_spec.rb +63 -0
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/wayfarer/page_spec.rb +69 -0
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
- data/spec/wayfarer/redis/barrier_spec.rb +39 -0
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/task_spec.rb +13 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +18 -28
- metadata +797 -265
- data/.github/workflows/ci.yaml +0 -32
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/cli.md +0 -61
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/base_spec.rb +0 -224
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/cli/job_spec.rb +0 -78
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/fixtures/dummy_job.rb +0 -7
- data/spec/gc_spec.rb +0 -59
- data/spec/handler_spec.rb +0 -11
- data/spec/integration/callbacks_spec.rb +0 -85
- data/spec/integration/page_spec.rb +0 -62
- data/spec/integration/params_spec.rb +0 -56
- data/spec/integration/stage_spec.rb +0 -51
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/middleware/dedup_spec.rb +0 -88
- data/spec/middleware/dispatch_spec.rb +0 -43
- data/spec/middleware/fetch_spec.rb +0 -155
- data/spec/middleware/normalize_spec.rb +0 -29
- data/spec/middleware/router_spec.rb +0 -105
- data/spec/middleware/stage_spec.rb +0 -62
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/page_spec.rb +0 -47
- data/spec/parsing/xml_spec.rb +0 -25
- data/spec/redis/barrier_spec.rb +0 -78
- data/spec/redis/counter_spec.rb +0 -32
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/spec/routing/router_spec.rb +0 -24
- data/spec/task_spec.rb +0 -34
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
data/docs/cookbook/navigation.md
CHANGED
@@ -4,12 +4,12 @@
|
|
4
4
|
|
5
5
|
```ruby
|
6
6
|
class DummyJob < Wayfarer::Base
|
7
|
-
route
|
7
|
+
route.to :index
|
8
8
|
|
9
9
|
def index
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
user_agent.goto("https://example.com")
|
11
|
+
user_agent.back
|
12
|
+
user_agent.forward
|
13
13
|
end
|
14
14
|
end
|
15
15
|
```
|
@@ -18,12 +18,12 @@
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
user_agent.navigate.to("https://example.com")
|
25
|
+
user_agent.navigate.back
|
26
|
+
user_agent.navigate.forward
|
27
27
|
end
|
28
28
|
end
|
29
29
|
```
|
@@ -32,12 +32,12 @@
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
user_agent.visit("https://example.com")
|
39
|
+
user_agent.go_back
|
40
|
+
user_agent.go_forward
|
41
41
|
end
|
42
42
|
end
|
43
43
|
```
|
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
page.doc.css("html")
|
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
class DummyJob < Wayfarer::Base
|
22
|
-
route
|
22
|
+
route.to :index
|
23
23
|
|
24
24
|
def index
|
25
25
|
browser.at_css("html")
|
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
38
|
browser.find_elements(css: "html")
|
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
browser.screenshot(path: "screenshot.png")
|
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
24
|
browser.save_screenshot("screenshot.png")
|
data/docs/guides/callbacks.md
CHANGED
@@ -1,145 +1,45 @@
|
|
1
1
|
# Callbacks
|
2
2
|
|
3
|
-
|
3
|
+
Wayfarer supports a number of callbacks in addition to
|
4
|
+
[ActiveJob callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
|
4
5
|
|
5
|
-
|
6
|
+
## Available callbacks
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
before_fetch :do_something
|
15
|
-
|
16
|
-
private
|
17
|
-
|
18
|
-
def do_something
|
19
|
-
# before the task.url is fetched
|
20
|
-
end
|
21
|
-
end
|
22
|
-
```
|
23
|
-
|
24
|
-
## `before_action`
|
25
|
-
|
26
|
-
Runs after a page was fetched, before an action method is called.
|
27
|
-
|
28
|
-
```ruby
|
29
|
-
class DummyJob < Wayfarer::Base
|
30
|
-
before_action :do_something
|
31
|
-
|
32
|
-
private
|
33
|
-
|
34
|
-
def do_something
|
35
|
-
# page is available at this point
|
36
|
-
end
|
37
|
-
end
|
38
|
-
```
|
8
|
+
* `before_fetch`
|
9
|
+
* `around_fetch`
|
10
|
+
* `after_fetch`
|
11
|
+
* `before_action`
|
12
|
+
* `around_action`
|
13
|
+
* `after_action`
|
14
|
+
* `after_batch`
|
39
15
|
|
40
16
|
## `after_batch`
|
41
17
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
after_batch do
|
47
|
-
# All jobs in batch done
|
48
|
-
end
|
49
|
-
end
|
50
|
-
```
|
51
|
-
|
52
|
-
Internally, a batch counter is in-/decremented on certain events. Once the
|
53
|
-
counter reaches zero, `after_batch` callbacks runs in declaration order.
|
54
|
-
|
55
|
-
The counter is incremented when within the batch:
|
18
|
+
You can register `after_batch` callbacks that run when there are no more tasks
|
19
|
+
to process in a batch. Wayfarer instruments job execution and in- or decrements
|
20
|
+
an integer counter in Redis on certain events. When the counter reaches zero,
|
21
|
+
the current job's `after_batch` callbacks run.
|
56
22
|
|
57
|
-
|
23
|
+
!!! info "`after_batch` callbacks fire at most once per batch."
|
58
24
|
|
59
|
-
|
25
|
+
## Conditional callbacks
|
60
26
|
|
61
|
-
|
62
|
-
|
63
|
-
* A job is discarded due to an exception.
|
64
|
-
* A job errors and thereyby exhausts its maximum attempts.
|
65
|
-
|
66
|
-
!!! attention "Batch callbacks can fail jobs"
|
67
|
-
|
68
|
-
If the last job's `after_batch` callbacks raises an exception, this can lead
|
69
|
-
to the job getting retried. If the exception raised by the callback is
|
70
|
-
unhandled or discarded, the callback never fully runs.
|
71
|
-
|
72
|
-
## Callback options
|
73
|
-
|
74
|
-
### Definition styles
|
75
|
-
|
76
|
-
Callbacks can be registered either by supplying a block or a symbol identifying
|
77
|
-
a callback instance method:
|
27
|
+
You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
|
28
|
+
keywords, for example to run a callback for some route `action` only:
|
78
29
|
|
79
30
|
```ruby
|
80
|
-
class DummyJob <
|
81
|
-
|
82
|
-
# ...
|
83
|
-
end
|
31
|
+
class DummyJob < ActiveJob::Base
|
32
|
+
include Wayfarer::Base
|
84
33
|
|
85
|
-
|
34
|
+
route.host "example.com", to: :example
|
35
|
+
route.to :fallback
|
86
36
|
|
87
|
-
|
88
|
-
|
89
|
-
def my_callback
|
37
|
+
before_action unless: -> { action == :fallback } do
|
90
38
|
# ...
|
91
39
|
end
|
92
|
-
end
|
93
|
-
```
|
94
|
-
|
95
|
-
### Conditionals
|
96
|
-
|
97
|
-
Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
class DummyJob < Wayfarer::Base
|
101
|
-
before_fetch :my_callback, if: :my_condition
|
102
|
-
|
103
|
-
private
|
104
|
-
|
105
|
-
def my_callback
|
106
|
-
end
|
107
|
-
|
108
|
-
def my_condition
|
109
|
-
end
|
110
|
-
end
|
111
|
-
```
|
112
|
-
|
113
|
-
Callbacks can be registered for certain action methods only with the `:only` and
|
114
|
-
`:except` keywords:
|
115
|
-
|
116
|
-
```ruby
|
117
|
-
class DummyJob < Wayfarer::Base
|
118
|
-
before_fetch :do_something, only: :foo
|
119
|
-
|
120
|
-
before_fetch except: [:foo, :qux] do
|
121
|
-
# runs only before bar
|
122
|
-
end
|
123
40
|
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
def bar
|
128
|
-
end
|
41
|
+
# ...
|
129
42
|
end
|
130
|
-
|
131
43
|
```
|
132
44
|
|
133
|
-
|
134
|
-
|
135
|
-
Callbacks that return `false` halt the callback chain:
|
136
|
-
|
137
|
-
```ruby
|
138
|
-
class DummyJob < Wayfarer::Base
|
139
|
-
before_action { false }
|
140
|
-
|
141
|
-
before_action do
|
142
|
-
# never runs
|
143
|
-
end
|
144
|
-
end
|
145
|
-
```
|
45
|
+
You can also pass a symbol instead of a block to call an instance method.
|
data/docs/guides/cli.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# wayfarer
|
2
|
+
|
3
|
+
The command-line interface to Wayfarer.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```
|
8
|
+
wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
|
9
|
+
```
|
10
|
+
|
11
|
+
See [Configuration](../reference/cli) for the respected environment variables.
|
12
|
+
|
13
|
+
---
|
14
|
+
|
15
|
+
## `wayfarer perform JOB URL`
|
16
|
+
|
17
|
+
: Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
|
18
|
+
Staged jobs are ignored.
|
19
|
+
|
20
|
+
##### Options
|
21
|
+
|
22
|
+
* `--mock-redis`: Use an in-memory implementation of Redis instead of
|
23
|
+
talking to an actual server.
|
24
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
25
|
+
|
26
|
+
---
|
27
|
+
|
28
|
+
## `wayfarer enqueue JOB URL`
|
29
|
+
|
30
|
+
: Enqueues a task for `JOB` with `URL` to the message queue.
|
31
|
+
|
32
|
+
##### Options
|
33
|
+
|
34
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
35
|
+
|
36
|
+
---
|
37
|
+
|
38
|
+
## `wayfarer execute JOB URL`
|
39
|
+
|
40
|
+
: Execute `JOB` with `URL` with the in-memory
|
41
|
+
[Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
|
42
|
+
instead of writing the task to an actual message queue. Blocks until the
|
43
|
+
batch has completed.
|
44
|
+
|
45
|
+
##### Options
|
46
|
+
|
47
|
+
* `--mock-redis`: Use an in-memory implementation of Redis instead of
|
48
|
+
talking to an actual server.
|
49
|
+
* `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
|
50
|
+
* `--min-threads`: Minimum number of threads to use. Default: 1
|
51
|
+
* `--max-threads`: Maximum number of threads to use. Default: 1
|
52
|
+
|
53
|
+
!!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
|
54
|
+
|
55
|
+
You need to set the `wait: 0` option on `retry_on` in order for
|
56
|
+
`wayfarer execute` to execute retries:
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
retry_on StandardError, attempts: 3, wait: 0
|
60
|
+
```
|
61
|
+
---
|
62
|
+
|
63
|
+
## `wayfarer route JOB URL`
|
64
|
+
|
65
|
+
: Prints the result of invoking `JOB`'s router with `URL`.
|
66
|
+
|
67
|
+
---
|
68
|
+
|
69
|
+
## `wayfarer tree JOB URL`
|
70
|
+
|
71
|
+
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
@@ -1,39 +1,14 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
1. Using [environment variables](/reference/environment_variables)
|
6
|
-
2. Using runtime configuration
|
7
|
-
|
8
|
-
## Runtime configuration
|
9
|
-
|
10
|
-
Wayfarer parses environment variables into a runtime configuration
|
11
|
-
`Wayfarer::Config`. The configuration can then be altered or replaced via
|
12
|
-
`Wayfarer.config`:
|
13
|
-
|
14
|
-
```ruby
|
15
|
-
# Which user agent to use to process tasks
|
16
|
-
Wayfarer.config.network.agent = :http # or :ferrum, :selenium
|
1
|
+
---
|
2
|
+
hide:
|
3
|
+
- toc
|
4
|
+
---
|
17
5
|
|
18
|
-
#
|
19
|
-
Wayfarer.config.network.pool_size = 3
|
20
|
-
|
21
|
-
# How long an agent may be used while processing a task
|
22
|
-
Wayfarer.config.network.pool_timeout = 5000
|
23
|
-
|
24
|
-
# Ferrum options
|
25
|
-
Wayfarer.config.ferrum.options = {}
|
26
|
-
|
27
|
-
# Selenium driver to use
|
28
|
-
Wayfarer.config.selenium.driver = :chrome
|
29
|
-
|
30
|
-
# Selenium HTTP client read timeout
|
31
|
-
Wayfarer.config.selenium.client_timeout = 10 # seconds
|
6
|
+
# Configuration
|
32
7
|
|
33
|
-
|
34
|
-
Wayfarer.config.selenium.options = { url: "http://chrome" }
|
8
|
+
You can configure Wayfarer by assigning to `Wayfarer.config` which defaults to:
|
35
9
|
|
36
|
-
|
37
|
-
Wayfarer
|
10
|
+
```rb
|
11
|
+
module Wayfarer
|
12
|
+
--8<-- "lib/wayfarer.rb:48:96"
|
13
|
+
end
|
38
14
|
```
|
39
|
-
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# Development
|
2
|
+
|
3
|
+
## Release Procedure
|
4
|
+
|
5
|
+
1. Ensure `Wayfarer::VERSION` was bumped appropriately.
|
6
|
+
2. Ensure the version in wayfarer.gemspec matches.
|
7
|
+
3. Open a release Pull Request develop -> master branch
|
8
|
+
4. Merge the Pull Request
|
9
|
+
5. Publish RubyGem and git tag as follows:
|
10
|
+
|
11
|
+
```
|
12
|
+
git checkout master
|
13
|
+
git pull origin master --rebase
|
14
|
+
bundle exec rake build
|
15
|
+
gem push build/wayfarer-*.gem
|
16
|
+
bundle exec rake clean
|
17
|
+
git tag <VERSION>
|
18
|
+
git push origin <VERSION>
|
19
|
+
```
|
20
|
+
|
21
|
+
## Conventions and guidelines
|
22
|
+
|
23
|
+
* In source code, `url` refers to strings and `uri` refers to `Addressable::URI`
|
24
|
+
* Avoid writing bash at all costs. Use Ruby instead
|
25
|
+
|
26
|
+
## Design decisions and architecture
|
27
|
+
|
28
|
+
### Navigate the web along URL patterns
|
29
|
+
|
30
|
+
URLs are less prone to change than served markup.
|
31
|
+
One reason for this is that changes to a URL's path can have negative
|
32
|
+
consequences for its page ranking in search engines. Websites naturally implement
|
33
|
+
architectural URL patterns like REST or expose surrogate keys.
|
34
|
+
|
35
|
+
### Follow URLs verbatim as they appear in responses
|
36
|
+
|
37
|
+
Normalized URLs are useful for deduplication, but URLs should be followed
|
38
|
+
as they appear in responses. Navigating to normalized versions of URLs makes
|
39
|
+
crawlers stick out from other user agents.
|
40
|
+
|
41
|
+
### Tasks are version-less and don't persist metadata
|
42
|
+
|
43
|
+
Tasks serialize to their URL and batch. No other data gets written to
|
44
|
+
the message queue. There is also no need for versioning persisted tasks, since
|
45
|
+
there will never be more to a task than URL and batch. All task metadata
|
46
|
+
is ephemeral.
|
47
|
+
|
48
|
+
### Why depend on Redis
|
49
|
+
|
50
|
+
There are two core features that depend on Redis. First, per-batch acyclicity is
|
51
|
+
achieved by maintaining the set of processed URLs per batch in Redis.
|
52
|
+
There's no option to follow links in a cyclic manner. Second, batch completion
|
53
|
+
requires updating an integer value in Redis, and batch completion is a very
|
54
|
+
useful feature, since most crawls should end eventually, and often you want to
|
55
|
+
know when.
|
56
|
+
|
57
|
+
### No configuration files
|
58
|
+
|
59
|
+
Wayfarer can be configured through `Wayfarer.config` only, because `Wayfarer.config`
|
60
|
+
may contain Ruby objects that don't de/serialize well, such as `Proc`s or `Set`s.
|
61
|
+
|
62
|
+
### Features out of scope
|
63
|
+
|
64
|
+
Wayfarer won't provide:
|
65
|
+
|
66
|
+
* persistence or any sort of DOM data mapping abstractions
|
67
|
+
* URL generation helpers
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Handlers
|
2
|
+
|
3
|
+
Handlers are like [jobs](/jobs) but they don't inherit from `ActiveJob::Base`
|
4
|
+
which is why they can't affect the message queue directly themselves.
|
5
|
+
Instead, jobs and handlers can route tasks to other handlers. Handlers
|
6
|
+
themselves have routes, but they can be bypassed.
|
7
|
+
|
8
|
+
## Handler capabilities
|
9
|
+
|
10
|
+
Like jobs, handlers support:
|
11
|
+
|
12
|
+
* URL routing
|
13
|
+
* staging tasks with `#!ruby stage(*urls)`
|
14
|
+
* access to the `user_agent` that retrieved the `page`
|
15
|
+
* ad-hoc HTTP requests with `#!ruby fetch(url)`
|
16
|
+
* callbacks, but only a subset of job callbacks
|
17
|
+
* Content-Type filtering
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class ExampleHandler
|
21
|
+
include Wayfarer::Handler
|
22
|
+
|
23
|
+
route.to :index
|
24
|
+
|
25
|
+
def index
|
26
|
+
task # => #<Wayfarer::Task>
|
27
|
+
page # => #<Wayfarer::Page>
|
28
|
+
user_agent # => Browser or HTTP client
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class DummyJob < ActiveJob::Base
|
33
|
+
include Wayfarer::Base
|
34
|
+
|
35
|
+
route.host "example.com", to: ExampleHandler
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also bypass a handler's router and route directly to an instance
|
40
|
+
method:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
class DummyJob < ActiveJob::Base
|
44
|
+
include Wayfarer::Base
|
45
|
+
|
46
|
+
route.host "example.com", to: [ExampleHandler, :index]
|
47
|
+
end
|
48
|
+
|
49
|
+
class ExampleHandler
|
50
|
+
include Wayfarer::Handler
|
51
|
+
|
52
|
+
def index
|
53
|
+
task # => #<Wayfarer::Task>
|
54
|
+
page # => #<Wayfarer::Page>
|
55
|
+
user_agent # => Browser or HTTP client
|
56
|
+
end
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
!!! `before_action` callbacks
|
@@ -0,0 +1 @@
|
|
1
|
+
hello
|