wayfarer 0.0.3 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ci.yaml +32 -0
- data/.gitignore +3 -4
- data/.rubocop.yml +25 -9
- data/.ruby-version +1 -1
- data/Dockerfile +5 -0
- data/Gemfile +1 -7
- data/Gemfile.lock +221 -0
- data/RELEASING.md +17 -0
- data/Rakefile +38 -90
- data/bin/wayfarer +1 -111
- data/docker-compose.yml +32 -0
- data/docs/cookbook/batch_routing.md +22 -0
- data/docs/cookbook/consent_screen.md +36 -0
- data/docs/cookbook/executing_javascript.md +41 -0
- data/docs/cookbook/querying_html.md +42 -0
- data/docs/cookbook/screenshots.md +27 -0
- data/docs/cookbook/user_agent.md +7 -0
- data/docs/guides/browser_automation/capybara.md +69 -0
- data/docs/guides/browser_automation/custom_adapters.md +100 -0
- data/docs/guides/browser_automation/ferrum.md +39 -0
- data/docs/guides/browser_automation/selenium.md +63 -0
- data/docs/guides/callbacks.md +131 -31
- data/docs/guides/configuration.md +24 -169
- data/docs/guides/debugging.md +17 -0
- data/docs/guides/error_handling.md +30 -45
- data/docs/guides/jobs.md +101 -0
- data/docs/guides/navigation.md +73 -0
- data/docs/guides/networking.md +94 -0
- data/docs/guides/pages.md +52 -0
- data/docs/guides/performance.md +130 -0
- data/docs/guides/reliability.md +41 -0
- data/docs/guides/routing/steering.md +30 -0
- data/docs/guides/tasks.md +14 -0
- data/docs/index.md +40 -66
- data/docs/reference/api/base.md +48 -0
- data/docs/reference/api/route.md +182 -0
- data/docs/reference/cli.md +61 -0
- data/docs/reference/configuration_keys.md +42 -0
- data/docs/reference/environment_variables.md +83 -0
- data/lib/wayfarer/base.rb +50 -0
- data/lib/wayfarer/callbacks.rb +71 -0
- data/lib/wayfarer/cli/base.rb +27 -0
- data/lib/wayfarer/cli/generate.rb +17 -0
- data/lib/wayfarer/cli/job.rb +60 -0
- data/lib/wayfarer/cli/route.rb +29 -0
- data/lib/wayfarer/cli/route_printer.rb +116 -0
- data/lib/wayfarer/cli/runner.rb +34 -0
- data/lib/wayfarer/cli/templates/Gemfile.tt +5 -0
- data/lib/wayfarer/cli/templates/job.rb.tt +10 -0
- data/lib/wayfarer/config/capybara.rb +10 -0
- data/lib/wayfarer/config/ferrum.rb +11 -0
- data/lib/wayfarer/config/networking.rb +26 -0
- data/lib/wayfarer/config/redis.rb +14 -0
- data/lib/wayfarer/config/root.rb +11 -0
- data/lib/wayfarer/config/selenium.rb +21 -0
- data/lib/wayfarer/config/strconv.rb +45 -0
- data/lib/wayfarer/config/struct.rb +72 -0
- data/lib/wayfarer/gc.rb +15 -0
- data/lib/wayfarer/middleware/chain.rb +19 -0
- data/lib/wayfarer/middleware/dedup.rb +25 -0
- data/lib/wayfarer/middleware/fetch.rb +47 -0
- data/lib/wayfarer/middleware/normalize.rb +25 -0
- data/lib/wayfarer/middleware/router.rb +53 -0
- data/lib/wayfarer/middleware/stage.rb +23 -0
- data/lib/wayfarer/middleware/worker.rb +30 -0
- data/lib/wayfarer/networking/capybara.rb +28 -0
- data/lib/wayfarer/networking/context.rb +36 -0
- data/lib/wayfarer/networking/ferrum.rb +35 -0
- data/lib/wayfarer/networking/http.rb +34 -0
- data/lib/wayfarer/networking/pool.rb +40 -0
- data/lib/wayfarer/networking/result.rb +18 -0
- data/lib/wayfarer/networking/selenium.rb +43 -0
- data/lib/wayfarer/networking/strategy.rb +38 -0
- data/lib/wayfarer/page.rb +17 -74
- data/lib/wayfarer/parsing/json.rb +17 -0
- data/lib/wayfarer/parsing/xml.rb +17 -0
- data/lib/wayfarer/redis/.#barrier.rb +1 -0
- data/lib/wayfarer/redis/barrier.rb +36 -0
- data/lib/wayfarer/redis/connection.rb +13 -0
- data/lib/wayfarer/redis/counter.rb +29 -0
- data/lib/wayfarer/redis/pool.rb +20 -0
- data/lib/wayfarer/redis/version.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +57 -0
- data/lib/wayfarer/routing/matchers/custom.rb +25 -0
- data/lib/wayfarer/routing/matchers/host.rb +19 -0
- data/lib/wayfarer/routing/matchers/path.rb +49 -0
- data/lib/wayfarer/routing/matchers/query.rb +63 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
- data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
- data/lib/wayfarer/routing/matchers/url.rb +17 -0
- data/lib/wayfarer/routing/path_finder.rb +46 -0
- data/lib/wayfarer/routing/result.rb +15 -0
- data/lib/wayfarer/routing/root_route.rb +7 -0
- data/lib/wayfarer/routing/route.rb +47 -0
- data/lib/wayfarer/routing/router.rb +10 -54
- data/lib/wayfarer/routing/target_route.rb +7 -0
- data/lib/wayfarer/serializer.rb +17 -0
- data/lib/wayfarer/stringify.rb +47 -0
- data/lib/wayfarer/task.rb +34 -0
- data/lib/wayfarer.rb +48 -57
- data/mkdocs.yml +47 -0
- data/requirements.txt +1 -0
- data/spec/base_spec.rb +233 -0
- data/spec/callbacks_spec.rb +102 -0
- data/spec/cli/generate_spec.rb +39 -0
- data/spec/cli/job_spec.rb +74 -0
- data/spec/cli/version_spec.rb +13 -0
- data/spec/config/capybara_spec.rb +18 -0
- data/spec/config/ferrum_spec.rb +24 -0
- data/spec/config/networking_spec.rb +73 -0
- data/spec/config/redis_spec.rb +32 -0
- data/spec/config/root_spec.rb +31 -0
- data/spec/config/selenium_spec.rb +56 -0
- data/spec/config/strconv_spec.rb +58 -0
- data/spec/config/struct_spec.rb +66 -0
- data/spec/factories/middleware.rb +15 -0
- data/spec/factories/page.rb +78 -0
- data/spec/factories/task.rb +12 -0
- data/spec/fixtures/dummy_job.rb +7 -0
- data/spec/gc_spec.rb +63 -0
- data/spec/middleware/chain_spec.rb +96 -0
- data/spec/middleware/dedup_spec.rb +76 -0
- data/spec/middleware/fetch_spec.rb +100 -0
- data/spec/middleware/normalize_spec.rb +28 -0
- data/spec/middleware/router_spec.rb +80 -0
- data/spec/middleware/stage_spec.rb +39 -0
- data/spec/middleware/worker_spec.rb +117 -0
- data/spec/networking/capybara_spec.rb +12 -0
- data/spec/networking/context_spec.rb +127 -0
- data/spec/networking/ferrum_spec.rb +12 -0
- data/spec/networking/http_spec.rb +12 -0
- data/spec/networking/pool_spec.rb +67 -0
- data/spec/networking/selenium_spec.rb +12 -0
- data/spec/networking/strategy.rb +170 -0
- data/spec/page_spec.rb +21 -12
- data/spec/{parsers/json_parser_spec.rb → parsing/json_spec.rb} +5 -4
- data/spec/{parsers/xml_parser_spec.rb → parsing/xml_spec.rb} +3 -2
- data/spec/redis/barrier_spec.rb +78 -0
- data/spec/redis/counter_spec.rb +32 -0
- data/spec/redis/pool_spec.rb +18 -0
- data/spec/redis/version_spec.rb +13 -0
- data/spec/routing/dsl_spec.rb +98 -0
- data/spec/routing/integration_spec.rb +110 -0
- data/spec/routing/matchers/custom_spec.rb +31 -0
- data/spec/routing/matchers/host_spec.rb +49 -0
- data/spec/routing/matchers/path_spec.rb +43 -0
- data/spec/routing/matchers/query_spec.rb +137 -0
- data/spec/routing/matchers/scheme_spec.rb +25 -0
- data/spec/routing/{filetypes_rule_spec.rb → matchers/suffix_spec.rb} +14 -13
- data/spec/routing/matchers/uri_spec.rb +27 -0
- data/spec/routing/path_finder_spec.rb +33 -0
- data/spec/routing/root_route_spec.rb +29 -0
- data/spec/routing/route_spec.rb +43 -0
- data/spec/routing/router_spec.rb +13 -56
- data/spec/spec_helpers.rb +73 -38
- data/spec/stringify_spec.rb +23 -0
- data/{support → spec/support}/static/finders.html +0 -0
- data/{support → spec/support}/static/graph/details/a.html +0 -0
- data/{support → spec/support}/static/graph/details/b.html +0 -0
- data/{support → spec/support}/static/graph/index.html +0 -0
- data/{support → spec/support}/static/json/dummy.json +0 -0
- data/{support → spec/support}/static/links/links.html +0 -0
- data/{support → spec/support}/static/xml/dummy.xml +0 -0
- data/{support → spec/support}/test_app.rb +9 -2
- data/spec/task_spec.rb +27 -0
- data/spec/wayfarer_spec.rb +2 -13
- data/wayfarer.gemspec +40 -42
- metadata +234 -361
- data/.travis.yml +0 -5
- data/Changelog.md +0 -10
- data/README.md +0 -21
- data/benchmark/frontiers.rb +0 -143
- data/docs/.gitignore +0 -2
- data/docs/_config.yml +0 -15
- data/docs/_includes/base.html +0 -7
- data/docs/_includes/head.html +0 -10
- data/docs/_includes/navigation.html +0 -187
- data/docs/_layouts/default.html +0 -42
- data/docs/_sass/base.scss +0 -439
- data/docs/_sass/variables.scss +0 -24
- data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +0 -19
- data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +0 -425
- data/docs/_sass/vendor/bourbon/_bourbon.scss +0 -90
- data/docs/_sass/vendor/bourbon/addons/_border-color.scss +0 -29
- data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +0 -48
- data/docs/_sass/vendor/bourbon/addons/_border-style.scss +0 -28
- data/docs/_sass/vendor/bourbon/addons/_border-width.scss +0 -28
- data/docs/_sass/vendor/bourbon/addons/_buttons.scss +0 -69
- data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +0 -25
- data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +0 -30
- data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +0 -31
- data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +0 -27
- data/docs/_sass/vendor/bourbon/addons/_margin.scss +0 -29
- data/docs/_sass/vendor/bourbon/addons/_padding.scss +0 -29
- data/docs/_sass/vendor/bourbon/addons/_position.scss +0 -51
- data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +0 -66
- data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +0 -27
- data/docs/_sass/vendor/bourbon/addons/_size.scss +0 -56
- data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +0 -118
- data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +0 -34
- data/docs/_sass/vendor/bourbon/addons/_triangle.scss +0 -63
- data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +0 -29
- data/docs/_sass/vendor/bourbon/css3/_animation.scss +0 -61
- data/docs/_sass/vendor/bourbon/css3/_appearance.scss +0 -5
- data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +0 -5
- data/docs/_sass/vendor/bourbon/css3/_background-image.scss +0 -44
- data/docs/_sass/vendor/bourbon/css3/_background.scss +0 -57
- data/docs/_sass/vendor/bourbon/css3/_border-image.scss +0 -61
- data/docs/_sass/vendor/bourbon/css3/_calc.scss +0 -6
- data/docs/_sass/vendor/bourbon/css3/_columns.scss +0 -67
- data/docs/_sass/vendor/bourbon/css3/_filter.scss +0 -6
- data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +0 -327
- data/docs/_sass/vendor/bourbon/css3/_font-face.scss +0 -29
- data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +0 -6
- data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +0 -12
- data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +0 -6
- data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +0 -15
- data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +0 -38
- data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +0 -40
- data/docs/_sass/vendor/bourbon/css3/_perspective.scss +0 -12
- data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +0 -10
- data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +0 -40
- data/docs/_sass/vendor/bourbon/css3/_selection.scss +0 -44
- data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +0 -27
- data/docs/_sass/vendor/bourbon/css3/_transform.scss +0 -21
- data/docs/_sass/vendor/bourbon/css3/_transition.scss +0 -81
- data/docs/_sass/vendor/bourbon/css3/_user-select.scss +0 -5
- data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +0 -16
- data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +0 -25
- data/docs/_sass/vendor/bourbon/functions/_contains.scss +0 -31
- data/docs/_sass/vendor/bourbon/functions/_is-length.scss +0 -16
- data/docs/_sass/vendor/bourbon/functions/_is-light.scss +0 -26
- data/docs/_sass/vendor/bourbon/functions/_is-number.scss +0 -16
- data/docs/_sass/vendor/bourbon/functions/_is-size.scss +0 -23
- data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +0 -74
- data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +0 -24
- data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +0 -26
- data/docs/_sass/vendor/bourbon/functions/_shade.scss +0 -24
- data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +0 -22
- data/docs/_sass/vendor/bourbon/functions/_tint.scss +0 -24
- data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +0 -37
- data/docs/_sass/vendor/bourbon/functions/_unpack.scss +0 -32
- data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +0 -26
- data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +0 -108
- data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +0 -53
- data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +0 -24
- data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +0 -35
- data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +0 -51
- data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +0 -77
- data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +0 -41
- data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +0 -74
- data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +0 -55
- data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +0 -28
- data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +0 -31
- data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +0 -15
- data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +0 -55
- data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +0 -7
- data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +0 -8
- data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +0 -9
- data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +0 -1
- data/docs/_sass/vendor/neat/_neat-helpers.scss +0 -11
- data/docs/_sass/vendor/neat/_neat.scss +0 -23
- data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +0 -49
- data/docs/_sass/vendor/neat/functions/_private.scss +0 -114
- data/docs/_sass/vendor/neat/grid/_box-sizing.scss +0 -15
- data/docs/_sass/vendor/neat/grid/_direction-context.scss +0 -33
- data/docs/_sass/vendor/neat/grid/_display-context.scss +0 -28
- data/docs/_sass/vendor/neat/grid/_fill-parent.scss +0 -22
- data/docs/_sass/vendor/neat/grid/_media.scss +0 -92
- data/docs/_sass/vendor/neat/grid/_omega.scss +0 -87
- data/docs/_sass/vendor/neat/grid/_outer-container.scss +0 -34
- data/docs/_sass/vendor/neat/grid/_pad.scss +0 -25
- data/docs/_sass/vendor/neat/grid/_private.scss +0 -35
- data/docs/_sass/vendor/neat/grid/_row.scss +0 -52
- data/docs/_sass/vendor/neat/grid/_shift.scss +0 -50
- data/docs/_sass/vendor/neat/grid/_span-columns.scss +0 -94
- data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +0 -97
- data/docs/_sass/vendor/neat/grid/_visual-grid.scss +0 -42
- data/docs/_sass/vendor/neat/mixins/_clearfix.scss +0 -25
- data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +0 -13
- data/docs/_sass/vendor/neat/settings/_grid.scss +0 -51
- data/docs/_sass/vendor/neat/settings/_visual-grid.scss +0 -27
- data/docs/_sass/vendor/normalize-3.0.2.scss +0 -427
- data/docs/_sass/vendor/pygments.scss +0 -356
- data/docs/automating_browsers/capybara.md +0 -70
- data/docs/css/screen.scss +0 -7
- data/docs/guides/cli.md +0 -52
- data/docs/guides/frontiers.md +0 -93
- data/docs/guides/halting.md +0 -23
- data/docs/guides/job_queues.md +0 -26
- data/docs/guides/locals.md +0 -36
- data/docs/guides/logging.md +0 -22
- data/docs/guides/page_objects.md +0 -67
- data/docs/guides/peeking.md +0 -46
- data/docs/guides/selenium_capybara.md +0 -100
- data/docs/guides/tutorial.md +0 -452
- data/docs/js/navigation.js +0 -11
- data/docs/misc/contributing.md +0 -20
- data/docs/misc/testing.md +0 -11
- data/docs/recipes/authentication.md +0 -23
- data/docs/recipes/csv.md +0 -29
- data/docs/recipes/javascript.md +0 -20
- data/docs/recipes/multiple_uris.md +0 -18
- data/docs/recipes/screenshots.md +0 -20
- data/docs/routing/custom_rules.md +0 -16
- data/docs/routing/filetypes_rules.md +0 -21
- data/docs/routing/host_rules.md +0 -24
- data/docs/routing/path_rules.md +0 -33
- data/docs/routing/protocol_rules.md +0 -17
- data/docs/routing/query_rules.md +0 -69
- data/docs/routing/routes.md +0 -96
- data/docs/routing/uri_rules.md +0 -18
- data/examples/collect_github_issues.rb +0 -65
- data/examples/find_foobar_on_wikipedia.rb +0 -23
- data/lib/wayfarer/configuration.rb +0 -86
- data/lib/wayfarer/crawl.rb +0 -79
- data/lib/wayfarer/crawl_observer.rb +0 -103
- data/lib/wayfarer/dispatcher.rb +0 -104
- data/lib/wayfarer/finders.rb +0 -61
- data/lib/wayfarer/frontiers/frontier.rb +0 -79
- data/lib/wayfarer/frontiers/memory_bloomfilter.rb +0 -32
- data/lib/wayfarer/frontiers/memory_frontier.rb +0 -76
- data/lib/wayfarer/frontiers/memory_trie_frontier.rb +0 -39
- data/lib/wayfarer/frontiers/normalize_uris.rb +0 -48
- data/lib/wayfarer/frontiers/redis_bloomfilter.rb +0 -34
- data/lib/wayfarer/frontiers/redis_frontier.rb +0 -83
- data/lib/wayfarer/http_adapters/adapter_pool.rb +0 -62
- data/lib/wayfarer/http_adapters/net_http_adapter.rb +0 -77
- data/lib/wayfarer/http_adapters/selenium_adapter.rb +0 -80
- data/lib/wayfarer/job.rb +0 -211
- data/lib/wayfarer/locals.rb +0 -40
- data/lib/wayfarer/parsers/json_parser.rb +0 -20
- data/lib/wayfarer/parsers/xml_parser.rb +0 -27
- data/lib/wayfarer/processor.rb +0 -103
- data/lib/wayfarer/routing/custom_rule.rb +0 -21
- data/lib/wayfarer/routing/filetypes_rule.rb +0 -20
- data/lib/wayfarer/routing/host_rule.rb +0 -19
- data/lib/wayfarer/routing/path_rule.rb +0 -54
- data/lib/wayfarer/routing/protocol_rule.rb +0 -21
- data/lib/wayfarer/routing/query_rule.rb +0 -59
- data/lib/wayfarer/routing/rule.rb +0 -114
- data/lib/wayfarer/routing/uri_rule.rb +0 -21
- data/spec/configuration_spec.rb +0 -26
- data/spec/crawl_spec.rb +0 -48
- data/spec/finders_spec.rb +0 -49
- data/spec/frontiers/memory_bloomfilter_spec.rb +0 -6
- data/spec/frontiers/memory_frontier_spec.rb +0 -6
- data/spec/frontiers/memory_trie_frontier_spec.rb +0 -6
- data/spec/frontiers/normalize_uris_spec.rb +0 -59
- data/spec/frontiers/redis_bloomfilter_spec.rb +0 -6
- data/spec/frontiers/redis_frontier_spec.rb +0 -6
- data/spec/http_adapters/adapter_pool_spec.rb +0 -33
- data/spec/http_adapters/net_http_adapter_spec.rb +0 -83
- data/spec/http_adapters/selenium_adapter_spec.rb +0 -53
- data/spec/integration/callbacks_spec.rb +0 -42
- data/spec/integration/locals_spec.rb +0 -106
- data/spec/integration/peeking_spec.rb +0 -61
- data/spec/job_spec.rb +0 -122
- data/spec/processor_spec.rb +0 -31
- data/spec/routing/custom_rule_spec.rb +0 -26
- data/spec/routing/host_rule_spec.rb +0 -48
- data/spec/routing/path_rule_spec.rb +0 -66
- data/spec/routing/protocol_rule_spec.rb +0 -26
- data/spec/routing/query_rule_spec.rb +0 -124
- data/spec/routing/rule_spec.rb +0 -251
- data/spec/routing/uri_rule_spec.rb +0 -24
- data/spec/shared/frontier.rb +0 -96
- data/wayfarer-jruby.gemspec +0 -49
data/docs/guides/page_objects.md
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Page objects
|
4
|
-
---
|
5
|
-
|
6
|
-
# `Page` objects
|
7
|
-
|
8
|
-
Retrieved pages are represented by `Page` objects and made accessible by `#page` within actions. `Page`s support the same set of features regardless of the HTTP adapter in use.
|
9
|
-
|
10
|
-
<aside class="note">
|
11
|
-
HTTP response headers and status codes are not supported by Selenium WebDrivers. Wayfarer emulates both by having the WebDriver fire an AJAX request to the current page and extracting them from the response. Clearly this is a hack, but it might even work for you. See <a href="https://github.com/bauerd/selenium-emulated_features">selenium-emulated_features</a>.
|
12
|
-
</aside>
|
13
|
-
|
14
|
-
<aside class="note">
|
15
|
-
Even after having followed redirects, <code>Page#uri</code> always returns the URI that originally initiated the redirects. This behaviour stems from redirects being opaque to WebDrivers.
|
16
|
-
</aside>
|
17
|
-
|
18
|
-
A `Page` brings to the table all you'd wish for when doing web scraping:
|
19
|
-
|
20
|
-
* [Nokogiri](http://www.nokogiri.org) parses HTML/XML
|
21
|
-
* [Oj](https://github.com/ohler55/oj) or the standard lib parses JSON
|
22
|
-
* __When running on MRI__, [Pismo](https://github.com/peterc/pismo) lets you access metadata, e.g. keywords, author, a summary, … No overhead if you don't use it!
|
23
|
-
|
24
|
-
Let's see it in action:
|
25
|
-
|
26
|
-
{% highlight ruby %}
|
27
|
-
class DummyJob < Wayfarer::Job
|
28
|
-
# ...
|
29
|
-
|
30
|
-
def example
|
31
|
-
page # => #<Wayfarer::Page:...>
|
32
|
-
|
33
|
-
page.uri # => #<URI::...>
|
34
|
-
page.status_code # => Fixnum
|
35
|
-
page.body # => String
|
36
|
-
page.headers # => Hash
|
37
|
-
|
38
|
-
page.doc # => #<Nokogiri::HTML::Document:...> (HTML/XML) or Hash (JSON)
|
39
|
-
# Also accessible as just `doc`
|
40
|
-
|
41
|
-
page.links # => [URI]
|
42
|
-
page.stylesheets # => [URI]
|
43
|
-
page.javascripts # => [URI]
|
44
|
-
page.images # => [URI]
|
45
|
-
|
46
|
-
# All four previous methods accept arbitrarily many CSS selectors
|
47
|
-
page.links ".my-target", ".my-other-target"
|
48
|
-
|
49
|
-
# THESE ARE NOT SUPPORTED ON JRUBY!
|
50
|
-
# On MRI, the following methods get forwarded to a Pismo::Document
|
51
|
-
# See https://github.com/peterc/pismo
|
52
|
-
page.title
|
53
|
-
page.titles
|
54
|
-
page.author
|
55
|
-
page.lede
|
56
|
-
page.keywords
|
57
|
-
page.sentences(qty)
|
58
|
-
page.body
|
59
|
-
page.html_body
|
60
|
-
page.feed
|
61
|
-
page.feeds
|
62
|
-
page.favicon
|
63
|
-
page.description
|
64
|
-
page.datetime
|
65
|
-
end
|
66
|
-
end
|
67
|
-
{% endhighlight %}
|
data/docs/guides/peeking.md
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Peeking
|
4
|
-
---
|
5
|
-
|
6
|
-
# Peeking
|
7
|
-
Peeking allows bypassing the [frontier](frontiers.html) in an ad-hoc manner. Use Ruby's `yield` keyword to immediately retrieve and dispatch a URI from within actions. Control gets handed off to the action matching the yielded URI, if any.
|
8
|
-
|
9
|
-
A matching route for the yielded URI is still required. If the yielded URI matches no route or raises an exception, `yield` returns `nil`.
|
10
|
-
|
11
|
-
<aside class="note">
|
12
|
-
The action that gets the URI dispatched to <strong>will</strong> get assigned another HTTP adapter! HTTP adapters are never shared across actions, i.e. if you're using the Selenium HTTP adapter, the peeked URI gets retrieved by a different browser process.
|
13
|
-
</aside>
|
14
|
-
|
15
|
-
{% highlight ruby %}
|
16
|
-
class DummyJob < Wayfarer::Job
|
17
|
-
route.uri "https://example.com", to: :foo
|
18
|
-
route.uri "https://w3c.org", to: :bar
|
19
|
-
|
20
|
-
def foo
|
21
|
-
w3c_page = yield "https://w3c.org"
|
22
|
-
end
|
23
|
-
|
24
|
-
def bar
|
25
|
-
page
|
26
|
-
end
|
27
|
-
end
|
28
|
-
{% endhighlight %}
|
29
|
-
|
30
|
-
__Recursive peeking does not work__, or else peeking might result in an infinite loop. The following does terminate:
|
31
|
-
|
32
|
-
{% highlight ruby %}
|
33
|
-
class DummyJob < Wayfarer::Job
|
34
|
-
route.uri "https://example.com", to: :foo
|
35
|
-
route.uri "https://w3c.org", to: :bar
|
36
|
-
|
37
|
-
def foo
|
38
|
-
w3c_page = yield "https://w3c.org"
|
39
|
-
end
|
40
|
-
|
41
|
-
def bar
|
42
|
-
# Silently ignored, assigns nil
|
43
|
-
example_page = yield "https://example.com"
|
44
|
-
end
|
45
|
-
end
|
46
|
-
{% endhighlight %}
|
@@ -1,100 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Selenium & Capybara
|
4
|
-
---
|
5
|
-
|
6
|
-
# Selenium & Capybara
|
7
|
-
|
8
|
-
[Selenium](http://www.seleniumhq.org) is a browser automation framework. [Capybara](https://github.com/teamcapybara/capybara) is an acceptance testing framework that puts an expressive DSL on Selenium's WebDrivers. Both are first-class citizens in Wayfarer and the best tools for automating browsers.
|
9
|
-
|
10
|
-
## Selenium WebDrivers
|
11
|
-
|
12
|
-
WebDrivers let you remote-control browsers, e.g. Firefox, Chrome, Safari and PhantomJS.
|
13
|
-
|
14
|
-
Depending on what browser you want to automate, go install and run the corresponding driver first. For installation instructions, see the project websites:
|
15
|
-
|
16
|
-
* Firefox: [geckodriver](https://github.com/mozilla/geckodriver)
|
17
|
-
* Chrome: [chromedriver](https://sites.google.com/a/chromium.org/chromedriver)
|
18
|
-
* Safari: [SafariDriver](https://github.com/SeleniumHQ/selenium/wiki/SafariDriver)
|
19
|
-
* PhantomJS ships with an embedded driver.
|
20
|
-
|
21
|
-
Other browsers are supported, too. For an exhaustive list, see the "Third Party Drivers, Bindings, and Plugins" section on the [Selenium downloads page](http://www.seleniumhq.org/download).
|
22
|
-
|
23
|
-
If you want to run browser processes on a central server, consider using [Selenium Grid](http://www.seleniumhq.org/projects/grid).
|
24
|
-
|
25
|
-
Wayfarer hides the details of managing Ruby driver objects from you. In order to use Selenium, set the `http_adapter` configuration key to `:selenium`. Pass in the desired browser and arguments by setting the `selenium_argv` key. The number of browser processes can be controlled with the `connection_count` key.
|
26
|
-
|
27
|
-
{% highlight ruby %}
|
28
|
-
class DummyJob < Wayfarer::Job
|
29
|
-
config do |c|
|
30
|
-
# Use 4 Firefox processes
|
31
|
-
c.http_adapter = :selenium
|
32
|
-
c.selenium_argv = [:firefox]
|
33
|
-
c.connection_count = 4
|
34
|
-
|
35
|
-
# Chrome
|
36
|
-
# c.selenium_argv = [:chrome]
|
37
|
-
|
38
|
-
# Safari
|
39
|
-
# c.selenium_argv = [:safari]
|
40
|
-
|
41
|
-
# PhantomJS
|
42
|
-
# c.selenium_argv = [:phantomjs]
|
43
|
-
|
44
|
-
# Selenium Grid
|
45
|
-
# c.selenium_argv = [
|
46
|
-
# :remote,
|
47
|
-
# url: "http://localhost:4444/wd/hub",
|
48
|
-
# desired_capabilities: :firefox
|
49
|
-
# ]
|
50
|
-
end
|
51
|
-
end
|
52
|
-
{% endhighlight %}
|
53
|
-
|
54
|
-
<aside class="note">
|
55
|
-
In order to avoid redirect loops, the <code>:net_http</code> adapter supports the <code>max_http_redirects</code> configuration key. Because redirects are opaque to WebDrivers, the configuration key does not apply to the Selenium adapter. See <a href="configuration.html">Configuration</a>.
|
56
|
-
</aside>
|
57
|
-
|
58
|
-
### Accessing the WebDriver
|
59
|
-
|
60
|
-
Within actions, `#driver` returns a [`Selenium::WebDriver::Driver`](http://www.rubydoc.info/gems/selenium-webdriver/Selenium/WebDriver/Driver):
|
61
|
-
|
62
|
-
{% highlight ruby %}
|
63
|
-
class DummyJob < Wayfarer::Job
|
64
|
-
config do |c|
|
65
|
-
c.http_adapter = :selenium
|
66
|
-
c.selenium_argv = [:firefox]
|
67
|
-
end
|
68
|
-
|
69
|
-
draw uri: "https://example.com"
|
70
|
-
def example
|
71
|
-
driver # => #<Selenium::WebDriver::Driver:...>
|
72
|
-
end
|
73
|
-
end
|
74
|
-
{% endhighlight %}
|
75
|
-
|
76
|
-
<aside class="note">
|
77
|
-
What you do with a WebDriver is opaque to Wayfarer. If you handle navigation yourself with a WebDriver and bypass the <a href="/guides/frontiers.html">frontier</a>, Wayfarer cannot ensure you don't visit URIs twice.
|
78
|
-
</aside>
|
79
|
-
|
80
|
-
## Capybara
|
81
|
-
|
82
|
-
When using the `:selenium` HTTP adapter, `#browser` returns a [`Capybara::Selenium::Driver`](http://www.rubydoc.info/github/jnicklas/capybara/Capybara/Selenium/Driver) within actions:
|
83
|
-
|
84
|
-
{% highlight ruby %}
|
85
|
-
class DummyJob < Wayfarer::Job
|
86
|
-
config do |c|
|
87
|
-
c.http_adapter = :selenium
|
88
|
-
c.selenium_argv = [:firefox]
|
89
|
-
end
|
90
|
-
|
91
|
-
draw uri: "https://example.com"
|
92
|
-
def example
|
93
|
-
browser # => #<Capybara::Selenium::Driver:...>
|
94
|
-
end
|
95
|
-
end
|
96
|
-
{% endhighlight %}
|
97
|
-
|
98
|
-
<aside class="note">
|
99
|
-
What you do with a WebDriver is opaque to Wayfarer. If you handle navigation yourself with a WebDriver and bypass the <a href="/guides/frontiers.html">frontier</a>, Wayfarer cannot ensure you don't visit URIs twice.
|
100
|
-
</aside>
|
data/docs/guides/tutorial.md
DELETED
@@ -1,452 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Tutorial
|
4
|
-
---
|
5
|
-
|
6
|
-
# Tutorial
|
7
|
-
This tutorial walks you through 66.333% of what's to know about Wayfarer, a web crawling framework for Ruby. Along the way, we'll write a reusable crawler that collects the titles of all open issues from an arbitrary GitHub repository.
|
8
|
-
|
9
|
-
First, we get ourselves a subclass of `Wayfarer::Job`. If you've ever worked with a typical MVC web framework, think of a job as a self-contained controller with routes. If you haven't, don't worry!
|
10
|
-
|
11
|
-
{% highlight ruby %}
|
12
|
-
require "wayfarer" # This line omitted hereafter
|
13
|
-
|
14
|
-
class CollectGithubIssues < Wayfarer::Job
|
15
|
-
end
|
16
|
-
{% endhighlight %}
|
17
|
-
|
18
|
-
Suppose we’re interested in Rails' GitHub repository, which is located at `https://github.com/rails/rails`. We need two things:
|
19
|
-
1. A route that matches that URI and …
|
20
|
-
2. an instance method (action) which handles that page:
|
21
|
-
|
22
|
-
{% highlight ruby %}
|
23
|
-
class CollectGithubIssues < Wayfarer::Job
|
24
|
-
route.uri "https://github.com/rails/rails", to: :repository # (1)
|
25
|
-
|
26
|
-
def repository # (2)
|
27
|
-
puts "This looks like Rails to me!"
|
28
|
-
end
|
29
|
-
end
|
30
|
-
{% endhighlight %}
|
31
|
-
|
32
|
-
We set up a single route which maps the repository URI (and only that URI) to `CollectGithubIssues#repository`. When we feed our job the URI, the `#repository` method gets called.
|
33
|
-
|
34
|
-
To run a job, call `::perform_now` on your job class and pass an arbitrary number of URIs to start with:
|
35
|
-
|
36
|
-
{% highlight ruby %}
|
37
|
-
class CollectGithubIssues < Wayfarer::Job
|
38
|
-
# Gives more detailed output
|
39
|
-
# I'll omit this from now on
|
40
|
-
config.logger.level = :debug
|
41
|
-
|
42
|
-
route.uri "https://github.com/rails/rails", to: :repository
|
43
|
-
|
44
|
-
def repository
|
45
|
-
puts "This looks like Rails to me!"
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails", "https://example.com")
|
50
|
-
{% endhighlight %}
|
51
|
-
|
52
|
-
Note that we pass a URI we have no matching route for, `https://example.com`.
|
53
|
-
|
54
|
-
Save and run your file as you would with every other Ruby file:
|
55
|
-
|
56
|
-
```
|
57
|
-
% ruby collect_github_issues.rb
|
58
|
-
```
|
59
|
-
|
60
|
-
… and you'll end up with output similar to this:
|
61
|
-
|
62
|
-
```
|
63
|
-
Performing CollectGithubIssues (Job ID: …) from Async(default) with arguments: "https://github.com/rails/rails", "https://example.com"
|
64
|
-
I, […] INFO -- wayfarer: First cycle
|
65
|
-
I, […] INFO -- wayfarer: Frontier: URI-normalizing #<Wayfarer::Frontiers::MemoryFrontier:0x007fa2a6ae9cf0>
|
66
|
-
I, […] INFO -- wayfarer: Current cycle contains 2 URI(s)
|
67
|
-
I, […] INFO -- wayfarer: Dispatched to #repository: https://github.com/rails/rails
|
68
|
-
This looks like Rails to me!
|
69
|
-
I, […] INFO -- wayfarer: Staging 0 URI(s)
|
70
|
-
D, […] DEBUG -- wayfarer: No matching route for: https://example.com/
|
71
|
-
I, […] INFO -- wayfarer: No URIs left in current cycle
|
72
|
-
I, […] INFO -- wayfarer: About to cycle. 0 staged URI(s)
|
73
|
-
Performed CollectGithubIssues (Job ID: …) from Async(default) in 863.69ms
|
74
|
-
```
|
75
|
-
|
76
|
-
Here is what happened:
|
77
|
-
|
78
|
-
1. Both URIs we passed in were matched against our routes.
|
79
|
-
2. Our matching GitHub URI's page was retrieved, the mismatching one ignored.
|
80
|
-
3. Our `#repository` action was invoked and has access to the retrieved page.
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
Let’s exchange our static string for the actual page `<title>`. Inside our instance method, we call `#doc` to get ahold of a [`Nokogiri::HTML::Document`](http://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document). [Nokogiri](http://www.nokogiri.org) is an HTML/XML library, and a parsed document allows us to access the title tag easily:
|
85
|
-
|
86
|
-
{% highlight ruby %}
|
87
|
-
class CollectGithubIssues < Wayfarer::Job
|
88
|
-
route.uri "https://github.com/rails/rails", to: :repository
|
89
|
-
|
90
|
-
def repository
|
91
|
-
# Outputs the text of the <title> element
|
92
|
-
puts doc.title
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
97
|
-
{% endhighlight %}
|
98
|
-
|
99
|
-
Wayfarer does not attempt to do black magic on top of Nokogiri. When it comes to extracting specific data from pages, you’re mostly on your own. There are helpers for finding links, CSS/JavaScript files and images (see [`Page` objects](page_objects.html)). But figuring out what the interesting parts of an HTTP response are is still up to you.
|
100
|
-
|
101
|
-
Wayfarer parses JSON, too. You'll get a `Hash` returned by `#doc` instead of a Nokogiri document.
|
102
|
-
|
103
|
-
Rails’ issues are located at `https://github.com/rails/rails/issues`. We need a new route and a new instance method to handle this issue index. By calling `#stage` and passing in an arbitrary number of URIs, we can stage URIs for processing. Note that just because a URI gets staged does not mean it will be fetched—a matching route is required for every URI. Also, Wayfarer will by default ensure that no URI gets processed twice. This behaviour can be turned off, though (see [Configuration](configuration.html)).
|
104
|
-
|
105
|
-
{% highlight ruby %}
|
106
|
-
class CollectGithubIssues < Wayfarer::Job
|
107
|
-
routes do
|
108
|
-
uri "https://github.com/rails/rails", to: :repository
|
109
|
-
uri "https://github.com/rails/rails/issues", to: :index
|
110
|
-
end
|
111
|
-
|
112
|
-
def repository
|
113
|
-
# This is where we want to go next
|
114
|
-
stage "https://github.com/rails/rails/issues"
|
115
|
-
end
|
116
|
-
|
117
|
-
def index
|
118
|
-
puts "Arrived at the issue listing"
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
123
|
-
{% endhighlight %}
|
124
|
-
|
125
|
-
What we have so far works fine for the Rails repository, but not for others, because the URIs are hardcoded. That's a real pity, because there are more than 10 million repositories on GitHub. We can do better by switching to a host and path rule.
|
126
|
-
|
127
|
-
A host rule narrows down the host portion of a URI, and a path rule the path. Instead of hard-coding the path, pattern matching can be used to have interesting parts of the path extracted:
|
128
|
-
|
129
|
-
{% highlight ruby %}
|
130
|
-
class CollectGithubIssues < Wayfarer::Job
|
131
|
-
routes do
|
132
|
-
# Both routes match only if
|
133
|
-
# (1) The host is github.com and
|
134
|
-
# (2) The path segments match
|
135
|
-
host "github.com" do
|
136
|
-
path "/:user/:repo", to: :repository
|
137
|
-
path "/:user/:repo/issues", to: :index
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def repository
|
142
|
-
stage "https://github.com/rails/rails/issues"
|
143
|
-
end
|
144
|
-
|
145
|
-
def index
|
146
|
-
# Captured path segments: params # => { repo: ..., user: ... }
|
147
|
-
# Prints 'rails belongs to rails'.
|
148
|
-
puts "#{params['repo']} belongs to #{params['user']}"
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
153
|
-
{% endhighlight %}
|
154
|
-
|
155
|
-
Note that we still have a hard-coded URI in `#repository`. Usually, there are two approaches to identify URIs that one wants to follow:
|
156
|
-
|
157
|
-
1. Constructing the successor URI from the current URI.
|
158
|
-
2. Reading the URI from the HTTP response, e.g. extracting an `<a>` tag's `href` attribute.
|
159
|
-
|
160
|
-
For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. `#stage` takes relative paths and URIs too, and constructs absolute URIs by appending to the current page's URI:
|
161
|
-
|
162
|
-
{% highlight ruby %}
|
163
|
-
class CollectGithubIssues < Wayfarer::Job
|
164
|
-
# ...
|
165
|
-
|
166
|
-
def repository
|
167
|
-
# Stages "#{page.uri}/issues"
|
168
|
-
stage "issues"
|
169
|
-
end
|
170
|
-
|
171
|
-
# ...
|
172
|
-
end
|
173
|
-
{% endhighlight %}
|
174
|
-
|
175
|
-
`#page` returns a [`Page` object]({{base}}/guides/page_objects.html), the general representation of a retrieved page. It gives one access to the page's origin URI, the response headers, the status code, the raw response body, and more.
|
176
|
-
|
177
|
-
The second case is where Wayfarer's routing shines. We know that the path structure is `/:user/:repo/issues` and that there's a link somewhere on the repository's front page that points there. We can stage __all__ links of the current page, and have our routes ensure that only interesting ones get processed:
|
178
|
-
|
179
|
-
{% highlight ruby %}
|
180
|
-
class CollectGithubIssues < Wayfarer::Job
|
181
|
-
# ...
|
182
|
-
|
183
|
-
def repository
|
184
|
-
# But only route-matching ones get processed
|
185
|
-
stage page.links
|
186
|
-
end
|
187
|
-
|
188
|
-
# ...
|
189
|
-
end
|
190
|
-
{% endhighlight %}
|
191
|
-
|
192
|
-
`Page#links` returns all links of the current page. But staging all links brings overhead with it, and we'll want to narrow down the links to stage, especially when crawling large page structures. `Page#links` accepts an arbitrary number of CSS selectors to narrow down links. For clarity, let's give the navigation links their own private helper method:
|
193
|
-
|
194
|
-
{% highlight ruby %}
|
195
|
-
class CollectGithubIssues < Wayfarer::Job
|
196
|
-
routes do
|
197
|
-
host "github.com" do
|
198
|
-
path "/:user/:repo", to: :repository
|
199
|
-
path "/:user/:repo/issues", to: :index
|
200
|
-
end
|
201
|
-
end
|
202
|
-
|
203
|
-
def repository
|
204
|
-
stage navigation_links
|
205
|
-
end
|
206
|
-
|
207
|
-
def index
|
208
|
-
puts "#{params['repo']} belongs to #{params['user']}"
|
209
|
-
end
|
210
|
-
|
211
|
-
private
|
212
|
-
|
213
|
-
def navigation_links
|
214
|
-
page.links ".reponav-item"
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
219
|
-
{% endhighlight %}
|
220
|
-
|
221
|
-
URIs never get dispatched to private instance methods.
|
222
|
-
|
223
|
-
We're prepared to go after the individual issues now. We add the `#show` action, and route to it with a host and path rule. Links to issue tickets are wrapped in `.issues-listing`, so we can apply the same technique as above:
|
224
|
-
|
225
|
-
{% highlight ruby %}
|
226
|
-
class CollectGithubIssues < Wayfarer::Job
|
227
|
-
routes do
|
228
|
-
host "github.com" do
|
229
|
-
path "/:user/:repo", to: :repository
|
230
|
-
path "/:user/:repo/issues", to: :index
|
231
|
-
path "/:user/:repo/issues/:id", to: :show
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
235
|
-
def repository
|
236
|
-
stage navigation_links
|
237
|
-
end
|
238
|
-
|
239
|
-
def index
|
240
|
-
stage issue_listing_links
|
241
|
-
end
|
242
|
-
|
243
|
-
def show
|
244
|
-
puts "Issue No. #{params[:id]} @ #{page.uri}"
|
245
|
-
end
|
246
|
-
|
247
|
-
private
|
248
|
-
|
249
|
-
def navigation_links
|
250
|
-
page.links ".reponav-item"
|
251
|
-
end
|
252
|
-
|
253
|
-
def issue_listing_links
|
254
|
-
page.links ".issues-listing"
|
255
|
-
end
|
256
|
-
end
|
257
|
-
|
258
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
259
|
-
{% endhighlight %}
|
260
|
-
|
261
|
-
Handling pagination boils down to staging one more link in `#index`. As mentioned before, `#stage` accepts an arbitrary number of URIs:
|
262
|
-
|
263
|
-
{% highlight ruby %}
|
264
|
-
class CollectGithubIssues < Wayfarer::Job
|
265
|
-
routes do
|
266
|
-
host "github.com" do
|
267
|
-
path "/:user/:repo", to: :repository
|
268
|
-
path "/:user/:repo/issues", to: :index
|
269
|
-
path "/:user/:repo/issues/:id", to: :show
|
270
|
-
end
|
271
|
-
end
|
272
|
-
|
273
|
-
def repository
|
274
|
-
stage navigation_links
|
275
|
-
end
|
276
|
-
|
277
|
-
def index
|
278
|
-
stage issue_listing_links, next_page
|
279
|
-
end
|
280
|
-
|
281
|
-
def show
|
282
|
-
puts "Issue No. #{params[:id]} @ #{page.uri}"
|
283
|
-
end
|
284
|
-
|
285
|
-
private
|
286
|
-
|
287
|
-
def navigation_links
|
288
|
-
page.links ".reponav-item"
|
289
|
-
end
|
290
|
-
|
291
|
-
def issue_listing_links
|
292
|
-
page.links ".issues-listing"
|
293
|
-
end
|
294
|
-
|
295
|
-
def next_page
|
296
|
-
page.links ".next_page"
|
297
|
-
end
|
298
|
-
end
|
299
|
-
|
300
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
301
|
-
{% endhighlight %}
|
302
|
-
|
303
|
-
By default, all work happens within a single thread. We can speed up crawling by increasing the thread count:
|
304
|
-
|
305
|
-
{% highlight ruby %}
|
306
|
-
class CollectGithubIssues < Wayfarer::Job
|
307
|
-
config.connection_count = 4 # Four threads
|
308
|
-
|
309
|
-
# ...
|
310
|
-
end
|
311
|
-
{% endhighlight %}
|
312
|
-
|
313
|
-
Next, we want to extract the issue's title, its ID, and the GitHub user who opened it and store that data somewhere.
|
314
|
-
|
315
|
-
For extracting the text from the HTML, we add two private helper methods that query the HTML for the text.
|
316
|
-
|
317
|
-
For storing the data, we introduce a [local]({{base}}/guides/locals.html) named `:records` which stores an array. In job actions, locals can be accessed and manipulated. But now that we've bumped up the thread count, multiple instances of our job class will run concurrently. That's why locals declared with `::let` are replaced with thread-safe counterparts behind the scenes.
|
318
|
-
|
319
|
-
We stop processing with `halt` once we have collected 30 issue records:
|
320
|
-
|
321
|
-
{% highlight ruby %}
|
322
|
-
class CollectGithubIssues < Wayfarer::Job
|
323
|
-
config.connection_count = 4
|
324
|
-
|
325
|
-
let(:records) { [] }
|
326
|
-
|
327
|
-
routes do
|
328
|
-
host "github.com" do
|
329
|
-
path "/:user/:repo", to: :repository
|
330
|
-
path "/:user/:repo/issues", to: :index
|
331
|
-
path "/:user/:repo/issues/:id", to: :show
|
332
|
-
end
|
333
|
-
end
|
334
|
-
|
335
|
-
after_crawl do
|
336
|
-
records.each do |issue|
|
337
|
-
# Save them somewhere?
|
338
|
-
puts issue
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
def repository
|
343
|
-
stage navigation_links
|
344
|
-
end
|
345
|
-
|
346
|
-
def index
|
347
|
-
stage issue_listing_links, next_page
|
348
|
-
end
|
349
|
-
|
350
|
-
def show
|
351
|
-
return halt if records.count > 30
|
352
|
-
|
353
|
-
records << {
|
354
|
-
id: params[:id],
|
355
|
-
title: issue_title,
|
356
|
-
author: issue_author
|
357
|
-
}
|
358
|
-
end
|
359
|
-
|
360
|
-
private
|
361
|
-
|
362
|
-
def issue_title
|
363
|
-
doc.css(".js-issue-title").text.strip
|
364
|
-
end
|
365
|
-
|
366
|
-
def issue_author
|
367
|
-
doc.css(".TableObject-item .author").text.strip
|
368
|
-
end
|
369
|
-
|
370
|
-
def navigation_links
|
371
|
-
page.links ".reponav-item"
|
372
|
-
end
|
373
|
-
|
374
|
-
def issue_listing_links
|
375
|
-
page.links ".issues-listing"
|
376
|
-
end
|
377
|
-
|
378
|
-
def next_page
|
379
|
-
page.links ".next_page"
|
380
|
-
end
|
381
|
-
end
|
382
|
-
|
383
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
384
|
-
{% endhighlight %}
|
385
|
-
|
386
|
-
For the last part, we turn off the debugging output (if you still have it enabled) and output each record. You'd probably want to store them somewhere at this point, e.g. by [writing them to a CSV file]({{base}}/recipes/csv.html), or putting them into a database, etc.
|
387
|
-
|
388
|
-
{% highlight ruby %}
|
389
|
-
class CollectGithubIssues < Wayfarer::Job
|
390
|
-
config.connection_count = 4
|
391
|
-
config.logger.level = :fatal
|
392
|
-
|
393
|
-
let(:records) { [] }
|
394
|
-
|
395
|
-
routes do
|
396
|
-
host "github.com" do
|
397
|
-
path "/:user/:repo", to: :repository
|
398
|
-
path "/:user/:repo/issues", to: :index
|
399
|
-
path "/:user/:repo/issues/:id", to: :show
|
400
|
-
end
|
401
|
-
end
|
402
|
-
|
403
|
-
after_crawl do
|
404
|
-
records.each do |issue|
|
405
|
-
# Save them somewhere?
|
406
|
-
puts issue
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
def repository
|
411
|
-
stage navigation_links
|
412
|
-
end
|
413
|
-
|
414
|
-
def index
|
415
|
-
stage issue_listing_links, next_page
|
416
|
-
end
|
417
|
-
|
418
|
-
def show
|
419
|
-
return halt if records.count > 30
|
420
|
-
|
421
|
-
records << {
|
422
|
-
id: params[:id],
|
423
|
-
title: issue_title,
|
424
|
-
author: issue_author
|
425
|
-
}
|
426
|
-
end
|
427
|
-
|
428
|
-
private
|
429
|
-
|
430
|
-
def issue_title
|
431
|
-
doc.css(".js-issue-title").text.strip
|
432
|
-
end
|
433
|
-
|
434
|
-
def issue_author
|
435
|
-
doc.css(".TableObject-item .author").text.strip
|
436
|
-
end
|
437
|
-
|
438
|
-
def navigation_links
|
439
|
-
page.links ".reponav-item"
|
440
|
-
end
|
441
|
-
|
442
|
-
def issue_listing_links
|
443
|
-
page.links ".issues-listing"
|
444
|
-
end
|
445
|
-
|
446
|
-
def next_page
|
447
|
-
page.links ".next_page"
|
448
|
-
end
|
449
|
-
end
|
450
|
-
|
451
|
-
CollectGithubIssues.perform_now("https://github.com/rails/rails")
|
452
|
-
{% endhighlight %}
|
data/docs/js/navigation.js
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
document.addEventListener("DOMContentLoaded", function() {
|
2
|
-
var links = document.querySelectorAll(".navigation__link");
|
3
|
-
|
4
|
-
for (i = 0; i < links.length; i++) {
|
5
|
-
var link = links[i];
|
6
|
-
|
7
|
-
if (link.pathname === window.location.pathname) {
|
8
|
-
link.classList.add("navigation__link--active");
|
9
|
-
}
|
10
|
-
}
|
11
|
-
});
|
data/docs/misc/contributing.md
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
---
|
2
|
-
layout: default
|
3
|
-
title: Contributing
|
4
|
-
---
|
5
|
-
|
6
|
-
# Contributing
|
7
|
-
|
8
|
-
1. Fork the repository
|
9
|
-
2. Ensure the development dependencies are installed:
|
10
|
-
`% bundle install --with development`
|
11
|
-
2. Make changes
|
12
|
-
3. Ensure your (new?) tests pass:
|
13
|
-
`% bundle exec rake test`
|
14
|
-
4. Autocorrect RuboCop offenses:
|
15
|
-
`% bundle exec rake rubocop:auto_correct`
|
16
|
-
5. Fix remaining offenses or have a good excuse not to:
|
17
|
-
`% bundle exec rake rubocop`
|
18
|
-
6. Write commit messages at least not worse than mine
|
19
|
-
7. Open a pull request on GitHub
|
20
|
-
8. Thank you
|