wayfarer 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Changelog.md +10 -0
  10. data/Gemfile +11 -0
  11. data/LICENSE +19 -0
  12. data/README.md +21 -0
  13. data/Rakefile +114 -0
  14. data/benchmark/frontiers.rb +143 -0
  15. data/bin/wayfarer +116 -0
  16. data/docs/.gitignore +2 -0
  17. data/docs/_config.yml +15 -0
  18. data/docs/_includes/base.html +7 -0
  19. data/docs/_includes/head.html +10 -0
  20. data/docs/_includes/navigation.html +187 -0
  21. data/docs/_layouts/default.html +42 -0
  22. data/docs/_sass/base.scss +439 -0
  23. data/docs/_sass/variables.scss +24 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  26. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  31. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  32. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  33. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  34. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  35. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  36. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  38. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  39. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  40. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  41. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  42. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  43. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  44. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  45. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  46. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  47. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  50. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  51. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  52. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  53. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  54. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  55. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  57. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  59. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  60. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  61. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  62. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  63. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  64. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  65. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  66. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  67. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  69. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  70. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  71. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  73. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  77. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  78. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  80. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  81. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  82. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  83. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  84. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  85. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  99. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  100. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  101. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  102. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  103. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  104. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  105. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  106. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  107. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  108. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  109. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  110. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  111. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  112. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  113. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  114. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  115. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  116. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  117. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  118. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  119. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  120. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  121. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  122. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  123. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  124. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  125. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  126. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  127. data/docs/_sass/vendor/pygments.scss +356 -0
  128. data/docs/automating_browsers/capybara.md +70 -0
  129. data/docs/css/screen.scss +7 -0
  130. data/docs/guides/callbacks.md +45 -0
  131. data/docs/guides/cli.md +52 -0
  132. data/docs/guides/configuration.md +184 -0
  133. data/docs/guides/error_handling.md +46 -0
  134. data/docs/guides/frontiers.md +93 -0
  135. data/docs/guides/halting.md +23 -0
  136. data/docs/guides/job_queues.md +26 -0
  137. data/docs/guides/locals.md +36 -0
  138. data/docs/guides/logging.md +22 -0
  139. data/docs/guides/page_objects.md +67 -0
  140. data/docs/guides/peeking.md +46 -0
  141. data/docs/guides/selenium_capybara.md +100 -0
  142. data/docs/guides/tutorial.md +452 -0
  143. data/docs/index.md +82 -0
  144. data/docs/js/navigation.js +11 -0
  145. data/docs/misc/contributing.md +20 -0
  146. data/docs/misc/testing.md +11 -0
  147. data/docs/recipes/authentication.md +23 -0
  148. data/docs/recipes/csv.md +29 -0
  149. data/docs/recipes/javascript.md +20 -0
  150. data/docs/recipes/multiple_uris.md +18 -0
  151. data/docs/recipes/screenshots.md +20 -0
  152. data/docs/routing/custom_rules.md +16 -0
  153. data/docs/routing/filetypes_rules.md +21 -0
  154. data/docs/routing/host_rules.md +24 -0
  155. data/docs/routing/path_rules.md +33 -0
  156. data/docs/routing/protocol_rules.md +17 -0
  157. data/docs/routing/query_rules.md +69 -0
  158. data/docs/routing/routes.md +96 -0
  159. data/docs/routing/uri_rules.md +18 -0
  160. data/examples/collect_github_issues.rb +65 -0
  161. data/examples/find_foobar_on_wikipedia.rb +23 -0
  162. data/lib/wayfarer/configuration.rb +86 -0
  163. data/lib/wayfarer/crawl.rb +79 -0
  164. data/lib/wayfarer/crawl_observer.rb +103 -0
  165. data/lib/wayfarer/dispatcher.rb +104 -0
  166. data/lib/wayfarer/finders.rb +61 -0
  167. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  168. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  169. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  170. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  171. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  172. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  173. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  174. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  175. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  176. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  177. data/lib/wayfarer/job.rb +211 -0
  178. data/lib/wayfarer/locals.rb +40 -0
  179. data/lib/wayfarer/page.rb +94 -0
  180. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  181. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  182. data/lib/wayfarer/processor.rb +103 -0
  183. data/lib/wayfarer/routing/custom_rule.rb +21 -0
  184. data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
  185. data/lib/wayfarer/routing/host_rule.rb +19 -0
  186. data/lib/wayfarer/routing/path_rule.rb +54 -0
  187. data/lib/wayfarer/routing/protocol_rule.rb +21 -0
  188. data/lib/wayfarer/routing/query_rule.rb +59 -0
  189. data/lib/wayfarer/routing/router.rb +71 -0
  190. data/lib/wayfarer/routing/rule.rb +114 -0
  191. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  192. data/lib/wayfarer.rb +68 -0
  193. data/spec/configuration_spec.rb +26 -0
  194. data/spec/crawl_spec.rb +48 -0
  195. data/spec/finders_spec.rb +49 -0
  196. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  197. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  198. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  199. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  200. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  201. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  202. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  203. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  204. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  205. data/spec/integration/callbacks_spec.rb +42 -0
  206. data/spec/integration/locals_spec.rb +106 -0
  207. data/spec/integration/peeking_spec.rb +61 -0
  208. data/spec/job_spec.rb +122 -0
  209. data/spec/page_spec.rb +38 -0
  210. data/spec/parsers/json_parser_spec.rb +30 -0
  211. data/spec/parsers/xml_parser_spec.rb +24 -0
  212. data/spec/processor_spec.rb +31 -0
  213. data/spec/routing/custom_rule_spec.rb +26 -0
  214. data/spec/routing/filetypes_rule_spec.rb +40 -0
  215. data/spec/routing/host_rule_spec.rb +48 -0
  216. data/spec/routing/path_rule_spec.rb +66 -0
  217. data/spec/routing/protocol_rule_spec.rb +26 -0
  218. data/spec/routing/query_rule_spec.rb +124 -0
  219. data/spec/routing/router_spec.rb +67 -0
  220. data/spec/routing/rule_spec.rb +251 -0
  221. data/spec/routing/uri_rule_spec.rb +24 -0
  222. data/spec/shared/frontier.rb +96 -0
  223. data/spec/spec_helpers.rb +62 -0
  224. data/spec/wayfarer_spec.rb +24 -0
  225. data/support/static/finders.html +38 -0
  226. data/support/static/graph/details/a.html +10 -0
  227. data/support/static/graph/details/b.html +10 -0
  228. data/support/static/graph/index.html +20 -0
  229. data/support/static/json/dummy.json +13 -0
  230. data/support/static/links/links.html +28 -0
  231. data/support/static/xml/dummy.xml +120 -0
  232. data/support/test_app.rb +45 -0
  233. data/wayfarer-jruby.gemspec +49 -0
  234. data/wayfarer.gemspec +53 -0
  235. metadata +697 -0
@@ -0,0 +1,16 @@
1
+ ---
2
+ layout: default
3
+ title: Custom rules
4
+ ---
5
+
6
+ # Custom rules
7
+
8
+ Custom rules take either a block that is yielded the URI, or an object that responds to `#call(uri)`. If the block or the delegate returns a truthy value, the rule matches.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.if -> (uri) { uri.host == uri.host.reverse }
13
+ end
14
+ {% endhighlight %}
15
+
16
+ * Matches only URIs with palindrome hosts
@@ -0,0 +1,21 @@
1
+ ---
2
+ layout: default
3
+ title: Filetypes rules
4
+ ---
5
+
6
+ # Filetypes rules
7
+
8
+ Filetypes rules match against the URI path's file extension.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.filetypes [:png, :jpg], to: :image
13
+ route.forbid.filetypes [:php, :js]
14
+ end
15
+ {% endhighlight %}
16
+
17
+ Matches:
18
+
19
+ * `http://example.com/foo.png`
20
+ * `http://example.com/foo.jpg`
21
+ * `https://example.com/qux/bar.jpg`
@@ -0,0 +1,24 @@
1
+ ---
2
+ layout: default
3
+ title: Host rules
4
+ ---
5
+
6
+ # Host rules
7
+
8
+ Host rules match against a host string or RegExp.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.host "example.com"
13
+ route.host /example/
14
+ end
15
+ {% endhighlight %}
16
+
17
+ Matches:
18
+
19
+ * All URIs hosted on `"example.com"`.
20
+ * All URIs whose host contains `"example"`.
21
+
22
+ <aside class="note">
23
+ <code>"www.host.net"</code> and <code>"host.net"</code> are not considered equal. You have to specify the exact host when using strings. Consider using <code>/host.net/</code> instead.
24
+ </aside>
@@ -0,0 +1,33 @@
1
+ ---
2
+ layout: default
3
+ title: Path rules
4
+ ---
5
+
6
+ # Path rules
7
+
8
+ Path rules match against the path of a URI. Both strings and RegExps are accepted, and path segment pattern matching and RegExp captures are supported.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.path "/:alpha/:beta", to: :foo
13
+ route.path /^foobar\/(.+)/, to: :bar
14
+
15
+ def foo
16
+ params[:alpha]
17
+ params[:beta]
18
+ end
19
+
20
+ def bar
21
+ params["0"]
22
+ end
23
+ end
24
+ {% endhighlight %}
25
+
26
+ Matches:
27
+
28
+ * All URIs with path segments matching `/:alpha/:beta`, e.g. `https://example.com/foo/bar`
29
+ * All URIs starting with `"/foobar/"`.
30
+
31
+ <aside class="note">
32
+ <code>/:alpha/:beta</code> and <code>:alpha/:beta</code> are not considered equal. Note the opening slash.
33
+ </aside>
@@ -0,0 +1,17 @@
1
+ ---
2
+ layout: default
3
+ title: Protocol rules
4
+ ---
5
+
6
+ # Protocol rules
7
+
8
+ Protocol rules match against symbols/strings.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.protocol :https
13
+ end
14
+ {% endhighlight %}
15
+
16
+ * Matches `https://example.com`.
17
+ * Does not match `http://example.com`.
@@ -0,0 +1,69 @@
1
+ ---
2
+ layout: default
3
+ title: Query rules
4
+ ---
5
+
6
+ # Query rules
7
+
8
+ Query rules impose constraints on key-value query parameters. Strings, integers, RegExps and ranges are supported.
9
+
10
+ ## String constraints
11
+
12
+ {% highlight ruby %}
13
+ class DummyJob < Wayfarer::Job
14
+ route.query arg: "foo"
15
+ end
16
+ {% endhighlight %}
17
+
18
+ * Matches `https://example.com?arg=foo`.
19
+
20
+ ## Integer constraints
21
+
22
+ {% highlight ruby %}
23
+ class DummyJob < Wayfarer::Job
24
+ route.query arg: 42
25
+ end
26
+ {% endhighlight %}
27
+
28
+ * Matches `https://example.com?arg=42`.
29
+
30
+ ---
31
+
32
+ ## RegExp constraints
33
+
34
+ {% highlight ruby %}
35
+ class DummyJob < Wayfarer::Job
36
+ route.query arg: /foo/
37
+ end
38
+ {% endhighlight %}
39
+
40
+ * Matches `https://example.com?arg=foo`.
41
+ * Matches `https://example.com?arg=foobar`.
42
+
43
+ ---
44
+
45
+ ## Range constraints
46
+
47
+ {% highlight ruby %}
48
+ class DummyJob < Wayfarer::Job
49
+ route.query arg: 1..10
50
+ end
51
+ {% endhighlight %}
52
+
53
+ * Matches `https://example.com?arg=1`.
54
+ * Matches […]
55
+ * Matches `https://example.com?arg=10`.
56
+
57
+ ---
58
+
59
+ ## Compound constraints
60
+
61
+ {% highlight ruby %}
62
+ class DummyJob < Wayfarer::Job
63
+ route.query foo: 1..5, bar: /baz/, qux: "zot", toto: 2
64
+ end
65
+ {% endhighlight %}
66
+
67
+ * Matches `https://example.com?foo=4&bar=bazqux&qux=zot&toto=2`.
68
+
69
+ ---
@@ -0,0 +1,96 @@
1
+ ---
2
+ layout: default
3
+ title: Routes
4
+ categories: [Routing]
5
+ ---
6
+
7
+ # Routes
8
+
9
+ * Routes are filters for interesting URIs.
10
+ * Routes put constraints on URIs that should get processed.
11
+ * Routes map URIs to instance methods (actions).
12
+ * Routes are tree nodes and thus nestable.
13
+
14
+ Currently, the following rules are available:
15
+
16
+ * [URI rules](/routing/uri_rules.html) match URIs against a string.
17
+ * [Host rules](/routing/host_rules.html) match hosts against strings and RegExps.
18
+ * [Path rules](/routing/path_rules.html) match paths against pattern strings and RegExps. They support path segment capturing.
19
+ * [Query rules](/routing/query_rules.html) match key-value pairs of query parameters against strings, integers, RegExps and ranges.
20
+
21
+ Routes can be forbidden. URIs that match forbidden rules are never processed.
22
+
23
+ ## Route declaration
24
+
25
+ ### Declaration order matching
26
+
27
+ {% highlight ruby %}
28
+ class DummyJob < Wayfarer::Job
29
+ route.host "example.com", to: :foo
30
+ route.path "/foo", to: :bar
31
+
32
+ # Is equivalent to:
33
+ #
34
+ # routes do
35
+ # host "example.com", to: :foo
36
+ # path "/foo", to: :bar
37
+ # end
38
+ end
39
+ {% endhighlight %}
40
+
41
+ * Dispatches `https://example.com/foo` to `:foo`.
42
+ * Dispatches `https://example.com` to `:foo`.
43
+ * Dispatches `https://yahoo.com/foo` to `:bar`.
44
+
45
+ ---
46
+
47
+ ### Nesting routes (child rules)
48
+
49
+ A route matches if it has a child rule that matches. This applies recursively.
50
+
51
+ {% highlight ruby %}
52
+ class DummyJob < Wayfarer::Job
53
+ route.host "example.com", to: :foo do
54
+ path "/foo"
55
+ end
56
+
57
+ # Is equivalent to:
58
+ #
59
+ # route.host "example.com", path: "/foo", to: :foo
60
+ # route.path "/foo", host: "example.com", to: :foo
61
+ end
62
+ {% endhighlight %}
63
+
64
+ * Dispatches `https://example.com/foo` to `:foo`.
65
+ * Does not dispatch `https://example.com`.
66
+ * Does not dispatch `https://yahoo.com/foo`.
67
+
68
+ ---
69
+
70
+ ### Deepest routes override actions
71
+
72
+ {% highlight ruby %}
73
+ class DummyJob < Wayfarer::Job
74
+ route.host "example.com", to: :foo do
75
+ path "/foo", to: :bar
76
+ end
77
+ end
78
+ {% endhighlight %}
79
+
80
+ * Dispatches `https://example.com/foo` to `:bar`.
81
+ * Does not dispatch `https://example.com`.
82
+ * Does not dispatch `https://yahoo.com/foo`.
83
+
84
+ ---
85
+
86
+ ### Forbidding rules
87
+
88
+ {% highlight ruby %}
89
+ class DummyJob < Wayfarer::Job
90
+ route.forbid.path "/foo"
91
+ route.host "example.com", to: :foo
92
+ end
93
+ {% endhighlight %}
94
+
95
+ * Dispatches `https://example.com` to `:foo`.
96
+ * Does not dispatch `https://example.com/foo`.
@@ -0,0 +1,18 @@
1
+ ---
2
+ layout: default
3
+ title: URI Rules
4
+ ---
5
+
6
+ # URI rules
7
+
8
+ URI rules match against a string.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.uri "https://example.com"
13
+ end
14
+ {% endhighlight %}
15
+
16
+ Matches:
17
+
18
+ * Only `https://example.com`
@@ -0,0 +1,65 @@
1
+ require_relative "../lib/wayfarer"
2
+
3
# Example job that walks a GitHub repository's issue pages and gathers one
# record (id, title, author) per issue page it visits.
class CollectGithubIssues < Wayfarer::Job
  config.connection_count = 4
  config.logger.level = :fatal

  # Accumulator for the collected issue records.
  let(:records) { [] }

  # Only github.com URIs are routed; the path decides which action runs.
  routes do
    host "github.com" do
      path "/:user/:repo", to: :repository
      path "/:user/:repo/issues", to: :index
      path "/:user/:repo/issues/:id", to: :show
    end
  end

  after_crawl do
    # Save them somewhere?
    records.each { |record| puts record }
  end

  # Repository landing page: hands the repository navigation links to `stage`.
  def repository
    stage navigation_links
  end

  # Issue index: hands the issue links and the pagination link to `stage`.
  def index
    stage issue_listing_links, next_page
  end

  # Single issue: halts once more than 30 records exist, otherwise appends a
  # record built from the route params and the page contents.
  def show
    if records.count > 30
      halt
    else
      records << {
        id: params[:id],
        title: issue_title,
        author: issue_author
      }
    end
  end

  private

  # Links found under the ".next_page" selector.
  def next_page
    page.links ".next_page"
  end

  # Links found under the ".issues-listing" selector.
  def issue_listing_links
    page.links ".issues-listing"
  end

  # Links found under the ".reponav-item" selector.
  def navigation_links
    page.links ".reponav-item"
  end

  # Stripped text of the ".TableObject-item .author" nodes.
  def issue_author
    author_nodes = doc.css(".TableObject-item .author")
    author_nodes.text.strip
  end

  # Stripped text of the ".js-issue-title" nodes.
  def issue_title
    title_nodes = doc.css(".js-issue-title")
    title_nodes.text.strip
  end
end

CollectGithubIssues.perform_now("https://github.com/rails/rails")
@@ -0,0 +1,23 @@
1
+ require_relative "../lib/wayfarer"
2
+
3
# Example job that browses Wikipedia through Selenium (Chrome) until a page
# body matches /Foobar/, then saves a screenshot and halts.
class FindFoobarOnWikipedia < Wayfarer::Job
  config.http_adapter = :selenium
  config.selenium_argv = [:chrome]
  config.connection_count = 4

  # Accumulator for the keywords of every visited article.
  let(:keywords) { [] }

  route.host "en.wikipedia.org", to: :article

  # On a match: screenshot and halt. Otherwise: record the page keywords and
  # hand all page links to `stage`.
  def article
    if page.body =~ /Foobar/
      driver.save_screenshot("/tmp/foobar.png")
      halt
    else
      keywords << page.keywords
      stage page.links
    end
  end
end

FindFoobarOnWikipedia.perform_now("https://en.wikipedia.org/wiki/Special:Random")
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+ require "securerandom"
5
+ require "forwardable"
6
+
7
module Wayfarer
  # Settings object for a job, backed by an OpenStruct seeded from DEFAULTS.
  #
  # DEFAULTS is only shallowly frozen, so its nested arrays and hashes must
  # never be handed out directly: every instance (and every #reset!) receives
  # its own copies, which keeps in-place mutations such as
  # `config.selenium_argv << :foo` local to that one configuration.
  class Configuration < OpenStruct
    extend Forwardable

    DEFAULTS = {
      # Print full stacktraces?
      print_stacktraces: true,

      # Crash when encountering unhandled exceptions?
      reraise_exceptions: false,

      # Allow processing URIs multiple times?
      allow_circulation: false,

      # How many HTTP connections/Selenium drivers to use
      # 1:1 correspondence with spawned threads
      connection_count: 1,

      # Which HTTP adapter to use. Supported are :net_http and :selenium
      http_adapter: :net_http,

      # Which frontier to use.
      frontier: :memory,

      # How long a thread may hold an HTTP adapter.
      # Threads that exceed this limit fail with an exception.
      connection_timeout: Float::INFINITY,

      # How many 3xx redirects to follow. Has no effect when using Selenium
      max_http_redirects: 3,

      # Argument vector for instantiating Selenium drivers
      selenium_argv: [:firefox],

      # Argument vector for instantiating a Redis connection
      redis_opts: {
        host: "localhost",
        port: 6379
      }.freeze,

      # Size of browser windows
      window_size: [1024, 768],

      # Which Mustermann pattern type to use when matching URI paths
      # TODO: Mention in docs
      mustermann_type: :sinatra,

      # Options for instantiating Bloomfilters
      bloomfilter_opts: {
        size: 100,
        hashes: 2,
        seed: 1,
        bucket: 3,
        raise: false
      },

      # Whether to normalize URIs
      normalize_uris: true,

      # URI normalization options
      # See: https://github.com/rwz/normalize_url
      normalize_uri_options: {}
    }.freeze

    # @return [String] random UUID assigned when the configuration is created
    attr_reader :uuid

    # @param overrides [Hash] settings that take precedence over DEFAULTS
    def initialize(overrides = {})
      super(fresh_defaults.merge(overrides))
      @uuid = SecureRandom.uuid
    end

    # @return [Object] a memoized duplicate of `Wayfarer.logger`
    def logger
      @logger ||= Wayfarer.logger.dup
    end

    # Re-assigns every key listed in DEFAULTS to its default value. Keys that
    # are not part of DEFAULTS are left untouched.
    def reset!
      fresh_defaults.each { |key, val| self[key] = val }
    end

    private

    # Builds a copy of DEFAULTS whose array and hash values are duplicated, so
    # callers can mutate them without touching the shared constant.
    #
    # @return [Hash]
    def fresh_defaults
      DEFAULTS.each_with_object({}) do |(key, val), copy|
        copy[key] = case val
                    when Hash, Array then val.dup
                    else val
                    end
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "observer"
4
+ require "normalize_url"
5
+
6
module Wayfarer
  # Wires a prepared job to a dispatcher, a frontier and a processor, and
  # runs the job's crawl hooks around the processor run.
  class Crawl
    extend Forwardable
    include Observable

    # The prepared job.
    # @!attribute [r] job
    attr_reader :job

    # @!attribute [r] dispatcher
    attr_reader :dispatcher

    delegate config: :job
    delegate logger: :config

    # @param job [Object] job to crawl with; the result of `job.prepare` is kept
    # @param uris [Array] URIs that are staged in the frontier and passed to
    #   the processor run
    def initialize(job, *uris)
      @job = job.prepare
      @uris = uris
      @dispatcher = Dispatcher.new(@job)
      @processor = Processor.new(@job, frontier, @dispatcher)
    end

    # Installs the SIGINT handler, attaches a CrawlObserver to the processor
    # and the dispatcher, then runs the :before_crawl hook, the processor and
    # the :after_crawl hook. The previous SIGINT handler is restored even when
    # the crawl raises.
    def execute
      trap_signals

      CrawlObserver.new(@processor, @dispatcher, config.logger)

      @job.run_hook(:before_crawl)
      @processor.run(*@uris)
      @job.run_hook(:after_crawl)
    ensure
      untrap_signals
    end

    # A frontier with initially pre-staged URIs.
    #
    # The frontier class is picked from `config.frontier`; any unlisted value
    # falls back to the in-memory frontier. The instance is memoized.
    # @return [Frontier]
    def frontier
      return @frontier if @frontier

      @frontier = case config.frontier
                  when :memory_trie
                    Frontiers::MemoryTrieFrontier.new(config)
                  when :redis
                    Frontiers::RedisFrontier.new(config)
                  when :memory_bloom
                    Frontiers::MemoryBloomfilter.new(config)
                  when :redis_bloom
                    Frontiers::RedisBloomfilter.new(config)
                  else
                    Frontiers::MemoryFrontier.new(config)
                  end

      @frontier.extend(Frontiers::NormalizeURIs) if config.normalize_uris

      @frontier.stage(*@uris) # TODO: Test

      @frontier
    end

    private

    # Replaces the SIGINT handler and remembers the previous one. On SIGINT
    # the new handler halts, chains to the previous handler when it is
    # callable, and exits with status -1.
    def trap_signals
      @cached_sigint_handler = trap(:INT) do
        # NOTE(review): `halt!` is not defined in this class as far as this
        # file shows — confirm where it is expected to come from.
        halt!
        @cached_sigint_handler.call if @cached_sigint_handler.respond_to?(:call)
        exit(-1)
      end
    end

    # Restores the handler that was active before #trap_signals.
    #
    # `trap` returns the previous handler either as a callable or as a command
    # string such as "DEFAULT". Wrapping it in a new block (as was done
    # before) turned string handlers into a no-op and left SIGINT swallowed
    # after the crawl, so the saved value is handed back to `trap` directly.
    def untrap_signals
      trap(:INT, @cached_sigint_handler || "DEFAULT")
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
module Wayfarer
  # Observer that turns crawl events into log messages.
  #
  # It registers itself on every observable it is given and logs one message
  # per event received through #update.
  class CrawlObserver
    # Event value objects passed to #update.
    module Events
      FirstCycle     = Struct.new(:frontier)
      NewCycle       = Struct.new(:current_uris_count)
      DispatchedURI  = Struct.new(:action, :uri)
      CycleFinished  = Class.new
      Peeking        = Struct.new(:uri)
      AboutToCycle   = Struct.new(:staged_uris_count)
      MismatchedURI  = Struct.new(:uri)
      HaltInitiated  = Struct.new(:action, :uri)
      StagingURIs    = Struct.new(:staged_uris_count)
      UnhandledError = Struct.new(:exception)
    end

    # Mixin for observables: marks the object as changed and notifies its
    # observers in one call.
    module ObservableShortcuts
      def notify_observers!(*argv)
        changed
        notify_observers(*argv)
      end
    end

    extend Forwardable

    # @return [Object] logger that receives all messages
    attr_reader :logger

    # @return [Object, nil] optional configuration consulted for
    #   `reraise_exceptions` and `print_stacktraces`
    attr_reader :config

    # @param observables [Array] objects responding to `add_observer`
    # @param logger [Object] logger that receives all messages
    # @param config [Object, nil] optional configuration; without it unhandled
    #   errors are logged at :error level with their stacktrace, matching the
    #   Configuration defaults
    def initialize(*observables, logger, config: nil)
      @logger = logger
      @config = config
      observables.each { |obsv| obsv.add_observer(self) }
    end

    # Logs the message that belongs to the given event. Objects that are not
    # one of the Events types are ignored.
    #
    # @param event [Object] an instance of one of the Events types
    def update(event)
      case event
      when Events::FirstCycle     then first_cycle(event)
      when Events::NewCycle       then new_cycle(event)
      when Events::DispatchedURI  then dispatched_uri(event)
      when Events::CycleFinished  then cycle_finished
      when Events::Peeking        then peeking(event)
      when Events::AboutToCycle   then about_to_cycle(event)
      when Events::MismatchedURI  then mismatched_uri(event)
      when Events::HaltInitiated  then halt_initiated(event)
      when Events::StagingURIs    then staging_uris(event)
      when Events::UnhandledError then unhandled_error(event)
      end
    end

    private

    def first_cycle(event)
      logger.info("First cycle")
      logger.info("Frontier: #{event.frontier}")
    end

    def new_cycle(event)
      logger.info("Current cycle contains #{event.current_uris_count} URI(s)")
    end

    def dispatched_uri(event)
      logger.info("Dispatched to \##{event.action}: #{event.uri}")
    end

    def cycle_finished
      logger.info("No URIs left in current cycle")
    end

    def peeking(event)
      logger.info("Peeking into: #{event.uri}")
    end

    def about_to_cycle(event)
      logger.info("About to cycle. #{event.staged_uris_count} staged URI(s)")
    end

    def mismatched_uri(event)
      logger.debug("No matching route for: #{event.uri}")
    end

    def halt_initiated(event)
      logger.info("Halt initiated from \##{event.action} at: #{event.uri}")
    end

    def staging_uris(event)
      logger.info("Staging #{event.staged_uris_count} URI(s)")
    end

    # Logs an unhandled exception: at :fatal level when exceptions are
    # re-raised, at :error level otherwise, with the backtrace appended when
    # stacktraces are enabled.
    def unhandled_error(event)
      level = reraise_exceptions? ? :fatal : :error

      if print_stacktraces?
        # An exception that was never raised has a nil backtrace.
        backtrace = Array(event.exception.backtrace)

        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{event.exception.class.inspect}
          #{backtrace.map(&:to_s).join("\n* ")}
        LOGGER
      else
        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{event.exception.class.inspect}
        LOGGER
      end
    end

    # Falls back to false (the Configuration default) without a config.
    def reraise_exceptions?
      config ? config.reraise_exceptions : false
    end

    # Falls back to true (the Configuration default) without a config.
    def print_stacktraces?
      config ? config.print_stacktraces : true
    end
  end
end