wayfarer 0.0.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (339) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ci.yaml +32 -0
  3. data/.gitignore +3 -4
  4. data/.rubocop.yml +25 -9
  5. data/.ruby-version +1 -1
  6. data/Dockerfile +5 -0
  7. data/Gemfile +1 -7
  8. data/Gemfile.lock +212 -0
  9. data/RELEASING.md +17 -0
  10. data/Rakefile +38 -90
  11. data/bin/wayfarer +1 -111
  12. data/docker-compose.yml +32 -0
  13. data/docs/cookbook/querying_html.md +42 -0
  14. data/docs/cookbook/screenshots.md +27 -0
  15. data/docs/cookbook/user_agent.md +7 -0
  16. data/docs/guides/browser_automation/capybara.md +3 -0
  17. data/docs/guides/browser_automation/ferrum.md +37 -0
  18. data/docs/guides/browser_automation/selenium.md +59 -0
  19. data/docs/guides/callbacks.md +27 -34
  20. data/docs/guides/configuration.md +20 -171
  21. data/docs/guides/error_handling.md +18 -33
  22. data/docs/guides/jobs.md +75 -0
  23. data/docs/guides/networking.md +20 -0
  24. data/docs/guides/pages.md +52 -0
  25. data/docs/guides/performance.md +22 -0
  26. data/docs/guides/tasks.md +38 -0
  27. data/docs/index.md +34 -68
  28. data/docs/reference/api/base.md +162 -0
  29. data/docs/reference/api/route.md +182 -0
  30. data/docs/reference/cli.md +139 -0
  31. data/docs/reference/environment_variables.md +85 -0
  32. data/lib/wayfarer/base.rb +60 -0
  33. data/lib/wayfarer/cli/base.rb +23 -0
  34. data/lib/wayfarer/cli/generate.rb +17 -0
  35. data/lib/wayfarer/cli/job.rb +58 -0
  36. data/lib/wayfarer/cli/route.rb +27 -0
  37. data/lib/wayfarer/cli/route_printer.rb +116 -0
  38. data/lib/wayfarer/cli/runner.rb +34 -0
  39. data/lib/wayfarer/cli/templates/Gemfile.tt +5 -0
  40. data/lib/wayfarer/cli/templates/job.rb.tt +8 -0
  41. data/lib/wayfarer/config.rb +67 -0
  42. data/lib/wayfarer/gc.rb +19 -0
  43. data/lib/wayfarer/middleware/chain.rb +19 -0
  44. data/lib/wayfarer/middleware/dedup.rb +25 -0
  45. data/lib/wayfarer/middleware/fetch.rb +32 -0
  46. data/lib/wayfarer/middleware/normalize.rb +25 -0
  47. data/lib/wayfarer/middleware/router.rb +21 -0
  48. data/lib/wayfarer/middleware/stage.rb +23 -0
  49. data/lib/wayfarer/middleware/worker.rb +47 -0
  50. data/lib/wayfarer/networking/ferrum.rb +70 -0
  51. data/lib/wayfarer/networking/healer.rb +21 -0
  52. data/lib/wayfarer/networking/net_http.rb +52 -0
  53. data/lib/wayfarer/networking/pool.rb +34 -0
  54. data/lib/wayfarer/networking/result.rb +18 -0
  55. data/lib/wayfarer/networking/selenium.rb +70 -0
  56. data/lib/wayfarer/page.rb +15 -71
  57. data/lib/wayfarer/parsing/json.rb +17 -0
  58. data/lib/wayfarer/parsing/xml.rb +17 -0
  59. data/lib/wayfarer/redis/barrier.rb +36 -0
  60. data/lib/wayfarer/redis/connection.rb +13 -0
  61. data/lib/wayfarer/redis/counter.rb +29 -0
  62. data/lib/wayfarer/redis/pool.rb +18 -0
  63. data/lib/wayfarer/redis/version.rb +19 -0
  64. data/lib/wayfarer/routing/custom_matcher.rb +21 -0
  65. data/lib/wayfarer/routing/dsl.rb +57 -0
  66. data/lib/wayfarer/routing/host_matcher.rb +23 -0
  67. data/lib/wayfarer/routing/path_finder.rb +46 -0
  68. data/lib/wayfarer/routing/path_matcher.rb +46 -0
  69. data/lib/wayfarer/routing/{query_rule.rb → query_matcher.rb} +24 -16
  70. data/lib/wayfarer/routing/result.rb +15 -0
  71. data/lib/wayfarer/routing/root_route.rb +7 -0
  72. data/lib/wayfarer/routing/route.rb +41 -0
  73. data/lib/wayfarer/routing/scheme_matcher.rb +21 -0
  74. data/lib/wayfarer/routing/suffix_matcher.rb +21 -0
  75. data/lib/wayfarer/routing/target_route.rb +7 -0
  76. data/lib/wayfarer/routing/url_matcher.rb +21 -0
  77. data/lib/wayfarer/serializer.rb +17 -0
  78. data/lib/wayfarer/stringify.rb +41 -0
  79. data/lib/wayfarer/task.rb +34 -0
  80. data/lib/wayfarer.rb +47 -58
  81. data/mkdocs.yml +47 -0
  82. data/requirements.txt +1 -0
  83. data/spec/base_spec.rb +219 -0
  84. data/spec/cli/generate_spec.rb +39 -0
  85. data/spec/cli/job_spec.rb +74 -0
  86. data/spec/cli/version_spec.rb +13 -0
  87. data/spec/config_spec.rb +144 -0
  88. data/spec/factories/queue/chain.rb +11 -0
  89. data/spec/factories/queue/middleware.rb +15 -0
  90. data/spec/factories/queue/page.rb +78 -0
  91. data/spec/factories/queue/task.rb +12 -0
  92. data/spec/fixtures/dummy_job.rb +7 -0
  93. data/spec/gc_spec.rb +61 -0
  94. data/spec/middleware/chain_spec.rb +96 -0
  95. data/spec/middleware/dedup_spec.rb +76 -0
  96. data/spec/middleware/fetch_spec.rb +72 -0
  97. data/spec/middleware/normalize_spec.rb +28 -0
  98. data/spec/middleware/router_spec.rb +46 -0
  99. data/spec/middleware/stage_spec.rb +39 -0
  100. data/spec/middleware/worker_spec.rb +90 -0
  101. data/spec/networking/adapter.rb +135 -0
  102. data/spec/networking/ferrum_spec.rb +28 -0
  103. data/spec/networking/healer_spec.rb +46 -0
  104. data/spec/networking/net_http_spec.rb +37 -0
  105. data/spec/networking/pool_spec.rb +42 -0
  106. data/spec/networking/selenium_spec.rb +28 -0
  107. data/spec/page_spec.rb +21 -12
  108. data/spec/{parsers/json_parser_spec.rb → parsing/json_spec.rb} +5 -4
  109. data/spec/{parsers/xml_parser_spec.rb → parsing/xml_spec.rb} +3 -2
  110. data/spec/redis/barrier_spec.rb +78 -0
  111. data/spec/redis/counter_spec.rb +32 -0
  112. data/spec/redis/pool_spec.rb +18 -0
  113. data/spec/redis/version_spec.rb +13 -0
  114. data/spec/routing/custom_matcher_spec.rb +31 -0
  115. data/spec/routing/dsl_spec.rb +98 -0
  116. data/spec/routing/host_matcher_spec.rb +49 -0
  117. data/spec/routing/integration_spec.rb +110 -0
  118. data/spec/routing/path_finder_spec.rb +33 -0
  119. data/spec/routing/path_matcher_spec.rb +43 -0
  120. data/spec/routing/{query_rule_spec.rb → query_matcher_spec.rb} +39 -26
  121. data/spec/routing/root_route_spec.rb +29 -0
  122. data/spec/routing/route_spec.rb +43 -0
  123. data/spec/routing/scheme_matcher_spec.rb +25 -0
  124. data/spec/routing/{filetypes_rule_spec.rb → suffix_matcher_spec.rb} +14 -13
  125. data/spec/routing/uri_matcher_spec.rb +27 -0
  126. data/spec/spec_helpers.rb +65 -38
  127. data/spec/stringify_spec.rb +23 -0
  128. data/{support → spec/support}/static/finders.html +0 -0
  129. data/{support → spec/support}/static/graph/details/a.html +0 -0
  130. data/{support → spec/support}/static/graph/details/b.html +0 -0
  131. data/{support → spec/support}/static/graph/index.html +0 -0
  132. data/{support → spec/support}/static/json/dummy.json +0 -0
  133. data/{support → spec/support}/static/links/links.html +0 -0
  134. data/{support → spec/support}/static/xml/dummy.xml +0 -0
  135. data/{support → spec/support}/test_app.rb +9 -2
  136. data/spec/task_spec.rb +27 -0
  137. data/spec/wayfarer_spec.rb +2 -13
  138. data/wayfarer.gemspec +39 -42
  139. metadata +191 -368
  140. data/.travis.yml +0 -5
  141. data/Changelog.md +0 -10
  142. data/README.md +0 -21
  143. data/benchmark/frontiers.rb +0 -143
  144. data/docs/.gitignore +0 -2
  145. data/docs/_config.yml +0 -15
  146. data/docs/_includes/base.html +0 -7
  147. data/docs/_includes/head.html +0 -10
  148. data/docs/_includes/navigation.html +0 -187
  149. data/docs/_layouts/default.html +0 -42
  150. data/docs/_sass/base.scss +0 -439
  151. data/docs/_sass/variables.scss +0 -24
  152. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +0 -19
  153. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +0 -425
  154. data/docs/_sass/vendor/bourbon/_bourbon.scss +0 -90
  155. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +0 -29
  156. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +0 -48
  157. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +0 -28
  158. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +0 -28
  159. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +0 -69
  160. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +0 -25
  161. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +0 -30
  162. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +0 -31
  163. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +0 -27
  164. data/docs/_sass/vendor/bourbon/addons/_margin.scss +0 -29
  165. data/docs/_sass/vendor/bourbon/addons/_padding.scss +0 -29
  166. data/docs/_sass/vendor/bourbon/addons/_position.scss +0 -51
  167. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +0 -66
  168. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +0 -27
  169. data/docs/_sass/vendor/bourbon/addons/_size.scss +0 -56
  170. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +0 -118
  171. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +0 -34
  172. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +0 -63
  173. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +0 -29
  174. data/docs/_sass/vendor/bourbon/css3/_animation.scss +0 -61
  175. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +0 -5
  176. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +0 -5
  177. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +0 -44
  178. data/docs/_sass/vendor/bourbon/css3/_background.scss +0 -57
  179. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +0 -61
  180. data/docs/_sass/vendor/bourbon/css3/_calc.scss +0 -6
  181. data/docs/_sass/vendor/bourbon/css3/_columns.scss +0 -67
  182. data/docs/_sass/vendor/bourbon/css3/_filter.scss +0 -6
  183. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +0 -327
  184. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +0 -29
  185. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +0 -6
  186. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +0 -12
  187. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +0 -6
  188. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +0 -15
  189. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +0 -38
  190. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +0 -40
  191. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +0 -12
  192. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +0 -10
  193. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +0 -40
  194. data/docs/_sass/vendor/bourbon/css3/_selection.scss +0 -44
  195. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +0 -27
  196. data/docs/_sass/vendor/bourbon/css3/_transform.scss +0 -21
  197. data/docs/_sass/vendor/bourbon/css3/_transition.scss +0 -81
  198. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +0 -5
  199. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +0 -16
  200. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +0 -25
  201. data/docs/_sass/vendor/bourbon/functions/_contains.scss +0 -31
  202. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +0 -16
  203. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +0 -26
  204. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +0 -16
  205. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +0 -23
  206. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +0 -74
  207. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +0 -24
  208. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +0 -26
  209. data/docs/_sass/vendor/bourbon/functions/_shade.scss +0 -24
  210. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +0 -22
  211. data/docs/_sass/vendor/bourbon/functions/_tint.scss +0 -24
  212. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +0 -37
  213. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +0 -32
  214. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +0 -26
  215. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +0 -108
  216. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +0 -53
  217. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +0 -24
  218. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +0 -35
  219. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +0 -51
  220. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +0 -77
  221. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +0 -41
  222. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +0 -74
  223. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +0 -55
  224. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +0 -28
  225. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +0 -31
  226. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +0 -15
  227. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +0 -55
  228. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +0 -7
  229. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +0 -8
  230. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +0 -9
  231. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +0 -1
  232. data/docs/_sass/vendor/neat/_neat-helpers.scss +0 -11
  233. data/docs/_sass/vendor/neat/_neat.scss +0 -23
  234. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +0 -49
  235. data/docs/_sass/vendor/neat/functions/_private.scss +0 -114
  236. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +0 -15
  237. data/docs/_sass/vendor/neat/grid/_direction-context.scss +0 -33
  238. data/docs/_sass/vendor/neat/grid/_display-context.scss +0 -28
  239. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +0 -22
  240. data/docs/_sass/vendor/neat/grid/_media.scss +0 -92
  241. data/docs/_sass/vendor/neat/grid/_omega.scss +0 -87
  242. data/docs/_sass/vendor/neat/grid/_outer-container.scss +0 -34
  243. data/docs/_sass/vendor/neat/grid/_pad.scss +0 -25
  244. data/docs/_sass/vendor/neat/grid/_private.scss +0 -35
  245. data/docs/_sass/vendor/neat/grid/_row.scss +0 -52
  246. data/docs/_sass/vendor/neat/grid/_shift.scss +0 -50
  247. data/docs/_sass/vendor/neat/grid/_span-columns.scss +0 -94
  248. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +0 -97
  249. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +0 -42
  250. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +0 -25
  251. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +0 -13
  252. data/docs/_sass/vendor/neat/settings/_grid.scss +0 -51
  253. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +0 -27
  254. data/docs/_sass/vendor/normalize-3.0.2.scss +0 -427
  255. data/docs/_sass/vendor/pygments.scss +0 -356
  256. data/docs/automating_browsers/capybara.md +0 -70
  257. data/docs/css/screen.scss +0 -7
  258. data/docs/guides/cli.md +0 -52
  259. data/docs/guides/frontiers.md +0 -93
  260. data/docs/guides/halting.md +0 -23
  261. data/docs/guides/job_queues.md +0 -26
  262. data/docs/guides/locals.md +0 -36
  263. data/docs/guides/logging.md +0 -22
  264. data/docs/guides/page_objects.md +0 -67
  265. data/docs/guides/peeking.md +0 -46
  266. data/docs/guides/selenium_capybara.md +0 -100
  267. data/docs/guides/tutorial.md +0 -452
  268. data/docs/js/navigation.js +0 -11
  269. data/docs/misc/contributing.md +0 -20
  270. data/docs/misc/testing.md +0 -11
  271. data/docs/recipes/authentication.md +0 -23
  272. data/docs/recipes/csv.md +0 -29
  273. data/docs/recipes/javascript.md +0 -20
  274. data/docs/recipes/multiple_uris.md +0 -18
  275. data/docs/recipes/screenshots.md +0 -20
  276. data/docs/routing/custom_rules.md +0 -16
  277. data/docs/routing/filetypes_rules.md +0 -21
  278. data/docs/routing/host_rules.md +0 -24
  279. data/docs/routing/path_rules.md +0 -33
  280. data/docs/routing/protocol_rules.md +0 -17
  281. data/docs/routing/query_rules.md +0 -69
  282. data/docs/routing/routes.md +0 -96
  283. data/docs/routing/uri_rules.md +0 -18
  284. data/examples/collect_github_issues.rb +0 -65
  285. data/examples/find_foobar_on_wikipedia.rb +0 -23
  286. data/lib/wayfarer/configuration.rb +0 -86
  287. data/lib/wayfarer/crawl.rb +0 -79
  288. data/lib/wayfarer/crawl_observer.rb +0 -103
  289. data/lib/wayfarer/dispatcher.rb +0 -104
  290. data/lib/wayfarer/finders.rb +0 -61
  291. data/lib/wayfarer/frontiers/frontier.rb +0 -79
  292. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +0 -32
  293. data/lib/wayfarer/frontiers/memory_frontier.rb +0 -76
  294. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +0 -39
  295. data/lib/wayfarer/frontiers/normalize_uris.rb +0 -48
  296. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +0 -34
  297. data/lib/wayfarer/frontiers/redis_frontier.rb +0 -83
  298. data/lib/wayfarer/http_adapters/adapter_pool.rb +0 -62
  299. data/lib/wayfarer/http_adapters/net_http_adapter.rb +0 -77
  300. data/lib/wayfarer/http_adapters/selenium_adapter.rb +0 -80
  301. data/lib/wayfarer/job.rb +0 -211
  302. data/lib/wayfarer/locals.rb +0 -40
  303. data/lib/wayfarer/parsers/json_parser.rb +0 -20
  304. data/lib/wayfarer/parsers/xml_parser.rb +0 -27
  305. data/lib/wayfarer/processor.rb +0 -103
  306. data/lib/wayfarer/routing/custom_rule.rb +0 -21
  307. data/lib/wayfarer/routing/filetypes_rule.rb +0 -20
  308. data/lib/wayfarer/routing/host_rule.rb +0 -19
  309. data/lib/wayfarer/routing/path_rule.rb +0 -54
  310. data/lib/wayfarer/routing/protocol_rule.rb +0 -21
  311. data/lib/wayfarer/routing/router.rb +0 -71
  312. data/lib/wayfarer/routing/rule.rb +0 -114
  313. data/lib/wayfarer/routing/uri_rule.rb +0 -21
  314. data/spec/configuration_spec.rb +0 -26
  315. data/spec/crawl_spec.rb +0 -48
  316. data/spec/finders_spec.rb +0 -49
  317. data/spec/frontiers/memory_bloomfilter_spec.rb +0 -6
  318. data/spec/frontiers/memory_frontier_spec.rb +0 -6
  319. data/spec/frontiers/memory_trie_frontier_spec.rb +0 -6
  320. data/spec/frontiers/normalize_uris_spec.rb +0 -59
  321. data/spec/frontiers/redis_bloomfilter_spec.rb +0 -6
  322. data/spec/frontiers/redis_frontier_spec.rb +0 -6
  323. data/spec/http_adapters/adapter_pool_spec.rb +0 -33
  324. data/spec/http_adapters/net_http_adapter_spec.rb +0 -83
  325. data/spec/http_adapters/selenium_adapter_spec.rb +0 -53
  326. data/spec/integration/callbacks_spec.rb +0 -42
  327. data/spec/integration/locals_spec.rb +0 -106
  328. data/spec/integration/peeking_spec.rb +0 -61
  329. data/spec/job_spec.rb +0 -122
  330. data/spec/processor_spec.rb +0 -31
  331. data/spec/routing/custom_rule_spec.rb +0 -26
  332. data/spec/routing/host_rule_spec.rb +0 -48
  333. data/spec/routing/path_rule_spec.rb +0 -66
  334. data/spec/routing/protocol_rule_spec.rb +0 -26
  335. data/spec/routing/router_spec.rb +0 -67
  336. data/spec/routing/rule_spec.rb +0 -251
  337. data/spec/routing/uri_rule_spec.rb +0 -24
  338. data/spec/shared/frontier.rb +0 -96
  339. data/wayfarer-jruby.gemspec +0 -49
data/docs/misc/testing.md DELETED
@@ -1,11 +0,0 @@
1
- ---
2
- layout: default
3
- title: Testing
4
- ---
5
-
6
- # Testing
7
-
8
- Tests run on MRI 2.3.1 and JRuby 9.1.6.0.
9
-
10
- * Run `rake -T` to list all available (test) tasks.
11
- * When running tests, an HTTP server binds to port 9876 for integration-level tests.
@@ -1,23 +0,0 @@
1
- ---
2
- layout: default
3
- title: Authentication
4
- ---
5
-
6
- # Authentication
7
-
8
- Authentication is best handled with the `setup_adapter` callback. See [Callbacks]({{base}}/guides/callbacks.html).
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- config.http_adapter = :selenium
13
-
14
- setup_adapter do |_, _, browser|
15
- browser.visit("https://foo.com/login")
16
-
17
- browser.fill_in("E-mail", with: "foo@bar.com")
18
- browser.fill_in("Password", with: "password")
19
-
20
- browser.click_button("Log in")
21
- end
22
- end
23
- {% endhighlight %}
data/docs/recipes/csv.md DELETED
@@ -1,29 +0,0 @@
1
- ---
2
- layout: default
3
- title: CSV output
4
- ---
5
-
6
- # CSV output
7
-
8
- See:
9
-
10
- * [Locals]({{base}}/guides/locals.html)
11
- * [Callbacks]({{base}}/guides/callbacks.html)
12
-
13
- {% highlight ruby %}
14
- require "csv" # from Ruby's standard lib
15
-
16
- class DummyJob < Wayfarer::Job
17
- let(:records) { [] }
18
-
19
- after_crawl do
20
- CSV.open("output.csv", "w") do |csv|
21
- records.each { |r| csv << [r[:id], r[:name]] }
22
- end
23
- end
24
-
25
- def detail
26
- records << { id: ..., name: ... }
27
- end
28
- end
29
- {% endhighlight %}
@@ -1,20 +0,0 @@
1
- ---
2
- layout: default
3
- title: Executing JavaScript
4
- ---
5
-
6
- # Executing JavaScript
7
-
8
- In order to execute JavaScript in a page's DOM context, use the Selenium HTTP adapter and call `#execute_script` on the WebDriver object:
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- config.http_adapter = :selenium
13
-
14
- # ...
15
-
16
- def foo
17
- pathname = driver.execute_script("return window.location.pathname")
18
- end
19
- end
20
- {% endhighlight %}
@@ -1,18 +0,0 @@
1
- ---
2
- layout: default
3
- title: Starting from multiple URIs
4
- ---
5
-
6
- # Starting from multiple URIs
7
-
8
- You can pass in as many URIs as desired when performing jobs:
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- # ...
13
- end
14
-
15
- uris = [...]
16
-
17
- DummyJob.perform_now(*uris)
18
- {% endhighlight %}
@@ -1,20 +0,0 @@
1
- ---
2
- layout: default
3
- title: Taking screenshots
4
- ---
5
-
6
- # Taking screenshots
7
-
8
- In order to take screenshots, use the Selenium HTTP adapter and call `#save_screenshot` on the WebDriver object:
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- config.http_adapter = :selenium
13
-
14
- # ...
15
-
16
- def foo
17
- driver.save_screenshot("my_screenshot.png")
18
- end
19
- end
20
- {% endhighlight %}
@@ -1,16 +0,0 @@
1
- ---
2
- layout: default
3
- title: Custom rules
4
- ---
5
-
6
- # Custom rules
7
-
8
- Custom rules take a block that gets yielded the URI or an object that responds to `#call(uri)`. If the block or the delegate return a truthy value, the rule matches.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.if -> (uri) { uri.host == uri.host.reverse }
13
- end
14
- {% endhighlight %}
15
-
16
- * Matches only URIs with palindrome hosts
@@ -1,21 +0,0 @@
1
- ---
2
- layout: default
3
- title: Filetypes rules
4
- ---
5
-
6
- # Filetypes rules
7
-
8
- Filetypes rules match against the URI path's file extension.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.filetypes [:png, :jpg], to: :image
13
- route.forbid.filetypes [:php, :js]
14
- end
15
- {% endhighlight %}
16
-
17
- Matches:
18
-
19
- * `http://example.com/foo.png`
20
- * `http://example.com/foo.jpg`
21
- * `https://example.com/qux/bar.jpg`
@@ -1,24 +0,0 @@
1
- ---
2
- layout: default
3
- title: Host rules
4
- ---
5
-
6
- # Host rules
7
-
8
- Host rules match against a host string or RegExp.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.host "example.com"
13
- route.host /example/
14
- end
15
- {% endhighlight %}
16
-
17
- Matches:
18
-
19
- * All URIs hosted on `"example.com"`.
20
- * All URIs whose host contains `"example"`.
21
-
22
- <aside class="note">
23
- <code>"www.host.net"</code> and <code>"host.net"</code> are not considered equal. You have to specify the exact host when using strings. Consider using <code>/host.net/</code> instead.
24
- </aside>
@@ -1,33 +0,0 @@
1
- ---
2
- layout: default
3
- title: Path rules
4
- ---
5
-
6
- # Path rules
7
-
8
- Path rules match against the path of a URI. Both strings and RegExps are accepted, and path segment pattern matching and RegExp captures are supported.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.path "/:alpha/:beta", to: :foo
13
- route.path /^foobar\/(.+)/, to: :bar
14
-
15
- def foo
16
- params[:alpha]
17
- params[:beta]
18
- end
19
-
20
- def bar
21
- params["0"]
22
- end
23
- end
24
- {% endhighlight %}
25
-
26
- Matches:
27
-
28
- * All URIs with path segments matching `/:alpha/:beta`, e.g. `https://example.com/foo/bar`
29
- * All URIs starting with `"/foobar/"`.
30
-
31
- <aside class="note">
32
- <code>/:alpha/:beta</code> and <code>:alpha/:beta</code> are not considered equal. Note the opening slash.
33
- </aside>
@@ -1,17 +0,0 @@
1
- ---
2
- layout: default
3
- title: Protocol rules
4
- ---
5
-
6
- # Protocol rules
7
-
8
- Protocol rules match against symbols/strings.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.protocol :https
13
- end
14
- {% endhighlight %}
15
-
16
- * Matches `https://example.com`.
17
- * Does not match `http://example.com`.
@@ -1,69 +0,0 @@
1
- ---
2
- layout: default
3
- title: Query rules
4
- ---
5
-
6
- # Query rules
7
-
8
- Query rules impose constraints on key-value query parameters. Strings, integers, RegExps and ranges are supported.
9
-
10
- ## String constraints
11
-
12
- {% highlight ruby %}
13
- class DummyJob < Wayfarer::Job
14
- route.query arg: "foo"
15
- end
16
- {% endhighlight %}
17
-
18
- * Matches `https://example.com?arg=foo`.
19
-
20
- ## Integer constraints
21
-
22
- {% highlight ruby %}
23
- class DummyJob < Wayfarer::Job
24
- route.query arg: 42
25
- end
26
- {% endhighlight %}
27
-
28
- * Matches `https://example.com?arg=42`.
29
-
30
- ---
31
-
32
- ## RegExp constraints
33
-
34
- {% highlight ruby %}
35
- class DummyJob < Wayfarer::Job
36
- route.query arg: /foo/
37
- end
38
- {% endhighlight %}
39
-
40
- * Matches `https://example.com?arg=foo`.
41
- * Matches `https://example.com?arg=foobar`.
42
-
43
- ---
44
-
45
- ## Range constraints
46
-
47
- {% highlight ruby %}
48
- class DummyJob < Wayfarer::Job
49
- route.query arg: 1..10
50
- end
51
- {% endhighlight %}
52
-
53
- * Matches `https://example.com?arg=1`.
54
- * Matches […]
55
- * Matches `https://example.com?arg=10`.
56
-
57
- ---
58
-
59
- ## Compound constraints
60
-
61
- {% highlight ruby %}
62
- class DummyJob < Wayfarer::Job
63
- route.query foo: 1..5, bar: /baz/, qux: "zot", toto: 2
64
- end
65
- {% endhighlight %}
66
-
67
- * Matches `https://example.com?foo=4&bar=bazqux&qux=zot&toto=2`.
68
-
69
- ---
@@ -1,96 +0,0 @@
1
- ---
2
- layout: default
3
- title: Routes
4
- categories: [Routing]
5
- ---
6
-
7
- # Routes
8
-
9
- * Routes are filters for interesting URIs.
10
- * Routes put constraints on URIs that should get processed.
11
- * Routes map URIs to instance methods (actions).
12
- * Routes are tree nodes and thus nestable.
13
-
14
- Currently, the following rules are available:
15
-
16
- * [URI rules](uri_rules.html) match URIs against a string.
17
- * [Host rules](/routing/host_rules.html) match hosts against strings and RegExps.
18
- * [Path rules](/routing/path_rules.html) match paths against pattern strings and RegExps. They support path segment capturing.
19
- * [Query rules](/routing/query_rules.html) match key-value pairs of query parameters against strings, integers, RegExps and ranges.
20
-
21
- Routes can be forbidden. URIs that match forbidden rules are never processed.
22
-
23
- ## Route declaration
24
-
25
- ### Declaration order matching
26
-
27
- {% highlight ruby %}
28
- class DummyJob < Wayfarer::Job
29
- route.host "example.com", to: :foo
30
- route.path "/foo", to: :bar
31
-
32
- # Is equivalent to:
33
- #
34
- # routes do
35
- # host "example.com", to: :foo
36
- # path "/foo", to: :bar
37
- # end
38
- end
39
- {% endhighlight %}
40
-
41
- * Dispatches `https://example.com/foo` to `:foo`.
42
- * Dispatches `https://example.com` to `:foo`.
43
- * Dispatches `https://yahoo.com/foo` to `:bar`.
44
-
45
- ---
46
-
47
- ### Nesting routes (child rules)
48
-
49
- A route matches if it has a child rule that matches. This applies recursively.
50
-
51
- {% highlight ruby %}
52
- class DummyJob < Wayfarer::Job
53
- route.host "example.com", to: :foo do
54
- path "/foo"
55
- end
56
-
57
- # Is equivalent to:
58
- #
59
- # route.host "example.com", path: "/foo", to: :foo
60
- # route.path "/foo", host: "example.com", to: :foo
61
- end
62
- {% endhighlight %}
63
-
64
- * Dispatches `https://example.com/foo` to `:foo`.
65
- * Does not dispatch `https://example.com`.
66
- * Does not dispatch `https://yahoo.com/foo`.
67
-
68
- ---
69
-
70
- ### Deepest routes override actions
71
-
72
- {% highlight ruby %}
73
- class DummyJob < Wayfarer::Job
74
- route.host "example.com", to: :foo do
75
- path "/foo", to: :bar
76
- end
77
- end
78
- {% endhighlight %}
79
-
80
- * Dispatches `https://example.com/foo` to `:bar`.
81
- * Does not dispatch `https://example.com`.
82
- * Does not dispatch `https://yahoo.com/foo`.
83
-
84
- ---
85
-
86
- ### Forbidding rules
87
-
88
- {% highlight ruby %}
89
- class DummyJob < Wayfarer::Job
90
- route.forbid.path "/foo"
91
- route.host "example.com", to: :foo
92
- end
93
- {% endhighlight %}
94
-
95
- * Dispatches `https://example.com` to `:foo`.
96
- * Does not dispatch `https://example.com/foo`.
@@ -1,18 +0,0 @@
1
- ---
2
- layout: default
3
- title: URI Rules
4
- ---
5
-
6
- # URI rules
7
-
8
- URI rules match against a string.
9
-
10
- {% highlight ruby %}
11
- class DummyJob < Wayfarer::Job
12
- route.uri "https://example.com"
13
- end
14
- {% endhighlight %}
15
-
16
- Matches:
17
-
18
- * Only `https://example.com`
@@ -1,65 +0,0 @@
1
- require_relative "../lib/wayfarer"
2
-
3
- class CollectGithubIssues < Wayfarer::Job
4
- config.connection_count = 4
5
- config.logger.level = :fatal
6
-
7
- let(:records) { [] }
8
-
9
- routes do
10
- host "github.com" do
11
- path "/:user/:repo", to: :repository
12
- path "/:user/:repo/issues", to: :index
13
- path "/:user/:repo/issues/:id", to: :show
14
- end
15
- end
16
-
17
- after_crawl do
18
- records.each do |issue|
19
- # Save them somewhere?
20
- puts issue
21
- end
22
- end
23
-
24
- def repository
25
- stage navigation_links
26
- end
27
-
28
- def index
29
- stage issue_listing_links, next_page
30
- end
31
-
32
- def show
33
- return halt if records.count > 30
34
-
35
- records << {
36
- id: params[:id],
37
- title: issue_title,
38
- author: issue_author
39
- }
40
- end
41
-
42
- private
43
-
44
- def issue_title
45
- doc.css(".js-issue-title").text.strip
46
- end
47
-
48
- def issue_author
49
- doc.css(".TableObject-item .author").text.strip
50
- end
51
-
52
- def navigation_links
53
- page.links ".reponav-item"
54
- end
55
-
56
- def issue_listing_links
57
- page.links ".issues-listing"
58
- end
59
-
60
- def next_page
61
- page.links ".next_page"
62
- end
63
- end
64
-
65
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
@@ -1,23 +0,0 @@
1
- require_relative "../lib/wayfarer"
2
-
3
- class FindFoobarOnWikipedia < Wayfarer::Job
4
- config.http_adapter = :selenium
5
- config.selenium_argv = [:chrome]
6
- config.connection_count = 4
7
-
8
- let(:keywords) { [] }
9
-
10
- route.host "en.wikipedia.org", to: :article
11
-
12
- def article
13
- if page.body =~ /Foobar/
14
- driver.save_screenshot("/tmp/foobar.png")
15
- return halt
16
- end
17
-
18
- keywords << page.keywords
19
- stage page.links
20
- end
21
- end
22
-
23
- FindFoobarOnWikipedia.perform_now("https://en.wikipedia.org/wiki/Special:Random")
@@ -1,86 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "ostruct"
4
- require "securerandom"
5
- require "forwardable"
6
-
7
- module Wayfarer
8
- class Configuration < OpenStruct
9
- extend Forwardable
10
-
11
- DEFAULTS = {
12
- # Print full stacktraces?
13
- print_stacktraces: true,
14
-
15
- # Crash when encountering unhandled exceptions?
16
- reraise_exceptions: false,
17
-
18
- # Allow processing URIs multiple times?
19
- allow_circulation: false,
20
-
21
- # How many HTTP connections/Selenium drivers to use
22
- # 1:1 correspondence with spawned threads
23
- connection_count: 1,
24
-
25
- # Which HTTP adapter to use. Supported are :net_http and :selenium
26
- http_adapter: :net_http,
27
-
28
- # Which frontier to use.
29
- frontier: :memory,
30
-
31
- # How long a thread may hold an HTTP adapter.
32
- # Threads that exceed this limit fail with an exception.
33
- connection_timeout: Float::INFINITY,
34
-
35
- # How many 3xx redirects to follow. Has no effect when using Selenium
36
- max_http_redirects: 3,
37
-
38
- # Argument vector for instantiating Selenium drivers
39
- selenium_argv: [:firefox],
40
-
41
- # Argument vector for instantiating a Redis connection
42
- redis_opts: {
43
- host: "localhost",
44
- port: 6379
45
- }.freeze,
46
-
47
- # Size of browser windows
48
- window_size: [1024, 768],
49
-
50
- # Which Mustermann pattern type to use when matching URI paths
51
- # TODO: Mention in docs
52
- mustermann_type: :sinatra,
53
-
54
- # Options for instantiating Bloomfilters
55
- bloomfilter_opts: {
56
- size: 100,
57
- hashes: 2,
58
- seed: 1,
59
- bucket: 3,
60
- raise: false
61
- },
62
-
63
- # Whether to normalize URIs
64
- normalize_uris: true,
65
-
66
- # URI normalization options
67
- # See: https://github.com/rwz/normalize_url
68
- normalize_uri_options: {}
69
- }.freeze
70
-
71
- attr_reader :uuid
72
-
73
- def initialize(overrides = {})
74
- super(DEFAULTS.merge(overrides))
75
- @uuid = SecureRandom.uuid
76
- end
77
-
78
- def logger
79
- @logger ||= Wayfarer.logger.dup
80
- end
81
-
82
- def reset!
83
- DEFAULTS.each { |key, val| self[key] = val }
84
- end
85
- end
86
- end
@@ -1,79 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "observer"
4
- require "normalize_url"
5
-
6
- module Wayfarer
7
- class Crawl
8
- extend Forwardable
9
- include Observable
10
-
11
- # The prepared job.
12
- # @!attribute [r] job
13
- attr_reader :job
14
-
15
- # @!attribute [r] dispatcher
16
- attr_reader :dispatcher
17
-
18
- delegate config: :job
19
- delegate logger: :config
20
-
21
- def initialize(job, *uris)
22
- @job = job.prepare
23
- @uris = uris
24
- @dispatcher = Dispatcher.new(@job)
25
- @processor = Processor.new(@job, frontier, @dispatcher)
26
- end
27
-
28
- def execute
29
- trap_signals
30
-
31
- CrawlObserver.new(@processor, @dispatcher, config.logger)
32
-
33
- @job.run_hook(:before_crawl)
34
- @processor.run(*@uris)
35
- @job.run_hook(:after_crawl)
36
- ensure
37
- untrap_signals
38
- end
39
-
40
- # A frontier with initially pre-staged URIs.
41
- # @return [Frontier]
42
- def frontier
43
- return @frontier if @frontier
44
-
45
- @frontier = case config.frontier
46
- when :memory_trie
47
- Frontiers::MemoryTrieFrontier.new(config)
48
- when :redis
49
- Frontiers::RedisFrontier.new(config)
50
- when :memory_bloom
51
- Frontiers::MemoryBloomfilter.new(config)
52
- when :redis_bloom
53
- Frontiers::RedisBloomfilter.new(config)
54
- else
55
- Frontiers::MemoryFrontier.new(config)
56
- end
57
-
58
- @frontier.extend(Frontiers::NormalizeURIs) if config.normalize_uris
59
-
60
- @frontier.stage(*@uris) # TODO: Test
61
-
62
- @frontier
63
- end
64
-
65
- private
66
-
67
- def trap_signals
68
- @cached_sigint_handler = trap(:INT) {
69
- halt!
70
- @cached_sigint_handler.try(:call)
71
- exit(-1)
72
- }
73
- end
74
-
75
- def untrap_signals
76
- trap(:INT) { @cached_sigint_handler.try(:call) }
77
- end
78
- end
79
- end