wayfarer 0.0.3 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (369) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ci.yaml +32 -0
  3. data/.gitignore +3 -4
  4. data/.rubocop.yml +25 -9
  5. data/.ruby-version +1 -1
  6. data/Dockerfile +5 -0
  7. data/Gemfile +1 -7
  8. data/Gemfile.lock +221 -0
  9. data/RELEASING.md +17 -0
  10. data/Rakefile +38 -90
  11. data/bin/wayfarer +1 -111
  12. data/docker-compose.yml +32 -0
  13. data/docs/cookbook/batch_routing.md +22 -0
  14. data/docs/cookbook/consent_screen.md +36 -0
  15. data/docs/cookbook/executing_javascript.md +41 -0
  16. data/docs/cookbook/querying_html.md +42 -0
  17. data/docs/cookbook/screenshots.md +27 -0
  18. data/docs/cookbook/user_agent.md +7 -0
  19. data/docs/guides/browser_automation/capybara.md +69 -0
  20. data/docs/guides/browser_automation/custom_adapters.md +100 -0
  21. data/docs/guides/browser_automation/ferrum.md +39 -0
  22. data/docs/guides/browser_automation/selenium.md +63 -0
  23. data/docs/guides/callbacks.md +131 -31
  24. data/docs/guides/configuration.md +24 -169
  25. data/docs/guides/debugging.md +17 -0
  26. data/docs/guides/error_handling.md +30 -45
  27. data/docs/guides/jobs.md +101 -0
  28. data/docs/guides/navigation.md +73 -0
  29. data/docs/guides/networking.md +94 -0
  30. data/docs/guides/pages.md +52 -0
  31. data/docs/guides/performance.md +130 -0
  32. data/docs/guides/reliability.md +41 -0
  33. data/docs/guides/routing/steering.md +30 -0
  34. data/docs/guides/tasks.md +14 -0
  35. data/docs/index.md +40 -66
  36. data/docs/reference/api/base.md +48 -0
  37. data/docs/reference/api/route.md +182 -0
  38. data/docs/reference/cli.md +61 -0
  39. data/docs/reference/configuration_keys.md +42 -0
  40. data/docs/reference/environment_variables.md +83 -0
  41. data/lib/wayfarer/base.rb +50 -0
  42. data/lib/wayfarer/callbacks.rb +71 -0
  43. data/lib/wayfarer/cli/base.rb +27 -0
  44. data/lib/wayfarer/cli/generate.rb +17 -0
  45. data/lib/wayfarer/cli/job.rb +60 -0
  46. data/lib/wayfarer/cli/route.rb +29 -0
  47. data/lib/wayfarer/cli/route_printer.rb +116 -0
  48. data/lib/wayfarer/cli/runner.rb +34 -0
  49. data/lib/wayfarer/cli/templates/Gemfile.tt +5 -0
  50. data/lib/wayfarer/cli/templates/job.rb.tt +10 -0
  51. data/lib/wayfarer/config/capybara.rb +10 -0
  52. data/lib/wayfarer/config/ferrum.rb +11 -0
  53. data/lib/wayfarer/config/networking.rb +26 -0
  54. data/lib/wayfarer/config/redis.rb +14 -0
  55. data/lib/wayfarer/config/root.rb +11 -0
  56. data/lib/wayfarer/config/selenium.rb +21 -0
  57. data/lib/wayfarer/config/strconv.rb +45 -0
  58. data/lib/wayfarer/config/struct.rb +72 -0
  59. data/lib/wayfarer/gc.rb +15 -0
  60. data/lib/wayfarer/middleware/chain.rb +19 -0
  61. data/lib/wayfarer/middleware/dedup.rb +25 -0
  62. data/lib/wayfarer/middleware/fetch.rb +47 -0
  63. data/lib/wayfarer/middleware/normalize.rb +25 -0
  64. data/lib/wayfarer/middleware/router.rb +53 -0
  65. data/lib/wayfarer/middleware/stage.rb +23 -0
  66. data/lib/wayfarer/middleware/worker.rb +30 -0
  67. data/lib/wayfarer/networking/capybara.rb +28 -0
  68. data/lib/wayfarer/networking/context.rb +36 -0
  69. data/lib/wayfarer/networking/ferrum.rb +35 -0
  70. data/lib/wayfarer/networking/http.rb +34 -0
  71. data/lib/wayfarer/networking/pool.rb +40 -0
  72. data/lib/wayfarer/networking/result.rb +18 -0
  73. data/lib/wayfarer/networking/selenium.rb +43 -0
  74. data/lib/wayfarer/networking/strategy.rb +38 -0
  75. data/lib/wayfarer/page.rb +17 -74
  76. data/lib/wayfarer/parsing/json.rb +17 -0
  77. data/lib/wayfarer/parsing/xml.rb +17 -0
  78. data/lib/wayfarer/redis/.#barrier.rb +1 -0
  79. data/lib/wayfarer/redis/barrier.rb +36 -0
  80. data/lib/wayfarer/redis/connection.rb +13 -0
  81. data/lib/wayfarer/redis/counter.rb +29 -0
  82. data/lib/wayfarer/redis/pool.rb +20 -0
  83. data/lib/wayfarer/redis/version.rb +19 -0
  84. data/lib/wayfarer/routing/dsl.rb +57 -0
  85. data/lib/wayfarer/routing/matchers/custom.rb +25 -0
  86. data/lib/wayfarer/routing/matchers/host.rb +19 -0
  87. data/lib/wayfarer/routing/matchers/path.rb +49 -0
  88. data/lib/wayfarer/routing/matchers/query.rb +63 -0
  89. data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
  90. data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
  91. data/lib/wayfarer/routing/matchers/url.rb +17 -0
  92. data/lib/wayfarer/routing/path_finder.rb +46 -0
  93. data/lib/wayfarer/routing/result.rb +15 -0
  94. data/lib/wayfarer/routing/root_route.rb +7 -0
  95. data/lib/wayfarer/routing/route.rb +47 -0
  96. data/lib/wayfarer/routing/router.rb +10 -54
  97. data/lib/wayfarer/routing/target_route.rb +7 -0
  98. data/lib/wayfarer/serializer.rb +17 -0
  99. data/lib/wayfarer/stringify.rb +47 -0
  100. data/lib/wayfarer/task.rb +34 -0
  101. data/lib/wayfarer.rb +48 -57
  102. data/mkdocs.yml +47 -0
  103. data/requirements.txt +1 -0
  104. data/spec/base_spec.rb +233 -0
  105. data/spec/callbacks_spec.rb +102 -0
  106. data/spec/cli/generate_spec.rb +39 -0
  107. data/spec/cli/job_spec.rb +74 -0
  108. data/spec/cli/version_spec.rb +13 -0
  109. data/spec/config/capybara_spec.rb +18 -0
  110. data/spec/config/ferrum_spec.rb +24 -0
  111. data/spec/config/networking_spec.rb +73 -0
  112. data/spec/config/redis_spec.rb +32 -0
  113. data/spec/config/root_spec.rb +31 -0
  114. data/spec/config/selenium_spec.rb +56 -0
  115. data/spec/config/strconv_spec.rb +58 -0
  116. data/spec/config/struct_spec.rb +66 -0
  117. data/spec/factories/middleware.rb +15 -0
  118. data/spec/factories/page.rb +78 -0
  119. data/spec/factories/task.rb +12 -0
  120. data/spec/fixtures/dummy_job.rb +7 -0
  121. data/spec/gc_spec.rb +63 -0
  122. data/spec/middleware/chain_spec.rb +96 -0
  123. data/spec/middleware/dedup_spec.rb +76 -0
  124. data/spec/middleware/fetch_spec.rb +100 -0
  125. data/spec/middleware/normalize_spec.rb +28 -0
  126. data/spec/middleware/router_spec.rb +80 -0
  127. data/spec/middleware/stage_spec.rb +39 -0
  128. data/spec/middleware/worker_spec.rb +117 -0
  129. data/spec/networking/capybara_spec.rb +12 -0
  130. data/spec/networking/context_spec.rb +127 -0
  131. data/spec/networking/ferrum_spec.rb +12 -0
  132. data/spec/networking/http_spec.rb +12 -0
  133. data/spec/networking/pool_spec.rb +67 -0
  134. data/spec/networking/selenium_spec.rb +12 -0
  135. data/spec/networking/strategy.rb +170 -0
  136. data/spec/page_spec.rb +21 -12
  137. data/spec/{parsers/json_parser_spec.rb → parsing/json_spec.rb} +5 -4
  138. data/spec/{parsers/xml_parser_spec.rb → parsing/xml_spec.rb} +3 -2
  139. data/spec/redis/barrier_spec.rb +78 -0
  140. data/spec/redis/counter_spec.rb +32 -0
  141. data/spec/redis/pool_spec.rb +18 -0
  142. data/spec/redis/version_spec.rb +13 -0
  143. data/spec/routing/dsl_spec.rb +98 -0
  144. data/spec/routing/integration_spec.rb +110 -0
  145. data/spec/routing/matchers/custom_spec.rb +31 -0
  146. data/spec/routing/matchers/host_spec.rb +49 -0
  147. data/spec/routing/matchers/path_spec.rb +43 -0
  148. data/spec/routing/matchers/query_spec.rb +137 -0
  149. data/spec/routing/matchers/scheme_spec.rb +25 -0
  150. data/spec/routing/{filetypes_rule_spec.rb → matchers/suffix_spec.rb} +14 -13
  151. data/spec/routing/matchers/uri_spec.rb +27 -0
  152. data/spec/routing/path_finder_spec.rb +33 -0
  153. data/spec/routing/root_route_spec.rb +29 -0
  154. data/spec/routing/route_spec.rb +43 -0
  155. data/spec/routing/router_spec.rb +13 -56
  156. data/spec/spec_helpers.rb +73 -38
  157. data/spec/stringify_spec.rb +23 -0
  158. data/{support → spec/support}/static/finders.html +0 -0
  159. data/{support → spec/support}/static/graph/details/a.html +0 -0
  160. data/{support → spec/support}/static/graph/details/b.html +0 -0
  161. data/{support → spec/support}/static/graph/index.html +0 -0
  162. data/{support → spec/support}/static/json/dummy.json +0 -0
  163. data/{support → spec/support}/static/links/links.html +0 -0
  164. data/{support → spec/support}/static/xml/dummy.xml +0 -0
  165. data/{support → spec/support}/test_app.rb +9 -2
  166. data/spec/task_spec.rb +27 -0
  167. data/spec/wayfarer_spec.rb +2 -13
  168. data/wayfarer.gemspec +40 -42
  169. metadata +234 -361
  170. data/.travis.yml +0 -5
  171. data/Changelog.md +0 -10
  172. data/README.md +0 -21
  173. data/benchmark/frontiers.rb +0 -143
  174. data/docs/.gitignore +0 -2
  175. data/docs/_config.yml +0 -15
  176. data/docs/_includes/base.html +0 -7
  177. data/docs/_includes/head.html +0 -10
  178. data/docs/_includes/navigation.html +0 -187
  179. data/docs/_layouts/default.html +0 -42
  180. data/docs/_sass/base.scss +0 -439
  181. data/docs/_sass/variables.scss +0 -24
  182. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +0 -19
  183. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +0 -425
  184. data/docs/_sass/vendor/bourbon/_bourbon.scss +0 -90
  185. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +0 -29
  186. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +0 -48
  187. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +0 -28
  188. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +0 -28
  189. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +0 -69
  190. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +0 -25
  191. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +0 -30
  192. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +0 -31
  193. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +0 -27
  194. data/docs/_sass/vendor/bourbon/addons/_margin.scss +0 -29
  195. data/docs/_sass/vendor/bourbon/addons/_padding.scss +0 -29
  196. data/docs/_sass/vendor/bourbon/addons/_position.scss +0 -51
  197. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +0 -66
  198. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +0 -27
  199. data/docs/_sass/vendor/bourbon/addons/_size.scss +0 -56
  200. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +0 -118
  201. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +0 -34
  202. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +0 -63
  203. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +0 -29
  204. data/docs/_sass/vendor/bourbon/css3/_animation.scss +0 -61
  205. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +0 -5
  206. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +0 -5
  207. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +0 -44
  208. data/docs/_sass/vendor/bourbon/css3/_background.scss +0 -57
  209. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +0 -61
  210. data/docs/_sass/vendor/bourbon/css3/_calc.scss +0 -6
  211. data/docs/_sass/vendor/bourbon/css3/_columns.scss +0 -67
  212. data/docs/_sass/vendor/bourbon/css3/_filter.scss +0 -6
  213. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +0 -327
  214. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +0 -29
  215. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +0 -6
  216. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +0 -12
  217. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +0 -6
  218. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +0 -15
  219. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +0 -38
  220. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +0 -40
  221. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +0 -12
  222. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +0 -10
  223. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +0 -40
  224. data/docs/_sass/vendor/bourbon/css3/_selection.scss +0 -44
  225. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +0 -27
  226. data/docs/_sass/vendor/bourbon/css3/_transform.scss +0 -21
  227. data/docs/_sass/vendor/bourbon/css3/_transition.scss +0 -81
  228. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +0 -5
  229. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +0 -16
  230. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +0 -25
  231. data/docs/_sass/vendor/bourbon/functions/_contains.scss +0 -31
  232. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +0 -16
  233. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +0 -26
  234. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +0 -16
  235. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +0 -23
  236. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +0 -74
  237. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +0 -24
  238. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +0 -26
  239. data/docs/_sass/vendor/bourbon/functions/_shade.scss +0 -24
  240. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +0 -22
  241. data/docs/_sass/vendor/bourbon/functions/_tint.scss +0 -24
  242. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +0 -37
  243. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +0 -32
  244. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +0 -26
  245. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +0 -108
  246. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +0 -53
  247. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +0 -24
  248. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +0 -35
  249. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +0 -51
  250. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +0 -77
  251. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +0 -41
  252. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +0 -74
  253. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +0 -55
  254. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +0 -28
  255. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +0 -31
  256. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +0 -15
  257. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +0 -55
  258. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +0 -7
  259. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +0 -8
  260. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +0 -9
  261. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +0 -1
  262. data/docs/_sass/vendor/neat/_neat-helpers.scss +0 -11
  263. data/docs/_sass/vendor/neat/_neat.scss +0 -23
  264. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +0 -49
  265. data/docs/_sass/vendor/neat/functions/_private.scss +0 -114
  266. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +0 -15
  267. data/docs/_sass/vendor/neat/grid/_direction-context.scss +0 -33
  268. data/docs/_sass/vendor/neat/grid/_display-context.scss +0 -28
  269. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +0 -22
  270. data/docs/_sass/vendor/neat/grid/_media.scss +0 -92
  271. data/docs/_sass/vendor/neat/grid/_omega.scss +0 -87
  272. data/docs/_sass/vendor/neat/grid/_outer-container.scss +0 -34
  273. data/docs/_sass/vendor/neat/grid/_pad.scss +0 -25
  274. data/docs/_sass/vendor/neat/grid/_private.scss +0 -35
  275. data/docs/_sass/vendor/neat/grid/_row.scss +0 -52
  276. data/docs/_sass/vendor/neat/grid/_shift.scss +0 -50
  277. data/docs/_sass/vendor/neat/grid/_span-columns.scss +0 -94
  278. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +0 -97
  279. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +0 -42
  280. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +0 -25
  281. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +0 -13
  282. data/docs/_sass/vendor/neat/settings/_grid.scss +0 -51
  283. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +0 -27
  284. data/docs/_sass/vendor/normalize-3.0.2.scss +0 -427
  285. data/docs/_sass/vendor/pygments.scss +0 -356
  286. data/docs/automating_browsers/capybara.md +0 -70
  287. data/docs/css/screen.scss +0 -7
  288. data/docs/guides/cli.md +0 -52
  289. data/docs/guides/frontiers.md +0 -93
  290. data/docs/guides/halting.md +0 -23
  291. data/docs/guides/job_queues.md +0 -26
  292. data/docs/guides/locals.md +0 -36
  293. data/docs/guides/logging.md +0 -22
  294. data/docs/guides/page_objects.md +0 -67
  295. data/docs/guides/peeking.md +0 -46
  296. data/docs/guides/selenium_capybara.md +0 -100
  297. data/docs/guides/tutorial.md +0 -452
  298. data/docs/js/navigation.js +0 -11
  299. data/docs/misc/contributing.md +0 -20
  300. data/docs/misc/testing.md +0 -11
  301. data/docs/recipes/authentication.md +0 -23
  302. data/docs/recipes/csv.md +0 -29
  303. data/docs/recipes/javascript.md +0 -20
  304. data/docs/recipes/multiple_uris.md +0 -18
  305. data/docs/recipes/screenshots.md +0 -20
  306. data/docs/routing/custom_rules.md +0 -16
  307. data/docs/routing/filetypes_rules.md +0 -21
  308. data/docs/routing/host_rules.md +0 -24
  309. data/docs/routing/path_rules.md +0 -33
  310. data/docs/routing/protocol_rules.md +0 -17
  311. data/docs/routing/query_rules.md +0 -69
  312. data/docs/routing/routes.md +0 -96
  313. data/docs/routing/uri_rules.md +0 -18
  314. data/examples/collect_github_issues.rb +0 -65
  315. data/examples/find_foobar_on_wikipedia.rb +0 -23
  316. data/lib/wayfarer/configuration.rb +0 -86
  317. data/lib/wayfarer/crawl.rb +0 -79
  318. data/lib/wayfarer/crawl_observer.rb +0 -103
  319. data/lib/wayfarer/dispatcher.rb +0 -104
  320. data/lib/wayfarer/finders.rb +0 -61
  321. data/lib/wayfarer/frontiers/frontier.rb +0 -79
  322. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +0 -32
  323. data/lib/wayfarer/frontiers/memory_frontier.rb +0 -76
  324. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +0 -39
  325. data/lib/wayfarer/frontiers/normalize_uris.rb +0 -48
  326. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +0 -34
  327. data/lib/wayfarer/frontiers/redis_frontier.rb +0 -83
  328. data/lib/wayfarer/http_adapters/adapter_pool.rb +0 -62
  329. data/lib/wayfarer/http_adapters/net_http_adapter.rb +0 -77
  330. data/lib/wayfarer/http_adapters/selenium_adapter.rb +0 -80
  331. data/lib/wayfarer/job.rb +0 -211
  332. data/lib/wayfarer/locals.rb +0 -40
  333. data/lib/wayfarer/parsers/json_parser.rb +0 -20
  334. data/lib/wayfarer/parsers/xml_parser.rb +0 -27
  335. data/lib/wayfarer/processor.rb +0 -103
  336. data/lib/wayfarer/routing/custom_rule.rb +0 -21
  337. data/lib/wayfarer/routing/filetypes_rule.rb +0 -20
  338. data/lib/wayfarer/routing/host_rule.rb +0 -19
  339. data/lib/wayfarer/routing/path_rule.rb +0 -54
  340. data/lib/wayfarer/routing/protocol_rule.rb +0 -21
  341. data/lib/wayfarer/routing/query_rule.rb +0 -59
  342. data/lib/wayfarer/routing/rule.rb +0 -114
  343. data/lib/wayfarer/routing/uri_rule.rb +0 -21
  344. data/spec/configuration_spec.rb +0 -26
  345. data/spec/crawl_spec.rb +0 -48
  346. data/spec/finders_spec.rb +0 -49
  347. data/spec/frontiers/memory_bloomfilter_spec.rb +0 -6
  348. data/spec/frontiers/memory_frontier_spec.rb +0 -6
  349. data/spec/frontiers/memory_trie_frontier_spec.rb +0 -6
  350. data/spec/frontiers/normalize_uris_spec.rb +0 -59
  351. data/spec/frontiers/redis_bloomfilter_spec.rb +0 -6
  352. data/spec/frontiers/redis_frontier_spec.rb +0 -6
  353. data/spec/http_adapters/adapter_pool_spec.rb +0 -33
  354. data/spec/http_adapters/net_http_adapter_spec.rb +0 -83
  355. data/spec/http_adapters/selenium_adapter_spec.rb +0 -53
  356. data/spec/integration/callbacks_spec.rb +0 -42
  357. data/spec/integration/locals_spec.rb +0 -106
  358. data/spec/integration/peeking_spec.rb +0 -61
  359. data/spec/job_spec.rb +0 -122
  360. data/spec/processor_spec.rb +0 -31
  361. data/spec/routing/custom_rule_spec.rb +0 -26
  362. data/spec/routing/host_rule_spec.rb +0 -48
  363. data/spec/routing/path_rule_spec.rb +0 -66
  364. data/spec/routing/protocol_rule_spec.rb +0 -26
  365. data/spec/routing/query_rule_spec.rb +0 -124
  366. data/spec/routing/rule_spec.rb +0 -251
  367. data/spec/routing/uri_rule_spec.rb +0 -24
  368. data/spec/shared/frontier.rb +0 -96
  369. data/wayfarer-jruby.gemspec +0 -49
@@ -1,62 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "forwardable"
4
- require "connection_pool"
5
-
6
- module Wayfarer
7
- module HTTPAdapters
8
- # A connection pool that hands out HTTP adapters.
9
- # @private
10
- class AdapterPool
11
- extend Forwardable
12
-
13
- def initialize(job)
14
- @job = job
15
- @config = job.config
16
-
17
- size = @config.connection_count
18
- timeout = @config.connection_timeout
19
-
20
- @pool = ConnectionPool.new(
21
- size: size,
22
- timeout: timeout,
23
- &method(:instantiate_adapter)
24
- )
25
- end
26
-
27
- # Shuts down all HTTP adapters
28
- def free
29
- @pool.shutdown(&:free)
30
- end
31
-
32
- private
33
-
34
- def instantiate_adapter
35
- adapter = if @config.http_adapter == :selenium
36
- HTTPAdapters::SeleniumAdapter.new(@config)
37
- else
38
- HTTPAdapters::NetHTTPAdapter.instance(@config)
39
- end
40
-
41
- @job.run_hook(
42
- :setup_adapter,
43
- adapter,
44
- adapter.try(:driver),
45
- adapter.try(:browser)
46
- )
47
-
48
- adapter
49
- end
50
-
51
- def method_missing(method, *argv, &proc)
52
- super if method == :shutdown
53
- @pool.public_send(method, *argv, &proc)
54
- end
55
-
56
- def respond_to_missing?(method, private = false)
57
- return false if method == :shutdown
58
- @pool.respond_to?(method) || super
59
- end
60
- end
61
- end
62
- end
@@ -1,77 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "securerandom"
4
- require "net/http"
5
- require "net/http/persistent"
6
-
7
- module Wayfarer
8
- module HTTPAdapters
9
- # A singleton adapter for net-http-persistent.
10
- # @api private
11
- class NetHTTPAdapter
12
- # Supported standard lib classes
13
- RECOGNIZED_URI_TYPES = [
14
- URI::HTTP,
15
- URI::HTTPS
16
- ].freeze
17
-
18
- MalformedURI = Class.new(StandardError)
19
- MalformedRedirectURI = Class.new(StandardError)
20
- MaximumRedirectCountReached = Class.new(StandardError)
21
-
22
- attr_accessor :request_header_overrides
23
-
24
- # TODO: Remove default parameter value
25
- def self.instance(config = Wayfarer.config)
26
- @@instance ||= new(config)
27
- end
28
-
29
- def initialize(config)
30
- @config = config
31
- @conn = Net::HTTP::Persistent.new("wayfarer-#{SecureRandom.uuid}")
32
- end
33
-
34
- # This is a singleton class. Use ::instance instead.
35
- private_class_method :new
36
-
37
- # Fetches a page.
38
- # @return [Page]
39
- # @raise [MalformedURI] if the URI is not supported.
40
- # @raise [MalformedRedirectURI] if a redirection URI is not supported.
41
- # @raise [MaximumRedirectCountReached] if too many redirections are
42
- # encountered.
43
- def fetch(uri, redirects_followed = 0)
44
- if !RECOGNIZED_URI_TYPES.include?(uri.class)
45
- raise _ = if redirects_followed.positive?
46
- MalformedRedirectURI
47
- else
48
- MalformedURI
49
- end
50
- elsif redirects_followed > @config.max_http_redirects
51
- raise MaximumRedirectCountReached
52
- end
53
-
54
- res = @conn.request(uri)
55
-
56
- if res.is_a? Net::HTTPRedirection
57
- redirect_uri = URI(res["location"])
58
- return fetch(redirect_uri, redirects_followed + 1)
59
- end
60
-
61
- Page.new(
62
- uri: uri,
63
- status_code: res.code.to_i,
64
- body: res.body,
65
- headers: res.to_hash
66
- )
67
- rescue SocketError
68
- raise MalformedURI
69
- end
70
-
71
- # Shuts down all connections.
72
- def free
73
- @conn.shutdown
74
- end
75
- end
76
- end
77
- end
@@ -1,80 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "selenium-webdriver"
4
- require "selenium/emulated_features"
5
- require "capybara"
6
-
7
- module Wayfarer
8
- module HTTPAdapters
9
- # An adapter for Selenium WebDrivers
10
- # @api private
11
- class SeleniumAdapter
12
- # @!attribute [r] driver
13
- # @return [URI] the Selenium WebDriver.
14
- attr_reader :driver
15
-
16
- def initialize(config = Wayfarer.config)
17
- @config = config
18
- end
19
-
20
- # Fetches a page.
21
- # @return [Page]
22
- def fetch(uri)
23
- driver.navigate.to(uri)
24
-
25
- Page.new(
26
- uri: @driver.current_url,
27
- status_code: @driver.response_code,
28
- body: @driver.page_source,
29
- headers: @driver.response_headers
30
- )
31
- end
32
-
33
- # Closes the driver.
34
- def reload!
35
- @driver&.close
36
- @driver = nil
37
- end
38
-
39
- # Quits the browser.
40
- def free
41
- @driver&.quit
42
- @driver = nil
43
- end
44
-
45
- # The WebDriver.
46
- def driver
47
- @driver ||= instantiate_driver
48
- end
49
-
50
- # A Capybara driver that wraps the {#driver}.
51
- # @see https://github.com/teamcapybara/capybara Capybara
52
- def browser
53
- @browser ||= instantiate_capybara_driver
54
- end
55
-
56
- private
57
-
58
- def instantiate_driver
59
- driver = Selenium::WebDriver.for(*@config.selenium_argv)
60
- driver.manage.window.size = Selenium::WebDriver::Dimension.new(
61
- *@config.window_size
62
- )
63
- driver
64
- end
65
-
66
- def instantiate_capybara_driver
67
- Capybara.run_server = false
68
- Capybara.current_driver = :selenium
69
-
70
- capybara_driver = Capybara::Selenium::Driver.new(nil)
71
- capybara_driver.instance_variable_set(:@browser, driver)
72
-
73
- session = Capybara::Session.new(:selenium, nil)
74
- session.instance_variable_set(:@driver, capybara_driver)
75
-
76
- session
77
- end
78
- end
79
- end
80
- end
data/lib/wayfarer/job.rb DELETED
@@ -1,211 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "forwardable"
4
- require "hooks"
5
- require "active_job"
6
-
7
- # TODO: I only want deep_dup
8
- require "active_support/all"
9
-
10
- module Wayfarer
11
- # A {Job} is a class that has a {Routing::Router} with many {Routing::Rule}s
12
- # which are matched against a URI. Rules map URIs onto job instance methods.
13
- # Under the hood, jobs are instantiated within separate threads by a
14
- # {Processor}. Every instance gets its own thread. If a URI is matched, its
15
- # {Page} is retrieved, and made available to instance methods via {#page}.
16
- #
17
- # Jobs implement ActiveJob's Job API and are therefore compatible with a wide
18
- # range of job queues. To run a job immediately, call ::perform_now.
19
- # enqueue a job, call ::perform_later.
20
- #
21
- # @see https://github.com/rails/rails/tree/master/activejob rails/activejob
22
- # @see http://edgeguides.rubyonrails.org/active_job_basics.html ActiveJob Basics
23
- class Job < ActiveJob::Base
24
- extend Forwardable
25
-
26
- include Hooks
27
- include Locals
28
-
29
- # @!group Callbacks
30
-
31
- # Callback that fires __once__ before any pages are retrieved.
32
- # @method before_crawl
33
- # @scope class
34
- define_hook :before_crawl
35
-
36
- # Callback that fires __once__ after all pages have been retrieved and
37
- # processing is done.
38
- # @method after_crawl
39
- # @scope class
40
- define_hook :after_crawl
41
-
42
- # Callback that fires when HTTP adapters are instantiated.
43
- # @method setup_adapter
44
- # @scope class
45
- # @yield [[HTTPAdapters::NetHTTPAdapter, HTTPAdapters::SeleniumAdapter], [Selenium::WebDriver::Driver, nil], [Capybara::Selenium::Driver, nil]]
46
- define_hooks :setup_adapter
47
-
48
- # @!endgroup
49
-
50
- class << self
51
- extend Forwardable
52
-
53
- # @!attribute [w] router
54
- attr_writer :router
55
-
56
- # @!attribute [w] config
57
- attr_writer :config
58
-
59
- # Returns a class copy.
60
- def prepare
61
- duplicate = dup
62
- duplicate.router = router.dup
63
- duplicate.locals = locals.deep_dup
64
- duplicate.config = config.dup
65
-
66
- duplicate.locals.each do |(key, val)|
67
- duplicate.locals[key] = Locals.thread_safe_counterpart(val)
68
- end
69
-
70
- duplicate.locals.each do |(key, _)|
71
- duplicate.send(:define_method, key) do duplicate.locals[key] end
72
- duplicate.send(:define_singleton_method, key) do
73
- duplicate.locals[key]
74
- end
75
- end
76
-
77
- duplicate
78
- end
79
-
80
- # A configuration based off the global {Wayfarer.config}.
81
- # @yield [Configuration]
82
- # @return [Configuration]
83
- def config
84
- @config ||= Wayfarer.config.clone
85
- yield(@config) if block_given?
86
- @config
87
- end
88
-
89
- # A router.
90
- # If a block is passed in, it is evaluated within the {Router}'s instance.
91
- # @return [Routing::Router]
92
- def router(&proc)
93
- @router ||= Routing::Router.new
94
- @router.instance_eval(&proc) if block_given?
95
- @router
96
- end
97
-
98
- alias route router
99
- alias routes router
100
-
101
- # Overshadows ActiveJob::Base's own logger
102
- delegate logger: :config
103
- end
104
-
105
- # @!attribute [r] staged_uris
106
- # @return [Array<String>, Array<URI>] URIs to stage for the next cycle.
107
- # @see #stage
108
- attr_reader :staged_uris
109
-
110
- # @!attribute [rw] page
111
- attr_writer :page
112
-
113
- # @!attribute [rw] adapter
114
- attr_accessor :adapter
115
-
116
- # @!attribute [rw] params
117
- attr_accessor :params
118
-
119
- def initialize(*argv)
120
- @halts = false
121
- @staged_uris = []
122
- super(*argv)
123
- end
124
-
125
- # Whether this job will stop processing.
126
- def halts?
127
- @halts
128
- end
129
-
130
- # Performs this job.
131
- # @note ActiveJob API
132
- # @override
133
- def perform(*uris)
134
- Crawl.new(self.class, *uris).execute
135
- end
136
-
137
- protected
138
-
139
- # All following instance methods are available within actions.
140
-
141
- # Sets a halting flag that signals the processor to stop its work.
142
- def halt
143
- @halts = true
144
- end
145
-
146
- # Adds URIs to process in the next cycle.
147
- # If a relative path is given, an absolute URI is constructed from the
148
- # current {#page}'s URI.
149
- # @param [String, URI, Array<String>, Array<URI>]
150
- def stage(*uris)
151
- expanded = uris.flatten.map do |u|
152
- if (uri = URI(u)).absolute?
153
- uri
154
- else
155
- # URI#join would discard the path of page.uri.path
156
- current = page.uri.dup
157
- current.path = File.join(page.uri.path, uri.path)
158
- current
159
- end
160
- end
161
-
162
- # This method has somewhat become the guard keeper for invalid URIs that
163
- # would lead to exceptions otherwise down the line
164
- supported = expanded.select do |uri|
165
- HTTPAdapters::NetHTTPAdapter::RECOGNIZED_URI_TYPES.any? do |type|
166
- uri.is_a?(type)
167
- end
168
- end
169
-
170
- @staged_uris.push(*supported)
171
- end
172
-
173
- # The {Page} representing the URI currently processed by an action.
174
- # When using the Selenium adapter, {Page#body} gets refreshed on every call.
175
- # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
176
- # invisible.
177
- # @return Page
178
- def page
179
- return @page unless self.class.config.http_adapter == :selenium
180
-
181
- Page.new(
182
- uri: @page.uri,
183
- status_code: @page.uri,
184
- body: driver.page_source,
185
- headers: @page.headers
186
- )
187
- end
188
-
189
- # The parsed response body.
190
- # When using the Selenium adapter, this parses the body again on every call.
191
- # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
192
- # invisible.
193
- # @method doc
194
- # @see Page#doc
195
- delegate doc: :page
196
-
197
- # The Selenium WebDriver.
198
- # @method driver
199
- # @see https://github.com/peterc/pismo Pismo
200
- # @see Page#driver
201
- delegate driver: :adapter
202
-
203
- # A Capybara driver that wraps the {#driver}.
204
- # @method browser
205
- # @see HTTPAdapters::SeleniumAdapter#browser
206
- delegate browser: :adapter
207
-
208
- # @method logger
209
- delegate logger: :"self.class"
210
- end
211
- end
@@ -1,40 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "concurrent"
4
-
5
module Wayfarer
  # Gives including classes a class-level Hash of named values ("locals").
  # @api private
  module Locals
    # Maps a value onto a thread-safe stand-in from the Concurrent namespace.
    # Arrays, hashes, booleans and integers are wrapped; every other value is
    # returned as is.
    # @param [Object] value
    # @return [Object]
    def self.thread_safe_counterpart(value)
      case value
      when Array then Concurrent::Array.new(value)
      when Hash then Concurrent::Hash[value]
      when true, false then Concurrent::AtomicBoolean.new(value)
      when Integer then Concurrent::AtomicFixnum.new(value)
      else value
      end
    end

    # Extends the including class with {ClassMethods}.
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Replaces all locals at once.
      # @!attribute [w] locals
      attr_writer :locals

      # Stores the block's return value under the given key.
      # @param [Object] key
      # @raise [RuntimeError] if no block is given.
      # @return [Object] the stored value.
      def let(key)
        raise "#let called without a block" unless block_given?

        locals[key] = yield
      end

      # The locals defined so far; an empty Hash is created on first access.
      # @return [Hash]
      def locals
        @locals ||= {}
      end
    end
  end
end
@@ -1,20 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "oj" unless RUBY_PLATFORM == "java"
4
-
5
# Oj is only required off JRuby; the stdlib parser used on JRuby instead has
# to be loaded as well.
require "json" if RUBY_PLATFORM == "java"

module Wayfarer
  module Parsers
    # A wrapper module for parsing JSON.
    # @private
    module JSONParser
      module_function

      # Parses a JSON string with Oj, or with the stdlib JSON parser on JRuby.
      # @param [String] json_str the JSON string to parse.
      # @return [Object] the parsed document, as returned by the parser in use.
      def parse(json_str)
        RUBY_PLATFORM == "java" ? JSON.parse(json_str) : Oj.load(json_str)
      end
    end
  end
end
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "nokogiri"
4
-
5
module Wayfarer
  module Parsers
    # Thin wrappers around Nokogiri's XML and HTML entry points.
    # @private
    module XMLParser
      # Builds a document from an XML string.
      # @param [String] xml_str the XML string to parse.
      # @return [Nokogiri::XML::Document]
      def parse_xml(xml_str)
        Nokogiri::XML(xml_str)
      end

      # Builds a document from a HTML string.
      # @param [String] html_str the HTML string to parse.
      # @return [Nokogiri::HTML::Document]
      def parse_html(html_str)
        Nokogiri::HTML(html_str)
      end

      # Both parsers are callable on the module itself.
      module_function :parse_xml, :parse_html
    end
  end
end
@@ -1,103 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "pp"
4
- require "concurrent"
5
- require "observer"
6
-
7
module Wayfarer
  # Runs a job: works off the frontier cycle by cycle, dispatching the URIs
  # of each cycle from a pool of threads.
  class Processor
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    attr_reader :job

    delegate config: :job
    delegate logger: :config

    # @param job the job to run
    # @param frontier the frontier that supplies and receives URIs
    # @param dispatcher the dispatcher that URIs are handed to
    def initialize(job, frontier, dispatcher)
      @job = job
      @frontier = frontier
      @dispatcher = dispatcher
      @halted = Concurrent::AtomicBoolean.new(false)
    end

    # Whether the halt flag has been set.
    # @return [true, false]
    def halted?
      @halted.value
    end

    # Raises the halt flag.
    def halt!
      @halted.make_true
    end

    # Runs the job until a halt is requested or the frontier stops cycling.
    # The halt flag is set and the frontier and the adapter pool are freed
    # afterwards, whether or not an exception occurred.
    # Arguments are accepted but not used.
    def run(*_uris)
      notify_observers!(FirstCycle.new(@frontier))

      while @halted.false? && @frontier.cycle
        current_uris = @frontier.current_uris
        queue = Queue.new
        current_uris.each { |uri| queue.push(uri) }

        notify_observers!(NewCycle.new(current_uris.count))

        @threads = Array.new(config.connection_count) do
          Thread.new { work_off_queue(queue) }
        end
        @threads.each(&:join)

        notify_observers!(AboutToCycle.new(@frontier.staged_uris.count))
      end
    ensure
      halt!
      @frontier.free
      @dispatcher.adapter_pool.free
    end

    private

    # Body of a single worker thread: dispatches URIs from the queue until a
    # halt is requested or the queue runs empty.
    def work_off_queue(queue)
      loop do
        # Non-blocking pop: raises ThreadError once the queue is empty.
        uri = queue.pop(true)
        break if uri.nil? || @halted.true?

        handle_dispatch_result(@dispatcher.dispatch(@job, uri))
      end
    rescue ThreadError
      notify_observers!(CycleFinished.new)
    end

    # Routes a dispatch result to its handler; other results are ignored.
    def handle_dispatch_result(result)
      case result
      when Dispatcher::Mismatch
        handle_mismatch(result)
      when Dispatcher::Halt
        handle_halt(result)
      when Dispatcher::Stage
        handle_stage(result)
      when Dispatcher::Error
        handle_error(result)
      end
    end

    def handle_mismatch(mismatch)
      notify_observers!(MismatchedURI.new(mismatch.uri))
    end

    # Announces the halt before setting the flag.
    def handle_halt(halt)
      notify_observers!(HaltInitiated.new(halt.action, halt.uri))
      halt!
    end

    # Announces the staged URIs; they only reach the frontier if no halt has
    # been requested.
    def handle_stage(stage)
      notify_observers!(StagingURIs.new(stage.uris.count))
      return if halted?

      @frontier.stage(*stage.uris)
    end

    def handle_error(error)
      notify_observers!(UnhandledError.new(error.exception))
    end
  end
end
@@ -1,21 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "uri"
4
-
5
module Wayfarer
  module Routing
    # A rule that leaves the decision whether a URI matches to a callable.
    # @private
    class CustomRule < Rule
      # @param [#call, nil] delegate_or_block the callable deciding on
      #   matches; defaults to the given block.
      # @param [Hash] opts passed on to the superclass.
      # @raise [ArgumentError] if neither a callable nor a block is given.
      def initialize(delegate_or_block = nil, opts = {}, &proc)
        # The block used to be picked up through a `proc` default argument,
        # i.e. Kernel#proc called without a block, which raises ArgumentError
        # since Ruby 3.0. Falling back to the block parameter keeps that
        # behaviour on every Ruby version.
        @delegate_or_block = delegate_or_block || proc
        raise ArgumentError, "neither a delegate nor a block given" unless @delegate_or_block

        super(opts, &proc)
      end

      private

      # @return [true, false] the callable's verdict, coerced to a boolean.
      def match!(uri)
        !!@delegate_or_block.call(uri)
      end
    end
  end
end
@@ -1,20 +0,0 @@
1
- # frozen_string_literal: true
2
- require "uri"
3
-
4
module Wayfarer
  module Routing
    # A rule matching URIs whose path ends in a dot followed by one of the
    # given file types.
    # @private
    class FiletypesRule < Rule
      # @param types the file types; each is interpolated into a pattern.
      # @param [Hash] opts passed on to the superclass.
      def initialize(types, opts = {}, &proc)
        @types = types
        super(opts, &proc)
      end

      private

      # @return [true, false] whether any file type matches the URI's path.
      def match!(uri)
        @types.any? do |extension|
          pattern = /\.#{extension}$/
          uri.path =~ pattern
        end
      end
    end
  end
end
@@ -1,19 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Wayfarer
  module Routing
    # A rule matching URIs by their host.
    # @private
    class HostRule < Rule
      # @param str_or_regexp the object the host is case-compared against.
      # @param [Hash] opts passed on to the superclass.
      def initialize(str_or_regexp, opts = {}, &proc)
        @str_or_regexp = str_or_regexp
        super(opts, &proc)
      end

      # Case equality allows for both exact hosts and patterns.
      # rubocop:disable Style/CaseEquality
      def match!(uri)
        host = uri.host
        @str_or_regexp === host
      end
      # rubocop:enable Style/CaseEquality
    end
  end
end