wayfarer 0.0.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (339) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ci.yaml +32 -0
  3. data/.gitignore +3 -4
  4. data/.rubocop.yml +25 -9
  5. data/.ruby-version +1 -1
  6. data/Dockerfile +5 -0
  7. data/Gemfile +1 -7
  8. data/Gemfile.lock +212 -0
  9. data/RELEASING.md +17 -0
  10. data/Rakefile +38 -90
  11. data/bin/wayfarer +1 -111
  12. data/docker-compose.yml +32 -0
  13. data/docs/cookbook/querying_html.md +42 -0
  14. data/docs/cookbook/screenshots.md +27 -0
  15. data/docs/cookbook/user_agent.md +7 -0
  16. data/docs/guides/browser_automation/capybara.md +3 -0
  17. data/docs/guides/browser_automation/ferrum.md +37 -0
  18. data/docs/guides/browser_automation/selenium.md +59 -0
  19. data/docs/guides/callbacks.md +27 -34
  20. data/docs/guides/configuration.md +20 -171
  21. data/docs/guides/error_handling.md +18 -33
  22. data/docs/guides/jobs.md +75 -0
  23. data/docs/guides/networking.md +20 -0
  24. data/docs/guides/pages.md +52 -0
  25. data/docs/guides/performance.md +22 -0
  26. data/docs/guides/tasks.md +38 -0
  27. data/docs/index.md +34 -68
  28. data/docs/reference/api/base.md +162 -0
  29. data/docs/reference/api/route.md +182 -0
  30. data/docs/reference/cli.md +139 -0
  31. data/docs/reference/environment_variables.md +85 -0
  32. data/lib/wayfarer/base.rb +60 -0
  33. data/lib/wayfarer/cli/base.rb +23 -0
  34. data/lib/wayfarer/cli/generate.rb +17 -0
  35. data/lib/wayfarer/cli/job.rb +58 -0
  36. data/lib/wayfarer/cli/route.rb +27 -0
  37. data/lib/wayfarer/cli/route_printer.rb +116 -0
  38. data/lib/wayfarer/cli/runner.rb +34 -0
  39. data/lib/wayfarer/cli/templates/Gemfile.tt +5 -0
  40. data/lib/wayfarer/cli/templates/job.rb.tt +8 -0
  41. data/lib/wayfarer/config.rb +67 -0
  42. data/lib/wayfarer/gc.rb +19 -0
  43. data/lib/wayfarer/middleware/chain.rb +19 -0
  44. data/lib/wayfarer/middleware/dedup.rb +25 -0
  45. data/lib/wayfarer/middleware/fetch.rb +32 -0
  46. data/lib/wayfarer/middleware/normalize.rb +25 -0
  47. data/lib/wayfarer/middleware/router.rb +21 -0
  48. data/lib/wayfarer/middleware/stage.rb +23 -0
  49. data/lib/wayfarer/middleware/worker.rb +47 -0
  50. data/lib/wayfarer/networking/ferrum.rb +70 -0
  51. data/lib/wayfarer/networking/healer.rb +21 -0
  52. data/lib/wayfarer/networking/net_http.rb +52 -0
  53. data/lib/wayfarer/networking/pool.rb +34 -0
  54. data/lib/wayfarer/networking/result.rb +18 -0
  55. data/lib/wayfarer/networking/selenium.rb +70 -0
  56. data/lib/wayfarer/page.rb +15 -71
  57. data/lib/wayfarer/parsing/json.rb +17 -0
  58. data/lib/wayfarer/parsing/xml.rb +17 -0
  59. data/lib/wayfarer/redis/barrier.rb +36 -0
  60. data/lib/wayfarer/redis/connection.rb +13 -0
  61. data/lib/wayfarer/redis/counter.rb +29 -0
  62. data/lib/wayfarer/redis/pool.rb +18 -0
  63. data/lib/wayfarer/redis/version.rb +19 -0
  64. data/lib/wayfarer/routing/custom_matcher.rb +21 -0
  65. data/lib/wayfarer/routing/dsl.rb +57 -0
  66. data/lib/wayfarer/routing/host_matcher.rb +23 -0
  67. data/lib/wayfarer/routing/path_finder.rb +46 -0
  68. data/lib/wayfarer/routing/path_matcher.rb +46 -0
  69. data/lib/wayfarer/routing/{query_rule.rb → query_matcher.rb} +24 -16
  70. data/lib/wayfarer/routing/result.rb +15 -0
  71. data/lib/wayfarer/routing/root_route.rb +7 -0
  72. data/lib/wayfarer/routing/route.rb +41 -0
  73. data/lib/wayfarer/routing/scheme_matcher.rb +21 -0
  74. data/lib/wayfarer/routing/suffix_matcher.rb +21 -0
  75. data/lib/wayfarer/routing/target_route.rb +7 -0
  76. data/lib/wayfarer/routing/url_matcher.rb +21 -0
  77. data/lib/wayfarer/serializer.rb +17 -0
  78. data/lib/wayfarer/stringify.rb +41 -0
  79. data/lib/wayfarer/task.rb +34 -0
  80. data/lib/wayfarer.rb +47 -58
  81. data/mkdocs.yml +47 -0
  82. data/requirements.txt +1 -0
  83. data/spec/base_spec.rb +219 -0
  84. data/spec/cli/generate_spec.rb +39 -0
  85. data/spec/cli/job_spec.rb +74 -0
  86. data/spec/cli/version_spec.rb +13 -0
  87. data/spec/config_spec.rb +144 -0
  88. data/spec/factories/queue/chain.rb +11 -0
  89. data/spec/factories/queue/middleware.rb +15 -0
  90. data/spec/factories/queue/page.rb +78 -0
  91. data/spec/factories/queue/task.rb +12 -0
  92. data/spec/fixtures/dummy_job.rb +7 -0
  93. data/spec/gc_spec.rb +61 -0
  94. data/spec/middleware/chain_spec.rb +96 -0
  95. data/spec/middleware/dedup_spec.rb +76 -0
  96. data/spec/middleware/fetch_spec.rb +72 -0
  97. data/spec/middleware/normalize_spec.rb +28 -0
  98. data/spec/middleware/router_spec.rb +46 -0
  99. data/spec/middleware/stage_spec.rb +39 -0
  100. data/spec/middleware/worker_spec.rb +90 -0
  101. data/spec/networking/adapter.rb +135 -0
  102. data/spec/networking/ferrum_spec.rb +28 -0
  103. data/spec/networking/healer_spec.rb +46 -0
  104. data/spec/networking/net_http_spec.rb +37 -0
  105. data/spec/networking/pool_spec.rb +42 -0
  106. data/spec/networking/selenium_spec.rb +28 -0
  107. data/spec/page_spec.rb +21 -12
  108. data/spec/{parsers/json_parser_spec.rb → parsing/json_spec.rb} +5 -4
  109. data/spec/{parsers/xml_parser_spec.rb → parsing/xml_spec.rb} +3 -2
  110. data/spec/redis/barrier_spec.rb +78 -0
  111. data/spec/redis/counter_spec.rb +32 -0
  112. data/spec/redis/pool_spec.rb +18 -0
  113. data/spec/redis/version_spec.rb +13 -0
  114. data/spec/routing/custom_matcher_spec.rb +31 -0
  115. data/spec/routing/dsl_spec.rb +98 -0
  116. data/spec/routing/host_matcher_spec.rb +49 -0
  117. data/spec/routing/integration_spec.rb +110 -0
  118. data/spec/routing/path_finder_spec.rb +33 -0
  119. data/spec/routing/path_matcher_spec.rb +43 -0
  120. data/spec/routing/{query_rule_spec.rb → query_matcher_spec.rb} +39 -26
  121. data/spec/routing/root_route_spec.rb +29 -0
  122. data/spec/routing/route_spec.rb +43 -0
  123. data/spec/routing/scheme_matcher_spec.rb +25 -0
  124. data/spec/routing/{filetypes_rule_spec.rb → suffix_matcher_spec.rb} +14 -13
  125. data/spec/routing/uri_matcher_spec.rb +27 -0
  126. data/spec/spec_helpers.rb +65 -38
  127. data/spec/stringify_spec.rb +23 -0
  128. data/{support → spec/support}/static/finders.html +0 -0
  129. data/{support → spec/support}/static/graph/details/a.html +0 -0
  130. data/{support → spec/support}/static/graph/details/b.html +0 -0
  131. data/{support → spec/support}/static/graph/index.html +0 -0
  132. data/{support → spec/support}/static/json/dummy.json +0 -0
  133. data/{support → spec/support}/static/links/links.html +0 -0
  134. data/{support → spec/support}/static/xml/dummy.xml +0 -0
  135. data/{support → spec/support}/test_app.rb +9 -2
  136. data/spec/task_spec.rb +27 -0
  137. data/spec/wayfarer_spec.rb +2 -13
  138. data/wayfarer.gemspec +39 -42
  139. metadata +191 -368
  140. data/.travis.yml +0 -5
  141. data/Changelog.md +0 -10
  142. data/README.md +0 -21
  143. data/benchmark/frontiers.rb +0 -143
  144. data/docs/.gitignore +0 -2
  145. data/docs/_config.yml +0 -15
  146. data/docs/_includes/base.html +0 -7
  147. data/docs/_includes/head.html +0 -10
  148. data/docs/_includes/navigation.html +0 -187
  149. data/docs/_layouts/default.html +0 -42
  150. data/docs/_sass/base.scss +0 -439
  151. data/docs/_sass/variables.scss +0 -24
  152. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +0 -19
  153. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +0 -425
  154. data/docs/_sass/vendor/bourbon/_bourbon.scss +0 -90
  155. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +0 -29
  156. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +0 -48
  157. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +0 -28
  158. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +0 -28
  159. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +0 -69
  160. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +0 -25
  161. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +0 -30
  162. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +0 -31
  163. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +0 -27
  164. data/docs/_sass/vendor/bourbon/addons/_margin.scss +0 -29
  165. data/docs/_sass/vendor/bourbon/addons/_padding.scss +0 -29
  166. data/docs/_sass/vendor/bourbon/addons/_position.scss +0 -51
  167. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +0 -66
  168. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +0 -27
  169. data/docs/_sass/vendor/bourbon/addons/_size.scss +0 -56
  170. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +0 -118
  171. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +0 -34
  172. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +0 -63
  173. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +0 -29
  174. data/docs/_sass/vendor/bourbon/css3/_animation.scss +0 -61
  175. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +0 -5
  176. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +0 -5
  177. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +0 -44
  178. data/docs/_sass/vendor/bourbon/css3/_background.scss +0 -57
  179. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +0 -61
  180. data/docs/_sass/vendor/bourbon/css3/_calc.scss +0 -6
  181. data/docs/_sass/vendor/bourbon/css3/_columns.scss +0 -67
  182. data/docs/_sass/vendor/bourbon/css3/_filter.scss +0 -6
  183. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +0 -327
  184. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +0 -29
  185. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +0 -6
  186. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +0 -12
  187. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +0 -6
  188. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +0 -15
  189. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +0 -38
  190. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +0 -40
  191. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +0 -12
  192. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +0 -10
  193. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +0 -40
  194. data/docs/_sass/vendor/bourbon/css3/_selection.scss +0 -44
  195. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +0 -27
  196. data/docs/_sass/vendor/bourbon/css3/_transform.scss +0 -21
  197. data/docs/_sass/vendor/bourbon/css3/_transition.scss +0 -81
  198. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +0 -5
  199. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +0 -16
  200. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +0 -25
  201. data/docs/_sass/vendor/bourbon/functions/_contains.scss +0 -31
  202. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +0 -16
  203. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +0 -26
  204. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +0 -16
  205. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +0 -23
  206. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +0 -74
  207. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +0 -24
  208. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +0 -26
  209. data/docs/_sass/vendor/bourbon/functions/_shade.scss +0 -24
  210. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +0 -22
  211. data/docs/_sass/vendor/bourbon/functions/_tint.scss +0 -24
  212. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +0 -37
  213. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +0 -32
  214. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +0 -26
  215. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +0 -108
  216. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +0 -53
  217. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +0 -24
  218. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +0 -35
  219. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +0 -51
  220. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +0 -77
  221. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +0 -41
  222. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +0 -74
  223. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +0 -55
  224. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +0 -28
  225. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +0 -31
  226. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +0 -15
  227. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +0 -55
  228. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +0 -7
  229. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +0 -8
  230. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +0 -9
  231. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +0 -1
  232. data/docs/_sass/vendor/neat/_neat-helpers.scss +0 -11
  233. data/docs/_sass/vendor/neat/_neat.scss +0 -23
  234. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +0 -49
  235. data/docs/_sass/vendor/neat/functions/_private.scss +0 -114
  236. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +0 -15
  237. data/docs/_sass/vendor/neat/grid/_direction-context.scss +0 -33
  238. data/docs/_sass/vendor/neat/grid/_display-context.scss +0 -28
  239. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +0 -22
  240. data/docs/_sass/vendor/neat/grid/_media.scss +0 -92
  241. data/docs/_sass/vendor/neat/grid/_omega.scss +0 -87
  242. data/docs/_sass/vendor/neat/grid/_outer-container.scss +0 -34
  243. data/docs/_sass/vendor/neat/grid/_pad.scss +0 -25
  244. data/docs/_sass/vendor/neat/grid/_private.scss +0 -35
  245. data/docs/_sass/vendor/neat/grid/_row.scss +0 -52
  246. data/docs/_sass/vendor/neat/grid/_shift.scss +0 -50
  247. data/docs/_sass/vendor/neat/grid/_span-columns.scss +0 -94
  248. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +0 -97
  249. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +0 -42
  250. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +0 -25
  251. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +0 -13
  252. data/docs/_sass/vendor/neat/settings/_grid.scss +0 -51
  253. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +0 -27
  254. data/docs/_sass/vendor/normalize-3.0.2.scss +0 -427
  255. data/docs/_sass/vendor/pygments.scss +0 -356
  256. data/docs/automating_browsers/capybara.md +0 -70
  257. data/docs/css/screen.scss +0 -7
  258. data/docs/guides/cli.md +0 -52
  259. data/docs/guides/frontiers.md +0 -93
  260. data/docs/guides/halting.md +0 -23
  261. data/docs/guides/job_queues.md +0 -26
  262. data/docs/guides/locals.md +0 -36
  263. data/docs/guides/logging.md +0 -22
  264. data/docs/guides/page_objects.md +0 -67
  265. data/docs/guides/peeking.md +0 -46
  266. data/docs/guides/selenium_capybara.md +0 -100
  267. data/docs/guides/tutorial.md +0 -452
  268. data/docs/js/navigation.js +0 -11
  269. data/docs/misc/contributing.md +0 -20
  270. data/docs/misc/testing.md +0 -11
  271. data/docs/recipes/authentication.md +0 -23
  272. data/docs/recipes/csv.md +0 -29
  273. data/docs/recipes/javascript.md +0 -20
  274. data/docs/recipes/multiple_uris.md +0 -18
  275. data/docs/recipes/screenshots.md +0 -20
  276. data/docs/routing/custom_rules.md +0 -16
  277. data/docs/routing/filetypes_rules.md +0 -21
  278. data/docs/routing/host_rules.md +0 -24
  279. data/docs/routing/path_rules.md +0 -33
  280. data/docs/routing/protocol_rules.md +0 -17
  281. data/docs/routing/query_rules.md +0 -69
  282. data/docs/routing/routes.md +0 -96
  283. data/docs/routing/uri_rules.md +0 -18
  284. data/examples/collect_github_issues.rb +0 -65
  285. data/examples/find_foobar_on_wikipedia.rb +0 -23
  286. data/lib/wayfarer/configuration.rb +0 -86
  287. data/lib/wayfarer/crawl.rb +0 -79
  288. data/lib/wayfarer/crawl_observer.rb +0 -103
  289. data/lib/wayfarer/dispatcher.rb +0 -104
  290. data/lib/wayfarer/finders.rb +0 -61
  291. data/lib/wayfarer/frontiers/frontier.rb +0 -79
  292. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +0 -32
  293. data/lib/wayfarer/frontiers/memory_frontier.rb +0 -76
  294. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +0 -39
  295. data/lib/wayfarer/frontiers/normalize_uris.rb +0 -48
  296. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +0 -34
  297. data/lib/wayfarer/frontiers/redis_frontier.rb +0 -83
  298. data/lib/wayfarer/http_adapters/adapter_pool.rb +0 -62
  299. data/lib/wayfarer/http_adapters/net_http_adapter.rb +0 -77
  300. data/lib/wayfarer/http_adapters/selenium_adapter.rb +0 -80
  301. data/lib/wayfarer/job.rb +0 -211
  302. data/lib/wayfarer/locals.rb +0 -40
  303. data/lib/wayfarer/parsers/json_parser.rb +0 -20
  304. data/lib/wayfarer/parsers/xml_parser.rb +0 -27
  305. data/lib/wayfarer/processor.rb +0 -103
  306. data/lib/wayfarer/routing/custom_rule.rb +0 -21
  307. data/lib/wayfarer/routing/filetypes_rule.rb +0 -20
  308. data/lib/wayfarer/routing/host_rule.rb +0 -19
  309. data/lib/wayfarer/routing/path_rule.rb +0 -54
  310. data/lib/wayfarer/routing/protocol_rule.rb +0 -21
  311. data/lib/wayfarer/routing/router.rb +0 -71
  312. data/lib/wayfarer/routing/rule.rb +0 -114
  313. data/lib/wayfarer/routing/uri_rule.rb +0 -21
  314. data/spec/configuration_spec.rb +0 -26
  315. data/spec/crawl_spec.rb +0 -48
  316. data/spec/finders_spec.rb +0 -49
  317. data/spec/frontiers/memory_bloomfilter_spec.rb +0 -6
  318. data/spec/frontiers/memory_frontier_spec.rb +0 -6
  319. data/spec/frontiers/memory_trie_frontier_spec.rb +0 -6
  320. data/spec/frontiers/normalize_uris_spec.rb +0 -59
  321. data/spec/frontiers/redis_bloomfilter_spec.rb +0 -6
  322. data/spec/frontiers/redis_frontier_spec.rb +0 -6
  323. data/spec/http_adapters/adapter_pool_spec.rb +0 -33
  324. data/spec/http_adapters/net_http_adapter_spec.rb +0 -83
  325. data/spec/http_adapters/selenium_adapter_spec.rb +0 -53
  326. data/spec/integration/callbacks_spec.rb +0 -42
  327. data/spec/integration/locals_spec.rb +0 -106
  328. data/spec/integration/peeking_spec.rb +0 -61
  329. data/spec/job_spec.rb +0 -122
  330. data/spec/processor_spec.rb +0 -31
  331. data/spec/routing/custom_rule_spec.rb +0 -26
  332. data/spec/routing/host_rule_spec.rb +0 -48
  333. data/spec/routing/path_rule_spec.rb +0 -66
  334. data/spec/routing/protocol_rule_spec.rb +0 -26
  335. data/spec/routing/router_spec.rb +0 -67
  336. data/spec/routing/rule_spec.rb +0 -251
  337. data/spec/routing/uri_rule_spec.rb +0 -24
  338. data/spec/shared/frontier.rb +0 -96
  339. data/wayfarer-jruby.gemspec +0 -49
@@ -1,103 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Wayfarer
  # Turns crawl events into log lines.
  #
  # An instance registers itself (via `add_observer`) with every observable it
  # is constructed with and writes one log message for each event passed to
  # {#update}.
  class CrawlObserver
    # Event payloads understood by {CrawlObserver#update}.
    module Events
      FirstCycle     = Struct.new(:frontier)
      NewCycle       = Struct.new(:current_uris_count)
      DispatchedURI  = Struct.new(:action, :uri)
      CycleFinished  = Class.new
      Peeking        = Struct.new(:uri)
      AboutToCycle   = Struct.new(:staged_uris_count)
      MismatchedURI  = Struct.new(:uri)
      HaltInitiated  = Struct.new(:action, :uri)
      StagingURIs    = Struct.new(:staged_uris_count)
      UnhandledError = Struct.new(:exception)
    end

    # Convenience for observables: marks the object as changed and notifies
    # its observers in one call.
    module ObservableShortcuts
      def notify_observers!(*argv)
        changed
        notify_observers(*argv)
      end
    end

    extend Forwardable

    # @return [#debug, #info, #error, #fatal] the logger messages are sent to
    attr_reader :logger

    # @return [#reraise_exceptions, #print_stacktraces, nil] optional settings
    #   consulted when logging unhandled errors
    attr_reader :config

    # @param observables [Array<#add_observer>] objects to subscribe to
    # @param logger [#debug, #info, #error, #fatal]
    # @param config [#reraise_exceptions, #print_stacktraces, nil] when
    #   omitted, unhandled errors are logged at `:error` without a backtrace
    def initialize(*observables, logger, config: nil)
      @logger = logger
      @config = config
      observables.each { |obsv| obsv.add_observer(self) }
    end

    # Observer callback. Events of an unknown type are ignored.
    #
    # @param event [Object] one of the {Events} payloads
    def update(event)
      case event
      when Events::FirstCycle     then first_cycle(event)
      when Events::NewCycle       then new_cycle(event)
      when Events::DispatchedURI  then dispatched_uri(event)
      when Events::CycleFinished  then cycle_finished
      when Events::Peeking        then peeking(event)
      when Events::AboutToCycle   then about_to_cycle(event)
      when Events::MismatchedURI  then mismatched_uri(event)
      when Events::HaltInitiated  then halt_initiated(event)
      when Events::StagingURIs    then staging_uris(event)
      when Events::UnhandledError then unhandled_error(event)
      end
    end

    private

    def first_cycle(event)
      logger.info("First cycle")
      logger.info("Frontier: #{event.frontier}")
    end

    def new_cycle(event)
      logger.info("Current cycle contains #{event.current_uris_count} URI(s)")
    end

    def dispatched_uri(event)
      logger.info("Dispatched to \##{event.action}: #{event.uri}")
    end

    def cycle_finished
      logger.info("No URIs left in current cycle")
    end

    def peeking(event)
      logger.info("Peeking into: #{event.uri}")
    end

    def about_to_cycle(event)
      logger.info("About to cycle. #{event.staged_uris_count} staged URI(s)")
    end

    def mismatched_uri(event)
      logger.debug("No matching route for: #{event.uri}")
    end

    def halt_initiated(event)
      logger.info("Halt initiated from \##{event.action} at: #{event.uri}")
    end

    def staging_uris(event)
      logger.info("Staging #{event.staged_uris_count} URI(s)")
    end

    # Logs an exception raised inside an action. The level is `:fatal` when
    # the configuration asks for exceptions to be re-raised, `:error`
    # otherwise (including when no configuration was supplied).
    def unhandled_error(event)
      error = event.exception
      level = config&.reraise_exceptions ? :fatal : :error

      if config&.print_stacktraces
        # An exception that was never raised has no backtrace (nil).
        frames = Array(error.backtrace).map(&:to_s)

        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{error.class.inspect}
          #{frames.join("\n* ")}
        LOGGER
      else
        logger.public_send level, <<~LOGGER
          Unhandled exception in an action: #{error.class.inspect}
        LOGGER
      end
    end
  end
end
@@ -1,104 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "active_support/core_ext/hash/indifferent_access"
4
-
5
module Wayfarer
  # Creates job instances, retrieves pages and, if a URI matches a route, calls
  # methods on the instances.
  class Dispatcher
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    # Result types that a {Processor} operates with.
    Mismatch = Struct.new(:uri)
    Halt = Struct.new(:uri, :action)
    Stage = Struct.new(:uris, :ret_val)
    Error = Struct.new(:exception)

    # @!attribute [r] adapter_pool
    # @return [AdapterPool]
    attr_reader :adapter_pool

    # @!attribute [r] job
    attr_reader :job

    # @param job the job class whose configuration and adapters are used
    def initialize(job)
      @job = job
      @adapter_pool = HTTPAdapters::AdapterPool.new(job)
    end

    delegate config: :job
    delegate logger: :config

    # Dispatches this URI. Matches an URI against the rules of the job's router.
    # If a rule matches, the page is retrieved, and the action associated with
    # the route is called.
    #
    # The action receives a block that lets it "peek" into another URI: the
    # block dispatches that URI and yields back the peeked action's return
    # value. Peeking does not nest; while peeking the block returns nil.
    #
    # @param [Job] job
    # @param [URI] uri
    # @param [Boolean] is_peeking whether this call is itself a peek
    # @return [Mismatch] when no route matches
    # @return [Halt] when the job instance reports `halts?`
    # @return [Stage] otherwise, carrying the staged URIs and the action's
    #   return value
    # @return [Object] when one of the rescued network/URI errors occurs, the
    #   return value of the logger call
    def dispatch(job, uri, is_peeking: false)
      action, params = job.router.route(uri)
      return Mismatch.new(uri) unless action

      params = ActiveSupport::HashWithIndifferentAccess.new(params)

      notify_observers!(DispatchedURI.new(action, uri))

      job_instance = job.new
      result = nil

      adapter_pool.with do |adapter|
        job_instance.page = adapter.fetch(uri)
        job_instance.adapter = adapter
        job_instance.params = params

        result = job_instance.public_send(action) do |peek_uri|
          peek_into(job, peek_uri) unless is_peeking
        end
      end

      if job_instance.halts?
        Halt.new(uri, action)
      else
        Stage.new(job_instance.staged_uris, result)
      end
    # What follows are exceptions whose origin I don't care about at the moment
    # TODO: Better logging
    rescue Net::HTTP::Persistent::Error
      logger.warn("Net::HTTP::Persistent::Error @ #{uri}")
    rescue Errno::EHOSTUNREACH
      logger.warn("Host unreachable @ #{uri}")
    rescue Errno::ENETUNREACH
      logger.warn("No route to network present @ #{uri}")
    rescue Net::OpenTimeout, Net::ReadTimeout
      logger.warn("::Net timeout @ #{uri}")

    # SSL verification failed due to a missing certificate
    rescue OpenSSL::SSL::SSLError
      logger.warn("SSL verification failed @ #{uri}")

    # Ruby/zlib encountered a Z_DATA_ERROR.
    # Usually if a stream was prematurely freed.
    # Probably has to do with net-http-persistent?
    rescue Zlib::DataError
      logger.warn("Z_DATA_ERROR")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedURI, URI::InvalidURIError
      logger.info("Malformed URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedRedirectURI
      logger.info("Malformed redirect URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MaximumRedirectCountReached
      logger.info("Maximum redirect count reached @ #{uri}")
    end

    private

    # Dispatches `peek_uri` on behalf of a running action and returns the
    # peeked action's return value.
    #
    # Peeking is best-effort: any error (an unparsable URI, an exception
    # raised by the peeked action, ...) yields nil, as does a peek that does
    # not end in a {Stage} result.
    def peek_into(job, peek_uri)
      # Announce the URI being peeked into, not the page currently processed.
      notify_observers!(Peeking.new(peek_uri))

      peeked = dispatch(job, URI(peek_uri), is_peeking: true)
      peeked.ret_val if peeked.respond_to?(:ret_val)
    rescue StandardError
      nil
    end
  end
end
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Wayfarer
4
- module Finders
5
- # Returns the expanded `href` attribute URIs from all or targeted `<a>` tags.
6
- # @param [*Array<String>] filters CSS/XPath expressions.
7
- # @return [Array<URI>]
8
- def links(*filters)
9
- query("a", "href", *filters)
10
- end
11
-
12
- # Returns the expanded `href` attribute URIs from all or targeted `<link rel="stylesheet" ...>` tags.
13
- # @param [*Array<String>] filters CSS/XPath expressions.
14
- # @return [Array<URI>]
15
- def stylesheets(*filters)
16
- query("link[rel='stylesheet']", "href", *filters)
17
- end
18
-
19
- # Returns the expanded `src` attribute URIs from all or targeted `<script>` tags.
20
- # TODO: Tests
21
- # @param [*Array<String>] filters CSS/XPath expressions.
22
- # @return [Array<URI>]
23
- def javascripts(*filters)
24
- query("script", "src", *filters)
25
- end
26
-
27
- alias scripts javascripts
28
-
29
- # Returns the expanded `src` attribute URIs from all or targeted `<img>` tags.
30
- # TODO: Tests
31
- # @param [*Array<String>] filters CSS/XPath expressions.
32
- # @return [Array<URI>]
33
- def images(*filters)
34
- query("img", "src", *filters)
35
- end
36
-
37
- private
38
-
39
- # TODO: Lord have mercy
40
- def query(selector, attr, *filters)
41
- nodes = if filters.any?
42
- doc.search(*filters).css(selector)
43
- else
44
- doc.css(selector)
45
- end
46
-
47
- links = nodes.map { |node|
48
- begin
49
- URI.join(uri, node.attr(attr))
50
- rescue
51
- nil
52
- end
53
- }
54
-
55
- links
56
- .find_all { |uri| uri.is_a?(URI) }
57
- .uniq
58
- .map(&:to_s)
59
- end
60
- end
61
- end
@@ -1,79 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Wayfarer
  module Frontiers
    # @abstract Base class defining the interface every frontier implements.
    # @api private
    class Frontier
      attr_reader :config

      # @param config configuration object; `allow_circulation` is read by {#cycle}
      def initialize(config)
        @config = config
      end

      # URIs to be scraped in the current cycle.
      # @note Usually an expensive operation!
      # @return [Array<URI>]
      def current_uris
        raise "Unimplemented"
      end

      # URIs staged for the next cycle.
      # @return [Array<URI>]
      def staged_uris
        raise "Unimplemented"
      end

      # Stages URIs for processing in the next cycle.
      # @param [*Array<URI>, *Array<String>] uris
      def stage(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is staged.
      def staged?(_uri)
        raise "Unimplemented"
      end

      # Caches URIs so they don't get processed again.
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is cached.
      def cached?(_uri)
        raise "Unimplemented"
      end

      # Frees resources. A no-op unless overridden.
      def free; end

      # Advances to the next cycle.
      #
      # Unless the configuration allows circulation, the current URIs are
      # cached first and already-cached URIs are filtered out of the staged
      # ones. If nothing remains staged, nothing is swapped.
      #
      # @return [Boolean] false when there were no staged URIs, true after the
      #   staged URIs have been swapped in and the staging area reset
      def cycle
        circulation_allowed = config.allow_circulation

        unless circulation_allowed
          cache(*current_uris) # TODO: Make it a template method
          filter_staged_uris!
        end

        if staged_uris.none?
          false
        else
          swap!
          reset_staged_uris!
          true
        end
      end

      protected

      # Hook called by {#cycle} after caching the current URIs. No-op here.
      def filter_staged_uris!; end

      # Hook called by {#cycle} to turn staged URIs into current ones. No-op here.
      def swap!; end

      # Hook called by {#cycle} after {#swap!}. No-op here.
      def reset_staged_uris!; end
    end
  end
end
@@ -1,32 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "bloomfilter-rb"
4
-
5
module Wayfarer
  module Frontiers
    # A memory frontier that tracks cached URIs in a `BloomFilter::Native`
    # instead of the parent's set.
    # @api private
    class MemoryBloomfilter < MemoryFrontier
      # @param config configuration; `bloomfilter_opts` is handed to the filter
      def initialize(config)
        @filter = BloomFilter::Native.new(config.bloomfilter_opts)
        super(config)
      end

      # Inserts every given URI into the filter.
      # @override
      def cache(*uris)
        uris.each do |entry|
          @filter.insert(entry)
        end
      end

      # Asks the filter whether it contains the URI.
      # @override
      def cached?(uri)
        @filter.include?(uri)
      end

      # Clears the filter, then lets the parent release its own state.
      def free
        @filter.clear
        super
      end
    end
  end
end
@@ -1,76 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "set"
4
# Record the platform Ruby reports so the code below can branch on JRuby
# (where RUBY_PLATFORM is "java"), and load the `parallel` gem only elsewhere.
# This used to be written with `=` instead of `==`, which forced
# JAVA_PLATFORM to "java" on every platform and never loaded `parallel`.
JAVA_PLATFORM = RUBY_PLATFORM
require "parallel" unless JAVA_PLATFORM == "java"
5
-
6
module Wayfarer
  module Frontiers
    # A naive frontier that keeps current, staged and cached URIs in
    # in-memory sets.
    # @api private
    class MemoryFrontier < Frontier
      def initialize(config)
        @current_uris = Set.new
        @staged_uris = Set.new
        @cached_uris = Set.new
        super(config)
      end

      # Converts every current entry with `URI()`; uses `Parallel.map` unless
      # JAVA_PLATFORM is "java".
      # @override
      def current_uris
        return @current_uris.map { |entry| URI(entry) } if JAVA_PLATFORM == "java"

        Parallel.map(@current_uris) { |entry| URI(entry) }
      end

      # @override
      def staged_uris
        # Entries are assumed to be URIs already, so they are not converted.
        @staged_uris.to_a
      end

      # Adds the given URIs, as passed, to the staged set.
      # @override
      def stage(*uris)
        @staged_uris |= uris
      end

      # Looks the URI up by its string form.
      # @override
      def staged?(uri)
        @staged_uris.include?(uri.to_s)
      end

      # Adds the string form of every given URI to the cached set; uses
      # `Parallel.map` for the conversion unless JAVA_PLATFORM is "java".
      # @override
      def cache(*uris)
        @cached_uris |= (JAVA_PLATFORM == "java" ? uris.map(&:to_s) : Parallel.map(uris, &:to_s))
      end

      # Looks the URI up by its string form.
      # @override
      def cached?(uri)
        @cached_uris.include?(uri.to_s)
      end

      # Drops all three sets.
      # @override
      def free
        @cached_uris = nil
        @staged_uris = nil
        @current_uris = nil
      end

      private

      # Starts over with an empty staged set.
      def reset_staged_uris!
        @staged_uris = Set.new
      end

      # Makes the staged set the current one.
      def swap!
        @current_uris = @staged_uris
      end

      # Removes staged entries that are already cached.
      def filter_staged_uris!
        @staged_uris.delete_if { |entry| cached?(entry) }
      end
    end
  end
end
@@ -1,39 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "trie"
4
-
5
module Wayfarer
  module Frontiers
    # A memory frontier that records cached URIs in an in-memory trie.
    # @api private
    class MemoryTrieFrontier < MemoryFrontier
      def initialize(config)
        @trie = Trie.new
        super(config)
      end

      # Adds the string form of every given URI to the trie.
      # @override
      def cache(*uris)
        uris.each do |entry|
          @trie.add(entry.to_s)
        end
      end

      # NOTE(review): `@str_or_regexp` is never assigned in this class, so
      # this compares nil against the host. It looks like a leftover from a
      # host-matching rule; confirm nothing calls it before removing.
      def match!(uri)
        @str_or_regexp === uri.host
      end

      # Whether the trie holds the URI's string form.
      def cached?(uri)
        key = uri.to_s
        # `#has_key?` is the trie's method name; keep RuboCop from rewriting it.
        @trie.has_key?(key) # rubocop:disable Style/PreferredHashMethods
      end

      # Drops the trie, then lets the parent release its own state.
      # @override
      def free
        @trie = nil
        super
      end
    end
  end
end
@@ -1,48 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "normalize_url"
4
-
5
module Wayfarer
  module Frontiers
    # Normalizes URIs before they reach a frontier.
    #
    # Every method normalizes its argument(s) and then calls `super`, so the
    # module has to sit in front of a frontier implementation in the method
    # lookup chain (e.g. via `prepend` or `extend`). The host object must
    # expose `config`, whose `normalize_uri_options` is passed to
    # `NormalizeUrl.process`.
    #
    # @api private
    module NormalizeURIs
      # @override
      def stage(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # @override
      def staged?(uri)
        super(normalize(uri))
      end

      # @override
      def cache(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # @override
      def cached?(uri)
        super(normalize(uri))
      end

      # Public so that explicit `to_s` calls work, not only interpolation.
      # @return [String] the wrapped frontier's description, prefixed
      def to_s
        "URI-normalizing #{super}"
      end

      private

      def normalize(uri)
        NormalizeUrl.process(uri, config.normalize_uri_options)
      end

      # No `method_missing` / `respond_to_missing?` overrides: the module
      # wraps nothing of its own, and the former `respond_to_missing?`
      # consulted an instance variable that is never set, which made
      # `respond_to?` answer true for every method nil responds to.
    end
  end
end
@@ -1,34 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "bloomfilter-rb"
4
-
5
module Wayfarer
  module Frontiers
    # A bloomfilter frontier whose filter is stored in Redis.
    #
    # `cache` and `cached?` are inherited from {MemoryBloomfilter}; they only
    # talk to `@filter`, which this class points at a Redis-backed filter.
    #
    # @api private
    class RedisBloomfilter < MemoryBloomfilter
      # @param config configuration; `redis_opts` is passed to `Redis.new`
      #   and `bloomfilter_opts` (plus the connection) to `BloomFilter::Redis`
      def initialize(config)
        # The parent assigns its own native filter to @filter, so it must run
        # first; otherwise it would replace the Redis-backed filter below.
        super(config)

        @conn = Redis.new(config.redis_opts)
        @filter = BloomFilter::Redis.new(config.bloomfilter_opts.merge(db: @conn))
      end

      # Clears the filter and the inherited in-memory state, then closes the
      # Redis connection. The connection is closed even if clearing fails.
      # @override
      def free
        # The parent clears @filter, which needs the connection to be open.
        super
      ensure
        @conn.disconnect!
      end
    end
  end
end
@@ -1,83 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "redis"
4
-
5
module Wayfarer
  module Frontiers
    # A frontier that keeps current, staged and cached URIs in Redis sets.
    # Key names are prefixed with the configuration's `uuid`.
    # @api private
    class RedisFrontier < Frontier
      # @param config configuration; `redis_opts` is passed to `Redis.new`
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        super(config)
      end

      # @override
      def current_uris
        members = @conn.smembers(current_uris_key)
        members.map { |member| URI(member) }
      end

      # @override
      def staged_uris
        members = @conn.smembers(staged_uris_key)
        members.map { |member| URI(member) }
      end

      # Adds the string forms of the given URIs to the staged set; does
      # nothing when called without URIs.
      # @override
      def stage(*uris)
        return unless uris.any?

        @conn.sadd(staged_uris_key, uris.map(&:to_s))
      end

      # @override
      def staged?(uri)
        @conn.sismember(staged_uris_key, uri.to_s)
      end

      # Adds the string forms of the given URIs to the cached set; does
      # nothing when called without URIs.
      # @override
      def cache(*uris)
        return unless uris.any?

        @conn.sadd(cached_uris_key, uris.map(&:to_s))
      end

      # @override
      def cached?(uri)
        @conn.sismember(cached_uris_key, uri.to_s)
      end

      # Deletes all three sets, then disconnects.
      # @override
      def free
        [current_uris_key, staged_uris_key, cached_uris_key].each { |key| @conn.del(key) }

        @conn.disconnect!
      end

      private

      # Deletes the staged set.
      def reset_staged_uris!
        @conn.del(staged_uris_key)
      end

      # Renames the staged set to the current set, i.e. the Redis equivalent
      # of `@current_uris = @staged_uris`.
      # @override
      def swap!
        @conn.rename(staged_uris_key, current_uris_key)
      end

      # Replaces the staged set with (staged - cached).
      def filter_staged_uris!
        @conn.sdiffstore(staged_uris_key, staged_uris_key, cached_uris_key)
      end

      def current_uris_key
        "#{@config.uuid}_current_uris"
      end

      def staged_uris_key
        "#{@config.uuid}_staged_uris"
      end

      def cached_uris_key
        "#{@config.uuid}_cached_uris"
      end
    end
  end
end