wayfarer 0.0.3 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (369) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/ci.yaml +32 -0
  3. data/.gitignore +3 -4
  4. data/.rubocop.yml +25 -9
  5. data/.ruby-version +1 -1
  6. data/Dockerfile +5 -0
  7. data/Gemfile +1 -7
  8. data/Gemfile.lock +221 -0
  9. data/RELEASING.md +17 -0
  10. data/Rakefile +38 -90
  11. data/bin/wayfarer +1 -111
  12. data/docker-compose.yml +32 -0
  13. data/docs/cookbook/batch_routing.md +22 -0
  14. data/docs/cookbook/consent_screen.md +36 -0
  15. data/docs/cookbook/executing_javascript.md +41 -0
  16. data/docs/cookbook/querying_html.md +42 -0
  17. data/docs/cookbook/screenshots.md +27 -0
  18. data/docs/cookbook/user_agent.md +7 -0
  19. data/docs/guides/browser_automation/capybara.md +69 -0
  20. data/docs/guides/browser_automation/custom_adapters.md +100 -0
  21. data/docs/guides/browser_automation/ferrum.md +39 -0
  22. data/docs/guides/browser_automation/selenium.md +63 -0
  23. data/docs/guides/callbacks.md +131 -31
  24. data/docs/guides/configuration.md +24 -169
  25. data/docs/guides/debugging.md +17 -0
  26. data/docs/guides/error_handling.md +30 -45
  27. data/docs/guides/jobs.md +101 -0
  28. data/docs/guides/navigation.md +73 -0
  29. data/docs/guides/networking.md +94 -0
  30. data/docs/guides/pages.md +52 -0
  31. data/docs/guides/performance.md +130 -0
  32. data/docs/guides/reliability.md +41 -0
  33. data/docs/guides/routing/steering.md +30 -0
  34. data/docs/guides/tasks.md +14 -0
  35. data/docs/index.md +40 -66
  36. data/docs/reference/api/base.md +48 -0
  37. data/docs/reference/api/route.md +182 -0
  38. data/docs/reference/cli.md +61 -0
  39. data/docs/reference/configuration_keys.md +42 -0
  40. data/docs/reference/environment_variables.md +83 -0
  41. data/lib/wayfarer/base.rb +50 -0
  42. data/lib/wayfarer/callbacks.rb +71 -0
  43. data/lib/wayfarer/cli/base.rb +27 -0
  44. data/lib/wayfarer/cli/generate.rb +17 -0
  45. data/lib/wayfarer/cli/job.rb +60 -0
  46. data/lib/wayfarer/cli/route.rb +29 -0
  47. data/lib/wayfarer/cli/route_printer.rb +116 -0
  48. data/lib/wayfarer/cli/runner.rb +34 -0
  49. data/lib/wayfarer/cli/templates/Gemfile.tt +5 -0
  50. data/lib/wayfarer/cli/templates/job.rb.tt +10 -0
  51. data/lib/wayfarer/config/capybara.rb +10 -0
  52. data/lib/wayfarer/config/ferrum.rb +11 -0
  53. data/lib/wayfarer/config/networking.rb +26 -0
  54. data/lib/wayfarer/config/redis.rb +14 -0
  55. data/lib/wayfarer/config/root.rb +11 -0
  56. data/lib/wayfarer/config/selenium.rb +21 -0
  57. data/lib/wayfarer/config/strconv.rb +45 -0
  58. data/lib/wayfarer/config/struct.rb +72 -0
  59. data/lib/wayfarer/gc.rb +15 -0
  60. data/lib/wayfarer/middleware/chain.rb +19 -0
  61. data/lib/wayfarer/middleware/dedup.rb +25 -0
  62. data/lib/wayfarer/middleware/fetch.rb +47 -0
  63. data/lib/wayfarer/middleware/normalize.rb +25 -0
  64. data/lib/wayfarer/middleware/router.rb +53 -0
  65. data/lib/wayfarer/middleware/stage.rb +23 -0
  66. data/lib/wayfarer/middleware/worker.rb +30 -0
  67. data/lib/wayfarer/networking/capybara.rb +28 -0
  68. data/lib/wayfarer/networking/context.rb +36 -0
  69. data/lib/wayfarer/networking/ferrum.rb +35 -0
  70. data/lib/wayfarer/networking/http.rb +34 -0
  71. data/lib/wayfarer/networking/pool.rb +40 -0
  72. data/lib/wayfarer/networking/result.rb +18 -0
  73. data/lib/wayfarer/networking/selenium.rb +43 -0
  74. data/lib/wayfarer/networking/strategy.rb +38 -0
  75. data/lib/wayfarer/page.rb +17 -74
  76. data/lib/wayfarer/parsing/json.rb +17 -0
  77. data/lib/wayfarer/parsing/xml.rb +17 -0
  78. data/lib/wayfarer/redis/.#barrier.rb +1 -0
  79. data/lib/wayfarer/redis/barrier.rb +36 -0
  80. data/lib/wayfarer/redis/connection.rb +13 -0
  81. data/lib/wayfarer/redis/counter.rb +29 -0
  82. data/lib/wayfarer/redis/pool.rb +20 -0
  83. data/lib/wayfarer/redis/version.rb +19 -0
  84. data/lib/wayfarer/routing/dsl.rb +57 -0
  85. data/lib/wayfarer/routing/matchers/custom.rb +25 -0
  86. data/lib/wayfarer/routing/matchers/host.rb +19 -0
  87. data/lib/wayfarer/routing/matchers/path.rb +49 -0
  88. data/lib/wayfarer/routing/matchers/query.rb +63 -0
  89. data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
  90. data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
  91. data/lib/wayfarer/routing/matchers/url.rb +17 -0
  92. data/lib/wayfarer/routing/path_finder.rb +46 -0
  93. data/lib/wayfarer/routing/result.rb +15 -0
  94. data/lib/wayfarer/routing/root_route.rb +7 -0
  95. data/lib/wayfarer/routing/route.rb +47 -0
  96. data/lib/wayfarer/routing/router.rb +10 -54
  97. data/lib/wayfarer/routing/target_route.rb +7 -0
  98. data/lib/wayfarer/serializer.rb +17 -0
  99. data/lib/wayfarer/stringify.rb +47 -0
  100. data/lib/wayfarer/task.rb +34 -0
  101. data/lib/wayfarer.rb +48 -57
  102. data/mkdocs.yml +47 -0
  103. data/requirements.txt +1 -0
  104. data/spec/base_spec.rb +233 -0
  105. data/spec/callbacks_spec.rb +102 -0
  106. data/spec/cli/generate_spec.rb +39 -0
  107. data/spec/cli/job_spec.rb +74 -0
  108. data/spec/cli/version_spec.rb +13 -0
  109. data/spec/config/capybara_spec.rb +18 -0
  110. data/spec/config/ferrum_spec.rb +24 -0
  111. data/spec/config/networking_spec.rb +73 -0
  112. data/spec/config/redis_spec.rb +32 -0
  113. data/spec/config/root_spec.rb +31 -0
  114. data/spec/config/selenium_spec.rb +56 -0
  115. data/spec/config/strconv_spec.rb +58 -0
  116. data/spec/config/struct_spec.rb +66 -0
  117. data/spec/factories/middleware.rb +15 -0
  118. data/spec/factories/page.rb +78 -0
  119. data/spec/factories/task.rb +12 -0
  120. data/spec/fixtures/dummy_job.rb +7 -0
  121. data/spec/gc_spec.rb +63 -0
  122. data/spec/middleware/chain_spec.rb +96 -0
  123. data/spec/middleware/dedup_spec.rb +76 -0
  124. data/spec/middleware/fetch_spec.rb +100 -0
  125. data/spec/middleware/normalize_spec.rb +28 -0
  126. data/spec/middleware/router_spec.rb +80 -0
  127. data/spec/middleware/stage_spec.rb +39 -0
  128. data/spec/middleware/worker_spec.rb +117 -0
  129. data/spec/networking/capybara_spec.rb +12 -0
  130. data/spec/networking/context_spec.rb +127 -0
  131. data/spec/networking/ferrum_spec.rb +12 -0
  132. data/spec/networking/http_spec.rb +12 -0
  133. data/spec/networking/pool_spec.rb +67 -0
  134. data/spec/networking/selenium_spec.rb +12 -0
  135. data/spec/networking/strategy.rb +170 -0
  136. data/spec/page_spec.rb +21 -12
  137. data/spec/{parsers/json_parser_spec.rb → parsing/json_spec.rb} +5 -4
  138. data/spec/{parsers/xml_parser_spec.rb → parsing/xml_spec.rb} +3 -2
  139. data/spec/redis/barrier_spec.rb +78 -0
  140. data/spec/redis/counter_spec.rb +32 -0
  141. data/spec/redis/pool_spec.rb +18 -0
  142. data/spec/redis/version_spec.rb +13 -0
  143. data/spec/routing/dsl_spec.rb +98 -0
  144. data/spec/routing/integration_spec.rb +110 -0
  145. data/spec/routing/matchers/custom_spec.rb +31 -0
  146. data/spec/routing/matchers/host_spec.rb +49 -0
  147. data/spec/routing/matchers/path_spec.rb +43 -0
  148. data/spec/routing/matchers/query_spec.rb +137 -0
  149. data/spec/routing/matchers/scheme_spec.rb +25 -0
  150. data/spec/routing/{filetypes_rule_spec.rb → matchers/suffix_spec.rb} +14 -13
  151. data/spec/routing/matchers/uri_spec.rb +27 -0
  152. data/spec/routing/path_finder_spec.rb +33 -0
  153. data/spec/routing/root_route_spec.rb +29 -0
  154. data/spec/routing/route_spec.rb +43 -0
  155. data/spec/routing/router_spec.rb +13 -56
  156. data/spec/spec_helpers.rb +73 -38
  157. data/spec/stringify_spec.rb +23 -0
  158. data/{support → spec/support}/static/finders.html +0 -0
  159. data/{support → spec/support}/static/graph/details/a.html +0 -0
  160. data/{support → spec/support}/static/graph/details/b.html +0 -0
  161. data/{support → spec/support}/static/graph/index.html +0 -0
  162. data/{support → spec/support}/static/json/dummy.json +0 -0
  163. data/{support → spec/support}/static/links/links.html +0 -0
  164. data/{support → spec/support}/static/xml/dummy.xml +0 -0
  165. data/{support → spec/support}/test_app.rb +9 -2
  166. data/spec/task_spec.rb +27 -0
  167. data/spec/wayfarer_spec.rb +2 -13
  168. data/wayfarer.gemspec +40 -42
  169. metadata +234 -361
  170. data/.travis.yml +0 -5
  171. data/Changelog.md +0 -10
  172. data/README.md +0 -21
  173. data/benchmark/frontiers.rb +0 -143
  174. data/docs/.gitignore +0 -2
  175. data/docs/_config.yml +0 -15
  176. data/docs/_includes/base.html +0 -7
  177. data/docs/_includes/head.html +0 -10
  178. data/docs/_includes/navigation.html +0 -187
  179. data/docs/_layouts/default.html +0 -42
  180. data/docs/_sass/base.scss +0 -439
  181. data/docs/_sass/variables.scss +0 -24
  182. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +0 -19
  183. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +0 -425
  184. data/docs/_sass/vendor/bourbon/_bourbon.scss +0 -90
  185. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +0 -29
  186. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +0 -48
  187. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +0 -28
  188. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +0 -28
  189. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +0 -69
  190. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +0 -25
  191. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +0 -30
  192. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +0 -31
  193. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +0 -27
  194. data/docs/_sass/vendor/bourbon/addons/_margin.scss +0 -29
  195. data/docs/_sass/vendor/bourbon/addons/_padding.scss +0 -29
  196. data/docs/_sass/vendor/bourbon/addons/_position.scss +0 -51
  197. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +0 -66
  198. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +0 -27
  199. data/docs/_sass/vendor/bourbon/addons/_size.scss +0 -56
  200. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +0 -118
  201. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +0 -34
  202. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +0 -63
  203. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +0 -29
  204. data/docs/_sass/vendor/bourbon/css3/_animation.scss +0 -61
  205. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +0 -5
  206. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +0 -5
  207. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +0 -44
  208. data/docs/_sass/vendor/bourbon/css3/_background.scss +0 -57
  209. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +0 -61
  210. data/docs/_sass/vendor/bourbon/css3/_calc.scss +0 -6
  211. data/docs/_sass/vendor/bourbon/css3/_columns.scss +0 -67
  212. data/docs/_sass/vendor/bourbon/css3/_filter.scss +0 -6
  213. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +0 -327
  214. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +0 -29
  215. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +0 -6
  216. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +0 -12
  217. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +0 -6
  218. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +0 -15
  219. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +0 -38
  220. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +0 -40
  221. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +0 -12
  222. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +0 -10
  223. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +0 -40
  224. data/docs/_sass/vendor/bourbon/css3/_selection.scss +0 -44
  225. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +0 -27
  226. data/docs/_sass/vendor/bourbon/css3/_transform.scss +0 -21
  227. data/docs/_sass/vendor/bourbon/css3/_transition.scss +0 -81
  228. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +0 -5
  229. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +0 -16
  230. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +0 -25
  231. data/docs/_sass/vendor/bourbon/functions/_contains.scss +0 -31
  232. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +0 -16
  233. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +0 -26
  234. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +0 -16
  235. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +0 -23
  236. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +0 -74
  237. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +0 -24
  238. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +0 -26
  239. data/docs/_sass/vendor/bourbon/functions/_shade.scss +0 -24
  240. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +0 -22
  241. data/docs/_sass/vendor/bourbon/functions/_tint.scss +0 -24
  242. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +0 -37
  243. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +0 -32
  244. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +0 -26
  245. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +0 -108
  246. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +0 -53
  247. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +0 -24
  248. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +0 -35
  249. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +0 -51
  250. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +0 -77
  251. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +0 -41
  252. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +0 -74
  253. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +0 -55
  254. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +0 -28
  255. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +0 -31
  256. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +0 -15
  257. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +0 -55
  258. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +0 -7
  259. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +0 -8
  260. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +0 -9
  261. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +0 -1
  262. data/docs/_sass/vendor/neat/_neat-helpers.scss +0 -11
  263. data/docs/_sass/vendor/neat/_neat.scss +0 -23
  264. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +0 -49
  265. data/docs/_sass/vendor/neat/functions/_private.scss +0 -114
  266. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +0 -15
  267. data/docs/_sass/vendor/neat/grid/_direction-context.scss +0 -33
  268. data/docs/_sass/vendor/neat/grid/_display-context.scss +0 -28
  269. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +0 -22
  270. data/docs/_sass/vendor/neat/grid/_media.scss +0 -92
  271. data/docs/_sass/vendor/neat/grid/_omega.scss +0 -87
  272. data/docs/_sass/vendor/neat/grid/_outer-container.scss +0 -34
  273. data/docs/_sass/vendor/neat/grid/_pad.scss +0 -25
  274. data/docs/_sass/vendor/neat/grid/_private.scss +0 -35
  275. data/docs/_sass/vendor/neat/grid/_row.scss +0 -52
  276. data/docs/_sass/vendor/neat/grid/_shift.scss +0 -50
  277. data/docs/_sass/vendor/neat/grid/_span-columns.scss +0 -94
  278. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +0 -97
  279. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +0 -42
  280. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +0 -25
  281. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +0 -13
  282. data/docs/_sass/vendor/neat/settings/_grid.scss +0 -51
  283. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +0 -27
  284. data/docs/_sass/vendor/normalize-3.0.2.scss +0 -427
  285. data/docs/_sass/vendor/pygments.scss +0 -356
  286. data/docs/automating_browsers/capybara.md +0 -70
  287. data/docs/css/screen.scss +0 -7
  288. data/docs/guides/cli.md +0 -52
  289. data/docs/guides/frontiers.md +0 -93
  290. data/docs/guides/halting.md +0 -23
  291. data/docs/guides/job_queues.md +0 -26
  292. data/docs/guides/locals.md +0 -36
  293. data/docs/guides/logging.md +0 -22
  294. data/docs/guides/page_objects.md +0 -67
  295. data/docs/guides/peeking.md +0 -46
  296. data/docs/guides/selenium_capybara.md +0 -100
  297. data/docs/guides/tutorial.md +0 -452
  298. data/docs/js/navigation.js +0 -11
  299. data/docs/misc/contributing.md +0 -20
  300. data/docs/misc/testing.md +0 -11
  301. data/docs/recipes/authentication.md +0 -23
  302. data/docs/recipes/csv.md +0 -29
  303. data/docs/recipes/javascript.md +0 -20
  304. data/docs/recipes/multiple_uris.md +0 -18
  305. data/docs/recipes/screenshots.md +0 -20
  306. data/docs/routing/custom_rules.md +0 -16
  307. data/docs/routing/filetypes_rules.md +0 -21
  308. data/docs/routing/host_rules.md +0 -24
  309. data/docs/routing/path_rules.md +0 -33
  310. data/docs/routing/protocol_rules.md +0 -17
  311. data/docs/routing/query_rules.md +0 -69
  312. data/docs/routing/routes.md +0 -96
  313. data/docs/routing/uri_rules.md +0 -18
  314. data/examples/collect_github_issues.rb +0 -65
  315. data/examples/find_foobar_on_wikipedia.rb +0 -23
  316. data/lib/wayfarer/configuration.rb +0 -86
  317. data/lib/wayfarer/crawl.rb +0 -79
  318. data/lib/wayfarer/crawl_observer.rb +0 -103
  319. data/lib/wayfarer/dispatcher.rb +0 -104
  320. data/lib/wayfarer/finders.rb +0 -61
  321. data/lib/wayfarer/frontiers/frontier.rb +0 -79
  322. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +0 -32
  323. data/lib/wayfarer/frontiers/memory_frontier.rb +0 -76
  324. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +0 -39
  325. data/lib/wayfarer/frontiers/normalize_uris.rb +0 -48
  326. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +0 -34
  327. data/lib/wayfarer/frontiers/redis_frontier.rb +0 -83
  328. data/lib/wayfarer/http_adapters/adapter_pool.rb +0 -62
  329. data/lib/wayfarer/http_adapters/net_http_adapter.rb +0 -77
  330. data/lib/wayfarer/http_adapters/selenium_adapter.rb +0 -80
  331. data/lib/wayfarer/job.rb +0 -211
  332. data/lib/wayfarer/locals.rb +0 -40
  333. data/lib/wayfarer/parsers/json_parser.rb +0 -20
  334. data/lib/wayfarer/parsers/xml_parser.rb +0 -27
  335. data/lib/wayfarer/processor.rb +0 -103
  336. data/lib/wayfarer/routing/custom_rule.rb +0 -21
  337. data/lib/wayfarer/routing/filetypes_rule.rb +0 -20
  338. data/lib/wayfarer/routing/host_rule.rb +0 -19
  339. data/lib/wayfarer/routing/path_rule.rb +0 -54
  340. data/lib/wayfarer/routing/protocol_rule.rb +0 -21
  341. data/lib/wayfarer/routing/query_rule.rb +0 -59
  342. data/lib/wayfarer/routing/rule.rb +0 -114
  343. data/lib/wayfarer/routing/uri_rule.rb +0 -21
  344. data/spec/configuration_spec.rb +0 -26
  345. data/spec/crawl_spec.rb +0 -48
  346. data/spec/finders_spec.rb +0 -49
  347. data/spec/frontiers/memory_bloomfilter_spec.rb +0 -6
  348. data/spec/frontiers/memory_frontier_spec.rb +0 -6
  349. data/spec/frontiers/memory_trie_frontier_spec.rb +0 -6
  350. data/spec/frontiers/normalize_uris_spec.rb +0 -59
  351. data/spec/frontiers/redis_bloomfilter_spec.rb +0 -6
  352. data/spec/frontiers/redis_frontier_spec.rb +0 -6
  353. data/spec/http_adapters/adapter_pool_spec.rb +0 -33
  354. data/spec/http_adapters/net_http_adapter_spec.rb +0 -83
  355. data/spec/http_adapters/selenium_adapter_spec.rb +0 -53
  356. data/spec/integration/callbacks_spec.rb +0 -42
  357. data/spec/integration/locals_spec.rb +0 -106
  358. data/spec/integration/peeking_spec.rb +0 -61
  359. data/spec/job_spec.rb +0 -122
  360. data/spec/processor_spec.rb +0 -31
  361. data/spec/routing/custom_rule_spec.rb +0 -26
  362. data/spec/routing/host_rule_spec.rb +0 -48
  363. data/spec/routing/path_rule_spec.rb +0 -66
  364. data/spec/routing/protocol_rule_spec.rb +0 -26
  365. data/spec/routing/query_rule_spec.rb +0 -124
  366. data/spec/routing/rule_spec.rb +0 -251
  367. data/spec/routing/uri_rule_spec.rb +0 -24
  368. data/spec/shared/frontier.rb +0 -96
  369. data/wayfarer-jruby.gemspec +0 -49
@@ -1,67 +0,0 @@
1
- ---
2
- layout: default
3
- title: Page objects
4
- ---
5
-
6
- # `Page` objects
7
-
8
- Retrieved pages are represented by `Page` objects and made accessible by `#page` within actions. `Page`s support the same set of features regardless of the HTTP adapter in use.
9
-
10
- <aside class="note">
11
- HTTP response headers and status codes are not supported by Selenium WebDrivers. Wayfarer emulates both by having the WebDriver fire an AJAX request to the current page and extracting them from the response. Clearly this is a hack, but it might even work for you. See <a href="https://github.com/bauerd/selenium-emulated_features">selenium-emulated_features</a>.
12
- </aside>
13
-
14
- <aside class="note">
15
- Even after having followed redirects, <code>Page#uri</code> always returns the URI that originally initiated the redirects. This behaviour stems from redirects being opaque to WebDrivers.
16
- </aside>
17
-
18
- A `Page` brings to the table all you'd wish for when doing web scraping:
19
-
20
- * [Nokogiri](http://www.nokogiri.org) parses HTML/XML
21
- * [Oj](https://github.com/ohler55/oj) or the standard lib parses JSON
22
- * __When running on MRI__, [Pismo](https://github.com/peterc/pismo) lets you access metadata, e.g. keywords, author, a summary, … No overhead if you don't use it!
23
-
24
- Let's see it in action:
25
-
26
- {% highlight ruby %}
27
- class DummyJob < Wayfarer::Job
28
- # ...
29
-
30
- def example
31
- page # => #<Wayfarer::Page:...>
32
-
33
- page.uri # => #<URI::...>
34
- page.status_code # => Fixnum
35
- page.body # => String
36
- page.headers # => Hash
37
-
38
- page.doc # => #<Nokogiri::HTML::Document:...> (HTML/XML) or Hash (JSON)
39
- # Also accessible as just `doc`
40
-
41
- page.links # => [URI]
42
- page.stylesheets # => [URI]
43
- page.javascripts # => [URI]
44
- page.images # => [URI]
45
-
46
- # All four previous methods accept arbitrarily many CSS selectors
47
- page.links ".my-target", ".my-other-target"
48
-
49
- # THESE ARE NOT SUPPORTED ON JRUBY!
50
- # On MRI, the following methods get forwarded to a Pismo::Document
51
- # See https://github.com/peterc/pismo
52
- page.title
53
- page.titles
54
- page.author
55
- page.lede
56
- page.keywords
57
- page.sentences(qty)
58
- page.body
59
- page.html_body
60
- page.feed
61
- page.feeds
62
- page.favicon
63
- page.description
64
- page.datetime
65
- end
66
- end
67
- {% endhighlight %}
@@ -1,46 +0,0 @@
1
- ---
2
- layout: default
3
- title: Peeking
4
- ---
5
-
6
- # Peeking
7
- Peeking allows bypassing the [frontier](frontiers.html) in an ad-hoc manner. Use Ruby's `yield` keyword to immediately retrieve and dispatch a URI from within actions. Control gets handed off to the action matching the yielded URI, if any.
8
-
9
- A matching route for the yielded URI is still required. If the yielded URI matches no route or raises an exception, `yield` returns `nil`.
10
-
11
- <aside class="note">
12
- The action that gets the URI dispatched to <strong>will</strong> get assigned another HTTP adapter! HTTP adapters are never shared across actions, i.e. if you're using the Selenium HTTP adapter, the peeked URI gets retrieved by a different browser process.
13
- </aside>
14
-
15
- {% highlight ruby %}
16
- class DummyJob < Wayfarer::Job
17
- route.uri "https://example.com", to: :foo
18
- route.uri "https://w3c.org", to: :bar
19
-
20
- def foo
21
- w3c_page = yield "https://w3c.org"
22
- end
23
-
24
- def bar
25
- page
26
- end
27
- end
28
- {% endhighlight %}
29
-
30
- __Recursive peeking is not supported__, because it could otherwise result in an infinite loop. The following does terminate:
31
-
32
- {% highlight ruby %}
33
- class DummyJob < Wayfarer::Job
34
- route.uri "https://example.com", to: :foo
35
- route.uri "https://w3c.org", to: :bar
36
-
37
- def foo
38
- w3c_page = yield "https://w3c.org"
39
- end
40
-
41
- def bar
42
- # Silently ignored, assigns nil
43
- example_page = yield "https://example.com"
44
- end
45
- end
46
- {% endhighlight %}
@@ -1,100 +0,0 @@
1
- ---
2
- layout: default
3
- title: Selenium & Capybara
4
- ---
5
-
6
- # Selenium & Capybara
7
-
8
- [Selenium](http://www.seleniumhq.org) is a browser automation framework. [Capybara](https://github.com/teamcapybara/capybara) is an acceptance testing framework that puts an expressive DSL on Selenium's WebDrivers. Both are first-class citizens in Wayfarer and the best tools for automating browsers.
9
-
10
- ## Selenium WebDrivers
11
-
12
- WebDrivers let you remote-control browsers, e.g. Firefox, Chrome, Safari and PhantomJS.
13
-
14
- Depending on what browser you want to automate, go install and run the corresponding driver first. For installation instructions, see the project websites:
15
-
16
- * Firefox: [geckodriver](https://github.com/mozilla/geckodriver)
17
- * Chrome: [chromedriver](https://sites.google.com/a/chromium.org/chromedriver)
18
- * Safari: [SafariDriver](https://github.com/SeleniumHQ/selenium/wiki/SafariDriver)
19
- * PhantomJS ships with an embedded driver.
20
-
21
- Other browsers are supported, too. For an exhaustive list, see the "Third Party Drivers, Bindings, and Plugins" section on the [Selenium downloads page](http://www.seleniumhq.org/download).
22
-
23
- If you want to run browser processes on a central server, consider using [Selenium Grid](http://www.seleniumhq.org/projects/grid).
24
-
25
- Wayfarer hides the details of managing Ruby driver objects from you. In order to use Selenium, set the `http_adapter` configuration key to `:selenium`. Pass in the desired browser and arguments by setting the `selenium_argv` key. The number of browser processes can be controlled with the `connection_count` key.
26
-
27
- {% highlight ruby %}
28
- class DummyJob < Wayfarer::Job
29
- config do |c|
30
- # Use 4 Firefox processes
31
- c.http_adapter = :selenium
32
- c.selenium_argv = [:firefox]
33
- c.connection_count = 4
34
-
35
- # Chrome
36
- # c.selenium_argv = [:chrome]
37
-
38
- # Safari
39
- # c.selenium_argv = [:safari]
40
-
41
- # PhantomJS
42
- # c.selenium_argv = [:phantomjs]
43
-
44
- # Selenium Grid
45
- # c.selenium_argv = [
46
- # :remote,
47
- # url: "http://localhost:4444/wd/hub",
48
- # desired_capabilities: :firefox
49
- # ]
50
- end
51
- end
52
- {% endhighlight %}
53
-
54
- <aside class="note">
55
- In order to avoid redirect loops, the <code>:net_http</code> adapter supports the <code>max_http_redirects</code> configuration key. Because redirects are opaque to WebDrivers, the configuration key does not apply to the Selenium adapter. See <a href="configuration.html">Configuration</a>.
56
- </aside>
57
-
58
- ### Accessing the WebDriver
59
-
60
- Within actions, `#driver` returns a [`Selenium::WebDriver::Driver`](http://www.rubydoc.info/gems/selenium-webdriver/Selenium/WebDriver/Driver):
61
-
62
- {% highlight ruby %}
63
- class DummyJob < Wayfarer::Job
64
- config do |c|
65
- c.http_adapter = :selenium
66
- c.selenium_argv = [:firefox]
67
- end
68
-
69
- draw uri: "https://example.com"
70
- def example
71
- driver # => #<Selenium::WebDriver::Driver:...>
72
- end
73
- end
74
- {% endhighlight %}
75
-
76
- <aside class="note">
77
- What you do with a WebDriver is opaque to Wayfarer. If you handle navigation yourself with a WebDriver and bypass the <a href="/guides/frontiers.html">frontier</a>, Wayfarer cannot ensure you don't visit URIs twice.
78
- </aside>
79
-
80
- ## Capybara
81
-
82
- When using the `:selenium` HTTP adapter, `#browser` returns a [`Capybara::Selenium::Driver`](http://www.rubydoc.info/github/jnicklas/capybara/Capybara/Selenium/Driver) within actions:
83
-
84
- {% highlight ruby %}
85
- class DummyJob < Wayfarer::Job
86
- config do |c|
87
- c.http_adapter = :selenium
88
- c.selenium_argv = [:firefox]
89
- end
90
-
91
- draw uri: "https://example.com"
92
- def example
93
- browser # => #<Capybara::Selenium::Driver:...>
94
- end
95
- end
96
- {% endhighlight %}
97
-
98
- <aside class="note">
99
- What you do with a WebDriver is opaque to Wayfarer. If you handle navigation yourself with a WebDriver and bypass the <a href="/guides/frontiers.html">frontier</a>, Wayfarer cannot ensure you don't visit URIs twice.
100
- </aside>
@@ -1,452 +0,0 @@
1
- ---
2
- layout: default
3
- title: Tutorial
4
- ---
5
-
6
- # Tutorial
7
- This tutorial walks you through 66.333% of what's to know about Wayfarer, a web crawling framework for Ruby. Along the way, we'll write a reusable crawler that collects the titles of all open issues from an arbitrary GitHub repository.
8
-
9
- First, we get ourselves a subclass of `Wayfarer::Job`. If you've ever worked with a typical MVC web framework, think of a job as a self-contained controller with routes. If you haven't, don't worry!
10
-
11
- {% highlight ruby %}
12
- require "wayfarer" # This line omitted hereafter
13
-
14
- class CollectGithubIssues < Wayfarer::Job
15
- end
16
- {% endhighlight %}
17
-
18
- Suppose we’re interested in Rails' GitHub repository, which is located at `https://github.com/rails/rails`. We need two things:
19
- 1. A route that matches that URI and …
20
- 2. an instance method (action) which handles that page:
21
-
22
- {% highlight ruby %}
23
- class CollectGithubIssues < Wayfarer::Job
24
- route.uri "https://github.com/rails/rails", to: :repository # (1)
25
-
26
- def repository # (2)
27
- puts "This looks like Rails to me!"
28
- end
29
- end
30
- {% endhighlight %}
31
-
32
- We set up a single route which maps the repository URI (and only that URI) to `CollectGithubIssues#repository`. When we feed our job the URI, the `#repository` method gets called.
33
-
34
- To run a job, call `::perform_now` on your job class and pass an arbitrary number of URIs to start with:
35
-
36
- {% highlight ruby %}
37
- class CollectGithubIssues < Wayfarer::Job
38
- # Gives more detailed output
39
- # I'll omit this from now on
40
- config.logger.level = :debug
41
-
42
- route.uri "https://github.com/rails/rails", to: :repository
43
-
44
- def repository
45
- puts "This looks like Rails to me!"
46
- end
47
- end
48
-
49
- CollectGithubIssues.perform_now("https://github.com/rails/rails", "https://example.com")
50
- {% endhighlight %}
51
-
52
- Note that we pass a URI we have no matching route for, `https://example.com`.
53
-
54
- Save and run your file as you would with every other Ruby file:
55
-
56
- ```
57
- % ruby collect_github_issues.rb
58
- ```
59
-
60
- … and you'll end up with output similar to this:
61
-
62
- ```
63
- Performing CollectGithubIssues (Job ID: …) from Async(default) with arguments: "https://github.com/rails/rails", "https://example.com"
64
- I, […] INFO -- wayfarer: First cycle
65
- I, […] INFO -- wayfarer: Frontier: URI-normalizing #<Wayfarer::Frontiers::MemoryFrontier:0x007fa2a6ae9cf0>
66
- I, […] INFO -- wayfarer: Current cycle contains 2 URI(s)
67
- I, […] INFO -- wayfarer: Dispatched to #repository: https://github.com/rails/rails
68
- This looks like Rails to me!
69
- I, […] INFO -- wayfarer: Staging 0 URI(s)
70
- D, […] DEBUG -- wayfarer: No matching route for: https://example.com/
71
- I, […] INFO -- wayfarer: No URIs left in current cycle
72
- I, […] INFO -- wayfarer: About to cycle. 0 staged URI(s)
73
- Performed CollectGithubIssues (Job ID: …) from Async(default) in 863.69ms
74
- ```
75
-
76
- Here is what happened:
77
-
78
- 1. Both URIs we passed in were matched against our routes.
79
- 2. Our matching GitHub URI's page was retrieved, the mismatching one ignored.
80
- 3. Our `#repository` action was invoked and has access to the retrieved page.
81
-
82
-
83
-
84
- Let’s exchange our static string for the actual page `<title>`. Inside our instance method, we call `#doc` to get ahold of a [`Nokogiri::HTML::Document`](http://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document). [Nokogiri](https://nokogiri.org) is an HTML/XML library, and a parsed document allows us to access the title tag easily:
85
-
86
- {% highlight ruby %}
87
- class CollectGithubIssues < Wayfarer::Job
88
- route.uri "https://github.com/rails/rails", to: :repository
89
-
90
- def repository
91
- # Outputs the <title> attribute value
92
- puts doc.title
93
- end
94
- end
95
-
96
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
97
- {% endhighlight %}
98
-
99
- Wayfarer does not attempt to do black magic on top of Nokogiri. When it comes to extracting specific data from pages, you’re mostly on your own. There are helpers for finding links, CSS/JavaScript files and images (see [`Page` objects](page_objects.html)). But figuring out what the interesting parts of an HTTP response are is still up to you.
100
-
101
- Wayfarer parses JSON, too. You'll get a `Hash` returned by `#doc` instead of a Nokogiri document.
102
-
103
- Rails’ issues are located at `https://github.com/rails/rails/issues`. We need a new route and a new instance method to handle this issue index. By calling `#stage` and passing in an arbitrary number of URIs, we can stage URIs for processing. Note that just because a URI gets staged does not mean it will be fetched—a matching route is required for every URI. Also, Wayfarer will by default ensure that no URI gets processed twice. This behaviour can be turned off, though (see [Configuration](configuration.html)).
104
-
105
- {% highlight ruby %}
106
- class CollectGithubIssues < Wayfarer::Job
107
- routes do
108
- uri "https://github.com/rails/rails", to: :repository
109
- uri "https://github.com/rails/rails/issues", to: :index
110
- end
111
-
112
- def repository
113
- # This is where we want to head at
114
- stage "https://github.com/rails/rails/issues"
115
- end
116
-
117
- def index
118
- puts "Arrived at the issue listing"
119
- end
120
- end
121
-
122
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
123
- {% endhighlight %}
124
-
125
- What we have so far works fine for the Rails repository, but not for others, because the URIs are hardcoded. That's a real pity, because there are more than 10 million repositories on GitHub. We can do better by switching to a host and path rule.
126
-
127
- A host rule narrows down the host portion of a URI, and a path rule the path. Instead of hard-coding the path, pattern matching can be used to have interesting parts of the path extracted:
128
-
129
- {% highlight ruby %}
130
- class CollectGithubIssues < Wayfarer::Job
131
- routes do
132
- # Both routes match only if
133
- # (1) The host is github.com and
134
- # (2) The path segments match
135
- host "github.com" do
136
- path "/:user/:repo", to: :repository
137
- path "/:user/:repo/issues", to: :index
138
- end
139
- end
140
-
141
- def repository
142
- stage "https://github.com/rails/rails/issues"
143
- end
144
-
145
- def index
146
- # Captured path segments: params # => { repo: ..., user: ... }
147
- # Prints 'rails belongs to rails'.
148
- puts "#{params['repo']} belongs to #{params['user']}"
149
- end
150
- end
151
-
152
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
153
- {% endhighlight %}
154
-
155
- Note that we still have a hard-coded URI in `#repository`. Usually, there are two approaches to identify URIs that one wants to follow:
156
-
157
- 1. Constructing the successor URI from the current URI.
158
- 2. Reading the URI from the HTTP response, e.g. extracting an `<a>` tag's `href` property.
159
-
160
- For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. `#stage` takes relative paths and URIs too, and constructs absolute URIs by appending to the current page's URI:
161
-
162
- {% highlight ruby %}
163
- class CollectGithubIssues < Wayfarer::Job
164
- # ...
165
-
166
- def index
167
- # Stages "#{page.uri}/issues"
168
- stage "issues"
169
- end
170
-
171
- # ...
172
- end
173
- {% endhighlight %}
174
-
175
- `#page` returns a [`Page` object]({{base}}/guides/page_objects.html), the general representation of a retrieved page. It gives one access to the page's origin URI, the response headers, the status code and the raw response body and more.
176
-
177
- The second case is where Wayfarer's routing shines. We know that the path structure is `/:user/:repo/issues` and that there's a link somewhere on the repository's frontpage that links to there. We can stage __all__ links of the current page, and have our routes ensure that only interesting ones get processed:
178
-
179
- {% highlight ruby %}
180
- class CollectGithubIssues < Wayfarer::Job
181
- # ...
182
-
183
- def repository
184
- # But only route-matching ones get processed
185
- stage page.links
186
- end
187
-
188
- # ...
189
- end
190
- {% endhighlight %}
191
-
192
- `Page#links` returns all links of the current page. But staging all links brings overhead with it, and we'll want to narrow down the links to stage, especially when crawling large page structures. `Page#links` accepts an arbitrary number of CSS selectors to narrow down links. For clarity, let's give the navigation links their own private helper method:
193
-
194
- {% highlight ruby %}
195
- class CollectGithubIssues < Wayfarer::Job
196
- routes do
197
- host "github.com" do
198
- path "/:user/:repo", to: :repository
199
- path "/:user/:repo/issues", to: :index
200
- end
201
- end
202
-
203
- def repository
204
- stage navigation_links
205
- end
206
-
207
- def index
208
- puts "#{params['repo']} belongs to #{params['user']}"
209
- end
210
-
211
- private
212
-
213
- def navigation_links
214
- page.links ".reponav-item"
215
- end
216
- end
217
-
218
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
219
- {% endhighlight %}
220
-
221
- URIs never get dispatched to private instance methods.
222
-
223
- We're prepared to go after the individual issues now. We add the `#show` action, and route to it with a host and path rule. Links to issue tickets are wrapped in `.issues-listing`, so we can apply the same technique as above:
224
-
225
- {% highlight ruby %}
226
- class CollectGithubIssues < Wayfarer::Job
227
- routes do
228
- host "github.com" do
229
- path "/:user/:repo", to: :repository
230
- path "/:user/:repo/issues", to: :index
231
- path "/:user/:repo/issues/:id", to: :show
232
- end
233
- end
234
-
235
- def repository
236
- stage navigation_links
237
- end
238
-
239
- def index
240
- stage issue_listing_links
241
- end
242
-
243
- def show
244
- puts "Issue No. #{params[:id]} @ #{page.uri}"
245
- end
246
-
247
- private
248
-
249
- def navigation_links
250
- page.links ".reponav-item"
251
- end
252
-
253
- def issue_listing_links
254
- page.links ".issues-listing"
255
- end
256
- end
257
-
258
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
259
- {% endhighlight %}
260
-
261
- Handling pagination boils down to staging one more link in `#index`. As mentioned before, `#stage` accepts an arbitrary number of URIs:
262
-
263
- {% highlight ruby %}
264
- class CollectGithubIssues < Wayfarer::Job
265
- routes do
266
- host "github.com" do
267
- path "/:user/:repo", to: :repository
268
- path "/:user/:repo/issues", to: :index
269
- path "/:user/:repo/issues/:id", to: :show
270
- end
271
- end
272
-
273
- def repository
274
- stage navigation_links
275
- end
276
-
277
- def index
278
- stage issue_listing_links, next_page
279
- end
280
-
281
- def show
282
- puts "Issue No. #{params[:id]} @ #{page.uri}"
283
- end
284
-
285
- private
286
-
287
- def navigation_links
288
- page.links ".reponav-item"
289
- end
290
-
291
- def issue_listing_links
292
- page.links ".issues-listing"
293
- end
294
-
295
- def next_page
296
- page.links ".next_page"
297
- end
298
- end
299
-
300
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
301
- {% endhighlight %}
302
-
303
- By default, all work happens within a single thread. We can speed up crawling by increasing the thread count:
304
-
305
- {% highlight ruby %}
306
- class CollectGithubIssues < Wayfarer::Job
307
- config.connection_count = 4 # Four threads
308
-
309
- # ...
310
- end
311
- {% endhighlight %}
312
-
313
- Next, we want to extract the issue's title, its ID, and the GitHub user who opened it and store that data somewhere.
314
-
315
- For extracting the text from the HTML, we add two private helper methods that query the HTML for the text.
316
-
317
- For storing the data, we introduce a [local]({{base}}/guides/locals.html) named `:records` which stores an array. In job actions, locals can be accessed and manipulated. But now that we've bumped up the thread count, multiple instances of our job class will run concurrently. That's why locals declared with `::let` are replaced with thread-safe counterparts behind the scenes.
318
-
319
- We stop processing with `halt` once we have collected 30 issue records:
320
-
321
- {% highlight ruby %}
322
- class CollectGithubIssues < Wayfarer::Job
323
- config.connection_count = 4
324
-
325
- let(:records) { [] }
326
-
327
- routes do
328
- host "github.com" do
329
- path "/:user/:repo", to: :repository
330
- path "/:user/:repo/issues", to: :index
331
- path "/:user/:repo/issues/:id", to: :show
332
- end
333
- end
334
-
335
- after_crawl do
336
- records.each do |issue|
337
- # Save them somewhere?
338
- puts issue
339
- end
340
- end
341
-
342
- def repository
343
- stage navigation_links
344
- end
345
-
346
- def index
347
- stage issue_listing_links, next_page
348
- end
349
-
350
- def show
351
- return halt if records.count > 30
352
-
353
- records << {
354
- id: params[:id],
355
- title: issue_title,
356
- author: issue_author
357
- }
358
- end
359
-
360
- private
361
-
362
- def issue_title
363
- doc.css(".js-issue-title").text.strip
364
- end
365
-
366
- def issue_author
367
- doc.css(".TableObject-item .author").text.strip
368
- end
369
-
370
- def navigation_links
371
- page.links ".reponav-item"
372
- end
373
-
374
- def issue_listing_links
375
- page.links ".issues-listing"
376
- end
377
-
378
- def next_page
379
- page.links ".next_page"
380
- end
381
- end
382
-
383
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
384
- {% endhighlight %}
385
-
386
- For the last part, we turn off the debugging output (if you still have it enabled) and output each record. You'd probably want to store them somewhere at this point, e.g. by [writing them to a CSV file]({{base}}/recipes/csv.html), or putting them into a database, etc.
387
-
388
- {% highlight ruby %}
389
- class CollectGithubIssues < Wayfarer::Job
390
- config.connection_count = 4
391
- config.logger.level = :fatal
392
-
393
- let(:records) { [] }
394
-
395
- routes do
396
- host "github.com" do
397
- path "/:user/:repo", to: :repository
398
- path "/:user/:repo/issues", to: :index
399
- path "/:user/:repo/issues/:id", to: :show
400
- end
401
- end
402
-
403
- after_crawl do
404
- records.each do |issue|
405
- # Save them somewhere?
406
- puts issue
407
- end
408
- end
409
-
410
- def repository
411
- stage navigation_links
412
- end
413
-
414
- def index
415
- stage issue_listing_links, next_page
416
- end
417
-
418
- def show
419
- return halt if records.count > 30
420
-
421
- records << {
422
- id: params[:id],
423
- title: issue_title,
424
- author: issue_author
425
- }
426
- end
427
-
428
- private
429
-
430
- def issue_title
431
- doc.css(".js-issue-title").text.strip
432
- end
433
-
434
- def issue_author
435
- doc.css(".TableObject-item .author").text.strip
436
- end
437
-
438
- def navigation_links
439
- page.links ".reponav-item"
440
- end
441
-
442
- def issue_listing_links
443
- page.links ".issues-listing"
444
- end
445
-
446
- def next_page
447
- page.links ".next_page"
448
- end
449
- end
450
-
451
- CollectGithubIssues.perform_now("https://github.com/rails/rails")
452
- {% endhighlight %}
@@ -1,11 +0,0 @@
1
- document.addEventListener("DOMContentLoaded", function() {
2
- var links = document.querySelectorAll(".navigation__link");
3
-
4
- for (i = 0; i < links.length; i++) {
5
- var link = links[i];
6
-
7
- if (link.pathname === window.location.pathname) {
8
- link.classList.add("navigation__link--active");
9
- }
10
- }
11
- });
@@ -1,20 +0,0 @@
1
- ---
2
- layout: default
3
- title: Contributing
4
- ---
5
-
6
- # Contributing
7
-
8
- 1. Fork the repository
9
- 2. Ensure the development dependencies are installed:
10
- `% bundle install --with development`
11
- 2. Make changes
12
- 3. Ensure your (new?) tests pass:
13
- `% bundle exec rake test`
14
- 4. Autocorrect RuboCop offenses:
15
- `% bundle exec rake rubocop:auto_correct`
16
- 5. Fix remaining offenses or have a good excuse not to:
17
- `% bundle exec rake rubocop`
18
- 6. Write commit messages at least not worse than mine
19
- 7. Open a pull request on GitHub
20
- 8. Thank you