wayfarer-jruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Gemfile +11 -0
  10. data/LICENSE +19 -0
  11. data/README.md +19 -0
  12. data/Rakefile +114 -0
  13. data/benchmark/frontiers.rb +143 -0
  14. data/bin/wayfarer +116 -0
  15. data/docs/.gitignore +2 -0
  16. data/docs/_config.yml +15 -0
  17. data/docs/_includes/base.html +7 -0
  18. data/docs/_includes/head.html +10 -0
  19. data/docs/_includes/navigation.html +172 -0
  20. data/docs/_layouts/default.html +42 -0
  21. data/docs/_sass/base.scss +439 -0
  22. data/docs/_sass/variables.scss +24 -0
  23. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  26. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  31. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  32. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  33. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  34. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  35. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  36. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  38. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  39. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  40. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  41. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  42. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  43. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  44. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  45. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  46. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  47. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  50. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  51. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  52. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  53. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  54. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  55. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  57. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  59. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  60. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  61. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  62. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  63. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  64. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  65. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  66. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  67. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  69. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  70. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  71. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  73. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  77. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  78. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  80. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  81. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  82. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  83. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  84. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  85. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  99. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  100. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  101. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  102. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  103. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  104. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  105. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  106. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  107. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  108. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  109. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  110. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  111. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  112. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  113. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  114. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  115. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  116. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  117. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  118. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  119. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  120. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  121. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  122. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  123. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  124. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  125. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  126. data/docs/_sass/vendor/pygments.scss +356 -0
  127. data/docs/automating_browsers/capybara.md +70 -0
  128. data/docs/css/screen.scss +7 -0
  129. data/docs/guides/callbacks.md +45 -0
  130. data/docs/guides/cli.md +52 -0
  131. data/docs/guides/configuration.md +184 -0
  132. data/docs/guides/error_handling.md +46 -0
  133. data/docs/guides/frontiers.md +93 -0
  134. data/docs/guides/halting.md +23 -0
  135. data/docs/guides/job_queues.md +26 -0
  136. data/docs/guides/locals.md +36 -0
  137. data/docs/guides/logging.md +22 -0
  138. data/docs/guides/page_objects.md +67 -0
  139. data/docs/guides/peeking.md +46 -0
  140. data/docs/guides/selenium_capybara.md +100 -0
  141. data/docs/guides/tutorial.md +452 -0
  142. data/docs/index.md +82 -0
  143. data/docs/js/navigation.js +11 -0
  144. data/docs/misc/contributing.md +20 -0
  145. data/docs/misc/testing.md +11 -0
  146. data/docs/recipes/authentication.md +23 -0
  147. data/docs/recipes/csv.md +29 -0
  148. data/docs/recipes/javascript.md +20 -0
  149. data/docs/recipes/multiple_uris.md +18 -0
  150. data/docs/recipes/screenshots.md +20 -0
  151. data/docs/routing/host_rules.md +24 -0
  152. data/docs/routing/path_rules.md +33 -0
  153. data/docs/routing/query_rules.md +69 -0
  154. data/docs/routing/routes.md +96 -0
  155. data/docs/routing/uri_rules.md +18 -0
  156. data/examples/collect_github_issues.rb +65 -0
  157. data/examples/find_foobar_on_wikipedia.rb +23 -0
  158. data/lib/wayfarer.rb +65 -0
  159. data/lib/wayfarer/configuration.rb +86 -0
  160. data/lib/wayfarer/crawl.rb +79 -0
  161. data/lib/wayfarer/crawl_observer.rb +103 -0
  162. data/lib/wayfarer/dispatcher.rb +104 -0
  163. data/lib/wayfarer/finders.rb +61 -0
  164. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  165. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  166. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  167. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  168. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  169. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  170. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  171. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  172. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  173. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  174. data/lib/wayfarer/job.rb +192 -0
  175. data/lib/wayfarer/locals.rb +40 -0
  176. data/lib/wayfarer/page.rb +94 -0
  177. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  178. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  179. data/lib/wayfarer/processor.rb +103 -0
  180. data/lib/wayfarer/routing/host_rule.rb +19 -0
  181. data/lib/wayfarer/routing/path_rule.rb +54 -0
  182. data/lib/wayfarer/routing/query_rule.rb +59 -0
  183. data/lib/wayfarer/routing/router.rb +71 -0
  184. data/lib/wayfarer/routing/rule.rb +102 -0
  185. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  186. data/spec/configuration_spec.rb +26 -0
  187. data/spec/crawl_spec.rb +48 -0
  188. data/spec/finders_spec.rb +49 -0
  189. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  190. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  191. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  192. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  193. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  194. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  195. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  196. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  197. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  198. data/spec/integration/callbacks_spec.rb +42 -0
  199. data/spec/integration/locals_spec.rb +106 -0
  200. data/spec/job_spec.rb +86 -0
  201. data/spec/page_spec.rb +38 -0
  202. data/spec/parsers/json_parser_spec.rb +30 -0
  203. data/spec/parsers/xml_parser_spec.rb +24 -0
  204. data/spec/processor_spec.rb +31 -0
  205. data/spec/routing/host_rule_spec.rb +48 -0
  206. data/spec/routing/path_rule_spec.rb +66 -0
  207. data/spec/routing/query_rule_spec.rb +124 -0
  208. data/spec/routing/router_spec.rb +67 -0
  209. data/spec/routing/rule_spec.rb +218 -0
  210. data/spec/routing/uri_rule_spec.rb +24 -0
  211. data/spec/shared/frontier.rb +96 -0
  212. data/spec/spec_helpers.rb +62 -0
  213. data/spec/wayfarer_spec.rb +24 -0
  214. data/support/static/finders.html +38 -0
  215. data/support/static/graph/details/a.html +10 -0
  216. data/support/static/graph/details/b.html +10 -0
  217. data/support/static/graph/index.html +20 -0
  218. data/support/static/json/dummy.json +13 -0
  219. data/support/static/links/links.html +28 -0
  220. data/support/static/xml/dummy.xml +120 -0
  221. data/support/test_app.rb +45 -0
  222. data/wayfarer-jruby.gemspec +49 -0
  223. data/wayfarer.gemspec +53 -0
  224. metadata +616 -0
@@ -0,0 +1,18 @@
1
+ ---
2
+ layout: default
3
+ title: URI Rules
4
+ ---
5
+
6
+ # URI rules
7
+
8
+ URI rules match a URI against an exact string.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.uri "https://example.com"
13
+ end
14
+ {% endhighlight %}
15
+
16
+ Matches:
17
+
18
+ * Only `https://example.com`
@@ -0,0 +1,65 @@
1
+ require_relative "../lib/wayfarer"
2
+
3
# Example job: walks a GitHub repository and gathers one record per issue page.
#
# Routing (all on github.com):
#   /:user/:repo             -> #repository
#   /:user/:repo/issues      -> #index
#   /:user/:repo/issues/:id  -> #show
class CollectGithubIssues < Wayfarer::Job
  config.connection_count = 4
  config.logger.level = :fatal

  # Accumulator for the collected issue records.
  let(:records) { [] }

  routes do
    host "github.com" do
      path "/:user/:repo", to: :repository
      path "/:user/:repo/issues", to: :index
      path "/:user/:repo/issues/:id", to: :show
    end
  end

  # Once the crawl is over, print every record.
  # Save them somewhere?
  after_crawl { records.each { |record| puts record } }

  # Repository landing page: stage the links found in the repository navigation.
  def repository
    stage(navigation_links)
  end

  # Issue listing: stage the listed issues together with the pagination link.
  def index
    stage(issue_listing_links, next_page)
  end

  # Single issue: halt once more than 30 records exist, otherwise add one.
  def show
    if records.count > 30
      halt
    else
      records << { id: params[:id], title: issue_title, author: issue_author }
    end
  end

  private

  # Stripped text of the issue title element.
  def issue_title
    title_nodes = doc.css(".js-issue-title")
    title_nodes.text.strip
  end

  # Stripped text of the issue author element.
  def issue_author
    author_nodes = doc.css(".TableObject-item .author")
    author_nodes.text.strip
  end

  # Links selected by the repository navigation selector.
  def navigation_links
    page.links(".reponav-item")
  end

  # Links selected by the issue listing selector.
  def issue_listing_links
    page.links(".issues-listing")
  end

  # Links selected by the "next page" pagination selector.
  def next_page
    page.links(".next_page")
  end
end
64
+
65
+ CollectGithubIssues.perform_now("https://github.com/rails/rails")
@@ -0,0 +1,23 @@
1
+ require_relative "../lib/wayfarer"
2
+
3
# Example job: follows links on en.wikipedia.org until a page body contains
# "Foobar", then takes a screenshot and halts.
class FindFoobarOnWikipedia < Wayfarer::Job
  config.http_adapter = :selenium
  config.selenium_argv = [:chrome]
  config.connection_count = 4

  # Keywords gathered from every visited page that did not match.
  let(:keywords) { [] }

  route.host "en.wikipedia.org", to: :article

  # Handles every page on the routed host.
  #
  # Pages without "Foobar" contribute their keywords and have their links
  # staged; the first page with "Foobar" is screenshotted and halts the job.
  def article
    unless page.body =~ /Foobar/
      keywords << page.keywords
      return stage(page.links)
    end

    driver.save_screenshot("/tmp/foobar.png")
    halt
  end
end
22
+
23
+ FindFoobarOnWikipedia.perform_now("https://en.wikipedia.org/wiki/Special:Random")
data/lib/wayfarer.rb ADDED
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Style/Documentation
4
+ require "logger"
5
+ require "uri"
6
+
7
+ # Plumbing
8
+ require_relative "wayfarer/configuration"
9
+
10
+ # Routing
11
+ require_relative "wayfarer/routing/rule"
12
+ require_relative "wayfarer/routing/uri_rule"
13
+ require_relative "wayfarer/routing/host_rule"
14
+ require_relative "wayfarer/routing/path_rule"
15
+ require_relative "wayfarer/routing/query_rule"
16
+ require_relative "wayfarer/routing/router"
17
+
18
+ # Networking
19
+ require_relative "wayfarer/http_adapters/net_http_adapter"
20
+ require_relative "wayfarer/http_adapters/selenium_adapter"
21
+ require_relative "wayfarer/http_adapters/adapter_pool"
22
+
23
+ # Parsers
24
+ require_relative "wayfarer/parsers/xml_parser"
25
+ require_relative "wayfarer/parsers/json_parser"
26
+
27
+ # Frontiers
28
+ require_relative "wayfarer/frontiers/frontier"
29
+ require_relative "wayfarer/frontiers/memory_frontier"
30
+ require_relative "wayfarer/frontiers/redis_frontier"
31
+ require_relative "wayfarer/frontiers/normalize_uris"
32
+
33
+ unless RUBY_PLATFORM == "java"
34
+ require_relative "wayfarer/frontiers/memory_trie_frontier"
35
+ require_relative "wayfarer/frontiers/memory_bloomfilter"
36
+ require_relative "wayfarer/frontiers/redis_bloomfilter"
37
+ end
38
+
39
+ # Processing
40
+ require_relative "wayfarer/crawl"
41
+ require_relative "wayfarer/crawl_observer"
42
+ require_relative "wayfarer/locals"
43
+ require_relative "wayfarer/job"
44
+ require_relative "wayfarer/finders"
45
+ require_relative "wayfarer/page"
46
+ require_relative "wayfarer/dispatcher"
47
+ require_relative "wayfarer/processor"
48
+
49
# Library namespace. Also owns the process-wide logger and configuration.
module Wayfarer
  VERSION = "0.0.1"

  # The shared logger, built on first use.
  #
  # It writes to STDOUT at WARN level; later calls return the same instance.
  #
  # @return [Logger]
  def self.logger
    @logger ||= Logger.new(STDOUT).tap { |log| log.level = Logger::WARN }
  end

  # The shared configuration, built on first use.
  #
  # @yield [Configuration] the configuration, when a block is given
  # @return [Configuration]
  def self.config
    configuration = (@config ||= Configuration.new)
    yield(configuration) if block_given?
    configuration
  end
end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+ require "securerandom"
5
+ require "forwardable"
6
+
7
module Wayfarer
  # Settings object backed by OpenStruct and pre-populated from DEFAULTS.
  #
  # Options are read and written like plain attributes, for example
  # `config.connection_count = 4`.
  class Configuration < OpenStruct
    extend Forwardable

    DEFAULTS = {
      # Print full stacktraces?
      print_stacktraces: true,

      # Crash when encountering unhandled exceptions?
      reraise_exceptions: false,

      # Allow processing URIs multiple times?
      allow_circulation: false,

      # How many HTTP connections/Selenium drivers to use
      # 1:1 correspondence with spawned threads
      connection_count: 1,

      # Which HTTP adapter to use. Supported are :net_http and :selenium
      http_adapter: :net_http,

      # Which frontier to use.
      frontier: :memory,

      # How long a thread may hold an HTTP adapter.
      # Threads that exceed this limit fail with an exception.
      connection_timeout: Float::INFINITY,

      # How many 3xx redirects to follow. Has no effect when using Selenium
      max_http_redirects: 3,

      # Argument vector for instantiating Selenium drivers
      selenium_argv: [:firefox],

      # Argument vector for instantiating a Redis connection
      redis_opts: {
        host: "localhost",
        port: 6379
      }.freeze,

      # Size of browser windows
      window_size: [1024, 768],

      # Which Mustermann pattern type to use when matching URI paths
      # TODO: Mention in docs
      mustermann_type: :sinatra,

      # Options for instantiating Bloomfilters
      bloomfilter_opts: {
        size: 100,
        hashes: 2,
        seed: 1,
        bucket: 3,
        raise: false
      },

      # Whether to normalize URIs
      normalize_uris: true,

      # URI normalization options
      # See: https://github.com/rwz/normalize_url
      normalize_uri_options: {}
    }.freeze

    # @!attribute [r] uuid
    # @return [String] random identifier generated when the instance is built
    attr_reader :uuid

    # @param overrides [Hash] option values that replace the defaults
    def initialize(overrides = {})
      super(default_values.merge(overrides))
      @uuid = SecureRandom.uuid
    end

    # A private copy of the library-wide logger, created on first access.
    #
    # @return [Logger]
    def logger
      @logger ||= Wayfarer.logger.dup
    end

    # Sets every option listed in DEFAULTS back to its default value.
    # Options that are not part of DEFAULTS are left untouched.
    #
    # @return [Hash] the default values that were applied
    def reset!
      default_values.each { |key, val| self[key] = val }
    end

    private

    # Builds a fresh set of default values for one instance.
    #
    # DEFAULTS is only frozen at the top level. Handing its nested arrays and
    # hashes to an instance directly would let an in-place change such as
    # `config.bloomfilter_opts[:size] = 1` leak into every other instance and
    # into later #reset! calls, so unfrozen values are copied. Frozen values
    # (symbols, numbers, booleans, redis_opts) are shared as-is.
    def default_values
      DEFAULTS.each_with_object({}) do |(key, value), copy|
        copy[key] = value.frozen? ? value : value.dup
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "observer"
4
+ require "normalize_url"
5
+
6
module Wayfarer
  # Runs a single crawl for a job.
  #
  # Construction prepares the job and wires up a dispatcher, a frontier and a
  # processor. #execute then runs the `before_crawl` hook, the processor and
  # the `after_crawl` hook while a temporary SIGINT handler is installed.
  class Crawl
    extend Forwardable
    include Observable

    # The prepared job.
    # @!attribute [r] job
    attr_reader :job

    # @!attribute [r] dispatcher
    attr_reader :dispatcher

    delegate config: :job
    delegate logger: :config

    # @param job [#prepare] the job to crawl with; the result of `job.prepare` is kept
    # @param uris [Array] URIs the crawl starts from
    def initialize(job, *uris)
      @job = job.prepare
      @uris = uris
      @dispatcher = Dispatcher.new(@job)
      @processor = Processor.new(@job, frontier, @dispatcher)
    end

    # Runs the crawl. The SIGINT handler that was active before the call is
    # put back afterwards, whether or not the crawl raised.
    #
    # @return the value of the `after_crawl` hook run
    def execute
      trap_signals

      # The observer registers itself on both objects; no reference is kept here.
      CrawlObserver.new(@processor, @dispatcher, config.logger)

      @job.run_hook(:before_crawl)
      @processor.run(*@uris)
      @job.run_hook(:after_crawl)
    ensure
      untrap_signals
    end

    # A frontier with initially pre-staged URIs.
    #
    # The implementation is chosen by `config.frontier`; unrecognised values
    # fall back to the in-memory frontier. The result is memoised.
    #
    # @return [Frontier]
    def frontier
      return @frontier if @frontier

      @frontier = case config.frontier
                  when :memory_trie
                    Frontiers::MemoryTrieFrontier.new(config)
                  when :redis
                    Frontiers::RedisFrontier.new(config)
                  when :memory_bloom
                    Frontiers::MemoryBloomfilter.new(config)
                  when :redis_bloom
                    Frontiers::RedisBloomfilter.new(config)
                  else
                    Frontiers::MemoryFrontier.new(config)
                  end

      @frontier.extend(Frontiers::NormalizeURIs) if config.normalize_uris

      @frontier.stage(*@uris) # TODO: Test

      @frontier
    end

    private

    # Installs the crawl's SIGINT handler and remembers the one it replaces.
    #
    # On SIGINT the crawl is halted, the replaced handler (if callable) gets
    # its turn, and the process exits with status -1.
    def trap_signals
      @cached_sigint_handler = trap(:INT) do
        # NOTE(review): `halt!` is not defined in this class — confirm which
        # object is meant to receive it.
        halt!
        call_cached_sigint_handler
        exit(-1)
      end
    end

    # Re-installs the SIGINT handler that was active before #trap_signals.
    #
    # The handler is restored as-is. Wrapping it in a new block would turn
    # string handlers such as "DEFAULT" into a no-op and swallow Ctrl-C after
    # the crawl has finished.
    def untrap_signals
      trap(:INT, @cached_sigint_handler || "DEFAULT")
    end

    # Invokes the replaced handler when it is callable. `trap` can also
    # return strings like "DEFAULT", which are skipped.
    def call_cached_sigint_handler
      handler = @cached_sigint_handler
      handler.call if handler.respond_to?(:call)
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
module Wayfarer
  # Turns crawl events into log lines.
  #
  # An instance registers itself as an observer on every object passed to
  # the constructor and logs each event it is notified about.
  class CrawlObserver
    # Event payloads passed to #update.
    module Events
      FirstCycle     = Struct.new(:frontier)
      NewCycle       = Struct.new(:current_uris_count)
      DispatchedURI  = Struct.new(:action, :uri)
      CycleFinished  = Class.new
      Peeking        = Struct.new(:uri)
      AboutToCycle   = Struct.new(:staged_uris_count)
      MismatchedURI  = Struct.new(:uri)
      HaltInitiated  = Struct.new(:action, :uri)
      StagingURIs    = Struct.new(:staged_uris_count)
      UnhandledError = Struct.new(:exception)
    end

    # Convenience mixin for observables: marks the object as changed and
    # notifies its observers in one call.
    module ObservableShortcuts
      def notify_observers!(*argv)
        changed
        notify_observers(*argv)
      end
    end

    extend Forwardable

    attr_reader :logger

    # @param observables [Array<#add_observer>] objects to observe
    # @param logger [#debug, #info, #error, #fatal] receives the log lines
    def initialize(*observables, logger)
      @logger = logger
      @observables = observables
      observables.each { |obsv| obsv.add_observer(self) }
    end

    # Observer callback. Events of an unknown type are ignored.
    #
    # @param event [Object] one of the Events payloads
    def update(event)
      case event
      when Events::FirstCycle     then first_cycle(event)
      when Events::NewCycle       then new_cycle(event)
      when Events::DispatchedURI  then dispatched_uri(event)
      when Events::CycleFinished  then cycle_finished
      when Events::Peeking        then peeking(event)
      when Events::AboutToCycle   then about_to_cycle(event)
      when Events::MismatchedURI  then mismatched_uri(event)
      when Events::HaltInitiated  then halt_initiated(event)
      when Events::StagingURIs    then staging_uris(event)
      when Events::UnhandledError then unhandled_error(event)
      end
    end

    private

    # Configuration consulted when logging unhandled errors.
    #
    # Taken from the first observable that exposes `config`, otherwise from
    # the library-wide `Wayfarer.config`.
    # NOTE(review): assumes an observable's `config` is the job's
    # configuration — confirm against Processor/Dispatcher.
    def config
      @config ||= begin
        owner = @observables.find { |obsv| obsv.respond_to?(:config) }
        owner ? owner.config : Wayfarer.config
      end
    end

    def first_cycle(event)
      logger.info("First cycle")
      logger.info("Frontier: #{event.frontier}")
    end

    def new_cycle(event)
      logger.info("Current cycle contains #{event.current_uris_count} URI(s)")
    end

    def dispatched_uri(event)
      logger.info("Dispatched to \##{event.action}: #{event.uri}")
    end

    def cycle_finished
      logger.info("No URIs left in current cycle")
    end

    def peeking(event)
      logger.info("Peeking into: #{event.uri}")
    end

    def about_to_cycle(event)
      logger.info("About to cycle. #{event.staged_uris_count} staged URI(s)")
    end

    def mismatched_uri(event)
      logger.debug("No matching route for: #{event.uri}")
    end

    def halt_initiated(event)
      logger.info("Halt initiated from \##{event.action} at: #{event.uri}")
    end

    def staging_uris(event)
      logger.info("Staging #{event.staged_uris_count} URI(s)")
    end

    # Logs an unhandled exception: at :fatal when exceptions are re-raised,
    # at :error otherwise. With `print_stacktraces` set, the backtrace follows
    # as one "* "-prefixed line per frame.
    def unhandled_error(event)
      exception = event.exception
      level = config.reraise_exceptions ? :fatal : :error
      message = "Unhandled exception in an action: #{exception.class.inspect}\n"

      if config.print_stacktraces
        # An exception that was never raised has no backtrace (nil).
        frames = Array(exception.backtrace).map { |frame| "* #{frame}" }
        message += "#{frames.join("\n")}\n" unless frames.empty?
      end

      logger.public_send(level, message)
    end
  end
end