wayfarer-jruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Gemfile +11 -0
  10. data/LICENSE +19 -0
  11. data/README.md +19 -0
  12. data/Rakefile +114 -0
  13. data/benchmark/frontiers.rb +143 -0
  14. data/bin/wayfarer +116 -0
  15. data/docs/.gitignore +2 -0
  16. data/docs/_config.yml +15 -0
  17. data/docs/_includes/base.html +7 -0
  18. data/docs/_includes/head.html +10 -0
  19. data/docs/_includes/navigation.html +172 -0
  20. data/docs/_layouts/default.html +42 -0
  21. data/docs/_sass/base.scss +439 -0
  22. data/docs/_sass/variables.scss +24 -0
  23. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  26. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  31. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  32. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  33. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  34. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  35. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  36. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  38. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  39. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  40. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  41. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  42. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  43. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  44. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  45. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  46. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  47. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  50. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  51. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  52. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  53. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  54. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  55. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  57. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  59. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  60. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  61. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  62. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  63. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  64. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  65. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  66. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  67. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  69. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  70. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  71. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  73. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  77. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  78. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  80. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  81. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  82. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  83. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  84. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  85. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  99. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  100. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  101. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  102. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  103. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  104. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  105. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  106. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  107. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  108. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  109. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  110. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  111. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  112. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  113. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  114. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  115. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  116. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  117. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  118. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  119. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  120. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  121. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  122. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  123. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  124. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  125. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  126. data/docs/_sass/vendor/pygments.scss +356 -0
  127. data/docs/automating_browsers/capybara.md +70 -0
  128. data/docs/css/screen.scss +7 -0
  129. data/docs/guides/callbacks.md +45 -0
  130. data/docs/guides/cli.md +52 -0
  131. data/docs/guides/configuration.md +184 -0
  132. data/docs/guides/error_handling.md +46 -0
  133. data/docs/guides/frontiers.md +93 -0
  134. data/docs/guides/halting.md +23 -0
  135. data/docs/guides/job_queues.md +26 -0
  136. data/docs/guides/locals.md +36 -0
  137. data/docs/guides/logging.md +22 -0
  138. data/docs/guides/page_objects.md +67 -0
  139. data/docs/guides/peeking.md +46 -0
  140. data/docs/guides/selenium_capybara.md +100 -0
  141. data/docs/guides/tutorial.md +452 -0
  142. data/docs/index.md +82 -0
  143. data/docs/js/navigation.js +11 -0
  144. data/docs/misc/contributing.md +20 -0
  145. data/docs/misc/testing.md +11 -0
  146. data/docs/recipes/authentication.md +23 -0
  147. data/docs/recipes/csv.md +29 -0
  148. data/docs/recipes/javascript.md +20 -0
  149. data/docs/recipes/multiple_uris.md +18 -0
  150. data/docs/recipes/screenshots.md +20 -0
  151. data/docs/routing/host_rules.md +24 -0
  152. data/docs/routing/path_rules.md +33 -0
  153. data/docs/routing/query_rules.md +69 -0
  154. data/docs/routing/routes.md +96 -0
  155. data/docs/routing/uri_rules.md +18 -0
  156. data/examples/collect_github_issues.rb +65 -0
  157. data/examples/find_foobar_on_wikipedia.rb +23 -0
  158. data/lib/wayfarer.rb +65 -0
  159. data/lib/wayfarer/configuration.rb +86 -0
  160. data/lib/wayfarer/crawl.rb +79 -0
  161. data/lib/wayfarer/crawl_observer.rb +103 -0
  162. data/lib/wayfarer/dispatcher.rb +104 -0
  163. data/lib/wayfarer/finders.rb +61 -0
  164. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  165. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  166. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  167. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  168. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  169. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  170. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  171. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  172. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  173. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  174. data/lib/wayfarer/job.rb +192 -0
  175. data/lib/wayfarer/locals.rb +40 -0
  176. data/lib/wayfarer/page.rb +94 -0
  177. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  178. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  179. data/lib/wayfarer/processor.rb +103 -0
  180. data/lib/wayfarer/routing/host_rule.rb +19 -0
  181. data/lib/wayfarer/routing/path_rule.rb +54 -0
  182. data/lib/wayfarer/routing/query_rule.rb +59 -0
  183. data/lib/wayfarer/routing/router.rb +71 -0
  184. data/lib/wayfarer/routing/rule.rb +102 -0
  185. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  186. data/spec/configuration_spec.rb +26 -0
  187. data/spec/crawl_spec.rb +48 -0
  188. data/spec/finders_spec.rb +49 -0
  189. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  190. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  191. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  192. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  193. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  194. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  195. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  196. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  197. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  198. data/spec/integration/callbacks_spec.rb +42 -0
  199. data/spec/integration/locals_spec.rb +106 -0
  200. data/spec/job_spec.rb +86 -0
  201. data/spec/page_spec.rb +38 -0
  202. data/spec/parsers/json_parser_spec.rb +30 -0
  203. data/spec/parsers/xml_parser_spec.rb +24 -0
  204. data/spec/processor_spec.rb +31 -0
  205. data/spec/routing/host_rule_spec.rb +48 -0
  206. data/spec/routing/path_rule_spec.rb +66 -0
  207. data/spec/routing/query_rule_spec.rb +124 -0
  208. data/spec/routing/router_spec.rb +67 -0
  209. data/spec/routing/rule_spec.rb +218 -0
  210. data/spec/routing/uri_rule_spec.rb +24 -0
  211. data/spec/shared/frontier.rb +96 -0
  212. data/spec/spec_helpers.rb +62 -0
  213. data/spec/wayfarer_spec.rb +24 -0
  214. data/support/static/finders.html +38 -0
  215. data/support/static/graph/details/a.html +10 -0
  216. data/support/static/graph/details/b.html +10 -0
  217. data/support/static/graph/index.html +20 -0
  218. data/support/static/json/dummy.json +13 -0
  219. data/support/static/links/links.html +28 -0
  220. data/support/static/xml/dummy.xml +120 -0
  221. data/support/test_app.rb +45 -0
  222. data/wayfarer-jruby.gemspec +49 -0
  223. data/wayfarer.gemspec +53 -0
  224. metadata +616 -0
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/hash/indifferent_access"
4
+
5
module Wayfarer
  # Routes a URI through a job's router and, when a route matches, fetches the
  # page and invokes the matching action on a freshly created job instance.
  class Dispatcher
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    # Result types that a {Processor} operates with.
    Mismatch = Struct.new(:uri)
    Halt = Struct.new(:uri, :action)
    Stage = Struct.new(:uris, :ret_val)
    Error = Struct.new(:exception)

    # @!attribute [r] adapter_pool
    # @return [AdapterPool] pool that HTTP adapters are checked out from.
    attr_reader :adapter_pool

    # @!attribute [r] job
    # @return [Object] the job this dispatcher was constructed with.
    attr_reader :job

    # `config` is read from the job, `logger` from that config.
    def_delegator :job, :config
    def_delegator :config, :logger

    # @param job the job handed on to the adapter pool.
    def initialize(job)
      @job = job
      @adapter_pool = HTTPAdapters::AdapterPool.new(job)
    end

    # Dispatches a single URI.
    #
    # The URI is matched against the job's router. Without a matching route a
    # {Mismatch} is returned right away. Otherwise observers are notified, an
    # adapter is checked out of the pool, the page is fetched and the routed
    # action is called on a new job instance. The action receives a block
    # that lets it peek at another URI (see {#peek}).
    #
    # When one of the network/URI errors listed at the bottom is raised, it is
    # logged and the method returns whatever the logger call returns instead
    # of a result struct.
    #
    # @param job the job class; it is routed via `job.router` and
    #   instantiated via `job.new`.
    # @param [URI] uri the URI to dispatch.
    # @param [Boolean] is_peeking whether this call was made from within a
    #   peek; nested peeks are not followed.
    # @return [Mismatch, Halt, Stage]
    def dispatch(job, uri, is_peeking: false)
      action, params = job.router.route(uri)
      return Mismatch.new(uri) unless action

      params = ActiveSupport::HashWithIndifferentAccess.new(params)

      notify_observers!(DispatchedURI.new(action, uri))

      job_instance = job.new
      ret_val = nil

      adapter_pool.with do |adapter|
        job_instance.page = adapter.fetch(uri)
        job_instance.adapter = adapter
        job_instance.params = params

        ret_val = job_instance.public_send(action) do |peek_uri|
          peek(job, uri, peek_uri, is_peeking)
        end
      end

      return Halt.new(uri, action) if job_instance.halts?

      Stage.new(job_instance.staged_uris, ret_val)
    # What follows are exceptions whose origin I don't care about at the moment
    # TODO: Better logging
    rescue Net::HTTP::Persistent::Error
      logger.warn("Net::HTTP::Persistent::Error @ #{uri}")
    rescue Errno::EHOSTUNREACH
      logger.warn("Host unreachable @ #{uri}")
    rescue Errno::ENETUNREACH
      logger.warn("No route to network present @ #{uri}")
    rescue Net::OpenTimeout, Net::ReadTimeout
      logger.warn("::Net timeout @ #{uri}")

    # SSL verification failed due to a missing certificate
    rescue OpenSSL::SSL::SSLError
      logger.warn("SSL verification failed @ #{uri}")

    # Ruby/zlib encountered a Z_DATA_ERROR.
    # Usually if a stream was prematurely freed.
    # Probably has to do with net-http-persistent?
    rescue Zlib::DataError
      logger.warn("Z_DATA_ERROR")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedURI, URI::InvalidURIError
      logger.info("[warn#{self}] Malformed URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MalformedRedirectURI
      logger.info("Malformed redirect URI @ #{uri}")
    rescue HTTPAdapters::NetHTTPAdapter::MaximumRedirectCountReached
      logger.info("Maximum redirect count reached @ #{uri}")
    end

    private

    # Dispatches `peek_uri` on behalf of an action and hands back the return
    # value of the peeked action.
    #
    # Yields `nil` when already peeking, and also when anything goes wrong
    # (for instance when the nested dispatch did not produce a {Stage} and
    # therefore has no `ret_val`).
    #
    # NOTE(review): the Peeking event is built from the URI of the page being
    # processed, not from `peek_uri` -- confirm that this is intended.
    def peek(job, uri, peek_uri, is_peeking)
      return if is_peeking

      notify_observers!(Peeking.new(uri))
      dispatch(job, URI(peek_uri), is_peeking: true).ret_val
    rescue
      nil
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Finders
5
+ # Returns the expanded `href` attribute URIs from all or targeted `<a>` tags.
6
+ # @param [*Array<String>] filters CSS/XPath expressions.
7
+ # @return [Array<URI>]
8
+ def links(*filters)
9
+ query("a", "href", *filters)
10
+ end
11
+
12
+ # Returns the expanded `href` attribute URIs from all or targeted `<link rel="stylesheet" ...>` tags.
13
+ # @param [*Array<String>] filters CSS/XPath expressions.
14
+ # @return [Array<URI>]
15
+ def stylesheets(*filters)
16
+ query("link[rel='stylesheet']", "href", *filters)
17
+ end
18
+
19
+ # Returns the expanded `src` attribute URIs from all or targeted `<script>` tags.
20
+ # TODO: Tests
21
+ # @param [*Array<String>] filters CSS/XPath expressions.
22
+ # @return [Array<URI>]
23
+ def javascripts(*filters)
24
+ query("script", "src", *filters)
25
+ end
26
+
27
+ alias scripts javascripts
28
+
29
+ # Returns the expanded `src` attribute URIs from all or targeted `<img>` tags.
30
+ # TODO: Tests
31
+ # @param [*Array<String>] filters CSS/XPath expressions.
32
+ # @return [Array<URI>]
33
+ def images(*filters)
34
+ query("img", "src", *filters)
35
+ end
36
+
37
+ private
38
+
39
+ # TODO: Lord have mercy
40
+ def query(selector, attr, *filters)
41
+ nodes = if filters.any?
42
+ doc.search(*filters).css(selector)
43
+ else
44
+ doc.css(selector)
45
+ end
46
+
47
+ links = nodes.map { |node|
48
+ begin
49
+ URI.join(uri, node.attr(attr))
50
+ rescue
51
+ nil
52
+ end
53
+ }
54
+
55
+ links
56
+ .find_all { |uri| uri.is_a?(URI) }
57
+ .uniq
58
+ .map(&:to_s)
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
module Wayfarer
  module Frontiers
    # @abstract Base class that spells out the frontier interface and
    #   implements the cycle step shared by all frontiers.
    # @api private
    class Frontier
      # @return the configuration object passed to the constructor.
      attr_reader :config

      # @param config configuration; {#cycle} reads `allow_circulation` from it.
      def initialize(config)
        @config = config
      end

      # Returns URIs to be scraped in the current cycle.
      # @note Usually an expensive operation!
      # @return [Array<URI>]
      # @raise [RuntimeError] unless overridden.
      def current_uris
        raise "Unimplemented"
      end

      # Returns staged URIs.
      # @return [Array<URI>]
      # @raise [RuntimeError] unless overridden.
      def staged_uris
        raise "Unimplemented"
      end

      # Stages URIs for processing in the next cycle.
      # @param [*Array<URI>, *Array<String>] uris
      # @raise [RuntimeError] unless overridden.
      def stage(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is staged.
      # @raise [RuntimeError] unless overridden.
      def staged?(_uri)
        raise "Unimplemented"
      end

      # Caches URIs so they don't get processed again.
      # @param [*Array<URI>, *Array<String>] uris
      # @raise [RuntimeError] unless overridden.
      def cache(*_uris)
        raise "Unimplemented"
      end

      # Whether a URI is cached.
      # @raise [RuntimeError] unless overridden.
      def cached?(_uri)
        raise "Unimplemented"
      end

      # Frees resources. Does nothing by default.
      def free
      end

      # Moves the frontier on to its next cycle.
      #
      # Unless the configuration allows circulation, the current URIs are
      # cached first and the staged URIs are filtered. If no staged URIs are
      # left afterwards, nothing else happens. Otherwise the staged URIs are
      # swapped in and the staging area is reset.
      #
      # @return [Boolean] `false` when there were no staged URIs, `true`
      #   otherwise.
      def cycle
        unless config.allow_circulation
          cache(*current_uris) # TODO: Make it a template method
          filter_staged_uris!
        end

        return false unless staged_uris.any?

        swap!
        reset_staged_uris!

        true
      end

      protected

      # Hook run by {#cycle} after the current URIs were cached, when
      # circulation is not allowed. Does nothing by default.
      def filter_staged_uris!
      end

      # Hook run by {#cycle} when there are staged URIs. Does nothing by
      # default.
      def swap!
      end

      # Hook run by {#cycle} right after {#swap!}. Does nothing by default.
      def reset_staged_uris!
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bloomfilter-rb"
4
+
5
module Wayfarer
  module Frontiers
    # An in-memory frontier that remembers processed URIs in a
    # `BloomFilter::Native` instead of the set kept by {MemoryFrontier}.
    #
    # NOTE(review): being a Bloom filter, {#cached?} presumably can report
    # false positives -- confirm against the filter options in use.
    # @api private
    class MemoryBloomfilter < MemoryFrontier
      # @param config configuration; `bloomfilter_opts` is handed to the
      #   filter's constructor.
      def initialize(config)
        @filter = BloomFilter::Native.new(config.bloomfilter_opts)
        super(config)
      end

      # Records URIs in the filter.
      #
      # URIs are keyed by their string form, like the other in-memory
      # frontiers do, so `URI` objects and equal strings hit the same entry.
      #
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*uris)
        uris.each { |uri| @filter.insert(uri.to_s) }
      end

      # Whether the filter reports the URI's string form as present.
      #
      # @override
      # @param [URI, String] uri
      def cached?(uri)
        @filter.include?(uri.to_s)
      end

      # Frees up memory: clears the filter, then lets the superclass drop
      # its own state.
      def free
        @filter.clear
        super
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
require "set"

# The value of RUBY_PLATFORM under JRuby. Loading this file has always
# defined this top-level constant, so it is kept for compatibility.
JAVA_PLATFORM = "java"

# The platform check has to be a comparison. The previous
# `unless JAVA_PLATFORM = "java"` was an assignment, which is always truthy,
# so `parallel` was never loaded on any platform.
require "parallel" unless RUBY_PLATFORM == "java"

module Wayfarer
  module Frontiers
    # A naive in-memory frontier backed by three sets: the URIs of the
    # current cycle, the staged URIs and the cached (already seen) URIs.
    # @api private
    class MemoryFrontier < Frontier
      # @param config configuration, passed on to {Frontier}.
      def initialize(config)
        @current_uris = Set.new
        @staged_uris = Set.new
        @cached_uris = Set.new
        super(config)
      end

      # Returns the current cycle's entries, each converted with `URI()`.
      #
      # @override
      # @return [Array<URI>]
      def current_uris
        platform_map(@current_uris) { |uri| URI(uri) }
      end

      # Returns the staged entries as they were handed to {#stage}.
      #
      # @override
      # @return [Array]
      def staged_uris
        @staged_uris.to_a # These are assumed to be URIs already, so no map
      end

      # Adds the given URIs to the staged set.
      #
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def stage(*uris)
        @staged_uris |= uris
      end

      # Whether the URI's string form is a member of the staged set.
      #
      # @override
      def staged?(uri)
        @staged_uris.include?(uri.to_s)
      end

      # Adds the string forms of the given URIs to the cached set.
      #
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*uris)
        @cached_uris |= platform_map(uris, &:to_s)
      end

      # Whether the URI's string form is a member of the cached set.
      #
      # @override
      def cached?(uri)
        @cached_uris.include?(uri.to_s)
      end

      # Drops all three sets. The frontier is not usable afterwards.
      #
      # @override
      def free
        @current_uris = @staged_uris = @cached_uris = nil
      end

      private

      # Starts over with an empty staged set.
      def reset_staged_uris!
        @staged_uris = Set.new
      end

      # Makes the staged set the current one.
      def swap!
        @current_uris = @staged_uris
      end

      # Removes staged entries that are cached already.
      def filter_staged_uris!
        @staged_uris.delete_if { |uri| cached?(uri) }
      end

      # Maps over `items` with a plain `map` under JRuby and with
      # `Parallel.map` everywhere else (`parallel` is only loaded there).
      def platform_map(items, &block)
        if RUBY_PLATFORM == "java"
          items.map(&block)
        else
          Parallel.map(items, &block)
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "trie"
4
+
5
module Wayfarer
  module Frontiers
    # An in-memory frontier that remembers cached URIs in a trie.
    # @api private
    class MemoryTrieFrontier < MemoryFrontier
      # @param config configuration, passed on to the superclass.
      def initialize(config)
        @trie = Trie.new
        super(config)
      end

      # Adds the string form of every given URI to the trie.
      #
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*uris)
        uris.each do |uri|
          @trie.add(uri.to_s)
        end
      end

      # Case-compares `@str_or_regexp` with the host of the given URI.
      #
      # NOTE(review): nothing in this class assigns `@str_or_regexp` -- this
      # looks like a leftover from elsewhere; confirm whether it can go.
      def match!(uri)
        @str_or_regexp === uri.host
      end

      # Whether the string form of the URI is a key of the trie.
      def cached?(uri)
        # The directive keeps RuboCop from autocorrecting `#has_key?` to `#key?`.
        @trie.has_key?(uri.to_s) # rubocop:disable Style/PreferredHashMethods
      end

      # Lets go of the trie, then lets the superclass drop its own state.
      #
      # @override
      def free
        @trie = nil
        super
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "normalize_url"
4
+
5
module Wayfarer
  module Frontiers
    # Mixin for frontiers that normalizes every URI before passing it on to
    # the frontier's own `stage`, `staged?`, `cache` and `cached?`.
    #
    # The including frontier has to provide `config`, whose
    # `normalize_uri_options` are handed to `NormalizeUrl.process`.
    # @api private
    module NormalizeURIs
      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def stage(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # @override
      def staged?(uri)
        super(normalize(uri))
      end

      # @override
      # @param [*Array<URI>, *Array<String>] uris
      def cache(*uris)
        super(*uris.map { |uri| normalize(uri) })
      end

      # @override
      def cached?(uri)
        super(normalize(uri))
      end

      # Public like any other `to_s`, so that an explicit `frontier.to_s`
      # works as well as string interpolation does.
      #
      # @return [String] the frontier's own description, prefixed.
      def to_s
        "URI-normalizing #{super}"
      end

      private

      # @return the URI as normalized by `NormalizeUrl.process` with the
      #   configured options.
      def normalize(uri)
        NormalizeUrl.process(uri, config.normalize_uri_options)
      end
    end
  end
end