wayfarer 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Changelog.md +10 -0
  10. data/Gemfile +11 -0
  11. data/LICENSE +19 -0
  12. data/README.md +21 -0
  13. data/Rakefile +114 -0
  14. data/benchmark/frontiers.rb +143 -0
  15. data/bin/wayfarer +116 -0
  16. data/docs/.gitignore +2 -0
  17. data/docs/_config.yml +15 -0
  18. data/docs/_includes/base.html +7 -0
  19. data/docs/_includes/head.html +10 -0
  20. data/docs/_includes/navigation.html +187 -0
  21. data/docs/_layouts/default.html +42 -0
  22. data/docs/_sass/base.scss +439 -0
  23. data/docs/_sass/variables.scss +24 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  26. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  31. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  32. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  33. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  34. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  35. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  36. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  38. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  39. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  40. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  41. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  42. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  43. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  44. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  45. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  46. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  47. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  50. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  51. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  52. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  53. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  54. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  55. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  57. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  59. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  60. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  61. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  62. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  63. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  64. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  65. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  66. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  67. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  69. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  70. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  71. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  73. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  77. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  78. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  80. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  81. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  82. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  83. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  84. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  85. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  99. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  100. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  101. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  102. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  103. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  104. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  105. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  106. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  107. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  108. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  109. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  110. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  111. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  112. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  113. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  114. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  115. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  116. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  117. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  118. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  119. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  120. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  121. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  122. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  123. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  124. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  125. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  126. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  127. data/docs/_sass/vendor/pygments.scss +356 -0
  128. data/docs/automating_browsers/capybara.md +70 -0
  129. data/docs/css/screen.scss +7 -0
  130. data/docs/guides/callbacks.md +45 -0
  131. data/docs/guides/cli.md +52 -0
  132. data/docs/guides/configuration.md +184 -0
  133. data/docs/guides/error_handling.md +46 -0
  134. data/docs/guides/frontiers.md +93 -0
  135. data/docs/guides/halting.md +23 -0
  136. data/docs/guides/job_queues.md +26 -0
  137. data/docs/guides/locals.md +36 -0
  138. data/docs/guides/logging.md +22 -0
  139. data/docs/guides/page_objects.md +67 -0
  140. data/docs/guides/peeking.md +46 -0
  141. data/docs/guides/selenium_capybara.md +100 -0
  142. data/docs/guides/tutorial.md +452 -0
  143. data/docs/index.md +82 -0
  144. data/docs/js/navigation.js +11 -0
  145. data/docs/misc/contributing.md +20 -0
  146. data/docs/misc/testing.md +11 -0
  147. data/docs/recipes/authentication.md +23 -0
  148. data/docs/recipes/csv.md +29 -0
  149. data/docs/recipes/javascript.md +20 -0
  150. data/docs/recipes/multiple_uris.md +18 -0
  151. data/docs/recipes/screenshots.md +20 -0
  152. data/docs/routing/custom_rules.md +16 -0
  153. data/docs/routing/filetypes_rules.md +21 -0
  154. data/docs/routing/host_rules.md +24 -0
  155. data/docs/routing/path_rules.md +33 -0
  156. data/docs/routing/protocol_rules.md +17 -0
  157. data/docs/routing/query_rules.md +69 -0
  158. data/docs/routing/routes.md +96 -0
  159. data/docs/routing/uri_rules.md +18 -0
  160. data/examples/collect_github_issues.rb +65 -0
  161. data/examples/find_foobar_on_wikipedia.rb +23 -0
  162. data/lib/wayfarer/configuration.rb +86 -0
  163. data/lib/wayfarer/crawl.rb +79 -0
  164. data/lib/wayfarer/crawl_observer.rb +103 -0
  165. data/lib/wayfarer/dispatcher.rb +104 -0
  166. data/lib/wayfarer/finders.rb +61 -0
  167. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  168. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  169. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  170. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  171. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  172. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  173. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  174. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  175. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  176. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  177. data/lib/wayfarer/job.rb +211 -0
  178. data/lib/wayfarer/locals.rb +40 -0
  179. data/lib/wayfarer/page.rb +94 -0
  180. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  181. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  182. data/lib/wayfarer/processor.rb +103 -0
  183. data/lib/wayfarer/routing/custom_rule.rb +21 -0
  184. data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
  185. data/lib/wayfarer/routing/host_rule.rb +19 -0
  186. data/lib/wayfarer/routing/path_rule.rb +54 -0
  187. data/lib/wayfarer/routing/protocol_rule.rb +21 -0
  188. data/lib/wayfarer/routing/query_rule.rb +59 -0
  189. data/lib/wayfarer/routing/router.rb +71 -0
  190. data/lib/wayfarer/routing/rule.rb +114 -0
  191. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  192. data/lib/wayfarer.rb +68 -0
  193. data/spec/configuration_spec.rb +26 -0
  194. data/spec/crawl_spec.rb +48 -0
  195. data/spec/finders_spec.rb +49 -0
  196. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  197. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  198. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  199. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  200. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  201. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  202. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  203. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  204. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  205. data/spec/integration/callbacks_spec.rb +42 -0
  206. data/spec/integration/locals_spec.rb +106 -0
  207. data/spec/integration/peeking_spec.rb +61 -0
  208. data/spec/job_spec.rb +122 -0
  209. data/spec/page_spec.rb +38 -0
  210. data/spec/parsers/json_parser_spec.rb +30 -0
  211. data/spec/parsers/xml_parser_spec.rb +24 -0
  212. data/spec/processor_spec.rb +31 -0
  213. data/spec/routing/custom_rule_spec.rb +26 -0
  214. data/spec/routing/filetypes_rule_spec.rb +40 -0
  215. data/spec/routing/host_rule_spec.rb +48 -0
  216. data/spec/routing/path_rule_spec.rb +66 -0
  217. data/spec/routing/protocol_rule_spec.rb +26 -0
  218. data/spec/routing/query_rule_spec.rb +124 -0
  219. data/spec/routing/router_spec.rb +67 -0
  220. data/spec/routing/rule_spec.rb +251 -0
  221. data/spec/routing/uri_rule_spec.rb +24 -0
  222. data/spec/shared/frontier.rb +96 -0
  223. data/spec/spec_helpers.rb +62 -0
  224. data/spec/wayfarer_spec.rb +24 -0
  225. data/support/static/finders.html +38 -0
  226. data/support/static/graph/details/a.html +10 -0
  227. data/support/static/graph/details/b.html +10 -0
  228. data/support/static/graph/index.html +20 -0
  229. data/support/static/json/dummy.json +13 -0
  230. data/support/static/links/links.html +28 -0
  231. data/support/static/xml/dummy.xml +120 -0
  232. data/support/test_app.rb +45 -0
  233. data/wayfarer-jruby.gemspec +49 -0
  234. data/wayfarer.gemspec +53 -0
  235. metadata +697 -0
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "selenium-webdriver"
4
+ require "selenium/emulated_features"
5
+ require "capybara"
6
+
7
module Wayfarer
  module HTTPAdapters
    # An adapter that retrieves pages through a Selenium WebDriver.
    #
    # The WebDriver and the Capybara session wrapping it are both created
    # lazily and memoized; {#reload!} and {#free} discard both so that the next
    # access builds a fresh pair.
    # @api private
    class SeleniumAdapter
      # @param config [Configuration] the configuration that is read for
      #   `selenium_argv` and `window_size` when the driver is instantiated.
      #   Defaults to the global {Wayfarer.config}.
      def initialize(config = Wayfarer.config)
        @config = config
      end

      # Fetches a page by navigating the WebDriver to the given URI.
      # @param uri [String, URI] the location to navigate to.
      # @return [Page]
      def fetch(uri)
        driver.navigate.to(uri)

        Page.new(
          uri: driver.current_url,
          status_code: driver.response_code,
          body: driver.page_source,
          headers: driver.response_headers
        )
      end

      # Closes the driver and forgets it, so the next call to {#driver}
      # instantiates a new one.
      def reload!
        @driver&.close
        forget_driver
      end

      # Quits the browser and forgets the driver.
      def free
        @driver&.quit
        forget_driver
      end

      # The WebDriver, instantiated on first access.
      # @return [Selenium::WebDriver::Driver]
      def driver
        @driver ||= instantiate_driver
      end

      # A Capybara session that wraps the {#driver}.
      # @see https://github.com/teamcapybara/capybara Capybara
      def browser
        @browser ||= instantiate_capybara_driver
      end

      private

      # Drops the memoized driver together with the Capybara session built on
      # top of it. Keeping the session would leave {#browser} pointing at a
      # driver that has already been closed or quit.
      def forget_driver
        @driver = nil
        @browser = nil
      end

      # Builds a WebDriver from the configured arguments and resizes its window
      # to the configured dimensions.
      def instantiate_driver
        web_driver = Selenium::WebDriver.for(*@config.selenium_argv)
        web_driver.manage.window.size = Selenium::WebDriver::Dimension.new(
          *@config.window_size
        )
        web_driver
      end

      # Builds a Capybara session around the existing WebDriver.
      # Capybara would normally launch its own browser, so the already running
      # driver is injected through instance variables instead.
      # NOTE: this mutates global Capybara settings (run_server, current_driver).
      def instantiate_capybara_driver
        Capybara.run_server = false
        Capybara.current_driver = :selenium

        capybara_driver = Capybara::Selenium::Driver.new(nil)
        capybara_driver.instance_variable_set(:@browser, driver)

        session = Capybara::Session.new(:selenium, nil)
        session.instance_variable_set(:@driver, capybara_driver)

        session
      end
    end
  end
end
@@ -0,0 +1,211 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+ require "hooks"
5
+ require "active_job"
6
+
7
+ # TODO: I only want deep_dup
8
+ require "active_support/all"
9
+
10
module Wayfarer
  # A {Job} is a class that has a {Routing::Router} with many {Routing::Rule}s
  # which are matched against a URI. Rules map URIs onto job instance methods.
  # Under the hood, jobs are instantiated within separate threads by a
  # {Processor}. Every instance gets its own thread. If a URI is matched, its
  # {Page} is retrieved, and made available to instance methods via {#page}.
  #
  # Jobs implement ActiveJob's Job API and are therefore compatible with a wide
  # range of job queues. To run a job immediately, call ::perform_now.
  # To enqueue a job, call ::perform_later.
  #
  # @see https://github.com/rails/rails/tree/master/activejob rails/activejob
  # @see http://edgeguides.rubyonrails.org/active_job_basics.html ActiveJob Basics
  class Job < ActiveJob::Base
    extend Forwardable

    include Hooks
    include Locals

    # @!group Callbacks

    # Callback that fires __once__ before any pages are retrieved.
    # @method before_crawl
    # @scope class
    define_hook :before_crawl

    # Callback that fires __once__ after all pages have been retrieved and
    # processing is done.
    # @method after_crawl
    # @scope class
    define_hook :after_crawl

    # Callback that fires when HTTP adapters are instantiated.
    # @method setup_adapter
    # @scope class
    # @yield [[HTTPAdapters::NetHTTPAdapter, HTTPAdapters::SeleniumAdapter], [Selenium::WebDriver::Driver, nil], [Capybara::Selenium::Driver, nil]]
    define_hooks :setup_adapter

    # @!endgroup

    class << self
      extend Forwardable

      # @!attribute [w] router
      attr_writer :router

      # @!attribute [w] config
      attr_writer :config

      # Returns a copy of this class with its own router, configuration and
      # locals. Locals are converted to their thread-safe counterparts and
      # exposed as both instance and singleton reader methods on the copy.
      # @return [Class]
      def prepare
        duplicate = dup
        duplicate.router = router.dup
        duplicate.locals = locals.deep_dup
        duplicate.config = config.dup

        # Only values of existing keys are replaced, which is safe to do while
        # iterating over the hash.
        duplicate.locals.each do |(key, val)|
          duplicate.locals[key] = Locals.thread_safe_counterpart(val)
        end

        # The readers look the value up on every call instead of capturing it.
        duplicate.locals.each do |(key, _)|
          duplicate.send(:define_method, key) { duplicate.locals[key] }
          duplicate.send(:define_singleton_method, key) do
            duplicate.locals[key]
          end
        end

        duplicate
      end

      # A configuration based off the global {Wayfarer.config}.
      # @yield [Configuration]
      # @return [Configuration]
      def config
        @config ||= Wayfarer.config.clone
        yield(@config) if block_given?
        @config
      end

      # A router.
      # If a block is passed in, it is evaluated within the {Router}'s instance.
      # @return [Routing::Router]
      def router(&proc)
        @router ||= Routing::Router.new
        @router.instance_eval(&proc) if block_given?
        @router
      end

      alias route router
      alias routes router

      # Overshadows ActiveJob::Base's own logger
      delegate logger: :config
    end

    # @!attribute [r] staged_uris
    # @return [Array<String>, Array<URI>] URIs to stage for the next cycle.
    # @see #stage
    attr_reader :staged_uris

    # @!attribute [w] page
    # The reader is the protected {#page} method defined further down.
    attr_writer :page

    # @!attribute [rw] adapter
    attr_accessor :adapter

    # @!attribute [rw] params
    attr_accessor :params

    def initialize(*argv)
      @halts = false
      @staged_uris = []
      super(*argv)
    end

    # Whether this job will stop processing.
    # @return [true, false]
    def halts?
      @halts
    end

    # Performs this job by executing a {Crawl} for this job's class.
    # @note ActiveJob API
    # @override
    def perform(*uris)
      Crawl.new(self.class, *uris).execute
    end

    protected

    # All following instance methods are available within actions.

    # Sets a halting flag that signals the processor to stop its work.
    def halt
      @halts = true
    end

    # Adds URIs to process in the next cycle.
    # If a relative path is given, an absolute URI is constructed from the
    # current {#page}'s URI.
    # @param [String, URI, Array<String>, Array<URI>]
    def stage(*uris)
      expanded = uris.flatten.map do |u|
        if (uri = URI(u)).absolute?
          uri
        else
          # URI#join would discard the path of page.uri.path
          current = page.uri.dup
          current.path = File.join(page.uri.path, uri.path)
          current
        end
      end

      # This method has somewhat become the guard keeper for invalid URIs that
      # would lead to exceptions otherwise down the line
      supported = expanded.select do |uri|
        HTTPAdapters::NetHTTPAdapter::RECOGNIZED_URI_TYPES.any? do |type|
          uri.is_a?(type)
        end
      end

      @staged_uris.push(*supported)
    end

    # The {Page} representing the URI currently processed by an action.
    # When using the Selenium adapter, a new {Page} is built on every call with
    # the driver's current page source as its body. Otherwise, subsequent DOM
    # updates (i.e. JavaScript-induced) would be invisible. URI, status code
    # and headers are carried over from the originally fetched page.
    # @return [Page]
    def page
      return @page unless self.class.config.http_adapter == :selenium

      Page.new(
        uri: @page.uri,
        status_code: @page.status_code,
        body: driver.page_source,
        headers: @page.headers
      )
    end

    # The parsed response body.
    # When using the Selenium adapter, this parses the body again on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @method doc
    # @see Page#doc
    delegate doc: :page

    # The Selenium WebDriver.
    # @method driver
    # @see HTTPAdapters::SeleniumAdapter#driver
    delegate driver: :adapter

    # A Capybara driver that wraps the {#driver}.
    # @method browser
    # @see HTTPAdapters::SeleniumAdapter#browser
    delegate browser: :adapter

    # The logger of this job's class, which in turn comes from its config.
    # @method logger
    delegate logger: :"self.class"
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "concurrent"
4
+
5
module Wayfarer
  # Mixin that gives a class a hash of named "locals", declared with
  # {ClassMethods#let}.
  # @api private
  module Locals
    # Maps a value onto a thread-safe wrapper from concurrent-ruby.
    # Arrays, hashes, booleans and integers are wrapped; every other value is
    # returned untouched.
    # @param value [Object]
    # @return [Concurrent::Array, Concurrent::Hash, Concurrent::AtomicBoolean,
    #   Concurrent::AtomicFixnum, Object]
    def self.thread_safe_counterpart(value)
      case value
      when Array then Concurrent::Array.new(value)
      when Hash then Concurrent::Hash[value]
      when true, false then Concurrent::AtomicBoolean.new(value)
      when Integer then Concurrent::AtomicFixnum.new(value)
      else value
      end
    end

    # Hook that adds {ClassMethods} to every class including this module.
    def self.included(base)
      base.extend(ClassMethods)
    end

    # Class-level API made available by including {Locals}.
    module ClassMethods
      # Replaces the whole hash of locals.
      # @!attribute [w] locals
      attr_writer :locals

      # Declares a local whose value is the result of the given block.
      # The block is evaluated immediately.
      # @param key [Symbol] the name of the local.
      # @raise [RuntimeError] if no block is given.
      # @return [Object] the value that was stored.
      def let(key)
        raise "#let called without a block" unless block_given?
        locals[key] = yield
      end

      # The hash of declared locals; created empty on first access.
      # @return [Hash]
      def locals
        @locals ||= {}
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+ require "forwardable"
5
+ require "mime/types"
6
+ require "mime-types"
7
+
8
+ require "pismo" unless RUBY_PLATFORM == "java"
9
+
10
module Wayfarer
  # The representation of fetched pages
  class Page
    extend Forwardable

    include Finders

    # @!attribute [r] uri
    # @return [URI] the URI of the page.
    attr_reader :uri

    # @!attribute [r] status_code
    # @return [Fixnum] the response status code.
    attr_reader :status_code

    # @!attribute [rw] body
    # @return [String] the response body.
    attr_accessor :body

    # @!attribute [r] headers
    # @return [Hash] the response headers.
    attr_reader :headers

    # @param attrs [Hash] the page attributes; the keys `:uri`, `:status_code`,
    #   `:body` and `:headers` are read.
    def initialize(attrs = {})
      @uri = attrs[:uri]
      @status_code = attrs[:status_code]
      @body = attrs[:body]
      @headers = attrs[:headers]
    end

    # Returns a parsed representation of the fetched document depending on the
    # Content-Type field. The result is memoized.
    #
    # If the Content-Type field is missing, or names a media type that the MIME
    # registry does not know, the body is parsed as HTML.
    # @return [Object] the result of {Parsers::JSONParser.parse} if the
    #   Content-Type field's sub-type is "json".
    # @return [Nokogiri::XML::Document] if the Content-Type field's sub-type is "xml".
    # @return [Nokogiri::HTML::Document] otherwise.
    def doc
      return @doc if @doc

      # Header values may be arrays or plain strings, and headers may be
      # absent altogether; Array() copes with all three.
      # TODO: Test
      content_type = Array(@headers && @headers["content-type"]).first
      mime_type = MIME::Types[content_type].first if content_type

      # TODO: Tests
      @doc = case mime_type&.sub_type
             when "json"
               Parsers::JSONParser.parse(@body)
             when "xml"
               Parsers::XMLParser.parse_xml(@body)
             else
               Parsers::XMLParser.parse_html(@body)
             end
    end

    # Pismo is not supported on JRuby.
    unless RUBY_PLATFORM == "java"
      # `#images` is excluded from delegation; presumably it is provided by the
      # included Finders module (verify against Finders).
      # `#body` is an attribute accessor defined above.
      delegate((Pismo::Document::ATTRIBUTE_METHODS - %i[images body]) => :pismo)
    end

    private

    # Returns a Pismo document.
    # @note Not available on JRuby.
    # @note Only succeeds when {#doc} returns a `Nokogiri::HTML::Document`.
    # @return [Pismo::Document]
    def pismo
      @pismo_doc ||= instantiate_pismo_document
    end

    # Builds a Pismo document without running its initializer and fills in the
    # URL, the raw HTML and the already parsed document by hand.
    def instantiate_pismo_document
      pismo_document = Pismo::Document.allocate
      pismo_document.instance_variable_set(:@options, {})
      pismo_document.instance_variable_set(:@url, uri)
      pismo_document.instance_variable_set(:@html, body)
      pismo_document.instance_variable_set(:@doc, doc)
      pismo_document
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "oj" unless RUBY_PLATFORM == "java"
4
+
5
# Oj is only loaded on platforms other than JRuby; everywhere else the
# standard library's JSON module is the fallback and has to be required.
require "json" unless defined?(Oj)

module Wayfarer
  module Parsers
    # A wrapper module for parsing JSON.
    # @private
    module JSONParser
      module_function

      # Parses a JSON string with Oj when it is loaded, and with the standard
      # library's JSON module otherwise.
      # @param [String] json_str the JSON string to parse.
      # @return [Hash, Array, String, Numeric, true, false, nil] the plain Ruby
      #   representation of the JSON document.
      def parse(json_str)
        defined?(Oj) ? Oj.load(json_str) : JSON.parse(json_str)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
module Wayfarer
  module Parsers
    # Thin wrappers around Nokogiri for turning markup into documents.
    # @private
    module XMLParser
      module_function

      # Builds an XML document from a string.
      # @param [String] xml_str the XML markup.
      # @return [Nokogiri::XML::Document]
      def parse_xml(xml_str)
        Nokogiri.XML(xml_str)
      end

      # Builds an HTML document from a string.
      # @param [String] html_str the HTML markup.
      # @return [Nokogiri::HTML::Document]
      def parse_html(html_str)
        Nokogiri.HTML(html_str)
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pp"
4
+ require "concurrent"
5
+ require "observer"
6
+
7
module Wayfarer
  # Runs a job: cycles the frontier, hands the URIs of every cycle to a set of
  # worker threads and reports progress to its observers.
  class Processor
    extend Forwardable

    include Observable
    include CrawlObserver::Events
    include CrawlObserver::ObservableShortcuts

    attr_reader :job

    delegate config: :job
    delegate logger: :config

    def initialize(job, frontier, dispatcher)
      @job = job
      @frontier = frontier
      @dispatcher = dispatcher
      @halted = Concurrent::AtomicBoolean.new(false)
    end

    # Whether the halt flag has been set.
    # @return [true, false]
    def halted?
      @halted.value
    end

    # Raises the halt flag.
    def halt!
      @halted.make_true
    end

    # Processes frontier cycles until the halt flag is set or the frontier
    # stops cycling. Whatever happens, the halt flag is set and the frontier
    # and the adapter pool are freed afterwards.
    # @param [*Array<URI>, *Array<String>] _uris accepted but not used here.
    def run(*_uris)
      notify_observers!(FirstCycle.new(@frontier))

      until @halted.true? || !@frontier.cycle
        current_uris = @frontier.current_uris

        queue = Queue.new
        current_uris.each { |uri| queue.push(uri) }

        notify_observers!(NewCycle.new(current_uris.count))

        @threads = Array.new(config.connection_count) { spawn_worker(queue) }
        @threads.each(&:join)

        notify_observers!(AboutToCycle.new(@frontier.staged_uris.count))
      end
    ensure
      halt!
      @frontier.free
      @dispatcher.adapter_pool.free
    end

    private

    # Starts a thread that dispatches URIs taken from the queue.
    # The non-blocking pop raises ThreadError once the queue is empty, which
    # is reported as a finished cycle. A worker that observes the halt flag
    # leaves its loop without reporting.
    def spawn_worker(queue)
      Thread.new do
        begin
          loop do
            uri = queue.pop(true)
            break if uri.nil? || @halted.true?
            handle_dispatch_result(@dispatcher.dispatch(@job, uri))
          end
        rescue ThreadError
          notify_observers!(CycleFinished.new)
        end
      end
    end

    # Routes a dispatch result to its handler; unknown results are ignored.
    def handle_dispatch_result(result)
      case result
      when Dispatcher::Mismatch
        handle_mismatch(result)
      when Dispatcher::Halt
        handle_halt(result)
      when Dispatcher::Stage
        handle_stage(result)
      when Dispatcher::Error
        handle_error(result)
      end
    end

    # Reports a URI that no route matched.
    def handle_mismatch(mismatch)
      notify_observers!(MismatchedURI.new(mismatch.uri))
    end

    # Reports the halt request, then raises the halt flag.
    def handle_halt(halt)
      notify_observers!(HaltInitiated.new(halt.action, halt.uri))
      halt!
    end

    # Reports the staged URIs and, unless halted, hands them to the frontier.
    def handle_stage(stage)
      notify_observers!(StagingURIs.new(stage.uris.count))
      return if halted?

      @frontier.stage(*stage.uris)
    end

    # Reports an exception that was not handled while dispatching.
    def handle_error(error)
      notify_observers!(UnhandledError.new(error.exception))
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Wayfarer
  module Routing
    # A rule that matches whenever a user-supplied callable returns a truthy
    # value for the URI.
    # @private
    class CustomRule < Rule
      # @param delegate_or_block [#call, nil] the object that decides whether a
      #   URI matches. When omitted, the given block is used instead.
      # @param opts [Hash] options handed on to {Rule}.
      def initialize(delegate_or_block = nil, opts = {}, &proc)
        # The block is captured through the explicit block parameter. Calling
        # Kernel#proc without a block to capture it is deprecated since
        # Ruby 2.7 and raises ArgumentError on Ruby 3.0 and later.
        @delegate_or_block = delegate_or_block || proc
        super(opts, &proc)
      end

      private

      # @return [true, false] whether the callable accepts the URI.
      def match!(uri)
        !!@delegate_or_block.call(uri)
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+ require "uri"
3
+
4
module Wayfarer
  module Routing
    # A rule that matches URIs whose path ends in one of the given file
    # extensions.
    # @private
    class FiletypesRule < Rule
      # @param types [Array<String>, Array<Symbol>] file extensions without
      #   the leading dot.
      # @param opts [Hash] options handed on to {Rule}.
      def initialize(types, opts = {}, &proc)
        @types = types
        super(opts, &proc)
      end

      private

      # Compares the end of the path literally against ".<type>".
      # A literal comparison keeps characters such as "." or "+" in an
      # extension from being read as regular expression syntax.
      # @return [true, false]
      def match!(uri)
        path = uri.path.to_s
        @types.any? { |type| path.end_with?(".#{type}") }
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Wayfarer
  module Routing
    # A rule that tests the host of a URI against a String or a Regexp.
    # @private
    class HostRule < Rule
      # @param str_or_regexp [String, Regexp] what the host is compared with.
      # @param opts [Hash] options handed on to {Rule}.
      def initialize(str_or_regexp, opts = {}, &proc)
        @host_matcher = str_or_regexp
        super(opts, &proc)
      end

      # Case equality covers both kinds of matcher: a String has to equal the
      # host, a Regexp has to match it.
      # rubocop:disable Style/CaseEquality
      def match!(uri)
        host = uri.host
        @host_matcher === host
      end
      # rubocop:enable Style/CaseEquality
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mustermann"
4
+
5
module Wayfarer
  module Routing
    # A rule that matches the path of a URI against either a Mustermann
    # pattern (built from a String) or any other matcher, e.g. a Regexp.
    # @private
    class PathRule < Rule
      attr_reader :matcher

      # @param arg [String, Regexp] a String is compiled into a Mustermann
      #   pattern of the globally configured type; anything else is used as
      #   the matcher as-is.
      # @param opts [Hash] options handed on to {Rule}.
      def initialize(arg, opts = {}, &proc)
        @matcher = if arg.is_a? String
                     Mustermann.new(arg, type: Wayfarer.config.mustermann_type)
                   else
                     arg
                   end

        super(opts, &proc)
      end

      # The parameters captured from the URI.
      # Mustermann patterns yield their named parameters; other matchers yield
      # their captures keyed by their position as a String ("0", "1", ...).
      # @return [Hash] empty if the URI does not match.
      def params(uri)
        return {} unless match!(uri)

        return @matcher.params(uri.path) if mustermann?

        captures = @matcher.match(full_path(uri)).captures
        captures.each_with_index.each_with_object({}) do |(capture, i), hash|
          hash[i.to_s] = capture
        end
      end

      private

      # Mustermann patterns see the path only; other matchers see the path
      # together with the query and the fragment.
      # rubocop:disable Style/CaseEquality
      def match!(uri)
        if mustermann?
          @matcher === uri.path
        else
          @matcher =~ full_path(uri)
        end
      end
      # rubocop:enable Style/CaseEquality

      # NOTE(review): assumes patterns returned by Mustermann.new satisfy
      # is_a?(Mustermann) — TODO confirm against the Mustermann version in use.
      def mustermann?
        @matcher.is_a? Mustermann
      end

      # The path followed by "?query" and "#fragment".
      # Each separator is only added when its component is present, so that a
      # URI without query and fragment yields just the path and end-anchored
      # expressions can match it.
      def full_path(uri)
        query = uri.query ? "?#{uri.query}" : ""
        fragment = uri.fragment ? "##{uri.fragment}" : ""
        "#{uri.path}#{query}#{fragment}"
      end
    end
  end
end