wayfarer-jruby 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Gemfile +11 -0
  10. data/LICENSE +19 -0
  11. data/README.md +19 -0
  12. data/Rakefile +114 -0
  13. data/benchmark/frontiers.rb +143 -0
  14. data/bin/wayfarer +116 -0
  15. data/docs/.gitignore +2 -0
  16. data/docs/_config.yml +15 -0
  17. data/docs/_includes/base.html +7 -0
  18. data/docs/_includes/head.html +10 -0
  19. data/docs/_includes/navigation.html +172 -0
  20. data/docs/_layouts/default.html +42 -0
  21. data/docs/_sass/base.scss +439 -0
  22. data/docs/_sass/variables.scss +24 -0
  23. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  26. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  31. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  32. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  33. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  34. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  35. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  36. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  38. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  39. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  40. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  41. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  42. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  43. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  44. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  45. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  46. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  47. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  50. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  51. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  52. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  53. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  54. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  55. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  57. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  59. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  60. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  61. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  62. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  63. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  64. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  65. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  66. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  67. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  69. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  70. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  71. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  73. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  77. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  78. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  80. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  81. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  82. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  83. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  84. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  85. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  99. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  100. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  101. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  102. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  103. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  104. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  105. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  106. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  107. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  108. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  109. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  110. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  111. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  112. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  113. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  114. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  115. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  116. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  117. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  118. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  119. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  120. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  121. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  122. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  123. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  124. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  125. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  126. data/docs/_sass/vendor/pygments.scss +356 -0
  127. data/docs/automating_browsers/capybara.md +70 -0
  128. data/docs/css/screen.scss +7 -0
  129. data/docs/guides/callbacks.md +45 -0
  130. data/docs/guides/cli.md +52 -0
  131. data/docs/guides/configuration.md +184 -0
  132. data/docs/guides/error_handling.md +46 -0
  133. data/docs/guides/frontiers.md +93 -0
  134. data/docs/guides/halting.md +23 -0
  135. data/docs/guides/job_queues.md +26 -0
  136. data/docs/guides/locals.md +36 -0
  137. data/docs/guides/logging.md +22 -0
  138. data/docs/guides/page_objects.md +67 -0
  139. data/docs/guides/peeking.md +46 -0
  140. data/docs/guides/selenium_capybara.md +100 -0
  141. data/docs/guides/tutorial.md +452 -0
  142. data/docs/index.md +82 -0
  143. data/docs/js/navigation.js +11 -0
  144. data/docs/misc/contributing.md +20 -0
  145. data/docs/misc/testing.md +11 -0
  146. data/docs/recipes/authentication.md +23 -0
  147. data/docs/recipes/csv.md +29 -0
  148. data/docs/recipes/javascript.md +20 -0
  149. data/docs/recipes/multiple_uris.md +18 -0
  150. data/docs/recipes/screenshots.md +20 -0
  151. data/docs/routing/host_rules.md +24 -0
  152. data/docs/routing/path_rules.md +33 -0
  153. data/docs/routing/query_rules.md +69 -0
  154. data/docs/routing/routes.md +96 -0
  155. data/docs/routing/uri_rules.md +18 -0
  156. data/examples/collect_github_issues.rb +65 -0
  157. data/examples/find_foobar_on_wikipedia.rb +23 -0
  158. data/lib/wayfarer.rb +65 -0
  159. data/lib/wayfarer/configuration.rb +86 -0
  160. data/lib/wayfarer/crawl.rb +79 -0
  161. data/lib/wayfarer/crawl_observer.rb +103 -0
  162. data/lib/wayfarer/dispatcher.rb +104 -0
  163. data/lib/wayfarer/finders.rb +61 -0
  164. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  165. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  166. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  167. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  168. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  169. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  170. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  171. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  172. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  173. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  174. data/lib/wayfarer/job.rb +192 -0
  175. data/lib/wayfarer/locals.rb +40 -0
  176. data/lib/wayfarer/page.rb +94 -0
  177. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  178. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  179. data/lib/wayfarer/processor.rb +103 -0
  180. data/lib/wayfarer/routing/host_rule.rb +19 -0
  181. data/lib/wayfarer/routing/path_rule.rb +54 -0
  182. data/lib/wayfarer/routing/query_rule.rb +59 -0
  183. data/lib/wayfarer/routing/router.rb +71 -0
  184. data/lib/wayfarer/routing/rule.rb +102 -0
  185. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  186. data/spec/configuration_spec.rb +26 -0
  187. data/spec/crawl_spec.rb +48 -0
  188. data/spec/finders_spec.rb +49 -0
  189. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  190. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  191. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  192. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  193. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  194. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  195. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  196. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  197. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  198. data/spec/integration/callbacks_spec.rb +42 -0
  199. data/spec/integration/locals_spec.rb +106 -0
  200. data/spec/job_spec.rb +86 -0
  201. data/spec/page_spec.rb +38 -0
  202. data/spec/parsers/json_parser_spec.rb +30 -0
  203. data/spec/parsers/xml_parser_spec.rb +24 -0
  204. data/spec/processor_spec.rb +31 -0
  205. data/spec/routing/host_rule_spec.rb +48 -0
  206. data/spec/routing/path_rule_spec.rb +66 -0
  207. data/spec/routing/query_rule_spec.rb +124 -0
  208. data/spec/routing/router_spec.rb +67 -0
  209. data/spec/routing/rule_spec.rb +218 -0
  210. data/spec/routing/uri_rule_spec.rb +24 -0
  211. data/spec/shared/frontier.rb +96 -0
  212. data/spec/spec_helpers.rb +62 -0
  213. data/spec/wayfarer_spec.rb +24 -0
  214. data/support/static/finders.html +38 -0
  215. data/support/static/graph/details/a.html +10 -0
  216. data/support/static/graph/details/b.html +10 -0
  217. data/support/static/graph/index.html +20 -0
  218. data/support/static/json/dummy.json +13 -0
  219. data/support/static/links/links.html +28 -0
  220. data/support/static/xml/dummy.xml +120 -0
  221. data/support/test_app.rb +45 -0
  222. data/wayfarer-jruby.gemspec +49 -0
  223. data/wayfarer.gemspec +53 -0
  224. metadata +616 -0
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bloomfilter-rb"
4
+
5
module Wayfarer
  module Frontiers
    # A bloomfilter frontier whose filter lives in Redis.
    # @api private
    class RedisBloomfilter < MemoryBloomfilter
      # Opens a Redis connection, builds a Redis-backed bloom filter on top of
      # it and then hands the configuration to the superclass.
      #
      # NOTE(review): MemoryBloomfilter#initialize is not visible from this
      # file. If it assigns @filter itself, the Redis filter built here would
      # be replaced by the call to super -- confirm the intended order.
      #
      # @param config [Configuration] must respond to `redis_opts` and
      #   `bloomfilter_opts`.
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        @filter = BloomFilter::Redis.new(config.bloomfilter_opts.merge(db: @conn))
        super(config)
      end

      # The three overrides below used to sit *outside* the class body because
      # of a stray `end` after #initialize, which turned them into instance
      # methods of the Frontiers module instead of overrides.

      # Inserts every given URI into the Redis bloom filter.
      # @override
      def cache(*uris)
        uris.each { |uri| @filter.insert(uri) }
      end

      # Whether the Redis bloom filter reports the URI as present.
      # @override
      def cached?(uri)
        @filter.include?(uri)
      end

      # Clears the filter, closes the Redis connection and then lets the
      # superclass release its own resources.
      # @override
      def free
        @filter.clear
        @conn.disconnect!
        super
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "redis"
4
+
5
module Wayfarer
  module Frontiers
    # A frontier that keeps its current, staged and cached URIs in three
    # Redis sets. The set names are prefixed with the configuration's UUID.
    # @api private
    class RedisFrontier < Frontier
      # Connects to Redis before delegating to the superclass.
      # @param config [Configuration] must respond to `redis_opts`.
      def initialize(config)
        @conn = Redis.new(config.redis_opts)
        super(config)
      end

      # @override
      # @return [Array<URI>] the members of the current-URIs set.
      def current_uris
        members = @conn.smembers(current_uris_key)
        members.map { |member| URI(member) }
      end

      # @override
      # @return [Array<URI>] the members of the staged-URIs set.
      def staged_uris
        members = @conn.smembers(staged_uris_key)
        members.map { |member| URI(member) }
      end

      # Adds the given URIs (stringified) to the staged set.
      # Does nothing when no truthy URI is passed.
      # @override
      def stage(*uris)
        return unless uris.any?

        @conn.sadd(staged_uris_key, uris.map(&:to_s))
      end

      # Whether the URI is a member of the staged set.
      # @override
      def staged?(uri)
        member = uri.to_s
        @conn.sismember(staged_uris_key, member)
      end

      # Adds the given URIs (stringified) to the cached set.
      # Does nothing when no truthy URI is passed.
      # @override
      def cache(*uris)
        return unless uris.any?

        @conn.sadd(cached_uris_key, uris.map(&:to_s))
      end

      # Whether the URI is a member of the cached set.
      # @override
      def cached?(uri)
        member = uri.to_s
        @conn.sismember(cached_uris_key, member)
      end

      # Deletes all three sets and closes the Redis connection.
      # @override
      def free
        keys = [current_uris_key, staged_uris_key, cached_uris_key]
        keys.each { |key| @conn.del(key) }

        @conn.disconnect!
      end

      private

      # Drops the staged set.
      def reset_staged_uris!
        @conn.del(staged_uris_key)
      end

      # Makes the staged set the current set by renaming its key.
      # @override
      def swap!
        # Achieve: @current_uris = @staged_uris
        @conn.rename(staged_uris_key, current_uris_key)
      end

      # Replaces the staged set with (staged - cached), computed by Redis.
      def filter_staged_uris!
        destination = staged_uris_key
        @conn.sdiffstore(destination, staged_uris_key, cached_uris_key)
      end

      # Key helpers. @config is not assigned in this class; presumably the
      # superclass constructor stores it -- verify against Frontier.

      def current_uris_key
        "#{@config.uuid}_current_uris"
      end

      def staged_uris_key
        "#{@config.uuid}_staged_uris"
      end

      def cached_uris_key
        "#{@config.uuid}_cached_uris"
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+ require "connection_pool"
5
+
6
module Wayfarer
  module HTTPAdapters
    # Wraps a ConnectionPool whose members are HTTP adapters.
    # Messages this class does not define are forwarded to the pool, except
    # for `shutdown`, which is deliberately hidden (use {#free} instead).
    # @private
    class AdapterPool
      extend Forwardable

      # Builds the pool, sized and timed according to the job's configuration.
      # Adapters are created by {#instantiate_adapter}.
      # @param job [Job] must respond to `config` and `run_hook`.
      def initialize(job)
        @job = job
        @config = job.config

        @pool = ConnectionPool.new(
          size: @config.connection_count,
          timeout: @config.connection_timeout,
          &method(:instantiate_adapter)
        )
      end

      # Shuts the pool down, calling `free` on every adapter.
      def free
        @pool.shutdown(&:free)
      end

      private

      # Creates one adapter (Selenium when configured, otherwise the shared
      # net-http adapter), runs the job's :setup_adapter hook with it and
      # returns it. `try` yields nil for adapters without a driver/browser.
      def instantiate_adapter
        selenium = @config.http_adapter == :selenium

        adapter =
          if selenium
            HTTPAdapters::SeleniumAdapter.new(@config)
          else
            HTTPAdapters::NetHTTPAdapter.instance(@config)
          end

        driver = adapter.try(:driver)
        browser = adapter.try(:browser)
        @job.run_hook(:setup_adapter, adapter, driver, browser)

        adapter
      end

      # Forwards unknown messages to the wrapped pool; `shutdown` is passed
      # up the ancestor chain instead, so it is never forwarded.
      def method_missing(method, *argv, &proc)
        return super if method == :shutdown

        @pool.public_send(method, *argv, &proc)
      end

      # Mirrors #method_missing: everything the pool responds to, minus
      # `shutdown`.
      def respond_to_missing?(method, private = false)
        forwarded = method != :shutdown
        forwarded && (@pool.respond_to?(method) || super)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "securerandom"
4
+ require "net/http"
5
+ require "net/http/persistent"
6
+
7
module Wayfarer
  module HTTPAdapters
    # A singleton adapter for net-http-persistent.
    # @api private
    class NetHTTPAdapter
      # Supported standard lib classes
      RECOGNIZED_URI_TYPES = [
        URI::HTTP,
        URI::HTTPS
      ].freeze

      MalformedURI = Class.new(StandardError)
      MalformedRedirectURI = Class.new(StandardError)
      MaximumRedirectCountReached = Class.new(StandardError)

      attr_accessor :request_header_overrides

      # Returns the shared adapter, creating it on first use. The config is
      # only consulted when the instance is created; later calls return the
      # existing instance regardless of the argument.
      # TODO: Remove default parameter value
      def self.instance(config = Wayfarer.config)
        @@instance ||= new(config)
      end

      # @param config [Configuration] must respond to `max_http_redirects`.
      def initialize(config)
        @config = config
        @conn = Net::HTTP::Persistent.new("wayfarer-#{SecureRandom.uuid}")
      end

      # This is a singleton class. Use ::instance instead.
      private_class_method :new

      # Fetches a page, following redirects.
      #
      # A `Location` header is resolved against the URI that produced the
      # redirect, so relative references (e.g. "/login") are followed instead
      # of being rejected as malformed.
      #
      # @param uri [URI::HTTP, URI::HTTPS] the URI to request.
      # @param redirects_followed [Integer] redirects followed so far
      #   (used internally when recursing).
      # @return [Page]
      # @raise [MalformedURI] if the URI is not supported, or a SocketError
      #   occurs while requesting it.
      # @raise [MalformedRedirectURI] if a redirection URI is not supported.
      # @raise [MaximumRedirectCountReached] if too many redirections are
      #   encountered.
      def fetch(uri, redirects_followed = 0)
        unless RECOGNIZED_URI_TYPES.include?(uri.class)
          raise(redirects_followed.positive? ? MalformedRedirectURI : MalformedURI)
        end

        if redirects_followed > @config.max_http_redirects
          raise MaximumRedirectCountReached
        end

        res = @conn.request(uri)

        if res.is_a?(Net::HTTPRedirection)
          # URI#+ returns an absolute Location unchanged and merges a relative
          # one into the current URI; unsupported schemes are still rejected
          # by the type check at the top of the recursive call.
          redirect_uri = uri + res["location"]
          return fetch(redirect_uri, redirects_followed + 1)
        end

        Page.new(
          uri: uri,
          status_code: res.code.to_i,
          body: res.body,
          headers: res.to_hash
        )
      rescue SocketError
        raise MalformedURI
      end

      # Shuts down all connections.
      def free
        @conn.shutdown
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "selenium-webdriver"
4
+ require "selenium/emulated_features"
5
+ require "capybara"
6
+
7
module Wayfarer
  module HTTPAdapters
    # An adapter for Selenium WebDrivers
    # @api private
    class SeleniumAdapter
      # @param config [Configuration] must respond to `selenium_argv` and
      #   `window_size` (both read lazily, when the driver is first needed).
      def initialize(config = Wayfarer.config)
        @config = config
      end

      # Fetches a page by navigating the WebDriver to the URI.
      # `response_code` and `response_headers` are not defined in this file;
      # presumably they come from "selenium/emulated_features" -- confirm.
      # @return [Page]
      def fetch(uri)
        driver.navigate.to(uri)

        Page.new(
          uri: driver.current_url,
          status_code: driver.response_code,
          body: driver.page_source,
          headers: driver.response_headers
        )
      end

      # Closes the driver. A fresh driver (and Capybara session) is created
      # lazily on the next call to {#driver} / {#browser}.
      def reload!
        @driver&.close
        # The memoized Capybara session wraps the driver that was just
        # closed, so it has to be dropped together with it.
        @browser = nil
        @driver = nil
      end

      # Quits the browser and forgets both the driver and the Capybara
      # session that wrapped it.
      def free
        @driver&.quit
        @browser = nil
        @driver = nil
      end

      # The WebDriver, instantiated on first use.
      # @return [Selenium::WebDriver::Driver]
      def driver
        @driver ||= instantiate_driver
      end

      # A Capybara driver that wraps the {#driver}.
      # @see https://github.com/teamcapybara/capybara Capybara
      def browser
        @browser ||= instantiate_capybara_driver
      end

      private

      # Starts a WebDriver from the configured arguments and resizes its
      # window to the configured dimensions.
      def instantiate_driver
        driver = Selenium::WebDriver.for(*@config.selenium_argv)
        driver.manage.window.size = Selenium::WebDriver::Dimension.new(
          *@config.window_size
        )
        driver
      end

      # Builds a Capybara session around the existing WebDriver by injecting
      # it into a Capybara Selenium driver. Note that this also sets
      # Capybara's global `run_server` and `current_driver` settings.
      def instantiate_capybara_driver
        Capybara.run_server = false
        Capybara.current_driver = :selenium

        capybara_driver = Capybara::Selenium::Driver.new(nil)
        capybara_driver.instance_variable_set(:@browser, driver)

        session = Capybara::Session.new(:selenium, nil)
        session.instance_variable_set(:@driver, capybara_driver)

        session
      end
    end
  end
end
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+ require "hooks"
5
+ require "active_job"
6
+
7
+ # TODO: I only want deep_dup
8
+ require "active_support/all"
9
+
10
module Wayfarer
  # A {Job} is a class that has a {Routing::Router} with many {Routing::Rule}s
  # which are matched against a URI. Rules map URIs onto job instance methods.
  # Under the hood, jobs are instantiated within separate threads by a
  # {Processor}. Every instance gets its own thread. If a URI is matched, its
  # {Page} is retrieved, and made available to instance methods via {#page}.
  #
  # Jobs implement ActiveJob's Job API and are therefore compatible with a wide
  # range of job queues. To run a job immediately, call ::perform_now.
  # To enqueue a job, call ::perform_later.
  #
  # @see https://github.com/rails/rails/tree/master/activejob rails/activejob
  # @see http://edgeguides.rubyonrails.org/active_job_basics.html ActiveJob Basics
  class Job < ActiveJob::Base
    extend Forwardable

    include Hooks
    include Locals

    # @!group Callbacks

    # Callback that fires __once__ before any pages are retrieved.
    # @method before_crawl
    # @scope class
    define_hook :before_crawl

    # Callback that fires __once__ after all pages have been retrieved and
    # processing is done.
    # @method after_crawl
    # @scope class
    define_hook :after_crawl

    # Callback that fires when HTTP adapters are instantiated.
    # @method setup_adapter
    # @scope class
    # @yield [[HTTPAdapters::NetHTTPAdapter, HTTPAdapters::SeleniumAdapter], [Selenium::WebDriver::Driver, nil], [Capybara::Selenium::Driver, nil]]
    define_hooks :setup_adapter

    # @!endgroup

    class << self
      extend Forwardable

      # @!attribute [w] router
      attr_writer :router

      # @!attribute [w] config
      attr_writer :config

      # Returns a class copy with its own router, locals and configuration.
      # Every local is replaced by the result of
      # Locals.thread_safe_counterpart, and a reader for each local is defined
      # on both the copy and its instances.
      # @return [Class] the duplicated job class.
      def prepare
        duplicate = dup
        duplicate.router = router.dup
        duplicate.locals = locals.deep_dup
        duplicate.config = config.dup

        duplicate.locals.each do |(key, val)|
          duplicate.locals[key] = Locals.thread_safe_counterpart(val)
        end

        duplicate.locals.each do |(key, _)|
          # Instance-level reader.
          duplicate.send(:define_method, key) { duplicate.locals[key] }
          # Class-level reader.
          duplicate.send(:define_singleton_method, key) do
            duplicate.locals[key]
          end
        end

        duplicate
      end

      # A configuration based off the global {Wayfarer.config}.
      # The global configuration is cloned on first access.
      # @yield [Configuration]
      # @return [Configuration]
      def config
        @config ||= Wayfarer.config.clone
        yield(@config) if block_given?
        @config
      end

      # A router.
      # If a block is passed in, it is evaluated within the {Router}'s instance.
      # @return [Routing::Router]
      def router(&proc)
        @router ||= Routing::Router.new
        @router.instance_eval(&proc) if block_given?
        @router
      end

      alias route router
      alias routes router

      # Overshadows ActiveJob::Base's own logger
      delegate logger: :config
    end

    # @!attribute [r] staged_uris
    # @return [Array<String>, Array<URI>] URIs to stage for the next cycle.
    # @see #stage
    attr_reader :staged_uris

    # @!attribute [rw] page
    attr_writer :page

    # @!attribute [rw] adapter
    attr_accessor :adapter

    # @!attribute [rw] params
    attr_accessor :params

    # Starts with the halting flag cleared and no staged URIs, then forwards
    # all arguments to ActiveJob::Base.
    def initialize(*argv)
      @halts = false
      @staged_uris = []
      super(*argv)
    end

    # Whether this job will stop processing.
    def halts?
      @halts
    end

    # Performs this job by running a {Crawl} for this job class over the
    # given URIs.
    # @note ActiveJob API
    # @override
    def perform(*uris)
      Crawl.new(self.class, *uris).execute
    end

    protected

    # All following instance methods are available within actions.

    # Sets a halting flag that signals the processor to stop its work.
    def halt
      @halts = true
    end

    # Adds URIs to process in the next cycle.
    # If a relative URI is given, the page's protocol and hostname get
    # prepended.
    # @param [String, URI, Array<String>, Array<URI>]
    def stage(*uris)
      @staged_uris.push(*uris.flatten)
    end

    # The {Page} representing the URI currently processed by an action.
    # When using the Selenium adapter, {Page#body} gets refreshed on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @return Page
    def page
      return @page unless self.class.config.http_adapter == :selenium

      # Rebuild the page around the driver's live source; every other
      # attribute is carried over from the page that was originally fetched.
      Page.new(
        uri: @page.uri,
        status_code: @page.status_code,
        body: driver.page_source,
        headers: @page.headers
      )
    end

    # The parsed response body.
    # When using the Selenium adapter, this parses the body again on every call.
    # Otherwise, subsequent DOM updates (i.e. JavaScript-induced) would be
    # invisible.
    # @method doc
    # @see Page#doc
    delegate doc: :page

    # The Selenium WebDriver.
    # @method driver
    # @see HTTPAdapters::SeleniumAdapter#driver
    delegate driver: :adapter

    # A Capybara driver that wraps the {#driver}.
    # @method browser
    # @see HTTPAdapters::SeleniumAdapter#browser
    delegate browser: :adapter

    # The logger of this job's class, i.e. the configuration's logger.
    # @method logger
    delegate logger: :"self.class"
  end
end