wayfarer 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rbenv-gemsets +1 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +21 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/.yardopts +3 -0
  9. data/Changelog.md +10 -0
  10. data/Gemfile +11 -0
  11. data/LICENSE +19 -0
  12. data/README.md +21 -0
  13. data/Rakefile +114 -0
  14. data/benchmark/frontiers.rb +143 -0
  15. data/bin/wayfarer +116 -0
  16. data/docs/.gitignore +2 -0
  17. data/docs/_config.yml +15 -0
  18. data/docs/_includes/base.html +7 -0
  19. data/docs/_includes/head.html +10 -0
  20. data/docs/_includes/navigation.html +187 -0
  21. data/docs/_layouts/default.html +42 -0
  22. data/docs/_sass/base.scss +439 -0
  23. data/docs/_sass/variables.scss +24 -0
  24. data/docs/_sass/vendor/bourbon/_bourbon-deprecate.scss +19 -0
  25. data/docs/_sass/vendor/bourbon/_bourbon-deprecated-upcoming.scss +425 -0
  26. data/docs/_sass/vendor/bourbon/_bourbon.scss +90 -0
  27. data/docs/_sass/vendor/bourbon/addons/_border-color.scss +29 -0
  28. data/docs/_sass/vendor/bourbon/addons/_border-radius.scss +48 -0
  29. data/docs/_sass/vendor/bourbon/addons/_border-style.scss +28 -0
  30. data/docs/_sass/vendor/bourbon/addons/_border-width.scss +28 -0
  31. data/docs/_sass/vendor/bourbon/addons/_buttons.scss +69 -0
  32. data/docs/_sass/vendor/bourbon/addons/_clearfix.scss +25 -0
  33. data/docs/_sass/vendor/bourbon/addons/_ellipsis.scss +30 -0
  34. data/docs/_sass/vendor/bourbon/addons/_font-stacks.scss +31 -0
  35. data/docs/_sass/vendor/bourbon/addons/_hide-text.scss +27 -0
  36. data/docs/_sass/vendor/bourbon/addons/_margin.scss +29 -0
  37. data/docs/_sass/vendor/bourbon/addons/_padding.scss +29 -0
  38. data/docs/_sass/vendor/bourbon/addons/_position.scss +51 -0
  39. data/docs/_sass/vendor/bourbon/addons/_prefixer.scss +66 -0
  40. data/docs/_sass/vendor/bourbon/addons/_retina-image.scss +27 -0
  41. data/docs/_sass/vendor/bourbon/addons/_size.scss +56 -0
  42. data/docs/_sass/vendor/bourbon/addons/_text-inputs.scss +118 -0
  43. data/docs/_sass/vendor/bourbon/addons/_timing-functions.scss +34 -0
  44. data/docs/_sass/vendor/bourbon/addons/_triangle.scss +63 -0
  45. data/docs/_sass/vendor/bourbon/addons/_word-wrap.scss +29 -0
  46. data/docs/_sass/vendor/bourbon/css3/_animation.scss +61 -0
  47. data/docs/_sass/vendor/bourbon/css3/_appearance.scss +5 -0
  48. data/docs/_sass/vendor/bourbon/css3/_backface-visibility.scss +5 -0
  49. data/docs/_sass/vendor/bourbon/css3/_background-image.scss +44 -0
  50. data/docs/_sass/vendor/bourbon/css3/_background.scss +57 -0
  51. data/docs/_sass/vendor/bourbon/css3/_border-image.scss +61 -0
  52. data/docs/_sass/vendor/bourbon/css3/_calc.scss +6 -0
  53. data/docs/_sass/vendor/bourbon/css3/_columns.scss +67 -0
  54. data/docs/_sass/vendor/bourbon/css3/_filter.scss +6 -0
  55. data/docs/_sass/vendor/bourbon/css3/_flex-box.scss +327 -0
  56. data/docs/_sass/vendor/bourbon/css3/_font-face.scss +29 -0
  57. data/docs/_sass/vendor/bourbon/css3/_font-feature-settings.scss +6 -0
  58. data/docs/_sass/vendor/bourbon/css3/_hidpi-media-query.scss +12 -0
  59. data/docs/_sass/vendor/bourbon/css3/_hyphens.scss +6 -0
  60. data/docs/_sass/vendor/bourbon/css3/_image-rendering.scss +15 -0
  61. data/docs/_sass/vendor/bourbon/css3/_keyframes.scss +38 -0
  62. data/docs/_sass/vendor/bourbon/css3/_linear-gradient.scss +40 -0
  63. data/docs/_sass/vendor/bourbon/css3/_perspective.scss +12 -0
  64. data/docs/_sass/vendor/bourbon/css3/_placeholder.scss +10 -0
  65. data/docs/_sass/vendor/bourbon/css3/_radial-gradient.scss +40 -0
  66. data/docs/_sass/vendor/bourbon/css3/_selection.scss +44 -0
  67. data/docs/_sass/vendor/bourbon/css3/_text-decoration.scss +27 -0
  68. data/docs/_sass/vendor/bourbon/css3/_transform.scss +21 -0
  69. data/docs/_sass/vendor/bourbon/css3/_transition.scss +81 -0
  70. data/docs/_sass/vendor/bourbon/css3/_user-select.scss +5 -0
  71. data/docs/_sass/vendor/bourbon/functions/_assign-inputs.scss +16 -0
  72. data/docs/_sass/vendor/bourbon/functions/_contains-falsy.scss +25 -0
  73. data/docs/_sass/vendor/bourbon/functions/_contains.scss +31 -0
  74. data/docs/_sass/vendor/bourbon/functions/_is-length.scss +16 -0
  75. data/docs/_sass/vendor/bourbon/functions/_is-light.scss +26 -0
  76. data/docs/_sass/vendor/bourbon/functions/_is-number.scss +16 -0
  77. data/docs/_sass/vendor/bourbon/functions/_is-size.scss +23 -0
  78. data/docs/_sass/vendor/bourbon/functions/_modular-scale.scss +74 -0
  79. data/docs/_sass/vendor/bourbon/functions/_px-to-em.scss +24 -0
  80. data/docs/_sass/vendor/bourbon/functions/_px-to-rem.scss +26 -0
  81. data/docs/_sass/vendor/bourbon/functions/_shade.scss +24 -0
  82. data/docs/_sass/vendor/bourbon/functions/_strip-units.scss +22 -0
  83. data/docs/_sass/vendor/bourbon/functions/_tint.scss +24 -0
  84. data/docs/_sass/vendor/bourbon/functions/_transition-property-name.scss +37 -0
  85. data/docs/_sass/vendor/bourbon/functions/_unpack.scss +32 -0
  86. data/docs/_sass/vendor/bourbon/helpers/_convert-units.scss +26 -0
  87. data/docs/_sass/vendor/bourbon/helpers/_directional-values.scss +108 -0
  88. data/docs/_sass/vendor/bourbon/helpers/_font-source-declaration.scss +53 -0
  89. data/docs/_sass/vendor/bourbon/helpers/_gradient-positions-parser.scss +24 -0
  90. data/docs/_sass/vendor/bourbon/helpers/_linear-angle-parser.scss +35 -0
  91. data/docs/_sass/vendor/bourbon/helpers/_linear-gradient-parser.scss +51 -0
  92. data/docs/_sass/vendor/bourbon/helpers/_linear-positions-parser.scss +77 -0
  93. data/docs/_sass/vendor/bourbon/helpers/_linear-side-corner-parser.scss +41 -0
  94. data/docs/_sass/vendor/bourbon/helpers/_radial-arg-parser.scss +74 -0
  95. data/docs/_sass/vendor/bourbon/helpers/_radial-gradient-parser.scss +55 -0
  96. data/docs/_sass/vendor/bourbon/helpers/_radial-positions-parser.scss +28 -0
  97. data/docs/_sass/vendor/bourbon/helpers/_render-gradients.scss +31 -0
  98. data/docs/_sass/vendor/bourbon/helpers/_shape-size-stripper.scss +15 -0
  99. data/docs/_sass/vendor/bourbon/helpers/_str-to-num.scss +55 -0
  100. data/docs/_sass/vendor/bourbon/settings/_asset-pipeline.scss +7 -0
  101. data/docs/_sass/vendor/bourbon/settings/_deprecation-warnings.scss +8 -0
  102. data/docs/_sass/vendor/bourbon/settings/_prefixer.scss +9 -0
  103. data/docs/_sass/vendor/bourbon/settings/_px-to-em.scss +1 -0
  104. data/docs/_sass/vendor/neat/_neat-helpers.scss +11 -0
  105. data/docs/_sass/vendor/neat/_neat.scss +23 -0
  106. data/docs/_sass/vendor/neat/functions/_new-breakpoint.scss +49 -0
  107. data/docs/_sass/vendor/neat/functions/_private.scss +114 -0
  108. data/docs/_sass/vendor/neat/grid/_box-sizing.scss +15 -0
  109. data/docs/_sass/vendor/neat/grid/_direction-context.scss +33 -0
  110. data/docs/_sass/vendor/neat/grid/_display-context.scss +28 -0
  111. data/docs/_sass/vendor/neat/grid/_fill-parent.scss +22 -0
  112. data/docs/_sass/vendor/neat/grid/_media.scss +92 -0
  113. data/docs/_sass/vendor/neat/grid/_omega.scss +87 -0
  114. data/docs/_sass/vendor/neat/grid/_outer-container.scss +34 -0
  115. data/docs/_sass/vendor/neat/grid/_pad.scss +25 -0
  116. data/docs/_sass/vendor/neat/grid/_private.scss +35 -0
  117. data/docs/_sass/vendor/neat/grid/_row.scss +52 -0
  118. data/docs/_sass/vendor/neat/grid/_shift.scss +50 -0
  119. data/docs/_sass/vendor/neat/grid/_span-columns.scss +94 -0
  120. data/docs/_sass/vendor/neat/grid/_to-deprecate.scss +97 -0
  121. data/docs/_sass/vendor/neat/grid/_visual-grid.scss +42 -0
  122. data/docs/_sass/vendor/neat/mixins/_clearfix.scss +25 -0
  123. data/docs/_sass/vendor/neat/settings/_disable-warnings.scss +13 -0
  124. data/docs/_sass/vendor/neat/settings/_grid.scss +51 -0
  125. data/docs/_sass/vendor/neat/settings/_visual-grid.scss +27 -0
  126. data/docs/_sass/vendor/normalize-3.0.2.scss +427 -0
  127. data/docs/_sass/vendor/pygments.scss +356 -0
  128. data/docs/automating_browsers/capybara.md +70 -0
  129. data/docs/css/screen.scss +7 -0
  130. data/docs/guides/callbacks.md +45 -0
  131. data/docs/guides/cli.md +52 -0
  132. data/docs/guides/configuration.md +184 -0
  133. data/docs/guides/error_handling.md +46 -0
  134. data/docs/guides/frontiers.md +93 -0
  135. data/docs/guides/halting.md +23 -0
  136. data/docs/guides/job_queues.md +26 -0
  137. data/docs/guides/locals.md +36 -0
  138. data/docs/guides/logging.md +22 -0
  139. data/docs/guides/page_objects.md +67 -0
  140. data/docs/guides/peeking.md +46 -0
  141. data/docs/guides/selenium_capybara.md +100 -0
  142. data/docs/guides/tutorial.md +452 -0
  143. data/docs/index.md +82 -0
  144. data/docs/js/navigation.js +11 -0
  145. data/docs/misc/contributing.md +20 -0
  146. data/docs/misc/testing.md +11 -0
  147. data/docs/recipes/authentication.md +23 -0
  148. data/docs/recipes/csv.md +29 -0
  149. data/docs/recipes/javascript.md +20 -0
  150. data/docs/recipes/multiple_uris.md +18 -0
  151. data/docs/recipes/screenshots.md +20 -0
  152. data/docs/routing/custom_rules.md +16 -0
  153. data/docs/routing/filetypes_rules.md +21 -0
  154. data/docs/routing/host_rules.md +24 -0
  155. data/docs/routing/path_rules.md +33 -0
  156. data/docs/routing/protocol_rules.md +17 -0
  157. data/docs/routing/query_rules.md +69 -0
  158. data/docs/routing/routes.md +96 -0
  159. data/docs/routing/uri_rules.md +18 -0
  160. data/examples/collect_github_issues.rb +65 -0
  161. data/examples/find_foobar_on_wikipedia.rb +23 -0
  162. data/lib/wayfarer/configuration.rb +86 -0
  163. data/lib/wayfarer/crawl.rb +79 -0
  164. data/lib/wayfarer/crawl_observer.rb +103 -0
  165. data/lib/wayfarer/dispatcher.rb +104 -0
  166. data/lib/wayfarer/finders.rb +61 -0
  167. data/lib/wayfarer/frontiers/frontier.rb +79 -0
  168. data/lib/wayfarer/frontiers/memory_bloomfilter.rb +32 -0
  169. data/lib/wayfarer/frontiers/memory_frontier.rb +76 -0
  170. data/lib/wayfarer/frontiers/memory_trie_frontier.rb +39 -0
  171. data/lib/wayfarer/frontiers/normalize_uris.rb +48 -0
  172. data/lib/wayfarer/frontiers/redis_bloomfilter.rb +34 -0
  173. data/lib/wayfarer/frontiers/redis_frontier.rb +83 -0
  174. data/lib/wayfarer/http_adapters/adapter_pool.rb +62 -0
  175. data/lib/wayfarer/http_adapters/net_http_adapter.rb +77 -0
  176. data/lib/wayfarer/http_adapters/selenium_adapter.rb +80 -0
  177. data/lib/wayfarer/job.rb +211 -0
  178. data/lib/wayfarer/locals.rb +40 -0
  179. data/lib/wayfarer/page.rb +94 -0
  180. data/lib/wayfarer/parsers/json_parser.rb +20 -0
  181. data/lib/wayfarer/parsers/xml_parser.rb +27 -0
  182. data/lib/wayfarer/processor.rb +103 -0
  183. data/lib/wayfarer/routing/custom_rule.rb +21 -0
  184. data/lib/wayfarer/routing/filetypes_rule.rb +20 -0
  185. data/lib/wayfarer/routing/host_rule.rb +19 -0
  186. data/lib/wayfarer/routing/path_rule.rb +54 -0
  187. data/lib/wayfarer/routing/protocol_rule.rb +21 -0
  188. data/lib/wayfarer/routing/query_rule.rb +59 -0
  189. data/lib/wayfarer/routing/router.rb +71 -0
  190. data/lib/wayfarer/routing/rule.rb +114 -0
  191. data/lib/wayfarer/routing/uri_rule.rb +21 -0
  192. data/lib/wayfarer.rb +68 -0
  193. data/spec/configuration_spec.rb +26 -0
  194. data/spec/crawl_spec.rb +48 -0
  195. data/spec/finders_spec.rb +49 -0
  196. data/spec/frontiers/memory_bloomfilter_spec.rb +6 -0
  197. data/spec/frontiers/memory_frontier_spec.rb +6 -0
  198. data/spec/frontiers/memory_trie_frontier_spec.rb +6 -0
  199. data/spec/frontiers/normalize_uris_spec.rb +59 -0
  200. data/spec/frontiers/redis_bloomfilter_spec.rb +6 -0
  201. data/spec/frontiers/redis_frontier_spec.rb +6 -0
  202. data/spec/http_adapters/adapter_pool_spec.rb +33 -0
  203. data/spec/http_adapters/net_http_adapter_spec.rb +83 -0
  204. data/spec/http_adapters/selenium_adapter_spec.rb +53 -0
  205. data/spec/integration/callbacks_spec.rb +42 -0
  206. data/spec/integration/locals_spec.rb +106 -0
  207. data/spec/integration/peeking_spec.rb +61 -0
  208. data/spec/job_spec.rb +122 -0
  209. data/spec/page_spec.rb +38 -0
  210. data/spec/parsers/json_parser_spec.rb +30 -0
  211. data/spec/parsers/xml_parser_spec.rb +24 -0
  212. data/spec/processor_spec.rb +31 -0
  213. data/spec/routing/custom_rule_spec.rb +26 -0
  214. data/spec/routing/filetypes_rule_spec.rb +40 -0
  215. data/spec/routing/host_rule_spec.rb +48 -0
  216. data/spec/routing/path_rule_spec.rb +66 -0
  217. data/spec/routing/protocol_rule_spec.rb +26 -0
  218. data/spec/routing/query_rule_spec.rb +124 -0
  219. data/spec/routing/router_spec.rb +67 -0
  220. data/spec/routing/rule_spec.rb +251 -0
  221. data/spec/routing/uri_rule_spec.rb +24 -0
  222. data/spec/shared/frontier.rb +96 -0
  223. data/spec/spec_helpers.rb +62 -0
  224. data/spec/wayfarer_spec.rb +24 -0
  225. data/support/static/finders.html +38 -0
  226. data/support/static/graph/details/a.html +10 -0
  227. data/support/static/graph/details/b.html +10 -0
  228. data/support/static/graph/index.html +20 -0
  229. data/support/static/json/dummy.json +13 -0
  230. data/support/static/links/links.html +28 -0
  231. data/support/static/xml/dummy.xml +120 -0
  232. data/support/test_app.rb +45 -0
  233. data/wayfarer-jruby.gemspec +49 -0
  234. data/wayfarer.gemspec +53 -0
  235. metadata +697 -0
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Wayfarer
  module Routing
    # Matches URIs by their scheme (protocol), e.g. `http` or `https`.
    # @private
    class ProtocolRule < Rule
      # @param protocol [#to_s] the scheme to match, e.g. `:https`.
      # @param opts [Hash] options forwarded to {Rule#initialize}.
      def initialize(protocol, opts = {}, &proc)
        # URI schemes are case-insensitive, so the expected scheme is stored
        # in lower case and compared against a lower-cased actual scheme.
        @protocol = protocol.to_s.downcase
        super(opts, &proc)
      end

      private

      # @return [Boolean] whether the URI's scheme equals the expected one.
      #   URIs without a scheme never match.
      def match!(uri)
        uri.scheme&.downcase == @protocol
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+
5
module Wayfarer
  module Routing
    # Matches URIs whose query string satisfies per-field constraints.
    #
    # Constraints are looked up by field name converted to a Symbol. A
    # constraint may be a String, Integer, Regexp or Range; any other
    # constraint type is never considered violated. Query fields without a
    # constraint are ignored.
    # @private
    class QueryRule < Rule
      # @param field_constraints [Hash{Symbol => String, Integer, Regexp, Range}]
      # @param opts [Hash] options forwarded to {Rule#initialize}.
      def initialize(field_constraints, opts = {}, &proc)
        @field_constraints = field_constraints
        super(opts, &proc)
      end

      private

      # @return [Boolean] false if the URI has no query component, otherwise
      #   whether no query field violates its constraint.
      def match!(uri)
        query = uri.query
        # `CGI::parse` cannot handle `nil`. Guard explicitly instead of
        # rescuing `NoMethodError`, which would also hide unrelated bugs.
        return false if query.nil?

        CGI.parse(query).none? { |field, vals| violates?(field, vals) }
      end

      # Whether the values of a query field violate the field's constraint.
      # Fields without a constraint never violate.
      def violates?(field, vals)
        constraint = @field_constraints[field.to_sym]
        return false unless constraint

        violates_constraint?(constraint, vals)
      end

      # Dispatches on the constraint's type.
      def violates_constraint?(constraint, vals)
        case constraint
        when String  then violates_string?(constraint, vals)
        when Integer then violates_integer?(constraint, vals)
        when Regexp  then violates_regexp?(constraint, vals)
        when Range   then violates_range?(constraint, vals)
        else false
        end
      end

      # Violated unless at least one value equals the String.
      def violates_string?(str, vals)
        vals.none? { |val| str == val }
      end

      # Violated unless at least one value is the decimal representation of
      # the Integer. Non-numeric values are skipped individually, so they
      # cannot mask a matching value that comes later.
      def violates_integer?(int, vals)
        vals.none? { |val| int == decimal_integer(val) }
      end

      # Violated unless at least one value matches the Regexp.
      def violates_regexp?(regexp, vals)
        vals.none? { |val| regexp.match(val) }
      end

      # Violated unless at least one value is a decimal integer covered by
      # the Range. Non-numeric values never satisfy a Range.
      def violates_range?(range, vals)
        vals.none? do |val|
          int = decimal_integer(val)
          !int.nil? && range.include?(int)
        end
      end

      # Parses a query value as a base-10 integer.
      # Base 10 is explicit so that values such as "08" or "010" are not
      # interpreted (or rejected) as octal literals.
      # @return [Integer, nil] nil if the value is not a decimal integer.
      def decimal_integer(val)
        Integer(val, 10)
      rescue ArgumentError
        nil
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+
5
module Wayfarer
  module Routing
    # A {Router} maps URIs onto a {Job}'s instance methods.
    class Router
      extend Forwardable

      # @!attribute [r] rule
      # @return [Rule] the root of the routing tree.
      attr_reader :rule

      # @!attribute [r] blacklist
      # @return [Rule] the root of the tree of forbidden URIs.
      attr_reader :blacklist

      def initialize
        @rule = Rule.new
        @blacklist = Rule.new
      end

      # Every rule-building method of {Rule} is available on the router and
      # appends to the root rule.
      delegate %i[
        uri
        host
        path
        query
        protocol
        filetypes
        if
      ] => :rule

      # Returns the associated instance method (action) of the first rule that
      # matches a URI and the collected parameter hash from the rule chain.
      # @return [[Symbol, Hash]] action and parameters, if a matching rule
      #   exists.
      # @return [false] if no matching rule exists or the URI is forbidden.
      def route(uri)
        return false if forbids?(uri)

        # TODO: Use structs instead
        # `Rule#invoke` returns `false` on a miss; destructuring then yields
        # `false, nil, nil`.
        is_matching, params, action = @rule.invoke(uri)
        return action, params if is_matching && params

        false
      end

      # Whether a route matches the URI.
      # TODO: Test
      # @return [Boolean]
      def routes?(uri)
        !!route(uri)
      end

      # Adds a {Rule} to the blacklist, from an options hash and/or a block
      # evaluated in the context of the blacklist rule.
      # @return [Rule] the blacklist.
      def forbid(opts = {}, &proc)
        @blacklist.build_child_rule_chain_from_options(opts)
        @blacklist.instance_eval(&proc) if block_given?
        @blacklist
      end

      # Whether the URI is matched by the blacklist rule.
      # @see #forbid
      def forbids?(uri)
        @blacklist.matches?(uri)
      end

      # Whether the URI is allowed.
      # @see #forbid
      def allows?(uri)
        !forbids?(uri)
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "forwardable"
4
+
5
module Wayfarer
  module Routing
    # A node in the routing tree.
    #
    # A rule owns an ordered list of child rules and, optionally, a target
    # action. Subclasses decide whether a URI matches by overriding the
    # private `match!`; the base rule matches whenever it has children.
    # @private
    class Rule
      include Enumerable

      extend Forwardable

      attr_reader :child_rules
      attr_reader :target_action

      # Enumerating a rule enumerates its direct children.
      delegate [:each] => :child_rules

      # @param opts [Hash] see {#build_child_rule_chain_from_options}.
      # @yield an optional block, evaluated in the context of the new rule.
      def initialize(opts = {}, &proc)
        @child_rules = []
        @target_action = nil

        build_child_rule_chain_from_options(opts)
        instance_eval(&proc) if block_given?
      end

      # Sets the target action from `opts[:to]` and turns every remaining
      # option into a nested child rule by calling the method named after the
      # key with the value as argument, each on the previously built rule.
      # @param opts [Hash]
      # @return [Rule] the innermost rule built, or self without options.
      def build_child_rule_chain_from_options(opts)
        # Work on a copy so the caller's hash is left untouched (and may be
        # frozen).
        opts = opts.dup
        @target_action = opts.delete(:to)
        opts.reduce(self) { |rule, (key, val)| rule.send(key, val) }
      end

      # Whether this rule matches the URI and, if it has children, at least
      # one child matches too.
      def matches?(uri)
        return false unless match!(uri)
        none? || any? { |child_rule| child_rule.matches?(uri) }
      end

      # @return [[true, Hash, Object]] the merged parameters and the target
      #   action of the first matching rule chain.
      # @return [false] if no rule chain matches.
      def invoke(uri)
        rule_chain = matching_rule_chain(uri)

        if rule_chain.any?
          params = params_from_rule_chain(rule_chain, uri)
          action = action_from_rule_chain(rule_chain)

          [true, params, action]
        else
          false
        end
      end

      # Collects the first path of matching rules, from this rule down to a
      # leaf.
      # @return [Array<Rule>] empty if this rule or all of its children fail
      #   to match.
      def matching_rule_chain(uri, chain = [])
        # A rule that does not match itself must not contribute a chain, even
        # if one of its children would match on its own.
        return [] unless match!(uri)
        return chain << self if none?

        matching_child = detect { |child_rule| child_rule.matches?(uri) }
        return [] unless matching_child

        matching_child.matching_rule_chain(uri, chain << self)
      end

      # Parameters extracted from a URI; the base rule extracts none.
      def params(*)
        {}
      end

      # Appends a {URIRule} as child rule.
      def uri(*argv, &proc)
        append_child_rule(URIRule.new(*argv, &proc))
      end

      # Appends a {HostRule} as child rule.
      def host(*argv, &proc)
        append_child_rule(HostRule.new(*argv, &proc))
      end

      # Appends a {PathRule} as child rule.
      def path(*argv, &proc)
        append_child_rule(PathRule.new(*argv, &proc))
      end

      # Appends a {QueryRule} as child rule.
      def query(*argv, &proc)
        append_child_rule(QueryRule.new(*argv, &proc))
      end

      # Appends a {ProtocolRule} as child rule.
      def protocol(*argv, &proc)
        append_child_rule(ProtocolRule.new(*argv, &proc))
      end

      # Appends a {CustomRule} as child rule.
      def if(*argv, &proc)
        append_child_rule(CustomRule.new(*argv, &proc))
      end

      # Appends a {FiletypesRule} as child rule.
      def filetypes(*argv, &proc)
        append_child_rule(FiletypesRule.new(*argv, &proc))
      end

      private

      # @return [Rule] the appended rule, so that calls can be chained.
      def append_child_rule(other)
        @child_rules << other
        other
      end

      # The base rule matches any URI as long as it has child rules.
      def match!(*)
        any?
      end

      # Merges the parameters of all rules in the chain; later rules win.
      def params_from_rule_chain(rule_chain, uri)
        rule_chain.reduce({}) { |hash, rule| hash.merge(rule.params(uri)) }
      end

      # The target action of the deepest rule in the chain that has one.
      def action_from_rule_chain(rule_chain)
        rule_chain.map(&:target_action).compact.last
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Wayfarer
  module Routing
    # Matches a URI against one fixed URI.
    # @private
    class URIRule < Rule
      # @param uri_str the URI to match, converted with `Kernel#URI`.
      # @param opts [Hash] options forwarded to {Rule#initialize}.
      def initialize(uri_str, opts = {}, &block)
        @uri = URI(uri_str)
        super(opts, &block)
      end

      private

      # @return [Boolean] whether the given URI equals the expected one.
      def match!(uri)
        uri == @uri
      end
    end
  end
end
data/lib/wayfarer.rb ADDED
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Style/Documentation
4
+ require "logger"
5
+ require "uri"
6
+
7
+ # Plumbing
8
+ require_relative "wayfarer/configuration"
9
+
10
+ # Routing
11
+ require_relative "wayfarer/routing/rule"
12
+ require_relative "wayfarer/routing/uri_rule"
13
+ require_relative "wayfarer/routing/host_rule"
14
+ require_relative "wayfarer/routing/path_rule"
15
+ require_relative "wayfarer/routing/query_rule"
16
+ require_relative "wayfarer/routing/protocol_rule"
17
+ require_relative "wayfarer/routing/filetypes_rule"
18
+ require_relative "wayfarer/routing/custom_rule"
19
+ require_relative "wayfarer/routing/router"
20
+
21
+ # Networking
22
+ require_relative "wayfarer/http_adapters/net_http_adapter"
23
+ require_relative "wayfarer/http_adapters/selenium_adapter"
24
+ require_relative "wayfarer/http_adapters/adapter_pool"
25
+
26
+ # Parsers
27
+ require_relative "wayfarer/parsers/xml_parser"
28
+ require_relative "wayfarer/parsers/json_parser"
29
+
30
+ # Frontiers
31
+ require_relative "wayfarer/frontiers/frontier"
32
+ require_relative "wayfarer/frontiers/memory_frontier"
33
+ require_relative "wayfarer/frontiers/redis_frontier"
34
+ require_relative "wayfarer/frontiers/normalize_uris"
35
+
36
+ unless RUBY_PLATFORM == "java"
37
+ require_relative "wayfarer/frontiers/memory_trie_frontier"
38
+ require_relative "wayfarer/frontiers/memory_bloomfilter"
39
+ require_relative "wayfarer/frontiers/redis_bloomfilter"
40
+ end
41
+
42
+ # Processing
43
+ require_relative "wayfarer/crawl"
44
+ require_relative "wayfarer/crawl_observer"
45
+ require_relative "wayfarer/locals"
46
+ require_relative "wayfarer/job"
47
+ require_relative "wayfarer/finders"
48
+ require_relative "wayfarer/page"
49
+ require_relative "wayfarer/dispatcher"
50
+ require_relative "wayfarer/processor"
51
+
52
module Wayfarer
  VERSION = "0.0.3"

  class << self
    # The library-wide logger, created on first use. It writes to STDOUT and
    # starts at WARN level.
    # @return [Logger]
    def logger
      @logger ||= Logger.new(STDOUT).tap { |log| log.level = Logger::WARN }
    end

    # The global configuration, created on first use.
    # @yield [Configuration] the configuration, if a block is given.
    # @return [Configuration]
    def config
      configuration = (@config ||= Configuration.new)
      yield(configuration) if block_given?
      configuration
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Covers constructor overrides, dynamic accessors and resetting to defaults.
describe Wayfarer::Configuration do
  let(:config) { Configuration.new }

  describe "::new" do
    it "overrides defaults" do
      overridden = Configuration.new(http_adapter: :selenium)
      expect(overridden.http_adapter).to be(:selenium)
    end
  end

  it "sets keys and values" do
    config.foo = :foo
    expect(config.foo).to be(:foo)
  end

  describe "#reset!" do
    it "resets to defaults" do
      config.max_http_redirects = 5
      config.reset!
      expect(config.max_http_redirects).to be(3)
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Verifies that a crawl instantiates the frontier selected by the job's
# `frontier` configuration key.
describe Wayfarer::Crawl do
  let(:uris) { %w(http://example.com) }
  let(:job)  { Class.new(Wayfarer::Job) }
  subject(:crawl) { Crawl.new(job, *uris) }

  describe "#frontier" do
    context "by default" do
      it "returns a MemoryFrontier" do
        expect(crawl.frontier).to be_a MemoryFrontier
      end
    end

    context "when Redis frontier is used", redis: true do
      before { job.config.frontier = :redis }

      # The description used to name MemoryFrontier, contradicting the
      # expectation below.
      it "returns a RedisFrontier" do
        expect(crawl.frontier).to be_a RedisFrontier
      end
    end

    context "when memory bloomfilter is used", mri_only: true do
      before { job.config.frontier = :memory_bloom }

      it "returns a MemoryBloomfilter" do
        expect(crawl.frontier).to be_a MemoryBloomfilter
      end
    end

    context "when Redis bloomfilter is used", mri_only: true, redis: true do
      before { job.config.frontier = :redis_bloom }

      it "returns a RedisBloomfilter" do
        expect(crawl.frontier).to be_a RedisBloomfilter
      end
    end

    context "when memory trie is used", mri_only: true do
      before { job.config.frontier = :memory_trie }

      # The description used to name RedisBloomfilter, contradicting the
      # expectation below.
      it "returns a MemoryTrieFrontier" do
        expect(crawl.frontier).to be_a MemoryTrieFrontier
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Exercises the link and stylesheet finders against the static
# `/finders.html` fixture served by the test app.
describe Wayfarer::Finders do
  let(:page) { fetch_page(test_app("/finders.html")) }

  describe "#links" do
    context "without paths" do
      it "returns all links" do
        expected = %w[
          http://localhost:9876/foo
          http://localhost:9876/bar
          http://localhost:9876/baz
          http://google.com
          http://yahoo.com
          http://aol.com
        ]

        expect(page.links.map(&:to_s)).to eq(expected)
      end
    end

    context "with paths" do
      it "returns targeted links" do
        targeted = page.links("ul li:nth-child(3) a")

        expect(targeted.map(&:to_s)).to eq(%w[http://localhost:9876/baz])
      end
    end
  end

  describe "#stylesheets" do
    context "without paths" do
      it "returns all stylesheets" do
        expected = %w[
          http://localhost:9876/a.css
          http://localhost:9876/b.css
          http://google.com/c.css
        ]

        expect(page.stylesheets.map(&:to_s)).to eq(expected)
      end
    end

    context "with paths" do
      it "returns targeted stylesheets" do
        targeted = page.stylesheets("#stylesheet-c")

        expect(targeted.map(&:to_s)).to eq(%w[http://google.com/c.css])
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Runs the shared "Frontier" examples against MemoryBloomfilter.
describe(Wayfarer::Frontiers::MemoryBloomfilter, mri_only: true) do
  it_behaves_like("Frontier", MemoryBloomfilter)
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Runs the shared "Frontier" examples against MemoryFrontier.
describe(Wayfarer::Frontiers::MemoryFrontier) do
  it_behaves_like("Frontier", MemoryFrontier)
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Runs the shared "Frontier" examples against MemoryTrieFrontier.
describe(Wayfarer::Frontiers::MemoryTrieFrontier, mri_only: true) do
  it_behaves_like("Frontier", MemoryTrieFrontier)
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Specs for the NormalizeURIs mixin, exercised by extending a MemoryFrontier
# instance with it.
describe Wayfarer::Frontiers::NormalizeURIs, mri_only: true do
  let(:config) { Wayfarer.config }

  # Three spellings that the examples below expect to be staged as one URI.
  let(:uris) do
    %w[
      http://example.com/foo?bar=1&zet=0
      http://example.com//foo?zet=0&bar=1
      http://example.com/foo?bar=1&zet=0#bar
    ]
  end

  subject(:frontier) do
    MemoryFrontier.new(config).extend(NormalizeURIs)
  end

  describe "#stage" do
    it "normalizes URIs" do
      expect { frontier.stage(*uris) }
        .to change { frontier.staged_uris.count }.by(1)
    end
  end

  describe "#staged?" do
    it "normalizes URIs" do
      frontier.stage(*uris)

      uris.each do |uri|
        expect(frontier.staged?(uri)).to be(true)
      end
    end
  end

  describe "#cache, #cached?" do
    they "normalize URIs" do
      frontier.cache(*uris)

      uris.each do |uri|
        expect(frontier.cached?(uri)).to be(true)
      end
    end
  end

  context "with normalization options" do
    let(:config) do
      Configuration.new(normalize_uri_options: { remove_hash: false })
    end

    # With `remove_hash: false` these two are expected to stay distinct.
    let(:uris) do
      %w[
        http://example.com/foo
        http://example.com/foo#bar
      ]
    end

    it "adheres to options" do
      expect { frontier.stage(*uris) }
        .to change { frontier.staged_uris.count }.by(2)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Runs the shared "Frontier" examples against RedisBloomfilter.
describe(Wayfarer::Frontiers::RedisBloomfilter, redis: true, mri_only: true) do
  it_behaves_like("Frontier", RedisBloomfilter)
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Runs the shared "Frontier" examples against RedisFrontier.
describe(Wayfarer::Frontiers::RedisFrontier, redis: true) do
  it_behaves_like("Frontier", RedisFrontier)
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Verifies that the pool yields the adapter type selected by the job's
# `http_adapter` configuration key.
describe Wayfarer::HTTPAdapters::AdapterPool do
  subject(:adapter_pool) { AdapterPool.new(Wayfarer::Job) }

  describe "#with" do
    after { adapter_pool.free }

    context "when using Net::HTTP" do
      it "yields a NetHTTPAdapter" do
        adapter_pool.with { |adapter| expect(adapter).to be_a(NetHTTPAdapter) }
      end
    end

    context "when using Selenium", selenium: true do
      subject(:adapter_pool) do
        # An anonymous subclass is configured here rather than Wayfarer::Job
        # itself.
        selenium_job = Class.new(Wayfarer::Job)
        selenium_job.config.http_adapter = :selenium

        AdapterPool.new(selenium_job)
      end

      it "yields a SeleniumAdapter" do
        adapter_pool.with { |adapter| expect(adapter).to be_a(SeleniumAdapter) }
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
# Specs for the Net::HTTP based adapter: page construction, redirect
# handling and the errors raised for malformed URIs.
describe Wayfarer::HTTPAdapters::NetHTTPAdapter do
  subject(:adapter) { Wayfarer::HTTPAdapters::NetHTTPAdapter.instance }

  describe "#fetch" do
    it "returns a Page" do
      uri = test_app("/hello_world")
      page = adapter.fetch(uri)
      expect(page).to be_a Page
    end

    it "sets the correct URI" do
      uri = test_app("/status_code/404")
      page = adapter.fetch(uri)
      expect(page.uri).to eq test_app("/status_code/404")
    end

    it "retrieves the correct HTTP status code" do
      uri = test_app("/status_code/404")
      page = adapter.fetch(uri)
      expect(page.status_code).to be 404
    end

    it "retrieves the correct response body" do
      uri = test_app("/hello_world")
      page = adapter.fetch(uri)
      expect(page.body).to eq "Hello world!"
    end

    it "retrieves the correct response headers" do
      uri = test_app("/hello_world")
      page = adapter.fetch(uri)
      expect(page.headers["hello"]).to eq ["world"]
    end

    context "with malformed URI" do
      it "raises a MalformedURI" do
        # Only the raised error matters, so the result is not assigned.
        expect do
          adapter.fetch(URI("hptt://bro.ken"))
        end.to raise_error(
          Wayfarer::HTTPAdapters::NetHTTPAdapter::MalformedURI
        )
      end
    end

    context "when response is a redirect" do
      it "follows the redirect" do
        uri = test_app("/redirect?times=3")
        page = adapter.fetch(uri)

        expect(page.uri.to_s).to eq "http://localhost:9876/redirect?times=0"
      end

      context "when maximum number of redirects reached" do
        before { Wayfarer.config.max_http_redirects = 5 }
        after  { Wayfarer.config.reset! }

        it "raises a MaximumRedirectCountReached" do
          expect do
            adapter.fetch(test_app("/redirect?times=6"))
          end.to raise_error(
            Wayfarer::HTTPAdapters::NetHTTPAdapter::MaximumRedirectCountReached
          )
        end
      end

      context "when redirection URI is malformed" do
        it "raises a MalformedRedirectURI" do
          expect do
            adapter.fetch(test_app("/malformed_redirect"))
          end.to raise_error(
            Wayfarer::HTTPAdapters::NetHTTPAdapter::MalformedRedirectURI
          )
        end
      end
    end
  end
end