wayfarer-jruby 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 72b70dc0921adae626373012636e5646abae4586
4
- data.tar.gz: 480b0c60cd58062d700de8a8f480386ba57f6dc7
3
+ metadata.gz: 295c1dd844209a188bfac6929bfc3085b8e757c2
4
+ data.tar.gz: 94ed8f5f78858c334c727bf5de4b2f870129e436
5
5
  SHA512:
6
- metadata.gz: 523c57e24ff35bf95f3ef49b4c2d0e169eef5d237ee3614593217e5f9d51721e67fb326a87fb86fd9b075b3aa938e78ff96a04d6e5589f92a8b6de969addab43
7
- data.tar.gz: 6b686f439ea1ca38304150ee0f1a3d927bb7a36d6a87de580e273f608127ebde01b152cca3fc69cb8b78ef2082e7d3efb9edddd5b1f889ee07547ed4649a1e61
6
+ metadata.gz: 31d3e3fc05ccec48e995eb05bc8207faf96efc907217ac75575d61f0a4f40eb8217b206df76d97d6897b6f2244b4cc3672e2687ead3b9bb8e74b098e8a926bcf
7
+ data.tar.gz: c1510f79a039cfbe6faf585689a5726f386249244dd8d932614bb4459f1a753aeed645b624d7e5ff4fc3d0e244f8cde8e17912c5d9feb6e2bf733d2b5c00f0de
data/Changelog.md ADDED
@@ -0,0 +1,10 @@
1
+ ## 0.0.2
2
+
3
+ Features:
4
+
5
+ * `Job#stage` now expands relative paths/URIs
6
+ * Added `ProtocolRule` and `CustomRule`
7
+
8
+ Bugfixes:
9
+
10
+ * Fixed the CLI which relied on a removed method alias
data/README.md CHANGED
@@ -6,7 +6,9 @@ Versatile web crawling with (J)Ruby
6
6
 
7
7
  * [__Usage and more__ on the website](https://bauerd.github.io/wayfarer/)
8
8
  * [__API documentation__ on Ruby-Doc.org](http://www.rubydoc.info/github/bauerd/wayfarer) (`master` branch)
9
- * [__Releases__ on RubyGems.org](https://rubygems.org/gems/wayfarer)
9
+ * __Releases__ on RubyGems.org:
10
+ * [wayfarer](https://rubygems.org/gems/wayfarer)
11
+ * [wayfarer-jruby](https://rubygems.org/gems/wayfarer-jruby)
10
12
 
11
13
  MRI:
12
14
  ```
data/bin/wayfarer CHANGED
@@ -61,7 +61,7 @@ module Wayfarer
61
61
 
62
62
  def set_log_level(options)
63
63
  if level = options[:log_level]
64
- Wayfarer.log.level = log_level_from_string(level)
64
+ Wayfarer.logger.level = log_level_from_string(level)
65
65
  end
66
66
  end
67
67
 
@@ -105,6 +105,16 @@
105
105
  Query rules
106
106
  </a>
107
107
  </li>
108
+ <li class="navigation__page">
109
+ <a class="navigation__link" href="{{base}}/routing/protocol_rules.html">
110
+ Protocol rules
111
+ </a>
112
+ </li>
113
+ <li class="navigation__page">
114
+ <a class="navigation__link" href="{{base}}/routing/custom_rules.html">
115
+ Custom rules
116
+ </a>
117
+ </li>
108
118
  </ul>
109
119
  </li>
110
120
 
@@ -11,7 +11,7 @@
11
11
  <a href="{{base}}" class="site-header__link">{{ site.title }}</a>
12
12
  </h1>
13
13
  <div class="site-header__version">
14
- 0.0.1
14
+ 0.0.2
15
15
  </div>
16
16
  </header>
17
17
 
@@ -157,15 +157,15 @@ Note that we still have a hard-coded URI in `#repository`. Usually, there are tw
157
157
  1. Constructing the successor URI from the current URI.
158
158
  2. Reading the URI from the HTTP response, e.g. extracting an `<a>` tag's `href` property.
159
159
 
160
- For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. All that separates both URIs is the last path segment, and we can simply append it at runtime:
160
+ For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. `#stage` takes relative paths and URIs too, and constructs absolute URIs by appending to the current page's URI:
161
161
 
162
162
  {% highlight ruby %}
163
163
  class CollectGithubIssues < Wayfarer::Job
164
164
  # ...
165
165
 
166
166
  def index
167
- # page#uri returns a URI object
168
- stage page.uri.to_s << "/issues"
167
+ # Stages "#{page.uri}/issues"
168
+ stage "issues"
169
169
  end
170
170
 
171
171
  # ...
@@ -0,0 +1,16 @@
1
+ ---
2
+ layout: default
3
+ title: Custom rules
4
+ ---
5
+
6
+ # Custom rules
7
+
8
+ Custom rules take either a block, which is yielded the URI, or an object that responds to `#call(uri)`. If the block or the delegate returns a truthy value, the rule matches.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.if -> (uri) { uri.host == uri.host.reverse }
13
+ end
14
+ {% endhighlight %}
15
+
16
+ * Matches only URIs with palindrome hosts
@@ -0,0 +1,17 @@
1
+ ---
2
+ layout: default
3
+ title: Protocol rules
4
+ ---
5
+
6
+ # Protocol rules
7
+
8
+ Protocol rules match a URI's protocol (scheme) against a symbol or string.
9
+
10
+ {% highlight ruby %}
11
+ class DummyJob < Wayfarer::Job
12
+ route.protocol :https
13
+ end
14
+ {% endhighlight %}
15
+
16
+ * Matches `https://example.com`.
17
+ * Does not match `http://example.com`.
data/lib/wayfarer.rb CHANGED
@@ -13,6 +13,8 @@ require_relative "wayfarer/routing/uri_rule"
13
13
  require_relative "wayfarer/routing/host_rule"
14
14
  require_relative "wayfarer/routing/path_rule"
15
15
  require_relative "wayfarer/routing/query_rule"
16
+ require_relative "wayfarer/routing/protocol_rule"
17
+ require_relative "wayfarer/routing/custom_rule"
16
18
  require_relative "wayfarer/routing/router"
17
19
 
18
20
  # Networking
@@ -47,7 +49,7 @@ require_relative "wayfarer/dispatcher"
47
49
  require_relative "wayfarer/processor"
48
50
 
49
51
  module Wayfarer
50
- VERSION = "0.0.1"
52
+ VERSION = "0.0.2"
51
53
 
52
54
  def self.logger
53
55
  return @logger if @logger
data/lib/wayfarer/job.rb CHANGED
@@ -144,11 +144,22 @@ module Wayfarer
144
144
  end
145
145
 
146
146
  # Adds URIs to process in the next cycle.
147
- # If a relative URI is given, the page's protocol and hostname get
148
- # prepended.
147
+ # If a relative path is given, an absolute URI is constructed from the
148
+ # current {#page}'s URI.
149
149
  # @param [String, URI, Array<String>, Array<URI>]
150
150
  def stage(*uris)
151
- @staged_uris.push(*uris.flatten)
151
+ expanded = uris.flatten.map do |u|
152
+ if (uri = URI(u)).absolute?
153
+ uri
154
+ else
155
+ # URI#join would discard page.uri's path instead of appending to it
156
+ current = page.uri.dup
157
+ current.path = File.join(page.uri.path, uri.path)
158
+ current
159
+ end
160
+ end
161
+
162
+ @staged_uris.push(*expanded)
152
163
  end
153
164
 
154
165
  # The {Page} representing the URI currently processed by an action.
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module Wayfarer
6
+ module Routing
7
+ # @private
8
+ class CustomRule < Rule
9
+ def initialize(delegate_or_block = proc, opts = {}, &proc)
10
+ @delegate_or_block = delegate_or_block
11
+ super(opts, &proc)
12
+ end
13
+
14
+ private
15
+
16
+ def match!(uri)
17
+ !!@delegate_or_block.call(uri)
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module Wayfarer
6
+ module Routing
7
+ # @private
8
+ class ProtocolRule < Rule
9
+ def initialize(protocol, opts = {}, &proc)
10
+ @protocol = protocol.to_s
11
+ super(opts, &proc)
12
+ end
13
+
14
+ private
15
+
16
+ def match!(uri)
17
+ uri.scheme == @protocol
18
+ end
19
+ end
20
+ end
21
+ end
@@ -79,6 +79,14 @@ module Wayfarer
79
79
  append_child_rule(QueryRule.new(*argv, &proc))
80
80
  end
81
81
 
82
+ def protocol(*argv, &proc)
83
+ append_child_rule(ProtocolRule.new(*argv, &proc))
84
+ end
85
+
86
+ def if(*argv, &proc)
87
+ append_child_rule(CustomRule.new(*argv, &proc))
88
+ end
89
+
82
90
  private
83
91
 
84
92
  def append_child_rule(other)
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
+ describe Wayfarer, integration: true do
5
+ subject(:job) do
6
+ Class.new(Wayfarer::Job) do
7
+ config.reraise_exceptions = true
8
+ end
9
+ end
10
+
11
+ let(:entry) { test_app("/graph/index.html") }
12
+
13
+ describe "Peeking" do
14
+ it "works" do
15
+ job.class_eval do
16
+ route.path "/graph/index.html", to: :index
17
+ route.path "/graph/details/a.html", to: :detail
18
+
19
+ def index
20
+ peek = yield "http://localhost:9876/graph/details/a.html"
21
+ fail unless peek == :ok
22
+ end
23
+
24
+ def detail
25
+ :ok
26
+ end
27
+ end
28
+
29
+ expect {
30
+ job.perform_now(entry)
31
+ }.not_to raise_error
32
+ end
33
+ end
34
+
35
+ describe "Recursive peeking" do
36
+ it "does not work" do
37
+ job.class_eval do
38
+ route.path "/graph/index.html", to: :index
39
+ route.path "/graph/details/a.html", to: :a
40
+ route.path "/graph/details/a.html", to: :b
41
+
42
+ def index
43
+ peek = yield "http://localhost:9876/graph/details/a.html"
44
+ fail unless peek == :ok
45
+ end
46
+
47
+ def a
48
+ yield "http://localhost:9876/graph/details/b.html" or :ok
49
+ end
50
+
51
+ def b
52
+ yield "http://localhost:9876/graph/details/a.html"
53
+ end
54
+ end
55
+
56
+ expect {
57
+ job.perform_now(entry)
58
+ }.not_to raise_error
59
+ end
60
+ end
61
+ end
data/spec/job_spec.rb CHANGED
@@ -72,15 +72,39 @@ describe Wayfarer::Job do
72
72
  describe "#stage" do
73
73
  it "stages URIs" do
74
74
  job_instance = job.new
75
+ job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
75
76
 
76
77
  uris = %w(
77
78
  http://google.com
78
79
  http://example.com
79
80
  )
80
81
 
82
+ expected = uris.map { |u| URI(u) }
83
+
84
+ expect {
85
+ job_instance.send(:stage, *uris)
86
+ }.to change { job_instance.staged_uris }.from([]).to(expected)
87
+ end
88
+
89
+ it "expands relative URIs" do
90
+ job_instance = job.new
91
+ job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
92
+
93
+ uris = %w(
94
+ /foo/bar
95
+ bar/qux.html
96
+ barfoo
97
+ )
98
+
99
+ expected = %w(
100
+ https://yahoo.com/qux/foo/bar
101
+ https://yahoo.com/qux/bar/qux.html
102
+ https://yahoo.com/qux/barfoo
103
+ ).map { |u| URI(u) }
104
+
81
105
  expect {
82
106
  job_instance.send(:stage, *uris)
83
- }.to change { job_instance.staged_uris }.from([]).to(uris)
107
+ }.to change { job_instance.staged_uris }.from([]).to(expected)
84
108
  end
85
109
  end
86
110
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
+ describe Wayfarer::Routing::CustomRule do
5
+ let(:uri) { URI("http://example.com") }
6
+
7
+ describe "#matches?" do
8
+ context "with a block" do
9
+ context "when block is truthy" do
10
+ subject(:rule) { CustomRule.new -> (uri) { uri.is_a?(URI) } }
11
+
12
+ it "returns true" do
13
+ expect(rule.matches?(uri)).to be true
14
+ end
15
+ end
16
+
17
+ context "when block is falsy" do
18
+ subject(:rule) { CustomRule.new -> (_) { false } }
19
+
20
+ it "returns false" do
21
+ expect(rule.matches?(uri)).to be false
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+ require "spec_helpers"
3
+
4
+ describe Wayfarer::Routing::ProtocolRule do
5
+ describe "#matches?" do
6
+ context "with a symbol" do
7
+ subject(:rule) { ProtocolRule.new(:http) }
8
+
9
+ context "with matching URI" do
10
+ let(:uri) { URI("http://example.com") }
11
+
12
+ it "returns true" do
13
+ expect(rule.matches?(uri)).to be true
14
+ end
15
+ end
16
+
17
+ context "with mismatching URI" do
18
+ let(:uri) { URI("https://example.com") }
19
+
20
+ it "returns false" do
21
+ expect(rule.matches?(uri)).to be false
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -168,6 +168,28 @@ describe Wayfarer::Routing::Rule do
168
168
  end
169
169
  end
170
170
 
171
+ describe "#protocol" do
172
+ it "adds a ProtocolRule as a sub-rule" do
173
+ rule.protocol(:https)
174
+ expect(rule.child_rules.first).to be_a ProtocolRule
175
+ end
176
+
177
+ it "returns the added ProtocolRule" do
178
+ expect(rule.protocol(:https)).to be_a ProtocolRule
179
+ end
180
+ end
181
+
182
+ describe "#if" do
183
+ it "adds a CustomRule as a sub-rule" do
184
+ rule.if -> () { true }
185
+ expect(rule.child_rules.first).to be_a CustomRule
186
+ end
187
+
188
+ it "returns the added CustomRule" do
189
+ expect(rule.if -> () { true }).to be_a CustomRule
190
+ end
191
+ end
192
+
171
193
  describe "#build_child_rule_chain_from_options" do
172
194
  subject(:rule) do
173
195
  rule = Rule.new
@@ -2,14 +2,14 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "wayfarer-jruby"
5
- s.version = "0.0.1"
5
+ s.version = "0.0.2"
6
6
  s.license = "MIT"
7
7
 
8
8
  s.homepage = "http://github.com/bauerd/wayfarer"
9
9
  s.description = "Versatile web crawling with JRuby"
10
10
  s.summary = s.description
11
11
 
12
- s.date = "2014-11-12"
12
+ s.date = "2017-05-31"
13
13
  s.authors = ["Dominic Bauer"]
14
14
  s.email = "bauerdominic@gmail.com"
15
15
 
data/wayfarer.gemspec CHANGED
@@ -1,14 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
  Gem::Specification.new do |s|
3
3
  s.name = "wayfarer"
4
- s.version = "0.0.1"
4
+ s.version = "0.0.2"
5
5
  s.license = "MIT"
6
6
 
7
7
  s.homepage = "http://github.com/bauerd/wayfarer"
8
8
  s.description = "Versatile web crawling with Ruby"
9
9
  s.summary = s.description
10
10
 
11
- s.date = "2014-11-12"
11
+ s.date = "2017-05-31"
12
12
  s.authors = ["Dominic Bauer"]
13
13
  s.email = "bauerdominic@gmail.com"
14
14
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayfarer-jruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dominic Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-12 00:00:00.000000000 Z
11
+ date: 2017-05-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: connection_pool
@@ -374,6 +374,7 @@ files:
374
374
  - ".ruby-version"
375
375
  - ".travis.yml"
376
376
  - ".yardopts"
377
+ - Changelog.md
377
378
  - Gemfile
378
379
  - LICENSE
379
380
  - README.md
@@ -516,8 +517,10 @@ files:
516
517
  - docs/recipes/javascript.md
517
518
  - docs/recipes/multiple_uris.md
518
519
  - docs/recipes/screenshots.md
520
+ - docs/routing/custom_rules.md
519
521
  - docs/routing/host_rules.md
520
522
  - docs/routing/path_rules.md
523
+ - docs/routing/protocol_rules.md
521
524
  - docs/routing/query_rules.md
522
525
  - docs/routing/routes.md
523
526
  - docs/routing/uri_rules.md
@@ -545,8 +548,10 @@ files:
545
548
  - lib/wayfarer/parsers/json_parser.rb
546
549
  - lib/wayfarer/parsers/xml_parser.rb
547
550
  - lib/wayfarer/processor.rb
551
+ - lib/wayfarer/routing/custom_rule.rb
548
552
  - lib/wayfarer/routing/host_rule.rb
549
553
  - lib/wayfarer/routing/path_rule.rb
554
+ - lib/wayfarer/routing/protocol_rule.rb
550
555
  - lib/wayfarer/routing/query_rule.rb
551
556
  - lib/wayfarer/routing/router.rb
552
557
  - lib/wayfarer/routing/rule.rb
@@ -565,13 +570,16 @@ files:
565
570
  - spec/http_adapters/selenium_adapter_spec.rb
566
571
  - spec/integration/callbacks_spec.rb
567
572
  - spec/integration/locals_spec.rb
573
+ - spec/integration/peeking_spec.rb
568
574
  - spec/job_spec.rb
569
575
  - spec/page_spec.rb
570
576
  - spec/parsers/json_parser_spec.rb
571
577
  - spec/parsers/xml_parser_spec.rb
572
578
  - spec/processor_spec.rb
579
+ - spec/routing/custom_rule_spec.rb
573
580
  - spec/routing/host_rule_spec.rb
574
581
  - spec/routing/path_rule_spec.rb
582
+ - spec/routing/protocol_rule_spec.rb
575
583
  - spec/routing/query_rule_spec.rb
576
584
  - spec/routing/router_spec.rb
577
585
  - spec/routing/rule_spec.rb