wayfarer-jruby 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Changelog.md +10 -0
- data/README.md +3 -1
- data/bin/wayfarer +1 -1
- data/docs/_includes/navigation.html +10 -0
- data/docs/_layouts/default.html +1 -1
- data/docs/guides/tutorial.md +3 -3
- data/docs/routing/custom_rules.md +16 -0
- data/docs/routing/protocol_rules.md +17 -0
- data/lib/wayfarer.rb +3 -1
- data/lib/wayfarer/job.rb +14 -3
- data/lib/wayfarer/routing/custom_rule.rb +21 -0
- data/lib/wayfarer/routing/protocol_rule.rb +21 -0
- data/lib/wayfarer/routing/rule.rb +8 -0
- data/spec/integration/peeking_spec.rb +61 -0
- data/spec/job_spec.rb +25 -1
- data/spec/routing/custom_rule_spec.rb +26 -0
- data/spec/routing/protocol_rule_spec.rb +26 -0
- data/spec/routing/rule_spec.rb +22 -0
- data/wayfarer-jruby.gemspec +2 -2
- data/wayfarer.gemspec +2 -2
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 295c1dd844209a188bfac6929bfc3085b8e757c2
|
4
|
+
data.tar.gz: 94ed8f5f78858c334c727bf5de4b2f870129e436
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31d3e3fc05ccec48e995eb05bc8207faf96efc907217ac75575d61f0a4f40eb8217b206df76d97d6897b6f2244b4cc3672e2687ead3b9bb8e74b098e8a926bcf
|
7
|
+
data.tar.gz: c1510f79a039cfbe6faf585689a5726f386249244dd8d932614bb4459f1a753aeed645b624d7e5ff4fc3d0e244f8cde8e17912c5d9feb6e2bf733d2b5c00f0de
|
data/Changelog.md
ADDED
data/README.md
CHANGED
@@ -6,7 +6,9 @@ Versatile web crawling with (J)Ruby
|
|
6
6
|
|
7
7
|
* [__Usage and more__ on the website](https://bauerd.github.io/wayfarer/)
|
8
8
|
* [__API documentation__ on Ruby-Doc.org](http://www.rubydoc.info/github/bauerd/wayfarer) (`master` branch)
|
9
|
-
*
|
9
|
+
* __Releases__ on RubyGems.org:
|
10
|
+
* [wayfarer](https://rubygems.org/gems/wayfarer)
|
11
|
+
* [wayfarer-jruby](https://rubygems.org/gems/wayfarer-jruby)
|
10
12
|
|
11
13
|
MRI:
|
12
14
|
```
|
data/bin/wayfarer
CHANGED
@@ -105,6 +105,16 @@
|
|
105
105
|
Query rules
|
106
106
|
</a>
|
107
107
|
</li>
|
108
|
+
<li class="navigation__page">
|
109
|
+
<a class="navigation__link" href="{{base}}/routing/protocol_rules.html">
|
110
|
+
Protocol rules
|
111
|
+
</a>
|
112
|
+
</li>
|
113
|
+
<li class="navigation__page">
|
114
|
+
<a class="navigation__link" href="{{base}}/routing/custom_rules.html">
|
115
|
+
Custom rules
|
116
|
+
</a>
|
117
|
+
</li>
|
108
118
|
</ul>
|
109
119
|
</li>
|
110
120
|
|
data/docs/_layouts/default.html
CHANGED
data/docs/guides/tutorial.md
CHANGED
@@ -157,15 +157,15 @@ Note that we still have a hard-coded URI in `#repository`. Usually, there are tw
|
|
157
157
|
1. Constructing the successor URI from the current URI.
|
158
158
|
2. Reading the URI from the HTTP response, e.g. extracting an `<a>` tag's `href` property.
|
159
159
|
|
160
|
-
For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`.
|
160
|
+
For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. `#stage` takes relative paths and URIs too, and constructs absolute URIs by appending to the current page's URI:
|
161
161
|
|
162
162
|
{% highlight ruby %}
|
163
163
|
class CollectGithubIssues < Wayfarer::Job
|
164
164
|
# ...
|
165
165
|
|
166
166
|
def index
|
167
|
-
# page
|
168
|
-
stage
|
167
|
+
# Stages "#{page.uri}/issues"
|
168
|
+
stage "issues"
|
169
169
|
end
|
170
170
|
|
171
171
|
# ...
|
@@ -0,0 +1,16 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Custom rules
|
4
|
+
---
|
5
|
+
|
6
|
+
# Custom rules
|
7
|
+
|
8
|
+
Custom rules take either a block, which is yielded the URI, or an object that responds to `#call(uri)`. If the block or the delegate returns a truthy value, the rule matches.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
route.if -> (uri) { uri.host == uri.host.reverse }
|
13
|
+
end
|
14
|
+
{% endhighlight %}
|
15
|
+
|
16
|
+
* Matches only URIs with palindrome hosts
|
@@ -0,0 +1,17 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Protocol rules
|
4
|
+
---
|
5
|
+
|
6
|
+
# Protocol rules
|
7
|
+
|
8
|
+
Protocol rules match a URI's scheme against a given symbol or string.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
route.protocol :https
|
13
|
+
end
|
14
|
+
{% endhighlight %}
|
15
|
+
|
16
|
+
* Matches `https://example.com`.
|
17
|
+
* Does not match `http://example.com`.
|
data/lib/wayfarer.rb
CHANGED
@@ -13,6 +13,8 @@ require_relative "wayfarer/routing/uri_rule"
|
|
13
13
|
require_relative "wayfarer/routing/host_rule"
|
14
14
|
require_relative "wayfarer/routing/path_rule"
|
15
15
|
require_relative "wayfarer/routing/query_rule"
|
16
|
+
require_relative "wayfarer/routing/protocol_rule"
|
17
|
+
require_relative "wayfarer/routing/custom_rule"
|
16
18
|
require_relative "wayfarer/routing/router"
|
17
19
|
|
18
20
|
# Networking
|
@@ -47,7 +49,7 @@ require_relative "wayfarer/dispatcher"
|
|
47
49
|
require_relative "wayfarer/processor"
|
48
50
|
|
49
51
|
module Wayfarer
|
50
|
-
VERSION = "0.0.
|
52
|
+
VERSION = "0.0.2"
|
51
53
|
|
52
54
|
def self.logger
|
53
55
|
return @logger if @logger
|
data/lib/wayfarer/job.rb
CHANGED
@@ -144,11 +144,22 @@ module Wayfarer
|
|
144
144
|
end
|
145
145
|
|
146
146
|
# Adds URIs to process in the next cycle.
|
147
|
-
# If a relative
|
148
|
-
#
|
147
|
+
# If a relative path is given, an absolute URI is constructed from the
|
148
|
+
# current {#page}'s URI.
|
149
149
|
# @param [String, URI, Array<String>, Array<URI>]
|
150
150
|
def stage(*uris)
|
151
|
-
|
151
|
+
expanded = uris.flatten.map do |u|
|
152
|
+
if (uri = URI(u)).absolute?
|
153
|
+
uri
|
154
|
+
else
|
155
|
+
# URI#join would discard the path of page.uri
|
156
|
+
current = page.uri.dup
|
157
|
+
current.path = File.join(page.uri.path, uri.path)
|
158
|
+
current
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
@staged_uris.push(*expanded)
|
152
163
|
end
|
153
164
|
|
154
165
|
# The {Page} representing the URI currently processed by an action.
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module Wayfarer
|
6
|
+
module Routing
|
7
|
+
# @private
|
8
|
+
class CustomRule < Rule
|
9
|
+
def initialize(delegate_or_block = proc, opts = {}, &proc)
|
10
|
+
@delegate_or_block = delegate_or_block
|
11
|
+
super(opts, &proc)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def match!(uri)
|
17
|
+
!!@delegate_or_block.call(uri)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module Wayfarer
|
6
|
+
module Routing
|
7
|
+
# @private
|
8
|
+
class ProtocolRule < Rule
|
9
|
+
def initialize(protocol, opts = {}, &proc)
|
10
|
+
@protocol = protocol.to_s
|
11
|
+
super(opts, &proc)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def match!(uri)
|
17
|
+
uri.scheme == @protocol
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -79,6 +79,14 @@ module Wayfarer
|
|
79
79
|
append_child_rule(QueryRule.new(*argv, &proc))
|
80
80
|
end
|
81
81
|
|
82
|
+
def protocol(*argv, &proc)
|
83
|
+
append_child_rule(ProtocolRule.new(*argv, &proc))
|
84
|
+
end
|
85
|
+
|
86
|
+
def if(*argv, &proc)
|
87
|
+
append_child_rule(CustomRule.new(*argv, &proc))
|
88
|
+
end
|
89
|
+
|
82
90
|
private
|
83
91
|
|
84
92
|
def append_child_rule(other)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer, integration: true do
|
5
|
+
subject(:job) do
|
6
|
+
Class.new(Wayfarer::Job) do
|
7
|
+
config.reraise_exceptions = true
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
let(:entry) { test_app("/graph/index.html") }
|
12
|
+
|
13
|
+
describe "Peeking" do
|
14
|
+
it "works" do
|
15
|
+
job.class_eval do
|
16
|
+
route.path "/graph/index.html", to: :index
|
17
|
+
route.path "/graph/details/a.html", to: :detail
|
18
|
+
|
19
|
+
def index
|
20
|
+
peek = yield "http://localhost:9876/graph/details/a.html"
|
21
|
+
fail unless peek == :ok
|
22
|
+
end
|
23
|
+
|
24
|
+
def detail
|
25
|
+
:ok
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
expect {
|
30
|
+
job.perform_now(entry)
|
31
|
+
}.not_to raise_error
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "Recursive peeking" do
|
36
|
+
it "does not work" do
|
37
|
+
job.class_eval do
|
38
|
+
route.path "/graph/index.html", to: :index
|
39
|
+
route.path "/graph/details/a.html", to: :a
|
40
|
+
route.path "/graph/details/a.html", to: :b
|
41
|
+
|
42
|
+
def index
|
43
|
+
peek = yield "http://localhost:9876/graph/details/a.html"
|
44
|
+
fail unless peek == :ok
|
45
|
+
end
|
46
|
+
|
47
|
+
def a
|
48
|
+
yield "http://localhost:9876/graph/details/b.html" or :ok
|
49
|
+
end
|
50
|
+
|
51
|
+
def b
|
52
|
+
yield "http://localhost:9876/graph/details/a.html"
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
expect {
|
57
|
+
job.perform_now(entry)
|
58
|
+
}.not_to raise_error
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
data/spec/job_spec.rb
CHANGED
@@ -72,15 +72,39 @@ describe Wayfarer::Job do
|
|
72
72
|
describe "#stage" do
|
73
73
|
it "stages URIs" do
|
74
74
|
job_instance = job.new
|
75
|
+
job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
|
75
76
|
|
76
77
|
uris = %w(
|
77
78
|
http://google.com
|
78
79
|
http://example.com
|
79
80
|
)
|
80
81
|
|
82
|
+
expected = uris.map { |u| URI(u) }
|
83
|
+
|
84
|
+
expect {
|
85
|
+
job_instance.send(:stage, *uris)
|
86
|
+
}.to change { job_instance.staged_uris }.from([]).to(expected)
|
87
|
+
end
|
88
|
+
|
89
|
+
it "expands relative URIs" do
|
90
|
+
job_instance = job.new
|
91
|
+
job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
|
92
|
+
|
93
|
+
uris = %w(
|
94
|
+
/foo/bar
|
95
|
+
bar/qux.html
|
96
|
+
barfoo
|
97
|
+
)
|
98
|
+
|
99
|
+
expected = %w(
|
100
|
+
https://yahoo.com/qux/foo/bar
|
101
|
+
https://yahoo.com/qux/bar/qux.html
|
102
|
+
https://yahoo.com/qux/barfoo
|
103
|
+
).map { |u| URI(u) }
|
104
|
+
|
81
105
|
expect {
|
82
106
|
job_instance.send(:stage, *uris)
|
83
|
-
}.to change { job_instance.staged_uris }.from([]).to(
|
107
|
+
}.to change { job_instance.staged_uris }.from([]).to(expected)
|
84
108
|
end
|
85
109
|
end
|
86
110
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer::Routing::CustomRule do
|
5
|
+
let(:uri) { URI("http://example.com") }
|
6
|
+
|
7
|
+
describe "#matches?" do
|
8
|
+
context "with a block" do
|
9
|
+
context "when block is truthy" do
|
10
|
+
subject(:rule) { CustomRule.new -> (uri) { uri.is_a?(URI) } }
|
11
|
+
|
12
|
+
it "returns true" do
|
13
|
+
expect(rule.matches?(uri)).to be true
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
context "when block is fals-y" do
|
18
|
+
subject(:rule) { CustomRule.new -> (_) { false } }
|
19
|
+
|
20
|
+
it "returns true" do
|
21
|
+
expect(rule.matches?(uri)).to be false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer::Routing::ProtocolRule do
|
5
|
+
describe "#matches?" do
|
6
|
+
context "with a string" do
|
7
|
+
subject(:rule) { ProtocolRule.new(:http) }
|
8
|
+
|
9
|
+
context "with matching URI" do
|
10
|
+
let(:uri) { URI("http://example.com") }
|
11
|
+
|
12
|
+
it "returns true" do
|
13
|
+
expect(rule.matches?(uri)).to be true
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
context "with mismatching URI" do
|
18
|
+
let(:uri) { URI("https://example.com") }
|
19
|
+
|
20
|
+
it "returns true" do
|
21
|
+
expect(rule.matches?(uri)).to be false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
data/spec/routing/rule_spec.rb
CHANGED
@@ -168,6 +168,28 @@ describe Wayfarer::Routing::Rule do
|
|
168
168
|
end
|
169
169
|
end
|
170
170
|
|
171
|
+
describe "#protocol" do
|
172
|
+
it "adds a ProtocolRule as a sub-rule" do
|
173
|
+
rule.protocol(:https)
|
174
|
+
expect(rule.child_rules.first).to be_a ProtocolRule
|
175
|
+
end
|
176
|
+
|
177
|
+
it "returns the added QueryRule" do
|
178
|
+
expect(rule.protocol(:https)).to be_a ProtocolRule
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
describe "#if" do
|
183
|
+
it "adds a CustomRule as a sub-rule" do
|
184
|
+
rule.if -> () { true }
|
185
|
+
expect(rule.child_rules.first).to be_a CustomRule
|
186
|
+
end
|
187
|
+
|
188
|
+
it "returns the added QueryRule" do
|
189
|
+
expect(rule.if -> () { true }).to be_a CustomRule
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
171
193
|
describe "#build_child_rule_chain_from_options" do
|
172
194
|
subject(:rule) do
|
173
195
|
rule = Rule.new
|
data/wayfarer-jruby.gemspec
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "wayfarer-jruby"
|
5
|
-
s.version = "0.0.
|
5
|
+
s.version = "0.0.2"
|
6
6
|
s.license = "MIT"
|
7
7
|
|
8
8
|
s.homepage = "http://github.com/bauerd/wayfarer"
|
9
9
|
s.description = "Versatile web crawling with JRuby"
|
10
10
|
s.summary = s.description
|
11
11
|
|
12
|
-
s.date = "
|
12
|
+
s.date = "2017-05-31"
|
13
13
|
s.authors = ["Dominic Bauer"]
|
14
14
|
s.email = "bauerdominic@gmail.com"
|
15
15
|
|
data/wayfarer.gemspec
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
Gem::Specification.new do |s|
|
3
3
|
s.name = "wayfarer"
|
4
|
-
s.version = "0.0.
|
4
|
+
s.version = "0.0.2"
|
5
5
|
s.license = "MIT"
|
6
6
|
|
7
7
|
s.homepage = "http://github.com/bauerd/wayfarer"
|
8
8
|
s.description = "Versatile web crawling with Ruby"
|
9
9
|
s.summary = s.description
|
10
10
|
|
11
|
-
s.date = "
|
11
|
+
s.date = "2017-05-31"
|
12
12
|
s.authors = ["Dominic Bauer"]
|
13
13
|
s.email = "bauerdominic@gmail.com"
|
14
14
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayfarer-jruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dominic Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: connection_pool
|
@@ -374,6 +374,7 @@ files:
|
|
374
374
|
- ".ruby-version"
|
375
375
|
- ".travis.yml"
|
376
376
|
- ".yardopts"
|
377
|
+
- Changelog.md
|
377
378
|
- Gemfile
|
378
379
|
- LICENSE
|
379
380
|
- README.md
|
@@ -516,8 +517,10 @@ files:
|
|
516
517
|
- docs/recipes/javascript.md
|
517
518
|
- docs/recipes/multiple_uris.md
|
518
519
|
- docs/recipes/screenshots.md
|
520
|
+
- docs/routing/custom_rules.md
|
519
521
|
- docs/routing/host_rules.md
|
520
522
|
- docs/routing/path_rules.md
|
523
|
+
- docs/routing/protocol_rules.md
|
521
524
|
- docs/routing/query_rules.md
|
522
525
|
- docs/routing/routes.md
|
523
526
|
- docs/routing/uri_rules.md
|
@@ -545,8 +548,10 @@ files:
|
|
545
548
|
- lib/wayfarer/parsers/json_parser.rb
|
546
549
|
- lib/wayfarer/parsers/xml_parser.rb
|
547
550
|
- lib/wayfarer/processor.rb
|
551
|
+
- lib/wayfarer/routing/custom_rule.rb
|
548
552
|
- lib/wayfarer/routing/host_rule.rb
|
549
553
|
- lib/wayfarer/routing/path_rule.rb
|
554
|
+
- lib/wayfarer/routing/protocol_rule.rb
|
550
555
|
- lib/wayfarer/routing/query_rule.rb
|
551
556
|
- lib/wayfarer/routing/router.rb
|
552
557
|
- lib/wayfarer/routing/rule.rb
|
@@ -565,13 +570,16 @@ files:
|
|
565
570
|
- spec/http_adapters/selenium_adapter_spec.rb
|
566
571
|
- spec/integration/callbacks_spec.rb
|
567
572
|
- spec/integration/locals_spec.rb
|
573
|
+
- spec/integration/peeking_spec.rb
|
568
574
|
- spec/job_spec.rb
|
569
575
|
- spec/page_spec.rb
|
570
576
|
- spec/parsers/json_parser_spec.rb
|
571
577
|
- spec/parsers/xml_parser_spec.rb
|
572
578
|
- spec/processor_spec.rb
|
579
|
+
- spec/routing/custom_rule_spec.rb
|
573
580
|
- spec/routing/host_rule_spec.rb
|
574
581
|
- spec/routing/path_rule_spec.rb
|
582
|
+
- spec/routing/protocol_rule_spec.rb
|
575
583
|
- spec/routing/query_rule_spec.rb
|
576
584
|
- spec/routing/router_spec.rb
|
577
585
|
- spec/routing/rule_spec.rb
|