wayfarer-jruby 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Changelog.md +10 -0
- data/README.md +3 -1
- data/bin/wayfarer +1 -1
- data/docs/_includes/navigation.html +10 -0
- data/docs/_layouts/default.html +1 -1
- data/docs/guides/tutorial.md +3 -3
- data/docs/routing/custom_rules.md +16 -0
- data/docs/routing/protocol_rules.md +17 -0
- data/lib/wayfarer.rb +3 -1
- data/lib/wayfarer/job.rb +14 -3
- data/lib/wayfarer/routing/custom_rule.rb +21 -0
- data/lib/wayfarer/routing/protocol_rule.rb +21 -0
- data/lib/wayfarer/routing/rule.rb +8 -0
- data/spec/integration/peeking_spec.rb +61 -0
- data/spec/job_spec.rb +25 -1
- data/spec/routing/custom_rule_spec.rb +26 -0
- data/spec/routing/protocol_rule_spec.rb +26 -0
- data/spec/routing/rule_spec.rb +22 -0
- data/wayfarer-jruby.gemspec +2 -2
- data/wayfarer.gemspec +2 -2
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 295c1dd844209a188bfac6929bfc3085b8e757c2
|
4
|
+
data.tar.gz: 94ed8f5f78858c334c727bf5de4b2f870129e436
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31d3e3fc05ccec48e995eb05bc8207faf96efc907217ac75575d61f0a4f40eb8217b206df76d97d6897b6f2244b4cc3672e2687ead3b9bb8e74b098e8a926bcf
|
7
|
+
data.tar.gz: c1510f79a039cfbe6faf585689a5726f386249244dd8d932614bb4459f1a753aeed645b624d7e5ff4fc3d0e244f8cde8e17912c5d9feb6e2bf733d2b5c00f0de
|
data/Changelog.md
ADDED
data/README.md
CHANGED
@@ -6,7 +6,9 @@ Versatile web crawling with (J)Ruby
|
|
6
6
|
|
7
7
|
* [__Usage and more__ on the website](https://bauerd.github.io/wayfarer/)
|
8
8
|
* [__API documentation__ on Ruby-Doc.org](http://www.rubydoc.info/github/bauerd/wayfarer) (`master` branch)
|
9
|
-
*
|
9
|
+
* __Releases__ on RubyGems.org:
|
10
|
+
* [wayfarer](https://rubygems.org/gems/wayfarer)
|
11
|
+
* [wayfarer-jruby](https://rubygems.org/gems/wayfarer-jruby)
|
10
12
|
|
11
13
|
MRI:
|
12
14
|
```
|
data/bin/wayfarer
CHANGED
@@ -105,6 +105,16 @@
|
|
105
105
|
Query rules
|
106
106
|
</a>
|
107
107
|
</li>
|
108
|
+
<li class="navigation__page">
|
109
|
+
<a class="navigation__link" href="{{base}}/routing/protocol_rules.html">
|
110
|
+
Protocol rules
|
111
|
+
</a>
|
112
|
+
</li>
|
113
|
+
<li class="navigation__page">
|
114
|
+
<a class="navigation__link" href="{{base}}/routing/custom_rules.html">
|
115
|
+
Custom rules
|
116
|
+
</a>
|
117
|
+
</li>
|
108
118
|
</ul>
|
109
119
|
</li>
|
110
120
|
|
data/docs/_layouts/default.html
CHANGED
data/docs/guides/tutorial.md
CHANGED
@@ -157,15 +157,15 @@ Note that we still have a hard-coded URI in `#repository`. Usually, there are tw
|
|
157
157
|
1. Constructing the successor URI from the current URI.
|
158
158
|
2. Reading the URI from the HTTP response, e.g. extracting an `<a>` tag's `href` property.
|
159
159
|
|
160
|
-
For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`.
|
160
|
+
For the first case, say we're on `https://github.com/:user/:repo` and want to go to `https://github.com/:user/:repo/issues`. `#stage` takes relative paths and URIs too, and constructs absolute URIs by appending to the current page's URI:
|
161
161
|
|
162
162
|
{% highlight ruby %}
|
163
163
|
class CollectGithubIssues < Wayfarer::Job
|
164
164
|
# ...
|
165
165
|
|
166
166
|
def index
|
167
|
-
# page
|
168
|
-
stage
|
167
|
+
# Stages "#{page.uri}/issues"
|
168
|
+
stage "issues"
|
169
169
|
end
|
170
170
|
|
171
171
|
# ...
|
@@ -0,0 +1,16 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Custom rules
|
4
|
+
---
|
5
|
+
|
6
|
+
# Custom rules
|
7
|
+
|
8
|
+
Custom rules take either a block, which is yielded the URI, or an object that responds to `#call(uri)`. If the block or the delegate returns a truthy value, the rule matches.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
route.if -> (uri) { uri.host == uri.host.reverse }
|
13
|
+
end
|
14
|
+
{% endhighlight %}
|
15
|
+
|
16
|
+
* Matches only URIs with palindrome hosts
|
@@ -0,0 +1,17 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: Protocol rules
|
4
|
+
---
|
5
|
+
|
6
|
+
# Protocol rules
|
7
|
+
|
8
|
+
Protocol rules match a URI's scheme against a symbol or string.
|
9
|
+
|
10
|
+
{% highlight ruby %}
|
11
|
+
class DummyJob < Wayfarer::Job
|
12
|
+
route.protocol :https
|
13
|
+
end
|
14
|
+
{% endhighlight %}
|
15
|
+
|
16
|
+
* Matches `https://example.com`.
|
17
|
+
* Does not match `http://example.com`.
|
data/lib/wayfarer.rb
CHANGED
@@ -13,6 +13,8 @@ require_relative "wayfarer/routing/uri_rule"
|
|
13
13
|
require_relative "wayfarer/routing/host_rule"
|
14
14
|
require_relative "wayfarer/routing/path_rule"
|
15
15
|
require_relative "wayfarer/routing/query_rule"
|
16
|
+
require_relative "wayfarer/routing/protocol_rule"
|
17
|
+
require_relative "wayfarer/routing/custom_rule"
|
16
18
|
require_relative "wayfarer/routing/router"
|
17
19
|
|
18
20
|
# Networking
|
@@ -47,7 +49,7 @@ require_relative "wayfarer/dispatcher"
|
|
47
49
|
require_relative "wayfarer/processor"
|
48
50
|
|
49
51
|
module Wayfarer
|
50
|
-
VERSION = "0.0.1"
|
52
|
+
VERSION = "0.0.2"
|
51
53
|
|
52
54
|
def self.logger
|
53
55
|
return @logger if @logger
|
data/lib/wayfarer/job.rb
CHANGED
@@ -144,11 +144,22 @@ module Wayfarer
|
|
144
144
|
end
|
145
145
|
|
146
146
|
# Adds URIs to process in the next cycle.
|
147
|
-
# If a relative
|
148
|
-
#
|
147
|
+
# If a relative path is given, an absolute URI is constructed from the
|
148
|
+
# current {#page}'s URI.
|
149
149
|
# @param [String, URI, Array<String>, Array<URI>]
|
150
150
|
def stage(*uris)
|
151
|
-
|
151
|
+
expanded = uris.flatten.map do |u|
|
152
|
+
if (uri = URI(u)).absolute?
|
153
|
+
uri
|
154
|
+
else
|
155
|
+
# URI#join would discard the path of page.uri.path
|
156
|
+
current = page.uri.dup
|
157
|
+
current.path = File.join(page.uri.path, uri.path)
|
158
|
+
current
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
@staged_uris.push(*expanded)
|
152
163
|
end
|
153
164
|
|
154
165
|
# The {Page} representing the URI currently processed by an action.
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module Wayfarer
|
6
|
+
module Routing
|
7
|
+
# @private
|
8
|
+
class CustomRule < Rule
|
9
|
+
def initialize(delegate_or_block = proc, opts = {}, &proc)
|
10
|
+
@delegate_or_block = delegate_or_block
|
11
|
+
super(opts, &proc)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def match!(uri)
|
17
|
+
!!@delegate_or_block.call(uri)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module Wayfarer
|
6
|
+
module Routing
|
7
|
+
# @private
|
8
|
+
class ProtocolRule < Rule
|
9
|
+
def initialize(protocol, opts = {}, &proc)
|
10
|
+
@protocol = protocol.to_s
|
11
|
+
super(opts, &proc)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def match!(uri)
|
17
|
+
uri.scheme == @protocol
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -79,6 +79,14 @@ module Wayfarer
|
|
79
79
|
append_child_rule(QueryRule.new(*argv, &proc))
|
80
80
|
end
|
81
81
|
|
82
|
+
def protocol(*argv, &proc)
|
83
|
+
append_child_rule(ProtocolRule.new(*argv, &proc))
|
84
|
+
end
|
85
|
+
|
86
|
+
def if(*argv, &proc)
|
87
|
+
append_child_rule(CustomRule.new(*argv, &proc))
|
88
|
+
end
|
89
|
+
|
82
90
|
private
|
83
91
|
|
84
92
|
def append_child_rule(other)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer, integration: true do
|
5
|
+
subject(:job) do
|
6
|
+
Class.new(Wayfarer::Job) do
|
7
|
+
config.reraise_exceptions = true
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
let(:entry) { test_app("/graph/index.html") }
|
12
|
+
|
13
|
+
describe "Peeking" do
|
14
|
+
it "works" do
|
15
|
+
job.class_eval do
|
16
|
+
route.path "/graph/index.html", to: :index
|
17
|
+
route.path "/graph/details/a.html", to: :detail
|
18
|
+
|
19
|
+
def index
|
20
|
+
peek = yield "http://localhost:9876/graph/details/a.html"
|
21
|
+
fail unless peek == :ok
|
22
|
+
end
|
23
|
+
|
24
|
+
def detail
|
25
|
+
:ok
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
expect {
|
30
|
+
job.perform_now(entry)
|
31
|
+
}.not_to raise_error
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "Recursive peeking" do
|
36
|
+
it "does not work" do
|
37
|
+
job.class_eval do
|
38
|
+
route.path "/graph/index.html", to: :index
|
39
|
+
route.path "/graph/details/a.html", to: :a
|
40
|
+
route.path "/graph/details/a.html", to: :b
|
41
|
+
|
42
|
+
def index
|
43
|
+
peek = yield "http://localhost:9876/graph/details/a.html"
|
44
|
+
fail unless peek == :ok
|
45
|
+
end
|
46
|
+
|
47
|
+
def a
|
48
|
+
yield "http://localhost:9876/graph/details/b.html" or :ok
|
49
|
+
end
|
50
|
+
|
51
|
+
def b
|
52
|
+
yield "http://localhost:9876/graph/details/a.html"
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
expect {
|
57
|
+
job.perform_now(entry)
|
58
|
+
}.not_to raise_error
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
data/spec/job_spec.rb
CHANGED
@@ -72,15 +72,39 @@ describe Wayfarer::Job do
|
|
72
72
|
describe "#stage" do
|
73
73
|
it "stages URIs" do
|
74
74
|
job_instance = job.new
|
75
|
+
job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
|
75
76
|
|
76
77
|
uris = %w(
|
77
78
|
http://google.com
|
78
79
|
http://example.com
|
79
80
|
)
|
80
81
|
|
82
|
+
expected = uris.map { |u| URI(u) }
|
83
|
+
|
84
|
+
expect {
|
85
|
+
job_instance.send(:stage, *uris)
|
86
|
+
}.to change { job_instance.staged_uris }.from([]).to(expected)
|
87
|
+
end
|
88
|
+
|
89
|
+
it "expands relative URIs" do
|
90
|
+
job_instance = job.new
|
91
|
+
job_instance.page = Page.new(uri: URI("https://yahoo.com/qux"))
|
92
|
+
|
93
|
+
uris = %w(
|
94
|
+
/foo/bar
|
95
|
+
bar/qux.html
|
96
|
+
barfoo
|
97
|
+
)
|
98
|
+
|
99
|
+
expected = %w(
|
100
|
+
https://yahoo.com/qux/foo/bar
|
101
|
+
https://yahoo.com/qux/bar/qux.html
|
102
|
+
https://yahoo.com/qux/barfoo
|
103
|
+
).map { |u| URI(u) }
|
104
|
+
|
81
105
|
expect {
|
82
106
|
job_instance.send(:stage, *uris)
|
83
|
-
}.to change { job_instance.staged_uris }.from([]).to(
|
107
|
+
}.to change { job_instance.staged_uris }.from([]).to(expected)
|
84
108
|
end
|
85
109
|
end
|
86
110
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer::Routing::CustomRule do
|
5
|
+
let(:uri) { URI("http://example.com") }
|
6
|
+
|
7
|
+
describe "#matches?" do
|
8
|
+
context "with a block" do
|
9
|
+
context "when block is truthy" do
|
10
|
+
subject(:rule) { CustomRule.new -> (uri) { uri.is_a?(URI) } }
|
11
|
+
|
12
|
+
it "returns true" do
|
13
|
+
expect(rule.matches?(uri)).to be true
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
context "when block is fals-y" do
|
18
|
+
subject(:rule) { CustomRule.new -> (_) { false } }
|
19
|
+
|
20
|
+
it "returns true" do
|
21
|
+
expect(rule.matches?(uri)).to be false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "spec_helpers"
|
3
|
+
|
4
|
+
describe Wayfarer::Routing::ProtocolRule do
|
5
|
+
describe "#matches?" do
|
6
|
+
context "with a string" do
|
7
|
+
subject(:rule) { ProtocolRule.new(:http) }
|
8
|
+
|
9
|
+
context "with matching URI" do
|
10
|
+
let(:uri) { URI("http://example.com") }
|
11
|
+
|
12
|
+
it "returns true" do
|
13
|
+
expect(rule.matches?(uri)).to be true
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
context "with mismatching URI" do
|
18
|
+
let(:uri) { URI("https://example.com") }
|
19
|
+
|
20
|
+
it "returns true" do
|
21
|
+
expect(rule.matches?(uri)).to be false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
data/spec/routing/rule_spec.rb
CHANGED
@@ -168,6 +168,28 @@ describe Wayfarer::Routing::Rule do
|
|
168
168
|
end
|
169
169
|
end
|
170
170
|
|
171
|
+
describe "#protocol" do
|
172
|
+
it "adds a ProtocolRule as a sub-rule" do
|
173
|
+
rule.protocol(:https)
|
174
|
+
expect(rule.child_rules.first).to be_a ProtocolRule
|
175
|
+
end
|
176
|
+
|
177
|
+
it "returns the added QueryRule" do
|
178
|
+
expect(rule.protocol(:https)).to be_a ProtocolRule
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
describe "#if" do
|
183
|
+
it "adds a CustomRule as a sub-rule" do
|
184
|
+
rule.if -> () { true }
|
185
|
+
expect(rule.child_rules.first).to be_a CustomRule
|
186
|
+
end
|
187
|
+
|
188
|
+
it "returns the added QueryRule" do
|
189
|
+
expect(rule.if -> () { true }).to be_a CustomRule
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
171
193
|
describe "#build_child_rule_chain_from_options" do
|
172
194
|
subject(:rule) do
|
173
195
|
rule = Rule.new
|
data/wayfarer-jruby.gemspec
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "wayfarer-jruby"
|
5
|
-
s.version = "0.0.1"
|
5
|
+
s.version = "0.0.2"
|
6
6
|
s.license = "MIT"
|
7
7
|
|
8
8
|
s.homepage = "http://github.com/bauerd/wayfarer"
|
9
9
|
s.description = "Versatile web crawling with JRuby"
|
10
10
|
s.summary = s.description
|
11
11
|
|
12
|
-
s.date = "
|
12
|
+
s.date = "2017-05-31"
|
13
13
|
s.authors = ["Dominic Bauer"]
|
14
14
|
s.email = "bauerdominic@gmail.com"
|
15
15
|
|
data/wayfarer.gemspec
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
Gem::Specification.new do |s|
|
3
3
|
s.name = "wayfarer"
|
4
|
-
s.version = "0.0.1"
|
4
|
+
s.version = "0.0.2"
|
5
5
|
s.license = "MIT"
|
6
6
|
|
7
7
|
s.homepage = "http://github.com/bauerd/wayfarer"
|
8
8
|
s.description = "Versatile web crawling with Ruby"
|
9
9
|
s.summary = s.description
|
10
10
|
|
11
|
-
s.date = "
|
11
|
+
s.date = "2017-05-31"
|
12
12
|
s.authors = ["Dominic Bauer"]
|
13
13
|
s.email = "bauerdominic@gmail.com"
|
14
14
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayfarer-jruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dominic Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-05-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: connection_pool
|
@@ -374,6 +374,7 @@ files:
|
|
374
374
|
- ".ruby-version"
|
375
375
|
- ".travis.yml"
|
376
376
|
- ".yardopts"
|
377
|
+
- Changelog.md
|
377
378
|
- Gemfile
|
378
379
|
- LICENSE
|
379
380
|
- README.md
|
@@ -516,8 +517,10 @@ files:
|
|
516
517
|
- docs/recipes/javascript.md
|
517
518
|
- docs/recipes/multiple_uris.md
|
518
519
|
- docs/recipes/screenshots.md
|
520
|
+
- docs/routing/custom_rules.md
|
519
521
|
- docs/routing/host_rules.md
|
520
522
|
- docs/routing/path_rules.md
|
523
|
+
- docs/routing/protocol_rules.md
|
521
524
|
- docs/routing/query_rules.md
|
522
525
|
- docs/routing/routes.md
|
523
526
|
- docs/routing/uri_rules.md
|
@@ -545,8 +548,10 @@ files:
|
|
545
548
|
- lib/wayfarer/parsers/json_parser.rb
|
546
549
|
- lib/wayfarer/parsers/xml_parser.rb
|
547
550
|
- lib/wayfarer/processor.rb
|
551
|
+
- lib/wayfarer/routing/custom_rule.rb
|
548
552
|
- lib/wayfarer/routing/host_rule.rb
|
549
553
|
- lib/wayfarer/routing/path_rule.rb
|
554
|
+
- lib/wayfarer/routing/protocol_rule.rb
|
550
555
|
- lib/wayfarer/routing/query_rule.rb
|
551
556
|
- lib/wayfarer/routing/router.rb
|
552
557
|
- lib/wayfarer/routing/rule.rb
|
@@ -565,13 +570,16 @@ files:
|
|
565
570
|
- spec/http_adapters/selenium_adapter_spec.rb
|
566
571
|
- spec/integration/callbacks_spec.rb
|
567
572
|
- spec/integration/locals_spec.rb
|
573
|
+
- spec/integration/peeking_spec.rb
|
568
574
|
- spec/job_spec.rb
|
569
575
|
- spec/page_spec.rb
|
570
576
|
- spec/parsers/json_parser_spec.rb
|
571
577
|
- spec/parsers/xml_parser_spec.rb
|
572
578
|
- spec/processor_spec.rb
|
579
|
+
- spec/routing/custom_rule_spec.rb
|
573
580
|
- spec/routing/host_rule_spec.rb
|
574
581
|
- spec/routing/path_rule_spec.rb
|
582
|
+
- spec/routing/protocol_rule_spec.rb
|
575
583
|
- spec/routing/query_rule_spec.rb
|
576
584
|
- spec/routing/router_spec.rb
|
577
585
|
- spec/routing/rule_spec.rb
|