iudex-core 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +21 -0
- data/Manifest.txt +9 -1
- data/Rakefile +6 -6
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +4 -4
- data/build/effective_tld_name.dat +432 -29
- data/config/mojibake +268 -0
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.1.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +73 -0
- data/lib/iudex-core.rb +8 -2
- data/pom.xml +5 -5
- data/test/test_content_fetcher.rb +37 -39
- data/test/test_content_source.rb +75 -0
- data/test/test_mojibake.rb +58 -0
- data/test/test_redirect_handler.rb +170 -0
- data/test/test_visit_manager.rb +107 -0
- data/test/test_visit_queue.rb +268 -0
- data/test/test_visit_url.rb +150 -0
- metadata +26 -16
- data/lib/iudex-core/iudex-core-1.0.0.jar +0 -0
@@ -0,0 +1,170 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
require 'iudex-core'
|
22
|
+
|
23
|
+
# Tests for RedirectHandler driven by hand-built orders.
#
# Each test fabricates an order (a UniMap) with a redirect status and a
# "Location" response header, runs it through RedirectHandler#filter, and
# then inspects either the original order (failure cases) or the follow-up
# order that the handler stored under ContentKeys::REVISIT_ORDER.
class TestRedirectHandler < MiniTest::Unit::TestCase
  include Iudex::HTTP
  include Iudex::Core
  include Iudex::Core::Filters
  include Gravitext::HTMap

  UniMap.define_accessors

  # A single 301 hop: the original order keeps its URL/status/priority and
  # the revisit order points at the new location with a raised priority.
  def test_first_redirect
    order = new_order
    redirect( order )
    do_filter( order )

    assert_equal( "http://www/1", order.url.to_s )
    assert_equal( 301, order.status )
    assert_equal( 1.0, order.priority )

    order = revisit( order )
    assert_nil( order.status )
    assert_equal( 1.5, order.priority )

    assert_equal( "http://www/2", order.url.to_s )
    assert_equal( "http://www/1", order.last.url.to_s )

    assert_equal( "http://www/1", order.referer.url.to_s )
    assert_equal( "http://www/2", order.referer.referent.url.to_s )
  end

  # Two hops (301 then 302): referer stays the first URL, referent tracks
  # the latest target.
  def test_second_redirect
    order = new_order
    redirect( order, 301, 2 )
    do_filter( order )

    order = revisit( order )
    redirect( order, 302, 3 )
    do_filter( order )

    order = revisit( order )

    assert_equal( 2.0, order.priority )

    assert_equal( "http://www/3", order.url.to_s )
    assert_equal( "http://www/2", order.last.url.to_s )
    assert_equal( "http://www/1", order.referer.url.to_s )
    assert_equal( "http://www/3", order.referer.referent.url.to_s )
  end

  # Three hops (301, 302, 303): the chain of `last` orders walks back
  # through every intermediate URL.
  def test_third_redirect
    order = new_order
    redirect( order, 301, 2 )
    do_filter( order )

    order = revisit( order )
    redirect( order, 302, 3 )
    do_filter( order )

    order = revisit( order )
    redirect( order, 303, 4 )
    do_filter( order )

    order = revisit( order )

    assert_equal( 2.5, order.priority )

    assert_equal( "http://www/4", order.url.to_s )
    assert_equal( "http://www/3", order.last.url.to_s )
    assert_equal( "http://www/2", order.last.last.url.to_s )
    assert_equal( "http://www/1", order.referer.url.to_s )
    assert_equal( "http://www/4", order.referer.referent.url.to_s )
  end

  # A redirect whose Location is the order's own URL is reported as a loop
  # and produces no revisit order.
  def test_self_redirect
    order = new_order
    redirect( order, 307, 1 )
    do_filter( order )

    assert_equal( "http://www/1", order.url.to_s )
    assert_equal( HTTPSession::REDIRECT_LOOP, order.status )
    assert_equal( 1.0, order.priority )
    assert_nil( order.revisit_order )
  end

  # A redirect status without any Location header is flagged with
  # MISSING_REDIRECT_LOCATION and produces no revisit order.
  def test_missing_location
    order = new_order
    order.status = 307
    do_filter( order )

    assert_equal( "http://www/1", order.url.to_s )
    assert_equal( HTTPSession::MISSING_REDIRECT_LOCATION, order.status )
    assert_equal( 1.0, order.priority )
    assert_nil( order.revisit_order )
  end

  # 1 -> 2 -> 3 -> 1: the third hop points back at the start of the path
  # and must be detected as a loop on the order for URL 3.
  def test_redirect_loop
    order = new_order
    redirect( order, 301, 2 )
    do_filter( order )

    order = revisit( order )
    redirect( order, 302, 3 )
    do_filter( order )

    order = revisit( order )
    redirect( order, 303, 1 ) #Eat our tail
    do_filter( order )

    assert_equal( "http://www/3", order.url.to_s )
    assert_equal( HTTPSession::REDIRECT_LOOP, order.status )
    assert_nil( order.revisit_order )
  end

  # With max_path limited to 3, the chain of 302 hops stops at URL 3 with
  # MAX_REDIRECTS_EXCEEDED.
  def test_max_path
    order = new_order
    (2..5).each do |i|
      redirect( order, 302, i )
      do_filter( order, 3 )
      # No revisit order means the handler refused to follow further.
      order = revisit( order ) || break
    end

    assert_equal( "http://www/3", order.url.to_s )
    assert_equal( HTTPSession::MAX_REDIRECTS_EXCEEDED, order.status )
    assert_nil( order.revisit_order )
  end

  # Run +order+ through a fresh RedirectHandler configured with +max_path+
  # and return whatever RedirectHandler#filter returns.
  def do_filter( order, max_path = 4 )
    handler = RedirectHandler.new
    handler.max_path = max_path
    handler.filter( order )
  end

  # Detach and return the revisit order stored on +order+ (nil if the
  # key is not set).
  def revisit( order )
    order.remove( ContentKeys::REVISIT_ORDER )
  end

  # Build a fresh order for "http://www/<i>" with priority 1.0.
  def new_order( i = 1 )
    UniMap.new.tap do |o|
      o.url = VisitURL.normalize( "http://www/#{i}" )
      o.priority = 1.0
    end
  end

  # Mark order +o+ as redirected with HTTP status +s+ to "http://WWW/<r>".
  # The Location host is deliberately upper-case while the assertions
  # above expect lower-case URLs.
  def redirect( o, s = 301, r = 2 )
    # Honor the requested status; previously this was hard-coded to 301,
    # so 302/303/307 callers were silently tested as 301.
    o.status = s
    o.response_headers = [ Header.new( "Location", "http://WWW/#{r}" ) ]
  end

end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
require 'iudex-core'
|
22
|
+
|
23
|
+
# Smoke test for VisitManager: a stub poll strategy feeds it batches of
# orders and a single test filter releases each order back to the manager
# after a short random delay. The test passes once 20 orders have been
# seen by the filter within 5 seconds.
class TestVisitManager < MiniTest::Unit::TestCase
  include Gravitext::HTMap
  include Iudex::Filter
  include Iudex::Filter::Core
  include Iudex::Core

  import 'iudex.core.GenericWorkPollStrategy'

  import 'java.util.concurrent.Executors'
  import 'java.util.concurrent.TimeUnit'
  import 'java.util.concurrent.CountDownLatch'

  UniMap.define_accessors

  # Wire up the latch, the release scheduler and a manager whose filter
  # chain consists of one counting/releasing filter.
  def setup
    @latch     = CountDownLatch.new( 20 )
    @manager   = VisitManager.new( TestWorkPoller.new )
    @scheduler = Executors::new_scheduled_thread_pool( 1 )

    releasing_filter = fltr do |visited|
      # Hand the order back to the manager after up to 20ms.
      release_job = proc { @manager.release( visited, nil ) }
      @scheduler.schedule( release_job,
                           rand( 20_000 ), TimeUnit::MICROSECONDS )
      # Must stay the last expression: it is the block's return value.
      @latch.countDown()
    end

    @manager.filter_chain = FilterChain.new( "test", [ releasing_filter ] )
  end

  # Stop the scheduler and the manager if setup got far enough to
  # create them.
  def teardown
    @scheduler && @scheduler.shutdown_now
    @manager && @manager.shutdown
  end

  # Start the manager and wait (max 5s) for the filter to count the
  # latch down to zero.
  def test
    @manager.start
    pass
    drained = @latch.await( 5, TimeUnit::SECONDS )
    assert( drained )
    @manager.shutdown
    pass
  end

  # Poll strategy stub: every poll adds the same eight orders (three
  # hosts) to the visit queue, with the batch number in each URL path.
  class TestWorkPoller < GenericWorkPollStrategy
    include Gravitext::HTMap
    include Iudex::Core

    def initialize
      super()
      self.min_poll_interval = 5
      self.max_check_interval = 21
      self.max_poll_interval = 130 #ms
      @batch = 0
    end

    # Add one batch of orders to +visit_q+.
    def pollWorkImpl( visit_q )
      @batch += 1

      # Rows are: host prefix, path id, priority.
      specs = [ %w[ h2 a 2.2 ],
                %w[ h2 b 2.1 ],
                %w[ h2 c 2.0 ],
                %w[ h3 a 3.2 ],
                %w[ h3 b 3.1 ],
                %w[ h3 c 3.0 ],
                %w[ h1 a 1.2 ],
                %w[ h1 b 1.1 ] ]

      specs.each do |host, id, priority|
        visit_q.add( order( host, @batch, id, priority ) )
      end
    end

    # Build an order for "http://<host>.com/<batch>/<i>" with priority
    # +p+ (converted via to_f).
    def order( host, batch, i, p )
      fresh = UniMap.new
      fresh.url = visit_url( "http://#{host}.com/#{batch}/#{i}" )
      fresh.priority = p.to_f
      fresh
    end

    # Normalize +url+ into a VisitURL.
    def visit_url( url )
      VisitURL.normalize( url )
    end

  end

end
|
@@ -0,0 +1,268 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
require 'iudex-core'
|
22
|
+
|
23
|
+
# Tests for VisitQueue ordering and host-based access limits.
#
# Orders are built from small input rows of the form
# [ host_prefix, path, priority, (optional) release_delay_ms ]. The row
# is stored on the order under the 'vtest_input' key so that an acquired
# order can be compared against the expected row. Acquired orders are
# released back to the queue asynchronously by a scheduled ReleaseJob.
class TestVisitQueue < MiniTest::Unit::TestCase
  include Iudex::Core
  include Gravitext::HTMap

  import 'java.util.concurrent.Executors'
  import 'java.util.concurrent.TimeUnit'
  import 'java.lang.Runnable'

  UniMap.create_key( 'vtest_input' )

  UniMap.define_accessors

  def setup
    @visit_q = VisitQueue.new
    @visit_q.default_min_host_delay = 50 #ms
    @scheduler = Executors::new_scheduled_thread_pool( 2 )
  end

  def teardown
    # assert_queue_empty sets @scheduler to nil once it has shut it down.
    @scheduler.shutdown_now if @scheduler
  end

  # Orders on a single host come out highest priority first.
  def test_priority
    orders = [ %w[ h a 1.3 ],
               %w[ h b 1.1 ],
               %w[ h c 1.2 ] ].map do |oinp|
      order( oinp )
    end

    @visit_q.add_all( orders )

    orders.sort { |a, b| b.priority <=> a.priority }.each do |o|
      assert_equal( o.vtest_input, acquire_order )
    end

    assert_queue_empty
  end

  # Add the shared eight-order fixture and check the queue counts.
  # The fixture URLs include sub-domains (w.h2, m.h3), yet the queue is
  # expected to report only 3 hosts.
  def add_common_orders
    [ %w[  h2 a 2.2 100 ],
      %w[ w.h2 b 2.1 ],
      %w[  h2 c 2.0 ],
      %w[  h3 a 3.2 130 ],
      %w[  h3 b 3.1 130 ],
      %w[ m.h3 c 3.0 ],
      %w[  h1 a 1.2 ],
      %w[  h1 b 1.1 ] ].each do |oinp|

      @visit_q.add( order( oinp ) )
    end

    assert_equal( 3, @visit_q.host_count, "host count" )
    assert_equal( 8, @visit_q.order_count, "order count" )
  end

  # Default settings: expected acquisition order across three hosts.
  def test_hosts_acquire
    add_common_orders

    assert_acquire_sequence( [ %w[  h3 a 3.2 ],
                               %w[  h2 a 2.2 ],
                               %w[  h1 a 1.2 ],
                               %w[  h1 b 1.1 ],
                               %w[ w.h2 b 2.1 ],
                               %w[  h3 b 3.1 ],
                               %w[  h2 c 2.0 ],
                               %w[ m.h3 c 3.0 ] ] )

    assert_queue_empty
  end

  # Per-host configuration for 'h2.com' changes the acquisition order.
  # NOTE(review): the meaning of the 75 and 2 arguments to configure_host
  # is not visible here -- presumably min host delay (ms) and max access
  # count; confirm against VisitQueue.
  def test_configure
    @visit_q.configure_host( 'h2.com', 75, 2 )

    [ %w[  h2 a 2.2 ],
      %w[ w.h2 b 2.1 ],
      %w[  h3 a 3.2 ],
      %w[  h3 b 3.1 ],
      %w[  h1 a 1.2 ],
      %w[  h1 b 1.1 ] ].each do |oinp|

      @visit_q.add( order( oinp ) )

    end
    assert_equal( 3, @visit_q.host_count, "host count" )

    assert_acquire_sequence( [ %w[  h3 a 3.2 ],
                               %w[  h2 a 2.2 ],
                               %w[  h1 a 1.2 ],
                               %w[  h3 b 3.1 ],
                               %w[  h1 b 1.1 ],
                               %w[ w.h2 b 2.1 ] ] )

    assert_queue_empty
  end

  # Expected acquisition order with default_max_access_per_host = 2.
  def test_multi_access_2
    @visit_q.default_max_access_per_host = 2
    add_common_orders

    assert_acquire_sequence( [ %w[  h3 a 3.2 ],
                               %w[  h2 a 2.2 ],
                               %w[  h1 a 1.2 ],
                               %w[  h3 b 3.1 ],
                               %w[ w.h2 b 2.1 ],
                               %w[  h1 b 1.1 ],
                               %w[  h2 c 2.0 ],
                               %w[ m.h3 c 3.0 ] ] )

    assert_queue_empty
  end

  # Expected acquisition order with default_max_access_per_host = 3.
  def test_multi_access_3
    @visit_q.default_max_access_per_host = 3
    add_common_orders

    assert_acquire_sequence( [ %w[  h3 a 3.2 ],
                               %w[  h2 a 2.2 ],
                               %w[  h1 a 1.2 ],
                               %w[  h3 b 3.1 ],
                               %w[ w.h2 b 2.1 ],
                               %w[  h1 b 1.1 ],
                               %w[ m.h3 c 3.0 ],
                               %w[  h2 c 2.0 ] ] )

    assert_queue_empty
  end

  # Stress test: acquire/release on the test thread interleaved with
  # randomly scheduled adds (up to 1024 extra orders) on the scheduler
  # threads. Every order counted in +c+ must eventually be acquired.
  def test_interleaved
    @visit_q.default_max_access_per_host = 2
    @visit_q.default_min_host_delay = 3 #ms
    @visit_q.configure_host( 'h2.com', 1, 4 )

    512.times do |i|
      @visit_q.add( random_order( i ) )
    end

    c = @visit_q.order_count   # orders still expected to be acquired
    added = 0                  # extra orders scheduled so far

    while c > 0
      o = @visit_q.acquire( 300 )
      flunk( "acquire returned null" ) unless o
      c -= 1
      @scheduler.schedule( ReleaseJob.new( @visit_q, o ),
                           rand( 20_000 ), TimeUnit::MICROSECONDS )

      # Randomly schedule zero or more additional adds per acquire.
      while ( added < 1024 ) && ( rand(3) != 1 )
        added += 1
        c += 1
        j = Job.new( added ) do |i|
          @visit_q.add( random_order( i ) )
        end
        @scheduler.schedule( j, rand( 20_000 ), TimeUnit::MICROSECONDS )
      end

    end

    assert_queue_empty
  end

  # Acquire one order per row of +expected+ and assert that each acquired
  # order's input row equals the expected row. The 1-based position is
  # passed as the assertion message to identify the failing step.
  def assert_acquire_sequence( expected )
    expected.each_with_index do |row, idx|
      assert_equal( row, acquire_order, idx + 1 )
    end
  end

  # Let pending scheduler jobs finish (waiting up to 2s), then assert
  # that the queue holds no orders and no hosts.
  def assert_queue_empty
    @scheduler.shutdown
    @scheduler.await_termination( 2, TimeUnit::SECONDS )
    @scheduler = nil   # tells teardown there is nothing left to stop
    assert_equal( 0, @visit_q.order_count, "order count" )
    assert_equal( 0, @visit_q.host_count, "host count" )
  end

  # Acquire the next order (waiting up to 200ms) and schedule its release
  # after the delay given in the 4th input column (default 20ms).
  # Returns the first three input columns, or nil if nothing was acquired.
  def acquire_order
    o = @visit_q.acquire( 200 )
    return nil unless o

    input = o.vtest_input
    delay = ( input[3] || 20 ).to_i
    @scheduler.schedule( ReleaseJob.new( @visit_q, o ),
                         delay,
                         TimeUnit::MILLISECONDS )
    input.slice( 0..2 )
  end

  # Build an order from an input row [ host, path, priority, ... ]; the
  # full row is kept on the order as vtest_input.
  def order( args )
    host, c, p = args
    UniMap.new.tap do |o|
      o.url = visit_url( "http://#{host}.com/#{c}" )
      o.priority = p.to_f
      o.vtest_input = args
    end
  end

  # Build an order for path +i+ on a random host (h1 or h2) with a
  # random priority in [0, 5).
  def random_order( i )
    order( [ %w[ h1 h2 ][rand( 2 )], i, 5 * rand ] )
  end

  # Normalize +url+ into a VisitURL.
  def visit_url( url )
    VisitURL.normalize( url )
  end

  LOG = RJack::SLF4J[ self ]

  # Runnable that releases a previously acquired order back to the queue.
  # Errors are logged rather than raised.
  class ReleaseJob
    include Runnable

    def initialize( visit_q, order )
      super()
      @visit_q = visit_q
      @order = order
    end

    def run
      @visit_q.release( @order, nil )
    rescue => e
      LOG.error( e )
    end
  end

  # Runnable wrapper around an arbitrary block; the constructor arguments
  # are passed to the block when run. Errors are logged rather than
  # raised.
  class Job
    include Runnable

    def initialize( *args, &block )
      @block = block
      @args = args
    end

    def run
      @block.call( *@args )
    rescue => e
      LOG.error( e )
    end
  end

end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-core'
|
23
|
+
|
24
|
+
# Tests for VisitURL normalization, hashing and relative resolution.
#
# The normalization tests use "sets": arrays whose first element is the
# expected (already normal) URL and whose remaining elements are raw
# variants that must normalize to the same string.
class TestVisitURL < MiniTest::Unit::TestCase
  include Iudex::Core

  # def setup; end
  # def teardown end

  def test_normalize_basic

    sets = [ %w[ http://h.c/foo http://h.c/foo
                                http://h.c//foo
                                http://h.c/foo#anchor
                                HTTP://H.C:80/foo
                                HTTP://h.c/bar/../foo
                                http://h.c/./foo
                                http://h.c./foo
                                http://h.c/foo? ],

             %w[ http://h.c/ http://h.c ],

             %w[ http://h.c/?x=a%26b http://h.c/?x=a%26b ],

             [ "http://h.c/foo", " \thttp://h.c/foo\n\r\t" ],
             [ "http://h.c/foo?q=a+b", "http://h.c/foo?q=a+b" ],
             [ "http://h.c/foo?q=a%20b", "http://h.c/foo?q=a b",
                                         "http://h.c/foo?q=a b",
                                         "HTTP://h.c/foo?q=a%20b#anchor",
                                         "http://h.c/foo?q=a\t b#anchor\t" ] ]

    assert_normalized_sets( sets )
  end

  def test_normalize_utf8

    sets = [ %w[ http://h.c/f%C5%8Do HTTP://h.c/fōo ] ]

    assert_normalized_sets( sets )
  end

  def test_normalize_escape_case
    skip( "Escape normalizations not implemented" )

    sets = [ %w[ http://h.c/?x=a%3Ab http://h.c/?x=a%3ab ],
             %w[ http://h.c/%C2 http://h.c/%C2
                                http://h.c/%c2 ],
             %w[ http://h.c/foo%20bar HTTP://h.c/%66oo%20bar ],
             %w[ http://h.c/a%5Bb%5D http://h.c/a[b] ] ]

    assert_normalized_sets( sets )
  end

  def test_normalize_idn
    skip( "IDN normalization not implemented" )

    sets = [ %w[ http://xn--bcher-kva.ch/ http://Bücher.ch ] ]

    assert_normalized_sets( sets )
  end

  # uhash of a normalized URL has a fixed, known string form.
  def test_uhash
    h = VisitURL.normalize( "http://gravitext.com/" ).uhash
    assert_equal( "8dOml647JKxoA1vSNdi3WAK", h.to_s )

    h = VisitURL.normalize( "http://gravitext.com/x/y" ).uhash
    assert_equal( "0pRfQvGEzGRMQ-RgFbytf7l", h.to_s )
  end

  # hash_domain of a domain name has a fixed, known string form.
  def test_domain_hash
    d = VisitURL.hash_domain( "gravitext.com" )
    assert_equal( "VdYKPM", d.to_s )

    d = VisitURL.hash_domain( "other.com" )
    assert_equal( "ZleSiQ", d.to_s )
  end

  # Rows are [ expected, base, relative ]: resolving +relative+ against
  # the normalized +base+ must give the normalized +expected+.
  def test_resolve

    sets = [ %w[ http://h.c/ http://h.c/foo ] << "",
             %w[ http://h.c/ http://h.c/ ] << "",
             %w[ http://h.c/ http://h.c/foo ] << " ",

             %w[ http://h.c/ http://h.c/foo . ],
             %w[ http://h.c/bar http://h.c/foo /bar ],
             %w[ http://h.c/bar http://h.c/foo bar ],
             %w[ http://h.c/bar http://h.c/foo?q=1 bar ],
             %w[ http://h.c/bar http://h.c/foo/x/y /bar ],
             %w[ http://h.c/foo/bar http://h.c/foo/x/y ../bar ],
             %w[ http://h.c/foo/bar http://h.c/foo/ bar ],

             %w[ http://h.c/a%20b/c%20d http://h.c/a%20b/f ] << "c d",

             %w[ http://h.c/bar?q=1 http://h.c/foo bar?q=1 ],
             %w[ http://h.c/bar?q=1 http://h.c/foo/ /bar?q=1 ],
             %w[ http://h.c/bar?q=1 http://h.c/foo?x=2 bar?q=1 ],
             %w[ http://h.c/foo/bar?q=1 http://h.c/foo/ bar?q=1 ],
             %w[ http://h.c/foo/bar?q=1 http://h.c/foo/ ./bar?q=1 ] ]

    sets.each do |e,b,r|
      expected = VisitURL.normalize( e )
      base = VisitURL.normalize( b )
      resolved = base.resolve( r )

      assert_equal( expected.to_s, resolved.to_s, [ e,b,r ].inspect )
    end

  end

  # For every set, normalize the first element and assert that each
  # remaining raw URL normalizes to the same string. The raw URL is used
  # as the failure message so the offending input is identifiable.
  def assert_normalized_sets( sets )
    sets.each do |tset|
      first, *raws = tset
      expected = VisitURL.normalize( first )
      raws.each do |raw|
        assert_equal( expected.to_s, VisitURL.normalize( raw ).to_s,
                      raw.inspect )
      end
    end
  end

end
|