iudex-core 1.2.1-java → 1.3.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +11 -0
- data/Manifest.txt +2 -1
- data/Rakefile +1 -0
- data/build/effective_tld_name.dat +103 -21
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.3.0.jar +0 -0
- data/lib/iudex-core/visit_queue.rb +86 -0
- data/lib/iudex-core.rb +1 -0
- data/pom.xml +5 -5
- data/test/test_visit_manager.rb +5 -0
- data/test/test_visit_queue.rb +39 -3
- metadata +216 -177
- data/lib/iudex-core/iudex-core-1.2.1.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 1.3.0 (2012-11-8)
|
2
|
+
* Add DomainKey with optional :type to support configuring of
|
3
|
+
a :domain,:type specific HostQueue
|
4
|
+
* Add VisitQueue.config( options ) extensions for cleaner
|
5
|
+
configuration of HostQueue with type, rate, etc.
|
6
|
+
* Update TLDSets based on upstream c61f326ad19f 2012-10-25
|
7
|
+
* Upgrade to gravitext-util ~> 1.7.0
|
8
|
+
* Upgrade to slf4j ~> 1.7.0, logback ~> 1.5 (dev)
|
9
|
+
* Misc java logging simplifications with slf4j 1.7 varargs
|
10
|
+
* Expose GenericWorkPollStrategy.log()
|
11
|
+
|
1
12
|
=== 1.2.1 (2012-9-15)
|
2
13
|
* Upgrade/narrow to gravitext-util ~> 1.6.1
|
3
14
|
* Upgrade to slf4j [1.6.5,1.8), logback ~> 1.2 (dev)
|
data/Manifest.txt
CHANGED
@@ -14,6 +14,7 @@ lib/iudex-core/base.rb
|
|
14
14
|
lib/iudex-core.rb
|
15
15
|
lib/iudex-core/config.rb
|
16
16
|
lib/iudex-core/mojibake.rb
|
17
|
+
lib/iudex-core/visit_queue.rb
|
17
18
|
test/setup.rb
|
18
19
|
test/test_charsets.rb
|
19
20
|
test/test_content_fetcher.rb
|
@@ -24,4 +25,4 @@ test/test_redirect_handler.rb
|
|
24
25
|
test/test_visit_manager.rb
|
25
26
|
test/test_visit_queue.rb
|
26
27
|
test/test_visit_url.rb
|
27
|
-
lib/iudex-core/iudex-core-1.2.1.jar
|
28
|
+
lib/iudex-core/iudex-core-1.3.0.jar
|
data/Rakefile
CHANGED
@@ -18,6 +18,7 @@ task :clean do
|
|
18
18
|
rm_f 'src/main/java/iudex/core/TLDSets.java'
|
19
19
|
end
|
20
20
|
|
21
|
+
desc "Download and install latest effective_tld_name.dat"
|
21
22
|
task :refresh_tld_dat do
|
22
23
|
sh( "curl http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1" +
|
23
24
|
" -o build/effective_tld_name.dat" )
|
@@ -219,7 +219,6 @@ net.au
|
|
219
219
|
org.au
|
220
220
|
edu.au
|
221
221
|
gov.au
|
222
|
-
csiro.au
|
223
222
|
asn.au
|
224
223
|
id.au
|
225
224
|
// Historic 2LDs (closed to new registration, but sites still exist)
|
@@ -950,9 +949,15 @@ gov.gr
|
|
950
949
|
// gs : http://en.wikipedia.org/wiki/.gs
|
951
950
|
gs
|
952
951
|
|
953
|
-
// gt : http://www.gt/
|
954
|
-
|
955
|
-
|
952
|
+
// gt : http://www.gt/politicas_de_registro.html
|
953
|
+
gt
|
954
|
+
com.gt
|
955
|
+
edu.gt
|
956
|
+
gob.gt
|
957
|
+
ind.gt
|
958
|
+
mil.gt
|
959
|
+
net.gt
|
960
|
+
org.gt
|
956
961
|
|
957
962
|
// gu : http://gadao.gov.gu/registration.txt
|
958
963
|
*.gu
|
@@ -4166,6 +4171,7 @@ name.my
|
|
4166
4171
|
|
4167
4172
|
// mz : http://www.gobin.info/domainname/mz-template.doc
|
4168
4173
|
*.mz
|
4174
|
+
!teledata.mz
|
4169
4175
|
|
4170
4176
|
// na : http://www.na-nic.com.na/
|
4171
4177
|
// http://www.info.na/domain/
|
@@ -5359,8 +5365,17 @@ ed.pw
|
|
5359
5365
|
go.pw
|
5360
5366
|
belau.pw
|
5361
5367
|
|
5362
|
-
// py : http://www.nic.py/
|
5363
|
-
|
5368
|
+
// py : http://www.nic.py/pautas.html#seccion_9
|
5369
|
+
// Confirmed by registry 2012-10-03
|
5370
|
+
com.py
|
5371
|
+
coop.py
|
5372
|
+
edu.py
|
5373
|
+
gov.py
|
5374
|
+
mil.py
|
5375
|
+
net.py
|
5376
|
+
org.py
|
5377
|
+
!nic.py
|
5378
|
+
!una.py
|
5364
5379
|
|
5365
5380
|
// qa : http://domains.qa/en/
|
5366
5381
|
qa
|
@@ -5999,20 +6014,20 @@ com.ug
|
|
5999
6014
|
org.ug
|
6000
6015
|
|
6001
6016
|
// uk : http://en.wikipedia.org/wiki/.uk
|
6017
|
+
// Submitted by registry <noc@nominet.org.uk> 2012-10-02
|
6002
6018
|
*.uk
|
6019
|
+
*.nhs.uk
|
6020
|
+
*.police.uk
|
6003
6021
|
*.sch.uk
|
6004
6022
|
!bl.uk
|
6005
6023
|
!british-library.uk
|
6006
|
-
!icnet.uk
|
6007
6024
|
!jet.uk
|
6008
6025
|
!mod.uk
|
6026
|
+
!national-library-scotland.uk
|
6009
6027
|
!nel.uk
|
6010
|
-
!nhs.uk
|
6011
6028
|
!nic.uk
|
6012
6029
|
!nls.uk
|
6013
|
-
!national-library-scotland.uk
|
6014
6030
|
!parliament.uk
|
6015
|
-
!police.uk
|
6016
6031
|
|
6017
6032
|
// us : http://en.wikipedia.org/wiki/.us
|
6018
6033
|
us
|
@@ -6288,8 +6303,19 @@ gov.vc
|
|
6288
6303
|
mil.vc
|
6289
6304
|
edu.vc
|
6290
6305
|
|
6291
|
-
// ve :
|
6292
|
-
|
6306
|
+
// ve : https://registro.nic.ve/
|
6307
|
+
// Confirmed by registry 2012-10-04
|
6308
|
+
ve
|
6309
|
+
co.ve
|
6310
|
+
com.ve
|
6311
|
+
e12.ve
|
6312
|
+
edu.ve
|
6313
|
+
gov.ve
|
6314
|
+
info.ve
|
6315
|
+
mil.ve
|
6316
|
+
net.ve
|
6317
|
+
org.ve
|
6318
|
+
web.ve
|
6293
6319
|
|
6294
6320
|
// vg : http://en.wikipedia.org/wiki/.vg
|
6295
6321
|
vg
|
@@ -6529,15 +6555,20 @@ priv.at
|
|
6529
6555
|
co.ca
|
6530
6556
|
|
6531
6557
|
// CentralNic : http://www.centralnic.com/names/domains
|
6532
|
-
// Confirmed by registry <gavin.brown@centralnic.com>
|
6558
|
+
// Confirmed by registry <gavin.brown@centralnic.com> 2012-09-27
|
6559
|
+
ae.org
|
6533
6560
|
ar.com
|
6534
6561
|
br.com
|
6535
6562
|
cn.com
|
6563
|
+
com.de
|
6536
6564
|
de.com
|
6537
6565
|
eu.com
|
6538
6566
|
gb.com
|
6567
|
+
gb.net
|
6539
6568
|
gr.com
|
6540
6569
|
hu.com
|
6570
|
+
hu.net
|
6571
|
+
jp.net
|
6541
6572
|
jpn.com
|
6542
6573
|
kr.com
|
6543
6574
|
no.com
|
@@ -6545,25 +6576,68 @@ qc.com
|
|
6545
6576
|
ru.com
|
6546
6577
|
sa.com
|
6547
6578
|
se.com
|
6579
|
+
se.net
|
6548
6580
|
uk.com
|
6581
|
+
uk.net
|
6549
6582
|
us.com
|
6583
|
+
us.org
|
6550
6584
|
uy.com
|
6551
6585
|
za.com
|
6552
|
-
gb.net
|
6553
|
-
jp.net
|
6554
|
-
se.net
|
6555
|
-
uk.net
|
6556
|
-
ae.org
|
6557
|
-
us.org
|
6558
|
-
com.de
|
6559
6586
|
|
6560
6587
|
// Opera Software, A.S.A.
|
6561
6588
|
// Requested by Yngve Pettersen <yngve@opera.com> 2009-11-26
|
6562
6589
|
operaunite.com
|
6563
6590
|
|
6564
6591
|
// Google, Inc.
|
6565
|
-
// Requested by Eduardo Vela <evn@google.com>
|
6592
|
+
// Requested by Eduardo Vela <evn@google.com> 2012-10-24
|
6566
6593
|
appspot.com
|
6594
|
+
blogspot.be
|
6595
|
+
blogspot.bj
|
6596
|
+
blogspot.ca
|
6597
|
+
blogspot.cf
|
6598
|
+
blogspot.ch
|
6599
|
+
blogspot.co.at
|
6600
|
+
blogspot.co.il
|
6601
|
+
blogspot.co.nz
|
6602
|
+
blogspot.co.uk
|
6603
|
+
blogspot.com
|
6604
|
+
blogspot.com.ar
|
6605
|
+
blogspot.com.au
|
6606
|
+
blogspot.com.br
|
6607
|
+
blogspot.com.es
|
6608
|
+
blogspot.cv
|
6609
|
+
blogspot.cz
|
6610
|
+
blogspot.de
|
6611
|
+
blogspot.dk
|
6612
|
+
blogspot.fi
|
6613
|
+
blogspot.fr
|
6614
|
+
blogspot.gr
|
6615
|
+
blogspot.hk
|
6616
|
+
blogspot.hu
|
6617
|
+
blogspot.ie
|
6618
|
+
blogspot.in
|
6619
|
+
blogspot.it
|
6620
|
+
blogspot.jp
|
6621
|
+
blogspot.kr
|
6622
|
+
blogspot.mr
|
6623
|
+
blogspot.mx
|
6624
|
+
blogspot.nl
|
6625
|
+
blogspot.no
|
6626
|
+
blogspot.pt
|
6627
|
+
blogspot.re
|
6628
|
+
blogspot.ro
|
6629
|
+
blogspot.se
|
6630
|
+
blogspot.sg
|
6631
|
+
blogspot.sk
|
6632
|
+
blogspot.td
|
6633
|
+
blogspot.tw
|
6634
|
+
codespot.com
|
6635
|
+
googleapis.com
|
6636
|
+
googlecode.com
|
6637
|
+
|
6638
|
+
// DreamHost : http://www.dreamhost.com/
|
6639
|
+
// Requested by Andrew Farmer <andrew.farmer@dreamhost.com> 2012-10-02
|
6640
|
+
dreamhosters.com
|
6567
6641
|
|
6568
6642
|
// iki.fi : Submitted by Hannu Aronsson <haa@iki.fi> 2009-11-05
|
6569
6643
|
iki.fi
|
@@ -6865,4 +6939,12 @@ webhop.org
|
|
6865
6939
|
worse-than.tv
|
6866
6940
|
writesthisblog.com
|
6867
6941
|
|
6942
|
+
// BetaInABox
|
6943
|
+
// Requested by adrian@betainabox.com 2012-09-13
|
6944
|
+
betainabox.com
|
6945
|
+
|
6946
|
+
// Red Hat, Inc. OpenShift : https://openshift.redhat.com/
|
6947
|
+
// Requested by Tim Kramer <tkramer@rhcloud.com> 2012-10-24
|
6948
|
+
rhcloud.com
|
6949
|
+
|
6868
6950
|
// ===END PRIVATE DOMAINS===
|
data/lib/iudex-core/base.rb
CHANGED
Binary file
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
|
19
|
+
module Iudex::Core
|
20
|
+
|
21
|
+
# Configuration extensions for Java::iudex.core.VisitQueue.
|
22
|
+
class VisitQueue
|
23
|
+
|
24
|
+
# Configure defaults, a specific domain or domain,type pair via an
|
25
|
+
# options Hash.
|
26
|
+
#
|
27
|
+
# ==== Options
|
28
|
+
#
|
29
|
+
# :domain:: Registration level domain String. If not specified,
|
30
|
+
# :type is ignored and other options apply as general
|
31
|
+
# defaults for all (otherwise un-configured
|
32
|
+
# domains/types).
|
33
|
+
#
|
34
|
+
# :type:: An optional type (e.g. PAGE). If specified, this
|
35
|
+
# :domain,:type pair will be given its own HostQueue with
|
36
|
+
# the other options applying exclusively to it.
|
37
|
+
#
|
38
|
+
# :rate:: Target maximum rate of crawl as a Float requests/second
|
39
|
+
# for this :domain(,:type) or the default for any not
|
40
|
+
# otherwise configured. Resource limits including :cons
|
41
|
+
# and HTTP client connections may further inhibit rate
|
42
|
+
# below this value. (Initial default is 2.0 req/second)
|
43
|
+
#
|
44
|
+
# :delay:: Alternative inverse to :rate as Integer milliseconds to
|
45
|
+
# delay between scheduling visits. If specified, takes
|
46
|
+
# precedence over rate.
|
47
|
+
#
|
48
|
+
# :cons:: Maximum number of concurrent requests for this
|
49
|
+
# :domain(,:type) or the default for any not otherwise
|
50
|
+
# configured. Note that the HTTP clients have their own
|
51
|
+
# per *host:port* destination connection limit which
|
52
|
+
# should generally be set higher than this value.
|
53
|
+
# (Initial default: 1)
|
54
|
+
#
|
55
|
+
def config( opts = {} )
|
56
|
+
|
57
|
+
if opts[ :domain ]
|
58
|
+
opts = { :rate => delay_to_rate( default_min_host_delay ),
|
59
|
+
:cons => default_max_access_per_host }.merge( opts )
|
60
|
+
configure_host( opts[ :domain ],
|
61
|
+
opts[ :type ], # includes nil
|
62
|
+
opts[ :delay ] || rate_to_delay( opts[ :rate ] ),
|
63
|
+
opts[ :cons ] )
|
64
|
+
else
|
65
|
+
if opts[ :rate ]
|
66
|
+
self.default_min_host_delay = rate_to_delay( opts[ :rate ] )
|
67
|
+
end
|
68
|
+
self.default_min_host_delay = opts[ :delay ] if opts[ :delay ]
|
69
|
+
self.default_max_access_per_host = opts[ :cons ] if opts[ :cons ]
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
def rate_to_delay( r )
|
77
|
+
( 1_000.0 / r ).round
|
78
|
+
end
|
79
|
+
|
80
|
+
def delay_to_rate( d )
|
81
|
+
( 1_000.0 / d )
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
data/lib/iudex-core.rb
CHANGED
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-core</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.
|
8
|
+
<version>1.3.0</version>
|
9
9
|
<name>Iudex Core System</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.3.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -30,19 +30,19 @@
|
|
30
30
|
<dependency>
|
31
31
|
<groupId>iudex</groupId>
|
32
32
|
<artifactId>iudex-filter</artifactId>
|
33
|
-
<version>[1.
|
33
|
+
<version>[1.3.0,1.3.999)</version>
|
34
34
|
</dependency>
|
35
35
|
|
36
36
|
<dependency>
|
37
37
|
<groupId>iudex</groupId>
|
38
38
|
<artifactId>iudex-http</artifactId>
|
39
|
-
<version>[1.
|
39
|
+
<version>[1.3.0,1.3.999)</version>
|
40
40
|
</dependency>
|
41
41
|
|
42
42
|
<dependency>
|
43
43
|
<groupId>iudex</groupId>
|
44
44
|
<artifactId>iudex-barc</artifactId>
|
45
|
-
<version>[1.
|
45
|
+
<version>[1.3.0,1.3.999)</version>
|
46
46
|
</dependency>
|
47
47
|
|
48
48
|
<dependency>
|
data/test/test_visit_manager.rb
CHANGED
data/test/test_visit_queue.rb
CHANGED
@@ -34,7 +34,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
34
34
|
|
35
35
|
def setup
|
36
36
|
@visit_q = VisitQueue.new
|
37
|
-
@visit_q.
|
37
|
+
@visit_q.config( :delay => 50 ) #ms
|
38
38
|
@scheduler = Executors::new_scheduled_thread_pool( 2 )
|
39
39
|
end
|
40
40
|
|
@@ -96,7 +96,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
96
96
|
end
|
97
97
|
|
98
98
|
def test_configure
|
99
|
-
@visit_q.
|
99
|
+
@visit_q.config( :domain => 'h2.com', :delay => 75, :cons => 2 )
|
100
100
|
|
101
101
|
[ %w[ h2 a 2.2 ],
|
102
102
|
%w[ w.h2 b 2.1 ],
|
@@ -125,6 +125,39 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
125
125
|
assert_queue_empty
|
126
126
|
end
|
127
127
|
|
128
|
+
def test_configure_type
|
129
|
+
@visit_q.config( :domain => 'h2.com',
|
130
|
+
:delay => 75, :cons => 2 )
|
131
|
+
@visit_q.config( :domain => 'h2.com', :type => 'ALT',
|
132
|
+
:delay => 50, :cons => 1 )
|
133
|
+
|
134
|
+
[ %w[ h2 a 2.2 ],
|
135
|
+
%w[ w.h2 b 2.1 ],
|
136
|
+
%w[ h2:ALT c 3.2 ],
|
137
|
+
%w[ h2:ALT d 3.1 ],
|
138
|
+
%w[ h1 a 1.2 ],
|
139
|
+
%w[ h1 b 1.1 ] ].each do |oinp|
|
140
|
+
|
141
|
+
@visit_q.add( order( oinp ) )
|
142
|
+
|
143
|
+
end
|
144
|
+
assert_equal( 3, @visit_q.host_count, "host count" )
|
145
|
+
|
146
|
+
expected = [ %w[ h2:ALT c 3.2 ],
|
147
|
+
%w[ h2 a 2.2 ],
|
148
|
+
%w[ h1 a 1.2 ],
|
149
|
+
%w[ h2:ALT d 3.1 ],
|
150
|
+
%w[ h1 b 1.1 ],
|
151
|
+
%w[ w.h2 b 2.1 ] ]
|
152
|
+
|
153
|
+
p = 0
|
154
|
+
expected.each do |o|
|
155
|
+
assert_equal( o, acquire_order, p += 1 )
|
156
|
+
end
|
157
|
+
|
158
|
+
assert_queue_empty
|
159
|
+
end
|
160
|
+
|
128
161
|
def test_multi_access_2
|
129
162
|
@visit_q.default_max_access_per_host = 2
|
130
163
|
add_common_orders
|
@@ -170,7 +203,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
170
203
|
def test_interleaved
|
171
204
|
@visit_q.default_max_access_per_host = 2
|
172
205
|
@visit_q.default_min_host_delay = 3 #ms
|
173
|
-
@visit_q.
|
206
|
+
@visit_q.config( :domain => 'h2.com', :delay => 1, :cons => 4 )
|
174
207
|
|
175
208
|
512.times do |i|
|
176
209
|
@visit_q.add( order( [ %w[ h1 h2 ][rand( 2 )], i, 5 * rand ] ) )
|
@@ -222,10 +255,13 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
222
255
|
|
223
256
|
def order( args )
|
224
257
|
host, c, p = args
|
258
|
+
host, t = host.split( ':' )
|
259
|
+
|
225
260
|
UniMap.new.tap do |o|
|
226
261
|
o.url = visit_url( "http://#{host}.com/#{c}" )
|
227
262
|
o.priority = p.to_f
|
228
263
|
o.vtest_input = args
|
264
|
+
o.type = t || 'PAGE'
|
229
265
|
end
|
230
266
|
end
|
231
267
|
|
metadata
CHANGED
@@ -1,191 +1,230 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: iudex-core
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
version: 1.2.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.3.0
|
6
6
|
platform: java
|
7
|
-
authors:
|
8
|
-
|
9
|
-
autorequire:
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rjack-slf4j
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ~>
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 1.7.0
|
21
|
+
none: false
|
22
|
+
requirement: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.7.0
|
27
|
+
none: false
|
28
|
+
prerelease: false
|
29
|
+
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: hooker
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ~>
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 1.0.0
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 1.0.0
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :runtime
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: gravitext-util
|
48
|
+
version_requirements: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ~>
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.7.0
|
53
|
+
none: false
|
54
|
+
requirement: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ~>
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 1.7.0
|
59
|
+
none: false
|
60
|
+
prerelease: false
|
61
|
+
type: :runtime
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: iudex-filter
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.3.0
|
69
|
+
none: false
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ~>
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 1.3.0
|
75
|
+
none: false
|
76
|
+
prerelease: false
|
77
|
+
type: :runtime
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: iudex-http
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ~>
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 1.3.0
|
85
|
+
none: false
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ~>
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 1.3.0
|
91
|
+
none: false
|
92
|
+
prerelease: false
|
93
|
+
type: :runtime
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: iudex-barc
|
96
|
+
version_requirements: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ~>
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 1.3.0
|
101
|
+
none: false
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ~>
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 1.3.0
|
107
|
+
none: false
|
108
|
+
prerelease: false
|
109
|
+
type: :runtime
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: minitest
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ~>
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '2.3'
|
117
|
+
none: false
|
118
|
+
requirement: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ~>
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '2.3'
|
123
|
+
none: false
|
124
|
+
prerelease: false
|
125
|
+
type: :development
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: rjack-logback
|
128
|
+
version_requirements: !ruby/object:Gem::Requirement
|
129
|
+
requirements:
|
130
|
+
- - ~>
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '1.5'
|
133
|
+
none: false
|
134
|
+
requirement: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ~>
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '1.5'
|
139
|
+
none: false
|
140
|
+
prerelease: false
|
141
|
+
type: :development
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: rjack-tarpit
|
144
|
+
version_requirements: !ruby/object:Gem::Requirement
|
145
|
+
requirements:
|
146
|
+
- - ~>
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '2.0'
|
149
|
+
none: false
|
150
|
+
requirement: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - ~>
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '2.0'
|
155
|
+
none: false
|
156
|
+
prerelease: false
|
157
|
+
type: :development
|
117
158
|
description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-core gem contains core facilities and notably, does not contain such facilities as database-backed state management.
|
118
|
-
email:
|
119
|
-
|
120
|
-
executables:
|
121
|
-
|
122
|
-
|
159
|
+
email:
|
160
|
+
- dek-oss@gravitext.com
|
161
|
+
executables:
|
162
|
+
- iudex-test-config
|
163
|
+
- iudex-url-norm
|
123
164
|
extensions: []
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
165
|
+
extra_rdoc_files:
|
166
|
+
- History.rdoc
|
167
|
+
- README.rdoc
|
168
|
+
files:
|
169
|
+
- History.rdoc
|
170
|
+
- Manifest.txt
|
171
|
+
- README.rdoc
|
172
|
+
- Rakefile
|
173
|
+
- pom.xml
|
174
|
+
- bin/iudex-test-config
|
175
|
+
- bin/iudex-url-norm
|
176
|
+
- build/TLDSets.java.erb
|
177
|
+
- build/effective_tld_name.dat
|
178
|
+
- build/tld_set_generator.rb
|
179
|
+
- config/config.rb
|
180
|
+
- config/mojibake
|
181
|
+
- lib/iudex-core/base.rb
|
182
|
+
- lib/iudex-core.rb
|
183
|
+
- lib/iudex-core/config.rb
|
184
|
+
- lib/iudex-core/mojibake.rb
|
185
|
+
- lib/iudex-core/visit_queue.rb
|
186
|
+
- test/setup.rb
|
187
|
+
- test/test_charsets.rb
|
188
|
+
- test/test_content_fetcher.rb
|
189
|
+
- test/test_content_source.rb
|
190
|
+
- test/test_log_writer.rb
|
191
|
+
- test/test_mojibake.rb
|
192
|
+
- test/test_redirect_handler.rb
|
193
|
+
- test/test_visit_manager.rb
|
194
|
+
- test/test_visit_queue.rb
|
195
|
+
- test/test_visit_url.rb
|
196
|
+
- lib/iudex-core/iudex-core-1.3.0.jar
|
156
197
|
homepage: http://iudex.gravitext.com
|
157
198
|
licenses: []
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
199
|
+
post_install_message:
|
200
|
+
rdoc_options:
|
201
|
+
- --main
|
202
|
+
- README.rdoc
|
203
|
+
require_paths:
|
204
|
+
- lib
|
205
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
206
|
+
requirements:
|
207
|
+
- - ! '>='
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: '0'
|
210
|
+
segments:
|
211
|
+
- 0
|
212
|
+
hash: 2
|
166
213
|
none: false
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
214
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
215
|
+
requirements:
|
216
|
+
- - ! '>='
|
217
|
+
- !ruby/object:Gem::Version
|
218
|
+
version: '0'
|
219
|
+
segments:
|
220
|
+
- 0
|
221
|
+
hash: 2
|
175
222
|
none: false
|
176
|
-
requirements:
|
177
|
-
- - ">="
|
178
|
-
- !ruby/object:Gem::Version
|
179
|
-
hash: 2
|
180
|
-
segments:
|
181
|
-
- 0
|
182
|
-
version: "0"
|
183
223
|
requirements: []
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
signing_key:
|
224
|
+
rubyforge_project:
|
225
|
+
rubygems_version: 1.8.24
|
226
|
+
signing_key:
|
188
227
|
specification_version: 3
|
189
228
|
summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
|
190
229
|
test_files: []
|
191
|
-
|
230
|
+
...
|
Binary file
|