iudex-core 1.2.1-java → 1.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +11 -0
- data/Manifest.txt +2 -1
- data/Rakefile +1 -0
- data/build/effective_tld_name.dat +103 -21
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.3.0.jar +0 -0
- data/lib/iudex-core/visit_queue.rb +86 -0
- data/lib/iudex-core.rb +1 -0
- data/pom.xml +5 -5
- data/test/test_visit_manager.rb +5 -0
- data/test/test_visit_queue.rb +39 -3
- metadata +216 -177
- data/lib/iudex-core/iudex-core-1.2.1.jar +0 -0
data/History.rdoc
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
=== 1.3.0 (2012-11-8)
|
|
2
|
+
* Add DomainKey with optional :type to support configuring of
|
|
3
|
+
a :domain,:type specific HostQueue
|
|
4
|
+
* Add VisitQueue.config( options ) extensions for cleaner
|
|
5
|
+
configuration of HostQueue with type, rate, etc.
|
|
6
|
+
* Update TLDSets based on upstream c61f326ad19f 2012-10-25
|
|
7
|
+
* Upgrade to gravitext-util ~> 1.7.0
|
|
8
|
+
* Upgrade to slf4j ~> 1.7.0, logback ~> 1.5 (dev)
|
|
9
|
+
* Misc java logging simplifications with slf4j 1.7 varargs
|
|
10
|
+
* Expose GenericWorkPollStrategy.log()
|
|
11
|
+
|
|
1
12
|
=== 1.2.1 (2012-9-15)
|
|
2
13
|
* Upgrade/narrow to gravitext-util ~> 1.6.1
|
|
3
14
|
* Upgrade to slf4j [1.6.5,1.8), logback ~> 1.2 (dev)
|
data/Manifest.txt
CHANGED
|
@@ -14,6 +14,7 @@ lib/iudex-core/base.rb
|
|
|
14
14
|
lib/iudex-core.rb
|
|
15
15
|
lib/iudex-core/config.rb
|
|
16
16
|
lib/iudex-core/mojibake.rb
|
|
17
|
+
lib/iudex-core/visit_queue.rb
|
|
17
18
|
test/setup.rb
|
|
18
19
|
test/test_charsets.rb
|
|
19
20
|
test/test_content_fetcher.rb
|
|
@@ -24,4 +25,4 @@ test/test_redirect_handler.rb
|
|
|
24
25
|
test/test_visit_manager.rb
|
|
25
26
|
test/test_visit_queue.rb
|
|
26
27
|
test/test_visit_url.rb
|
|
27
|
-
lib/iudex-core/iudex-core-1.2.1.jar
|
|
28
|
+
lib/iudex-core/iudex-core-1.3.0.jar
|
data/Rakefile
CHANGED
|
@@ -18,6 +18,7 @@ task :clean do
|
|
|
18
18
|
rm_f 'src/main/java/iudex/core/TLDSets.java'
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
+
desc "Download and install latest effective_tld_name.dat"
|
|
21
22
|
task :refresh_tld_dat do
|
|
22
23
|
sh( "curl http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1" +
|
|
23
24
|
" -o build/effective_tld_name.dat" )
|
|
@@ -219,7 +219,6 @@ net.au
|
|
|
219
219
|
org.au
|
|
220
220
|
edu.au
|
|
221
221
|
gov.au
|
|
222
|
-
csiro.au
|
|
223
222
|
asn.au
|
|
224
223
|
id.au
|
|
225
224
|
// Historic 2LDs (closed to new registration, but sites still exist)
|
|
@@ -950,9 +949,15 @@ gov.gr
|
|
|
950
949
|
// gs : http://en.wikipedia.org/wiki/.gs
|
|
951
950
|
gs
|
|
952
951
|
|
|
953
|
-
// gt : http://www.gt/
|
|
954
|
-
|
|
955
|
-
|
|
952
|
+
// gt : http://www.gt/politicas_de_registro.html
|
|
953
|
+
gt
|
|
954
|
+
com.gt
|
|
955
|
+
edu.gt
|
|
956
|
+
gob.gt
|
|
957
|
+
ind.gt
|
|
958
|
+
mil.gt
|
|
959
|
+
net.gt
|
|
960
|
+
org.gt
|
|
956
961
|
|
|
957
962
|
// gu : http://gadao.gov.gu/registration.txt
|
|
958
963
|
*.gu
|
|
@@ -4166,6 +4171,7 @@ name.my
|
|
|
4166
4171
|
|
|
4167
4172
|
// mz : http://www.gobin.info/domainname/mz-template.doc
|
|
4168
4173
|
*.mz
|
|
4174
|
+
!teledata.mz
|
|
4169
4175
|
|
|
4170
4176
|
// na : http://www.na-nic.com.na/
|
|
4171
4177
|
// http://www.info.na/domain/
|
|
@@ -5359,8 +5365,17 @@ ed.pw
|
|
|
5359
5365
|
go.pw
|
|
5360
5366
|
belau.pw
|
|
5361
5367
|
|
|
5362
|
-
// py : http://www.nic.py/
|
|
5363
|
-
|
|
5368
|
+
// py : http://www.nic.py/pautas.html#seccion_9
|
|
5369
|
+
// Confirmed by registry 2012-10-03
|
|
5370
|
+
com.py
|
|
5371
|
+
coop.py
|
|
5372
|
+
edu.py
|
|
5373
|
+
gov.py
|
|
5374
|
+
mil.py
|
|
5375
|
+
net.py
|
|
5376
|
+
org.py
|
|
5377
|
+
!nic.py
|
|
5378
|
+
!una.py
|
|
5364
5379
|
|
|
5365
5380
|
// qa : http://domains.qa/en/
|
|
5366
5381
|
qa
|
|
@@ -5999,20 +6014,20 @@ com.ug
|
|
|
5999
6014
|
org.ug
|
|
6000
6015
|
|
|
6001
6016
|
// uk : http://en.wikipedia.org/wiki/.uk
|
|
6017
|
+
// Submitted by registry <noc@nominet.org.uk> 2012-10-02
|
|
6002
6018
|
*.uk
|
|
6019
|
+
*.nhs.uk
|
|
6020
|
+
*.police.uk
|
|
6003
6021
|
*.sch.uk
|
|
6004
6022
|
!bl.uk
|
|
6005
6023
|
!british-library.uk
|
|
6006
|
-
!icnet.uk
|
|
6007
6024
|
!jet.uk
|
|
6008
6025
|
!mod.uk
|
|
6026
|
+
!national-library-scotland.uk
|
|
6009
6027
|
!nel.uk
|
|
6010
|
-
!nhs.uk
|
|
6011
6028
|
!nic.uk
|
|
6012
6029
|
!nls.uk
|
|
6013
|
-
!national-library-scotland.uk
|
|
6014
6030
|
!parliament.uk
|
|
6015
|
-
!police.uk
|
|
6016
6031
|
|
|
6017
6032
|
// us : http://en.wikipedia.org/wiki/.us
|
|
6018
6033
|
us
|
|
@@ -6288,8 +6303,19 @@ gov.vc
|
|
|
6288
6303
|
mil.vc
|
|
6289
6304
|
edu.vc
|
|
6290
6305
|
|
|
6291
|
-
// ve :
|
|
6292
|
-
|
|
6306
|
+
// ve : https://registro.nic.ve/
|
|
6307
|
+
// Confirmed by registry 2012-10-04
|
|
6308
|
+
ve
|
|
6309
|
+
co.ve
|
|
6310
|
+
com.ve
|
|
6311
|
+
e12.ve
|
|
6312
|
+
edu.ve
|
|
6313
|
+
gov.ve
|
|
6314
|
+
info.ve
|
|
6315
|
+
mil.ve
|
|
6316
|
+
net.ve
|
|
6317
|
+
org.ve
|
|
6318
|
+
web.ve
|
|
6293
6319
|
|
|
6294
6320
|
// vg : http://en.wikipedia.org/wiki/.vg
|
|
6295
6321
|
vg
|
|
@@ -6529,15 +6555,20 @@ priv.at
|
|
|
6529
6555
|
co.ca
|
|
6530
6556
|
|
|
6531
6557
|
// CentralNic : http://www.centralnic.com/names/domains
|
|
6532
|
-
// Confirmed by registry <gavin.brown@centralnic.com>
|
|
6558
|
+
// Confirmed by registry <gavin.brown@centralnic.com> 2012-09-27
|
|
6559
|
+
ae.org
|
|
6533
6560
|
ar.com
|
|
6534
6561
|
br.com
|
|
6535
6562
|
cn.com
|
|
6563
|
+
com.de
|
|
6536
6564
|
de.com
|
|
6537
6565
|
eu.com
|
|
6538
6566
|
gb.com
|
|
6567
|
+
gb.net
|
|
6539
6568
|
gr.com
|
|
6540
6569
|
hu.com
|
|
6570
|
+
hu.net
|
|
6571
|
+
jp.net
|
|
6541
6572
|
jpn.com
|
|
6542
6573
|
kr.com
|
|
6543
6574
|
no.com
|
|
@@ -6545,25 +6576,68 @@ qc.com
|
|
|
6545
6576
|
ru.com
|
|
6546
6577
|
sa.com
|
|
6547
6578
|
se.com
|
|
6579
|
+
se.net
|
|
6548
6580
|
uk.com
|
|
6581
|
+
uk.net
|
|
6549
6582
|
us.com
|
|
6583
|
+
us.org
|
|
6550
6584
|
uy.com
|
|
6551
6585
|
za.com
|
|
6552
|
-
gb.net
|
|
6553
|
-
jp.net
|
|
6554
|
-
se.net
|
|
6555
|
-
uk.net
|
|
6556
|
-
ae.org
|
|
6557
|
-
us.org
|
|
6558
|
-
com.de
|
|
6559
6586
|
|
|
6560
6587
|
// Opera Software, A.S.A.
|
|
6561
6588
|
// Requested by Yngve Pettersen <yngve@opera.com> 2009-11-26
|
|
6562
6589
|
operaunite.com
|
|
6563
6590
|
|
|
6564
6591
|
// Google, Inc.
|
|
6565
|
-
// Requested by Eduardo Vela <evn@google.com>
|
|
6592
|
+
// Requested by Eduardo Vela <evn@google.com> 2012-10-24
|
|
6566
6593
|
appspot.com
|
|
6594
|
+
blogspot.be
|
|
6595
|
+
blogspot.bj
|
|
6596
|
+
blogspot.ca
|
|
6597
|
+
blogspot.cf
|
|
6598
|
+
blogspot.ch
|
|
6599
|
+
blogspot.co.at
|
|
6600
|
+
blogspot.co.il
|
|
6601
|
+
blogspot.co.nz
|
|
6602
|
+
blogspot.co.uk
|
|
6603
|
+
blogspot.com
|
|
6604
|
+
blogspot.com.ar
|
|
6605
|
+
blogspot.com.au
|
|
6606
|
+
blogspot.com.br
|
|
6607
|
+
blogspot.com.es
|
|
6608
|
+
blogspot.cv
|
|
6609
|
+
blogspot.cz
|
|
6610
|
+
blogspot.de
|
|
6611
|
+
blogspot.dk
|
|
6612
|
+
blogspot.fi
|
|
6613
|
+
blogspot.fr
|
|
6614
|
+
blogspot.gr
|
|
6615
|
+
blogspot.hk
|
|
6616
|
+
blogspot.hu
|
|
6617
|
+
blogspot.ie
|
|
6618
|
+
blogspot.in
|
|
6619
|
+
blogspot.it
|
|
6620
|
+
blogspot.jp
|
|
6621
|
+
blogspot.kr
|
|
6622
|
+
blogspot.mr
|
|
6623
|
+
blogspot.mx
|
|
6624
|
+
blogspot.nl
|
|
6625
|
+
blogspot.no
|
|
6626
|
+
blogspot.pt
|
|
6627
|
+
blogspot.re
|
|
6628
|
+
blogspot.ro
|
|
6629
|
+
blogspot.se
|
|
6630
|
+
blogspot.sg
|
|
6631
|
+
blogspot.sk
|
|
6632
|
+
blogspot.td
|
|
6633
|
+
blogspot.tw
|
|
6634
|
+
codespot.com
|
|
6635
|
+
googleapis.com
|
|
6636
|
+
googlecode.com
|
|
6637
|
+
|
|
6638
|
+
// DreamHost : http://www.dreamhost.com/
|
|
6639
|
+
// Requested by Andrew Farmer <andrew.farmer@dreamhost.com> 2012-10-02
|
|
6640
|
+
dreamhosters.com
|
|
6567
6641
|
|
|
6568
6642
|
// iki.fi : Submitted by Hannu Aronsson <haa@iki.fi> 2009-11-05
|
|
6569
6643
|
iki.fi
|
|
@@ -6865,4 +6939,12 @@ webhop.org
|
|
|
6865
6939
|
worse-than.tv
|
|
6866
6940
|
writesthisblog.com
|
|
6867
6941
|
|
|
6942
|
+
// BetaInABox
|
|
6943
|
+
// Requested by adrian@betainabox.com 2012-09-13
|
|
6944
|
+
betainabox.com
|
|
6945
|
+
|
|
6946
|
+
// Red Hat, Inc. OpenShift : https://openshift.redhat.com/
|
|
6947
|
+
// Requested by Tim Kramer <tkramer@rhcloud.com> 2012-10-24
|
|
6948
|
+
rhcloud.com
|
|
6949
|
+
|
|
6868
6950
|
// ===END PRIVATE DOMAINS===
|
data/lib/iudex-core/base.rb
CHANGED
|
Binary file
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#--
|
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
5
|
+
# may not use this file except in compliance with the License. You
|
|
6
|
+
# may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
13
|
+
# implied. See the License for the specific language governing
|
|
14
|
+
# permissions and limitations under the License.
|
|
15
|
+
#++
|
|
16
|
+
|
|
17
|
+
require 'iudex-core'
|
|
18
|
+
|
|
19
|
+
module Iudex::Core
|
|
20
|
+
|
|
21
|
+
# Configuration extensions for Java::iudex.core.VisitQueue.
|
|
22
|
+
class VisitQueue
|
|
23
|
+
|
|
24
|
+
# Configure defaults, a specific domain or domain,type pair via an
|
|
25
|
+
# options Hash.
|
|
26
|
+
#
|
|
27
|
+
# ==== Options
|
|
28
|
+
#
|
|
29
|
+
# :domain:: Registration level domain String. If not specified,
|
|
30
|
+
# :type is ignored and other options apply as general
|
|
31
|
+
# defaults for all (otherwise un-configured
|
|
32
|
+
# domains/types).
|
|
33
|
+
#
|
|
34
|
+
# :type:: An optional type (i.e. PAGE). If specified, this
|
|
35
|
+
# :domain,:type pair will be given its own HostQueue with
|
|
36
|
+
# the other options applying exclusively to it.
|
|
37
|
+
#
|
|
38
|
+
# :rate:: Target maximum rate of crawl as a Float requests/second
|
|
39
|
+
# for this :domain(,:type) or the default for any not
|
|
40
|
+
# otherwise configured. Resource limits including :cons
|
|
41
|
+
# and HTTP client connections may further inhibit rate
|
|
42
|
+
# below this value. (Initial default is 2.0 req/second)
|
|
43
|
+
#
|
|
44
|
+
# :delay:: Alternative inverse to :rate as Integer milliseconds to
|
|
45
|
+
# delay between scheduling visits. If specified, takes
|
|
46
|
+
# precedence over rate.
|
|
47
|
+
#
|
|
48
|
+
# :cons:: Maximum number of concurrent requests for this
|
|
49
|
+
# :domain(,:type) or the default for any not otherwise
|
|
50
|
+
# configured. Note that the HTTP clients have their own
|
|
51
|
+
# per *host:port* destination connection limit which
|
|
52
|
+
# should generally be set higher than this value.
|
|
53
|
+
# (Initial default: 1)
|
|
54
|
+
#
|
|
55
|
+
def config( opts = {} )
|
|
56
|
+
|
|
57
|
+
if opts[ :domain ]
|
|
58
|
+
opts = { :rate => delay_to_rate( default_min_host_delay ),
|
|
59
|
+
:cons => default_max_access_per_host }.merge( opts )
|
|
60
|
+
configure_host( opts[ :domain ],
|
|
61
|
+
opts[ :type ], # includes nil
|
|
62
|
+
opts[ :delay ] || rate_to_delay( opts[ :rate ] ),
|
|
63
|
+
opts[ :cons ] )
|
|
64
|
+
else
|
|
65
|
+
if opts[ :rate ]
|
|
66
|
+
self.default_min_host_delay = rate_to_delay( opts[ :rate ] )
|
|
67
|
+
end
|
|
68
|
+
self.default_min_host_delay = opts[ :delay ] if opts[ :delay ]
|
|
69
|
+
self.default_max_access_per_host = opts[ :cons ] if opts[ :cons ]
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
private
|
|
75
|
+
|
|
76
|
+
def rate_to_delay( r )
|
|
77
|
+
( 1_000.0 / r ).round
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def delay_to_rate( d )
|
|
81
|
+
( 1_000.0 / d )
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
end
|
data/lib/iudex-core.rb
CHANGED
data/pom.xml
CHANGED
|
@@ -5,13 +5,13 @@
|
|
|
5
5
|
<groupId>iudex</groupId>
|
|
6
6
|
<artifactId>iudex-core</artifactId>
|
|
7
7
|
<packaging>jar</packaging>
|
|
8
|
-
<version>1.2.1</version>
|
|
8
|
+
<version>1.3.0</version>
|
|
9
9
|
<name>Iudex Core System</name>
|
|
10
10
|
|
|
11
11
|
<parent>
|
|
12
12
|
<groupId>iudex</groupId>
|
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
|
14
|
-
<version>1.
|
|
14
|
+
<version>1.3.0</version>
|
|
15
15
|
<relativePath>..</relativePath>
|
|
16
16
|
</parent>
|
|
17
17
|
|
|
@@ -30,19 +30,19 @@
|
|
|
30
30
|
<dependency>
|
|
31
31
|
<groupId>iudex</groupId>
|
|
32
32
|
<artifactId>iudex-filter</artifactId>
|
|
33
|
-
<version>[1.
|
|
33
|
+
<version>[1.3.0,1.3.999)</version>
|
|
34
34
|
</dependency>
|
|
35
35
|
|
|
36
36
|
<dependency>
|
|
37
37
|
<groupId>iudex</groupId>
|
|
38
38
|
<artifactId>iudex-http</artifactId>
|
|
39
|
-
<version>[1.
|
|
39
|
+
<version>[1.3.0,1.3.999)</version>
|
|
40
40
|
</dependency>
|
|
41
41
|
|
|
42
42
|
<dependency>
|
|
43
43
|
<groupId>iudex</groupId>
|
|
44
44
|
<artifactId>iudex-barc</artifactId>
|
|
45
|
-
<version>[1.
|
|
45
|
+
<version>[1.3.0,1.3.999)</version>
|
|
46
46
|
</dependency>
|
|
47
47
|
|
|
48
48
|
<dependency>
|
data/test/test_visit_manager.rb
CHANGED
data/test/test_visit_queue.rb
CHANGED
|
@@ -34,7 +34,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
|
34
34
|
|
|
35
35
|
def setup
|
|
36
36
|
@visit_q = VisitQueue.new
|
|
37
|
-
@visit_q.
|
|
37
|
+
@visit_q.config( :delay => 50 ) #ms
|
|
38
38
|
@scheduler = Executors::new_scheduled_thread_pool( 2 )
|
|
39
39
|
end
|
|
40
40
|
|
|
@@ -96,7 +96,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
|
96
96
|
end
|
|
97
97
|
|
|
98
98
|
def test_configure
|
|
99
|
-
@visit_q.
|
|
99
|
+
@visit_q.config( :domain => 'h2.com', :delay => 75, :cons => 2 )
|
|
100
100
|
|
|
101
101
|
[ %w[ h2 a 2.2 ],
|
|
102
102
|
%w[ w.h2 b 2.1 ],
|
|
@@ -125,6 +125,39 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
|
125
125
|
assert_queue_empty
|
|
126
126
|
end
|
|
127
127
|
|
|
128
|
+
def test_configure_type
|
|
129
|
+
@visit_q.config( :domain => 'h2.com',
|
|
130
|
+
:delay => 75, :cons => 2 )
|
|
131
|
+
@visit_q.config( :domain => 'h2.com', :type => 'ALT',
|
|
132
|
+
:delay => 50, :cons => 1 )
|
|
133
|
+
|
|
134
|
+
[ %w[ h2 a 2.2 ],
|
|
135
|
+
%w[ w.h2 b 2.1 ],
|
|
136
|
+
%w[ h2:ALT c 3.2 ],
|
|
137
|
+
%w[ h2:ALT d 3.1 ],
|
|
138
|
+
%w[ h1 a 1.2 ],
|
|
139
|
+
%w[ h1 b 1.1 ] ].each do |oinp|
|
|
140
|
+
|
|
141
|
+
@visit_q.add( order( oinp ) )
|
|
142
|
+
|
|
143
|
+
end
|
|
144
|
+
assert_equal( 3, @visit_q.host_count, "host count" )
|
|
145
|
+
|
|
146
|
+
expected = [ %w[ h2:ALT c 3.2 ],
|
|
147
|
+
%w[ h2 a 2.2 ],
|
|
148
|
+
%w[ h1 a 1.2 ],
|
|
149
|
+
%w[ h2:ALT d 3.1 ],
|
|
150
|
+
%w[ h1 b 1.1 ],
|
|
151
|
+
%w[ w.h2 b 2.1 ] ]
|
|
152
|
+
|
|
153
|
+
p = 0
|
|
154
|
+
expected.each do |o|
|
|
155
|
+
assert_equal( o, acquire_order, p += 1 )
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
assert_queue_empty
|
|
159
|
+
end
|
|
160
|
+
|
|
128
161
|
def test_multi_access_2
|
|
129
162
|
@visit_q.default_max_access_per_host = 2
|
|
130
163
|
add_common_orders
|
|
@@ -170,7 +203,7 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
|
170
203
|
def test_interleaved
|
|
171
204
|
@visit_q.default_max_access_per_host = 2
|
|
172
205
|
@visit_q.default_min_host_delay = 3 #ms
|
|
173
|
-
@visit_q.
|
|
206
|
+
@visit_q.config( :domain => 'h2.com', :delay => 1, :cons => 4 )
|
|
174
207
|
|
|
175
208
|
512.times do |i|
|
|
176
209
|
@visit_q.add( order( [ %w[ h1 h2 ][rand( 2 )], i, 5 * rand ] ) )
|
|
@@ -222,10 +255,13 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
|
222
255
|
|
|
223
256
|
def order( args )
|
|
224
257
|
host, c, p = args
|
|
258
|
+
host, t = host.split( ':' )
|
|
259
|
+
|
|
225
260
|
UniMap.new.tap do |o|
|
|
226
261
|
o.url = visit_url( "http://#{host}.com/#{c}" )
|
|
227
262
|
o.priority = p.to_f
|
|
228
263
|
o.vtest_input = args
|
|
264
|
+
o.type = t || 'PAGE'
|
|
229
265
|
end
|
|
230
266
|
end
|
|
231
267
|
|
metadata
CHANGED
|
@@ -1,191 +1,230 @@
|
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iudex-core
|
|
3
|
-
version: !ruby/object:Gem::Version
|
|
4
|
-
prerelease:
|
|
5
|
-
version: 1.2.1
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 1.3.0
|
|
6
6
|
platform: java
|
|
7
|
-
authors:
|
|
8
|
-
|
|
9
|
-
autorequire:
|
|
7
|
+
authors:
|
|
8
|
+
- David Kellum
|
|
9
|
+
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
12
|
+
date: 2012-11-08 00:00:00.000000000 Z
|
|
13
|
+
dependencies:
|
|
14
|
+
- !ruby/object:Gem::Dependency
|
|
15
|
+
name: rjack-slf4j
|
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
17
|
+
requirements:
|
|
18
|
+
- - ~>
|
|
19
|
+
- !ruby/object:Gem::Version
|
|
20
|
+
version: 1.7.0
|
|
21
|
+
none: false
|
|
22
|
+
requirement: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: 1.7.0
|
|
27
|
+
none: false
|
|
28
|
+
prerelease: false
|
|
29
|
+
type: :runtime
|
|
30
|
+
- !ruby/object:Gem::Dependency
|
|
31
|
+
name: hooker
|
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
33
|
+
requirements:
|
|
34
|
+
- - ~>
|
|
35
|
+
- !ruby/object:Gem::Version
|
|
36
|
+
version: 1.0.0
|
|
37
|
+
none: false
|
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
|
39
|
+
requirements:
|
|
40
|
+
- - ~>
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
version: 1.0.0
|
|
43
|
+
none: false
|
|
44
|
+
prerelease: false
|
|
45
|
+
type: :runtime
|
|
46
|
+
- !ruby/object:Gem::Dependency
|
|
47
|
+
name: gravitext-util
|
|
48
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
49
|
+
requirements:
|
|
50
|
+
- - ~>
|
|
51
|
+
- !ruby/object:Gem::Version
|
|
52
|
+
version: 1.7.0
|
|
53
|
+
none: false
|
|
54
|
+
requirement: !ruby/object:Gem::Requirement
|
|
55
|
+
requirements:
|
|
56
|
+
- - ~>
|
|
57
|
+
- !ruby/object:Gem::Version
|
|
58
|
+
version: 1.7.0
|
|
59
|
+
none: false
|
|
60
|
+
prerelease: false
|
|
61
|
+
type: :runtime
|
|
62
|
+
- !ruby/object:Gem::Dependency
|
|
63
|
+
name: iudex-filter
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ~>
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: 1.3.0
|
|
69
|
+
none: false
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ~>
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 1.3.0
|
|
75
|
+
none: false
|
|
76
|
+
prerelease: false
|
|
77
|
+
type: :runtime
|
|
78
|
+
- !ruby/object:Gem::Dependency
|
|
79
|
+
name: iudex-http
|
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
81
|
+
requirements:
|
|
82
|
+
- - ~>
|
|
83
|
+
- !ruby/object:Gem::Version
|
|
84
|
+
version: 1.3.0
|
|
85
|
+
none: false
|
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
|
87
|
+
requirements:
|
|
88
|
+
- - ~>
|
|
89
|
+
- !ruby/object:Gem::Version
|
|
90
|
+
version: 1.3.0
|
|
91
|
+
none: false
|
|
92
|
+
prerelease: false
|
|
93
|
+
type: :runtime
|
|
94
|
+
- !ruby/object:Gem::Dependency
|
|
95
|
+
name: iudex-barc
|
|
96
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
97
|
+
requirements:
|
|
98
|
+
- - ~>
|
|
99
|
+
- !ruby/object:Gem::Version
|
|
100
|
+
version: 1.3.0
|
|
101
|
+
none: false
|
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
|
103
|
+
requirements:
|
|
104
|
+
- - ~>
|
|
105
|
+
- !ruby/object:Gem::Version
|
|
106
|
+
version: 1.3.0
|
|
107
|
+
none: false
|
|
108
|
+
prerelease: false
|
|
109
|
+
type: :runtime
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: minitest
|
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - ~>
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '2.3'
|
|
117
|
+
none: false
|
|
118
|
+
requirement: !ruby/object:Gem::Requirement
|
|
119
|
+
requirements:
|
|
120
|
+
- - ~>
|
|
121
|
+
- !ruby/object:Gem::Version
|
|
122
|
+
version: '2.3'
|
|
123
|
+
none: false
|
|
124
|
+
prerelease: false
|
|
125
|
+
type: :development
|
|
126
|
+
- !ruby/object:Gem::Dependency
|
|
127
|
+
name: rjack-logback
|
|
128
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
129
|
+
requirements:
|
|
130
|
+
- - ~>
|
|
131
|
+
- !ruby/object:Gem::Version
|
|
132
|
+
version: '1.5'
|
|
133
|
+
none: false
|
|
134
|
+
requirement: !ruby/object:Gem::Requirement
|
|
135
|
+
requirements:
|
|
136
|
+
- - ~>
|
|
137
|
+
- !ruby/object:Gem::Version
|
|
138
|
+
version: '1.5'
|
|
139
|
+
none: false
|
|
140
|
+
prerelease: false
|
|
141
|
+
type: :development
|
|
142
|
+
- !ruby/object:Gem::Dependency
|
|
143
|
+
name: rjack-tarpit
|
|
144
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
145
|
+
requirements:
|
|
146
|
+
- - ~>
|
|
147
|
+
- !ruby/object:Gem::Version
|
|
148
|
+
version: '2.0'
|
|
149
|
+
none: false
|
|
150
|
+
requirement: !ruby/object:Gem::Requirement
|
|
151
|
+
requirements:
|
|
152
|
+
- - ~>
|
|
153
|
+
- !ruby/object:Gem::Version
|
|
154
|
+
version: '2.0'
|
|
155
|
+
none: false
|
|
156
|
+
prerelease: false
|
|
157
|
+
type: :development
|
|
117
158
|
description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-core gem contains core facilities and notably, does not contain such facilities as database-backed state management.
|
|
118
|
-
email:
|
|
119
|
-
|
|
120
|
-
executables:
|
|
121
|
-
|
|
122
|
-
|
|
159
|
+
email:
|
|
160
|
+
- dek-oss@gravitext.com
|
|
161
|
+
executables:
|
|
162
|
+
- iudex-test-config
|
|
163
|
+
- iudex-url-norm
|
|
123
164
|
extensions: []
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
165
|
+
extra_rdoc_files:
|
|
166
|
+
- History.rdoc
|
|
167
|
+
- README.rdoc
|
|
168
|
+
files:
|
|
169
|
+
- History.rdoc
|
|
170
|
+
- Manifest.txt
|
|
171
|
+
- README.rdoc
|
|
172
|
+
- Rakefile
|
|
173
|
+
- pom.xml
|
|
174
|
+
- bin/iudex-test-config
|
|
175
|
+
- bin/iudex-url-norm
|
|
176
|
+
- build/TLDSets.java.erb
|
|
177
|
+
- build/effective_tld_name.dat
|
|
178
|
+
- build/tld_set_generator.rb
|
|
179
|
+
- config/config.rb
|
|
180
|
+
- config/mojibake
|
|
181
|
+
- lib/iudex-core/base.rb
|
|
182
|
+
- lib/iudex-core.rb
|
|
183
|
+
- lib/iudex-core/config.rb
|
|
184
|
+
- lib/iudex-core/mojibake.rb
|
|
185
|
+
- lib/iudex-core/visit_queue.rb
|
|
186
|
+
- test/setup.rb
|
|
187
|
+
- test/test_charsets.rb
|
|
188
|
+
- test/test_content_fetcher.rb
|
|
189
|
+
- test/test_content_source.rb
|
|
190
|
+
- test/test_log_writer.rb
|
|
191
|
+
- test/test_mojibake.rb
|
|
192
|
+
- test/test_redirect_handler.rb
|
|
193
|
+
- test/test_visit_manager.rb
|
|
194
|
+
- test/test_visit_queue.rb
|
|
195
|
+
- test/test_visit_url.rb
|
|
196
|
+
- lib/iudex-core/iudex-core-1.3.0.jar
|
|
156
197
|
homepage: http://iudex.gravitext.com
|
|
157
198
|
licenses: []
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
199
|
+
post_install_message:
|
|
200
|
+
rdoc_options:
|
|
201
|
+
- --main
|
|
202
|
+
- README.rdoc
|
|
203
|
+
require_paths:
|
|
204
|
+
- lib
|
|
205
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
206
|
+
requirements:
|
|
207
|
+
- - ! '>='
|
|
208
|
+
- !ruby/object:Gem::Version
|
|
209
|
+
version: '0'
|
|
210
|
+
segments:
|
|
211
|
+
- 0
|
|
212
|
+
hash: 2
|
|
166
213
|
none: false
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
214
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
215
|
+
requirements:
|
|
216
|
+
- - ! '>='
|
|
217
|
+
- !ruby/object:Gem::Version
|
|
218
|
+
version: '0'
|
|
219
|
+
segments:
|
|
220
|
+
- 0
|
|
221
|
+
hash: 2
|
|
175
222
|
none: false
|
|
176
|
-
requirements:
|
|
177
|
-
- - ">="
|
|
178
|
-
- !ruby/object:Gem::Version
|
|
179
|
-
hash: 2
|
|
180
|
-
segments:
|
|
181
|
-
- 0
|
|
182
|
-
version: "0"
|
|
183
223
|
requirements: []
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
signing_key:
|
|
224
|
+
rubyforge_project:
|
|
225
|
+
rubygems_version: 1.8.24
|
|
226
|
+
signing_key:
|
|
188
227
|
specification_version: 3
|
|
189
228
|
summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
|
|
190
229
|
test_files: []
|
|
191
|
-
|
|
230
|
+
...
|
|
Binary file
|