iudex-core 1.3.1-java → 1.4.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +26 -0
- data/Manifest.txt +1 -1
- data/README.rdoc +1 -1
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +1 -1
- data/build/TLDSets.java.erb +1 -1
- data/build/effective_tld_name.dat +215 -104
- data/build/tld_set_generator.rb +1 -1
- data/lib/iudex-core.rb +1 -1
- data/lib/iudex-core/base.rb +2 -2
- data/lib/iudex-core/config.rb +1 -1
- data/lib/iudex-core/iudex-core-1.4.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +1 -1
- data/lib/iudex-core/visit_queue.rb +1 -1
- data/pom.xml +5 -5
- data/test/setup.rb +1 -1
- data/test/test_charsets.rb +1 -1
- data/test/test_content_fetcher.rb +24 -2
- data/test/test_content_source.rb +1 -1
- data/test/test_log_writer.rb +1 -1
- data/test/test_mojibake.rb +1 -1
- data/test/test_redirect_handler.rb +1 -1
- data/test/test_visit_manager.rb +4 -2
- data/test/test_visit_queue.rb +96 -1
- data/test/test_visit_url.rb +1 -1
- metadata +11 -11
- data/lib/iudex-core/iudex-core-1.3.1.jar +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,29 @@
|
|
1
|
+
=== 1.4.0 (2013-10-29)
|
2
|
+
* Work order reservation and experimental concurrent (no replace) work
|
3
|
+
polling support:
|
4
|
+
* Add WorkPollStrategy.discard hook in support of un-reserving a
|
5
|
+
replaced VisitQueue.
|
6
|
+
* VisitManager and GenericWorkPollStrategy changes for discard and
|
7
|
+
support of false shouldReplaceQueue
|
8
|
+
* VisitManager.doWaitOnGeneration default true as this is a safety
|
9
|
+
requirement when replacing the queue. When not replacing the queue
|
10
|
+
no wait occurs.
|
11
|
+
* Add VisitQueue.maxAccessTotal and associated waiting to avoid over
|
12
|
+
committing to too many concurrent HTML requests when using an
|
13
|
+
asynchronous HTTP client and many independent domains are
|
14
|
+
available. By default there is no limit, but it is recommended to set this
|
15
|
+
to some multiple of the VisitManager threads. For an example of the
|
16
|
+
problem this avoids: full HTML parsing can saturate a single CPU
|
17
|
+
and will cause some number of, say, 600 concurrent HTTP requests to
|
18
|
+
time out given insufficient CPU/threads to handle pending reads into
|
19
|
+
memory.
|
20
|
+
* Add pre-set, dynamic REQUEST_HEADERS support to ContentFetcher. Any
|
21
|
+
request headers found here override the statically set headers at
|
22
|
+
initialization.
|
23
|
+
* Update TLDSets based on upstream 06c405ba54b3 2013-09-17
|
24
|
+
* Upgrade to iudex-* ~> 1.4.0 dependencies
|
25
|
+
* Upgrade to minitest ~> 4.7.4 (dev)
|
26
|
+
|
1
27
|
=== 1.3.1 (2012-11-15)
|
2
28
|
* Fix bugs in using/preserving configuration with explicit :type
|
3
29
|
specified (new in 1.3.0)
|
data/Manifest.txt
CHANGED
data/README.rdoc
CHANGED
@@ -11,7 +11,7 @@ does not contain such facilities as database-backed state management.
|
|
11
11
|
|
12
12
|
== License
|
13
13
|
|
14
|
-
Copyright (c) 2008-
|
14
|
+
Copyright (c) 2008-2013 David Kellum
|
15
15
|
|
16
16
|
Licensed under the Apache License, Version 2.0 (the "License"); you
|
17
17
|
may not use this file except in compliance with the License. You
|
data/bin/iudex-test-config
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/bin/iudex-url-norm
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
2
|
# -*- ruby -*-
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2008-
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
5
5
|
#
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
7
|
# may not use this file except in compliance with the License. You
|
data/build/TLDSets.java.erb
CHANGED
@@ -175,17 +175,16 @@ it.ao
|
|
175
175
|
// aq : http://en.wikipedia.org/wiki/.aq
|
176
176
|
aq
|
177
177
|
|
178
|
-
// ar :
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
!uba.ar
|
178
|
+
// ar : https://nic.ar/normativa-vigente.xhtml
|
179
|
+
ar
|
180
|
+
com.ar
|
181
|
+
edu.ar
|
182
|
+
gob.ar
|
183
|
+
int.ar
|
184
|
+
mil.ar
|
185
|
+
net.ar
|
186
|
+
org.ar
|
187
|
+
tur.ar
|
189
188
|
|
190
189
|
// arpa : http://en.wikipedia.org/wiki/.arpa
|
191
190
|
// Confirmed by registry <iana-questions@icann.org> 2008-06-18
|
@@ -221,6 +220,7 @@ edu.au
|
|
221
220
|
gov.au
|
222
221
|
asn.au
|
223
222
|
id.au
|
223
|
+
csiro.au
|
224
224
|
// Historic 2LDs (closed to new registration, but sites still exist)
|
225
225
|
info.au
|
226
226
|
conf.au
|
@@ -691,6 +691,14 @@ inf.cu
|
|
691
691
|
// cv : http://en.wikipedia.org/wiki/.cv
|
692
692
|
cv
|
693
693
|
|
694
|
+
// cw : http://www.una.cw/cw_registry/
|
695
|
+
// Confirmed by registry <registry@una.net> 2013-03-26
|
696
|
+
cw
|
697
|
+
com.cw
|
698
|
+
edu.cw
|
699
|
+
net.cw
|
700
|
+
org.cw
|
701
|
+
|
694
702
|
// cx : http://en.wikipedia.org/wiki/.cx
|
695
703
|
// list of other 2nd level tlds ?
|
696
704
|
cx
|
@@ -1071,13 +1079,14 @@ tozsde.hu
|
|
1071
1079
|
utazas.hu
|
1072
1080
|
video.hu
|
1073
1081
|
|
1074
|
-
// id :
|
1075
|
-
// see also: https://register.pandi.or.id/
|
1082
|
+
// id : https://register.pandi.or.id/
|
1076
1083
|
id
|
1077
1084
|
ac.id
|
1085
|
+
biz.id
|
1078
1086
|
co.id
|
1079
1087
|
go.id
|
1080
1088
|
mil.id
|
1089
|
+
my.id
|
1081
1090
|
net.id
|
1082
1091
|
or.id
|
1083
1092
|
sch.id
|
@@ -4221,13 +4230,16 @@ other.nf
|
|
4221
4230
|
store.nf
|
4222
4231
|
|
4223
4232
|
// ng : http://psg.com/dns/ng/
|
4224
|
-
|
4225
|
-
ac.ng
|
4233
|
+
ng
|
4226
4234
|
com.ng
|
4227
4235
|
edu.ng
|
4228
|
-
|
4236
|
+
name.ng
|
4229
4237
|
net.ng
|
4230
4238
|
org.ng
|
4239
|
+
sch.ng
|
4240
|
+
gov.ng
|
4241
|
+
mil.ng
|
4242
|
+
mobi.ng
|
4231
4243
|
|
4232
4244
|
// ni : http://www.nic.ni/dominios.htm
|
4233
4245
|
*.ni
|
@@ -5028,7 +5040,16 @@ nu
|
|
5028
5040
|
*.nz
|
5029
5041
|
|
5030
5042
|
// om : http://en.wikipedia.org/wiki/.om
|
5031
|
-
|
5043
|
+
om
|
5044
|
+
co.om
|
5045
|
+
com.om
|
5046
|
+
edu.om
|
5047
|
+
gov.om
|
5048
|
+
med.om
|
5049
|
+
museum.om
|
5050
|
+
net.om
|
5051
|
+
org.om
|
5052
|
+
pro.om
|
5032
5053
|
!mediaphone.om
|
5033
5054
|
!nawrastelecom.om
|
5034
5055
|
!nawras.om
|
@@ -5307,6 +5328,9 @@ org.pn
|
|
5307
5328
|
edu.pn
|
5308
5329
|
net.pn
|
5309
5330
|
|
5331
|
+
// post : http://en.wikipedia.org/wiki/.post
|
5332
|
+
post
|
5333
|
+
|
5310
5334
|
// pr : http://www.nic.pr/index.asp?f=1
|
5311
5335
|
pr
|
5312
5336
|
com.pr
|
@@ -5367,6 +5391,7 @@ belau.pw
|
|
5367
5391
|
|
5368
5392
|
// py : http://www.nic.py/pautas.html#seccion_9
|
5369
5393
|
// Confirmed by registry 2012-10-03
|
5394
|
+
py
|
5370
5395
|
com.py
|
5371
5396
|
coop.py
|
5372
5397
|
edu.py
|
@@ -5374,8 +5399,6 @@ gov.py
|
|
5374
5399
|
mil.py
|
5375
5400
|
net.py
|
5376
5401
|
org.py
|
5377
|
-
!nic.py
|
5378
|
-
!una.py
|
5379
5402
|
|
5380
5403
|
// qa : http://domains.qa/en/
|
5381
5404
|
qa
|
@@ -5730,8 +5753,13 @@ store.st
|
|
5730
5753
|
// su : http://en.wikipedia.org/wiki/.su
|
5731
5754
|
su
|
5732
5755
|
|
5733
|
-
// sv : http://www.svnet.org.sv/
|
5734
|
-
|
5756
|
+
// sv : http://www.svnet.org.sv/niveldos.pdf
|
5757
|
+
sv
|
5758
|
+
com.sv
|
5759
|
+
edu.sv
|
5760
|
+
gob.sv
|
5761
|
+
org.sv
|
5762
|
+
red.sv
|
5735
5763
|
|
5736
5764
|
// sx : http://en.wikipedia.org/wiki/.sx
|
5737
5765
|
// Confirmed by registry <jcvignes@openregistry.com> 2012-05-31
|
@@ -5904,16 +5932,20 @@ club.tw
|
|
5904
5932
|
組織.tw
|
5905
5933
|
商業.tw
|
5906
5934
|
|
5907
|
-
// tz : http://
|
5908
|
-
//
|
5909
|
-
// Updated from http://www.tznic.or.tz/index.php/domains.html 2010-10-25
|
5935
|
+
// tz : http://www.tznic.or.tz/index.php/domains
|
5936
|
+
// Confirmed by registry <manager@tznic.or.tz> 2013-01-22
|
5910
5937
|
ac.tz
|
5911
5938
|
co.tz
|
5912
5939
|
go.tz
|
5940
|
+
hotel.tz
|
5941
|
+
info.tz
|
5942
|
+
me.tz
|
5913
5943
|
mil.tz
|
5944
|
+
mobi.tz
|
5914
5945
|
ne.tz
|
5915
5946
|
or.tz
|
5916
5947
|
sc.tz
|
5948
|
+
tv.tz
|
5917
5949
|
|
5918
5950
|
// ua : https://hostmaster.ua/policy/?ua
|
5919
5951
|
// Submitted by registry <dk@cctld.ua> 2012-04-27
|
@@ -6015,9 +6047,8 @@ org.ug
|
|
6015
6047
|
|
6016
6048
|
// uk : http://en.wikipedia.org/wiki/.uk
|
6017
6049
|
// Submitted by registry <noc@nominet.org.uk> 2012-10-02
|
6050
|
+
// and tweaked by us pending further consultation.
|
6018
6051
|
*.uk
|
6019
|
-
*.nhs.uk
|
6020
|
-
*.police.uk
|
6021
6052
|
*.sch.uk
|
6022
6053
|
!bl.uk
|
6023
6054
|
!british-library.uk
|
@@ -6543,19 +6574,62 @@ xxx
|
|
6543
6574
|
// ===END ICANN DOMAINS===
|
6544
6575
|
// ===BEGIN PRIVATE DOMAINS===
|
6545
6576
|
|
6546
|
-
//
|
6547
|
-
|
6548
|
-
|
6549
|
-
|
6550
|
-
//
|
6551
|
-
//
|
6552
|
-
|
6577
|
+
// Amazon CloudFront : https://aws.amazon.com/cloudfront/
|
6578
|
+
// Requested by Donavan Miller <donavanm@amazon.com> 2013-03-22
|
6579
|
+
cloudfront.net
|
6580
|
+
|
6581
|
+
// Amazon Elastic Compute Cloud: https://aws.amazon.com/ec2/
|
6582
|
+
// Requested by Osman Surkatty <osmans@amazon.com> 2013-04-02
|
6583
|
+
compute.amazonaws.com
|
6584
|
+
us-east-1.amazonaws.com
|
6585
|
+
compute-1.amazonaws.com
|
6586
|
+
z-1.compute-1.amazonaws.com
|
6587
|
+
z-2.compute-1.amazonaws.com
|
6588
|
+
ap-northeast-1.compute.amazonaws.com
|
6589
|
+
ap-southeast-1.compute.amazonaws.com
|
6590
|
+
ap-southeast-2.compute.amazonaws.com
|
6591
|
+
eu-west-1.compute.amazonaws.com
|
6592
|
+
sa-east-1.compute.amazonaws.com
|
6593
|
+
us-gov-west-1.compute.amazonaws.com
|
6594
|
+
us-west-1.compute.amazonaws.com
|
6595
|
+
us-west-2.compute.amazonaws.com
|
6596
|
+
|
6597
|
+
// Amazon Elastic Beanstalk : https://aws.amazon.com/elasticbeanstalk/
|
6598
|
+
// Requested by Adam Stein <astein@amazon.com> 2013-04-02
|
6599
|
+
elasticbeanstalk.com
|
6600
|
+
|
6601
|
+
// Amazon Elastic Load Balancing : https://aws.amazon.com/elasticloadbalancing/
|
6602
|
+
// Requested by Scott Vidmar <svidmar@amazon.com> 2013-03-27
|
6603
|
+
elb.amazonaws.com
|
6604
|
+
|
6605
|
+
// Amazon S3 : https://aws.amazon.com/s3/
|
6606
|
+
// Requested by Courtney Eckhardt <coec@amazon.com> 2013-03-22
|
6607
|
+
s3.amazonaws.com
|
6608
|
+
s3-us-west-2.amazonaws.com
|
6609
|
+
s3-us-west-1.amazonaws.com
|
6610
|
+
s3-eu-west-1.amazonaws.com
|
6611
|
+
s3-ap-southeast-1.amazonaws.com
|
6612
|
+
s3-ap-southeast-2.amazonaws.com
|
6613
|
+
s3-ap-northeast-1.amazonaws.com
|
6614
|
+
s3-sa-east-1.amazonaws.com
|
6615
|
+
s3-us-gov-west-1.amazonaws.com
|
6616
|
+
s3-fips-us-gov-west-1.amazonaws.com
|
6617
|
+
s3-website-us-east-1.amazonaws.com
|
6618
|
+
s3-website-us-west-2.amazonaws.com
|
6619
|
+
s3-website-us-west-1.amazonaws.com
|
6620
|
+
s3-website-eu-west-1.amazonaws.com
|
6621
|
+
s3-website-ap-southeast-1.amazonaws.com
|
6622
|
+
s3-website-ap-southeast-2.amazonaws.com
|
6623
|
+
s3-website-ap-northeast-1.amazonaws.com
|
6624
|
+
s3-website-sa-east-1.amazonaws.com
|
6625
|
+
s3-website-us-gov-west-1.amazonaws.com
|
6553
6626
|
|
6554
|
-
//
|
6555
|
-
|
6627
|
+
// BetaInABox
|
6628
|
+
// Requested by adrian@betainabox.com 2012-09-13
|
6629
|
+
betainabox.com
|
6556
6630
|
|
6557
6631
|
// CentralNic : http://www.centralnic.com/names/domains
|
6558
|
-
//
|
6632
|
+
// Requested by registry <gavin.brown@centralnic.com> 2012-09-27
|
6559
6633
|
ae.org
|
6560
6634
|
ar.com
|
6561
6635
|
br.com
|
@@ -6584,79 +6658,24 @@ us.org
|
|
6584
6658
|
uy.com
|
6585
6659
|
za.com
|
6586
6660
|
|
6587
|
-
// Opera Software, A.S.A.
|
6588
|
-
// Requested by Yngve Pettersen <yngve@opera.com> 2009-11-26
|
6589
|
-
operaunite.com
|
6590
|
-
|
6591
|
-
// Google, Inc.
|
6592
|
-
// Requested by Eduardo Vela <evn@google.com> 2012-10-24
|
6593
|
-
appspot.com
|
6594
|
-
blogspot.be
|
6595
|
-
blogspot.bj
|
6596
|
-
blogspot.ca
|
6597
|
-
blogspot.cf
|
6598
|
-
blogspot.ch
|
6599
|
-
blogspot.co.at
|
6600
|
-
blogspot.co.il
|
6601
|
-
blogspot.co.nz
|
6602
|
-
blogspot.co.uk
|
6603
|
-
blogspot.com
|
6604
|
-
blogspot.com.ar
|
6605
|
-
blogspot.com.au
|
6606
|
-
blogspot.com.br
|
6607
|
-
blogspot.com.es
|
6608
|
-
blogspot.cv
|
6609
|
-
blogspot.cz
|
6610
|
-
blogspot.de
|
6611
|
-
blogspot.dk
|
6612
|
-
blogspot.fi
|
6613
|
-
blogspot.fr
|
6614
|
-
blogspot.gr
|
6615
|
-
blogspot.hk
|
6616
|
-
blogspot.hu
|
6617
|
-
blogspot.ie
|
6618
|
-
blogspot.in
|
6619
|
-
blogspot.it
|
6620
|
-
blogspot.jp
|
6621
|
-
blogspot.kr
|
6622
|
-
blogspot.mr
|
6623
|
-
blogspot.mx
|
6624
|
-
blogspot.nl
|
6625
|
-
blogspot.no
|
6626
|
-
blogspot.pt
|
6627
|
-
blogspot.re
|
6628
|
-
blogspot.ro
|
6629
|
-
blogspot.se
|
6630
|
-
blogspot.sg
|
6631
|
-
blogspot.sk
|
6632
|
-
blogspot.td
|
6633
|
-
blogspot.tw
|
6634
|
-
codespot.com
|
6635
|
-
googleapis.com
|
6636
|
-
googlecode.com
|
6637
|
-
|
6638
|
-
// DreamHost : http://www.dreamhost.com/
|
6639
|
-
// Requested by Andrew Farmer <andrew.farmer@dreamhost.com> 2012-10-02
|
6640
|
-
dreamhosters.com
|
6641
|
-
|
6642
|
-
// iki.fi : Submitted by Hannu Aronsson <haa@iki.fi> 2009-11-05
|
6643
|
-
iki.fi
|
6644
|
-
|
6645
6661
|
// c.la : http://www.c.la/
|
6646
6662
|
c.la
|
6647
6663
|
|
6648
|
-
//
|
6649
|
-
//
|
6650
|
-
|
6651
|
-
|
6664
|
+
// cloudControl : https://www.cloudcontrol.com/
|
6665
|
+
// Requested by Tobias Wilken <tw@cloudcontrol.com> 2013-07-23
|
6666
|
+
cloudcontrolled.com
|
6667
|
+
cloudcontrolapp.com
|
6668
|
+
|
6669
|
+
// co.ca : http://registry.co.ca/
|
6670
|
+
co.ca
|
6652
6671
|
|
6653
6672
|
// CoDNS B.V.
|
6654
|
-
// Added 2010-05-23.
|
6655
6673
|
co.nl
|
6656
6674
|
co.no
|
6657
6675
|
|
6658
|
-
//
|
6659
|
-
|
6676
|
+
// DreamHost : http://www.dreamhost.com/
|
6677
|
+
// Requested by Andrew Farmer <andrew.farmer@dreamhost.com> 2012-10-02
|
6678
|
+
dreamhosters.com
|
6660
6679
|
|
6661
6680
|
// DynDNS.com : http://www.dyndns.com/services/dns/dyndns/
|
6662
6681
|
dyndns-at-home.com
|
@@ -6939,12 +6958,104 @@ webhop.org
|
|
6939
6958
|
worse-than.tv
|
6940
6959
|
writesthisblog.com
|
6941
6960
|
|
6942
|
-
//
|
6943
|
-
// Requested by
|
6944
|
-
|
6961
|
+
// Fastly Inc. http://www.fastly.com/
|
6962
|
+
// Requested by Vladimir Vuksan <vladimir@fastly.com> 2013-05-31
|
6963
|
+
a.ssl.fastly.net
|
6964
|
+
b.ssl.fastly.net
|
6965
|
+
global.ssl.fastly.net
|
6966
|
+
a.prod.fastly.net
|
6967
|
+
global.prod.fastly.net
|
6968
|
+
|
6969
|
+
// GitHub, Inc.
|
6970
|
+
// Requested by Ben Toews <btoews@github.com> 2013-04-18
|
6971
|
+
github.io
|
6972
|
+
|
6973
|
+
// GlobeHosting, Inc.
|
6974
|
+
// Requested by Zoltan Egresi <egresi@globehosting.com> 2013-07-12
|
6975
|
+
ro.com
|
6976
|
+
|
6977
|
+
// Google, Inc.
|
6978
|
+
// Requested by Eduardo Vela <evn@google.com> 2012-10-24
|
6979
|
+
appspot.com
|
6980
|
+
blogspot.be
|
6981
|
+
blogspot.bj
|
6982
|
+
blogspot.ca
|
6983
|
+
blogspot.cf
|
6984
|
+
blogspot.ch
|
6985
|
+
blogspot.co.at
|
6986
|
+
blogspot.co.il
|
6987
|
+
blogspot.co.nz
|
6988
|
+
blogspot.co.uk
|
6989
|
+
blogspot.com
|
6990
|
+
blogspot.com.ar
|
6991
|
+
blogspot.com.au
|
6992
|
+
blogspot.com.br
|
6993
|
+
blogspot.com.es
|
6994
|
+
blogspot.cv
|
6995
|
+
blogspot.cz
|
6996
|
+
blogspot.de
|
6997
|
+
blogspot.dk
|
6998
|
+
blogspot.fi
|
6999
|
+
blogspot.fr
|
7000
|
+
blogspot.gr
|
7001
|
+
blogspot.hk
|
7002
|
+
blogspot.hu
|
7003
|
+
blogspot.ie
|
7004
|
+
blogspot.in
|
7005
|
+
blogspot.it
|
7006
|
+
blogspot.jp
|
7007
|
+
blogspot.kr
|
7008
|
+
blogspot.mr
|
7009
|
+
blogspot.mx
|
7010
|
+
blogspot.nl
|
7011
|
+
blogspot.no
|
7012
|
+
blogspot.pt
|
7013
|
+
blogspot.re
|
7014
|
+
blogspot.ro
|
7015
|
+
blogspot.se
|
7016
|
+
blogspot.sg
|
7017
|
+
blogspot.sk
|
7018
|
+
blogspot.td
|
7019
|
+
blogspot.tw
|
7020
|
+
codespot.com
|
7021
|
+
googleapis.com
|
7022
|
+
googlecode.com
|
7023
|
+
|
7024
|
+
// Heroku : https://www.heroku.com/
|
7025
|
+
// Requested by Tom Maher <tmaher@heroku.com> 2013-05-02
|
7026
|
+
herokuapp.com
|
7027
|
+
herokussl.com
|
7028
|
+
|
7029
|
+
// iki.fi
|
7030
|
+
// Requested by Hannu Aronsson <haa@iki.fi> 2009-11-05
|
7031
|
+
iki.fi
|
7032
|
+
|
7033
|
+
// info.at : http://www.info.at/
|
7034
|
+
biz.at
|
7035
|
+
info.at
|
7036
|
+
|
7037
|
+
// Michau Enterprises Limited : http://www.co.pl/
|
7038
|
+
co.pl
|
7039
|
+
|
7040
|
+
// NYC.mn : http://www.information.nyc.mn
|
7041
|
+
// Requested by Matthew Brown <mattbrown@nyc.mn> 2013-03-11
|
7042
|
+
nyc.mn
|
7043
|
+
|
7044
|
+
// Opera Software, A.S.A.
|
7045
|
+
// Requested by Yngve Pettersen <yngve@opera.com> 2009-11-26
|
7046
|
+
operaunite.com
|
6945
7047
|
|
6946
7048
|
// Red Hat, Inc. OpenShift : https://openshift.redhat.com/
|
6947
7049
|
// Requested by Tim Kramer <tkramer@rhcloud.com> 2012-10-24
|
6948
7050
|
rhcloud.com
|
6949
7051
|
|
7052
|
+
// priv.at : http://www.nic.priv.at/
|
7053
|
+
// Requested by registry <lendl@nic.at> 2008-06-09
|
7054
|
+
priv.at
|
7055
|
+
|
7056
|
+
// ZaNiC : http://www.za.net/
|
7057
|
+
// Requested by registry <hostmaster@nic.za.net> 2009-10-03
|
7058
|
+
za.net
|
7059
|
+
za.org
|
7060
|
+
|
6950
7061
|
// ===END PRIVATE DOMAINS===
|
data/build/tld_set_generator.rb
CHANGED
data/lib/iudex-core.rb
CHANGED
data/lib/iudex-core/base.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2008-
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -16,7 +16,7 @@
|
|
16
16
|
|
17
17
|
module Iudex
|
18
18
|
module Core
|
19
|
-
VERSION = '1.3.1'
|
19
|
+
VERSION = '1.4.0'
|
20
20
|
|
21
21
|
LIB_DIR = File.dirname( __FILE__ ) # :nodoc:
|
22
22
|
end
|
data/lib/iudex-core/config.rb
CHANGED
Binary file
|
data/lib/iudex-core/mojibake.rb
CHANGED
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-core</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.3.1</version>
|
8
|
+
<version>1.4.0</version>
|
9
9
|
<name>Iudex Core System</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.4.0</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -30,19 +30,19 @@
|
|
30
30
|
<dependency>
|
31
31
|
<groupId>iudex</groupId>
|
32
32
|
<artifactId>iudex-filter</artifactId>
|
33
|
-
<version>[1.
|
33
|
+
<version>[1.4.0,1.4.999)</version>
|
34
34
|
</dependency>
|
35
35
|
|
36
36
|
<dependency>
|
37
37
|
<groupId>iudex</groupId>
|
38
38
|
<artifactId>iudex-http</artifactId>
|
39
|
-
<version>[1.
|
39
|
+
<version>[1.4.0,1.4.999)</version>
|
40
40
|
</dependency>
|
41
41
|
|
42
42
|
<dependency>
|
43
43
|
<groupId>iudex</groupId>
|
44
44
|
<artifactId>iudex-barc</artifactId>
|
45
|
-
<version>[1.
|
45
|
+
<version>[1.4.0,1.4.999)</version>
|
46
46
|
</dependency>
|
47
47
|
|
48
48
|
<dependency>
|
data/test/setup.rb
CHANGED
data/test/test_charsets.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c) 2008-
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
@@ -45,10 +45,15 @@ module TestHTTPMocks
|
|
45
45
|
def initialize
|
46
46
|
super()
|
47
47
|
@status = 200
|
48
|
+
@request_headers = []
|
49
|
+
end
|
50
|
+
|
51
|
+
def addRequestHeader( h )
|
52
|
+
@request_headers << h
|
48
53
|
end
|
49
54
|
|
50
55
|
def requestHeaders
|
51
|
-
|
56
|
+
@request_headers
|
52
57
|
end
|
53
58
|
|
54
59
|
def responseHeaders
|
@@ -114,6 +119,7 @@ module TestHTTPMocks
|
|
114
119
|
end
|
115
120
|
|
116
121
|
class TestContentFetcher < MiniTest::Unit::TestCase
|
122
|
+
include Iudex::HTTP
|
117
123
|
include Iudex::Core
|
118
124
|
include Iudex::Core::Filters
|
119
125
|
include Iudex::Filter::Core
|
@@ -135,10 +141,25 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
135
141
|
assert_equal( DEFAULT_URL, out.url.to_s )
|
136
142
|
assert_equal( 200, out.status )
|
137
143
|
assert_equal( WEAK_ETAG, out.etag )
|
144
|
+
assert_equal( "User-Agent", out.request_headers.first.name )
|
138
145
|
assert( out.source )
|
139
146
|
end
|
140
147
|
end
|
141
148
|
|
149
|
+
def test_dynamic_request_headers
|
150
|
+
inp = create_content
|
151
|
+
inp.request_headers = [ Header.new( "Dynamic", "value" ),
|
152
|
+
Header.new( "User-Agent", "override" ) ]
|
153
|
+
fetch( inp ) do |out|
|
154
|
+
hhash = out.request_headers.inject( {} ) do |m,h|
|
155
|
+
m[ h.name ] = h.value
|
156
|
+
m
|
157
|
+
end
|
158
|
+
assert_equal( { "Dynamic" => "value",
|
159
|
+
"User-Agent" => "override" }, hhash )
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
142
163
|
def test_304
|
143
164
|
client = MockHTTPClient.new
|
144
165
|
def client.request( session, handler )
|
@@ -182,6 +203,7 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
182
203
|
cf = ContentFetcher.new( client,
|
183
204
|
counter,
|
184
205
|
FilterChain.new( "test-rec", [ rec ] ) )
|
206
|
+
cf.request_headers = [ Header.new( "User-Agent", "default" ) ]
|
185
207
|
cf.filter( content )
|
186
208
|
end
|
187
209
|
|
data/test/test_content_source.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_log_writer.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_mojibake.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c) 2008-
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_visit_manager.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
@@ -38,6 +38,9 @@ class TestVisitManager < MiniTest::Unit::TestCase
|
|
38
38
|
@latch = CountDownLatch.new( 20 )
|
39
39
|
|
40
40
|
@manager = VisitManager.new( TestWorkPoller.new )
|
41
|
+
@manager.do_wait_on_generation = false
|
42
|
+
|
43
|
+
@scheduler = Executors::new_scheduled_thread_pool( 1 )
|
41
44
|
|
42
45
|
test_filter = fltr do |order|
|
43
46
|
@scheduler.schedule( proc { @manager.release( order, nil ) },
|
@@ -47,7 +50,6 @@ class TestVisitManager < MiniTest::Unit::TestCase
|
|
47
50
|
|
48
51
|
@manager.filter_chain = FilterChain.new( "test", [ test_filter ] )
|
49
52
|
|
50
|
-
@scheduler = Executors::new_scheduled_thread_pool( 1 )
|
51
53
|
end
|
52
54
|
|
53
55
|
def teardown
|
data/test/test_visit_queue.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c) 2008-
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
@@ -171,6 +171,48 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
171
171
|
assert_queue_empty
|
172
172
|
end
|
173
173
|
|
174
|
+
def test_configure_type_2
|
175
|
+
@visit_q.config( :delay => 50, :cons => 1 )
|
176
|
+
@visit_q.config( :domain => 'h2.com',
|
177
|
+
:delay => 75, :cons => 2 )
|
178
|
+
@visit_q.config( :domain => 'h2.com', :type => 'ALT',
|
179
|
+
:rate => 20, :cons => 1 )
|
180
|
+
@visit_q = @visit_q.clone
|
181
|
+
|
182
|
+
LOG.debug { "As configured:\n" + @visit_q.dump }
|
183
|
+
|
184
|
+
[ %w[ h2 a 2.2 ],
|
185
|
+
%w[ w.h2:AL2 b 2.1 ],
|
186
|
+
%w[ h2:ALT c 3.2 ],
|
187
|
+
%w[ h2:ALT d 3.1 ],
|
188
|
+
%w[ h1:AL2 a 1.2 ],
|
189
|
+
%w[ h1 b 1.1 ] ].each do |oinp|
|
190
|
+
|
191
|
+
@visit_q.add( order( oinp ) )
|
192
|
+
|
193
|
+
end
|
194
|
+
|
195
|
+
LOG.debug { "After add:\n" + @visit_q.dump }
|
196
|
+
|
197
|
+
assert_equal( 3, @visit_q.host_count, "host count" )
|
198
|
+
|
199
|
+
expected = [ %w[ h2:ALT c 3.2 ],
|
200
|
+
%w[ h2 a 2.2 ],
|
201
|
+
%w[ h1:AL2 a 1.2 ],
|
202
|
+
%w[ h2:ALT d 3.1 ],
|
203
|
+
%w[ h1 b 1.1 ],
|
204
|
+
%w[ w.h2:AL2 b 2.1 ] ]
|
205
|
+
|
206
|
+
p = 0
|
207
|
+
expected.each do |o|
|
208
|
+
assert_equal( o, acquire_order, p += 1 )
|
209
|
+
end
|
210
|
+
|
211
|
+
LOG.debug { "After consumed:\n" + @visit_q.dump }
|
212
|
+
|
213
|
+
assert_queue_empty
|
214
|
+
end
|
215
|
+
|
174
216
|
def test_multi_access_2
|
175
217
|
@visit_q.default_max_access_per_host = 2
|
176
218
|
add_common_orders
|
@@ -206,10 +248,63 @@ class TestVisitQueue < MiniTest::Unit::TestCase
|
|
206
248
|
%w[ h2 c 2.0 ] ]
|
207
249
|
|
208
250
|
p = 0
|
251
|
+
concurs = []
|
252
|
+
expected.each do |o|
|
253
|
+
assert_equal( o, acquire_order, p += 1 )
|
254
|
+
concurs << @visit_q.acquired_count
|
255
|
+
end
|
256
|
+
|
257
|
+
assert_operator( 2, :<, concurs.max )
|
258
|
+
assert_queue_empty
|
259
|
+
end
|
260
|
+
|
261
|
+
def test_max_access_total_2
|
262
|
+
@visit_q.default_max_access_per_host = 999
|
263
|
+
@visit_q.max_access_total = 2
|
264
|
+
add_common_orders
|
265
|
+
|
266
|
+
expected = [ %w[ h3 a 3.2 ],
|
267
|
+
%w[ h2 a 2.2 ],
|
268
|
+
%w[ h1 a 1.2 ],
|
269
|
+
%w[ h3 b 3.1 ],
|
270
|
+
%w[ w.h2 b 2.1 ],
|
271
|
+
%w[ h1 b 1.1 ],
|
272
|
+
%w[ m.h3 c 3.0 ],
|
273
|
+
%w[ h2 c 2.0 ] ]
|
274
|
+
|
275
|
+
p = 0
|
276
|
+
concurs = []
|
277
|
+
expected.each do |o|
|
278
|
+
assert_equal( o, acquire_order, p += 1 )
|
279
|
+
concurs << @visit_q.acquired_count
|
280
|
+
end
|
281
|
+
|
282
|
+
assert_equal( 2, concurs.max )
|
283
|
+
assert_queue_empty
|
284
|
+
end
|
285
|
+
|
286
|
+
def test_max_access_total_1
|
287
|
+
@visit_q.default_max_access_per_host = 999
|
288
|
+
@visit_q.max_access_total = 1
|
289
|
+
add_common_orders
|
290
|
+
|
291
|
+
expected = [ %w[ h3 a 3.2 ],
|
292
|
+
%w[ h2 a 2.2 ],
|
293
|
+
%w[ h1 a 1.2 ],
|
294
|
+
%w[ h3 b 3.1 ],
|
295
|
+
%w[ w.h2 b 2.1 ],
|
296
|
+
%w[ m.h3 c 3.0 ],
|
297
|
+
%w[ h1 b 1.1 ],
|
298
|
+
%w[ h2 c 2.0 ] ]
|
299
|
+
|
300
|
+
p = 0
|
301
|
+
concurs = []
|
209
302
|
expected.each do |o|
|
210
303
|
assert_equal( o, acquire_order, p += 1 )
|
304
|
+
concurs << @visit_q.acquired_count
|
211
305
|
end
|
212
306
|
|
307
|
+
assert_equal( 1, concurs.max )
|
213
308
|
assert_queue_empty
|
214
309
|
end
|
215
310
|
|
data/test/test_visit_url.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c) 2008-
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: iudex-core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.3.1
|
5
|
+
version: 1.4.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-10-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rjack-slf4j
|
@@ -65,13 +65,13 @@ dependencies:
|
|
65
65
|
requirements:
|
66
66
|
- - ~>
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 1.
|
68
|
+
version: 1.4.0
|
69
69
|
none: false
|
70
70
|
requirement: !ruby/object:Gem::Requirement
|
71
71
|
requirements:
|
72
72
|
- - ~>
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: 1.
|
74
|
+
version: 1.4.0
|
75
75
|
none: false
|
76
76
|
prerelease: false
|
77
77
|
type: :runtime
|
@@ -81,13 +81,13 @@ dependencies:
|
|
81
81
|
requirements:
|
82
82
|
- - ~>
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 1.
|
84
|
+
version: 1.4.0
|
85
85
|
none: false
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - ~>
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 1.
|
90
|
+
version: 1.4.0
|
91
91
|
none: false
|
92
92
|
prerelease: false
|
93
93
|
type: :runtime
|
@@ -97,13 +97,13 @@ dependencies:
|
|
97
97
|
requirements:
|
98
98
|
- - ~>
|
99
99
|
- !ruby/object:Gem::Version
|
100
|
-
version: 1.
|
100
|
+
version: 1.4.0
|
101
101
|
none: false
|
102
102
|
requirement: !ruby/object:Gem::Requirement
|
103
103
|
requirements:
|
104
104
|
- - ~>
|
105
105
|
- !ruby/object:Gem::Version
|
106
|
-
version: 1.
|
106
|
+
version: 1.4.0
|
107
107
|
none: false
|
108
108
|
prerelease: false
|
109
109
|
type: :runtime
|
@@ -113,13 +113,13 @@ dependencies:
|
|
113
113
|
requirements:
|
114
114
|
- - ~>
|
115
115
|
- !ruby/object:Gem::Version
|
116
|
-
version:
|
116
|
+
version: 4.7.4
|
117
117
|
none: false
|
118
118
|
requirement: !ruby/object:Gem::Requirement
|
119
119
|
requirements:
|
120
120
|
- - ~>
|
121
121
|
- !ruby/object:Gem::Version
|
122
|
-
version:
|
122
|
+
version: 4.7.4
|
123
123
|
none: false
|
124
124
|
prerelease: false
|
125
125
|
type: :development
|
@@ -193,7 +193,7 @@ files:
|
|
193
193
|
- test/test_visit_manager.rb
|
194
194
|
- test/test_visit_queue.rb
|
195
195
|
- test/test_visit_url.rb
|
196
|
-
- lib/iudex-core/iudex-core-1.3.1.jar
|
196
|
+
- lib/iudex-core/iudex-core-1.4.0.jar
|
197
197
|
homepage: http://iudex.gravitext.com
|
198
198
|
licenses: []
|
199
199
|
post_install_message:
|
Binary file
|