iudex-core 1.0.0-java → 1.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.rdoc +21 -0
- data/Manifest.txt +9 -1
- data/Rakefile +6 -6
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +4 -4
- data/build/effective_tld_name.dat +432 -29
- data/config/mojibake +268 -0
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.1.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +73 -0
- data/lib/iudex-core.rb +8 -2
- data/pom.xml +5 -5
- data/test/test_content_fetcher.rb +37 -39
- data/test/test_content_source.rb +75 -0
- data/test/test_mojibake.rb +58 -0
- data/test/test_redirect_handler.rb +170 -0
- data/test/test_visit_manager.rb +107 -0
- data/test/test_visit_queue.rb +268 -0
- data/test/test_visit_url.rb +150 -0
- metadata +26 -16
- data/lib/iudex-core/iudex-core-1.0.0.jar +0 -0
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,23 @@
|
|
1
|
+
=== 1.1.0 (2011-11-13)
|
2
|
+
* Update to iudex-filter,http,barc ~> 1.1.0
|
3
|
+
* ContentFetcher updates for iudex-http changes
|
4
|
+
* New MojiBakeFilter, MojiBakeMapper with config table loading support
|
5
|
+
* Replaced VisitExecutor with asynchronous client compatible
|
6
|
+
VisitManager
|
7
|
+
* Visit/HostQueue acquire/release for concurrency and per-host settings
|
8
|
+
* VisitQueue uses VisitURL.domain (registration level) host keys
|
9
|
+
* New VisitCounter interface
|
10
|
+
* New RedirectHandler and Revisitor filters for direct redirect
|
11
|
+
handling with filter access
|
12
|
+
* VisitURL.resolve for redirect support
|
13
|
+
* Drop now-redundant RLDomainFilter and RL_DOMAIN key
|
14
|
+
* Add domain to iudex-url-norm output
|
15
|
+
* Add encoding confidence map to ContentSource (supports encoding
|
16
|
+
detection)
|
17
|
+
* Add U+2060 WORD JOINER to Characters.ctrlWS list
|
18
|
+
* Update to minitest ~> 2.3
|
19
|
+
* Update to gravitext-util ~> 1.5.1 (for UniMap.toString)
|
20
|
+
* Update TLDSets based on upstream 9411dffc948b (2011-09-02)
|
21
|
+
|
1
22
|
=== 1.0.0 (2011-04-04)
|
2
23
|
* Initial release.
|
data/Manifest.txt
CHANGED
@@ -9,10 +9,18 @@ build/TLDSets.java.erb
|
|
9
9
|
build/effective_tld_name.dat
|
10
10
|
build/tld_set_generator.rb
|
11
11
|
config/config.rb
|
12
|
+
config/mojibake
|
12
13
|
lib/iudex-core/base.rb
|
13
14
|
lib/iudex-core.rb
|
14
15
|
lib/iudex-core/config.rb
|
16
|
+
lib/iudex-core/mojibake.rb
|
15
17
|
test/setup.rb
|
16
18
|
test/test_content_fetcher.rb
|
19
|
+
test/test_content_source.rb
|
17
20
|
test/test_log_writer.rb
|
18
|
-
|
21
|
+
test/test_mojibake.rb
|
22
|
+
test/test_redirect_handler.rb
|
23
|
+
test/test_visit_manager.rb
|
24
|
+
test/test_visit_queue.rb
|
25
|
+
test/test_visit_url.rb
|
26
|
+
lib/iudex-core/iudex-core-1.1.0.jar
|
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
|
|
4
4
|
require 'iudex-core/base'
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
gem 'rjack-tarpit', '~> 1.
|
7
|
+
gem 'rjack-tarpit', '~> 1.4'
|
8
8
|
require 'rjack-tarpit'
|
9
9
|
|
10
10
|
t = RJack::TarPit.new( 'iudex-core',
|
@@ -15,13 +15,13 @@ t.specify do |h|
|
|
15
15
|
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
16
16
|
h.extra_deps += [ [ 'rjack-slf4j', '~> 1.6.1' ],
|
17
17
|
[ 'hooker', '~> 1.0.0' ],
|
18
|
-
[ 'gravitext-util', '~> 1.5.
|
19
|
-
[ 'iudex-filter', '~> 1.
|
20
|
-
[ 'iudex-http', '~> 1.
|
21
|
-
[ 'iudex-barc', '~> 1.
|
18
|
+
[ 'gravitext-util', '~> 1.5.1' ],
|
19
|
+
[ 'iudex-filter', '~> 1.1.0' ],
|
20
|
+
[ 'iudex-http', '~> 1.1.0' ],
|
21
|
+
[ 'iudex-barc', '~> 1.1.0' ] ]
|
22
22
|
|
23
23
|
h.testlib = :minitest
|
24
|
-
h.extra_dev_deps += [ [ 'minitest', '
|
24
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
25
25
|
[ 'rjack-logback', '~> 1.0' ] ]
|
26
26
|
end
|
27
27
|
|
data/bin/iudex-test-config
CHANGED
@@ -34,7 +34,7 @@ module IudexBinScript
|
|
34
34
|
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
35
35
|
|
36
36
|
OptionParser.new do |opts|
|
37
|
-
opts.on( "-v", "--version", "Display version" ) do
|
37
|
+
opts.on( "-v", "--version", "Display version" ) do
|
38
38
|
puts "iudex-core: #{ Core::VERSION }"
|
39
39
|
end
|
40
40
|
Hooker.register_config( opts )
|
data/bin/iudex-url-norm
CHANGED
@@ -34,14 +34,14 @@ module IudexBinScript
|
|
34
34
|
|
35
35
|
OptionParser.new do |opts|
|
36
36
|
opts.banner = "Usage: iudex-url-norm [options] [UrlsFile]..."
|
37
|
-
opts.on( "-v", "--version", "Display version" ) do
|
37
|
+
opts.on( "-v", "--version", "Display version" ) do
|
38
38
|
puts "iudex-core: #{Core::VERSION}"
|
39
39
|
exit 1
|
40
40
|
end
|
41
41
|
opts.on_tail( "-h", "--help", "Show help and exit" ) do
|
42
42
|
puts opts
|
43
43
|
puts
|
44
|
-
puts( "Write uhash and normalized URLs to STDOUT, " +
|
44
|
+
puts( "Write uhash, domain, and normalized URLs to STDOUT, " +
|
45
45
|
"from UrlsFile(s) or STDIN." )
|
46
46
|
exit 1
|
47
47
|
end
|
@@ -62,8 +62,8 @@ module IudexBinScript
|
|
62
62
|
|
63
63
|
def self.process( fin )
|
64
64
|
fin.each do |url|
|
65
|
-
vurl = Core::VisitURL.normalize( url
|
66
|
-
puts
|
65
|
+
vurl = Core::VisitURL.normalize( url )
|
66
|
+
puts '%23s %24s %s' % [ vurl.uhash, vurl.domain, vurl ]
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|