iudex-core 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +21 -0
- data/Manifest.txt +9 -1
- data/Rakefile +6 -6
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +4 -4
- data/build/effective_tld_name.dat +432 -29
- data/config/mojibake +268 -0
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.1.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +73 -0
- data/lib/iudex-core.rb +8 -2
- data/pom.xml +5 -5
- data/test/test_content_fetcher.rb +37 -39
- data/test/test_content_source.rb +75 -0
- data/test/test_mojibake.rb +58 -0
- data/test/test_redirect_handler.rb +170 -0
- data/test/test_visit_manager.rb +107 -0
- data/test/test_visit_queue.rb +268 -0
- data/test/test_visit_url.rb +150 -0
- metadata +26 -16
- data/lib/iudex-core/iudex-core-1.0.0.jar +0 -0
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,23 @@
|
|
1
|
+
=== 1.1.0 (2011-11-13)
|
2
|
+
* Update to iudex-filter,http,barc ~> 1.1.0
|
3
|
+
* ContentFetcher updates for iudex-http changes
|
4
|
+
* New MojiBakeFilter, MojiBakeMapper with config table loading support
|
5
|
+
* Replaced VisitExecutor with asynchronous client compatible
|
6
|
+
VisitManager
|
7
|
+
* Visit/HostQueue acquire/release for concurrency and per host settings
|
8
|
+
* VisitQueue uses VisitURL.domain (registration level) host keys
|
9
|
+
* New VisitCounter interface
|
10
|
+
* New RedirectHandler and Revisitor filters for direct redirect
|
11
|
+
handling with filter access
|
12
|
+
* VisitURL.resolve for redirect support
|
13
|
+
* Drop now redundant RLDomainFilter and RL_DOMAIN key
|
14
|
+
* Add domain to iudex-url-norm output
|
15
|
+
* Add encoding confidence map to ContentSource (supports encoding
|
16
|
+
detection)
|
17
|
+
* Add U+2060 WORD JOINER to Characters.ctrlWS list
|
18
|
+
* Update to minitest ~> 2.3
|
19
|
+
* Update to gravitext-util ~> 1.5.1 (for UniMap.toString)
|
20
|
+
* Update TLDSets based on upstream 9411dffc948b (2011-09-02)
|
21
|
+
|
1
22
|
=== 1.0.0 (2011-04-04)
|
2
23
|
* Initial release.
|
data/Manifest.txt
CHANGED
@@ -9,10 +9,18 @@ build/TLDSets.java.erb
|
|
9
9
|
build/effective_tld_name.dat
|
10
10
|
build/tld_set_generator.rb
|
11
11
|
config/config.rb
|
12
|
+
config/mojibake
|
12
13
|
lib/iudex-core/base.rb
|
13
14
|
lib/iudex-core.rb
|
14
15
|
lib/iudex-core/config.rb
|
16
|
+
lib/iudex-core/mojibake.rb
|
15
17
|
test/setup.rb
|
16
18
|
test/test_content_fetcher.rb
|
19
|
+
test/test_content_source.rb
|
17
20
|
test/test_log_writer.rb
|
18
|
-
|
21
|
+
test/test_mojibake.rb
|
22
|
+
test/test_redirect_handler.rb
|
23
|
+
test/test_visit_manager.rb
|
24
|
+
test/test_visit_queue.rb
|
25
|
+
test/test_visit_url.rb
|
26
|
+
lib/iudex-core/iudex-core-1.1.0.jar
|
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
|
|
4
4
|
require 'iudex-core/base'
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
gem 'rjack-tarpit', '~> 1.
|
7
|
+
gem 'rjack-tarpit', '~> 1.4'
|
8
8
|
require 'rjack-tarpit'
|
9
9
|
|
10
10
|
t = RJack::TarPit.new( 'iudex-core',
|
@@ -15,13 +15,13 @@ t.specify do |h|
|
|
15
15
|
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
16
16
|
h.extra_deps += [ [ 'rjack-slf4j', '~> 1.6.1' ],
|
17
17
|
[ 'hooker', '~> 1.0.0' ],
|
18
|
-
[ 'gravitext-util', '~> 1.5.
|
19
|
-
[ 'iudex-filter', '~> 1.
|
20
|
-
[ 'iudex-http', '~> 1.
|
21
|
-
[ 'iudex-barc', '~> 1.
|
18
|
+
[ 'gravitext-util', '~> 1.5.1' ],
|
19
|
+
[ 'iudex-filter', '~> 1.1.0' ],
|
20
|
+
[ 'iudex-http', '~> 1.1.0' ],
|
21
|
+
[ 'iudex-barc', '~> 1.1.0' ] ]
|
22
22
|
|
23
23
|
h.testlib = :minitest
|
24
|
-
h.extra_dev_deps += [ [ 'minitest', '
|
24
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
25
25
|
[ 'rjack-logback', '~> 1.0' ] ]
|
26
26
|
end
|
27
27
|
|
data/bin/iudex-test-config
CHANGED
@@ -34,7 +34,7 @@ module IudexBinScript
|
|
34
34
|
Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
|
35
35
|
|
36
36
|
OptionParser.new do |opts|
|
37
|
-
opts.on( "-v", "--version", "Display version" ) do
|
37
|
+
opts.on( "-v", "--version", "Display version" ) do
|
38
38
|
puts "iudex-core: #{ Core::VERSION }"
|
39
39
|
end
|
40
40
|
Hooker.register_config( opts )
|
data/bin/iudex-url-norm
CHANGED
@@ -34,14 +34,14 @@ module IudexBinScript
|
|
34
34
|
|
35
35
|
OptionParser.new do |opts|
|
36
36
|
opts.banner = "Usage: iudex-url-norm [options] [UrlsFile]..."
|
37
|
-
opts.on( "-v", "--version", "Display version" ) do
|
37
|
+
opts.on( "-v", "--version", "Display version" ) do
|
38
38
|
puts "iudex-core: #{Core::VERSION}"
|
39
39
|
exit 1
|
40
40
|
end
|
41
41
|
opts.on_tail( "-h", "--help", "Show help and exit" ) do
|
42
42
|
puts opts
|
43
43
|
puts
|
44
|
-
puts( "Write uhash and normalized URLs to STDOUT, " +
|
44
|
+
puts( "Write uhash, domain, and normalized URLs to STDOUT, " +
|
45
45
|
"from UrlsFile(s) or STDIN." )
|
46
46
|
exit 1
|
47
47
|
end
|
@@ -62,8 +62,8 @@ module IudexBinScript
|
|
62
62
|
|
63
63
|
def self.process( fin )
|
64
64
|
fin.each do |url|
|
65
|
-
vurl = Core::VisitURL.normalize( url
|
66
|
-
puts
|
65
|
+
vurl = Core::VisitURL.normalize( url )
|
66
|
+
puts '%23s %24s %s' % [ vurl.uhash, vurl.domain, vurl ]
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|