iudex-char-detector 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +2 -0
- data/Manifest.txt +16 -0
- data/README.rdoc +25 -0
- data/Rakefile +40 -0
- data/bin/iudex-char-detect +117 -0
- data/lib/iudex-char-detector/base.rb +21 -0
- data/lib/iudex-char-detector/iudex-char-detector-1.1.0.jar +0 -0
- data/lib/iudex-char-detector.rb +30 -0
- data/pom.xml +45 -0
- data/test/sample.html.ascii +8 -0
- data/test/sample.html.iso +8 -0
- data/test/sample.html.utf16 +0 -0
- data/test/sample.html.utf16le +0 -0
- data/test/sample.html.utf8 +8 -0
- data/test/setup.rb +39 -0
- data/test/test_char_detector.rb +156 -0
- metadata +131 -0
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
pom.xml
|
6
|
+
bin/iudex-char-detect
|
7
|
+
lib/iudex-char-detector/base.rb
|
8
|
+
lib/iudex-char-detector.rb
|
9
|
+
test/sample.html.ascii
|
10
|
+
test/sample.html.iso
|
11
|
+
test/sample.html.utf16
|
12
|
+
test/sample.html.utf16le
|
13
|
+
test/sample.html.utf8
|
14
|
+
test/setup.rb
|
15
|
+
test/test_char_detector.rb
|
16
|
+
lib/iudex-char-detector/iudex-char-detector-1.1.0.jar
|
data/README.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= iudex-char-detector
|
2
|
+
|
3
|
+
* http://github.com/dekellum/iudex
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Iudex is a general purpose web crawler and feed processor in
|
8
|
+
ruby/java. The iudex-char-detector gem provides charset detection
|
9
|
+
support.
|
10
|
+
|
11
|
+
== License
|
12
|
+
|
13
|
+
Copyright (c) 2011 David Kellum
|
14
|
+
|
15
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
16
|
+
may not use this file except in compliance with the License. You
|
17
|
+
may obtain a copy of the License at:
|
18
|
+
|
19
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
20
|
+
|
21
|
+
Unless required by applicable law or agreed to in writing, software
|
22
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
23
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
24
|
+
implied. See the License for the specific language governing
|
25
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# -*- ruby -*-

# Build script: declares the gem via RJack::TarPit and adds release
# sanity checks that compare the gem version against pom.xml and
# History.rdoc before the gem/tag/push tasks run.

$LOAD_PATH << './lib'
require 'iudex-char-detector/base'

require 'rubygems'
gem     'rjack-tarpit', '~> 1.4'
require 'rjack-tarpit'

t = RJack::TarPit.new( 'iudex-char-detector',
                       Iudex::CharDetector::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
                    [ 'rjack-icu',  '~> 4.8.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest',      '~> 2.3' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => [ 'pom.xml' ]

# The version is matched literally: escape it so that the dots in
# e.g. "1.1.0" do not act as regex wildcards (which would also accept
# "1x1y0").
version_pattern = Regexp.escape( t.version.to_s )

task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_pattern}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_pattern} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
|
@@ -0,0 +1,117 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
21
|
+
|
22
|
+
require 'rubygems'
|
23
|
+
require 'optparse'
|
24
|
+
|
25
|
+
# Command line driver for iudex-char-detect. Runs a CharDetectFilter
# over each named file (or standard input when no file is given) and
# prints the resulting encoding name and confidence.
class ChartDetectUtil

  require 'rjack-logback'
  include RJack

  Logback.config_console( :level => Logback::INFO, :stderr => true )

  require 'iudex-char-detector'
  include Iudex

  include Gravitext::HTMap
  UniMap.define_accessors

  include Iudex::Core
  include Iudex::CharDetector

  import 'java.nio.ByteBuffer'
  import 'java.nio.charset.Charset'

  # The default encoding is set on each content source before
  # detection; it can be replaced with the -e/--encoding option.
  def initialize
    @default_encode = "windows-1252"
  end

  # Parse options out of args (modified in place), then detect and
  # report on each remaining argument as a file name, or on $stdin if
  # none remain. With a single input the full detail is shown; with
  # several, one summary line is printed per file.
  #
  # The --version and --help options print and exit with status 1.
  def run( args = ARGV )

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: iudex-char-detect [options] [TestFile]"
      opts.on( "-v", "--version", "Display version" ) do
        puts "iudex-char-detector: #{CharDetector::VERSION}"
        exit 1
      end
      opts.on( "-d", "--debug" ) do
        Logback[ 'iudex' ].level = Logback::DEBUG
      end
      opts.on_tail( "-e", "--encoding ENCODING", String,
                    "Set default encoding (#{@default_encode})" ) do |enc|
        @default_encode = enc
      end
      opts.on_tail( "-h", "--help", "Show help and exit" ) do
        puts opts
        puts
        puts( "Detect charset of File and report encoding, confidence " +
              "(independent of HTML features)" )
        exit 1
      end
    end

    parser.parse!( args )

    if args.empty?
      show_detail( detect( $stdin ) )
    else
      max_name = args.map { |fn| fn.length }.max
      args.each do |fname|
        # File.open (not Kernel#open) so that a name starting with "|"
        # is never run as a command; binary mode since the raw bytes
        # are what gets inspected.
        res = File.open( fname, "rb" ) { |fin| detect( fin ) }

        # Decide on the args actually passed in, not the global ARGV.
        if args.length > 1
          puts "%-#{ max_name }s : %-12s (%5.4f)" % [ fname, res[0], res[1] ]
        else
          show_detail( res )
        end
      end
    end

  end

  # Read all of fin, run it through a new CharDetectFilter and return
  # a three element Array taken from the resulting content source:
  # [ default encoding name, encoding_confidence, encoding_confidences ]
  def detect( fin )
    map = UniMap.new
    bytes = ByteBuffer::wrap( fin.read.to_java_bytes )
    map.source = ContentSource.new( bytes )
    map.source.set_default_encoding( Charset::lookup( @default_encode ) )

    df = CharDetectFilter.new
    #FIXME: Option? df.max_detect_length =
    df.filter( map )
    s = map.source

    [ s.default_encoding.name,
      s.encoding_confidence,
      s.encoding_confidences ]
  end

  # Print the encoding and confidence of a detect result, followed by
  # a line for each ( encoding, confidence ) pair yielded by res[2].
  def show_detail( res )
    # Only the first two elements belong in the summary line; passing
    # the whole result gives format more arguments than fields.
    puts "%-12s (%.4f)" % res[ 0, 2 ]
    puts "====================="
    res[2].each do |enc,conf|
      puts "%-12s (%.4f)" % [ enc, conf ]
    end
  end

end
|
116
|
+
|
117
|
+
ChartDetectUtil.new.run
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
  module CharDetector
    # Release version of this gem. Frozen so that the shared constant
    # cannot be modified in place by any of its readers.
    VERSION = '1.1.0'.freeze
  end
end
|
Binary file
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
require 'rjack-icu'
|
19
|
+
|
20
|
+
require 'iudex-char-detector/base.rb'
|
21
|
+
|
22
|
+
require 'java'
|
23
|
+
|
24
|
+
module Iudex
  module CharDetector
    # The bundled jar is named after the gem VERSION; load it first so
    # that the Java class imported below can be found.
    jar_name = "iudex-char-detector/iudex-char-detector-#{VERSION}.jar"
    require jar_name

    import 'iudex.chardetector.CharDetectFilter'
  end
end
|
data/pom.xml
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
2
|
+
<modelVersion>4.0.0</modelVersion>
|
3
|
+
<groupId>iudex</groupId>
|
4
|
+
<artifactId>iudex-char-detector</artifactId>
|
5
|
+
<packaging>jar</packaging>
|
6
|
+
<version>1.1.0</version>
|
7
|
+
<name>Iudex charset detection support</name>
|
8
|
+
|
9
|
+
<parent>
|
10
|
+
<groupId>iudex</groupId>
|
11
|
+
<artifactId>iudex-parent</artifactId>
|
12
|
+
<version>1.1</version>
|
13
|
+
<relativePath>..</relativePath>
|
14
|
+
</parent>
|
15
|
+
|
16
|
+
<dependencies>
|
17
|
+
|
18
|
+
<dependency>
|
19
|
+
<groupId>iudex</groupId>
|
20
|
+
<artifactId>iudex-core</artifactId>
|
21
|
+
<version>[1.1,1.2)</version>
|
22
|
+
</dependency>
|
23
|
+
|
24
|
+
<dependency>
|
25
|
+
<groupId>com.ibm.icu</groupId>
|
26
|
+
<artifactId>icu4j</artifactId>
|
27
|
+
<version>[4.8,4.8.9999]</version>
|
28
|
+
</dependency>
|
29
|
+
|
30
|
+
</dependencies>
|
31
|
+
|
32
|
+
<build>
|
33
|
+
<plugins>
|
34
|
+
<plugin>
|
35
|
+
<!-- Parent settings -->
|
36
|
+
<artifactId>maven-compiler-plugin</artifactId>
|
37
|
+
</plugin>
|
38
|
+
<plugin>
|
39
|
+
<!-- Parent settings -->
|
40
|
+
<artifactId>maven-source-plugin</artifactId>
|
41
|
+
</plugin>
|
42
|
+
</plugins>
|
43
|
+
</build>
|
44
|
+
|
45
|
+
</project>
|
Binary file
|
Binary file
|
data/test/setup.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
# Put the gem's lib directory (relative to this file) first on the
# load path, unless it is already listed.
lib_path = File.join( File.dirname( __FILE__ ), "..", "lib" )
$LOAD_PATH.include?( lib_path ) || $LOAD_PATH.unshift( lib_path )

require 'rubygems'
require 'rjack-logback'

# Log to stderr; raise the root level to DEBUG when a verbose flag
# is given on the command line.
RJack::Logback.config_console( :stderr => true )
verbose = [ '--verbose', '-v' ].any? { |flag| ARGV.include?( flag ) }
RJack::Logback.root.level = RJack::Logback::DEBUG if verbose

require 'minitest/unit'
require 'minitest/autorun'
|
33
|
+
|
34
|
+
# Make test output logging compatible: no partial lines.
|
35
|
+
# class TestOut
|
36
|
+
# def print( *a ); $stdout.puts( *a ); end
|
37
|
+
# def puts( *a ); $stdout.puts( *a ); end
|
38
|
+
# end
|
39
|
+
# MiniTest::Unit.output = TestOut.new
|
@@ -0,0 +1,156 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-char-detector'
|
23
|
+
|
24
|
+
class TestCharDetector < MiniTest::Unit::TestCase
|
25
|
+
include Gravitext::HTMap
|
26
|
+
UniMap.define_accessors
|
27
|
+
|
28
|
+
include Iudex::Core
|
29
|
+
include Iudex::CharDetector
|
30
|
+
|
31
|
+
import 'java.nio.ByteBuffer'
|
32
|
+
import 'java.nio.charset.Charset'
|
33
|
+
JString = Java::java.lang.String
|
34
|
+
|
35
|
+
SHORT_HTML = <<HTML
|
36
|
+
<html>
|
37
|
+
<head>
|
38
|
+
<title>Un documento electronica (titulo en ASCII)</title>
|
39
|
+
</head>
|
40
|
+
<body>
|
41
|
+
<p>¿De donde eres tú?</p>
|
42
|
+
</body>
|
43
|
+
</html>
|
44
|
+
HTML
|
45
|
+
|
46
|
+
# Empty and plain ASCII inputs should yield no detect buffer.
def test_find_nothing
  filter = CharDetectFilter.new
  filter.max_detect_length = 3

  inputs = [ "", "a", "ascii" ]
  inputs.each do |input|
    found = filter.find_detect_buffer( wrap( input ) )
    assert_nil( found, input )
  end
end
|
54
|
+
|
55
|
+
# Inputs containing a non-ASCII character, encoded as ISO-8859-1,
# should yield the expected detect buffer under a length limit of 3.
def test_find_something
  filter = CharDetectFilter.new
  filter.max_detect_length = 3

  # Each trial is [ input, expected buffer content ]
  trials = [ %w[ á á ],
             %w[ é. é. ],
             %w[ ..ü ..ü ],
             %w[ ..ü0 ..ü ],
             %w[ 0..í ..í ],
             %w[ 0..ó0 ..ó ] ]

  trials.each do |input, expected|
    encoded = encode_as( input, "ISO-8859-1" )
    out = filter.find_detect_buffer( wrap( encoded ) )
    assert( out, expected )

    decoded = JString.new( out, "ISO-8859-1" ).to_s
    assert_equal( expected, decoded, expected )
  end

end
|
73
|
+
|
74
|
+
# Empty and ASCII-only content should keep the UTF-8 default; no
# minimum confidence is required (0.0).
def test_ascii
  [ "", "ascii" ].each do |text|
    map = detect_from( text, "UTF-8" )
    assert_encoding( map.source, "UTF-8", 0.0 )
  end
end
|
81
|
+
|
82
|
+
# UTF-8 HTML with a UTF-8 default should remain UTF-8.
def test_html_utf8_as_default
  source = detect_from( SHORT_HTML, "UTF-8" ).source
  assert_encoding( source, "UTF-8", 0.80 )
end
|
86
|
+
|
87
|
+
# UTF-8 HTML with an ISO-8859-1 default should be detected as UTF-8.
def test_html_utf8_wrong_default
  source = detect_from( SHORT_HTML, "UTF-8", "ISO-8859-1" ).source
  assert_encoding( source, "UTF-8", 0.80 )
end
|
91
|
+
|
92
|
+
# ISO-8859-1 HTML with an ISO-8859-1 default should remain ISO-8859-1.
def test_html_iso_as_default
  source = detect_from( SHORT_HTML, "ISO-8859-1" ).source
  assert_encoding( source, "ISO-8859-1", 0.40 )
end
|
96
|
+
|
97
|
+
# ISO-8859-1 HTML with a UTF-8 default should be detected as ISO-8859-1.
def test_html_iso_wrong_default
  source = detect_from( SHORT_HTML, "ISO-8859-1", "UTF-8" ).source
  assert_encoding( source, "ISO-8859-1", 0.40 )
end
|
101
|
+
|
102
|
+
# SHORT_HTML encoded as (and defaulted to) windows-1252 is expected
# to come back as ISO-8859-1.
def test_html_iso_from_windows
  source = detect_from( SHORT_HTML, "windows-1252" ).source
  assert_encoding( source, "ISO-8859-1", 0.40 )
end
|
106
|
+
|
107
|
+
# Curly-quoted text encoded as windows-1252, with that same default,
# should remain windows-1252.
def test_windows_default
  source = detect_from( '“¿De donde eres tú?”', "windows-1252" ).source
  assert_encoding( source, "windows-1252", 0.90 )
end
|
111
|
+
|
112
|
+
# Curly-quoted text encoded as windows-1252 but with a UTF-8 default
# should be detected as windows-1252.
def test_windows_wrong_default
  text = '“¿De donde eres tú?”'
  source = detect_from( text, "windows-1252", "UTF-8" ).source
  assert_encoding( source, "windows-1252", 0.90 )
end
|
116
|
+
|
117
|
+
def test_mojibaked_utf8
|
118
|
+
map = detect_from( 'âquotedâ€', "UTF-8" )
|
119
|
+
assert_encoding( map.source, "UTF-8", 0.99 )
|
120
|
+
end
|
121
|
+
|
122
|
+
# Encode bytes as enc, wrap it as content with claimed_enc (or enc if
# not given) as the default encoding, and run it through a new
# CharDetectFilter, asserting that the filter accepts. Returns the
# filtered map.
def detect_from( bytes, enc, claimed_enc = nil )
  claimed_enc ||= enc
  encoded = encode_as( bytes, enc )
  map = content( encoded, claimed_enc )

  filter = CharDetectFilter.new
  filter.max_detect_length = SHORT_HTML.length - 20
  assert( filter.filter( map ) )

  map
end
|
129
|
+
|
130
|
+
# Assert that source's default encoding is named enc and that its
# encoding confidence is at least min_confidence.
def assert_encoding( source, enc, min_confidence = 0.10 )
  detected = source.default_encoding.name
  assert_equal( enc, detected )

  confidence = source.encoding_confidence
  assert_operator( confidence, :>=, min_confidence )
end
|
134
|
+
|
135
|
+
# Return bytes unchanged when encoding is "UTF-8"; otherwise decode
# them as UTF-8 into a Java String and return its bytes in encoding.
def encode_as( bytes, encoding )
  return bytes if encoding == "UTF-8"

  raw = bytes.respond_to?( :to_java_bytes ) ? bytes.to_java_bytes : bytes
  JString.new( raw, "UTF-8" ).bytes( encoding )
end
|
143
|
+
|
144
|
+
# Build a new UniMap whose source is a ContentSource over bytes, with
# the named charset set as its default encoding.
def content( bytes, charset = "UTF-8" )
  buffer = wrap( bytes )
  encoding = Charset.lookup( charset )

  map = UniMap.new
  map.source = ContentSource.new( buffer )
  map.source.set_default_encoding( encoding )
  map
end
|
150
|
+
|
151
|
+
# Wrap bytes in a ByteBuffer, first converting via to_java_bytes when
# the argument supports it.
def wrap( bytes )
  raw = bytes.respond_to?( :to_java_bytes ) ? bytes.to_java_bytes : bytes
  ByteBuffer.wrap( raw )
end
|
155
|
+
|
156
|
+
end
|
metadata
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-char-detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.1.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-11-13 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: iudex-core
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.1.0
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: rjack-icu
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ~>
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 4.8.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: minitest
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: "2.3"
|
46
|
+
type: :development
|
47
|
+
version_requirements: *id003
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rjack-logback
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ~>
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "1.0"
|
57
|
+
type: :development
|
58
|
+
version_requirements: *id004
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: rjack-tarpit
|
61
|
+
prerelease: false
|
62
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ~>
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: 1.4.0
|
68
|
+
type: :development
|
69
|
+
version_requirements: *id005
|
70
|
+
description: |-
|
71
|
+
Iudex is a general purpose web crawler and feed processor in
|
72
|
+
ruby/java. The iudex-char-detector gem provides charset detection
|
73
|
+
support.
|
74
|
+
email:
|
75
|
+
- dek-oss@gravitext.com
|
76
|
+
executables:
|
77
|
+
- iudex-char-detect
|
78
|
+
extensions: []
|
79
|
+
|
80
|
+
extra_rdoc_files:
|
81
|
+
- Manifest.txt
|
82
|
+
- History.rdoc
|
83
|
+
- README.rdoc
|
84
|
+
files:
|
85
|
+
- History.rdoc
|
86
|
+
- Manifest.txt
|
87
|
+
- README.rdoc
|
88
|
+
- Rakefile
|
89
|
+
- pom.xml
|
90
|
+
- bin/iudex-char-detect
|
91
|
+
- lib/iudex-char-detector/base.rb
|
92
|
+
- lib/iudex-char-detector.rb
|
93
|
+
- test/sample.html.ascii
|
94
|
+
- test/sample.html.iso
|
95
|
+
- test/sample.html.utf16
|
96
|
+
- test/sample.html.utf16le
|
97
|
+
- test/sample.html.utf8
|
98
|
+
- test/setup.rb
|
99
|
+
- test/test_char_detector.rb
|
100
|
+
- lib/iudex-char-detector/iudex-char-detector-1.1.0.jar
|
101
|
+
- .gemtest
|
102
|
+
homepage: http://github.com/dekellum/iudex
|
103
|
+
licenses: []
|
104
|
+
|
105
|
+
post_install_message:
|
106
|
+
rdoc_options:
|
107
|
+
- --main
|
108
|
+
- README.rdoc
|
109
|
+
require_paths:
|
110
|
+
- lib
|
111
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: "0"
|
117
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
+
none: false
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: "0"
|
123
|
+
requirements: []
|
124
|
+
|
125
|
+
rubyforge_project: iudex-char-detector
|
126
|
+
rubygems_version: 1.8.9
|
127
|
+
signing_key:
|
128
|
+
specification_version: 3
|
129
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
130
|
+
test_files:
|
131
|
+
- test/test_char_detector.rb
|