charlock_holmes 0.6.9.1 → 0.6.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +3 -11
- data/Rakefile +5 -18
- data/charlock_holmes.gemspec +1 -1
- data/ext/charlock_holmes/encoding_detector.c +1 -5
- data/ext/charlock_holmes/ext.c +2 -0
- data/ext/charlock_holmes/extconf.rb +1 -0
- data/ext/charlock_holmes/transliterator.cpp +86 -0
- data/lib/charlock_holmes/version.rb +1 -1
- data/{spec/converter_spec.rb → test/converter_test.rb} +8 -7
- data/{spec/encoding_detector_spec.rb → test/encoding_detector_test.rb} +32 -35
- data/{spec → test}/fixtures/AnsiGraph.psm1 +0 -0
- data/{spec → test}/fixtures/TwigExtensionsDate.es.yml +0 -0
- data/{spec → test}/fixtures/cl-messagepack.lisp +0 -0
- data/{spec → test}/fixtures/core.rkt +0 -0
- data/{spec → test}/fixtures/hello_world +0 -0
- data/{spec → test}/fixtures/laholator.py +0 -0
- data/{spec → test}/fixtures/repl2.cljs +0 -0
- data/test/helper.rb +14 -0
- data/{spec/string_method_spec.rb → test/string_methods_test.rb} +7 -7
- data/test/transliterator_test.rb +119 -0
- metadata +75 -101
- data/.rspec +0 -3
- data/spec/spec_helper.rb +0 -9
data/Gemfile.lock
CHANGED
|
@@ -1,24 +1,16 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
charlock_holmes (0.6.9.1)
|
|
4
|
+
charlock_holmes (0.6.9.2)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: http://rubygems.org/
|
|
8
8
|
specs:
|
|
9
9
|
chardet (0.9.0)
|
|
10
|
-
|
|
10
|
+
minitest (4.6.2)
|
|
11
11
|
rake (0.9.2)
|
|
12
12
|
rake-compiler (0.7.9)
|
|
13
13
|
rake
|
|
14
|
-
rspec (2.6.0)
|
|
15
|
-
rspec-core (~> 2.6.0)
|
|
16
|
-
rspec-expectations (~> 2.6.0)
|
|
17
|
-
rspec-mocks (~> 2.6.0)
|
|
18
|
-
rspec-core (2.6.4)
|
|
19
|
-
rspec-expectations (2.6.0)
|
|
20
|
-
diff-lcs (~> 1.1.2)
|
|
21
|
-
rspec-mocks (2.6.0)
|
|
22
14
|
|
|
23
15
|
PLATFORMS
|
|
24
16
|
ruby
|
|
@@ -26,5 +18,5 @@ PLATFORMS
|
|
|
26
18
|
DEPENDENCIES
|
|
27
19
|
chardet
|
|
28
20
|
charlock_holmes!
|
|
21
|
+
minitest
|
|
29
22
|
rake-compiler (>= 0.7.5)
|
|
30
|
-
rspec (>= 2.0.0)
|
data/Rakefile
CHANGED
|
@@ -1,23 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
begin
|
|
3
|
-
require 'rspec'
|
|
4
|
-
require 'rspec/core/rake_task'
|
|
1
|
+
require 'rake/testtask'
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
t.rcov = true
|
|
9
|
-
end
|
|
10
|
-
RSpec::Core::RakeTask.new 'spec' do |t|
|
|
11
|
-
t.verbose = true
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
task :default => :spec
|
|
15
|
-
rescue LoadError
|
|
16
|
-
puts "rspec, or one of its dependencies, is not available. Install it with: sudo gem install rspec"
|
|
3
|
+
Rake::TestTask.new do |t|
|
|
4
|
+
t.pattern = "test/**/*_test.rb"
|
|
17
5
|
end
|
|
18
6
|
|
|
19
|
-
|
|
20
|
-
require 'rake' unless defined? Rake
|
|
7
|
+
task :default => :test
|
|
21
8
|
|
|
22
9
|
gem 'rake-compiler', '>= 0.7.5'
|
|
23
10
|
require "rake/extensiontask"
|
|
@@ -26,4 +13,4 @@ Rake::ExtensionTask.new 'charlock_holmes' do |ext|
|
|
|
26
13
|
ext.lib_dir = File.join 'lib', 'charlock_holmes'
|
|
27
14
|
end
|
|
28
15
|
|
|
29
|
-
Rake::Task[:spec].prerequisites << :compile
|
|
16
|
+
Rake::Task[:test].prerequisites << :compile
|
data/charlock_holmes.gemspec
CHANGED
|
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
|
19
19
|
|
|
20
20
|
# tests
|
|
21
21
|
s.add_development_dependency 'rake-compiler', ">= 0.7.5"
|
|
22
|
-
s.add_development_dependency 'rspec', ">= 2.0.0"
|
|
22
|
+
s.add_development_dependency 'minitest'
|
|
23
23
|
# benchmarks
|
|
24
24
|
s.add_development_dependency 'chardet'
|
|
25
25
|
end
|
|
@@ -274,15 +274,11 @@ static VALUE rb_encdec__alloc(VALUE klass)
|
|
|
274
274
|
rb_raise(rb_eStandardError, "%s", u_errorName(status));
|
|
275
275
|
}
|
|
276
276
|
|
|
277
|
-
detector->magic = magic_open(
|
|
277
|
+
detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
|
|
278
278
|
if (detector->magic == NULL) {
|
|
279
279
|
rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
-
// load the libmagic database
|
|
283
|
-
// NULL means use the default or whatever is specified by the MAGIC env var
|
|
284
|
-
magic_load(detector->magic, NULL);
|
|
285
|
-
|
|
286
282
|
return obj;
|
|
287
283
|
}
|
|
288
284
|
|
data/ext/charlock_holmes/ext.c
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
extern void _init_charlock_encoding_detector();
|
|
4
4
|
extern void _init_charlock_converter();
|
|
5
|
+
extern void _init_charlock_transliterator();
|
|
5
6
|
|
|
6
7
|
VALUE rb_mCharlockHolmes;
|
|
7
8
|
|
|
@@ -10,4 +11,5 @@ void Init_charlock_holmes() {
|
|
|
10
11
|
|
|
11
12
|
_init_charlock_encoding_detector();
|
|
12
13
|
_init_charlock_converter();
|
|
14
|
+
_init_charlock_transliterator();
|
|
13
15
|
}
|
|
@@ -58,6 +58,7 @@ Dir.chdir("#{CWD}/src") do
|
|
|
58
58
|
sys("tar zxvf #{src}")
|
|
59
59
|
Dir.chdir(dir) do
|
|
60
60
|
sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
|
|
61
|
+
sys("patch -p0 < ../file-soft-check.patch")
|
|
61
62
|
sys("make -C src install")
|
|
62
63
|
sys("make -C magic install")
|
|
63
64
|
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#include "common.h"
|
|
2
|
+
#undef UChar
|
|
3
|
+
|
|
4
|
+
#include <unicode/translit.h>
|
|
5
|
+
|
|
6
|
+
extern "C" {
|
|
7
|
+
|
|
8
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
|
9
|
+
#include <ruby/encoding.h>
|
|
10
|
+
static VALUE rb_eEncodingCompatibilityError;
|
|
11
|
+
|
|
12
|
+
static void check_utf8_encoding(VALUE str) {
|
|
13
|
+
static rb_encoding *_cached[3] = {NULL, NULL, NULL};
|
|
14
|
+
rb_encoding *enc;
|
|
15
|
+
|
|
16
|
+
if (_cached[0] == NULL) {
|
|
17
|
+
_cached[0] = rb_utf8_encoding();
|
|
18
|
+
_cached[1] = rb_usascii_encoding();
|
|
19
|
+
_cached[2] = rb_ascii8bit_encoding();
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
enc = rb_enc_get(str);
|
|
23
|
+
if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) {
|
|
24
|
+
rb_raise(rb_eEncodingCompatibilityError,
|
|
25
|
+
"Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc));
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
#else
|
|
30
|
+
static void check_utf8_encoding(VALUE str) {}
|
|
31
|
+
#endif
|
|
32
|
+
|
|
33
|
+
extern VALUE rb_mCharlockHolmes;
|
|
34
|
+
static VALUE rb_cTransliterator;
|
|
35
|
+
|
|
36
|
+
static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
|
|
37
|
+
UErrorCode status = U_ZERO_ERROR;
|
|
38
|
+
UParseError p_error;
|
|
39
|
+
Transliterator *trans;
|
|
40
|
+
const char *txt;
|
|
41
|
+
size_t txt_len;
|
|
42
|
+
const char *id;
|
|
43
|
+
size_t id_len;
|
|
44
|
+
UnicodeString *u_txt;
|
|
45
|
+
std::string result;
|
|
46
|
+
VALUE rb_out;
|
|
47
|
+
|
|
48
|
+
Check_Type(rb_txt, T_STRING);
|
|
49
|
+
Check_Type(rb_id, T_STRING);
|
|
50
|
+
|
|
51
|
+
check_utf8_encoding(rb_txt);
|
|
52
|
+
check_utf8_encoding(rb_id);
|
|
53
|
+
|
|
54
|
+
txt = RSTRING_PTR(rb_txt);
|
|
55
|
+
txt_len = RSTRING_LEN(rb_txt);
|
|
56
|
+
id = RSTRING_PTR(rb_id);
|
|
57
|
+
id_len = RSTRING_LEN(rb_id);
|
|
58
|
+
|
|
59
|
+
trans = Transliterator::createInstance(UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
|
|
60
|
+
if(!U_SUCCESS(status)) {
|
|
61
|
+
rb_raise(rb_eArgError, "%s", u_errorName(status));
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
u_txt = new UnicodeString(txt, txt_len);
|
|
65
|
+
trans->transliterate(*u_txt);
|
|
66
|
+
result = u_txt->toUTF8String(result);
|
|
67
|
+
|
|
68
|
+
delete u_txt;
|
|
69
|
+
delete trans;
|
|
70
|
+
|
|
71
|
+
rb_out = charlock_new_str(result.data(), result.length());
|
|
72
|
+
|
|
73
|
+
return rb_out;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
void _init_charlock_transliterator() {
|
|
77
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
|
78
|
+
rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
|
|
79
|
+
#endif
|
|
80
|
+
|
|
81
|
+
rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
|
|
82
|
+
|
|
83
|
+
rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
}
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
describe CharlockHolmes::Converter do
|
|
6
|
-
test 'is able to convert regular ascii content from ISO-8859-1 to UTF-16, and back again' do
|
|
4
|
+
class ConverterTest < MiniTest::Unit::TestCase
|
|
5
|
+
def test_convert_ascii_from_iso859_1_to_utf16_and_back
|
|
7
6
|
input = 'test'
|
|
8
7
|
|
|
9
8
|
output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
|
|
@@ -15,7 +14,7 @@ describe CharlockHolmes::Converter do
|
|
|
15
14
|
assert input == output
|
|
16
15
|
end
|
|
17
16
|
|
|
18
|
-
|
|
17
|
+
def test_convert_utf8_to_utf16_and_back
|
|
19
18
|
input = 'λ, λ, λ'
|
|
20
19
|
|
|
21
20
|
output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
|
|
@@ -27,7 +26,7 @@ describe CharlockHolmes::Converter do
|
|
|
27
26
|
assert input == output
|
|
28
27
|
end
|
|
29
28
|
|
|
30
|
-
|
|
29
|
+
def test_params_must_be_strings
|
|
31
30
|
assert_raises TypeError do
|
|
32
31
|
CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16'
|
|
33
32
|
end
|
|
@@ -40,8 +39,10 @@ describe CharlockHolmes::Converter do
|
|
|
40
39
|
CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil
|
|
41
40
|
end
|
|
42
41
|
|
|
43
|
-
|
|
42
|
+
begin
|
|
44
43
|
CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16'
|
|
44
|
+
rescue Exception => e
|
|
45
|
+
assert_nil e, "#{e.class.name} raised, expected nothing"
|
|
45
46
|
end
|
|
46
47
|
end
|
|
47
48
|
end
|
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
describe CharlockHolmes::EncodingDetector do
|
|
6
|
-
before :all do
|
|
4
|
+
class EncodingDetectorTest < MiniTest::Unit::TestCase
|
|
5
|
+
def setup
|
|
7
6
|
@detector = CharlockHolmes::EncodingDetector.new
|
|
8
7
|
end
|
|
9
8
|
|
|
10
|
-
|
|
9
|
+
def test_has_class_level_detect_method
|
|
11
10
|
CharlockHolmes::EncodingDetector.respond_to? :detect
|
|
12
11
|
detected = CharlockHolmes::EncodingDetector.detect 'test'
|
|
13
12
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
14
13
|
end
|
|
15
14
|
|
|
16
|
-
|
|
15
|
+
def test_class_level_detect_accepts_encoding_hint
|
|
17
16
|
CharlockHolmes::EncodingDetector.respond_to? :detect
|
|
18
17
|
detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
|
|
19
18
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
20
19
|
end
|
|
21
20
|
|
|
22
|
-
|
|
21
|
+
def test_has_class_level_detect_all_method
|
|
23
22
|
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
|
24
23
|
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
|
|
25
24
|
assert detected_list.is_a? Array
|
|
@@ -28,7 +27,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
28
27
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
|
29
28
|
end
|
|
30
29
|
|
|
31
|
-
|
|
30
|
+
def test_class_level_detect_all_method_accepts_encoding_hint
|
|
32
31
|
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
|
33
32
|
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
|
|
34
33
|
assert detected_list.is_a? Array
|
|
@@ -37,19 +36,19 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
37
36
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
|
38
37
|
end
|
|
39
38
|
|
|
40
|
-
|
|
39
|
+
def test_has_detect_method
|
|
41
40
|
@detector.respond_to? :detect
|
|
42
41
|
detected = @detector.detect 'test'
|
|
43
42
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
44
43
|
end
|
|
45
44
|
|
|
46
|
-
|
|
45
|
+
def test_detect_accepts_encoding_hint
|
|
47
46
|
@detector.respond_to? :detect
|
|
48
47
|
detected = @detector.detect 'test', 'UTF-8'
|
|
49
48
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
50
49
|
end
|
|
51
50
|
|
|
52
|
-
|
|
51
|
+
def test_has_detect_all_method
|
|
53
52
|
@detector.respond_to? :detect_all
|
|
54
53
|
detected_list = @detector.detect_all 'test'
|
|
55
54
|
assert detected_list.is_a? Array
|
|
@@ -58,7 +57,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
58
57
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
|
59
58
|
end
|
|
60
59
|
|
|
61
|
-
|
|
60
|
+
def test_detect_all_accepts_encoding_hint
|
|
62
61
|
@detector.respond_to? :detect_all
|
|
63
62
|
detected_list = @detector.detect_all 'test', 'UTF-8'
|
|
64
63
|
assert detected_list.is_a? Array
|
|
@@ -67,7 +66,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
67
66
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
|
68
67
|
end
|
|
69
68
|
|
|
70
|
-
|
|
69
|
+
def test_strip_tags_flag
|
|
71
70
|
detector = CharlockHolmes::EncodingDetector.new
|
|
72
71
|
detector.strip_tags = true
|
|
73
72
|
assert detector.strip_tags
|
|
@@ -82,7 +81,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
82
81
|
assert_equal 'UTF-8', detection[:encoding]
|
|
83
82
|
end
|
|
84
83
|
|
|
85
|
-
|
|
84
|
+
def test_has_list_of_supported_encodings
|
|
86
85
|
CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
|
|
87
86
|
supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
|
|
88
87
|
|
|
@@ -90,32 +89,30 @@ describe CharlockHolmes::EncodingDetector do
|
|
|
90
89
|
assert supported_encodings.include? 'UTF-8'
|
|
91
90
|
end
|
|
92
91
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
92
|
+
MAPPING = [
|
|
93
|
+
['repl2.cljs', 'ISO-8859-1', :text],
|
|
94
|
+
['core.rkt', 'UTF-8', :text],
|
|
95
|
+
['cl-messagepack.lisp', 'ISO-8859-1', :text],
|
|
96
|
+
['TwigExtensionsDate.es.yml', 'UTF-8', :text],
|
|
97
|
+
['AnsiGraph.psm1', 'UTF-16LE', :text],
|
|
98
|
+
['laholator.py', 'UTF-8', :text],
|
|
99
|
+
['hello_world', nil, :binary]
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
def test_detection_works_as_expected
|
|
104
103
|
MAPPING.each do |mapping|
|
|
105
104
|
file, encoding, type = mapping
|
|
106
105
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
guessed = @detector.detect content
|
|
106
|
+
path = File.expand_path "../fixtures/#{file}", __FILE__
|
|
107
|
+
content = File.read path
|
|
108
|
+
guessed = @detector.detect content
|
|
111
109
|
|
|
112
|
-
|
|
113
|
-
|
|
110
|
+
assert_equal encoding, guessed[:encoding]
|
|
111
|
+
assert_equal type, guessed[:type]
|
|
114
112
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
end
|
|
113
|
+
if content.respond_to?(:force_encoding) && guessed[:type] == :text
|
|
114
|
+
content.force_encoding guessed[:encoding]
|
|
115
|
+
assert content.valid_encoding?
|
|
119
116
|
end
|
|
120
117
|
end
|
|
121
118
|
end
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/test/helper.rb
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Basic test environment.
|
|
2
|
+
|
|
3
|
+
# blah fuck this
|
|
4
|
+
require 'rubygems' if !defined?(Gem)
|
|
5
|
+
require 'bundler/setup'
|
|
6
|
+
|
|
7
|
+
require 'charlock_holmes'
|
|
8
|
+
|
|
9
|
+
# bring in minitest
|
|
10
|
+
require 'minitest/autorun'
|
|
11
|
+
|
|
12
|
+
# put lib and test dirs directly on load path
|
|
13
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
|
14
|
+
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
require
|
|
1
|
+
require File.expand_path("../helper", __FILE__)
|
|
2
2
|
require 'charlock_holmes/string'
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
class StringMethodsTest < MiniTest::Unit::TestCase
|
|
5
|
+
def test_adds_detect_encoding_method
|
|
6
6
|
str = 'test'
|
|
7
7
|
str.respond_to? :detect_encoding
|
|
8
8
|
|
|
@@ -10,7 +10,7 @@ describe String do
|
|
|
10
10
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
def test_detect_encoding_accepts_encoding_hint_param
|
|
14
14
|
str = 'test'
|
|
15
15
|
str.respond_to? :detect_encoding
|
|
16
16
|
|
|
@@ -18,7 +18,7 @@ describe String do
|
|
|
18
18
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
def test_adds_detect_encodings_method
|
|
22
22
|
str = 'test'
|
|
23
23
|
str.respond_to? :detect_encodings
|
|
24
24
|
|
|
@@ -29,7 +29,7 @@ describe String do
|
|
|
29
29
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
def test_detect_encodings_accepts_encoding_hint_param
|
|
33
33
|
str = 'test'
|
|
34
34
|
str.respond_to? :detect_encodings
|
|
35
35
|
|
|
@@ -41,7 +41,7 @@ describe String do
|
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
if RUBY_VERSION =~ /1.9/
|
|
44
|
-
|
|
44
|
+
def test_adds_detect_encoding_bang_method
|
|
45
45
|
str = 'test'
|
|
46
46
|
str.respond_to? :detect_encoding!
|
|
47
47
|
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
|
3
|
+
|
|
4
|
+
class TransliteratorTest < MiniTest::Unit::TestCase
|
|
5
|
+
DONT_CONVERT = [
|
|
6
|
+
"Vitrum edere possum; mihi non nocet.", # Latin
|
|
7
|
+
"Je puis mangier del voirre. Ne me nuit.", # Old French
|
|
8
|
+
"Kristala jan dezaket, ez dit minik ematen.", # Basque
|
|
9
|
+
"Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
|
|
10
|
+
"Ich kann Glas essen, ohne mir weh zu tun.", # German
|
|
11
|
+
"I can eat glass and it doesn't hurt me.", # English
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
CONVERT_PAIRS = {
|
|
15
|
+
"Je peux manger du verre, ça ne me fait pas de mal." => # French
|
|
16
|
+
"Je peux manger du verre, ca ne me fait pas de mal.",
|
|
17
|
+
"Pot să mănânc sticlă și ea nu mă rănește." => # Romanian
|
|
18
|
+
"Pot sa mananc sticla si ea nu ma raneste.",
|
|
19
|
+
"Ég get etið gler án þess að meiða mig." => # Icelandic
|
|
20
|
+
"Eg get etid gler an thess ad meida mig.",
|
|
21
|
+
"Unë mund të ha qelq dhe nuk më gjen gjë." => # Albanian
|
|
22
|
+
"Une mund te ha qelq dhe nuk me gjen gje.",
|
|
23
|
+
"Mogę jeść szkło i mi nie szkodzi." => # Polish
|
|
24
|
+
"Moge jesc szklo i mi nie szkodzi.",
|
|
25
|
+
# "Я могу есть стекло, оно мне не вредит." => # Russian
|
|
26
|
+
# "Ia moghu iest' stieklo, ono mnie nie vriedit.",
|
|
27
|
+
# "Мога да ям стъкло, то не ми вреди." => # Bulgarian
|
|
28
|
+
# "Mogha da iam stklo, to nie mi vriedi.",
|
|
29
|
+
# "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => # Anglo-Saxon
|
|
30
|
+
# "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
|
|
31
|
+
# "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => # Classical Greek
|
|
32
|
+
# "ualon phagein dunamai; touto ou me blaptei",
|
|
33
|
+
# "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => # Hindi
|
|
34
|
+
# "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
|
|
35
|
+
# "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
|
|
36
|
+
# "mn my twnm bdwni Hss drd shyshh bkhwrm",
|
|
37
|
+
# "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
|
|
38
|
+
# "'n qdr 'l~ 'kl lzjj w hdh l yw'lmn",
|
|
39
|
+
# "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
|
|
40
|
+
# "ny ykvl lkvl zkvkyt vzh l mzyq ly",
|
|
41
|
+
# "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => # Thai
|
|
42
|
+
# "chankinkracchkaid aetmanaimthamaihchanecchb",
|
|
43
|
+
# "我能吞下玻璃而不伤身体。" => # Chinese
|
|
44
|
+
# "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
|
|
45
|
+
# "私はガラスを食べられます。それは私を傷つけません。" => # Japanese
|
|
46
|
+
# "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
|
|
47
|
+
# "⠋⠗⠁⠝⠉⠑" => # Braille
|
|
48
|
+
# "france",
|
|
49
|
+
"Schloß - Assunção - Łódź" =>
|
|
50
|
+
"Schloss - Assuncao - Lodz",
|
|
51
|
+
"TÜM GOLLER Fb 4-1 Bursa Maç Özeti Íƶle" =>
|
|
52
|
+
"TUM GOLLER Fb 4-1 Bursa Mac Ozeti Izle",
|
|
53
|
+
"ßßßßß" => "ssssssssss"
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
def test_transliterate
|
|
57
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
|
|
58
|
+
|
|
59
|
+
DONT_CONVERT.each do |subject|
|
|
60
|
+
assert_equal subject, trans(subject, trans_id)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
CONVERT_PAIRS.each do |before, after|
|
|
64
|
+
assert_equal after, trans(before, trans_id)
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
if "".respond_to? :force_encoding
|
|
69
|
+
def test_transliterate_id_must_be_utf8_or_ascii
|
|
70
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC".force_encoding('big5')
|
|
71
|
+
txt = "blah blah blah"
|
|
72
|
+
|
|
73
|
+
assert_raises Encoding::CompatibilityError do
|
|
74
|
+
trans(txt, trans_id)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
trans_id.force_encoding('UTF-8')
|
|
78
|
+
begin
|
|
79
|
+
trans(txt, trans_id)
|
|
80
|
+
rescue Encoding::CompatibilityError => e
|
|
81
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
trans_id.force_encoding('US-ASCII')
|
|
85
|
+
begin
|
|
86
|
+
trans(txt, trans_id)
|
|
87
|
+
rescue Encoding::CompatibilityError => e
|
|
88
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def test_transliterate_text_must_be_utf8_or_ascii
|
|
93
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
|
|
94
|
+
txt = "blah blah blah".force_encoding('big5')
|
|
95
|
+
|
|
96
|
+
assert_raises Encoding::CompatibilityError do
|
|
97
|
+
trans(txt, trans_id)
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
txt.force_encoding('UTF-8')
|
|
101
|
+
begin
|
|
102
|
+
trans(txt, trans_id)
|
|
103
|
+
rescue Encoding::CompatibilityError => e
|
|
104
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
txt.force_encoding('US-ASCII')
|
|
108
|
+
begin
|
|
109
|
+
trans(txt, trans_id)
|
|
110
|
+
rescue Encoding::CompatibilityError => e
|
|
111
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def trans(text, id)
|
|
117
|
+
CharlockHolmes::Transliterator.transliterate(text, id)
|
|
118
|
+
end
|
|
119
|
+
end
|
metadata
CHANGED
|
@@ -1,82 +1,73 @@
|
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: charlock_holmes
|
|
3
|
-
version: !ruby/object:Gem::Version
|
|
4
|
-
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.6.9.2
|
|
5
5
|
prerelease:
|
|
6
|
-
segments:
|
|
7
|
-
- 0
|
|
8
|
-
- 6
|
|
9
|
-
- 9
|
|
10
|
-
- 1
|
|
11
|
-
version: 0.6.9.1
|
|
12
6
|
platform: ruby
|
|
13
|
-
authors:
|
|
7
|
+
authors:
|
|
14
8
|
- Brian Lopez
|
|
15
|
-
-
|
|
9
|
+
- Vicent Martí
|
|
16
10
|
autorequire:
|
|
17
11
|
bindir: bin
|
|
18
12
|
cert_chain: []
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
dependencies:
|
|
23
|
-
- !ruby/object:Gem::Dependency
|
|
13
|
+
date: 2013-03-20 00:00:00.000000000 Z
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
24
16
|
name: rake-compiler
|
|
25
|
-
|
|
26
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
|
27
18
|
none: false
|
|
28
|
-
requirements:
|
|
29
|
-
- -
|
|
30
|
-
- !ruby/object:Gem::Version
|
|
31
|
-
hash: 9
|
|
32
|
-
segments:
|
|
33
|
-
- 0
|
|
34
|
-
- 7
|
|
35
|
-
- 5
|
|
19
|
+
requirements:
|
|
20
|
+
- - ! '>='
|
|
21
|
+
- !ruby/object:Gem::Version
|
|
36
22
|
version: 0.7.5
|
|
37
23
|
type: :development
|
|
38
|
-
version_requirements: *id001
|
|
39
|
-
- !ruby/object:Gem::Dependency
|
|
40
|
-
name: rspec
|
|
41
24
|
prerelease: false
|
|
42
|
-
|
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
26
|
+
none: false
|
|
27
|
+
requirements:
|
|
28
|
+
- - ! '>='
|
|
29
|
+
- !ruby/object:Gem::Version
|
|
30
|
+
version: 0.7.5
|
|
31
|
+
- !ruby/object:Gem::Dependency
|
|
32
|
+
name: minitest
|
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
34
|
none: false
|
|
44
|
-
requirements:
|
|
45
|
-
- -
|
|
46
|
-
- !ruby/object:Gem::Version
|
|
47
|
-
|
|
48
|
-
segments:
|
|
49
|
-
- 2
|
|
50
|
-
- 0
|
|
51
|
-
- 0
|
|
52
|
-
version: 2.0.0
|
|
35
|
+
requirements:
|
|
36
|
+
- - ! '>='
|
|
37
|
+
- !ruby/object:Gem::Version
|
|
38
|
+
version: '0'
|
|
53
39
|
type: :development
|
|
54
|
-
version_requirements: *id002
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: chardet
|
|
57
40
|
prerelease: false
|
|
58
|
-
|
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
59
42
|
none: false
|
|
60
|
-
requirements:
|
|
61
|
-
- -
|
|
62
|
-
- !ruby/object:Gem::Version
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
43
|
+
requirements:
|
|
44
|
+
- - ! '>='
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0'
|
|
47
|
+
- !ruby/object:Gem::Dependency
|
|
48
|
+
name: chardet
|
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
|
50
|
+
none: false
|
|
51
|
+
requirements:
|
|
52
|
+
- - ! '>='
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
67
55
|
type: :development
|
|
68
|
-
|
|
56
|
+
prerelease: false
|
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
58
|
+
none: false
|
|
59
|
+
requirements:
|
|
60
|
+
- - ! '>='
|
|
61
|
+
- !ruby/object:Gem::Version
|
|
62
|
+
version: '0'
|
|
69
63
|
description:
|
|
70
64
|
email: seniorlopez@gmail.com
|
|
71
65
|
executables: []
|
|
72
|
-
|
|
73
|
-
extensions:
|
|
66
|
+
extensions:
|
|
74
67
|
- ext/charlock_holmes/extconf.rb
|
|
75
68
|
extra_rdoc_files: []
|
|
76
|
-
|
|
77
|
-
files:
|
|
69
|
+
files:
|
|
78
70
|
- .gitignore
|
|
79
|
-
- .rspec
|
|
80
71
|
- Gemfile
|
|
81
72
|
- Gemfile.lock
|
|
82
73
|
- MIT-LICENSE
|
|
@@ -91,64 +82,47 @@ files:
|
|
|
91
82
|
- ext/charlock_holmes/ext.c
|
|
92
83
|
- ext/charlock_holmes/extconf.rb
|
|
93
84
|
- ext/charlock_holmes/src/file-5.08.tar.gz
|
|
85
|
+
- ext/charlock_holmes/src/file-soft-check.patch
|
|
86
|
+
- ext/charlock_holmes/transliterator.cpp
|
|
94
87
|
- lib/charlock_holmes.rb
|
|
95
88
|
- lib/charlock_holmes/encoding_detector.rb
|
|
96
89
|
- lib/charlock_holmes/string.rb
|
|
97
90
|
- lib/charlock_holmes/version.rb
|
|
98
|
-
-
|
|
99
|
-
-
|
|
100
|
-
-
|
|
101
|
-
-
|
|
102
|
-
-
|
|
103
|
-
-
|
|
104
|
-
-
|
|
105
|
-
-
|
|
106
|
-
-
|
|
107
|
-
-
|
|
108
|
-
-
|
|
109
|
-
|
|
91
|
+
- test/converter_test.rb
|
|
92
|
+
- test/encoding_detector_test.rb
|
|
93
|
+
- test/fixtures/AnsiGraph.psm1
|
|
94
|
+
- test/fixtures/TwigExtensionsDate.es.yml
|
|
95
|
+
- test/fixtures/cl-messagepack.lisp
|
|
96
|
+
- test/fixtures/core.rkt
|
|
97
|
+
- test/fixtures/hello_world
|
|
98
|
+
- test/fixtures/laholator.py
|
|
99
|
+
- test/fixtures/repl2.cljs
|
|
100
|
+
- test/helper.rb
|
|
101
|
+
- test/string_methods_test.rb
|
|
102
|
+
- test/transliterator_test.rb
|
|
110
103
|
homepage: http://github.com/brianmario/charlock_holmes
|
|
111
104
|
licenses: []
|
|
112
|
-
|
|
113
105
|
post_install_message:
|
|
114
|
-
rdoc_options:
|
|
106
|
+
rdoc_options:
|
|
115
107
|
- --charset=UTF-8
|
|
116
|
-
require_paths:
|
|
108
|
+
require_paths:
|
|
117
109
|
- lib
|
|
118
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
|
110
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
111
|
none: false
|
|
120
|
-
requirements:
|
|
121
|
-
- -
|
|
122
|
-
- !ruby/object:Gem::Version
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
- 0
|
|
126
|
-
version: "0"
|
|
127
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
|
+
requirements:
|
|
113
|
+
- - ! '>='
|
|
114
|
+
- !ruby/object:Gem::Version
|
|
115
|
+
version: '0'
|
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
117
|
none: false
|
|
129
|
-
requirements:
|
|
130
|
-
- -
|
|
131
|
-
- !ruby/object:Gem::Version
|
|
132
|
-
|
|
133
|
-
segments:
|
|
134
|
-
- 0
|
|
135
|
-
version: "0"
|
|
118
|
+
requirements:
|
|
119
|
+
- - ! '>='
|
|
120
|
+
- !ruby/object:Gem::Version
|
|
121
|
+
version: '0'
|
|
136
122
|
requirements: []
|
|
137
|
-
|
|
138
123
|
rubyforge_project:
|
|
139
|
-
rubygems_version: 1.
|
|
124
|
+
rubygems_version: 1.8.23
|
|
140
125
|
signing_key:
|
|
141
126
|
specification_version: 3
|
|
142
127
|
summary: Character encoding detection, brought to you by ICU
|
|
143
|
-
test_files:
|
|
144
|
-
- spec/converter_spec.rb
|
|
145
|
-
- spec/encoding_detector_spec.rb
|
|
146
|
-
- spec/fixtures/AnsiGraph.psm1
|
|
147
|
-
- spec/fixtures/TwigExtensionsDate.es.yml
|
|
148
|
-
- spec/fixtures/cl-messagepack.lisp
|
|
149
|
-
- spec/fixtures/core.rkt
|
|
150
|
-
- spec/fixtures/hello_world
|
|
151
|
-
- spec/fixtures/laholator.py
|
|
152
|
-
- spec/fixtures/repl2.cljs
|
|
153
|
-
- spec/spec_helper.rb
|
|
154
|
-
- spec/string_method_spec.rb
|
|
128
|
+
test_files: []
|
data/.rspec
DELETED