charlock_holmes 0.6.9.1 → 0.6.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +3 -11
- data/Rakefile +5 -18
- data/charlock_holmes.gemspec +1 -1
- data/ext/charlock_holmes/encoding_detector.c +1 -5
- data/ext/charlock_holmes/ext.c +2 -0
- data/ext/charlock_holmes/extconf.rb +1 -0
- data/ext/charlock_holmes/transliterator.cpp +86 -0
- data/lib/charlock_holmes/version.rb +1 -1
- data/{spec/converter_spec.rb → test/converter_test.rb} +8 -7
- data/{spec/encoding_detector_spec.rb → test/encoding_detector_test.rb} +32 -35
- data/{spec → test}/fixtures/AnsiGraph.psm1 +0 -0
- data/{spec → test}/fixtures/TwigExtensionsDate.es.yml +0 -0
- data/{spec → test}/fixtures/cl-messagepack.lisp +0 -0
- data/{spec → test}/fixtures/core.rkt +0 -0
- data/{spec → test}/fixtures/hello_world +0 -0
- data/{spec → test}/fixtures/laholator.py +0 -0
- data/{spec → test}/fixtures/repl2.cljs +0 -0
- data/test/helper.rb +14 -0
- data/{spec/string_method_spec.rb → test/string_methods_test.rb} +7 -7
- data/test/transliterator_test.rb +119 -0
- metadata +75 -101
- data/.rspec +0 -3
- data/spec/spec_helper.rb +0 -9
data/Gemfile.lock
CHANGED
@@ -1,24 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
charlock_holmes (0.6.9.1)
|
4
|
+
charlock_holmes (0.6.9.2)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
8
8
|
specs:
|
9
9
|
chardet (0.9.0)
|
10
|
-
|
10
|
+
minitest (4.6.2)
|
11
11
|
rake (0.9.2)
|
12
12
|
rake-compiler (0.7.9)
|
13
13
|
rake
|
14
|
-
rspec (2.6.0)
|
15
|
-
rspec-core (~> 2.6.0)
|
16
|
-
rspec-expectations (~> 2.6.0)
|
17
|
-
rspec-mocks (~> 2.6.0)
|
18
|
-
rspec-core (2.6.4)
|
19
|
-
rspec-expectations (2.6.0)
|
20
|
-
diff-lcs (~> 1.1.2)
|
21
|
-
rspec-mocks (2.6.0)
|
22
14
|
|
23
15
|
PLATFORMS
|
24
16
|
ruby
|
@@ -26,5 +18,5 @@ PLATFORMS
|
|
26
18
|
DEPENDENCIES
|
27
19
|
chardet
|
28
20
|
charlock_holmes!
|
21
|
+
minitest
|
29
22
|
rake-compiler (>= 0.7.5)
|
30
|
-
rspec (>= 2.0.0)
|
data/Rakefile
CHANGED
@@ -1,23 +1,10 @@
|
|
1
|
-
|
2
|
-
begin
|
3
|
-
require 'rspec'
|
4
|
-
require 'rspec/core/rake_task'
|
1
|
+
require 'rake/testtask'
|
5
2
|
|
6
|
-
|
7
|
-
|
8
|
-
t.rcov = true
|
9
|
-
end
|
10
|
-
RSpec::Core::RakeTask.new 'spec' do |t|
|
11
|
-
t.verbose = true
|
12
|
-
end
|
13
|
-
|
14
|
-
task :default => :spec
|
15
|
-
rescue LoadError
|
16
|
-
puts "rspec, or one of its dependencies, is not available. Install it with: sudo gem install rspec"
|
3
|
+
Rake::TestTask.new do |t|
|
4
|
+
t.pattern = "test/**/*_test.rb"
|
17
5
|
end
|
18
6
|
|
19
|
-
|
20
|
-
require 'rake' unless defined? Rake
|
7
|
+
task :default => :test
|
21
8
|
|
22
9
|
gem 'rake-compiler', '>= 0.7.5'
|
23
10
|
require "rake/extensiontask"
|
@@ -26,4 +13,4 @@ Rake::ExtensionTask.new 'charlock_holmes' do |ext|
|
|
26
13
|
ext.lib_dir = File.join 'lib', 'charlock_holmes'
|
27
14
|
end
|
28
15
|
|
29
|
-
Rake::Task[:spec].prerequisites << :compile
|
16
|
+
Rake::Task[:test].prerequisites << :compile
|
data/charlock_holmes.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
|
20
20
|
# tests
|
21
21
|
s.add_development_dependency 'rake-compiler', ">= 0.7.5"
|
22
|
-
s.add_development_dependency 'rspec', ">= 2.0.0"
|
22
|
+
s.add_development_dependency 'minitest'
|
23
23
|
# benchmarks
|
24
24
|
s.add_development_dependency 'chardet'
|
25
25
|
end
|
@@ -274,15 +274,11 @@ static VALUE rb_encdec__alloc(VALUE klass)
|
|
274
274
|
rb_raise(rb_eStandardError, "%s", u_errorName(status));
|
275
275
|
}
|
276
276
|
|
277
|
-
detector->magic = magic_open(
|
277
|
+
detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
|
278
278
|
if (detector->magic == NULL) {
|
279
279
|
rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
|
280
280
|
}
|
281
281
|
|
282
|
-
// load the libmagic database
|
283
|
-
// NULL means use the default or whatever is specified by the MAGIC env var
|
284
|
-
magic_load(detector->magic, NULL);
|
285
|
-
|
286
282
|
return obj;
|
287
283
|
}
|
288
284
|
|
data/ext/charlock_holmes/ext.c
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
extern void _init_charlock_encoding_detector();
|
4
4
|
extern void _init_charlock_converter();
|
5
|
+
extern void _init_charlock_transliterator();
|
5
6
|
|
6
7
|
VALUE rb_mCharlockHolmes;
|
7
8
|
|
@@ -10,4 +11,5 @@ void Init_charlock_holmes() {
|
|
10
11
|
|
11
12
|
_init_charlock_encoding_detector();
|
12
13
|
_init_charlock_converter();
|
14
|
+
_init_charlock_transliterator();
|
13
15
|
}
|
@@ -58,6 +58,7 @@ Dir.chdir("#{CWD}/src") do
|
|
58
58
|
sys("tar zxvf #{src}")
|
59
59
|
Dir.chdir(dir) do
|
60
60
|
sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
|
61
|
+
sys("patch -p0 < ../file-soft-check.patch")
|
61
62
|
sys("make -C src install")
|
62
63
|
sys("make -C magic install")
|
63
64
|
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
#undef UChar
|
3
|
+
|
4
|
+
#include <unicode/translit.h>
|
5
|
+
|
6
|
+
extern "C" {
|
7
|
+
|
8
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
9
|
+
#include <ruby/encoding.h>
|
10
|
+
static VALUE rb_eEncodingCompatibilityError;
|
11
|
+
|
12
|
+
static void check_utf8_encoding(VALUE str) {
|
13
|
+
static rb_encoding *_cached[3] = {NULL, NULL, NULL};
|
14
|
+
rb_encoding *enc;
|
15
|
+
|
16
|
+
if (_cached[0] == NULL) {
|
17
|
+
_cached[0] = rb_utf8_encoding();
|
18
|
+
_cached[1] = rb_usascii_encoding();
|
19
|
+
_cached[2] = rb_ascii8bit_encoding();
|
20
|
+
}
|
21
|
+
|
22
|
+
enc = rb_enc_get(str);
|
23
|
+
if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) {
|
24
|
+
rb_raise(rb_eEncodingCompatibilityError,
|
25
|
+
"Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc));
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
#else
|
30
|
+
static void check_utf8_encoding(VALUE str) {}
|
31
|
+
#endif
|
32
|
+
|
33
|
+
extern VALUE rb_mCharlockHolmes;
|
34
|
+
static VALUE rb_cTransliterator;
|
35
|
+
|
36
|
+
static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
|
37
|
+
UErrorCode status = U_ZERO_ERROR;
|
38
|
+
UParseError p_error;
|
39
|
+
Transliterator *trans;
|
40
|
+
const char *txt;
|
41
|
+
size_t txt_len;
|
42
|
+
const char *id;
|
43
|
+
size_t id_len;
|
44
|
+
UnicodeString *u_txt;
|
45
|
+
std::string result;
|
46
|
+
VALUE rb_out;
|
47
|
+
|
48
|
+
Check_Type(rb_txt, T_STRING);
|
49
|
+
Check_Type(rb_id, T_STRING);
|
50
|
+
|
51
|
+
check_utf8_encoding(rb_txt);
|
52
|
+
check_utf8_encoding(rb_id);
|
53
|
+
|
54
|
+
txt = RSTRING_PTR(rb_txt);
|
55
|
+
txt_len = RSTRING_LEN(rb_txt);
|
56
|
+
id = RSTRING_PTR(rb_id);
|
57
|
+
id_len = RSTRING_LEN(rb_id);
|
58
|
+
|
59
|
+
trans = Transliterator::createInstance(UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
|
60
|
+
if(!U_SUCCESS(status)) {
|
61
|
+
rb_raise(rb_eArgError, "%s", u_errorName(status));
|
62
|
+
}
|
63
|
+
|
64
|
+
u_txt = new UnicodeString(txt, txt_len);
|
65
|
+
trans->transliterate(*u_txt);
|
66
|
+
result = u_txt->toUTF8String(result);
|
67
|
+
|
68
|
+
delete u_txt;
|
69
|
+
delete trans;
|
70
|
+
|
71
|
+
rb_out = charlock_new_str(result.data(), result.length());
|
72
|
+
|
73
|
+
return rb_out;
|
74
|
+
}
|
75
|
+
|
76
|
+
void _init_charlock_transliterator() {
|
77
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
78
|
+
rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
|
79
|
+
#endif
|
80
|
+
|
81
|
+
rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
|
82
|
+
|
83
|
+
rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
|
84
|
+
}
|
85
|
+
|
86
|
+
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
describe CharlockHolmes::Converter do
|
6
|
-
test 'is able to convert regular ascii content from ISO-8859-1 to UTF-16, and back again' do
|
4
|
+
class ConverterTest < MiniTest::Unit::TestCase
|
5
|
+
def test_convert_ascii_from_iso859_1_to_utf16_and_back
|
7
6
|
input = 'test'
|
8
7
|
|
9
8
|
output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
|
@@ -15,7 +14,7 @@ describe CharlockHolmes::Converter do
|
|
15
14
|
assert input == output
|
16
15
|
end
|
17
16
|
|
18
|
-
|
17
|
+
def test_convert_utf8_to_utf16_and_back
|
19
18
|
input = 'λ, λ, λ'
|
20
19
|
|
21
20
|
output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
|
@@ -27,7 +26,7 @@ describe CharlockHolmes::Converter do
|
|
27
26
|
assert input == output
|
28
27
|
end
|
29
28
|
|
30
|
-
|
29
|
+
def test_params_must_be_strings
|
31
30
|
assert_raises TypeError do
|
32
31
|
CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16'
|
33
32
|
end
|
@@ -40,8 +39,10 @@ describe CharlockHolmes::Converter do
|
|
40
39
|
CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil
|
41
40
|
end
|
42
41
|
|
43
|
-
|
42
|
+
begin
|
44
43
|
CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16'
|
44
|
+
rescue Exception => e
|
45
|
+
assert_nil e, "#{e.class.name} raised, expected nothing"
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
@@ -1,25 +1,24 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
describe CharlockHolmes::EncodingDetector do
|
6
|
-
before :all do
|
4
|
+
class EncodingDetectorTest < MiniTest::Unit::TestCase
|
5
|
+
def setup
|
7
6
|
@detector = CharlockHolmes::EncodingDetector.new
|
8
7
|
end
|
9
8
|
|
10
|
-
|
9
|
+
def test_has_class_level_detect_method
|
11
10
|
CharlockHolmes::EncodingDetector.respond_to? :detect
|
12
11
|
detected = CharlockHolmes::EncodingDetector.detect 'test'
|
13
12
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
14
13
|
end
|
15
14
|
|
16
|
-
|
15
|
+
def test_class_level_detect_accepts_encoding_hint
|
17
16
|
CharlockHolmes::EncodingDetector.respond_to? :detect
|
18
17
|
detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
|
19
18
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
20
19
|
end
|
21
20
|
|
22
|
-
|
21
|
+
def test_has_class_level_detect_all_method
|
23
22
|
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
24
23
|
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
|
25
24
|
assert detected_list.is_a? Array
|
@@ -28,7 +27,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
28
27
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
29
28
|
end
|
30
29
|
|
31
|
-
|
30
|
+
def test_class_level_detect_all_method_accepts_encoding_hint
|
32
31
|
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
33
32
|
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
|
34
33
|
assert detected_list.is_a? Array
|
@@ -37,19 +36,19 @@ describe CharlockHolmes::EncodingDetector do
|
|
37
36
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
38
37
|
end
|
39
38
|
|
40
|
-
|
39
|
+
def test_has_detect_method
|
41
40
|
@detector.respond_to? :detect
|
42
41
|
detected = @detector.detect 'test'
|
43
42
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
44
43
|
end
|
45
44
|
|
46
|
-
|
45
|
+
def test_detect_accepts_encoding_hint
|
47
46
|
@detector.respond_to? :detect
|
48
47
|
detected = @detector.detect 'test', 'UTF-8'
|
49
48
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
50
49
|
end
|
51
50
|
|
52
|
-
|
51
|
+
def test_has_detect_all_method
|
53
52
|
@detector.respond_to? :detect_all
|
54
53
|
detected_list = @detector.detect_all 'test'
|
55
54
|
assert detected_list.is_a? Array
|
@@ -58,7 +57,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
58
57
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
59
58
|
end
|
60
59
|
|
61
|
-
|
60
|
+
def test_detect_all_accepts_encoding_hint
|
62
61
|
@detector.respond_to? :detect_all
|
63
62
|
detected_list = @detector.detect_all 'test', 'UTF-8'
|
64
63
|
assert detected_list.is_a? Array
|
@@ -67,7 +66,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
67
66
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
68
67
|
end
|
69
68
|
|
70
|
-
|
69
|
+
def test_strip_tags_flag
|
71
70
|
detector = CharlockHolmes::EncodingDetector.new
|
72
71
|
detector.strip_tags = true
|
73
72
|
assert detector.strip_tags
|
@@ -82,7 +81,7 @@ describe CharlockHolmes::EncodingDetector do
|
|
82
81
|
assert_equal 'UTF-8', detection[:encoding]
|
83
82
|
end
|
84
83
|
|
85
|
-
|
84
|
+
def test_has_list_of_supported_encodings
|
86
85
|
CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
|
87
86
|
supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
|
88
87
|
|
@@ -90,32 +89,30 @@ describe CharlockHolmes::EncodingDetector do
|
|
90
89
|
assert supported_encodings.include? 'UTF-8'
|
91
90
|
end
|
92
91
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
92
|
+
MAPPING = [
|
93
|
+
['repl2.cljs', 'ISO-8859-1', :text],
|
94
|
+
['core.rkt', 'UTF-8', :text],
|
95
|
+
['cl-messagepack.lisp', 'ISO-8859-1', :text],
|
96
|
+
['TwigExtensionsDate.es.yml', 'UTF-8', :text],
|
97
|
+
['AnsiGraph.psm1', 'UTF-16LE', :text],
|
98
|
+
['laholator.py', 'UTF-8', :text],
|
99
|
+
['hello_world', nil, :binary]
|
100
|
+
]
|
101
|
+
|
102
|
+
def test_detection_works_as_expected
|
104
103
|
MAPPING.each do |mapping|
|
105
104
|
file, encoding, type = mapping
|
106
105
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
guessed = @detector.detect content
|
106
|
+
path = File.expand_path "../fixtures/#{file}", __FILE__
|
107
|
+
content = File.read path
|
108
|
+
guessed = @detector.detect content
|
111
109
|
|
112
|
-
|
113
|
-
|
110
|
+
assert_equal encoding, guessed[:encoding]
|
111
|
+
assert_equal type, guessed[:type]
|
114
112
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
113
|
+
if content.respond_to?(:force_encoding) && guessed[:type] == :text
|
114
|
+
content.force_encoding guessed[:encoding]
|
115
|
+
assert content.valid_encoding?
|
119
116
|
end
|
120
117
|
end
|
121
118
|
end
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/test/helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Basic test environment.
|
2
|
+
|
3
|
+
# blah fuck this
|
4
|
+
require 'rubygems' if !defined?(Gem)
|
5
|
+
require 'bundler/setup'
|
6
|
+
|
7
|
+
require 'charlock_holmes'
|
8
|
+
|
9
|
+
# bring in minitest
|
10
|
+
require 'minitest/autorun'
|
11
|
+
|
12
|
+
# put lib and test dirs directly on load path
|
13
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
14
|
+
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
@@ -1,8 +1,8 @@
|
|
1
|
-
require
|
1
|
+
require File.expand_path("../helper", __FILE__)
|
2
2
|
require 'charlock_holmes/string'
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
class StringMethodsTest < MiniTest::Unit::TestCase
|
5
|
+
def test_adds_detect_encoding_method
|
6
6
|
str = 'test'
|
7
7
|
str.respond_to? :detect_encoding
|
8
8
|
|
@@ -10,7 +10,7 @@ describe String do
|
|
10
10
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
11
11
|
end
|
12
12
|
|
13
|
-
|
13
|
+
def test_detect_encoding_accepts_encoding_hint_param
|
14
14
|
str = 'test'
|
15
15
|
str.respond_to? :detect_encoding
|
16
16
|
|
@@ -18,7 +18,7 @@ describe String do
|
|
18
18
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
def test_adds_detect_encodings_method
|
22
22
|
str = 'test'
|
23
23
|
str.respond_to? :detect_encodings
|
24
24
|
|
@@ -29,7 +29,7 @@ describe String do
|
|
29
29
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
30
30
|
end
|
31
31
|
|
32
|
-
|
32
|
+
def test_detect_encodings_accepts_encoding_hint_param
|
33
33
|
str = 'test'
|
34
34
|
str.respond_to? :detect_encodings
|
35
35
|
|
@@ -41,7 +41,7 @@ describe String do
|
|
41
41
|
end
|
42
42
|
|
43
43
|
if RUBY_VERSION =~ /1.9/
|
44
|
-
|
44
|
+
def test_adds_detect_encoding_bang_method
|
45
45
|
str = 'test'
|
46
46
|
str.respond_to? :detect_encoding!
|
47
47
|
|
@@ -0,0 +1,119 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path("../helper", __FILE__)
|
3
|
+
|
4
|
+
class TransliteratorTest < MiniTest::Unit::TestCase
|
5
|
+
DONT_CONVERT = [
|
6
|
+
"Vitrum edere possum; mihi non nocet.", # Latin
|
7
|
+
"Je puis mangier del voirre. Ne me nuit.", # Old French
|
8
|
+
"Kristala jan dezaket, ez dit minik ematen.", # Basque
|
9
|
+
"Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog
|
10
|
+
"Ich kann Glas essen, ohne mir weh zu tun.", # German
|
11
|
+
"I can eat glass and it doesn't hurt me.", # English
|
12
|
+
]
|
13
|
+
|
14
|
+
CONVERT_PAIRS = {
|
15
|
+
"Je peux manger du verre, ça ne me fait pas de mal." => # French
|
16
|
+
"Je peux manger du verre, ca ne me fait pas de mal.",
|
17
|
+
"Pot să mănânc sticlă și ea nu mă rănește." => # Romanian
|
18
|
+
"Pot sa mananc sticla si ea nu ma raneste.",
|
19
|
+
"Ég get etið gler án þess að meiða mig." => # Icelandic
|
20
|
+
"Eg get etid gler an thess ad meida mig.",
|
21
|
+
"Unë mund të ha qelq dhe nuk më gjen gjë." => # Albanian
|
22
|
+
"Une mund te ha qelq dhe nuk me gjen gje.",
|
23
|
+
"Mogę jeść szkło i mi nie szkodzi." => # Polish
|
24
|
+
"Moge jesc szklo i mi nie szkodzi.",
|
25
|
+
# "Я могу есть стекло, оно мне не вредит." => # Russian
|
26
|
+
# "Ia moghu iest' stieklo, ono mnie nie vriedit.",
|
27
|
+
# "Мога да ям стъкло, то не ми вреди." => # Bulgarian
|
28
|
+
# "Mogha da iam stklo, to nie mi vriedi.",
|
29
|
+
# "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => # Anglo-Saxon
|
30
|
+
# "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:",
|
31
|
+
# "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => # Classical Greek
|
32
|
+
# "ualon phagein dunamai; touto ou me blaptei",
|
33
|
+
# "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => # Hindi
|
34
|
+
# "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii",
|
35
|
+
# "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian
|
36
|
+
# "mn my twnm bdwni Hss drd shyshh bkhwrm",
|
37
|
+
# "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic
|
38
|
+
# "'n qdr 'l~ 'kl lzjj w hdh l yw'lmn",
|
39
|
+
# "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew
|
40
|
+
# "ny ykvl lkvl zkvkyt vzh l mzyq ly",
|
41
|
+
# "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => # Thai
|
42
|
+
# "chankinkracchkaid aetmanaimthamaihchanecchb",
|
43
|
+
# "我能吞下玻璃而不伤身体。" => # Chinese
|
44
|
+
# "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ",
|
45
|
+
# "私はガラスを食べられます。それは私を傷つけません。" => # Japanese
|
46
|
+
# "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ",
|
47
|
+
# "⠋⠗⠁⠝⠉⠑" => # Braille
|
48
|
+
# "france",
|
49
|
+
"Schloß - Assunção - Łódź" =>
|
50
|
+
"Schloss - Assuncao - Lodz",
|
51
|
+
"TÜM GOLLER Fb 4-1 Bursa Maç Özeti Íƶle" =>
|
52
|
+
"TUM GOLLER Fb 4-1 Bursa Mac Ozeti Izle",
|
53
|
+
"ßßßßß" => "ssssssssss"
|
54
|
+
}
|
55
|
+
|
56
|
+
def test_transliterate
|
57
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
|
58
|
+
|
59
|
+
DONT_CONVERT.each do |subject|
|
60
|
+
assert_equal subject, trans(subject, trans_id)
|
61
|
+
end
|
62
|
+
|
63
|
+
CONVERT_PAIRS.each do |before, after|
|
64
|
+
assert_equal after, trans(before, trans_id)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
if "".respond_to? :force_encoding
|
69
|
+
def test_transliterate_id_must_be_utf8_or_ascii
|
70
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC".force_encoding('big5')
|
71
|
+
txt = "blah blah blah"
|
72
|
+
|
73
|
+
assert_raises Encoding::CompatibilityError do
|
74
|
+
trans(txt, trans_id)
|
75
|
+
end
|
76
|
+
|
77
|
+
trans_id.force_encoding('UTF-8')
|
78
|
+
begin
|
79
|
+
trans(txt, trans_id)
|
80
|
+
rescue Encoding::CompatibilityError => e
|
81
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
82
|
+
end
|
83
|
+
|
84
|
+
trans_id.force_encoding('US-ASCII')
|
85
|
+
begin
|
86
|
+
trans(txt, trans_id)
|
87
|
+
rescue Encoding::CompatibilityError => e
|
88
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_transliterate_text_must_be_utf8_or_ascii
|
93
|
+
trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC"
|
94
|
+
txt = "blah blah blah".force_encoding('big5')
|
95
|
+
|
96
|
+
assert_raises Encoding::CompatibilityError do
|
97
|
+
trans(txt, trans_id)
|
98
|
+
end
|
99
|
+
|
100
|
+
txt.force_encoding('UTF-8')
|
101
|
+
begin
|
102
|
+
trans(txt, trans_id)
|
103
|
+
rescue Encoding::CompatibilityError => e
|
104
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
105
|
+
end
|
106
|
+
|
107
|
+
txt.force_encoding('US-ASCII')
|
108
|
+
begin
|
109
|
+
trans(txt, trans_id)
|
110
|
+
rescue Encoding::CompatibilityError => e
|
111
|
+
assert_nil e, "#{e.class.name} raised, expected not to"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
def trans(text, id)
|
117
|
+
CharlockHolmes::Transliterator.transliterate(text, id)
|
118
|
+
end
|
119
|
+
end
|
metadata
CHANGED
@@ -1,82 +1,73 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.6.9.2
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 6
|
9
|
-
- 9
|
10
|
-
- 1
|
11
|
-
version: 0.6.9.1
|
12
6
|
platform: ruby
|
13
|
-
authors:
|
7
|
+
authors:
|
14
8
|
- Brian Lopez
|
15
|
-
-
|
9
|
+
- Vicent Martí
|
16
10
|
autorequire:
|
17
11
|
bindir: bin
|
18
12
|
cert_chain: []
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
dependencies:
|
23
|
-
- !ruby/object:Gem::Dependency
|
13
|
+
date: 2013-03-20 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
24
16
|
name: rake-compiler
|
25
|
-
|
26
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
27
18
|
none: false
|
28
|
-
requirements:
|
29
|
-
- -
|
30
|
-
- !ruby/object:Gem::Version
|
31
|
-
hash: 9
|
32
|
-
segments:
|
33
|
-
- 0
|
34
|
-
- 7
|
35
|
-
- 5
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
36
22
|
version: 0.7.5
|
37
23
|
type: :development
|
38
|
-
version_requirements: *id001
|
39
|
-
- !ruby/object:Gem::Dependency
|
40
|
-
name: rspec
|
41
24
|
prerelease: false
|
42
|
-
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ! '>='
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: 0.7.5
|
31
|
+
- !ruby/object:Gem::Dependency
|
32
|
+
name: minitest
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
43
34
|
none: false
|
44
|
-
requirements:
|
45
|
-
- -
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
|
48
|
-
segments:
|
49
|
-
- 2
|
50
|
-
- 0
|
51
|
-
- 0
|
52
|
-
version: 2.0.0
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
53
39
|
type: :development
|
54
|
-
version_requirements: *id002
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: chardet
|
57
40
|
prerelease: false
|
58
|
-
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
59
42
|
none: false
|
60
|
-
requirements:
|
61
|
-
- -
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: chardet
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
67
55
|
type: :development
|
68
|
-
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
69
63
|
description:
|
70
64
|
email: seniorlopez@gmail.com
|
71
65
|
executables: []
|
72
|
-
|
73
|
-
extensions:
|
66
|
+
extensions:
|
74
67
|
- ext/charlock_holmes/extconf.rb
|
75
68
|
extra_rdoc_files: []
|
76
|
-
|
77
|
-
files:
|
69
|
+
files:
|
78
70
|
- .gitignore
|
79
|
-
- .rspec
|
80
71
|
- Gemfile
|
81
72
|
- Gemfile.lock
|
82
73
|
- MIT-LICENSE
|
@@ -91,64 +82,47 @@ files:
|
|
91
82
|
- ext/charlock_holmes/ext.c
|
92
83
|
- ext/charlock_holmes/extconf.rb
|
93
84
|
- ext/charlock_holmes/src/file-5.08.tar.gz
|
85
|
+
- ext/charlock_holmes/src/file-soft-check.patch
|
86
|
+
- ext/charlock_holmes/transliterator.cpp
|
94
87
|
- lib/charlock_holmes.rb
|
95
88
|
- lib/charlock_holmes/encoding_detector.rb
|
96
89
|
- lib/charlock_holmes/string.rb
|
97
90
|
- lib/charlock_holmes/version.rb
|
98
|
-
-
|
99
|
-
-
|
100
|
-
-
|
101
|
-
-
|
102
|
-
-
|
103
|
-
-
|
104
|
-
-
|
105
|
-
-
|
106
|
-
-
|
107
|
-
-
|
108
|
-
-
|
109
|
-
|
91
|
+
- test/converter_test.rb
|
92
|
+
- test/encoding_detector_test.rb
|
93
|
+
- test/fixtures/AnsiGraph.psm1
|
94
|
+
- test/fixtures/TwigExtensionsDate.es.yml
|
95
|
+
- test/fixtures/cl-messagepack.lisp
|
96
|
+
- test/fixtures/core.rkt
|
97
|
+
- test/fixtures/hello_world
|
98
|
+
- test/fixtures/laholator.py
|
99
|
+
- test/fixtures/repl2.cljs
|
100
|
+
- test/helper.rb
|
101
|
+
- test/string_methods_test.rb
|
102
|
+
- test/transliterator_test.rb
|
110
103
|
homepage: http://github.com/brianmario/charlock_holmes
|
111
104
|
licenses: []
|
112
|
-
|
113
105
|
post_install_message:
|
114
|
-
rdoc_options:
|
106
|
+
rdoc_options:
|
115
107
|
- --charset=UTF-8
|
116
|
-
require_paths:
|
108
|
+
require_paths:
|
117
109
|
- lib
|
118
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
111
|
none: false
|
120
|
-
requirements:
|
121
|
-
- -
|
122
|
-
- !ruby/object:Gem::Version
|
123
|
-
|
124
|
-
|
125
|
-
- 0
|
126
|
-
version: "0"
|
127
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ! '>='
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
128
117
|
none: false
|
129
|
-
requirements:
|
130
|
-
- -
|
131
|
-
- !ruby/object:Gem::Version
|
132
|
-
|
133
|
-
segments:
|
134
|
-
- 0
|
135
|
-
version: "0"
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
136
122
|
requirements: []
|
137
|
-
|
138
123
|
rubyforge_project:
|
139
|
-
rubygems_version: 1.
|
124
|
+
rubygems_version: 1.8.23
|
140
125
|
signing_key:
|
141
126
|
specification_version: 3
|
142
127
|
summary: Character encoding detection, brought to you by ICU
|
143
|
-
test_files:
|
144
|
-
- spec/converter_spec.rb
|
145
|
-
- spec/encoding_detector_spec.rb
|
146
|
-
- spec/fixtures/AnsiGraph.psm1
|
147
|
-
- spec/fixtures/TwigExtensionsDate.es.yml
|
148
|
-
- spec/fixtures/cl-messagepack.lisp
|
149
|
-
- spec/fixtures/core.rkt
|
150
|
-
- spec/fixtures/hello_world
|
151
|
-
- spec/fixtures/laholator.py
|
152
|
-
- spec/fixtures/repl2.cljs
|
153
|
-
- spec/spec_helper.rb
|
154
|
-
- spec/string_method_spec.rb
|
128
|
+
test_files: []
|
data/.rspec
DELETED