charlock_holmes 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/charlock_holmes/encoding_detector.c +13 -1
- data/lib/charlock_holmes/encoding_detector.rb +31 -0
- data/lib/charlock_holmes/string.rb +1 -1
- data/lib/charlock_holmes/version.rb +1 -1
- data/test/encoding_detector_test.rb +12 -2
- data/test/fixtures/ISO-2022-KR.txt +43 -0
- data/test/helper.rb +6 -1
- data/test/string_methods_test.rb +23 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 53abf00f6c72c2ac1339b3f856011bed111b9ad4
|
4
|
+
data.tar.gz: c17048fa5ddf8c5c37f1378653e0bdcea6849e7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97cf5d2bca0e320e1eb54efd77bd4efed5add1621a4c9dcd62e6ec0c1b0b0834a30a5247341308974683e93a064044c2fb524b562d88dd26ead19694bf2121
|
7
|
+
data.tar.gz: 57ac5e9d12ae54f65387ef81f45afde88a8d4988f38f2647e2fbc3c88fe00fe094361f8eb26120f7bacea21024f73a74118bfc06a762d7fcc47caecfd5edc372
|
@@ -15,6 +15,9 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
15
15
|
const char *mlang;
|
16
16
|
int mconfidence;
|
17
17
|
VALUE rb_match;
|
18
|
+
VALUE enc_tbl;
|
19
|
+
VALUE enc_name;
|
20
|
+
VALUE compat_enc;
|
18
21
|
|
19
22
|
if (!match)
|
20
23
|
return Qnil;
|
@@ -26,7 +29,16 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
26
29
|
rb_match = rb_hash_new();
|
27
30
|
|
28
31
|
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
|
29
|
-
|
32
|
+
|
33
|
+
enc_name = charlock_new_str2(mname);
|
34
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
|
35
|
+
|
36
|
+
enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
|
37
|
+
compat_enc = rb_hash_aref(enc_tbl, enc_name);
|
38
|
+
if (!NIL_P(compat_enc)) {
|
39
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
|
40
|
+
}
|
41
|
+
|
30
42
|
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
|
31
43
|
|
32
44
|
if (mlang && mlang[0])
|
@@ -41,5 +41,36 @@ module CharlockHolmes
|
|
41
41
|
def self.detect_all(str, hint_enc=nil)
|
42
42
|
new.detect_all(str, hint_enc)
|
43
43
|
end
|
44
|
+
|
45
|
+
# A mapping table of supported encoding names from EncodingDetector
|
46
|
+
# which point to the corresponding supported encoding name in Ruby.
|
47
|
+
# Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "ASCII-8BIT"}
|
48
|
+
#
|
49
|
+
# Note that encodings that can't be mapped between Charlock and Ruby will resolve
|
50
|
+
# to "ASCII-8BIT".
|
51
|
+
@encoding_table = {}
|
52
|
+
|
53
|
+
def self.encoding_table
|
54
|
+
@encoding_table
|
55
|
+
end
|
56
|
+
|
57
|
+
BINARY = 'binary'
|
58
|
+
|
59
|
+
# Builds the ENCODING_TABLE hash by running through the list of supported encodings
|
60
|
+
# in the ICU detection API and trying to map them to supported encodings in Ruby.
|
61
|
+
# This is built dynamically so as to take advantage of ICU upgrades which may have
|
62
|
+
# support for more encodings in the future.
|
63
|
+
#
|
64
|
+
# Returns nothing.
|
65
|
+
def self.build_encoding_table
|
66
|
+
supported_encodings.each do |name|
|
67
|
+
@encoding_table[name] = begin
|
68
|
+
::Encoding.find(name).name
|
69
|
+
rescue ArgumentError
|
70
|
+
BINARY
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
build_encoding_table
|
44
75
|
end
|
45
76
|
end
|
@@ -26,7 +26,7 @@ class String
|
|
26
26
|
# Returns: self
|
27
27
|
def detect_encoding!(hint_enc=nil)
|
28
28
|
if detected = self.detect_encoding(hint_enc)
|
29
|
-
self.force_encoding(detected[:encoding]) if detected[:encoding]
|
29
|
+
self.force_encoding(detected[:ruby_encoding]) if detected[:ruby_encoding]
|
30
30
|
end
|
31
31
|
self
|
32
32
|
end
|
@@ -89,6 +89,17 @@ class EncodingDetectorTest < MiniTest::Test
|
|
89
89
|
assert supported_encodings.include? 'UTF-8'
|
90
90
|
end
|
91
91
|
|
92
|
+
def test_returns_a_ruby_compatible_encoding_name
|
93
|
+
detected = @detector.detect 'test'
|
94
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
95
|
+
assert_equal 'ISO-8859-1', detected[:ruby_encoding]
|
96
|
+
|
97
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
98
|
+
detected = @detector.detect not_compat_txt
|
99
|
+
assert_equal 'ISO-2022-KR', detected[:encoding]
|
100
|
+
assert_equal 'binary', detected[:ruby_encoding]
|
101
|
+
end
|
102
|
+
|
92
103
|
MAPPING = [
|
93
104
|
['repl2.cljs', 'ISO-8859-1', :text],
|
94
105
|
['cl-messagepack.lisp', 'ISO-8859-1', :text],
|
@@ -114,8 +125,7 @@ class EncodingDetectorTest < MiniTest::Test
|
|
114
125
|
MAPPING.each do |mapping|
|
115
126
|
file, encoding, type = mapping
|
116
127
|
|
117
|
-
|
118
|
-
content = File.read path
|
128
|
+
content = fixture(file).read
|
119
129
|
guessed = @detector.detect content
|
120
130
|
|
121
131
|
assert_equal encoding, guessed[:encoding]
|
@@ -0,0 +1,43 @@
|
|
1
|
+
$)C#
|
2
|
+
# Out-AnsiGraph.psm1
|
3
|
+
# Author: xcud
|
4
|
+
# History:
|
5
|
+
# v0.1 September 21, 2009 initial version
|
6
|
+
#
|
7
|
+
# PS Example> ps | select -first 5 | sort -property VM |
|
8
|
+
# Out-AnsiGraph ProcessName, VM
|
9
|
+
# AEADISRV 14508032
|
10
|
+
# audiodg 50757632
|
11
|
+
# conhost 73740288
|
12
|
+
# AppleMobileDeviceService 92061696
|
13
|
+
# btdna 126443520
|
14
|
+
#
|
15
|
+
function Out-AnsiGraph($Parameter1=$null) {
|
16
|
+
BEGIN {
|
17
|
+
$q = new-object Collections.queue
|
18
|
+
$max = 0; $namewidth = 0;
|
19
|
+
}
|
20
|
+
|
21
|
+
PROCESS {
|
22
|
+
if($_) {
|
23
|
+
$name = $_.($Parameter1[0]);
|
24
|
+
$val = $_.($Parameter1[1])
|
25
|
+
if($max -lt $val) { $max = $val}
|
26
|
+
if($namewidth -lt $name.length) {
|
27
|
+
$namewidth = $name.length }
|
28
|
+
$q.enqueue(@($name, $val))
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
END {
|
33
|
+
$q | %{
|
34
|
+
$graph = ""; 0..($_[1]/$max*20) |
|
35
|
+
%{ $graph += "" }
|
36
|
+
$name = "{0,$namewidth}" -f $_[0]
|
37
|
+
"$name $graph " + $_[1]
|
38
|
+
}
|
39
|
+
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
Export-ModuleMember Out-AnsiGraph
|
data/test/helper.rb
CHANGED
@@ -16,6 +16,11 @@ else
|
|
16
16
|
Minitest::Test = MiniTest::Unit::TestCase
|
17
17
|
end
|
18
18
|
|
19
|
+
def fixture(name)
|
20
|
+
path = File.expand_path "../fixtures/#{name}", __FILE__
|
21
|
+
File.new path
|
22
|
+
end
|
23
|
+
|
19
24
|
# put lib and test dirs directly on load path
|
20
25
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
21
|
-
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
26
|
+
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
data/test/string_methods_test.rb
CHANGED
@@ -40,7 +40,18 @@ class StringMethodsTest < MiniTest::Test
|
|
40
40
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
def test_returns_a_ruby_compatible_encoding_name
|
44
|
+
detected = 'test'.detect_encoding
|
45
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
46
|
+
assert_equal 'ISO-8859-1', detected[:ruby_encoding]
|
47
|
+
|
48
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
49
|
+
detected = not_compat_txt.detect_encoding
|
50
|
+
assert_equal 'ISO-2022-KR', detected[:encoding]
|
51
|
+
assert_equal 'binary', detected[:ruby_encoding]
|
52
|
+
end
|
53
|
+
|
54
|
+
if "".respond_to? :force_encoding
|
44
55
|
def test_adds_detect_encoding_bang_method
|
45
56
|
str = 'test'
|
46
57
|
str.respond_to? :detect_encoding!
|
@@ -48,5 +59,15 @@ class StringMethodsTest < MiniTest::Test
|
|
48
59
|
str.detect_encoding!
|
49
60
|
assert_equal Encoding.find('ISO-8859-1'), str.encoding
|
50
61
|
end
|
62
|
+
|
63
|
+
def test_sets_a_ruby_compatible_encoding_name
|
64
|
+
str1 = 'test'
|
65
|
+
str1.detect_encoding!
|
66
|
+
assert_equal 'ISO-8859-1', str1.encoding.name
|
67
|
+
|
68
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
69
|
+
not_compat_txt.detect_encoding!
|
70
|
+
assert_equal 'ASCII-8BIT', not_compat_txt.encoding.name
|
71
|
+
end
|
51
72
|
end
|
52
|
-
end
|
73
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Lopez
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake-compiler
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- test/converter_test.rb
|
83
83
|
- test/encoding_detector_test.rb
|
84
84
|
- test/fixtures/AnsiGraph.psm1
|
85
|
+
- test/fixtures/ISO-2022-KR.txt
|
85
86
|
- test/fixtures/TwigExtensionsDate.es.yml
|
86
87
|
- test/fixtures/cl-messagepack.lisp
|
87
88
|
- test/fixtures/core.rkt
|