charlock_holmes 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/charlock_holmes/encoding_detector.c +13 -1
- data/lib/charlock_holmes/encoding_detector.rb +31 -0
- data/lib/charlock_holmes/string.rb +1 -1
- data/lib/charlock_holmes/version.rb +1 -1
- data/test/encoding_detector_test.rb +12 -2
- data/test/fixtures/ISO-2022-KR.txt +43 -0
- data/test/helper.rb +6 -1
- data/test/string_methods_test.rb +23 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 53abf00f6c72c2ac1339b3f856011bed111b9ad4
|
4
|
+
data.tar.gz: c17048fa5ddf8c5c37f1378653e0bdcea6849e7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97cf5d2bca0e320e1eb54efd77bd4efed5add1621a4c9dcd62e6ec0c1b0b0834a30a5247341308974683e93a064044c2fb524b562d88dd26ead19694bf2121
|
7
|
+
data.tar.gz: 57ac5e9d12ae54f65387ef81f45afde88a8d4988f38f2647e2fbc3c88fe00fe094361f8eb26120f7bacea21024f73a74118bfc06a762d7fcc47caecfd5edc372
|
@@ -15,6 +15,9 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
15
15
|
const char *mlang;
|
16
16
|
int mconfidence;
|
17
17
|
VALUE rb_match;
|
18
|
+
VALUE enc_tbl;
|
19
|
+
VALUE enc_name;
|
20
|
+
VALUE compat_enc;
|
18
21
|
|
19
22
|
if (!match)
|
20
23
|
return Qnil;
|
@@ -26,7 +29,16 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
26
29
|
rb_match = rb_hash_new();
|
27
30
|
|
28
31
|
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
|
29
|
-
|
32
|
+
|
33
|
+
enc_name = charlock_new_str2(mname);
|
34
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
|
35
|
+
|
36
|
+
enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
|
37
|
+
compat_enc = rb_hash_aref(enc_tbl, enc_name);
|
38
|
+
if (!NIL_P(compat_enc)) {
|
39
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
|
40
|
+
}
|
41
|
+
|
30
42
|
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
|
31
43
|
|
32
44
|
if (mlang && mlang[0])
|
@@ -41,5 +41,36 @@ module CharlockHolmes
|
|
41
41
|
def self.detect_all(str, hint_enc=nil)
|
42
42
|
new.detect_all(str, hint_enc)
|
43
43
|
end
|
44
|
+
|
45
|
+
# A mapping table of supported encoding names from EncodingDetector
|
46
|
+
# which point to the corresponding supported encoding name in Ruby.
|
47
|
+
# Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "binary"}
|
48
|
+
#
|
49
|
+
# Note that encodings that can't be mapped between Charlock and Ruby will resolve
|
50
|
+
# to "binary" (Ruby's alias for the ASCII-8BIT encoding).
|
51
|
+
@encoding_table = {}
|
52
|
+
|
53
|
+
def self.encoding_table
|
54
|
+
@encoding_table
|
55
|
+
end
|
56
|
+
|
57
|
+
BINARY = 'binary'
|
58
|
+
|
59
|
+
# Builds the @encoding_table hash by running through the list of supported encodings
|
60
|
+
# in the ICU detection API and trying to map them to supported encodings in Ruby.
|
61
|
+
# This is built dynamically so as to take advantage of ICU upgrades which may have
|
62
|
+
# support for more encodings in the future.
|
63
|
+
#
|
64
|
+
# Returns nothing.
|
65
|
+
def self.build_encoding_table
|
66
|
+
supported_encodings.each do |name|
|
67
|
+
@encoding_table[name] = begin
|
68
|
+
::Encoding.find(name).name
|
69
|
+
rescue ArgumentError
|
70
|
+
BINARY
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
build_encoding_table
|
44
75
|
end
|
45
76
|
end
|
@@ -26,7 +26,7 @@ class String
|
|
26
26
|
# Returns: self
|
27
27
|
def detect_encoding!(hint_enc=nil)
|
28
28
|
if detected = self.detect_encoding(hint_enc)
|
29
|
-
self.force_encoding(detected[:encoding]) if detected[:encoding]
|
29
|
+
self.force_encoding(detected[:ruby_encoding]) if detected[:ruby_encoding]
|
30
30
|
end
|
31
31
|
self
|
32
32
|
end
|
@@ -89,6 +89,17 @@ class EncodingDetectorTest < MiniTest::Test
|
|
89
89
|
assert supported_encodings.include? 'UTF-8'
|
90
90
|
end
|
91
91
|
|
92
|
+
def test_returns_a_ruby_compatible_encoding_name
|
93
|
+
detected = @detector.detect 'test'
|
94
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
95
|
+
assert_equal 'ISO-8859-1', detected[:ruby_encoding]
|
96
|
+
|
97
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
98
|
+
detected = @detector.detect not_compat_txt
|
99
|
+
assert_equal 'ISO-2022-KR', detected[:encoding]
|
100
|
+
assert_equal 'binary', detected[:ruby_encoding]
|
101
|
+
end
|
102
|
+
|
92
103
|
MAPPING = [
|
93
104
|
['repl2.cljs', 'ISO-8859-1', :text],
|
94
105
|
['cl-messagepack.lisp', 'ISO-8859-1', :text],
|
@@ -114,8 +125,7 @@ class EncodingDetectorTest < MiniTest::Test
|
|
114
125
|
MAPPING.each do |mapping|
|
115
126
|
file, encoding, type = mapping
|
116
127
|
|
117
|
-
|
118
|
-
content = File.read path
|
128
|
+
content = fixture(file).read
|
119
129
|
guessed = @detector.detect content
|
120
130
|
|
121
131
|
assert_equal encoding, guessed[:encoding]
|
@@ -0,0 +1,43 @@
|
|
1
|
+
$)C#
|
2
|
+
# Out-AnsiGraph.psm1
|
3
|
+
# Author: xcud
|
4
|
+
# History:
|
5
|
+
# v0.1 September 21, 2009 initial version
|
6
|
+
#
|
7
|
+
# PS Example> ps | select -first 5 | sort -property VM |
|
8
|
+
# Out-AnsiGraph ProcessName, VM
|
9
|
+
# AEADISRV 14508032
|
10
|
+
# audiodg 50757632
|
11
|
+
# conhost 73740288
|
12
|
+
# AppleMobileDeviceService 92061696
|
13
|
+
# btdna 126443520
|
14
|
+
#
|
15
|
+
function Out-AnsiGraph($Parameter1=$null) {
|
16
|
+
BEGIN {
|
17
|
+
$q = new-object Collections.queue
|
18
|
+
$max = 0; $namewidth = 0;
|
19
|
+
}
|
20
|
+
|
21
|
+
PROCESS {
|
22
|
+
if($_) {
|
23
|
+
$name = $_.($Parameter1[0]);
|
24
|
+
$val = $_.($Parameter1[1])
|
25
|
+
if($max -lt $val) { $max = $val}
|
26
|
+
if($namewidth -lt $name.length) {
|
27
|
+
$namewidth = $name.length }
|
28
|
+
$q.enqueue(@($name, $val))
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
END {
|
33
|
+
$q | %{
|
34
|
+
$graph = ""; 0..($_[1]/$max*20) |
|
35
|
+
%{ $graph += "" }
|
36
|
+
$name = "{0,$namewidth}" -f $_[0]
|
37
|
+
"$name $graph " + $_[1]
|
38
|
+
}
|
39
|
+
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
Export-ModuleMember Out-AnsiGraph
|
data/test/helper.rb
CHANGED
@@ -16,6 +16,11 @@ else
|
|
16
16
|
Minitest::Test = MiniTest::Unit::TestCase
|
17
17
|
end
|
18
18
|
|
19
|
+
def fixture(name)
|
20
|
+
path = File.expand_path "../fixtures/#{name}", __FILE__
|
21
|
+
File.new path
|
22
|
+
end
|
23
|
+
|
19
24
|
# put lib and test dirs directly on load path
|
20
25
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
21
|
-
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
26
|
+
$LOAD_PATH.unshift File.expand_path('..', __FILE__)
|
data/test/string_methods_test.rb
CHANGED
@@ -40,7 +40,18 @@ class StringMethodsTest < MiniTest::Test
|
|
40
40
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
def test_returns_a_ruby_compatible_encoding_name
|
44
|
+
detected = 'test'.detect_encoding
|
45
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
46
|
+
assert_equal 'ISO-8859-1', detected[:ruby_encoding]
|
47
|
+
|
48
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
49
|
+
detected = not_compat_txt.detect_encoding
|
50
|
+
assert_equal 'ISO-2022-KR', detected[:encoding]
|
51
|
+
assert_equal 'binary', detected[:ruby_encoding]
|
52
|
+
end
|
53
|
+
|
54
|
+
if "".respond_to? :force_encoding
|
44
55
|
def test_adds_detect_encoding_bang_method
|
45
56
|
str = 'test'
|
46
57
|
str.respond_to? :detect_encoding!
|
@@ -48,5 +59,15 @@ class StringMethodsTest < MiniTest::Test
|
|
48
59
|
str.detect_encoding!
|
49
60
|
assert_equal Encoding.find('ISO-8859-1'), str.encoding
|
50
61
|
end
|
62
|
+
|
63
|
+
def test_sets_a_ruby_compatible_encoding_name
|
64
|
+
str1 = 'test'
|
65
|
+
str1.detect_encoding!
|
66
|
+
assert_equal 'ISO-8859-1', str1.encoding.name
|
67
|
+
|
68
|
+
not_compat_txt = fixture("ISO-2022-KR.txt").read
|
69
|
+
not_compat_txt.detect_encoding!
|
70
|
+
assert_equal 'ASCII-8BIT', not_compat_txt.encoding.name
|
71
|
+
end
|
51
72
|
end
|
52
|
-
end
|
73
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Lopez
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake-compiler
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- test/converter_test.rb
|
83
83
|
- test/encoding_detector_test.rb
|
84
84
|
- test/fixtures/AnsiGraph.psm1
|
85
|
+
- test/fixtures/ISO-2022-KR.txt
|
85
86
|
- test/fixtures/TwigExtensionsDate.es.yml
|
86
87
|
- test/fixtures/cl-messagepack.lisp
|
87
88
|
- test/fixtures/core.rkt
|