charlock_holmes 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +6 -0
- data/ext/charlock_holmes/charlock_holmes.c +38 -9
- data/ext/charlock_holmes/extconf.rb +3 -0
- data/lib/charlock_holmes/encoding_detector.rb +21 -2
- data/lib/charlock_holmes/string.rb +13 -4
- data/lib/charlock_holmes/version.rb +1 -1
- data/spec/encoding_detector_spec.rb +42 -1
- data/spec/string_method_spec.rb +30 -0
- metadata +73 -44
data/README.md
CHANGED
@@ -21,6 +21,8 @@ detection = CharlockHolmes::EncodingDetector.detect(contents)
|
|
21
21
|
# that's mostly only returned for legacy encodings like ISO-8859-1
|
22
22
|
```
|
23
23
|
|
24
|
+
NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
|
25
|
+
|
24
26
|
Though it's more efficient to reuse one detector instance:
|
25
27
|
|
26
28
|
``` ruby
|
@@ -66,3 +68,7 @@ If the traditional `gem install charlock_holmes` doesn't work, you may need to s
|
|
66
68
|
At the time of writing, if you installed ICU via homebrew on OSX your gem install may look something like this:
|
67
69
|
|
68
70
|
`gem install charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
|
71
|
+
|
72
|
+
If you're using Bundler and need to specify a custom path, you can do so with the `bundle config` command:
|
73
|
+
|
74
|
+
`bundle config build.charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
|
@@ -44,41 +44,64 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
44
44
|
}
|
45
45
|
|
46
46
|
/*
|
47
|
-
* call-seq: detection_hash = EncodingDetector.detect
|
47
|
+
* call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
|
48
48
|
*
|
49
49
|
* Attempt to detect the encoding of this string
|
50
50
|
*
|
51
|
+
* str - a String, what you want to detect the encoding of
|
52
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
53
|
+
* be used as an additional hint to the charset detector
|
54
|
+
*
|
51
55
|
* Returns: a Hash with :encoding, :language and :confidence
|
52
56
|
*/
|
53
|
-
static VALUE rb_encdec_detect(VALUE
|
57
|
+
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
54
58
|
{
|
55
59
|
UErrorCode status = U_ZERO_ERROR;
|
56
60
|
UCharsetDetector *csd;
|
61
|
+
VALUE rb_str;
|
62
|
+
VALUE rb_enc_hint;
|
63
|
+
|
64
|
+
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
57
65
|
|
58
66
|
Check_Type(rb_str, T_STRING);
|
59
67
|
Data_Get_Struct(self, UCharsetDetector, csd);
|
60
68
|
|
61
69
|
ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
|
70
|
+
|
71
|
+
if (!NIL_P(rb_enc_hint)) {
|
72
|
+
Check_Type(rb_enc_hint, T_STRING);
|
73
|
+
ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
74
|
+
}
|
75
|
+
|
62
76
|
return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
|
63
77
|
}
|
64
78
|
|
65
79
|
|
66
80
|
/*
|
67
|
-
* call-seq: detection_hash_array = EncodingDetector.detect_all
|
81
|
+
* call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
|
68
82
|
*
|
69
83
|
* Attempt to detect the encoding of this string, and return
|
70
84
|
* a list with all the possible encodings that match it.
|
71
85
|
*
|
72
|
-
*
|
86
|
+
*
|
87
|
+
* str - a String, what you want to detect the encoding of
|
88
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
89
|
+
* be used as an additional hint to the charset detector
|
90
|
+
*
|
91
|
+
* Returns: an Array with zero or more Hashes,
|
73
92
|
* each one of them with :encoding, :language and :confidence
|
74
93
|
*/
|
75
|
-
static VALUE rb_encdec_detect_all(VALUE
|
94
|
+
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
76
95
|
{
|
77
96
|
UErrorCode status = U_ZERO_ERROR;
|
78
97
|
UCharsetDetector *csd;
|
79
|
-
|
98
|
+
const UCharsetMatch **csm;
|
80
99
|
VALUE rb_ret;
|
81
100
|
int i, match_count;
|
101
|
+
VALUE rb_str;
|
102
|
+
VALUE rb_enc_hint;
|
103
|
+
|
104
|
+
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
82
105
|
|
83
106
|
Check_Type(rb_str, T_STRING);
|
84
107
|
Data_Get_Struct(self, UCharsetDetector, csd);
|
@@ -86,7 +109,13 @@ static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
|
|
86
109
|
rb_ret = rb_ary_new();
|
87
110
|
|
88
111
|
ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
|
89
|
-
|
112
|
+
|
113
|
+
if (!NIL_P(rb_enc_hint)) {
|
114
|
+
Check_Type(rb_enc_hint, T_STRING);
|
115
|
+
ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
116
|
+
}
|
117
|
+
|
118
|
+
csm = ucsdet_detectAll(csd, &match_count, &status);
|
90
119
|
|
91
120
|
for (i = 0; i < match_count; ++i) {
|
92
121
|
rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
|
@@ -114,6 +143,6 @@ void Init_charlock_holmes()
|
|
114
143
|
|
115
144
|
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
|
116
145
|
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
117
|
-
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, 1);
|
118
|
-
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, 1);
|
146
|
+
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
147
|
+
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
|
119
148
|
}
|
@@ -4,9 +4,28 @@ module CharlockHolmes
|
|
4
4
|
#
|
5
5
|
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
|
6
6
|
#
|
7
|
+
# str - a String, what you want to detect the encoding of
|
8
|
+
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
9
|
+
# be used as an additional hint to the charset detector
|
10
|
+
#
|
7
11
|
# Returns: a Hash with :encoding, :language and :confidence
|
8
|
-
def self.detect(str)
|
9
|
-
new.detect(str)
|
12
|
+
def self.detect(str, hint_enc=nil)
|
13
|
+
new.detect(str, hint_enc)
|
14
|
+
end
|
15
|
+
|
16
|
+
# Attempt to detect the encoding of this string, and return
|
17
|
+
# a list with all the possible encodings that match it.
|
18
|
+
#
|
19
|
+
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
|
20
|
+
#
|
21
|
+
# str - a String, what you want to detect the encoding of
|
22
|
+
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
23
|
+
# be used as an additional hint to the charset detector
|
24
|
+
#
|
25
|
+
# Returns: an Array with zero or more Hashes,
|
26
|
+
# each one of them with :encoding, :language and :confidence
|
27
|
+
def self.detect_all(str, hint_enc=nil)
|
28
|
+
new.detect_all(str, hint_enc)
|
10
29
|
end
|
11
30
|
end
|
12
31
|
end
|
@@ -4,8 +4,17 @@ class String
|
|
4
4
|
# Attempt to detect the encoding of this string
|
5
5
|
#
|
6
6
|
# Returns: a Hash with :encoding, :language and :confidence
|
7
|
-
def detect_encoding
|
8
|
-
encoding_detector.detect(self)
|
7
|
+
def detect_encoding(hint_enc=nil)
|
8
|
+
encoding_detector.detect(self, hint_enc)
|
9
|
+
end
|
10
|
+
|
11
|
+
# Attempt to detect the encoding of this string, and return
|
12
|
+
# a list with all the possible encodings that match it.
|
13
|
+
#
|
14
|
+
# Returns: an Array with zero or more Hashes,
|
15
|
+
# each one of them with :encoding, :language and :confidence
|
16
|
+
def detect_encodings(hint_enc=nil)
|
17
|
+
encoding_detector.detect_all(self, hint_enc)
|
9
18
|
end
|
10
19
|
|
11
20
|
if RUBY_VERSION =~ /1.9/
|
@@ -13,8 +22,8 @@ class String
|
|
13
22
|
# then set the encoding to what was detected ala `force_encoding`
|
14
23
|
#
|
15
24
|
# Returns: a Hash with :encoding, :language and :confidence
|
16
|
-
def detect_encoding!
|
17
|
-
if detected = self.detect_encoding
|
25
|
+
def detect_encoding!(hint_enc=nil)
|
26
|
+
if detected = self.detect_encoding(hint_enc)
|
18
27
|
self.force_encoding detected[:encoding]
|
19
28
|
detected
|
20
29
|
end
|
@@ -5,21 +5,62 @@ describe CharlockHolmes::EncodingDetector do
|
|
5
5
|
@detector = CharlockHolmes::EncodingDetector.new
|
6
6
|
end
|
7
7
|
|
8
|
-
test 'has a
|
8
|
+
test 'has a class-level detect method' do
|
9
9
|
CharlockHolmes::EncodingDetector.respond_to? :detect
|
10
10
|
detected = CharlockHolmes::EncodingDetector.detect 'test'
|
11
11
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
12
12
|
end
|
13
13
|
|
14
|
+
test 'has a class-level detect method that accepts an encoding hint' do
|
15
|
+
CharlockHolmes::EncodingDetector.respond_to? :detect
|
16
|
+
detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
|
17
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
18
|
+
end
|
19
|
+
|
20
|
+
test 'has a class-level detect_all method' do
|
21
|
+
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
22
|
+
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
|
23
|
+
assert detected_list.is_a? Array
|
24
|
+
|
25
|
+
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
26
|
+
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
27
|
+
end
|
28
|
+
|
29
|
+
test 'has a class-level detect_all method that accepts an encoding hint' do
|
30
|
+
CharlockHolmes::EncodingDetector.respond_to? :detect_all
|
31
|
+
detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
|
32
|
+
assert detected_list.is_a? Array
|
33
|
+
|
34
|
+
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
35
|
+
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
36
|
+
end
|
37
|
+
|
14
38
|
test 'has a detect method' do
|
15
39
|
@detector.respond_to? :detect
|
16
40
|
detected = @detector.detect 'test'
|
17
41
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
18
42
|
end
|
19
43
|
|
44
|
+
test 'has a detect method that accepts an encoding hint' do
|
45
|
+
@detector.respond_to? :detect
|
46
|
+
detected = @detector.detect 'test', 'UTF-8'
|
47
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
48
|
+
end
|
49
|
+
|
20
50
|
test 'has a detect_all method' do
|
21
51
|
@detector.respond_to? :detect_all
|
22
52
|
detected_list = @detector.detect_all 'test'
|
53
|
+
assert detected_list.is_a? Array
|
54
|
+
|
55
|
+
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
56
|
+
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
57
|
+
end
|
58
|
+
|
59
|
+
test 'has a detect_all method that accepts an encoding hint' do
|
60
|
+
@detector.respond_to? :detect_all
|
61
|
+
detected_list = @detector.detect_all 'test', 'UTF-8'
|
62
|
+
assert detected_list.is_a? Array
|
63
|
+
|
23
64
|
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
24
65
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
25
66
|
end
|
data/spec/string_method_spec.rb
CHANGED
@@ -10,6 +10,36 @@ describe String do
|
|
10
10
|
assert_equal 'ISO-8859-1', detected[:encoding]
|
11
11
|
end
|
12
12
|
|
13
|
+
test 'has a detect_encoding method that accepts an encoding hint' do
|
14
|
+
str = 'test'
|
15
|
+
str.respond_to? :detect_encoding
|
16
|
+
|
17
|
+
detected = str.detect_encoding 'UTF-8'
|
18
|
+
assert_equal 'ISO-8859-1', detected[:encoding]
|
19
|
+
end
|
20
|
+
|
21
|
+
test 'has a detect_encodings method' do
|
22
|
+
str = 'test'
|
23
|
+
str.respond_to? :detect_encodings
|
24
|
+
|
25
|
+
detected_list = str.detect_encodings
|
26
|
+
assert detected_list.is_a? Array
|
27
|
+
|
28
|
+
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
29
|
+
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
30
|
+
end
|
31
|
+
|
32
|
+
test 'has a detect_encodings method that accepts an encoding hint' do
|
33
|
+
str = 'test'
|
34
|
+
str.respond_to? :detect_encodings
|
35
|
+
|
36
|
+
detected_list = str.detect_encodings 'UTF-8'
|
37
|
+
assert detected_list.is_a? Array
|
38
|
+
|
39
|
+
encoding_list = detected_list.map {|d| d[:encoding]}.sort
|
40
|
+
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
41
|
+
end
|
42
|
+
|
13
43
|
if RUBY_VERSION =~ /1.9/
|
14
44
|
test 'has a detect_encoding! method' do
|
15
45
|
str = 'test'
|
metadata
CHANGED
@@ -1,58 +1,79 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Brian Lopez
|
9
|
-
- Vicent
|
14
|
+
- "Vicent Mart\xC3\xAD"
|
10
15
|
autorequire:
|
11
16
|
bindir: bin
|
12
17
|
cert_chain: []
|
13
|
-
|
18
|
+
|
19
|
+
date: 2011-08-25 00:00:00 -07:00
|
14
20
|
default_executable:
|
15
|
-
dependencies:
|
16
|
-
- !ruby/object:Gem::Dependency
|
21
|
+
dependencies:
|
22
|
+
- !ruby/object:Gem::Dependency
|
17
23
|
name: rake-compiler
|
18
|
-
|
24
|
+
prerelease: false
|
25
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
26
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
27
|
+
requirements:
|
28
|
+
- - ">="
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
hash: 9
|
31
|
+
segments:
|
32
|
+
- 0
|
33
|
+
- 7
|
34
|
+
- 5
|
23
35
|
version: 0.7.5
|
24
36
|
type: :development
|
25
|
-
|
26
|
-
|
27
|
-
- !ruby/object:Gem::Dependency
|
37
|
+
version_requirements: *id001
|
38
|
+
- !ruby/object:Gem::Dependency
|
28
39
|
name: rspec
|
29
|
-
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
42
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
hash: 15
|
47
|
+
segments:
|
48
|
+
- 2
|
49
|
+
- 0
|
50
|
+
- 0
|
34
51
|
version: 2.0.0
|
35
52
|
type: :development
|
36
|
-
|
37
|
-
|
38
|
-
- !ruby/object:Gem::Dependency
|
53
|
+
version_requirements: *id002
|
54
|
+
- !ruby/object:Gem::Dependency
|
39
55
|
name: chardet
|
40
|
-
|
56
|
+
prerelease: false
|
57
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
58
|
none: false
|
42
|
-
requirements:
|
43
|
-
- -
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
hash: 3
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
version: "0"
|
46
66
|
type: :development
|
47
|
-
|
48
|
-
version_requirements: *70293348070400
|
67
|
+
version_requirements: *id003
|
49
68
|
description:
|
50
69
|
email: seniorlopez@gmail.com
|
51
70
|
executables: []
|
52
|
-
|
71
|
+
|
72
|
+
extensions:
|
53
73
|
- ext/charlock_holmes/extconf.rb
|
54
74
|
extra_rdoc_files: []
|
55
|
-
|
75
|
+
|
76
|
+
files:
|
56
77
|
- .gitignore
|
57
78
|
- .rspec
|
58
79
|
- Gemfile
|
@@ -81,31 +102,39 @@ files:
|
|
81
102
|
has_rdoc: true
|
82
103
|
homepage: http://github.com/brianmario/charlock_holmes
|
83
104
|
licenses: []
|
105
|
+
|
84
106
|
post_install_message:
|
85
|
-
rdoc_options:
|
107
|
+
rdoc_options:
|
86
108
|
- --charset=UTF-8
|
87
|
-
require_paths:
|
109
|
+
require_paths:
|
88
110
|
- lib
|
89
111
|
- ext
|
90
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
113
|
none: false
|
92
|
-
requirements:
|
93
|
-
- -
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
|
96
|
-
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
hash: 3
|
118
|
+
segments:
|
119
|
+
- 0
|
120
|
+
version: "0"
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
122
|
none: false
|
98
|
-
requirements:
|
99
|
-
- -
|
100
|
-
- !ruby/object:Gem::Version
|
101
|
-
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
hash: 3
|
127
|
+
segments:
|
128
|
+
- 0
|
129
|
+
version: "0"
|
102
130
|
requirements: []
|
131
|
+
|
103
132
|
rubyforge_project:
|
104
133
|
rubygems_version: 1.6.2
|
105
134
|
signing_key:
|
106
135
|
specification_version: 3
|
107
136
|
summary: Character encoding detection, brought to you by ICU
|
108
|
-
test_files:
|
137
|
+
test_files:
|
109
138
|
- spec/encoding_detector_spec.rb
|
110
139
|
- spec/fixtures/AnsiGraph.psm1
|
111
140
|
- spec/fixtures/TwigExtensionsDate.es.yml
|