charlock_holmes 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock
CHANGED
@@ -8,6 +8,15 @@
|
|
8
8
|
static VALUE rb_mCharlockHolmes;
|
9
9
|
static VALUE rb_cEncodingDetector;
|
10
10
|
|
11
|
+
static VALUE charlock_new_str(const char *str, size_t len)
|
12
|
+
{
|
13
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
14
|
+
return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
|
15
|
+
#else
|
16
|
+
return rb_str_new(str, len);
|
17
|
+
#endif
|
18
|
+
}
|
19
|
+
|
11
20
|
static VALUE charlock_new_str2(const char *str)
|
12
21
|
{
|
13
22
|
#ifdef HAVE_RUBY_ENCODING_H
|
@@ -89,7 +98,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
|
89
98
|
* be used as an additional hint to the charset detector
|
90
99
|
*
|
91
100
|
* Returns: an Array with zero or more Hashes,
|
92
|
-
*
|
101
|
+
* each one of them with :encoding, :language and :confidence
|
93
102
|
*/
|
94
103
|
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
95
104
|
{
|
@@ -124,6 +133,90 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
|
124
133
|
return rb_ret;
|
125
134
|
}
|
126
135
|
|
136
|
+
/*
|
137
|
+
* call-seq: EncodingDetector#strip_tags
|
138
|
+
*
|
139
|
+
* Returns whether or not the strip_tags flag is set on this detector
|
140
|
+
*
|
141
|
+
* Returns: Boolean
|
142
|
+
*/
|
143
|
+
static VALUE rb_get_strip_tags(VALUE self)
|
144
|
+
{
|
145
|
+
UCharsetDetector *csd;
|
146
|
+
UBool val;
|
147
|
+
VALUE rb_val;
|
148
|
+
|
149
|
+
Data_Get_Struct(self, UCharsetDetector, csd);
|
150
|
+
|
151
|
+
val = ucsdet_isInputFilterEnabled(csd);
|
152
|
+
|
153
|
+
rb_val = val == 1 ? Qtrue : Qfalse;
|
154
|
+
|
155
|
+
return rb_val;
|
156
|
+
}
|
157
|
+
|
158
|
+
/*
|
159
|
+
* call-seq: EncodingDetector#strip_tags = true
|
160
|
+
*
|
161
|
+
* Enable or disable the stripping of HTML/XML tags from the input before
|
162
|
+
* attempting any detection
|
163
|
+
*
|
164
|
+
* Returns: Boolean, the value passed
|
165
|
+
*/
|
166
|
+
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
|
167
|
+
{
|
168
|
+
UCharsetDetector *csd;
|
169
|
+
UBool val;
|
170
|
+
|
171
|
+
Data_Get_Struct(self, UCharsetDetector, csd);
|
172
|
+
|
173
|
+
val = rb_val == Qtrue ? 1 : 0;
|
174
|
+
|
175
|
+
ucsdet_enableInputFilter(csd, val);
|
176
|
+
|
177
|
+
return rb_val;
|
178
|
+
}
|
179
|
+
|
180
|
+
/*
|
181
|
+
* call-seq: detectable_encodings = EncodingDetector.supported_encodings
|
182
|
+
*
|
183
|
+
* The list of detectable encodings supported by this library
|
184
|
+
*
|
185
|
+
* Returns: an Array of Strings
|
186
|
+
*/
|
187
|
+
static VALUE rb_get_supported_encodings(VALUE klass)
|
188
|
+
{
|
189
|
+
UCharsetDetector *csd;
|
190
|
+
UErrorCode status = U_ZERO_ERROR;
|
191
|
+
UEnumeration *encoding_list;
|
192
|
+
VALUE rb_encoding_list;
|
193
|
+
int32_t enc_count;
|
194
|
+
int32_t i;
|
195
|
+
const char *enc_name;
|
196
|
+
int32_t enc_name_len;
|
197
|
+
|
198
|
+
csd = ucsdet_open(&status);
|
199
|
+
|
200
|
+
rb_encoding_list = rb_iv_get(klass, "encoding_list");
|
201
|
+
|
202
|
+
// lazily populate the list
|
203
|
+
if (NIL_P(rb_encoding_list)) {
|
204
|
+
encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
|
205
|
+
rb_encoding_list = rb_ary_new();
|
206
|
+
enc_count = uenum_count(encoding_list, &status);
|
207
|
+
|
208
|
+
for(i=0; i < enc_count; i++) {
|
209
|
+
enc_name = uenum_next(encoding_list, &enc_name_len, &status);
|
210
|
+
rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
|
211
|
+
}
|
212
|
+
|
213
|
+
rb_iv_set(klass, "encoding_list", rb_encoding_list);
|
214
|
+
}
|
215
|
+
|
216
|
+
ucsdet_close(csd);
|
217
|
+
|
218
|
+
return rb_encoding_list;
|
219
|
+
}
|
127
220
|
|
128
221
|
static void rb_encdec__free(void *csd)
|
129
222
|
{
|
@@ -145,4 +238,8 @@ void Init_charlock_holmes()
|
|
145
238
|
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
146
239
|
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
147
240
|
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
|
241
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
|
242
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
|
243
|
+
|
244
|
+
rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
|
148
245
|
}
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'spec_helper'
|
2
4
|
|
3
5
|
describe CharlockHolmes::EncodingDetector do
|
@@ -65,6 +67,29 @@ describe CharlockHolmes::EncodingDetector do
|
|
65
67
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
66
68
|
end
|
67
69
|
|
70
|
+
test 'has a strip_tags flag' do
|
71
|
+
detector = CharlockHolmes::EncodingDetector.new
|
72
|
+
detector.strip_tags = true
|
73
|
+
assert detector.strip_tags
|
74
|
+
|
75
|
+
detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
|
76
|
+
assert_equal 'UTF-8', detection[:encoding]
|
77
|
+
|
78
|
+
detector.strip_tags = false
|
79
|
+
assert !detector.strip_tags
|
80
|
+
|
81
|
+
detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
|
82
|
+
assert_equal 'UTF-8', detection[:encoding]
|
83
|
+
end
|
84
|
+
|
85
|
+
test 'has a list of supported encodings' do
|
86
|
+
CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
|
87
|
+
supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
|
88
|
+
|
89
|
+
assert supported_encodings.is_a?(Array)
|
90
|
+
assert supported_encodings.include? 'UTF-8'
|
91
|
+
end
|
92
|
+
|
68
93
|
context 'encoding detection' do
|
69
94
|
MAPPING = [
|
70
95
|
['repl2.cljs', 'ISO-8859-1'],
|
metadata
CHANGED
@@ -1,79 +1,58 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Brian Lopez
|
14
|
-
-
|
9
|
+
- Vicent Martí
|
15
10
|
autorequire:
|
16
11
|
bindir: bin
|
17
12
|
cert_chain: []
|
18
|
-
|
19
|
-
date: 2011-08-25 00:00:00 -07:00
|
13
|
+
date: 2011-08-25 00:00:00.000000000 -07:00
|
20
14
|
default_executable:
|
21
|
-
dependencies:
|
22
|
-
- !ruby/object:Gem::Dependency
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
23
17
|
name: rake-compiler
|
24
|
-
|
25
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
18
|
+
requirement: &70365951363440 !ruby/object:Gem::Requirement
|
26
19
|
none: false
|
27
|
-
requirements:
|
28
|
-
- -
|
29
|
-
- !ruby/object:Gem::Version
|
30
|
-
hash: 9
|
31
|
-
segments:
|
32
|
-
- 0
|
33
|
-
- 7
|
34
|
-
- 5
|
20
|
+
requirements:
|
21
|
+
- - ! '>='
|
22
|
+
- !ruby/object:Gem::Version
|
35
23
|
version: 0.7.5
|
36
24
|
type: :development
|
37
|
-
version_requirements: *id001
|
38
|
-
- !ruby/object:Gem::Dependency
|
39
|
-
name: rspec
|
40
25
|
prerelease: false
|
41
|
-
|
26
|
+
version_requirements: *70365951363440
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: &70365951362940 !ruby/object:Gem::Requirement
|
42
30
|
none: false
|
43
|
-
requirements:
|
44
|
-
- -
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
hash: 15
|
47
|
-
segments:
|
48
|
-
- 2
|
49
|
-
- 0
|
50
|
-
- 0
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
51
34
|
version: 2.0.0
|
52
35
|
type: :development
|
53
|
-
version_requirements: *id002
|
54
|
-
- !ruby/object:Gem::Dependency
|
55
|
-
name: chardet
|
56
36
|
prerelease: false
|
57
|
-
|
37
|
+
version_requirements: *70365951362940
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: chardet
|
40
|
+
requirement: &70365951394040 !ruby/object:Gem::Requirement
|
58
41
|
none: false
|
59
|
-
requirements:
|
60
|
-
- -
|
61
|
-
- !ruby/object:Gem::Version
|
62
|
-
|
63
|
-
segments:
|
64
|
-
- 0
|
65
|
-
version: "0"
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
66
46
|
type: :development
|
67
|
-
|
47
|
+
prerelease: false
|
48
|
+
version_requirements: *70365951394040
|
68
49
|
description:
|
69
50
|
email: seniorlopez@gmail.com
|
70
51
|
executables: []
|
71
|
-
|
72
|
-
extensions:
|
52
|
+
extensions:
|
73
53
|
- ext/charlock_holmes/extconf.rb
|
74
54
|
extra_rdoc_files: []
|
75
|
-
|
76
|
-
files:
|
55
|
+
files:
|
77
56
|
- .gitignore
|
78
57
|
- .rspec
|
79
58
|
- Gemfile
|
@@ -102,39 +81,31 @@ files:
|
|
102
81
|
has_rdoc: true
|
103
82
|
homepage: http://github.com/brianmario/charlock_holmes
|
104
83
|
licenses: []
|
105
|
-
|
106
84
|
post_install_message:
|
107
|
-
rdoc_options:
|
85
|
+
rdoc_options:
|
108
86
|
- --charset=UTF-8
|
109
|
-
require_paths:
|
87
|
+
require_paths:
|
110
88
|
- lib
|
111
89
|
- ext
|
112
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
113
91
|
none: false
|
114
|
-
requirements:
|
115
|
-
- -
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
|
118
|
-
|
119
|
-
- 0
|
120
|
-
version: "0"
|
121
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ! '>='
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
97
|
none: false
|
123
|
-
requirements:
|
124
|
-
- -
|
125
|
-
- !ruby/object:Gem::Version
|
126
|
-
|
127
|
-
segments:
|
128
|
-
- 0
|
129
|
-
version: "0"
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
130
102
|
requirements: []
|
131
|
-
|
132
103
|
rubyforge_project:
|
133
104
|
rubygems_version: 1.6.2
|
134
105
|
signing_key:
|
135
106
|
specification_version: 3
|
136
107
|
summary: Character encoding detection, brought to you by ICU
|
137
|
-
test_files:
|
108
|
+
test_files:
|
138
109
|
- spec/encoding_detector_spec.rb
|
139
110
|
- spec/fixtures/AnsiGraph.psm1
|
140
111
|
- spec/fixtures/TwigExtensionsDate.es.yml
|