charlock_holmes 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock
CHANGED
@@ -8,6 +8,15 @@
|
|
8
8
|
static VALUE rb_mCharlockHolmes;
|
9
9
|
static VALUE rb_cEncodingDetector;
|
10
10
|
|
11
|
+
static VALUE charlock_new_str(const char *str, size_t len)
|
12
|
+
{
|
13
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
14
|
+
return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
|
15
|
+
#else
|
16
|
+
return rb_str_new(str, len);
|
17
|
+
#endif
|
18
|
+
}
|
19
|
+
|
11
20
|
static VALUE charlock_new_str2(const char *str)
|
12
21
|
{
|
13
22
|
#ifdef HAVE_RUBY_ENCODING_H
|
@@ -89,7 +98,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
|
89
98
|
* be used as an additional hint to the charset detector
|
90
99
|
*
|
91
100
|
* Returns: an Array with zero or more Hashes,
|
92
|
-
*
|
101
|
+
* each one of them with :encoding, :language and :confidence
|
93
102
|
*/
|
94
103
|
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
95
104
|
{
|
@@ -124,6 +133,90 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
|
124
133
|
return rb_ret;
|
125
134
|
}
|
126
135
|
|
136
|
+
/*
|
137
|
+
* call-seq: EncodingDetector#strip_tags
|
138
|
+
*
|
139
|
+
* Returns whether or not the strip_tags flag is set on this detector
|
140
|
+
*
|
141
|
+
* Returns: Boolean
|
142
|
+
*/
|
143
|
+
static VALUE rb_get_strip_tags(VALUE self)
|
144
|
+
{
|
145
|
+
UCharsetDetector *csd;
|
146
|
+
UBool val;
|
147
|
+
VALUE rb_val;
|
148
|
+
|
149
|
+
Data_Get_Struct(self, UCharsetDetector, csd);
|
150
|
+
|
151
|
+
val = ucsdet_isInputFilterEnabled(csd);
|
152
|
+
|
153
|
+
rb_val = val == 1 ? Qtrue : Qfalse;
|
154
|
+
|
155
|
+
return rb_val;
|
156
|
+
}
|
157
|
+
|
158
|
+
/*
|
159
|
+
* call-seq: EncodingDetector#strip_tags = true
|
160
|
+
*
|
161
|
+
* Enable or disable the stripping of HTML/XML tags from the input before
|
162
|
+
* attempting any detection
|
163
|
+
*
|
164
|
+
* Returns: Boolean, the value passed
|
165
|
+
*/
|
166
|
+
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
|
167
|
+
{
|
168
|
+
UCharsetDetector *csd;
|
169
|
+
UBool val;
|
170
|
+
|
171
|
+
Data_Get_Struct(self, UCharsetDetector, csd);
|
172
|
+
|
173
|
+
val = rb_val == Qtrue ? 1 : 0;
|
174
|
+
|
175
|
+
ucsdet_enableInputFilter(csd, val);
|
176
|
+
|
177
|
+
return rb_val;
|
178
|
+
}
|
179
|
+
|
180
|
+
/*
|
181
|
+
* call-seq: detectable_encodings = EncodingDetector.supported_encodings
|
182
|
+
*
|
183
|
+
* The list of detectable encodings supported by this library
|
184
|
+
*
|
185
|
+
* Returns: an Array of Strings
|
186
|
+
*/
|
187
|
+
static VALUE rb_get_supported_encodings(VALUE klass)
|
188
|
+
{
|
189
|
+
UCharsetDetector *csd;
|
190
|
+
UErrorCode status = U_ZERO_ERROR;
|
191
|
+
UEnumeration *encoding_list;
|
192
|
+
VALUE rb_encoding_list;
|
193
|
+
int32_t enc_count;
|
194
|
+
int32_t i;
|
195
|
+
const char *enc_name;
|
196
|
+
int32_t enc_name_len;
|
197
|
+
|
198
|
+
csd = ucsdet_open(&status);
|
199
|
+
|
200
|
+
rb_encoding_list = rb_iv_get(klass, "encoding_list");
|
201
|
+
|
202
|
+
// lazily populate the list
|
203
|
+
if (NIL_P(rb_encoding_list)) {
|
204
|
+
encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
|
205
|
+
rb_encoding_list = rb_ary_new();
|
206
|
+
enc_count = uenum_count(encoding_list, &status);
|
207
|
+
|
208
|
+
for(i=0; i < enc_count; i++) {
|
209
|
+
enc_name = uenum_next(encoding_list, &enc_name_len, &status);
|
210
|
+
rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
|
211
|
+
}
|
212
|
+
|
213
|
+
rb_iv_set(klass, "encoding_list", rb_encoding_list);
|
214
|
+
}
|
215
|
+
|
216
|
+
ucsdet_close(csd);
|
217
|
+
|
218
|
+
return rb_encoding_list;
|
219
|
+
}
|
127
220
|
|
128
221
|
static void rb_encdec__free(void *csd)
|
129
222
|
{
|
@@ -145,4 +238,8 @@ void Init_charlock_holmes()
|
|
145
238
|
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
146
239
|
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
147
240
|
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
|
241
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
|
242
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
|
243
|
+
|
244
|
+
rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
|
148
245
|
}
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'spec_helper'
|
2
4
|
|
3
5
|
describe CharlockHolmes::EncodingDetector do
|
@@ -65,6 +67,29 @@ describe CharlockHolmes::EncodingDetector do
|
|
65
67
|
assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
|
66
68
|
end
|
67
69
|
|
70
|
+
test 'has a strip_tags flag' do
|
71
|
+
detector = CharlockHolmes::EncodingDetector.new
|
72
|
+
detector.strip_tags = true
|
73
|
+
assert detector.strip_tags
|
74
|
+
|
75
|
+
detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
|
76
|
+
assert_equal 'UTF-8', detection[:encoding]
|
77
|
+
|
78
|
+
detector.strip_tags = false
|
79
|
+
assert !detector.strip_tags
|
80
|
+
|
81
|
+
detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
|
82
|
+
assert_equal 'UTF-8', detection[:encoding]
|
83
|
+
end
|
84
|
+
|
85
|
+
test 'has a list of supported encodings' do
|
86
|
+
CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
|
87
|
+
supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
|
88
|
+
|
89
|
+
assert supported_encodings.is_a?(Array)
|
90
|
+
assert supported_encodings.include? 'UTF-8'
|
91
|
+
end
|
92
|
+
|
68
93
|
context 'encoding detection' do
|
69
94
|
MAPPING = [
|
70
95
|
['repl2.cljs', 'ISO-8859-1'],
|
metadata
CHANGED
@@ -1,79 +1,58 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Brian Lopez
|
14
|
-
-
|
9
|
+
- Vicent Martí
|
15
10
|
autorequire:
|
16
11
|
bindir: bin
|
17
12
|
cert_chain: []
|
18
|
-
|
19
|
-
date: 2011-08-25 00:00:00 -07:00
|
13
|
+
date: 2011-08-25 00:00:00.000000000 -07:00
|
20
14
|
default_executable:
|
21
|
-
dependencies:
|
22
|
-
- !ruby/object:Gem::Dependency
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
23
17
|
name: rake-compiler
|
24
|
-
|
25
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
18
|
+
requirement: &70365951363440 !ruby/object:Gem::Requirement
|
26
19
|
none: false
|
27
|
-
requirements:
|
28
|
-
- -
|
29
|
-
- !ruby/object:Gem::Version
|
30
|
-
hash: 9
|
31
|
-
segments:
|
32
|
-
- 0
|
33
|
-
- 7
|
34
|
-
- 5
|
20
|
+
requirements:
|
21
|
+
- - ! '>='
|
22
|
+
- !ruby/object:Gem::Version
|
35
23
|
version: 0.7.5
|
36
24
|
type: :development
|
37
|
-
version_requirements: *id001
|
38
|
-
- !ruby/object:Gem::Dependency
|
39
|
-
name: rspec
|
40
25
|
prerelease: false
|
41
|
-
|
26
|
+
version_requirements: *70365951363440
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: &70365951362940 !ruby/object:Gem::Requirement
|
42
30
|
none: false
|
43
|
-
requirements:
|
44
|
-
- -
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
hash: 15
|
47
|
-
segments:
|
48
|
-
- 2
|
49
|
-
- 0
|
50
|
-
- 0
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
51
34
|
version: 2.0.0
|
52
35
|
type: :development
|
53
|
-
version_requirements: *id002
|
54
|
-
- !ruby/object:Gem::Dependency
|
55
|
-
name: chardet
|
56
36
|
prerelease: false
|
57
|
-
|
37
|
+
version_requirements: *70365951362940
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: chardet
|
40
|
+
requirement: &70365951394040 !ruby/object:Gem::Requirement
|
58
41
|
none: false
|
59
|
-
requirements:
|
60
|
-
- -
|
61
|
-
- !ruby/object:Gem::Version
|
62
|
-
|
63
|
-
segments:
|
64
|
-
- 0
|
65
|
-
version: "0"
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
66
46
|
type: :development
|
67
|
-
|
47
|
+
prerelease: false
|
48
|
+
version_requirements: *70365951394040
|
68
49
|
description:
|
69
50
|
email: seniorlopez@gmail.com
|
70
51
|
executables: []
|
71
|
-
|
72
|
-
extensions:
|
52
|
+
extensions:
|
73
53
|
- ext/charlock_holmes/extconf.rb
|
74
54
|
extra_rdoc_files: []
|
75
|
-
|
76
|
-
files:
|
55
|
+
files:
|
77
56
|
- .gitignore
|
78
57
|
- .rspec
|
79
58
|
- Gemfile
|
@@ -102,39 +81,31 @@ files:
|
|
102
81
|
has_rdoc: true
|
103
82
|
homepage: http://github.com/brianmario/charlock_holmes
|
104
83
|
licenses: []
|
105
|
-
|
106
84
|
post_install_message:
|
107
|
-
rdoc_options:
|
85
|
+
rdoc_options:
|
108
86
|
- --charset=UTF-8
|
109
|
-
require_paths:
|
87
|
+
require_paths:
|
110
88
|
- lib
|
111
89
|
- ext
|
112
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
113
91
|
none: false
|
114
|
-
requirements:
|
115
|
-
- -
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
|
118
|
-
|
119
|
-
- 0
|
120
|
-
version: "0"
|
121
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ! '>='
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
97
|
none: false
|
123
|
-
requirements:
|
124
|
-
- -
|
125
|
-
- !ruby/object:Gem::Version
|
126
|
-
|
127
|
-
segments:
|
128
|
-
- 0
|
129
|
-
version: "0"
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
130
102
|
requirements: []
|
131
|
-
|
132
103
|
rubyforge_project:
|
133
104
|
rubygems_version: 1.6.2
|
134
105
|
signing_key:
|
135
106
|
specification_version: 3
|
136
107
|
summary: Character encoding detection, brought to you by ICU
|
137
|
-
test_files:
|
108
|
+
test_files:
|
138
109
|
- spec/encoding_detector_spec.rb
|
139
110
|
- spec/fixtures/AnsiGraph.psm1
|
140
111
|
- spec/fixtures/TwigExtensionsDate.es.yml
|