static_holmes 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/charlock_holmes/common.h +41 -0
- data/ext/charlock_holmes/converter.c +57 -0
- data/ext/charlock_holmes/encoding_detector.c +377 -0
- data/ext/charlock_holmes/ext.c +15 -0
- data/ext/charlock_holmes/extconf.rb +102 -0
- data/ext/charlock_holmes/transliterator.cpp +130 -0
- data/lib/charlock_holmes/encoding_detector.rb +76 -0
- data/lib/charlock_holmes/string.rb +34 -0
- data/lib/charlock_holmes/version.rb +3 -0
- data/lib/charlock_holmes.rb +6 -0
- metadata +98 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1e35947554b465d48dc970a60031dcca03df441bd829b4464dba2ecf0bc792bb
|
4
|
+
data.tar.gz: 4cdfaff28364c07fc96a0a77a62d2fb5dbccc130859a4268065084e5abdff449
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3007aa5b2d53c6046dc65086cb74881ea10f48fda3de2c89a053a42a820f5045b20ce2b716ce9ac0d51f3e6b704a85e9659958cf70c9631e74593637dfbae486
|
7
|
+
data.tar.gz: 13ad7377a3525419518ef4c20ce6c4763a8fc9e8b5cabaa30495d5d79cb8bf0b239e2238a4dcea5bc181572cec4356be3c2d65bf599c440a74a9f398881a6ec5
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#ifndef CHARLOCK_COMMON_H
|
2
|
+
#define CHARLOCK_COMMON_H
|
3
|
+
|
4
|
+
// tell rbx not to use its caching compat layer
|
5
|
+
// by doing this we're making a promise to RBX that
|
6
|
+
// we'll never modify the pointers we get back from RSTRING_PTR
|
7
|
+
#define RSTRING_NOT_MODIFIED
|
8
|
+
|
9
|
+
#include <ruby.h>
|
10
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
11
|
+
#include <ruby/encoding.h>
|
12
|
+
#endif
|
13
|
+
|
14
|
+
/*
 * Build a Ruby String from `len` bytes starting at `str`.
 *
 * When HAVE_RUBY_ENCODING_H is defined the bytes are handed to
 * rb_external_str_new_with_enc() together with `encoding`, which is cast to
 * rb_encoding *. Without encoding support a plain rb_str_new() copy is
 * returned and `encoding` is ignored.
 */
static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new(str, len);
#else
	rb_encoding *target = (rb_encoding *)encoding;

	return rb_external_str_new_with_enc(str, len, target);
#endif
}
|
22
|
+
|
23
|
+
/*
 * Build a Ruby String from `len` bytes starting at `str`.
 *
 * With encoding support the string is created through
 * rb_external_str_new_with_enc() using rb_utf8_encoding(); otherwise it is a
 * plain rb_str_new() copy.
 */
static inline VALUE charlock_new_str(const char *str, size_t len)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new(str, len);
#else
	rb_encoding *utf8 = rb_utf8_encoding();

	return rb_external_str_new_with_enc(str, len, utf8);
#endif
}
|
31
|
+
|
32
|
+
/*
 * Build a Ruby String from the NUL-terminated C string `str`.
 *
 * With encoding support the length is taken with strlen() and the string is
 * created through rb_external_str_new_with_enc() using rb_utf8_encoding();
 * otherwise rb_str_new2() is used.
 */
static inline VALUE charlock_new_str2(const char *str)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new2(str);
#else
	size_t len = strlen(str);

	return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
#endif
}
|
40
|
+
|
41
|
+
#endif
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#include "unicode/ucnv.h"
|
2
|
+
#include "common.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
5
|
+
static VALUE rb_cConverter;
|
6
|
+
|
7
|
+
/*
 * call-seq: converted = Converter.convert(text, source_encoding, target_encoding)
 *
 * Transcode `text` from `source_encoding` to `target_encoding` with ICU's
 * ucnv_convert().
 *
 * rb_txt     - a String holding the bytes to convert
 * rb_src_enc - a String naming the ICU converter the bytes are currently in
 * rb_dst_enc - a String naming the ICU converter to convert to
 *
 * Returns a new String built by charlock_new_enc_str(); with encoding support
 * it is created with the Ruby encoding looked up from `target_encoding`.
 *
 * Raises TypeError if an argument is not a String, ArgumentError if an
 * encoding name contains a NUL byte, the text is longer than INT32_MAX bytes,
 * or ICU reports a failure, and NoMemoryError if the output buffer cannot be
 * allocated.
 */
static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
	VALUE rb_out;
	const char *src_enc;
	const char *dst_enc;
	const char *src_txt;
	char *out_buf;
	void *rb_enc = NULL;
	long txt_len;
	int32_t src_len;
	int32_t out_len;
	UErrorCode status = U_ZERO_ERROR;

	Check_Type(rb_txt, T_STRING);
	Check_Type(rb_src_enc, T_STRING);
	Check_Type(rb_dst_enc, T_STRING);

	// ICU reads the converter names as C strings, so they must be
	// NUL-terminated and free of embedded NUL bytes
	src_enc = StringValueCStr(rb_src_enc);
	dst_enc = StringValueCStr(rb_dst_enc);

	// ucnv_convert() takes an int32_t length; refuse to silently truncate
	txt_len = RSTRING_LEN(rb_txt);
	if (txt_len > INT32_MAX) {
		rb_raise(rb_eArgError, "text is too large to convert (%ld bytes)", txt_len);
	}
	src_txt = RSTRING_PTR(rb_txt);
	src_len = (int32_t)txt_len;

	// first determine the size of the output buffer
	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

#ifdef HAVE_RUBY_ENCODING_H
	rb_enc = (void *)rb_enc_find(dst_enc);
#endif

	// a sizing pass that did not overflow the zero-byte buffer produced no
	// output at all: return an empty string instead of raising an error
	// that carries a non-error status name
	if (status != U_BUFFER_OVERFLOW_ERROR || out_len <= 0) {
		return charlock_new_enc_str("", 0, rb_enc);
	}

	out_buf = malloc((size_t)out_len);
	if (out_buf == NULL) {
		rb_memerror();
	}

	// now do the actual conversion
	status = U_ZERO_ERROR;
	out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
	if (U_FAILURE(status)) {
		free(out_buf);
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);

	free(out_buf);

	return rb_out;
}
|
52
|
+
|
53
|
+
void _init_charlock_converter() {
|
54
|
+
rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
|
55
|
+
|
56
|
+
rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
|
57
|
+
}
|
@@ -0,0 +1,377 @@
|
|
1
|
+
#include "unicode/ucsdet.h"
|
2
|
+
#include "common.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
5
|
+
static VALUE rb_cEncodingDetector;
|
6
|
+
|
7
|
+
typedef struct { /* native state wrapped by each EncodingDetector object */
|
8
|
+
UCharsetDetector *csd; /* ICU detector handle: opened in rb_encdec__alloc, closed in rb_encdec__free */
|
9
|
+
} charlock_detector_t;
|
10
|
+
|
11
|
+
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
12
|
+
{
|
13
|
+
UErrorCode status = U_ZERO_ERROR;
|
14
|
+
const char *mname;
|
15
|
+
const char *mlang;
|
16
|
+
int mconfidence;
|
17
|
+
VALUE rb_match;
|
18
|
+
VALUE enc_tbl;
|
19
|
+
VALUE enc_name;
|
20
|
+
VALUE compat_enc;
|
21
|
+
|
22
|
+
if (!match)
|
23
|
+
return Qnil;
|
24
|
+
|
25
|
+
mname = ucsdet_getName(match, &status);
|
26
|
+
mlang = ucsdet_getLanguage(match, &status);
|
27
|
+
mconfidence = ucsdet_getConfidence(match, &status);
|
28
|
+
|
29
|
+
rb_match = rb_hash_new();
|
30
|
+
|
31
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
|
32
|
+
|
33
|
+
enc_name = charlock_new_str2(mname);
|
34
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
|
35
|
+
|
36
|
+
enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
|
37
|
+
compat_enc = rb_hash_aref(enc_tbl, enc_name);
|
38
|
+
if (!NIL_P(compat_enc)) {
|
39
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
|
40
|
+
}
|
41
|
+
|
42
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
|
43
|
+
|
44
|
+
if (mlang && mlang[0])
|
45
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
|
46
|
+
|
47
|
+
return rb_match;
|
48
|
+
}
|
49
|
+
|
50
|
+
/*
 * Build the Hash reported for binary content:
 * { :type => :binary, :confidence => 100 }.
 */
static VALUE rb_encdec_binarymatch() {
	VALUE result = rb_hash_new();
	VALUE type_key = ID2SYM(rb_intern("type"));
	VALUE type_val = ID2SYM(rb_intern("binary"));

	rb_hash_aset(result, type_key, type_val);
	rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(100));

	return result;
}
|
60
|
+
|
61
|
+
/*
 * Decide whether the bytes of `rb_str` look like binary content.
 *
 * The leading bytes are compared against a fixed list of signatures, longest
 * first; the first signature that matches decides the result. When none
 * matches, the content counts as binary if a NUL byte occurs within the first
 * @binary_scan_length bytes (read from `self`).
 *
 * Returns 1 for binary content, 0 otherwise.
 */
static int detect_binary_content(VALUE self, VALUE rb_str) {
	static const struct {
		const char *magic;  /* leading bytes to look for */
		size_t len;         /* number of bytes in `magic` */
		int is_binary;      /* result reported on a match */
	} signatures[] = {
		{ "%!PS-Adobe-",             11, 0 },  /* application/postscript */
		{ "\x89PNG\x0D\x0A\x1A\x0A",  8, 1 },  /* image/png */
		{ "GIF87a",                   6, 1 },  /* image/gif */
		{ "GIF89a",                   6, 1 },  /* image/gif */
		{ "%PDF-",                    5, 1 },  /* application/pdf */
		{ "\0\0\xfe\xff",             4, 0 },  /* UTF-32BE */
		{ "\xff\xfe\0\0",             4, 0 },  /* UTF-32LE */
		{ "\xFF\xD8\xFF",             3, 1 },  /* image/jpeg */
		{ "\xfe\xff",                 2, 0 },  /* UTF-16BE */
		{ "\xff\xfe",                 2, 0 },  /* UTF-16LE */
	};
	const char *data = RSTRING_PTR(rb_str);
	size_t data_len = RSTRING_LEN(rb_str);
	size_t scan_limit = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
	size_t idx;

	for (idx = 0; idx < sizeof(signatures) / sizeof(signatures[0]); idx++) {
		if (data_len < signatures[idx].len)
			continue;
		if (memcmp(data, signatures[idx].magic, signatures[idx].len) == 0)
			return signatures[idx].is_binary;
	}

	/*
	 * No known signature: a NUL byte inside the scan window most likely
	 * means the contents are binary.
	 */
	if (scan_limit < data_len)
		data_len = scan_limit;

	return memchr(data, 0, data_len) != NULL;
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* call-seq: true/false = EncodingDetector.is_binary? str
|
134
|
+
*
|
135
|
+
* Attempt to detect if a string is binary or text
|
136
|
+
*
|
137
|
+
* str - a String, what you want to perform the binary check on
|
138
|
+
*
|
139
|
+
* Returns: true or false
|
140
|
+
*/
|
141
|
+
static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
{
	// detect_binary_content() reads the raw string buffer, so reject
	// anything that is not a String first (TypeError), as detect and
	// detect_all already do
	Check_Type(str, T_STRING);

	return detect_binary_content(self, str) ? Qtrue : Qfalse;
}
|
148
|
+
|
149
|
+
/*
|
150
|
+
* call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
|
151
|
+
*
|
152
|
+
* Attempt to detect the encoding of this string
|
153
|
+
*
|
154
|
+
* str - a String, what you want to detect the encoding of
|
155
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
156
|
+
* be used as an additional hint to the charset detector
|
157
|
+
*
|
158
|
+
* Returns: a Hash with :encoding, :language, :type and :confidence
|
159
|
+
*/
|
160
|
+
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
{
	charlock_detector_t *detector;
	const UCharsetMatch *best;
	UErrorCode status = U_ZERO_ERROR;
	VALUE rb_str, rb_enc_hint;

	// one required argument (the text) and one optional (the encoding hint)
	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);

	Check_Type(rb_str, T_STRING);
	Data_Get_Struct(self, charlock_detector_t, detector);

	// binary content short-circuits the charset detection entirely
	if (detect_binary_content(self, rb_str))
		return rb_encdec_binarymatch();

	// looks like text: hand the bytes (and the hint, if any) to ICU
	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);

	if (!NIL_P(rb_enc_hint)) {
		Check_Type(rb_enc_hint, T_STRING);
		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
	}

	best = ucsdet_detect(detector->csd, &status);

	return rb_encdec_buildmatch(best);
}
|
188
|
+
|
189
|
+
|
190
|
+
/*
|
191
|
+
* call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
|
192
|
+
*
|
193
|
+
* Attempt to detect the encoding of this string, and return
|
194
|
+
* a list with all the possible encodings that match it.
|
195
|
+
*
|
196
|
+
*
|
197
|
+
* str - a String, what you want to detect the encoding of
|
198
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
199
|
+
* be used as an additional hint to the charset detector
|
200
|
+
*
|
201
|
+
* Returns: an Array with zero or more Hashes,
|
202
|
+
* each one of them with :encoding, :language, :type and :confidence
|
203
|
+
*/
|
204
|
+
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
{
	UErrorCode status = U_ZERO_ERROR;
	charlock_detector_t *detector;
	const UCharsetMatch **csm;
	VALUE rb_ret;
	int32_t i;
	// stays 0 if ICU returns without filling it in
	int32_t match_count = 0;
	VALUE rb_str;
	VALUE rb_enc_hint;
	VALUE binary_match;

	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);

	Check_Type(rb_str, T_STRING);
	Data_Get_Struct(self, charlock_detector_t, detector);

	rb_ret = rb_ary_new();

	// first lets see if this is binary content
	binary_match = Qnil;
	if (detect_binary_content(self, rb_str)) {
		binary_match = rb_encdec_binarymatch();
	}

	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);

	if (!NIL_P(rb_enc_hint)) {
		Check_Type(rb_enc_hint, T_STRING);
		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
	}

	csm = ucsdet_detectAll(detector->csd, &match_count, &status);

	// on failure there is no match array to walk: report no text matches
	if (U_FAILURE(status) || csm == NULL) {
		match_count = 0;
	}

	for (i = 0; i < match_count; ++i) {
		rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
	}

	// the binary verdict, when present, goes first in the list
	if (!NIL_P(binary_match))
		rb_ary_unshift(rb_ret, binary_match);

	return rb_ret;
}
|
246
|
+
|
247
|
+
/*
|
248
|
+
* call-seq: EncodingDetector#strip_tags?
|
249
|
+
*
|
250
|
+
* Returns whether or not the strip_tags flag is set on this detector
|
251
|
+
*
|
252
|
+
* Returns: Boolean
|
253
|
+
*/
|
254
|
+
static VALUE rb_get_strip_tags(VALUE self)
{
	charlock_detector_t *detector;
	UBool enabled;

	Data_Get_Struct(self, charlock_detector_t, detector);

	// ask ICU whether its input filter is currently switched on
	enabled = ucsdet_isInputFilterEnabled(detector->csd);

	if (enabled == 1)
		return Qtrue;

	return Qfalse;
}
|
268
|
+
|
269
|
+
/*
|
270
|
+
* call-seq: EncodingDetector#strip_tags = true
|
271
|
+
*
|
272
|
+
* Enable or disable the stripping of HTML/XML tags from the input before
|
273
|
+
* attempting any detection
|
274
|
+
*
|
275
|
+
* Returns: Boolean, the value passed
|
276
|
+
*/
|
277
|
+
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
{
	charlock_detector_t *detector;
	UBool val;

	Data_Get_Struct(self, charlock_detector_t, detector);

	// follow Ruby truthiness: every value except nil and false enables
	// the filter (previously only the literal `true` did)
	val = RTEST(rb_val) ? 1 : 0;

	ucsdet_enableInputFilter(detector->csd, val);

	return rb_val;
}
|
290
|
+
|
291
|
+
/*
|
292
|
+
* call-seq: detectable_encodings = EncodingDetector.supported_encodings
|
293
|
+
*
|
294
|
+
* The list of detectable encodings supported by this library
|
295
|
+
*
|
296
|
+
* Returns: an Array of Strings
|
297
|
+
*/
|
298
|
+
static VALUE rb_get_supported_encodings(VALUE klass)
{
	UCharsetDetector *csd;
	UErrorCode status = U_ZERO_ERROR;
	UEnumeration *encoding_list;
	VALUE rb_encoding_list;
	int32_t enc_count;
	int32_t i;
	const char *enc_name;
	int32_t enc_name_len;

	// the list is cached on the class after the first call
	rb_encoding_list = rb_iv_get(klass, "encoding_list");
	if (!NIL_P(rb_encoding_list))
		return rb_encoding_list;

	// lazily populate the list
	csd = ucsdet_open(&status);
	encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
	enc_count = uenum_count(encoding_list, &status);

	// do not cache a partial list when ICU could not enumerate its
	// charsets; release whatever was opened and report the error
	if (U_FAILURE(status)) {
		uenum_close(encoding_list);
		ucsdet_close(csd);
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	rb_encoding_list = rb_ary_new();

	// these names are always listed, ahead of what ICU enumerates
	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
	rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));

	for (i = 0; i < enc_count; i++) {
		enc_name = uenum_next(encoding_list, &enc_name_len, &status);
		// NULL means the enumeration ended early or failed
		if (enc_name == NULL || U_FAILURE(status))
			break;
		rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
	}

	rb_iv_set(klass, "encoding_list", rb_encoding_list);

	// the enumeration is a separate ICU object and was never released before
	uenum_close(encoding_list);
	ucsdet_close(csd);

	return rb_encoding_list;
}
|
336
|
+
|
337
|
+
static void rb_encdec__free(void *obj)
|
338
|
+
{
|
339
|
+
charlock_detector_t *detector;
|
340
|
+
|
341
|
+
detector = (charlock_detector_t *)obj;
|
342
|
+
|
343
|
+
if (detector->csd)
|
344
|
+
ucsdet_close(detector->csd);
|
345
|
+
|
346
|
+
free(detector);
|
347
|
+
}
|
348
|
+
|
349
|
+
/*
 * Allocator for EncodingDetector: wraps a zeroed charlock_detector_t in a
 * Ruby object (freed by rb_encdec__free) and opens its ICU detector.
 *
 * Raises NoMemoryError when the struct cannot be allocated and StandardError
 * when ICU fails to open a detector.
 */
static VALUE rb_encdec__alloc(VALUE klass)
{
	charlock_detector_t *detector;
	UErrorCode status = U_ZERO_ERROR;
	VALUE obj;

	detector = calloc(1, sizeof(*detector));
	// the struct is dereferenced below; bail out before wrapping a NULL
	if (detector == NULL) {
		rb_memerror();
	}

	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);

	detector->csd = ucsdet_open(&status);
	if (U_FAILURE(status)) {
		// `obj` already owns `detector`, so rb_encdec__free releases it
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	return obj;
}
|
365
|
+
|
366
|
+
void _init_charlock_encoding_detector()
|
367
|
+
{
|
368
|
+
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
|
369
|
+
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
370
|
+
rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
|
371
|
+
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
372
|
+
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
|
373
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
|
374
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
|
375
|
+
|
376
|
+
rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
|
377
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
extern void _init_charlock_encoding_detector();
|
4
|
+
extern void _init_charlock_converter();
|
5
|
+
extern void _init_charlock_transliterator();
|
6
|
+
|
7
|
+
VALUE rb_mCharlockHolmes;
|
8
|
+
|
9
|
+
void Init_charlock_holmes() {
|
10
|
+
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
|
11
|
+
|
12
|
+
_init_charlock_encoding_detector();
|
13
|
+
_init_charlock_converter();
|
14
|
+
_init_charlock_transliterator();
|
15
|
+
}
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Stop right away when no `make` executable can be found on the PATH.
make_path = `which make`.strip

if make_path.empty?
  STDERR.puts "\n\n",
              "***************************************************************************************",
              "*************** make required (apt-get install make build-essential) =( ***************",
              "***************************************************************************************"
  exit(1)
end
|
10
|
+
|
11
|
+
##
|
12
|
+
# ICU dependency
|
13
|
+
#
|
14
|
+
|
15
|
+
ldflags = nil
cppflags = nil

# On macOS, ask Homebrew where icu4c lives and use that as the default search
# location. Any failure (for example, brew not being installed) is ignored and
# the defaults stay nil.
if RbConfig::CONFIG["host_os"] =~ /darwin/
  begin
    icu_prefix = `brew --prefix icu4c`.chomp
    ldflags = "#{icu_prefix}/lib"
    cppflags = "#{icu_prefix}/include"
    # pkg_config should be less error prone than parsing compiler
    # commandline options, but we need to set default ldflags and cpp flags
    # in case the user doesn't have pkg-config installed
    ENV['PKG_CONFIG_PATH'] ||= "#{icu_prefix}/lib/pkgconfig"
  rescue StandardError
  end
end
|
30
|
+
|
31
|
+
# Register --with-icu-* options, defaulting to the locations found above.
dir_config('icu', cppflags, ldflags)

# Pull compiler and linker flags for the ICU packages from pkg-config.
%w[icu-i18n icu-io icu-uc].each { |pkg| pkg_config(pkg) }

# Only pick a C++ standard when the user has not chosen one already.
$CXXFLAGS << ' -std=c++11' unless $CXXFLAGS.include?("-std=")

icu_available = have_library('icui18n') && have_header('unicode/ucnv.h')

unless icu_available
  STDERR.puts "\n\n",
              "***************************************************************************************",
              "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********",
              "***************************************************************************************"
  exit(1)
end

# Remaining libraries the extension links against; abort on the first one missing.
{ 'z' => 'libz', 'icuuc' => 'libicuuc', 'icudata' => 'libicudata' }.each do |lib, label|
  abort "#{label} missing" unless have_library(lib)
end

$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
|
53
|
+
|
54
|
+
# Map a linker flag of the form "-l<name>" to the static archive file name
# "lib<name>.<$LIBEXT>".
#
# ldflag - the linker flag to translate
#
# Returns the file name, or nil when the flag does not start with "-l".
def libflag_to_filename(ldflag)
  return unless /\A-l(.+)/ === ldflag

  lib_name = Regexp.last_match(1)
  "lib#{lib_name}.#{$LIBEXT}"
end
|
60
|
+
|
61
|
+
# Locate the static archive that corresponds to a "-l<name>" flag.
#
# libflag - the "-l<name>" linker flag
# dirs    - Array of directories to search, in order
#
# Returns the full path of the archive in the first directory containing it.
# Raises RuntimeError when no directory contains the archive.
def resolve_static_library(libflag, dirs)
  filename = libflag_to_filename(libflag)

  dirs.each do |path|
    candidate = File.join(path, filename)
    return File.join(path, filename) if File.exist?(candidate)
  end

  raise "Unable to find #{filename} in #{dirs}"
end
|
70
|
+
|
71
|
+
# Replace the dynamically linked libraries that pkg-config added to $libs with
# the full paths of their static archives.
#
# packages - Array of pkg-config package names (for example "icu-uc")
#
# Returns the new value of $libs.
# Raises RuntimeError when pkg-config cannot answer for a package, or when a
# static archive is not found in the library directories it reports.
def substitute_static_libs(packages)
  # pkg_config returns nil when the lookup fails. Fail with a clear message
  # instead of a NoMethodError/TypeError further down.
  query = lambda do |pkg, option|
    output = pkg_config(pkg, option)
    raise "pkg-config could not provide #{option} for #{pkg}; static linking is not possible" if output.nil?

    output.strip.split(' ')
  end

  # First, find all the -l<lib> flags added by pkg-config. We want to drop
  # these dynamically linked libraries and substitute them with the static libraries.
  libflags = packages.flat_map { |pkg| query.call(pkg, 'libs-only-l') }.uniq

  # To find where the static libraries live, we need to search the
  # library paths given by the -L flag from pkg-config.
  lib_paths = packages.flat_map do |pkg|
    query.call(pkg, 'libs-only-L').map { |lib| lib.gsub(/^-L/, '') }
  end.uniq

  # Drop the -l<lib> flags and add in the static libraries.
  new_libs = $libs.shellsplit
  new_libs.reject! { |arg| libflags.include?(arg) }
  libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
  $libs = new_libs.uniq.shelljoin
end
|
91
|
+
|
92
|
+
# --enable-static switches the ICU libraries to their static archives.
static_p = enable_config('static', false)
linking_state = static_p ? 'enabled' : 'disabled'
message "Static linking is #{linking_state}.\n"

if static_p
  $CXXFLAGS << ' -fPIC'
  ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'

  substitute_static_libs(%w[icu-i18n icu-io icu-uc])
end

create_makefile 'charlock_holmes/charlock_holmes'
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
#undef UChar
|
3
|
+
|
4
|
+
#include <string>
|
5
|
+
#include <unicode/translit.h>
|
6
|
+
|
7
|
+
extern "C" {
|
8
|
+
|
9
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
10
|
+
#include <ruby/encoding.h>
|
11
|
+
static VALUE rb_eEncodingCompatibilityError;
|
12
|
+
|
13
|
+
/*
 * Raise Encoding::CompatibilityError unless `str` is tagged with one of the
 * accepted encodings: UTF-8, US-ASCII or ASCII-8BIT. The encoding handles are
 * looked up once and kept in a function-local static array.
 */
static void check_utf8_encoding(VALUE str) {
	static rb_encoding *accepted[3] = {NULL, NULL, NULL};
	rb_encoding *actual;
	int idx;

	if (accepted[0] == NULL) {
		accepted[0] = rb_utf8_encoding();
		accepted[1] = rb_usascii_encoding();
		accepted[2] = rb_ascii8bit_encoding();
	}

	actual = rb_enc_get(str);
	for (idx = 0; idx < 3; idx++) {
		if (actual == accepted[idx])
			return;
	}

	rb_raise(rb_eEncodingCompatibilityError,
		"Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(actual));
}
|
29
|
+
|
30
|
+
#else
|
31
|
+
static void check_utf8_encoding(VALUE str) {}
|
32
|
+
#endif
|
33
|
+
|
34
|
+
extern VALUE rb_mCharlockHolmes;
|
35
|
+
static VALUE rb_cTransliterator;
|
36
|
+
|
37
|
+
/*
 * call-seq: ids = Transliterator.id_list
 *
 * Returns an Array of Strings with every transliterator ID that
 * icu::Transliterator::getAvailableIDs() reports.
 *
 * Raises ArgumentError when ICU reports a failure. The ICU enumeration is
 * deleted before raising, because rb_raise does not return and would
 * otherwise leak it.
 */
static VALUE rb_transliterator_id_list(VALUE self) {
	UErrorCode status = U_ZERO_ERROR;
	icu::StringEnumeration *id_list;
	int32_t id_list_size;
	const char *curr_id;
	int32_t curr_id_len;
	VALUE rb_ary;

	id_list = icu::Transliterator::getAvailableIDs(status);
	// a NULL enumeration without an error code would be dereferenced below
	if (U_SUCCESS(status) && id_list == NULL) {
		status = U_MEMORY_ALLOCATION_ERROR;
	}
	if(!U_SUCCESS(status)) {
		delete id_list;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	status = U_ZERO_ERROR;
	id_list_size = id_list->count(status);
	if(!U_SUCCESS(status)) {
		delete id_list;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	rb_ary = rb_ary_new2(id_list_size);

	for (;;) {
		curr_id_len = 0;
		curr_id = id_list->next(&curr_id_len, status);
		if(!U_SUCCESS(status)) {
			delete id_list;
			rb_raise(rb_eArgError, "%s", u_errorName(status));
		}

		// NULL marks the end of the enumeration
		if (curr_id == NULL) {
			break;
		}

		rb_ary_push(rb_ary, charlock_new_str(curr_id, curr_id_len));
	}

	delete id_list;

	return rb_ary;
}
|
77
|
+
|
78
|
+
/*
 * call-seq: result = Transliterator.transliterate(text, id)
 *
 * Run `text` through the ICU transliterator named by `id`.
 *
 * rb_txt - a String to transliterate; must pass check_utf8_encoding()
 * rb_id  - a String with the transliterator ID; same encoding requirement
 *
 * Returns a new String built by charlock_new_str() from the UTF-8 result.
 *
 * Raises TypeError for non-String arguments, Encoding::CompatibilityError
 * from check_utf8_encoding(), and ArgumentError when an argument is longer
 * than INT32_MAX bytes or ICU cannot create the transliterator.
 */
static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
	UErrorCode status = U_ZERO_ERROR;
	UParseError p_error;
	icu::Transliterator *trans;
	const char *txt;
	size_t txt_len;
	const char *id;
	size_t id_len;
	VALUE rb_out;

	Check_Type(rb_txt, T_STRING);
	Check_Type(rb_id, T_STRING);

	check_utf8_encoding(rb_txt);
	check_utf8_encoding(rb_id);

	txt = RSTRING_PTR(rb_txt);
	txt_len = RSTRING_LEN(rb_txt);
	id = RSTRING_PTR(rb_id);
	id_len = RSTRING_LEN(rb_id);

	// ICU string lengths are int32_t; refuse to silently truncate
	if (txt_len > (size_t)INT32_MAX || id_len > (size_t)INT32_MAX) {
		rb_raise(rb_eArgError, "input is too large to transliterate");
	}

	// The input was checked to be UTF-8 above, so decode it as UTF-8
	// explicitly; the (const char *, int32_t) constructor decodes with the
	// platform's default codepage instead.
	trans = icu::Transliterator::createInstance(
		icu::UnicodeString::fromUTF8(icu::StringPiece(id, (int32_t)id_len)),
		UTRANS_FORWARD, p_error, status);
	if (U_SUCCESS(status) && trans == NULL) {
		status = U_MEMORY_ALLOCATION_ERROR;
	}
	if(!U_SUCCESS(status)) {
		delete trans;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	// declared only after the last rb_raise above, so no raise on this
	// path skips a live C++ object's destructor
	std::string result;
	{
		icu::UnicodeString u_txt = icu::UnicodeString::fromUTF8(icu::StringPiece(txt, (int32_t)txt_len));

		trans->transliterate(u_txt);

		icu::StringByteSink<std::string> sink(&result);
		u_txt.toUTF8(sink);
	}

	delete trans;

	rb_out = charlock_new_str(result.data(), result.length());

	return rb_out;
}
|
118
|
+
|
119
|
+
void _init_charlock_transliterator() {
|
120
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
121
|
+
rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
|
122
|
+
#endif
|
123
|
+
|
124
|
+
rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
|
125
|
+
|
126
|
+
rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0);
|
127
|
+
rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
|
128
|
+
}
|
129
|
+
|
130
|
+
}
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module CharlockHolmes
  class EncodingDetector
    # Default length for which to scan content for NULL bytes
    DEFAULT_BINARY_SCAN_LEN = 1024*1024

    # Name stored in the encoding table for encodings Ruby cannot resolve
    BINARY = 'binary'

    # Length for which to scan content for NULL bytes
    attr_accessor :binary_scan_length

    alias :strip_tags? :strip_tags

    # scan_len - how many bytes to scan for NULL bytes
    #            (defaults to DEFAULT_BINARY_SCAN_LEN)
    def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
      @binary_scan_length = scan_len
    end

    # A mapping table of supported encoding names from EncodingDetector
    # which point to the corresponding supported encoding name in Ruby.
    # Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "binary"}
    #
    # Encodings that can't be mapped between Charlock and Ruby resolve
    # to BINARY.
    @encoding_table = {}

    class << self
      # Attempt to detect the encoding of this string
      #
      # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
      # as well as use the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: a Hash with :encoding, :language, :type and :confidence
      def detect(str, hint_enc=nil)
        detector = new
        detector.detect(str, hint_enc)
      end

      # Attempt to detect the encoding of this string, and return
      # a list with all the possible encodings that match it.
      #
      # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
      # as well as use the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: an Array with zero or more Hashes,
      # each one of them with :encoding, :language, :type and :confidence
      def detect_all(str, hint_enc=nil)
        detector = new
        detector.detect_all(str, hint_enc)
      end

      # Returns the Hash that maps detector encoding names to Ruby encoding names.
      def encoding_table
        @encoding_table
      end

      # Fills the encoding table by running through the list of supported encodings
      # in the ICU detection API and trying to map them to supported encodings in Ruby.
      # This is built dynamically so as to take advantage of ICU upgrades which may have
      # support for more encodings in the future.
      #
      # Returns nothing.
      def build_encoding_table
        supported_encodings.each do |name|
          ruby_name =
            begin
              ::Encoding.find(name).name
            rescue ArgumentError
              BINARY
            end
          @encoding_table[name] = ruby_name
        end
      end
    end

    build_encoding_table
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'charlock_holmes' unless defined? CharlockHolmes
|
2
|
+
|
3
|
+
class String
  # Attempt to detect the encoding of this string
  #
  # hint_enc - an optional encoding name passed on to the detector as a hint
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect(self, hint_enc)
  end

  # Attempt to detect the encoding of this string, and return
  # a list with all the possible encodings that match it.
  #
  # hint_enc - an optional encoding name passed on to the detector as a hint
  #
  # Returns: an Array with zero or more Hashes,
  # each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect_all(self, hint_enc)
  end

  if method_defined? :force_encoding
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected ala `force_encoding`
    #
    # The string is left untouched when nothing was detected or the
    # detection result carries no :ruby_encoding.
    #
    # Returns: self
    def detect_encoding!(hint_enc=nil)
      detected = detect_encoding(hint_enc)
      ruby_encoding = detected && detected[:ruby_encoding]
      force_encoding(ruby_encoding) if ruby_encoding
      self
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: static_holmes
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.7
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Lopez
|
8
|
+
- Vicent Martí
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2024-03-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake-compiler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.0'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: minitest
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '5.11'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '5.11'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: chardet
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0.9'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0.9'
|
56
|
+
description: charlock_holmes provides binary and text detection as well as text transcoding
|
57
|
+
using libicu
|
58
|
+
email: seniorlopez@gmail.com
|
59
|
+
executables: []
|
60
|
+
extensions:
|
61
|
+
- ext/charlock_holmes/extconf.rb
|
62
|
+
extra_rdoc_files: []
|
63
|
+
files:
|
64
|
+
- ext/charlock_holmes/common.h
|
65
|
+
- ext/charlock_holmes/converter.c
|
66
|
+
- ext/charlock_holmes/encoding_detector.c
|
67
|
+
- ext/charlock_holmes/ext.c
|
68
|
+
- ext/charlock_holmes/extconf.rb
|
69
|
+
- ext/charlock_holmes/transliterator.cpp
|
70
|
+
- lib/charlock_holmes.rb
|
71
|
+
- lib/charlock_holmes/encoding_detector.rb
|
72
|
+
- lib/charlock_holmes/string.rb
|
73
|
+
- lib/charlock_holmes/version.rb
|
74
|
+
homepage: https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
|
75
|
+
licenses:
|
76
|
+
- MIT
|
77
|
+
metadata: {}
|
78
|
+
post_install_message:
|
79
|
+
rdoc_options:
|
80
|
+
- "--charset=UTF-8"
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 1.9.3
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubygems_version: 3.5.6
|
95
|
+
signing_key:
|
96
|
+
specification_version: 4
|
97
|
+
summary: Character encoding detection, brought to you by ICU
|
98
|
+
test_files: []
|