static_holmes 0.7.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/charlock_holmes/common.h +41 -0
- data/ext/charlock_holmes/converter.c +57 -0
- data/ext/charlock_holmes/encoding_detector.c +377 -0
- data/ext/charlock_holmes/ext.c +15 -0
- data/ext/charlock_holmes/extconf.rb +102 -0
- data/ext/charlock_holmes/transliterator.cpp +130 -0
- data/lib/charlock_holmes/encoding_detector.rb +76 -0
- data/lib/charlock_holmes/string.rb +34 -0
- data/lib/charlock_holmes/version.rb +3 -0
- data/lib/charlock_holmes.rb +6 -0
- metadata +98 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1e35947554b465d48dc970a60031dcca03df441bd829b4464dba2ecf0bc792bb
|
4
|
+
data.tar.gz: 4cdfaff28364c07fc96a0a77a62d2fb5dbccc130859a4268065084e5abdff449
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3007aa5b2d53c6046dc65086cb74881ea10f48fda3de2c89a053a42a820f5045b20ce2b716ce9ac0d51f3e6b704a85e9659958cf70c9631e74593637dfbae486
|
7
|
+
data.tar.gz: 13ad7377a3525419518ef4c20ce6c4763a8fc9e8b5cabaa30495d5d79cb8bf0b239e2238a4dcea5bc181572cec4356be3c2d65bf599c440a74a9f398881a6ec5
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#ifndef CHARLOCK_COMMON_H
|
2
|
+
#define CHARLOCK_COMMON_H
|
3
|
+
|
4
|
+
// tell rbx not to use its caching compat layer
|
5
|
+
// by doing this we're making a promise to RBX that
|
6
|
+
// we'll never modify the pointers we get back from RSTRING_PTR
|
7
|
+
#define RSTRING_NOT_MODIFIED
|
8
|
+
|
9
|
+
#include <ruby.h>
|
10
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
11
|
+
#include <ruby/encoding.h>
|
12
|
+
#endif
|
13
|
+
|
14
|
+
/*
 * Build a Ruby String from the `len` bytes starting at `str`.
 *
 * str      - pointer to the bytes to copy into the new String
 * len      - number of bytes to copy
 * encoding - an rb_encoding pointer passed as void *; only used when Ruby
 *            was built with encoding support (HAVE_RUBY_ENCODING_H)
 *
 * Returns the new String VALUE.
 */
static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
{
#ifndef HAVE_RUBY_ENCODING_H
	/* no encoding support in this Ruby: hand back the raw bytes */
	return rb_str_new(str, len);
#else
	rb_encoding *target_enc = (rb_encoding *)encoding;

	return rb_external_str_new_with_enc(str, len, target_enc);
#endif
}
|
22
|
+
|
23
|
+
/*
 * Build a Ruby String from the `len` bytes starting at `str`, using Ruby's
 * UTF-8 encoding when encoding support (HAVE_RUBY_ENCODING_H) is available.
 *
 * Returns the new String VALUE.
 */
static inline VALUE charlock_new_str(const char *str, size_t len)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new(str, len);
#else
	rb_encoding *utf8 = rb_utf8_encoding();

	return rb_external_str_new_with_enc(str, len, utf8);
#endif
}
|
31
|
+
|
32
|
+
/*
 * Build a Ruby String from the NUL-terminated C string `str`, using Ruby's
 * UTF-8 encoding when encoding support (HAVE_RUBY_ENCODING_H) is available.
 *
 * `str` must not be NULL: its length is measured with strlen().
 *
 * Returns the new String VALUE.
 */
static inline VALUE charlock_new_str2(const char *str)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new2(str);
#else
	size_t byte_len = strlen(str);

	return rb_external_str_new_with_enc(str, byte_len, rb_utf8_encoding());
#endif
}
|
40
|
+
|
41
|
+
#endif
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#include "unicode/ucnv.h"
|
2
|
+
#include "common.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
5
|
+
static VALUE rb_cConverter;
|
6
|
+
|
7
|
+
/*
 * call-seq: converted = Converter.convert(txt, src_enc, dst_enc)
 *
 * Transcode txt from src_enc to dst_enc using ICU's ucnv_convert.
 *
 * txt     - a String, the bytes to convert
 * src_enc - a String, the ICU converter name txt is currently encoded in
 * dst_enc - a String, the ICU converter name to convert to
 *
 * Raises TypeError unless all three arguments are Strings, and ArgumentError
 * when an encoding name contains a NUL byte, when txt is longer than ICU can
 * address, or when ICU reports a conversion failure (the message is the ICU
 * error name).
 *
 * Returns: a new String holding the converted bytes
 */
static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
	VALUE rb_out;
	const char *src_enc;
	const char *dst_enc;
	const char *src_txt;
	char *out_buf;
	void *rb_enc = NULL;
	long txt_len;
	int32_t src_len;
	int32_t out_len;
	UErrorCode status = U_ZERO_ERROR;

	Check_Type(rb_txt, T_STRING);
	Check_Type(rb_src_enc, T_STRING);
	Check_Type(rb_dst_enc, T_STRING);

	// ICU takes an int32_t length; refuse input that would be truncated
	txt_len = RSTRING_LEN(rb_txt);
	if (txt_len > INT32_MAX) {
		rb_raise(rb_eArgError, "input is too large to convert");
	}
	src_txt = RSTRING_PTR(rb_txt);
	src_len = (int32_t)txt_len;

	// the converter names are consumed as C strings, so they must be
	// NUL-terminated and free of embedded NUL bytes
	src_enc = StringValueCStr(rb_src_enc);
	dst_enc = StringValueCStr(rb_dst_enc);

	// first determine the size of the output buffer.
	// A non-empty result reports U_BUFFER_OVERFLOW_ERROR here because the
	// capacity is 0; a zero-length result does not, so only statuses that
	// are real failures are raised.
	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	// xmalloc raises NoMemoryError instead of returning NULL; the extra byte
	// keeps the request non-zero when the result is empty
	out_buf = xmalloc((size_t)out_len + 1);

	// now do the actual conversion
	status = U_ZERO_ERROR;
	out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
	if (U_FAILURE(status)) {
		xfree(out_buf);
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

#ifdef HAVE_RUBY_ENCODING_H
	rb_enc = (void *)rb_enc_find(dst_enc);
#endif

	rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);

	xfree(out_buf);

	// keep the argument Strings alive while their buffers were in use
	RB_GC_GUARD(rb_txt);
	RB_GC_GUARD(rb_src_enc);
	RB_GC_GUARD(rb_dst_enc);

	return rb_out;
}
|
52
|
+
|
53
|
+
void _init_charlock_converter() {
|
54
|
+
rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
|
55
|
+
|
56
|
+
rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
|
57
|
+
}
|
@@ -0,0 +1,377 @@
|
|
1
|
+
#include "unicode/ucsdet.h"
|
2
|
+
#include "common.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
5
|
+
static VALUE rb_cEncodingDetector;
|
6
|
+
|
7
|
+
/*
 * Native state wrapped by each EncodingDetector object: the ICU charset
 * detector handle opened in rb_encdec__alloc and closed in rb_encdec__free.
 */
typedef struct {
|
8
|
+
UCharsetDetector *csd;
|
9
|
+
} charlock_detector_t;
|
10
|
+
|
11
|
+
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
12
|
+
{
|
13
|
+
UErrorCode status = U_ZERO_ERROR;
|
14
|
+
const char *mname;
|
15
|
+
const char *mlang;
|
16
|
+
int mconfidence;
|
17
|
+
VALUE rb_match;
|
18
|
+
VALUE enc_tbl;
|
19
|
+
VALUE enc_name;
|
20
|
+
VALUE compat_enc;
|
21
|
+
|
22
|
+
if (!match)
|
23
|
+
return Qnil;
|
24
|
+
|
25
|
+
mname = ucsdet_getName(match, &status);
|
26
|
+
mlang = ucsdet_getLanguage(match, &status);
|
27
|
+
mconfidence = ucsdet_getConfidence(match, &status);
|
28
|
+
|
29
|
+
rb_match = rb_hash_new();
|
30
|
+
|
31
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
|
32
|
+
|
33
|
+
enc_name = charlock_new_str2(mname);
|
34
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
|
35
|
+
|
36
|
+
enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
|
37
|
+
compat_enc = rb_hash_aref(enc_tbl, enc_name);
|
38
|
+
if (!NIL_P(compat_enc)) {
|
39
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
|
40
|
+
}
|
41
|
+
|
42
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
|
43
|
+
|
44
|
+
if (mlang && mlang[0])
|
45
|
+
rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
|
46
|
+
|
47
|
+
return rb_match;
|
48
|
+
}
|
49
|
+
|
50
|
+
/*
 * Build the Hash reported for binary content:
 * { :type => :binary, :confidence => 100 }
 */
static VALUE rb_encdec_binarymatch() {
	VALUE result = rb_hash_new();
	VALUE type_key = ID2SYM(rb_intern("type"));
	VALUE type_val = ID2SYM(rb_intern("binary"));
	VALUE confidence_key = ID2SYM(rb_intern("confidence"));

	rb_hash_aset(result, type_key, type_val);
	rb_hash_aset(result, confidence_key, INT2NUM(100));

	return result;
}
|
60
|
+
|
61
|
+
/*
 * Decide whether the bytes of rb_str look like binary content.
 *
 * self   - the detector; its @binary_scan_length ivar bounds the NULL-byte scan
 * rb_str - the String to inspect (callers are expected to pass a String)
 *
 * The leading bytes are compared against a fixed list of signatures, longest
 * first; the first signature that matches decides the result. When none
 * matches, the content counts as binary if a NULL byte occurs within the
 * first @binary_scan_length bytes.
 *
 * Returns 1 for binary content, 0 otherwise.
 */
static int detect_binary_content(VALUE self, VALUE rb_str) {
	static const struct {
		const char *magic;
		size_t magic_len;
		int is_binary;
	} signatures[] = {
		{ "%!PS-Adobe-",                 11, 0 }, // application/postscript
		{ "\x89PNG\x0D\x0A\x1A\x0A",      8, 1 }, // image/png
		{ "GIF87a",                       6, 1 }, // image/gif
		{ "GIF89a",                       6, 1 }, // image/gif
		{ "%PDF-",                        5, 1 }, // application/pdf
		{ "\0\0\xfe\xff",                 4, 0 }, // UTF-32BE
		{ "\xff\xfe\0\0",                 4, 0 }, // UTF-32LE
		{ "\xFF\xD8\xFF",                 3, 1 }, // image/jpeg
		{ "\xfe\xff",                     2, 0 }, // UTF-16BE
		{ "\xff\xfe",                     2, 0 }  // UTF-16LE
	};
	const char *bytes = RSTRING_PTR(rb_str);
	size_t total = RSTRING_LEN(rb_str);
	size_t limit = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
	size_t idx;

	for (idx = 0; idx < sizeof(signatures) / sizeof(signatures[0]); idx++) {
		if (total < signatures[idx].magic_len)
			continue;
		if (memcmp(bytes, signatures[idx].magic, signatures[idx].magic_len) == 0)
			return signatures[idx].is_binary;
	}

	/*
	 * No known signature: any NULL byte inside the scan window
	 * will likely mean the contents are binary.
	 */
	if (limit < total)
		total = limit;

	return memchr(bytes, 0, total) != NULL;
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* call-seq: true/false = EncodingDetector.is_binary? str
|
134
|
+
*
|
135
|
+
* Attempt to detect if a string is binary or text
|
136
|
+
*
|
137
|
+
* str - a String, what you want to perform the binary check on
|
138
|
+
*
|
139
|
+
* Returns: true or false
|
140
|
+
*/
|
141
|
+
static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
|
142
|
+
{
|
143
|
+
if (detect_binary_content(self, str))
|
144
|
+
return Qtrue;
|
145
|
+
else
|
146
|
+
return Qfalse;
|
147
|
+
}
|
148
|
+
|
149
|
+
/*
|
150
|
+
* call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
|
151
|
+
*
|
152
|
+
* Attempt to detect the encoding of this string
|
153
|
+
*
|
154
|
+
* str - a String, what you want to detect the encoding of
|
155
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
156
|
+
* be used as an additional hint to the charset detector
|
157
|
+
*
|
158
|
+
* Returns: a Hash with :encoding, :language, :type and :confidence
|
159
|
+
*/
|
160
|
+
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
161
|
+
{
|
162
|
+
UErrorCode status = U_ZERO_ERROR;
|
163
|
+
charlock_detector_t *detector;
|
164
|
+
VALUE rb_str;
|
165
|
+
VALUE rb_enc_hint;
|
166
|
+
|
167
|
+
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
168
|
+
|
169
|
+
Check_Type(rb_str, T_STRING);
|
170
|
+
Data_Get_Struct(self, charlock_detector_t, detector);
|
171
|
+
|
172
|
+
// first lets see if this is binary content
|
173
|
+
if (detect_binary_content(self, rb_str)) {
|
174
|
+
return rb_encdec_binarymatch();
|
175
|
+
}
|
176
|
+
|
177
|
+
// if we got here - the data doesn't look like binary
|
178
|
+
// lets try to figure out what encoding the text is in
|
179
|
+
ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
|
180
|
+
|
181
|
+
if (!NIL_P(rb_enc_hint)) {
|
182
|
+
Check_Type(rb_enc_hint, T_STRING);
|
183
|
+
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
184
|
+
}
|
185
|
+
|
186
|
+
return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
|
187
|
+
}
|
188
|
+
|
189
|
+
|
190
|
+
/*
|
191
|
+
* call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
|
192
|
+
*
|
193
|
+
* Attempt to detect the encoding of this string, and return
|
194
|
+
* a list with all the possible encodings that match it.
|
195
|
+
*
|
196
|
+
*
|
197
|
+
* str - a String, what you want to detect the encoding of
|
198
|
+
* hint_enc - an optional String (like "UTF-8"), the encoding name which will
|
199
|
+
* be used as an additional hint to the charset detector
|
200
|
+
*
|
201
|
+
* Returns: an Array with zero or more Hashes,
|
202
|
+
* each one of them with with :encoding, :language, :type and :confidence
|
203
|
+
*/
|
204
|
+
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
205
|
+
{
|
206
|
+
UErrorCode status = U_ZERO_ERROR;
|
207
|
+
charlock_detector_t *detector;
|
208
|
+
const UCharsetMatch **csm;
|
209
|
+
VALUE rb_ret;
|
210
|
+
int i, match_count;
|
211
|
+
VALUE rb_str;
|
212
|
+
VALUE rb_enc_hint;
|
213
|
+
VALUE binary_match;
|
214
|
+
|
215
|
+
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
216
|
+
|
217
|
+
Check_Type(rb_str, T_STRING);
|
218
|
+
Data_Get_Struct(self, charlock_detector_t, detector);
|
219
|
+
|
220
|
+
rb_ret = rb_ary_new();
|
221
|
+
|
222
|
+
// first lets see if this is binary content
|
223
|
+
binary_match = Qnil;
|
224
|
+
if (detect_binary_content(self, rb_str)) {
|
225
|
+
binary_match = rb_encdec_binarymatch();
|
226
|
+
}
|
227
|
+
|
228
|
+
ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
|
229
|
+
|
230
|
+
if (!NIL_P(rb_enc_hint)) {
|
231
|
+
Check_Type(rb_enc_hint, T_STRING);
|
232
|
+
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
233
|
+
}
|
234
|
+
|
235
|
+
csm = ucsdet_detectAll(detector->csd, &match_count, &status);
|
236
|
+
|
237
|
+
for (i = 0; i < match_count; ++i) {
|
238
|
+
rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
|
239
|
+
}
|
240
|
+
|
241
|
+
if (!NIL_P(binary_match))
|
242
|
+
rb_ary_unshift(rb_ret, binary_match);
|
243
|
+
|
244
|
+
return rb_ret;
|
245
|
+
}
|
246
|
+
|
247
|
+
/*
|
248
|
+
* call-seq: EncodingDetector#strip_tags?
|
249
|
+
*
|
250
|
+
* Returns whether or not the strip_tags flag is set on this detector
|
251
|
+
*
|
252
|
+
* Returns: Boolean
|
253
|
+
*/
|
254
|
+
static VALUE rb_get_strip_tags(VALUE self)
|
255
|
+
{
|
256
|
+
charlock_detector_t *detector;
|
257
|
+
UBool val;
|
258
|
+
VALUE rb_val;
|
259
|
+
|
260
|
+
Data_Get_Struct(self, charlock_detector_t, detector);
|
261
|
+
|
262
|
+
val = ucsdet_isInputFilterEnabled(detector->csd);
|
263
|
+
|
264
|
+
rb_val = val == 1 ? Qtrue : Qfalse;
|
265
|
+
|
266
|
+
return rb_val;
|
267
|
+
}
|
268
|
+
|
269
|
+
/*
|
270
|
+
* call-seq: EncodingDetector#strip_tags = true
|
271
|
+
*
|
272
|
+
* Enable or disable the stripping of HTML/XML tags from the input before
|
273
|
+
* attempting any detection
|
274
|
+
*
|
275
|
+
* Returns: Boolean, the value passed
|
276
|
+
*/
|
277
|
+
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
|
278
|
+
{
|
279
|
+
charlock_detector_t *detector;
|
280
|
+
UBool val;
|
281
|
+
|
282
|
+
Data_Get_Struct(self, charlock_detector_t, detector);
|
283
|
+
|
284
|
+
val = rb_val == Qtrue ? 1 : 0;
|
285
|
+
|
286
|
+
ucsdet_enableInputFilter(detector->csd, val);
|
287
|
+
|
288
|
+
return rb_val;
|
289
|
+
}
|
290
|
+
|
291
|
+
/*
|
292
|
+
* call-seq: detectable_encodings = EncodingDetector.supported_encodings
|
293
|
+
*
|
294
|
+
* The list of detectable encodings supported by this library
|
295
|
+
*
|
296
|
+
* Returns: an Array of Strings
|
297
|
+
*/
|
298
|
+
static VALUE rb_get_supported_encodings(VALUE klass)
|
299
|
+
{
|
300
|
+
UCharsetDetector *csd;
|
301
|
+
UErrorCode status = U_ZERO_ERROR;
|
302
|
+
UEnumeration *encoding_list;
|
303
|
+
VALUE rb_encoding_list;
|
304
|
+
int32_t enc_count;
|
305
|
+
int32_t i;
|
306
|
+
const char *enc_name;
|
307
|
+
int32_t enc_name_len;
|
308
|
+
|
309
|
+
rb_encoding_list = rb_iv_get(klass, "encoding_list");
|
310
|
+
|
311
|
+
// lazily populate the list
|
312
|
+
if (NIL_P(rb_encoding_list)) {
|
313
|
+
csd = ucsdet_open(&status);
|
314
|
+
|
315
|
+
encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
|
316
|
+
rb_encoding_list = rb_ary_new();
|
317
|
+
enc_count = uenum_count(encoding_list, &status);
|
318
|
+
|
319
|
+
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
|
320
|
+
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
|
321
|
+
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
|
322
|
+
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
|
323
|
+
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
|
324
|
+
|
325
|
+
for(i=0; i < enc_count; i++) {
|
326
|
+
enc_name = uenum_next(encoding_list, &enc_name_len, &status);
|
327
|
+
rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
|
328
|
+
}
|
329
|
+
|
330
|
+
rb_iv_set(klass, "encoding_list", rb_encoding_list);
|
331
|
+
ucsdet_close(csd);
|
332
|
+
}
|
333
|
+
|
334
|
+
return rb_encoding_list;
|
335
|
+
}
|
336
|
+
|
337
|
+
static void rb_encdec__free(void *obj)
|
338
|
+
{
|
339
|
+
charlock_detector_t *detector;
|
340
|
+
|
341
|
+
detector = (charlock_detector_t *)obj;
|
342
|
+
|
343
|
+
if (detector->csd)
|
344
|
+
ucsdet_close(detector->csd);
|
345
|
+
|
346
|
+
free(detector);
|
347
|
+
}
|
348
|
+
|
349
|
+
/*
 * Allocator for EncodingDetector: wraps a zeroed charlock_detector_t in a
 * Ruby object and opens the ICU charset detector it will use.
 *
 * Raises NoMemoryError when the wrapper cannot be allocated, and
 * StandardError (message is the ICU error name) when ICU cannot open a
 * detector.
 *
 * Returns the new, wrapped object.
 */
static VALUE rb_encdec__alloc(VALUE klass)
{
	charlock_detector_t *detector;
	UErrorCode status = U_ZERO_ERROR;
	VALUE obj;

	detector = calloc(1, sizeof(charlock_detector_t));
	if (detector == NULL) {
		// never hand a NULL struct to Data_Wrap_Struct / dereference it below
		rb_memerror();
	}

	// wrap first, so rb_encdec__free releases the struct even when the
	// raise below is taken
	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);

	detector->csd = ucsdet_open(&status);
	if (U_FAILURE(status)) {
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	return obj;
}
|
365
|
+
|
366
|
+
void _init_charlock_encoding_detector()
|
367
|
+
{
|
368
|
+
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
|
369
|
+
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
370
|
+
rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
|
371
|
+
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
372
|
+
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
|
373
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
|
374
|
+
rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
|
375
|
+
|
376
|
+
rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
|
377
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
extern void _init_charlock_encoding_detector();
|
4
|
+
extern void _init_charlock_converter();
|
5
|
+
extern void _init_charlock_transliterator();
|
6
|
+
|
7
|
+
VALUE rb_mCharlockHolmes;
|
8
|
+
|
9
|
+
void Init_charlock_holmes() {
|
10
|
+
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
|
11
|
+
|
12
|
+
_init_charlock_encoding_detector();
|
13
|
+
_init_charlock_converter();
|
14
|
+
_init_charlock_transliterator();
|
15
|
+
}
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Stop right away when no `make` executable can be found.
make_path = `which make`.strip
if make_path.empty?
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
  STDERR.puts "***************************************************************************************"
  exit(1)
end

##
# ICU dependency
#

ldflags = nil
cppflags = nil

if RbConfig::CONFIG["host_os"] =~ /darwin/
  begin
    brew_prefix = `brew --prefix icu4c`.chomp
    ldflags = "#{brew_prefix}/lib"
    cppflags = "#{brew_prefix}/include"
    pkg_conf = "#{brew_prefix}/lib/pkgconfig"
    # pkg_config should be less error prone than parsing compiler
    # command line options, but default ldflags and cppflags are still set
    # above in case the user doesn't have pkg-config installed
    ENV['PKG_CONFIG_PATH'] ||= pkg_conf
  rescue
    # no usable Homebrew: continue without the defaults
  end
end

dir_config 'icu', cppflags, ldflags

%w[icu-i18n icu-io icu-uc].each { |pkg| pkg_config(pkg) }

# only pick a C++ standard when none was requested already
if !$CXXFLAGS.include?("-std=")
  $CXXFLAGS << ' -std=c++11'
end

icu_found = have_library('icui18n') && have_header('unicode/ucnv.h')
unless icu_found
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end

# remaining libraries the extension links against; abort on the first one missing
{
  'z'       => 'libz missing',
  'icuuc'   => 'libicuuc missing',
  'icudata' => 'libicudata missing'
}.each do |lib, failure|
  have_library(lib) or abort(failure)
end

$CFLAGS << ' -Wall -funroll-loops'
if ENV['DEBUG']
  $CFLAGS << ' -Wextra -O0 -ggdb3'
end
|
53
|
+
|
54
|
+
# Map a linker flag of the form "-l<name>" to the file name of the matching
# static library, "lib<name>.<$LIBEXT>".
#
# ldflag - the linker flag to translate
#
# Returns the file name String, or nil when the flag is not a "-l" flag.
def libflag_to_filename(ldflag)
  return nil unless /\A-l(.+)/ === ldflag

  lib_name = Regexp.last_match(1)
  "lib#{lib_name}.#{$LIBEXT}"
end
|
60
|
+
|
61
|
+
# Locate the static library that corresponds to a "-l<name>" linker flag.
#
# libflag - the linker flag, e.g. "-licuuc"
# dirs    - an Array of directories to search, in order
#
# Raises RuntimeError when the flag cannot be mapped to a library file name
# or when no directory contains that file.
#
# Returns the full path of the first matching file.
def resolve_static_library(libflag, dirs)
  filename = libflag_to_filename(libflag)

  # libflag_to_filename returns nil for anything that is not "-l<name>";
  # fail with a readable message instead of a TypeError from File.join
  raise "Unable to map #{libflag.inspect} to a static library filename" unless filename

  dir = dirs.find { |path| File.exist?(File.join(path, filename)) }

  raise "Unable to find #{filename} in #{dirs}" unless dir

  File.join(dir, filename)
end
|
70
|
+
|
71
|
+
# Replace the dynamically linked "-l<lib>" entries that pkg-config added to
# $libs with the full paths of the matching static libraries.
#
# packages - an Array of pkg-config package names
#
# Raises RuntimeError when pkg-config yields no answer for a package or when
# a static library cannot be located.
#
# Returns the new value of $libs.
def substitute_static_libs(packages)
  # pkg_config returns nil when the query fails; turn that into a clear
  # error here instead of a NoMethodError or a nil flag further down
  query = lambda do |pkg, option|
    output = pkg_config(pkg, option)
    raise "pkg-config returned nothing for #{pkg} (#{option}); cannot link statically" if output.nil?
    output.split
  end

  # First, find all the -l<lib> flags added by pkg-config. We want to drop
  # these dynamically linked libraries and substitute them with the static libraries.
  libflags = packages.flat_map { |pkg| query.call(pkg, 'libs-only-l') }.uniq

  # To find where the static libraries live, we need to search the
  # library paths given by the -L flag from pkg-config.
  lib_paths = packages.flat_map do |pkg|
    query.call(pkg, 'libs-only-L').map { |lib| lib.gsub(/^-L/, '') }
  end.uniq

  # Drop the -l<lib> flags and add in the static libraries.
  new_libs = $libs.shellsplit
  new_libs.reject! { |arg| libflags.include?(arg) }
  libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
  $libs = new_libs.uniq.shelljoin
end
|
91
|
+
|
92
|
+
# --enable-static / --disable-static on the command line; off by default
static_p = enable_config('static', false)
linkage_state = static_p ? 'enabled' : 'disabled'
message "Static linking is #{linkage_state}.\n"

if static_p
  $CXXFLAGS << ' -fPIC'
  ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'

  icu_packages = %w[icu-i18n icu-io icu-uc]
  substitute_static_libs(icu_packages)
end

create_makefile 'charlock_holmes/charlock_holmes'
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
#undef UChar
|
3
|
+
|
4
|
+
#include <string>
|
5
|
+
#include <unicode/translit.h>
|
6
|
+
|
7
|
+
extern "C" {
|
8
|
+
|
9
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
10
|
+
#include <ruby/encoding.h>
|
11
|
+
static VALUE rb_eEncodingCompatibilityError;
|
12
|
+
|
13
|
+
/*
 * Raise Encoding::CompatibilityError unless `str` is tagged with one of the
 * accepted Ruby encodings: UTF-8, US-ASCII or ASCII-8BIT.
 *
 * The three encoding pointers are looked up once and kept in a
 * function-local static array.
 */
static void check_utf8_encoding(VALUE str) {
	static rb_encoding *accepted[3] = {NULL, NULL, NULL};
	rb_encoding *actual;
	int slot;

	if (accepted[0] == NULL) {
		accepted[0] = rb_utf8_encoding();
		accepted[1] = rb_usascii_encoding();
		accepted[2] = rb_ascii8bit_encoding();
	}

	actual = rb_enc_get(str);
	for (slot = 0; slot < 3; slot++) {
		if (actual == accepted[slot]) {
			return;
		}
	}

	rb_raise(rb_eEncodingCompatibilityError,
		"Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(actual));
}
|
29
|
+
|
30
|
+
#else
|
31
|
+
static void check_utf8_encoding(VALUE str) {}
|
32
|
+
#endif
|
33
|
+
|
34
|
+
extern VALUE rb_mCharlockHolmes;
|
35
|
+
static VALUE rb_cTransliterator;
|
36
|
+
|
37
|
+
/*
 * call-seq: ids = Transliterator.id_list
 *
 * Lists the transliterator IDs ICU reports as available.
 *
 * Raises ArgumentError (message is the ICU error name) when ICU fails to
 * produce or iterate the list.
 *
 * Returns: an Array of Strings
 */
static VALUE rb_transliterator_id_list(VALUE self) {
	UErrorCode status = U_ZERO_ERROR;
	icu::StringEnumeration *id_list;
	int32_t id_list_size;
	const char *curr_id;
	int32_t curr_id_len;
	VALUE rb_ary;
	VALUE rb_curr_id;

	id_list_size = 0;
	id_list = icu::Transliterator::getAvailableIDs(status);
	if (U_SUCCESS(status) && id_list == NULL) {
		// no enumeration and no error reported: treat it as a failure
		status = U_MEMORY_ALLOCATION_ERROR;
	}
	if(!U_SUCCESS(status)) {
		delete id_list;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	status = U_ZERO_ERROR;
	id_list_size = id_list->count(status);
	if(!U_SUCCESS(status)) {
		// rb_raise does not return, so the enumeration has to go first
		delete id_list;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	rb_ary = rb_ary_new2(id_list_size);

	do {
		curr_id_len = 0;
		curr_id = id_list->next(&curr_id_len, status);
		if(!U_SUCCESS(status)) {
			delete id_list;
			rb_raise(rb_eArgError, "%s", u_errorName(status));
		}

		if (curr_id != NULL) {
			rb_curr_id = charlock_new_str(curr_id, curr_id_len);
			rb_ary_push(rb_ary, rb_curr_id);
		}
	} while(curr_id != NULL);

	delete id_list;

	return rb_ary;
}
|
77
|
+
|
78
|
+
/*
 * call-seq: out = Transliterator.transliterate(txt, id)
 *
 * Run txt through the ICU transliterator named by id.
 *
 * txt - a String in UTF-8, US-ASCII or ASCII-8BIT, the text to transliterate
 * id  - a String in one of the same encodings, the ICU transliterator ID
 *
 * Raises TypeError when an argument is not a String,
 * Encoding::CompatibilityError when an argument has another encoding, and
 * ArgumentError when an argument is too long for ICU or ICU cannot create
 * the transliterator (the message is then the ICU error name).
 *
 * Returns: a new String with the transliterated text as UTF-8 bytes
 */
static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
	UErrorCode status = U_ZERO_ERROR;
	UParseError p_error;
	icu::Transliterator *trans;
	const char *txt;
	size_t txt_len;
	const char *id;
	size_t id_len;
	std::string result;
	VALUE rb_out;

	Check_Type(rb_txt, T_STRING);
	Check_Type(rb_id, T_STRING);

	check_utf8_encoding(rb_txt);
	check_utf8_encoding(rb_id);

	txt = RSTRING_PTR(rb_txt);
	txt_len = RSTRING_LEN(rb_txt);
	id = RSTRING_PTR(rb_id);
	id_len = RSTRING_LEN(rb_id);

	// ICU lengths are int32_t; refuse anything that would be truncated
	if (txt_len > static_cast<size_t>(INT32_MAX) || id_len > static_cast<size_t>(INT32_MAX)) {
		rb_raise(rb_eArgError, "input is too large to transliterate");
	}

	// The inputs were checked above, so decode them explicitly as UTF-8
	// rather than through ICU's default converter, which depends on the
	// process environment.
	trans = icu::Transliterator::createInstance(
		icu::UnicodeString::fromUTF8(icu::StringPiece(id, static_cast<int32_t>(id_len))),
		UTRANS_FORWARD, p_error, status);
	if(!U_SUCCESS(status) || trans == NULL) {
		delete trans;
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	}

	{
		// scoped so the ICU string is destroyed before any Ruby call
		// that could raise
		icu::UnicodeString u_txt =
			icu::UnicodeString::fromUTF8(icu::StringPiece(txt, static_cast<int32_t>(txt_len)));
		icu::StringByteSink<std::string> sink(&result);

		trans->transliterate(u_txt);
		u_txt.toUTF8(sink);
	}

	delete trans;

	rb_out = charlock_new_str(result.data(), result.length());

	return rb_out;
}
|
118
|
+
|
119
|
+
void _init_charlock_transliterator() {
|
120
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
121
|
+
rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
|
122
|
+
#endif
|
123
|
+
|
124
|
+
rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
|
125
|
+
|
126
|
+
rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0);
|
127
|
+
rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
|
128
|
+
}
|
129
|
+
|
130
|
+
}
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module CharlockHolmes
  class EncodingDetector
    # Number of leading bytes scanned for NULL bytes unless told otherwise
    DEFAULT_BINARY_SCAN_LEN = 1024*1024

    # How many leading bytes of the content are scanned for NULL bytes
    attr_accessor :binary_scan_length

    alias :strip_tags? :strip_tags

    # scan_len - how many leading bytes to scan for NULL bytes
    #            (default: DEFAULT_BINARY_SCAN_LEN)
    def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
      @binary_scan_length = scan_len
    end

    # Detect the encoding of a string with a throwaway detector.
    #
    # NOTE: every call builds a new CharlockHolmes::EncodingDetector, which
    #       therefore uses the default binary scan length
    #
    # str      - a String, what you want to detect the encoding of
    # hint_enc - an optional String (like "UTF-8"), the encoding name which will
    #            be used as an additional hint to the charset detector
    #
    # Returns: a Hash with :encoding, :language, :type and :confidence
    def self.detect(str, hint_enc=nil)
      detector = new
      detector.detect(str, hint_enc)
    end

    # Detect the encoding of a string with a throwaway detector and return
    # every candidate encoding that matches it.
    #
    # NOTE: every call builds a new CharlockHolmes::EncodingDetector, which
    #       therefore uses the default binary scan length
    #
    # str      - a String, what you want to detect the encoding of
    # hint_enc - an optional String (like "UTF-8"), the encoding name which will
    #            be used as an additional hint to the charset detector
    #
    # Returns: an Array with zero or more Hashes,
    # each one of them with :encoding, :language, :type and :confidence
    def self.detect_all(str, hint_enc=nil)
      detector = new
      detector.detect_all(str, hint_enc)
    end

    # Maps each encoding name reported by supported_encodings to the name of
    # the corresponding Ruby encoding, e.g. {"UTF-8" => "UTF-8"}.
    #
    # Names Ruby has no encoding for are mapped to BINARY.
    @encoding_table = {}

    # Returns the mapping table described above.
    def self.encoding_table
      @encoding_table
    end

    # Value stored in the table for names Ruby cannot resolve
    BINARY = 'binary'

    # Fills the encoding table by walking the list of supported encodings
    # and asking Ruby for an encoding of the same name. Done at load time so
    # that encodings added by newer ICU versions are picked up automatically.
    #
    # Returns nothing.
    def self.build_encoding_table
      supported_encodings.each do |name|
        ruby_name = begin
          ::Encoding.find(name).name
        rescue ArgumentError
          BINARY
        end
        @encoding_table[name] = ruby_name
      end
    end
    build_encoding_table
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'charlock_holmes' unless defined? CharlockHolmes
|
2
|
+
|
3
|
+
class String
  # Detect the encoding of this string with a fresh
  # CharlockHolmes::EncodingDetector.
  #
  # hint_enc - an optional encoding name passed on to the detector as a hint
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect(self, hint_enc)
  end

  # Detect the encoding of this string with a fresh
  # CharlockHolmes::EncodingDetector and return every candidate.
  #
  # hint_enc - an optional encoding name passed on to the detector as a hint
  #
  # Returns: an Array with zero or more Hashes,
  # each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect_all(self, hint_enc)
  end

  if method_defined? :force_encoding
    # Detect the encoding of this string, then tag the string with the
    # detected Ruby encoding via `force_encoding`. The string is left as it
    # is when detection yields nothing or no :ruby_encoding.
    #
    # Returns: self
    def detect_encoding!(hint_enc=nil)
      detected = self.detect_encoding(hint_enc)
      ruby_enc = detected && detected[:ruby_encoding]
      self.force_encoding(ruby_enc) if ruby_enc
      self
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: static_holmes
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.7
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brian Lopez
|
8
|
+
- Vicent Martí
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2024-03-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake-compiler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.0'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: minitest
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '5.11'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '5.11'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: chardet
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0.9'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0.9'
|
56
|
+
description: charlock_holmes provides binary and text detection as well as text transcoding
|
57
|
+
using libicu
|
58
|
+
email: seniorlopez@gmail.com
|
59
|
+
executables: []
|
60
|
+
extensions:
|
61
|
+
- ext/charlock_holmes/extconf.rb
|
62
|
+
extra_rdoc_files: []
|
63
|
+
files:
|
64
|
+
- ext/charlock_holmes/common.h
|
65
|
+
- ext/charlock_holmes/converter.c
|
66
|
+
- ext/charlock_holmes/encoding_detector.c
|
67
|
+
- ext/charlock_holmes/ext.c
|
68
|
+
- ext/charlock_holmes/extconf.rb
|
69
|
+
- ext/charlock_holmes/transliterator.cpp
|
70
|
+
- lib/charlock_holmes.rb
|
71
|
+
- lib/charlock_holmes/encoding_detector.rb
|
72
|
+
- lib/charlock_holmes/string.rb
|
73
|
+
- lib/charlock_holmes/version.rb
|
74
|
+
homepage: https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
|
75
|
+
licenses:
|
76
|
+
- MIT
|
77
|
+
metadata: {}
|
78
|
+
post_install_message:
|
79
|
+
rdoc_options:
|
80
|
+
- "--charset=UTF-8"
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 1.9.3
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubygems_version: 3.5.6
|
95
|
+
signing_key:
|
96
|
+
specification_version: 4
|
97
|
+
summary: Character encoding detection, brought to you by ICU
|
98
|
+
test_files: []
|