static_holmes 0.7.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1e35947554b465d48dc970a60031dcca03df441bd829b4464dba2ecf0bc792bb
4
+ data.tar.gz: 4cdfaff28364c07fc96a0a77a62d2fb5dbccc130859a4268065084e5abdff449
5
+ SHA512:
6
+ metadata.gz: 3007aa5b2d53c6046dc65086cb74881ea10f48fda3de2c89a053a42a820f5045b20ce2b716ce9ac0d51f3e6b704a85e9659958cf70c9631e74593637dfbae486
7
+ data.tar.gz: 13ad7377a3525419518ef4c20ce6c4763a8fc9e8b5cabaa30495d5d79cb8bf0b239e2238a4dcea5bc181572cec4356be3c2d65bf599c440a74a9f398881a6ec5
@@ -0,0 +1,41 @@
1
+ #ifndef CHARLOCK_COMMON_H
2
+ #define CHARLOCK_COMMON_H
3
+
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
9
+ #include <ruby.h>
10
+ #ifdef HAVE_RUBY_ENCODING_H
11
+ #include <ruby/encoding.h>
12
+ #endif
13
+
14
+ static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15
+ {
16
+ #ifdef HAVE_RUBY_ENCODING_H
17
+ return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
18
+ #else
19
+ return rb_str_new(str, len);
20
+ #endif
21
+ }
22
+
23
+ static inline VALUE charlock_new_str(const char *str, size_t len)
24
+ {
25
+ #ifdef HAVE_RUBY_ENCODING_H
26
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27
+ #else
28
+ return rb_str_new(str, len);
29
+ #endif
30
+ }
31
+
32
+ static inline VALUE charlock_new_str2(const char *str)
33
+ {
34
+ #ifdef HAVE_RUBY_ENCODING_H
35
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36
+ #else
37
+ return rb_str_new2(str);
38
+ #endif
39
+ }
40
+
41
+ #endif
@@ -0,0 +1,57 @@
1
+ #include "unicode/ucnv.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cConverter;
6
+
7
+ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
8
+ VALUE rb_out;
9
+ const char *src_enc;
10
+ const char *dst_enc;
11
+ const char *src_txt;
12
+ char *out_buf;
13
+ void *rb_enc = NULL;
14
+ int32_t src_len;
15
+ int32_t out_len;
16
+ UErrorCode status = U_ZERO_ERROR;
17
+
18
+ Check_Type(rb_txt, T_STRING);
19
+ Check_Type(rb_src_enc, T_STRING);
20
+ Check_Type(rb_dst_enc, T_STRING);
21
+
22
+ src_txt = RSTRING_PTR(rb_txt);
23
+ src_len = RSTRING_LEN(rb_txt);
24
+ src_enc = RSTRING_PTR(rb_src_enc);
25
+ dst_enc = RSTRING_PTR(rb_dst_enc);
26
+
27
+ // first determin the size of the output buffer
28
+ out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
29
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
30
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
31
+ }
32
+ out_buf = malloc(out_len);
33
+
34
+ // now do the actual conversion
35
+ status = U_ZERO_ERROR;
36
+ out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
37
+ if (U_FAILURE(status)) {
38
+ free(out_buf);
39
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
40
+ }
41
+
42
+ #ifdef HAVE_RUBY_ENCODING_H
43
+ rb_enc = (void *)rb_enc_find(dst_enc);
44
+ #endif
45
+
46
+ rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
47
+
48
+ free(out_buf);
49
+
50
+ return rb_out;
51
+ }
52
+
53
+ void _init_charlock_converter() {
54
+ rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
55
+
56
+ rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
57
+ }
@@ -0,0 +1,377 @@
1
+ #include "unicode/ucsdet.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cEncodingDetector;
6
+
7
+ typedef struct {
8
+ UCharsetDetector *csd;
9
+ } charlock_detector_t;
10
+
11
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
12
+ {
13
+ UErrorCode status = U_ZERO_ERROR;
14
+ const char *mname;
15
+ const char *mlang;
16
+ int mconfidence;
17
+ VALUE rb_match;
18
+ VALUE enc_tbl;
19
+ VALUE enc_name;
20
+ VALUE compat_enc;
21
+
22
+ if (!match)
23
+ return Qnil;
24
+
25
+ mname = ucsdet_getName(match, &status);
26
+ mlang = ucsdet_getLanguage(match, &status);
27
+ mconfidence = ucsdet_getConfidence(match, &status);
28
+
29
+ rb_match = rb_hash_new();
30
+
31
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
32
+
33
+ enc_name = charlock_new_str2(mname);
34
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
35
+
36
+ enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
37
+ compat_enc = rb_hash_aref(enc_tbl, enc_name);
38
+ if (!NIL_P(compat_enc)) {
39
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
40
+ }
41
+
42
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
43
+
44
+ if (mlang && mlang[0])
45
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
46
+
47
+ return rb_match;
48
+ }
49
+
50
+ static VALUE rb_encdec_binarymatch() {
51
+ VALUE rb_match;
52
+
53
+ rb_match = rb_hash_new();
54
+
55
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
56
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
57
+
58
+ return rb_match;
59
+ }
60
+
61
+ static int detect_binary_content(VALUE self, VALUE rb_str) {
62
+ size_t buf_len, scan_len;
63
+ const char *buf;
64
+
65
+ buf = RSTRING_PTR(rb_str);
66
+ buf_len = RSTRING_LEN(rb_str);
67
+ scan_len = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
68
+
69
+ if (buf_len > 10) {
70
+ // application/postscript
71
+ if (!memcmp(buf, "%!PS-Adobe-", 11))
72
+ return 0;
73
+ }
74
+
75
+ if (buf_len > 7) {
76
+ // image/png
77
+ if (!memcmp(buf, "\x89PNG\x0D\x0A\x1A\x0A", 8))
78
+ return 1;
79
+ }
80
+
81
+ if (buf_len > 5) {
82
+ // image/gif
83
+ if (!memcmp(buf, "GIF87a", 6))
84
+ return 1;
85
+
86
+ // image/gif
87
+ if (!memcmp(buf, "GIF89a", 6))
88
+ return 1;
89
+ }
90
+
91
+ if (buf_len > 4) {
92
+ // application/pdf
93
+ if (!memcmp(buf, "%PDF-", 5))
94
+ return 1;
95
+ }
96
+
97
+ if (buf_len > 3) {
98
+ // UTF-32BE
99
+ if (!memcmp(buf, "\0\0\xfe\xff", 4))
100
+ return 0;
101
+
102
+ // UTF-32LE
103
+ if (!memcmp(buf, "\xff\xfe\0\0", 4))
104
+ return 0;
105
+ }
106
+
107
+ if (buf_len > 2) {
108
+ // image/jpeg
109
+ if (!memcmp(buf, "\xFF\xD8\xFF", 3))
110
+ return 1;
111
+ }
112
+
113
+ if (buf_len > 1) {
114
+ // UTF-16BE
115
+ if (!memcmp(buf, "\xfe\xff", 2))
116
+ return 0;
117
+
118
+ // UTF-16LE
119
+ if (!memcmp(buf, "\xff\xfe", 2))
120
+ return 0;
121
+ }
122
+
123
+ /*
124
+ * If we got this far, any NULL bytes within the `scan_len`
125
+ * range will likely mean the contents are binary.
126
+ */
127
+ if (scan_len < buf_len)
128
+ buf_len = scan_len;
129
+ return !!memchr(buf, 0, buf_len);
130
+ }
131
+
132
+ /*
133
+ * call-seq: true/false = EncodingDetector.is_binary? str
134
+ *
135
+ * Attempt to detect if a string is binary or text
136
+ *
137
+ * str - a String, what you want to perform the binary check on
138
+ *
139
+ * Returns: true or false
140
+ */
141
+ static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
142
+ {
143
+ if (detect_binary_content(self, str))
144
+ return Qtrue;
145
+ else
146
+ return Qfalse;
147
+ }
148
+
149
+ /*
150
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
151
+ *
152
+ * Attempt to detect the encoding of this string
153
+ *
154
+ * str - a String, what you want to detect the encoding of
155
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
156
+ * be used as an additional hint to the charset detector
157
+ *
158
+ * Returns: a Hash with :encoding, :language, :type and :confidence
159
+ */
160
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
161
+ {
162
+ UErrorCode status = U_ZERO_ERROR;
163
+ charlock_detector_t *detector;
164
+ VALUE rb_str;
165
+ VALUE rb_enc_hint;
166
+
167
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
168
+
169
+ Check_Type(rb_str, T_STRING);
170
+ Data_Get_Struct(self, charlock_detector_t, detector);
171
+
172
+ // first lets see if this is binary content
173
+ if (detect_binary_content(self, rb_str)) {
174
+ return rb_encdec_binarymatch();
175
+ }
176
+
177
+ // if we got here - the data doesn't look like binary
178
+ // lets try to figure out what encoding the text is in
179
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
180
+
181
+ if (!NIL_P(rb_enc_hint)) {
182
+ Check_Type(rb_enc_hint, T_STRING);
183
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
184
+ }
185
+
186
+ return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
187
+ }
188
+
189
+
190
+ /*
191
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
192
+ *
193
+ * Attempt to detect the encoding of this string, and return
194
+ * a list with all the possible encodings that match it.
195
+ *
196
+ *
197
+ * str - a String, what you want to detect the encoding of
198
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
199
+ * be used as an additional hint to the charset detector
200
+ *
201
+ * Returns: an Array with zero or more Hashes,
202
+ * each one of them with :encoding, :language, :type and :confidence
203
+ */
204
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
205
+ {
206
+ UErrorCode status = U_ZERO_ERROR;
207
+ charlock_detector_t *detector;
208
+ const UCharsetMatch **csm;
209
+ VALUE rb_ret;
210
+ int i, match_count;
211
+ VALUE rb_str;
212
+ VALUE rb_enc_hint;
213
+ VALUE binary_match;
214
+
215
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
216
+
217
+ Check_Type(rb_str, T_STRING);
218
+ Data_Get_Struct(self, charlock_detector_t, detector);
219
+
220
+ rb_ret = rb_ary_new();
221
+
222
+ // first lets see if this is binary content
223
+ binary_match = Qnil;
224
+ if (detect_binary_content(self, rb_str)) {
225
+ binary_match = rb_encdec_binarymatch();
226
+ }
227
+
228
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
229
+
230
+ if (!NIL_P(rb_enc_hint)) {
231
+ Check_Type(rb_enc_hint, T_STRING);
232
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
233
+ }
234
+
235
+ csm = ucsdet_detectAll(detector->csd, &match_count, &status);
236
+
237
+ for (i = 0; i < match_count; ++i) {
238
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
239
+ }
240
+
241
+ if (!NIL_P(binary_match))
242
+ rb_ary_unshift(rb_ret, binary_match);
243
+
244
+ return rb_ret;
245
+ }
246
+
247
+ /*
248
+ * call-seq: EncodingDetector#strip_tags?
249
+ *
250
+ * Returns whether or not the strip_tags flag is set on this detector
251
+ *
252
+ * Returns: Boolean
253
+ */
254
+ static VALUE rb_get_strip_tags(VALUE self)
255
+ {
256
+ charlock_detector_t *detector;
257
+ UBool val;
258
+ VALUE rb_val;
259
+
260
+ Data_Get_Struct(self, charlock_detector_t, detector);
261
+
262
+ val = ucsdet_isInputFilterEnabled(detector->csd);
263
+
264
+ rb_val = val == 1 ? Qtrue : Qfalse;
265
+
266
+ return rb_val;
267
+ }
268
+
269
+ /*
270
+ * call-seq: EncodingDetector#strip_tags = true
271
+ *
272
+ * Enable or disable the stripping of HTML/XML tags from the input before
273
+ * attempting any detection
274
+ *
275
+ * Returns: Boolean, the value passed
276
+ */
277
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
278
+ {
279
+ charlock_detector_t *detector;
280
+ UBool val;
281
+
282
+ Data_Get_Struct(self, charlock_detector_t, detector);
283
+
284
+ val = rb_val == Qtrue ? 1 : 0;
285
+
286
+ ucsdet_enableInputFilter(detector->csd, val);
287
+
288
+ return rb_val;
289
+ }
290
+
291
+ /*
292
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
293
+ *
294
+ * The list of detectable encodings supported by this library
295
+ *
296
+ * Returns: an Array of Strings
297
+ */
298
+ static VALUE rb_get_supported_encodings(VALUE klass)
299
+ {
300
+ UCharsetDetector *csd;
301
+ UErrorCode status = U_ZERO_ERROR;
302
+ UEnumeration *encoding_list;
303
+ VALUE rb_encoding_list;
304
+ int32_t enc_count;
305
+ int32_t i;
306
+ const char *enc_name;
307
+ int32_t enc_name_len;
308
+
309
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
310
+
311
+ // lazily populate the list
312
+ if (NIL_P(rb_encoding_list)) {
313
+ csd = ucsdet_open(&status);
314
+
315
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
316
+ rb_encoding_list = rb_ary_new();
317
+ enc_count = uenum_count(encoding_list, &status);
318
+
319
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
320
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
321
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
322
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
323
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
324
+
325
+ for(i=0; i < enc_count; i++) {
326
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
327
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
328
+ }
329
+
330
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
331
+ ucsdet_close(csd);
332
+ }
333
+
334
+ return rb_encoding_list;
335
+ }
336
+
337
+ static void rb_encdec__free(void *obj)
338
+ {
339
+ charlock_detector_t *detector;
340
+
341
+ detector = (charlock_detector_t *)obj;
342
+
343
+ if (detector->csd)
344
+ ucsdet_close(detector->csd);
345
+
346
+ free(detector);
347
+ }
348
+
349
+ static VALUE rb_encdec__alloc(VALUE klass)
350
+ {
351
+ charlock_detector_t *detector;
352
+ UErrorCode status = U_ZERO_ERROR;
353
+ VALUE obj;
354
+
355
+ detector = calloc(1, sizeof(charlock_detector_t));
356
+ obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
357
+
358
+ detector->csd = ucsdet_open(&status);
359
+ if (U_FAILURE(status)) {
360
+ rb_raise(rb_eStandardError, "%s", u_errorName(status));
361
+ }
362
+
363
+ return obj;
364
+ }
365
+
366
+ void _init_charlock_encoding_detector()
367
+ {
368
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
369
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
370
+ rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
371
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
372
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
373
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
374
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
375
+
376
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
377
+ }
@@ -0,0 +1,15 @@
1
+ #include "common.h"
2
+
3
+ extern void _init_charlock_encoding_detector();
4
+ extern void _init_charlock_converter();
5
+ extern void _init_charlock_transliterator();
6
+
7
+ VALUE rb_mCharlockHolmes;
8
+
9
+ void Init_charlock_holmes() {
10
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
11
+
12
+ _init_charlock_encoding_detector();
13
+ _init_charlock_converter();
14
+ _init_charlock_transliterator();
15
+ }
@@ -0,0 +1,102 @@
1
+ require 'mkmf'
2
+
3
banner_rule = "***************************************************************************************"

# Bail out early when there is no `make` to build the extension with.
if `which make`.strip.empty?
  STDERR.puts "\n\n",
              banner_rule,
              "*************** make required (apt-get install make build-essential) =( ***************",
              banner_rule
  exit(1)
end

##
# ICU dependency
#

ldflags = nil
cppflags = nil

if /darwin/ =~ RbConfig::CONFIG["host_os"]
  begin
    brew_prefix = `brew --prefix icu4c`.chomp
    ldflags = "#{brew_prefix}/lib"
    cppflags = "#{brew_prefix}/include"
    # pkg_config should be less error prone than parsing compiler
    # commandline options, but we need to set default ldflags and cpp flags
    # in case the user doesn't have pkg-config installed
    ENV['PKG_CONFIG_PATH'] ||= "#{brew_prefix}/lib/pkgconfig"
  rescue
    # best effort only: fall through with whatever was set
  end
end

dir_config 'icu', cppflags, ldflags

%w[icu-i18n icu-io icu-uc].each { |pkg| pkg_config(pkg) }

# Only pick a C++ standard when none was requested already.
$CXXFLAGS << ' -std=c++11' unless $CXXFLAGS.include?("-std=")

unless have_library('icui18n') && have_header('unicode/ucnv.h')
  STDERR.puts "\n\n",
              banner_rule,
              "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********",
              banner_rule
  exit(1)
end

# Remaining required libraries, checked in this order.
[
  ['z', 'libz missing'],
  ['icuuc', 'libicuuc missing'],
  ['icudata', 'libicudata missing']
].each do |lib, failure_message|
  abort failure_message unless have_library(lib)
end

$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
53
+
54
# Translate a linker flag such as "-lfoo" into the static archive name
# "libfoo.<$LIBEXT>". Returns nil for anything that is not a -l flag.
def libflag_to_filename(ldflag)
  return nil unless /\A-l(.+)/ === ldflag

  library = Regexp.last_match(1)
  "lib#{library}.#{$LIBEXT}"
end
60
+
61
# Find the static archive for `libflag` (e.g. "-lfoo") in the first of `dirs`
# that contains it and return its full path.
#
# Raises RuntimeError when none of the directories holds the archive.
def resolve_static_library(libflag, dirs)
  filename = libflag_to_filename(libflag)

  dirs.each do |path|
    candidate = File.join(path, filename)
    return candidate if File.exist?(candidate)
  end

  raise "Unable to find #{filename} in #{dirs}"
end
70
+
71
# Rewrite $libs so that the libraries pkg-config reports for `packages` are
# linked from their static archives instead of as -l<lib> flags.
#
# packages - an Array of pkg-config package names
#
# Packages pkg-config cannot resolve (pkg_config returns nil) contribute no
# flags or search paths. Raises RuntimeError, via resolve_static_library,
# when an archive cannot be found in the reported library paths.
def substitute_static_libs(packages)
  # First, find all the -l<lib> flags added by pkg-config. We want to drop
  # these dynamically linked libraries and substitute them with the static libraries.
  libflags = packages.flat_map do |pkg|
    pkg_config(pkg, 'libs-only-l').to_s.strip.split(' ')
  end.uniq

  # To find where the static libraries live, we need to search the
  # library paths given by the -L flag from pkg-config.
  lib_paths = packages.flat_map do |pkg|
    pkg_config(pkg, 'libs-only-L').to_s.strip.split(' ').map { |lib| lib.gsub(/^-L/, '') }
  end.uniq

  # Drop the -l<lib> flags and add in the static libraries.
  new_libs = $libs.shellsplit
  new_libs.reject! { |arg| libflags.include?(arg) }
  libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
  $libs = new_libs.uniq.shelljoin
end
91
+
92
# --enable-static / --disable-static switch; static linking is off by default.
static_p = enable_config('static', false)
linking_state = static_p ? 'enabled' : 'disabled'
message "Static linking is #{linking_state}.\n"

if static_p
  $CXXFLAGS << ' -fPIC'
  ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'

  substitute_static_libs(%w[icu-i18n icu-io icu-uc])
end

create_makefile 'charlock_holmes/charlock_holmes'
@@ -0,0 +1,130 @@
1
+ #include "common.h"
2
+ #undef UChar
3
+
4
+ #include <string>
5
+ #include <unicode/translit.h>
6
+
7
+ extern "C" {
8
+
9
+ #ifdef HAVE_RUBY_ENCODING_H
10
+ #include <ruby/encoding.h>
11
+ static VALUE rb_eEncodingCompatibilityError;
12
+
13
+ static void check_utf8_encoding(VALUE str) {
14
+ static rb_encoding *_cached[3] = {NULL, NULL, NULL};
15
+ rb_encoding *enc;
16
+
17
+ if (_cached[0] == NULL) {
18
+ _cached[0] = rb_utf8_encoding();
19
+ _cached[1] = rb_usascii_encoding();
20
+ _cached[2] = rb_ascii8bit_encoding();
21
+ }
22
+
23
+ enc = rb_enc_get(str);
24
+ if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) {
25
+ rb_raise(rb_eEncodingCompatibilityError,
26
+ "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc));
27
+ }
28
+ }
29
+
30
+ #else
31
+ static void check_utf8_encoding(VALUE str) {}
32
+ #endif
33
+
34
+ extern VALUE rb_mCharlockHolmes;
35
+ static VALUE rb_cTransliterator;
36
+
37
+ static VALUE rb_transliterator_id_list(VALUE self) {
38
+ UErrorCode status = U_ZERO_ERROR;
39
+ icu::StringEnumeration *id_list;
40
+ int32_t id_list_size;
41
+ const char *curr_id;
42
+ int32_t curr_id_len;
43
+ VALUE rb_ary;
44
+ VALUE rb_curr_id;
45
+
46
+ id_list_size = 0;
47
+ id_list = icu::Transliterator::getAvailableIDs(status);
48
+ if(!U_SUCCESS(status)) {
49
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
50
+ }
51
+
52
+ status = U_ZERO_ERROR;
53
+ id_list_size = id_list->count(status);
54
+ if(!U_SUCCESS(status)) {
55
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
56
+ }
57
+
58
+ rb_ary = rb_ary_new2(id_list_size);
59
+
60
+ do {
61
+ curr_id_len = 0;
62
+ curr_id = id_list->next(&curr_id_len, status);
63
+ if(!U_SUCCESS(status)) {
64
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
65
+ }
66
+
67
+ if (curr_id != NULL) {
68
+ rb_curr_id = charlock_new_str(curr_id, curr_id_len);
69
+ rb_ary_push(rb_ary, rb_curr_id);
70
+ }
71
+ } while(curr_id != NULL);
72
+
73
+ delete id_list;
74
+
75
+ return rb_ary;
76
+ }
77
+
78
+ static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
79
+ UErrorCode status = U_ZERO_ERROR;
80
+ UParseError p_error;
81
+ icu::Transliterator *trans;
82
+ const char *txt;
83
+ size_t txt_len;
84
+ const char *id;
85
+ size_t id_len;
86
+ icu::UnicodeString *u_txt;
87
+ std::string result;
88
+ VALUE rb_out;
89
+
90
+ Check_Type(rb_txt, T_STRING);
91
+ Check_Type(rb_id, T_STRING);
92
+
93
+ check_utf8_encoding(rb_txt);
94
+ check_utf8_encoding(rb_id);
95
+
96
+ txt = RSTRING_PTR(rb_txt);
97
+ txt_len = RSTRING_LEN(rb_txt);
98
+ id = RSTRING_PTR(rb_id);
99
+ id_len = RSTRING_LEN(rb_id);
100
+
101
+ trans = icu::Transliterator::createInstance(icu::UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
102
+ if(!U_SUCCESS(status)) {
103
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
104
+ }
105
+
106
+ u_txt = new icu::UnicodeString(txt, txt_len);
107
+ trans->transliterate(*u_txt);
108
+ icu::StringByteSink<std::string> sink(&result);
109
+ u_txt->toUTF8(sink);
110
+
111
+ delete u_txt;
112
+ delete trans;
113
+
114
+ rb_out = charlock_new_str(result.data(), result.length());
115
+
116
+ return rb_out;
117
+ }
118
+
119
+ void _init_charlock_transliterator() {
120
+ #ifdef HAVE_RUBY_ENCODING_H
121
+ rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
122
+ #endif
123
+
124
+ rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
125
+
126
+ rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0);
127
+ rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
128
+ }
129
+
130
+ }
@@ -0,0 +1,76 @@
1
module CharlockHolmes
  # Ruby-side additions to the detector class defined by the C extension.
  class EncodingDetector
    # Default length for which to scan content for NULL bytes
    DEFAULT_BINARY_SCAN_LEN = 1024 * 1024

    # Name stored in the encoding table for ICU encodings that
    # ::Encoding.find cannot resolve
    BINARY = 'binary'

    # Length for which to scan content for NULL bytes
    attr_accessor :binary_scan_length

    alias_method :strip_tags?, :strip_tags

    # A mapping table of supported encoding names from EncodingDetector
    # which point to the corresponding supported encoding name in Ruby.
    # Like: {"UTF-8" => "UTF-8"}
    #
    # Encodings that can't be mapped between Charlock and Ruby are stored
    # with the BINARY name.
    @encoding_table = {}

    # scan_len - how many leading bytes to scan for NULL bytes when deciding
    #            whether content is binary
    def initialize(scan_len = DEFAULT_BINARY_SCAN_LEN)
      @binary_scan_length = scan_len
    end

    class << self
      # The mapping table described above
      attr_reader :encoding_table

      # Attempt to detect the encoding of this string
      #
      # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
      # as well as use the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: a Hash with :encoding, :language, :type and :confidence
      def detect(str, hint_enc = nil)
        new.detect(str, hint_enc)
      end

      # Attempt to detect the encoding of this string, and return
      # a list with all the possible encodings that match it.
      #
      # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
      # as well as use the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: an Array with zero or more Hashes,
      #          each one of them with :encoding, :language, :type and :confidence
      def detect_all(str, hint_enc = nil)
        new.detect_all(str, hint_enc)
      end

      # Fills the encoding table by running through the list of supported encodings
      # in the ICU detection API and trying to map them to supported encodings in Ruby.
      # This is built dynamically so as to take advantage of ICU upgrades which may have
      # support for more encodings in the future.
      def build_encoding_table
        supported_encodings.each do |name|
          ruby_name =
            begin
              ::Encoding.find(name).name
            rescue ArgumentError
              BINARY
            end
          @encoding_table[name] = ruby_name
        end
      end
    end

    build_encoding_table
  end
end
@@ -0,0 +1,34 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string
  #
  # hint_enc - an optional encoding name passed to the detector as a hint
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc = nil)
    CharlockHolmes::EncodingDetector.new.detect(self, hint_enc)
  end

  # Attempt to detect the encoding of this string, and return
  # a list with all the possible encodings that match it.
  #
  # hint_enc - an optional encoding name passed to the detector as a hint
  #
  # Returns: an Array with zero or more Hashes,
  #          each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc = nil)
    CharlockHolmes::EncodingDetector.new.detect_all(self, hint_enc)
  end

  if method_defined? :force_encoding
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected ala `force_encoding`.
    # The string is left untouched when nothing was detected or the match
    # has no :ruby_encoding.
    #
    # Returns: self
    def detect_encoding!(hint_enc = nil)
      detected = detect_encoding(hint_enc)
      ruby_encoding = detected && detected[:ruby_encoding]
      force_encoding(ruby_encoding) if ruby_encoding
      self
    end
  end
end
@@ -0,0 +1,3 @@
1
module CharlockHolmes
  # Version of the gem
  VERSION = '0.7.7'
end
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: static_holmes
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.7
5
+ platform: ruby
6
+ authors:
7
+ - Brian Lopez
8
+ - Vicent Martí
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2024-03-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake-compiler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: minitest
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '5.11'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '5.11'
42
+ - !ruby/object:Gem::Dependency
43
+ name: chardet
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.9'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.9'
56
+ description: charlock_holmes provides binary and text detection as well as text transcoding
57
+ using libicu
58
+ email: seniorlopez@gmail.com
59
+ executables: []
60
+ extensions:
61
+ - ext/charlock_holmes/extconf.rb
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ext/charlock_holmes/common.h
65
+ - ext/charlock_holmes/converter.c
66
+ - ext/charlock_holmes/encoding_detector.c
67
+ - ext/charlock_holmes/ext.c
68
+ - ext/charlock_holmes/extconf.rb
69
+ - ext/charlock_holmes/transliterator.cpp
70
+ - lib/charlock_holmes.rb
71
+ - lib/charlock_holmes/encoding_detector.rb
72
+ - lib/charlock_holmes/string.rb
73
+ - lib/charlock_holmes/version.rb
74
+ homepage: https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
75
+ licenses:
76
+ - MIT
77
+ metadata: {}
78
+ post_install_message:
79
+ rdoc_options:
80
+ - "--charset=UTF-8"
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: 1.9.3
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubygems_version: 3.5.6
95
+ signing_key:
96
+ specification_version: 4
97
+ summary: Character encoding detection, brought to you by ICU
98
+ test_files: []