static_holmes 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1e35947554b465d48dc970a60031dcca03df441bd829b4464dba2ecf0bc792bb
4
+ data.tar.gz: 4cdfaff28364c07fc96a0a77a62d2fb5dbccc130859a4268065084e5abdff449
5
+ SHA512:
6
+ metadata.gz: 3007aa5b2d53c6046dc65086cb74881ea10f48fda3de2c89a053a42a820f5045b20ce2b716ce9ac0d51f3e6b704a85e9659958cf70c9631e74593637dfbae486
7
+ data.tar.gz: 13ad7377a3525419518ef4c20ce6c4763a8fc9e8b5cabaa30495d5d79cb8bf0b239e2238a4dcea5bc181572cec4356be3c2d65bf599c440a74a9f398881a6ec5
@@ -0,0 +1,41 @@
1
+ #ifndef CHARLOCK_COMMON_H
2
+ #define CHARLOCK_COMMON_H
3
+
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
9
+ #include <ruby.h>
10
+ #ifdef HAVE_RUBY_ENCODING_H
11
+ #include <ruby/encoding.h>
12
+ #endif
13
+
14
+ static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15
+ {
16
+ #ifdef HAVE_RUBY_ENCODING_H
17
+ return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
18
+ #else
19
+ return rb_str_new(str, len);
20
+ #endif
21
+ }
22
+
23
+ static inline VALUE charlock_new_str(const char *str, size_t len)
24
+ {
25
+ #ifdef HAVE_RUBY_ENCODING_H
26
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27
+ #else
28
+ return rb_str_new(str, len);
29
+ #endif
30
+ }
31
+
32
+ static inline VALUE charlock_new_str2(const char *str)
33
+ {
34
+ #ifdef HAVE_RUBY_ENCODING_H
35
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36
+ #else
37
+ return rb_str_new2(str);
38
+ #endif
39
+ }
40
+
41
+ #endif
@@ -0,0 +1,57 @@
1
+ #include "unicode/ucnv.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cConverter;
6
+
7
+ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
8
+ VALUE rb_out;
9
+ const char *src_enc;
10
+ const char *dst_enc;
11
+ const char *src_txt;
12
+ char *out_buf;
13
+ void *rb_enc = NULL;
14
+ int32_t src_len;
15
+ int32_t out_len;
16
+ UErrorCode status = U_ZERO_ERROR;
17
+
18
+ Check_Type(rb_txt, T_STRING);
19
+ Check_Type(rb_src_enc, T_STRING);
20
+ Check_Type(rb_dst_enc, T_STRING);
21
+
22
+ src_txt = RSTRING_PTR(rb_txt);
23
+ src_len = RSTRING_LEN(rb_txt);
24
+ src_enc = RSTRING_PTR(rb_src_enc);
25
+ dst_enc = RSTRING_PTR(rb_dst_enc);
26
+
27
+ // first determin the size of the output buffer
28
+ out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
29
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
30
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
31
+ }
32
+ out_buf = malloc(out_len);
33
+
34
+ // now do the actual conversion
35
+ status = U_ZERO_ERROR;
36
+ out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
37
+ if (U_FAILURE(status)) {
38
+ free(out_buf);
39
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
40
+ }
41
+
42
+ #ifdef HAVE_RUBY_ENCODING_H
43
+ rb_enc = (void *)rb_enc_find(dst_enc);
44
+ #endif
45
+
46
+ rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
47
+
48
+ free(out_buf);
49
+
50
+ return rb_out;
51
+ }
52
+
53
+ void _init_charlock_converter() {
54
+ rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
55
+
56
+ rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
57
+ }
@@ -0,0 +1,377 @@
1
+ #include "unicode/ucsdet.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cEncodingDetector;
6
+
7
/*
 * Native state wrapped by each EncodingDetector object: the ICU charset
 * detector handle opened in rb_encdec__alloc() and closed in rb_encdec__free().
 */
typedef struct {
	UCharsetDetector *csd;
} charlock_detector_t;
10
+
11
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
12
+ {
13
+ UErrorCode status = U_ZERO_ERROR;
14
+ const char *mname;
15
+ const char *mlang;
16
+ int mconfidence;
17
+ VALUE rb_match;
18
+ VALUE enc_tbl;
19
+ VALUE enc_name;
20
+ VALUE compat_enc;
21
+
22
+ if (!match)
23
+ return Qnil;
24
+
25
+ mname = ucsdet_getName(match, &status);
26
+ mlang = ucsdet_getLanguage(match, &status);
27
+ mconfidence = ucsdet_getConfidence(match, &status);
28
+
29
+ rb_match = rb_hash_new();
30
+
31
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
32
+
33
+ enc_name = charlock_new_str2(mname);
34
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
35
+
36
+ enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
37
+ compat_enc = rb_hash_aref(enc_tbl, enc_name);
38
+ if (!NIL_P(compat_enc)) {
39
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
40
+ }
41
+
42
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
43
+
44
+ if (mlang && mlang[0])
45
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
46
+
47
+ return rb_match;
48
+ }
49
+
50
+ static VALUE rb_encdec_binarymatch() {
51
+ VALUE rb_match;
52
+
53
+ rb_match = rb_hash_new();
54
+
55
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
56
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
57
+
58
+ return rb_match;
59
+ }
60
+
61
+ static int detect_binary_content(VALUE self, VALUE rb_str) {
62
+ size_t buf_len, scan_len;
63
+ const char *buf;
64
+
65
+ buf = RSTRING_PTR(rb_str);
66
+ buf_len = RSTRING_LEN(rb_str);
67
+ scan_len = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));
68
+
69
+ if (buf_len > 10) {
70
+ // application/postscript
71
+ if (!memcmp(buf, "%!PS-Adobe-", 11))
72
+ return 0;
73
+ }
74
+
75
+ if (buf_len > 7) {
76
+ // image/png
77
+ if (!memcmp(buf, "\x89PNG\x0D\x0A\x1A\x0A", 8))
78
+ return 1;
79
+ }
80
+
81
+ if (buf_len > 5) {
82
+ // image/gif
83
+ if (!memcmp(buf, "GIF87a", 6))
84
+ return 1;
85
+
86
+ // image/gif
87
+ if (!memcmp(buf, "GIF89a", 6))
88
+ return 1;
89
+ }
90
+
91
+ if (buf_len > 4) {
92
+ // application/pdf
93
+ if (!memcmp(buf, "%PDF-", 5))
94
+ return 1;
95
+ }
96
+
97
+ if (buf_len > 3) {
98
+ // UTF-32BE
99
+ if (!memcmp(buf, "\0\0\xfe\xff", 4))
100
+ return 0;
101
+
102
+ // UTF-32LE
103
+ if (!memcmp(buf, "\xff\xfe\0\0", 4))
104
+ return 0;
105
+ }
106
+
107
+ if (buf_len > 2) {
108
+ // image/jpeg
109
+ if (!memcmp(buf, "\xFF\xD8\xFF", 3))
110
+ return 1;
111
+ }
112
+
113
+ if (buf_len > 1) {
114
+ // UTF-16BE
115
+ if (!memcmp(buf, "\xfe\xff", 2))
116
+ return 0;
117
+
118
+ // UTF-16LE
119
+ if (!memcmp(buf, "\xff\xfe", 2))
120
+ return 0;
121
+ }
122
+
123
+ /*
124
+ * If we got this far, any NULL bytes within the `scan_len`
125
+ * range will likely mean the contents are binary.
126
+ */
127
+ if (scan_len < buf_len)
128
+ buf_len = scan_len;
129
+ return !!memchr(buf, 0, buf_len);
130
+ }
131
+
132
+ /*
133
+ * call-seq: true/false = EncodingDetector.is_binary? str
134
+ *
135
+ * Attempt to detect if a string is binary or text
136
+ *
137
+ * str - a String, what you want to perform the binary check on
138
+ *
139
+ * Returns: true or false
140
+ */
141
+ static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
142
+ {
143
+ if (detect_binary_content(self, str))
144
+ return Qtrue;
145
+ else
146
+ return Qfalse;
147
+ }
148
+
149
+ /*
150
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
151
+ *
152
+ * Attempt to detect the encoding of this string
153
+ *
154
+ * str - a String, what you want to detect the encoding of
155
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
156
+ * be used as an additional hint to the charset detector
157
+ *
158
+ * Returns: a Hash with :encoding, :language, :type and :confidence
159
+ */
160
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
161
+ {
162
+ UErrorCode status = U_ZERO_ERROR;
163
+ charlock_detector_t *detector;
164
+ VALUE rb_str;
165
+ VALUE rb_enc_hint;
166
+
167
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
168
+
169
+ Check_Type(rb_str, T_STRING);
170
+ Data_Get_Struct(self, charlock_detector_t, detector);
171
+
172
+ // first lets see if this is binary content
173
+ if (detect_binary_content(self, rb_str)) {
174
+ return rb_encdec_binarymatch();
175
+ }
176
+
177
+ // if we got here - the data doesn't look like binary
178
+ // lets try to figure out what encoding the text is in
179
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
180
+
181
+ if (!NIL_P(rb_enc_hint)) {
182
+ Check_Type(rb_enc_hint, T_STRING);
183
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
184
+ }
185
+
186
+ return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
187
+ }
188
+
189
+
190
+ /*
191
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
192
+ *
193
+ * Attempt to detect the encoding of this string, and return
194
+ * a list with all the possible encodings that match it.
195
+ *
196
+ *
197
+ * str - a String, what you want to detect the encoding of
198
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
199
+ * be used as an additional hint to the charset detector
200
+ *
201
+ * Returns: an Array with zero or more Hashes,
202
+ * each one of them with with :encoding, :language, :type and :confidence
203
+ */
204
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
205
+ {
206
+ UErrorCode status = U_ZERO_ERROR;
207
+ charlock_detector_t *detector;
208
+ const UCharsetMatch **csm;
209
+ VALUE rb_ret;
210
+ int i, match_count;
211
+ VALUE rb_str;
212
+ VALUE rb_enc_hint;
213
+ VALUE binary_match;
214
+
215
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
216
+
217
+ Check_Type(rb_str, T_STRING);
218
+ Data_Get_Struct(self, charlock_detector_t, detector);
219
+
220
+ rb_ret = rb_ary_new();
221
+
222
+ // first lets see if this is binary content
223
+ binary_match = Qnil;
224
+ if (detect_binary_content(self, rb_str)) {
225
+ binary_match = rb_encdec_binarymatch();
226
+ }
227
+
228
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
229
+
230
+ if (!NIL_P(rb_enc_hint)) {
231
+ Check_Type(rb_enc_hint, T_STRING);
232
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
233
+ }
234
+
235
+ csm = ucsdet_detectAll(detector->csd, &match_count, &status);
236
+
237
+ for (i = 0; i < match_count; ++i) {
238
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
239
+ }
240
+
241
+ if (!NIL_P(binary_match))
242
+ rb_ary_unshift(rb_ret, binary_match);
243
+
244
+ return rb_ret;
245
+ }
246
+
247
+ /*
248
+ * call-seq: EncodingDetector#strip_tags?
249
+ *
250
+ * Returns whether or not the strip_tags flag is set on this detector
251
+ *
252
+ * Returns: Boolean
253
+ */
254
+ static VALUE rb_get_strip_tags(VALUE self)
255
+ {
256
+ charlock_detector_t *detector;
257
+ UBool val;
258
+ VALUE rb_val;
259
+
260
+ Data_Get_Struct(self, charlock_detector_t, detector);
261
+
262
+ val = ucsdet_isInputFilterEnabled(detector->csd);
263
+
264
+ rb_val = val == 1 ? Qtrue : Qfalse;
265
+
266
+ return rb_val;
267
+ }
268
+
269
+ /*
270
+ * call-seq: EncodingDetector#strip_tags = true
271
+ *
272
+ * Enable or disable the stripping of HTML/XML tags from the input before
273
+ * attempting any detection
274
+ *
275
+ * Returns: Boolean, the value passed
276
+ */
277
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
278
+ {
279
+ charlock_detector_t *detector;
280
+ UBool val;
281
+
282
+ Data_Get_Struct(self, charlock_detector_t, detector);
283
+
284
+ val = rb_val == Qtrue ? 1 : 0;
285
+
286
+ ucsdet_enableInputFilter(detector->csd, val);
287
+
288
+ return rb_val;
289
+ }
290
+
291
+ /*
292
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
293
+ *
294
+ * The list of detectable encodings supported by this library
295
+ *
296
+ * Returns: an Array of Strings
297
+ */
298
+ static VALUE rb_get_supported_encodings(VALUE klass)
299
+ {
300
+ UCharsetDetector *csd;
301
+ UErrorCode status = U_ZERO_ERROR;
302
+ UEnumeration *encoding_list;
303
+ VALUE rb_encoding_list;
304
+ int32_t enc_count;
305
+ int32_t i;
306
+ const char *enc_name;
307
+ int32_t enc_name_len;
308
+
309
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
310
+
311
+ // lazily populate the list
312
+ if (NIL_P(rb_encoding_list)) {
313
+ csd = ucsdet_open(&status);
314
+
315
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
316
+ rb_encoding_list = rb_ary_new();
317
+ enc_count = uenum_count(encoding_list, &status);
318
+
319
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
320
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
321
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
322
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
323
+ rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
324
+
325
+ for(i=0; i < enc_count; i++) {
326
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
327
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
328
+ }
329
+
330
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
331
+ ucsdet_close(csd);
332
+ }
333
+
334
+ return rb_encoding_list;
335
+ }
336
+
337
+ static void rb_encdec__free(void *obj)
338
+ {
339
+ charlock_detector_t *detector;
340
+
341
+ detector = (charlock_detector_t *)obj;
342
+
343
+ if (detector->csd)
344
+ ucsdet_close(detector->csd);
345
+
346
+ free(detector);
347
+ }
348
+
349
+ static VALUE rb_encdec__alloc(VALUE klass)
350
+ {
351
+ charlock_detector_t *detector;
352
+ UErrorCode status = U_ZERO_ERROR;
353
+ VALUE obj;
354
+
355
+ detector = calloc(1, sizeof(charlock_detector_t));
356
+ obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
357
+
358
+ detector->csd = ucsdet_open(&status);
359
+ if (U_FAILURE(status)) {
360
+ rb_raise(rb_eStandardError, "%s", u_errorName(status));
361
+ }
362
+
363
+ return obj;
364
+ }
365
+
366
+ void _init_charlock_encoding_detector()
367
+ {
368
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
369
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
370
+ rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
371
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
372
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
373
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
374
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
375
+
376
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
377
+ }
@@ -0,0 +1,15 @@
1
+ #include "common.h"
2
+
3
+ extern void _init_charlock_encoding_detector();
4
+ extern void _init_charlock_converter();
5
+ extern void _init_charlock_transliterator();
6
+
7
+ VALUE rb_mCharlockHolmes;
8
+
9
+ void Init_charlock_holmes() {
10
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
11
+
12
+ _init_charlock_encoding_detector();
13
+ _init_charlock_converter();
14
+ _init_charlock_transliterator();
15
+ }
@@ -0,0 +1,102 @@
1
+ require 'mkmf'
2
+
3
# The extension cannot be built without make: stop with a readable hint.
if `which make`.strip.empty?
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
  STDERR.puts "***************************************************************************************"
  exit(1)
end

##
# ICU dependency
#

ldflags = cppflags = nil

if RbConfig::CONFIG["host_os"] =~ /darwin/
  begin
    brew_prefix = `brew --prefix icu4c`.chomp
    # `brew` may run yet print nothing; an empty prefix would turn into bogus
    # "/lib" and "/include" defaults, so only use it when it is non-empty.
    unless brew_prefix.empty?
      ldflags  = "#{brew_prefix}/lib"
      cppflags = "#{brew_prefix}/include"
      pkg_conf = "#{brew_prefix}/lib/pkgconfig"
      # pkg_config should be less error prone than parsing compiler
      # commandline options, but we need to set default ldflags and cpp flags
      # in case the user doesn't have pkg-config installed
      ENV['PKG_CONFIG_PATH'] ||= pkg_conf
    end
  rescue
    # Best effort only (e.g. the `brew` command could not be run):
    # fall back to the defaults below.
  end
end

dir_config 'icu', cppflags, ldflags

pkg_config("icu-i18n")
pkg_config("icu-io")
pkg_config("icu-uc")

$CXXFLAGS << ' -std=c++11' unless $CXXFLAGS.include?("-std=")

unless have_library 'icui18n' and have_header 'unicode/ucnv.h'
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end

have_library 'z' or abort 'libz missing'
have_library 'icuuc' or abort 'libicuuc missing'
have_library 'icudata' or abort 'libicudata missing'

$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
53
+
54
# Translate a linker flag such as "-lfoo" into the matching static library
# file name ("libfoo.<$LIBEXT>").
#
# ldflag - the flag to translate
#
# Returns the file name String, or nil when the flag is not of the -l<name> form.
def libflag_to_filename(ldflag)
  return nil unless /\A-l(.+)/ === ldflag

  lib_name = $~[1]
  "lib#{lib_name}.#{$LIBEXT}"
end
60
+
61
# Locate the static library that corresponds to a -l<name> linker flag.
#
# libflag - the linker flag, e.g. "-licuuc"
# dirs    - the directories to search, in order
#
# Returns the path of the first existing match.
# Raises RuntimeError when no directory contains the library.
def resolve_static_library(libflag, dirs)
  filename = libflag_to_filename(libflag)

  dirs.each do |dir|
    candidate = File.join(dir, filename)
    return candidate if File.exist?(candidate)
  end

  raise "Unable to find #{filename} in #{dirs}"
end
70
+
71
# Rewrite $libs so the libraries of the given pkg-config packages are linked
# statically: their -l<lib> flags are dropped and replaced by the full paths
# of the corresponding static archives.
#
# packages - an Array of pkg-config package names
#
# A package for which pkg_config returns nil contributes no flags and no
# search paths instead of aborting with a NoMethodError.
#
# Raises RuntimeError (from resolve_static_library) when an archive is missing.
def substitute_static_libs(packages)
  # First, find all the -l<lib> flags added by pkg-config. We want to drop
  # these dynamically linked libraries and substitute them with the static libraries.
  libflags = packages.flat_map do |pkg|
    pkg_config(pkg, 'libs-only-l').to_s.split
  end.uniq

  # To find where the static libraries live, we need to search the
  # library paths given by the -L flag from pkg-config.
  lib_paths = packages.flat_map do |pkg|
    pkg_config(pkg, 'libs-only-L').to_s.split.map { |flag| flag.gsub(/^-L/, '') }
  end.uniq

  # Drop the -l<lib> flags and add in the static libraries.
  new_libs = $libs.shellsplit
  new_libs.reject! { |arg| libflags.include?(arg) }
  libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
  $libs = new_libs.uniq.shelljoin
end
91
+
92
# Static linking is opt-in through mkmf's --enable-static switch.
static_linking = enable_config('static', false)
linking_state = static_linking ? 'enabled' : 'disabled'
message "Static linking is #{linking_state}.\n"

if static_linking
  $CXXFLAGS << ' -fPIC'
  ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'

  # Swap the -l flags of the ICU packages for their static archives.
  substitute_static_libs(%w[icu-i18n icu-io icu-uc])
end

create_makefile 'charlock_holmes/charlock_holmes'
@@ -0,0 +1,130 @@
1
+ #include "common.h"
2
+ #undef UChar
3
+
4
+ #include <string>
5
+ #include <unicode/translit.h>
6
+
7
+ extern "C" {
8
+
9
+ #ifdef HAVE_RUBY_ENCODING_H
10
+ #include <ruby/encoding.h>
11
+ static VALUE rb_eEncodingCompatibilityError;
12
+
13
+ static void check_utf8_encoding(VALUE str) {
14
+ static rb_encoding *_cached[3] = {NULL, NULL, NULL};
15
+ rb_encoding *enc;
16
+
17
+ if (_cached[0] == NULL) {
18
+ _cached[0] = rb_utf8_encoding();
19
+ _cached[1] = rb_usascii_encoding();
20
+ _cached[2] = rb_ascii8bit_encoding();
21
+ }
22
+
23
+ enc = rb_enc_get(str);
24
+ if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) {
25
+ rb_raise(rb_eEncodingCompatibilityError,
26
+ "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc));
27
+ }
28
+ }
29
+
30
+ #else
31
+ static void check_utf8_encoding(VALUE str) {}
32
+ #endif
33
+
34
+ extern VALUE rb_mCharlockHolmes;
35
+ static VALUE rb_cTransliterator;
36
+
37
+ static VALUE rb_transliterator_id_list(VALUE self) {
38
+ UErrorCode status = U_ZERO_ERROR;
39
+ icu::StringEnumeration *id_list;
40
+ int32_t id_list_size;
41
+ const char *curr_id;
42
+ int32_t curr_id_len;
43
+ VALUE rb_ary;
44
+ VALUE rb_curr_id;
45
+
46
+ id_list_size = 0;
47
+ id_list = icu::Transliterator::getAvailableIDs(status);
48
+ if(!U_SUCCESS(status)) {
49
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
50
+ }
51
+
52
+ status = U_ZERO_ERROR;
53
+ id_list_size = id_list->count(status);
54
+ if(!U_SUCCESS(status)) {
55
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
56
+ }
57
+
58
+ rb_ary = rb_ary_new2(id_list_size);
59
+
60
+ do {
61
+ curr_id_len = 0;
62
+ curr_id = id_list->next(&curr_id_len, status);
63
+ if(!U_SUCCESS(status)) {
64
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
65
+ }
66
+
67
+ if (curr_id != NULL) {
68
+ rb_curr_id = charlock_new_str(curr_id, curr_id_len);
69
+ rb_ary_push(rb_ary, rb_curr_id);
70
+ }
71
+ } while(curr_id != NULL);
72
+
73
+ delete id_list;
74
+
75
+ return rb_ary;
76
+ }
77
+
78
+ static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
79
+ UErrorCode status = U_ZERO_ERROR;
80
+ UParseError p_error;
81
+ icu::Transliterator *trans;
82
+ const char *txt;
83
+ size_t txt_len;
84
+ const char *id;
85
+ size_t id_len;
86
+ icu::UnicodeString *u_txt;
87
+ std::string result;
88
+ VALUE rb_out;
89
+
90
+ Check_Type(rb_txt, T_STRING);
91
+ Check_Type(rb_id, T_STRING);
92
+
93
+ check_utf8_encoding(rb_txt);
94
+ check_utf8_encoding(rb_id);
95
+
96
+ txt = RSTRING_PTR(rb_txt);
97
+ txt_len = RSTRING_LEN(rb_txt);
98
+ id = RSTRING_PTR(rb_id);
99
+ id_len = RSTRING_LEN(rb_id);
100
+
101
+ trans = icu::Transliterator::createInstance(icu::UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status);
102
+ if(!U_SUCCESS(status)) {
103
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
104
+ }
105
+
106
+ u_txt = new icu::UnicodeString(txt, txt_len);
107
+ trans->transliterate(*u_txt);
108
+ icu::StringByteSink<std::string> sink(&result);
109
+ u_txt->toUTF8(sink);
110
+
111
+ delete u_txt;
112
+ delete trans;
113
+
114
+ rb_out = charlock_new_str(result.data(), result.length());
115
+
116
+ return rb_out;
117
+ }
118
+
119
+ void _init_charlock_transliterator() {
120
+ #ifdef HAVE_RUBY_ENCODING_H
121
+ rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
122
+ #endif
123
+
124
+ rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);
125
+
126
+ rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0);
127
+ rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
128
+ }
129
+
130
+ }
@@ -0,0 +1,76 @@
1
module CharlockHolmes
  class EncodingDetector
    # Default number of leading bytes scanned for NULL bytes (1 MiB)
    DEFAULT_BINARY_SCAN_LEN = 1 << 20

    # Name stored in the encoding table for ICU encodings Ruby cannot resolve
    BINARY = 'binary'

    # Number of leading bytes scanned for NULL bytes
    attr_accessor :binary_scan_length

    alias_method :strip_tags?, :strip_tags

    # scan_len - how many leading bytes to scan for NULL bytes
    #            (defaults to DEFAULT_BINARY_SCAN_LEN)
    def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
      @binary_scan_length = scan_len
    end

    # Maps each supported ICU encoding name to the name of the matching Ruby
    # encoding, like: {"UTF-8" => "UTF-8"}
    #
    # ICU names that Ruby's Encoding.find does not know map to BINARY.
    @encoding_table = {}

    class << self
      # The Hash filled in by build_encoding_table
      attr_reader :encoding_table

      # Attempt to detect the encoding of this string
      #
      # NOTE: This creates a new CharlockHolmes::EncodingDetector instance on
      # every call and uses the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: a Hash with :encoding, :language, :type and :confidence
      def detect(str, hint_enc=nil)
        detector = new
        detector.detect(str, hint_enc)
      end

      # Attempt to detect the encoding of this string, and return
      # a list with all the possible encodings that match it.
      #
      # NOTE: This creates a new CharlockHolmes::EncodingDetector instance on
      # every call and uses the default binary scan length
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which will
      #            be used as an additional hint to the charset detector
      #
      # Returns: an Array with zero or more Hashes,
      # each one of them with :encoding, :language, :type and :confidence
      def detect_all(str, hint_enc=nil)
        detector = new
        detector.detect_all(str, hint_enc)
      end

      # Fills the encoding table by running through the list of supported
      # encodings in the ICU detection API and trying to map them to supported
      # encodings in Ruby. This is built dynamically so as to take advantage of
      # ICU upgrades which may have support for more encodings in the future.
      #
      # Callers should not rely on the return value.
      def build_encoding_table
        supported_encodings.each do |name|
          ruby_name = begin
            ::Encoding.find(name).name
          rescue ArgumentError
            BINARY
          end
          @encoding_table[name] = ruby_name
        end
      end
    end

    build_encoding_table
  end
end
@@ -0,0 +1,34 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string using a fresh
  # CharlockHolmes::EncodingDetector
  #
  # hint_enc - an optional encoding name passed on to the detector
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect(self, hint_enc)
  end

  # Attempt to detect the encoding of this string, and return
  # a list with all the possible encodings that match it.
  #
  # hint_enc - an optional encoding name passed on to the detector
  #
  # Returns: an Array with zero or more Hashes,
  # each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc=nil)
    CharlockHolmes::EncodingDetector.new.detect_all(self, hint_enc)
  end

  if method_defined? :force_encoding
    # Attempt to detect the encoding of this string, then tag the string
    # with the detected :ruby_encoding (when there is one) via `force_encoding`
    #
    # hint_enc - an optional encoding name passed on to the detector
    #
    # Returns: self
    def detect_encoding!(hint_enc=nil)
      detected = detect_encoding(hint_enc)
      ruby_encoding = detected && detected[:ruby_encoding]
      force_encoding(ruby_encoding) if ruby_encoding
      self
    end
  end
end
@@ -0,0 +1,3 @@
1
module CharlockHolmes
  # Version string of this release of the library
  VERSION = "0.7.7"
end
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: static_holmes
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.7
5
+ platform: ruby
6
+ authors:
7
+ - Brian Lopez
8
+ - Vicent Martí
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2024-03-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake-compiler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: minitest
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '5.11'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '5.11'
42
+ - !ruby/object:Gem::Dependency
43
+ name: chardet
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.9'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.9'
56
+ description: charlock_holmes provides binary and text detection as well as text transcoding
57
+ using libicu
58
+ email: seniorlopez@gmail.com
59
+ executables: []
60
+ extensions:
61
+ - ext/charlock_holmes/extconf.rb
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ext/charlock_holmes/common.h
65
+ - ext/charlock_holmes/converter.c
66
+ - ext/charlock_holmes/encoding_detector.c
67
+ - ext/charlock_holmes/ext.c
68
+ - ext/charlock_holmes/extconf.rb
69
+ - ext/charlock_holmes/transliterator.cpp
70
+ - lib/charlock_holmes.rb
71
+ - lib/charlock_holmes/encoding_detector.rb
72
+ - lib/charlock_holmes/string.rb
73
+ - lib/charlock_holmes/version.rb
74
+ homepage: https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
75
+ licenses:
76
+ - MIT
77
+ metadata: {}
78
+ post_install_message:
79
+ rdoc_options:
80
+ - "--charset=UTF-8"
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: 1.9.3
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubygems_version: 3.5.6
95
+ signing_key:
96
+ specification_version: 4
97
+ summary: Character encoding detection, brought to you by ICU
98
+ test_files: []