charlock_holmes_heroku 0.6.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
+ require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
# Gem metadata for the Heroku-oriented build of charlock_holmes.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'charlock_holmes_heroku'
  spec.version  = CharlockHolmes::VERSION
  spec.summary  = 'Character encoding detection, brought to you by ICU'
  spec.homepage = 'http://github.com/brianmario/charlock_holmes'
  spec.authors  = ['Brian Lopez', 'Vicent Martí']
  spec.email    = 'seniorlopez@gmail.com'
  spec.date     = Time.now.utc.strftime('%Y-%m-%d')

  # Packaging: the file lists come from git, so the gem must be built from a checkout
  spec.files             = `git ls-files`.split("\n")
  spec.test_files        = `git ls-files spec`.split("\n")
  spec.extensions        = ['ext/charlock_holmes/extconf.rb']
  spec.require_paths     = ['lib']
  spec.rdoc_options      = ['--charset=UTF-8']
  spec.rubygems_version  = '1.4.2'

  # tests
  spec.add_development_dependency('rake-compiler', '>= 0.7.5')
  spec.add_development_dependency('rspec', '>= 2.0.0')
  # benchmarks
  spec.add_development_dependency('chardet')
end
@@ -0,0 +1,41 @@
1
+ #ifndef CHARLOCK_COMMON_H
2
+ #define CHARLOCK_COMMON_H
3
+
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
9
+ #include <ruby.h>
10
+ #ifdef HAVE_RUBY_ENCODING_H
11
+ #include <ruby/encoding.h>
12
+ #endif
13
+
14
+ static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15
+ {
16
+ #ifdef HAVE_RUBY_ENCODING_H
17
+ return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
18
+ #else
19
+ return rb_str_new(str, len);
20
+ #endif
21
+ }
22
+
23
+ static VALUE charlock_new_str(const char *str, size_t len)
24
+ {
25
+ #ifdef HAVE_RUBY_ENCODING_H
26
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27
+ #else
28
+ return rb_str_new(str, len);
29
+ #endif
30
+ }
31
+
32
+ static VALUE charlock_new_str2(const char *str)
33
+ {
34
+ #ifdef HAVE_RUBY_ENCODING_H
35
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36
+ #else
37
+ return rb_str_new2(str);
38
+ #endif
39
+ }
40
+
41
+ #endif
@@ -0,0 +1,53 @@
1
+ #include "unicode/ucnv.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cConverter;
6
+
7
+ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
8
+ VALUE rb_out;
9
+ const char *src_enc;
10
+ const char *dst_enc;
11
+ const char *src_txt;
12
+ char *out_buf;
13
+ void *rb_enc = NULL;
14
+ int32_t src_len;
15
+ int32_t out_len;
16
+ UErrorCode status = U_ZERO_ERROR;
17
+
18
+ src_txt = RSTRING_PTR(rb_txt);
19
+ src_len = RSTRING_LEN(rb_txt);
20
+ src_enc = RSTRING_PTR(rb_src_enc);
21
+ dst_enc = RSTRING_PTR(rb_dst_enc);
22
+
23
+ // first determin the size of the output buffer
24
+ out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
25
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
26
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
27
+ }
28
+ out_buf = malloc(out_len);
29
+
30
+ // now do the actual conversion
31
+ status = U_ZERO_ERROR;
32
+ out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
33
+ if (U_FAILURE(status)) {
34
+ free(out_buf);
35
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
36
+ }
37
+
38
+ #ifdef HAVE_RUBY_ENCODING_H
39
+ rb_enc = (void *)rb_enc_find(dst_enc);
40
+ #endif
41
+
42
+ rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
43
+
44
+ free(out_buf);
45
+
46
+ return rb_out;
47
+ }
48
+
49
+ void _init_charlock_converter() {
50
+ rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
51
+
52
+ rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
53
+ }
@@ -0,0 +1,295 @@
1
+ #include "unicode/ucsdet.h"
2
+ #include "magic.h"
3
+ #include "common.h"
4
+
5
+ extern VALUE rb_mCharlockHolmes;
6
+ static VALUE rb_cEncodingDetector;
7
+
8
+ typedef struct { /* native state wrapped by each EncodingDetector instance */
9
+ UCharsetDetector *csd; /* ICU charset detector handle, opened with ucsdet_open() */
10
+ magic_t magic; /* libmagic handle used for the binary-content check */
11
+ } charlock_detector_t;
12
+
13
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
14
+ {
15
+ UErrorCode status = U_ZERO_ERROR;
16
+ const char *mname;
17
+ const char *mlang;
18
+ int mconfidence;
19
+ VALUE rb_match;
20
+
21
+ if (!match)
22
+ return Qnil;
23
+
24
+ mname = ucsdet_getName(match, &status);
25
+ mlang = ucsdet_getLanguage(match, &status);
26
+ mconfidence = ucsdet_getConfidence(match, &status);
27
+
28
+ rb_match = rb_hash_new();
29
+
30
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
31
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
32
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
33
+
34
+ if (mlang && mlang[0])
35
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
36
+
37
+ return rb_match;
38
+ }
39
+
40
+ static VALUE rb_encdec_binarymatch() {
41
+ VALUE rb_match;
42
+
43
+ rb_match = rb_hash_new();
44
+
45
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
46
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
47
+
48
+ return rb_match;
49
+ }
50
+
51
+ static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) {
52
+ const char *binary_result;
53
+
54
+ binary_result = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str));
55
+
56
+ if (binary_result) {
57
+ if (!strstr(binary_result, "text"))
58
+ return 1;
59
+ } else {
60
+ rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
61
+ }
62
+
63
+ return 0;
64
+ }
65
+
66
+ /*
67
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
68
+ *
69
+ * Attempt to detect the encoding of this string
70
+ *
71
+ * str - a String, what you want to detect the encoding of
72
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
73
+ * be used as an additional hint to the charset detector
74
+ *
75
+ * Returns: a Hash with :encoding, :language, :type and :confidence
76
+ */
77
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
78
+ {
79
+ UErrorCode status = U_ZERO_ERROR;
80
+ charlock_detector_t *detector;
81
+ VALUE rb_str;
82
+ VALUE rb_enc_hint;
83
+
84
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
85
+
86
+ Check_Type(rb_str, T_STRING);
87
+ Data_Get_Struct(self, charlock_detector_t, detector);
88
+
89
+ // first lets see if this is binary content
90
+ if (detect_binary_content(detector, rb_str)) {
91
+ return rb_encdec_binarymatch();
92
+ }
93
+
94
+ // if we got here - the data doesn't look like binary
95
+ // lets try to figure out what encoding the text is in
96
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
97
+
98
+ if (!NIL_P(rb_enc_hint)) {
99
+ Check_Type(rb_enc_hint, T_STRING);
100
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
101
+ }
102
+
103
+ return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
104
+ }
105
+
106
+
107
+ /*
108
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
109
+ *
110
+ * Attempt to detect the encoding of this string, and return
111
+ * a list with all the possible encodings that match it.
112
+ *
113
+ *
114
+ * str - a String, what you want to detect the encoding of
115
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
116
+ * be used as an additional hint to the charset detector
117
+ *
118
+ * Returns: an Array with zero or more Hashes,
119
+ * each one of them with with :encoding, :language, :type and :confidence
120
+ */
121
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
122
+ {
123
+ UErrorCode status = U_ZERO_ERROR;
124
+ charlock_detector_t *detector;
125
+ const UCharsetMatch **csm;
126
+ VALUE rb_ret;
127
+ int i, match_count;
128
+ VALUE rb_str;
129
+ VALUE rb_enc_hint;
130
+ VALUE binary_match;
131
+
132
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
133
+
134
+ Check_Type(rb_str, T_STRING);
135
+ Data_Get_Struct(self, charlock_detector_t, detector);
136
+
137
+ rb_ret = rb_ary_new();
138
+
139
+ // first lets see if this is binary content
140
+ binary_match = Qnil;
141
+ if (detect_binary_content(detector, rb_str)) {
142
+ binary_match = rb_encdec_binarymatch();
143
+ }
144
+
145
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
146
+
147
+ if (!NIL_P(rb_enc_hint)) {
148
+ Check_Type(rb_enc_hint, T_STRING);
149
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
150
+ }
151
+
152
+ csm = ucsdet_detectAll(detector->csd, &match_count, &status);
153
+
154
+ for (i = 0; i < match_count; ++i) {
155
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
156
+ }
157
+
158
+ if (!NIL_P(binary_match))
159
+ rb_ary_unshift(rb_ret, binary_match);
160
+
161
+ return rb_ret;
162
+ }
163
+
164
+ /*
165
+ * call-seq: EncodingDetector#strip_tags?
166
+ *
167
+ * Returns whether or not the strip_tags flag is set on this detector
168
+ *
169
+ * Returns: Boolean
170
+ */
171
+ static VALUE rb_get_strip_tags(VALUE self)
172
+ {
173
+ charlock_detector_t *detector;
174
+ UBool val;
175
+ VALUE rb_val;
176
+
177
+ Data_Get_Struct(self, charlock_detector_t, detector);
178
+
179
+ val = ucsdet_isInputFilterEnabled(detector->csd);
180
+
181
+ rb_val = val == 1 ? Qtrue : Qfalse;
182
+
183
+ return rb_val;
184
+ }
185
+
186
+ /*
187
+ * call-seq: EncodingDetector#strip_tags = true
188
+ *
189
+ * Enable or disable the stripping of HTML/XML tags from the input before
190
+ * attempting any detection
191
+ *
192
+ * Returns: Boolean, the value passed
193
+ */
194
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
195
+ {
196
+ charlock_detector_t *detector;
197
+ UBool val;
198
+
199
+ Data_Get_Struct(self, charlock_detector_t, detector);
200
+
201
+ val = rb_val == Qtrue ? 1 : 0;
202
+
203
+ ucsdet_enableInputFilter(detector->csd, val);
204
+
205
+ return rb_val;
206
+ }
207
+
208
+ /*
209
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
210
+ *
211
+ * The list of detectable encodings supported by this library
212
+ *
213
+ * Returns: an Array of Strings
214
+ */
215
+ static VALUE rb_get_supported_encodings(VALUE klass)
216
+ {
217
+ UCharsetDetector *csd;
218
+ UErrorCode status = U_ZERO_ERROR;
219
+ UEnumeration *encoding_list;
220
+ VALUE rb_encoding_list;
221
+ int32_t enc_count;
222
+ int32_t i;
223
+ const char *enc_name;
224
+ int32_t enc_name_len;
225
+
226
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
227
+
228
+ // lazily populate the list
229
+ if (NIL_P(rb_encoding_list)) {
230
+ csd = ucsdet_open(&status);
231
+
232
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
233
+ rb_encoding_list = rb_ary_new();
234
+ enc_count = uenum_count(encoding_list, &status);
235
+
236
+ for(i=0; i < enc_count; i++) {
237
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
238
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
239
+ }
240
+
241
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
242
+ ucsdet_close(csd);
243
+ }
244
+
245
+ return rb_encoding_list;
246
+ }
247
+
248
+ static void rb_encdec__free(void *obj)
249
+ {
250
+ charlock_detector_t *detector;
251
+
252
+ detector = (charlock_detector_t *)obj;
253
+
254
+ if (detector->csd)
255
+ ucsdet_close(detector->csd);
256
+
257
+ if (detector->magic)
258
+ magic_close(detector->magic);
259
+
260
+ free(detector);
261
+ }
262
+
263
+ static VALUE rb_encdec__alloc(VALUE klass)
264
+ {
265
+ charlock_detector_t *detector;
266
+ UErrorCode status = U_ZERO_ERROR;
267
+ VALUE obj;
268
+
269
+ detector = calloc(1, sizeof(charlock_detector_t));
270
+ obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
271
+
272
+ detector->csd = ucsdet_open(&status);
273
+ if (U_FAILURE(status)) {
274
+ rb_raise(rb_eStandardError, "%s", u_errorName(status));
275
+ }
276
+
277
+ detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
278
+ if (detector->magic == NULL) {
279
+ rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
280
+ }
281
+
282
+ return obj;
283
+ }
284
+
285
+ void _init_charlock_encoding_detector()
286
+ {
287
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
288
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
289
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
290
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
291
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
292
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
293
+
294
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
295
+ }
@@ -0,0 +1,13 @@
1
+ #include "common.h"
2
+
3
+ extern void _init_charlock_encoding_detector();
4
+ extern void _init_charlock_converter();
5
+
6
+ VALUE rb_mCharlockHolmes;
7
+
8
+ void Init_charlock_holmes() { /* NOTE(review): presumably called by Ruby when the extension is required (Init_<name> convention) - confirm */
9
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes"); /* top-level module; the other .c files reference it via extern */
10
+
11
+ _init_charlock_encoding_detector(); /* defines CharlockHolmes::EncodingDetector */
12
+ _init_charlock_converter(); /* defines CharlockHolmes::Converter */
13
+ }
@@ -0,0 +1,86 @@
1
+ require 'mkmf'
2
+
3
# Absolute path of the directory containing this extconf.rb.
CWD = File.expand_path(File.dirname(__FILE__))

# Echo +cmd+, run it through mkmf's xsystem and return xsystem's result.
# Raises a RuntimeError naming the command when xsystem reports failure.
def sys(cmd)
  puts " -- #{cmd}"

  result = xsystem(cmd)
  raise "#{cmd} failed, please report issue on http://github.com/brianmario/charlock_holmes" unless result

  result
end
11
+
12
# Both bundled dependencies are built with make, so bail out early without it.
make_path = `which make`.strip
if make_path.empty?
  STDERR.puts(
    "\n\n",
    "***************************************************************************************",
    "*************** make required (apt-get install make build-essential) =( ***************",
    "***************************************************************************************"
  )
  exit(1)
end
19
+
20
+ ##
21
+ # ICU dependency
22
+ #
23
+
24
# Unpack the bundled ICU tarball, build it as static PIC libraries into
# #{CWD}/dst, then make sure the extension can link against the result.
src = File.basename('icu4c-52_1-src.tgz')
dir = File.basename('icu')

Dir.chdir("#{CWD}/src") do
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  FileUtils.rm_rf(dir) if File.exist?(dir)

  sys("tar zxvf #{src}")
  Dir.chdir(File.join(dir, 'source')) do
    sys("LDFLAGS= CXXFLAGS=\"-O2 -fPIC\" CFLAGS=\"-O2 -fPIC\" ./configure --prefix=#{CWD}/dst/ --disable-tests --disable-samples --disable-icuio --disable-extras --disable-layout --enable-static --disable-shared")
    sys("make install")
  end
end

dir_config 'icu'

# point the compiler and linker at the freshly built copy
$INCFLAGS << " -I#{CWD}/dst/include "
$LDFLAGS << " -L#{CWD}/dst/lib"

unless have_library 'icui18n' and have_library 'icudata' and have_library 'icutu' and have_library 'icuuc' and have_header 'unicode/ucnv.h'
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "********* error compiling and linking icu4c. please report issue on github *********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
49
+
50
+ ##
51
+ # libmagic dependency
52
+ #
53
+
54
# Unpack the bundled file(1) sources, apply the soft-check patch, build a
# static libmagic into #{CWD}/dst and link the extension against a copy of it.
src = File.basename('file-5.08.tar.gz')
dir = File.basename(src, '.tar.gz')

Dir.chdir("#{CWD}/src") do
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  FileUtils.rm_rf(dir) if File.exist?(dir)

  sys("tar zxvf #{src}")
  Dir.chdir(dir) do
    sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
    sys("patch -p0 < ../file-soft-check.patch")
    sys("make -C src install")
    sys("make -C magic install")
  end
end

# the static archive is copied under a different name (libmagic_ext.a) and linked as 'magic_ext'
FileUtils.cp "#{CWD}/dst/lib/libmagic.a", "#{CWD}/libmagic_ext.a"

# prepend the include path so the bundled headers come first on the include path
$INCFLAGS[0,0] = " -I#{CWD}/dst/include "
$LDFLAGS << " -L#{CWD} "

dir_config 'magic'
unless have_library 'magic_ext' and have_header 'magic.h'
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "********* error compiling and linking libmagic. please report issue on github *********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
82
+
83
# Compiler flags for the extension itself; DEBUG in the environment adds
# extra warnings and an unoptimised build with debug info.
$CFLAGS << ' -Wall -funroll-loops'
if ENV['DEBUG']
  $CFLAGS << ' -Wextra -O0 -ggdb3'
end

# link against the C++ standard library
$LIBS << " -lstdc++"

create_makefile('charlock_holmes/charlock_holmes')