charlock_holmes_heroku 0.6.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
# encoding: utf-8

# Load the version constant relative to this file rather than the current
# working directory, so the gemspec can be evaluated from anywhere.
# NOTE(review): assumes this gemspec sits next to lib/ (as the original
# './lib/...' path implied) -- confirm.
unless defined?(CharlockHolmes::VERSION)
  require File.expand_path('../lib/charlock_holmes/version', __FILE__)
end

Gem::Specification.new do |s|
  s.name = %q{charlock_holmes_heroku}
  s.version = CharlockHolmes::VERSION
  s.authors = ["Brian Lopez", "Vicent Martí"]
  s.date = Time.now.utc.strftime("%Y-%m-%d")
  s.email = %q{seniorlopez@gmail.com}
  # Native extension: built through ext/charlock_holmes/extconf.rb at install time.
  s.extensions = ["ext/charlock_holmes/extconf.rb"]
  # File lists come from git, so the gem must be built from a git checkout.
  s.files = `git ls-files`.split("\n")
  s.homepage = %q{http://github.com/brianmario/charlock_holmes}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.4.2}
  s.summary = %q{Character encoding detection, brought to you by ICU}
  s.test_files = `git ls-files spec`.split("\n")

  # tests
  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
  s.add_development_dependency 'rspec', ">= 2.0.0"
  # benchmarks
  s.add_development_dependency 'chardet'
end
@@ -0,0 +1,41 @@
1
+ #ifndef CHARLOCK_COMMON_H
2
+ #define CHARLOCK_COMMON_H
3
+
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
9
+ #include <ruby.h>
10
+ #ifdef HAVE_RUBY_ENCODING_H
11
+ #include <ruby/encoding.h>
12
+ #endif
13
+
14
+ static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15
+ {
16
+ #ifdef HAVE_RUBY_ENCODING_H
17
+ return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
18
+ #else
19
+ return rb_str_new(str, len);
20
+ #endif
21
+ }
22
+
23
+ static VALUE charlock_new_str(const char *str, size_t len)
24
+ {
25
+ #ifdef HAVE_RUBY_ENCODING_H
26
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27
+ #else
28
+ return rb_str_new(str, len);
29
+ #endif
30
+ }
31
+
32
+ static VALUE charlock_new_str2(const char *str)
33
+ {
34
+ #ifdef HAVE_RUBY_ENCODING_H
35
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36
+ #else
37
+ return rb_str_new2(str);
38
+ #endif
39
+ }
40
+
41
+ #endif
@@ -0,0 +1,53 @@
1
+ #include "unicode/ucnv.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cConverter;
6
+
7
+ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
8
+ VALUE rb_out;
9
+ const char *src_enc;
10
+ const char *dst_enc;
11
+ const char *src_txt;
12
+ char *out_buf;
13
+ void *rb_enc = NULL;
14
+ int32_t src_len;
15
+ int32_t out_len;
16
+ UErrorCode status = U_ZERO_ERROR;
17
+
18
+ src_txt = RSTRING_PTR(rb_txt);
19
+ src_len = RSTRING_LEN(rb_txt);
20
+ src_enc = RSTRING_PTR(rb_src_enc);
21
+ dst_enc = RSTRING_PTR(rb_dst_enc);
22
+
23
+ // first determin the size of the output buffer
24
+ out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
25
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
26
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
27
+ }
28
+ out_buf = malloc(out_len);
29
+
30
+ // now do the actual conversion
31
+ status = U_ZERO_ERROR;
32
+ out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
33
+ if (U_FAILURE(status)) {
34
+ free(out_buf);
35
+ rb_raise(rb_eArgError, "%s", u_errorName(status));
36
+ }
37
+
38
+ #ifdef HAVE_RUBY_ENCODING_H
39
+ rb_enc = (void *)rb_enc_find(dst_enc);
40
+ #endif
41
+
42
+ rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
43
+
44
+ free(out_buf);
45
+
46
+ return rb_out;
47
+ }
48
+
49
+ void _init_charlock_converter() {
50
+ rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
51
+
52
+ rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
53
+ }
@@ -0,0 +1,295 @@
1
+ #include "unicode/ucsdet.h"
2
+ #include "magic.h"
3
+ #include "common.h"
4
+
5
+ extern VALUE rb_mCharlockHolmes;
6
+ static VALUE rb_cEncodingDetector;
7
+
8
+ typedef struct {
9
+ UCharsetDetector *csd;
10
+ magic_t magic;
11
+ } charlock_detector_t;
12
+
13
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
14
+ {
15
+ UErrorCode status = U_ZERO_ERROR;
16
+ const char *mname;
17
+ const char *mlang;
18
+ int mconfidence;
19
+ VALUE rb_match;
20
+
21
+ if (!match)
22
+ return Qnil;
23
+
24
+ mname = ucsdet_getName(match, &status);
25
+ mlang = ucsdet_getLanguage(match, &status);
26
+ mconfidence = ucsdet_getConfidence(match, &status);
27
+
28
+ rb_match = rb_hash_new();
29
+
30
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
31
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
32
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
33
+
34
+ if (mlang && mlang[0])
35
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
36
+
37
+ return rb_match;
38
+ }
39
+
40
+ static VALUE rb_encdec_binarymatch() {
41
+ VALUE rb_match;
42
+
43
+ rb_match = rb_hash_new();
44
+
45
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
46
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
47
+
48
+ return rb_match;
49
+ }
50
+
51
+ static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) {
52
+ const char *binary_result;
53
+
54
+ binary_result = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str));
55
+
56
+ if (binary_result) {
57
+ if (!strstr(binary_result, "text"))
58
+ return 1;
59
+ } else {
60
+ rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
61
+ }
62
+
63
+ return 0;
64
+ }
65
+
66
+ /*
67
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
68
+ *
69
+ * Attempt to detect the encoding of this string
70
+ *
71
+ * str - a String, what you want to detect the encoding of
72
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
73
+ * be used as an additional hint to the charset detector
74
+ *
75
+ * Returns: a Hash with :encoding, :language, :type and :confidence
76
+ */
77
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
78
+ {
79
+ UErrorCode status = U_ZERO_ERROR;
80
+ charlock_detector_t *detector;
81
+ VALUE rb_str;
82
+ VALUE rb_enc_hint;
83
+
84
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
85
+
86
+ Check_Type(rb_str, T_STRING);
87
+ Data_Get_Struct(self, charlock_detector_t, detector);
88
+
89
+ // first lets see if this is binary content
90
+ if (detect_binary_content(detector, rb_str)) {
91
+ return rb_encdec_binarymatch();
92
+ }
93
+
94
+ // if we got here - the data doesn't look like binary
95
+ // lets try to figure out what encoding the text is in
96
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
97
+
98
+ if (!NIL_P(rb_enc_hint)) {
99
+ Check_Type(rb_enc_hint, T_STRING);
100
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
101
+ }
102
+
103
+ return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
104
+ }
105
+
106
+
107
+ /*
108
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
109
+ *
110
+ * Attempt to detect the encoding of this string, and return
111
+ * a list with all the possible encodings that match it.
112
+ *
113
+ *
114
+ * str - a String, what you want to detect the encoding of
115
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
116
+ * be used as an additional hint to the charset detector
117
+ *
118
+ * Returns: an Array with zero or more Hashes,
119
+ * each one of them with with :encoding, :language, :type and :confidence
120
+ */
121
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
122
+ {
123
+ UErrorCode status = U_ZERO_ERROR;
124
+ charlock_detector_t *detector;
125
+ const UCharsetMatch **csm;
126
+ VALUE rb_ret;
127
+ int i, match_count;
128
+ VALUE rb_str;
129
+ VALUE rb_enc_hint;
130
+ VALUE binary_match;
131
+
132
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
133
+
134
+ Check_Type(rb_str, T_STRING);
135
+ Data_Get_Struct(self, charlock_detector_t, detector);
136
+
137
+ rb_ret = rb_ary_new();
138
+
139
+ // first lets see if this is binary content
140
+ binary_match = Qnil;
141
+ if (detect_binary_content(detector, rb_str)) {
142
+ binary_match = rb_encdec_binarymatch();
143
+ }
144
+
145
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
146
+
147
+ if (!NIL_P(rb_enc_hint)) {
148
+ Check_Type(rb_enc_hint, T_STRING);
149
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
150
+ }
151
+
152
+ csm = ucsdet_detectAll(detector->csd, &match_count, &status);
153
+
154
+ for (i = 0; i < match_count; ++i) {
155
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
156
+ }
157
+
158
+ if (!NIL_P(binary_match))
159
+ rb_ary_unshift(rb_ret, binary_match);
160
+
161
+ return rb_ret;
162
+ }
163
+
164
+ /*
165
+ * call-seq: EncodingDetector#strip_tags?
166
+ *
167
+ * Returns whether or not the strip_tags flag is set on this detector
168
+ *
169
+ * Returns: Boolean
170
+ */
171
+ static VALUE rb_get_strip_tags(VALUE self)
172
+ {
173
+ charlock_detector_t *detector;
174
+ UBool val;
175
+ VALUE rb_val;
176
+
177
+ Data_Get_Struct(self, charlock_detector_t, detector);
178
+
179
+ val = ucsdet_isInputFilterEnabled(detector->csd);
180
+
181
+ rb_val = val == 1 ? Qtrue : Qfalse;
182
+
183
+ return rb_val;
184
+ }
185
+
186
+ /*
187
+ * call-seq: EncodingDetector#strip_tags = true
188
+ *
189
+ * Enable or disable the stripping of HTML/XML tags from the input before
190
+ * attempting any detection
191
+ *
192
+ * Returns: Boolean, the value passed
193
+ */
194
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
195
+ {
196
+ charlock_detector_t *detector;
197
+ UBool val;
198
+
199
+ Data_Get_Struct(self, charlock_detector_t, detector);
200
+
201
+ val = rb_val == Qtrue ? 1 : 0;
202
+
203
+ ucsdet_enableInputFilter(detector->csd, val);
204
+
205
+ return rb_val;
206
+ }
207
+
208
+ /*
209
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
210
+ *
211
+ * The list of detectable encodings supported by this library
212
+ *
213
+ * Returns: an Array of Strings
214
+ */
215
+ static VALUE rb_get_supported_encodings(VALUE klass)
216
+ {
217
+ UCharsetDetector *csd;
218
+ UErrorCode status = U_ZERO_ERROR;
219
+ UEnumeration *encoding_list;
220
+ VALUE rb_encoding_list;
221
+ int32_t enc_count;
222
+ int32_t i;
223
+ const char *enc_name;
224
+ int32_t enc_name_len;
225
+
226
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
227
+
228
+ // lazily populate the list
229
+ if (NIL_P(rb_encoding_list)) {
230
+ csd = ucsdet_open(&status);
231
+
232
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
233
+ rb_encoding_list = rb_ary_new();
234
+ enc_count = uenum_count(encoding_list, &status);
235
+
236
+ for(i=0; i < enc_count; i++) {
237
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
238
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
239
+ }
240
+
241
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
242
+ ucsdet_close(csd);
243
+ }
244
+
245
+ return rb_encoding_list;
246
+ }
247
+
248
+ static void rb_encdec__free(void *obj)
249
+ {
250
+ charlock_detector_t *detector;
251
+
252
+ detector = (charlock_detector_t *)obj;
253
+
254
+ if (detector->csd)
255
+ ucsdet_close(detector->csd);
256
+
257
+ if (detector->magic)
258
+ magic_close(detector->magic);
259
+
260
+ free(detector);
261
+ }
262
+
263
+ static VALUE rb_encdec__alloc(VALUE klass)
264
+ {
265
+ charlock_detector_t *detector;
266
+ UErrorCode status = U_ZERO_ERROR;
267
+ VALUE obj;
268
+
269
+ detector = calloc(1, sizeof(charlock_detector_t));
270
+ obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);
271
+
272
+ detector->csd = ucsdet_open(&status);
273
+ if (U_FAILURE(status)) {
274
+ rb_raise(rb_eStandardError, "%s", u_errorName(status));
275
+ }
276
+
277
+ detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
278
+ if (detector->magic == NULL) {
279
+ rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
280
+ }
281
+
282
+ return obj;
283
+ }
284
+
285
+ void _init_charlock_encoding_detector()
286
+ {
287
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
288
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
289
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
290
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
291
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
292
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
293
+
294
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
295
+ }
@@ -0,0 +1,13 @@
1
+ #include "common.h"
2
+
3
+ extern void _init_charlock_encoding_detector();
4
+ extern void _init_charlock_converter();
5
+
6
+ VALUE rb_mCharlockHolmes;
7
+
8
+ void Init_charlock_holmes() {
9
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
10
+
11
+ _init_charlock_encoding_detector();
12
+ _init_charlock_converter();
13
+ }
@@ -0,0 +1,86 @@
1
+ require 'mkmf'
2
+
3
+ CWD = File.expand_path(File.dirname(__FILE__))
4
# Echo +cmd+, then run it through mkmf's xsystem.
#
# Returns the truthy result of xsystem; raises a RuntimeError naming the
# command when xsystem reports failure.
def sys(cmd)
  puts " -- #{cmd}"
  result = xsystem(cmd)
  return result if result

  raise "#{cmd} failed, please report issue on http://github.com/brianmario/charlock_holmes"
end
11
+
12
# The bundled ICU and libmagic sources are built with make, so bail out
# early with a readable banner when it is not on the PATH.
make_path = `which make`.strip

if make_path.empty?
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
19
+
20
##
# ICU dependency
#
# Unpacks the bundled ICU tarball, builds it as static, position-independent
# libraries installed under dst/, and checks that the extension can link
# against them.
#

icu_src = 'icu4c-52_1-src.tgz'
icu_dir = 'icu'

Dir.chdir("#{CWD}/src") do
  # rm_rf ignores a missing path, so no existence check is needed
  # (File.exists? was removed in Ruby 3.2)
  FileUtils.rm_rf(icu_dir)

  sys("tar zxvf #{icu_src}")
  Dir.chdir(File.join(icu_dir, 'source')) do
    sys("LDFLAGS= CXXFLAGS=\"-O2 -fPIC\" CFLAGS=\"-O2 -fPIC\" ./configure --prefix=#{CWD}/dst/ --disable-tests --disable-samples --disable-icuio --disable-extras --disable-layout --enable-static --disable-shared")
    sys("make install")
  end
end

dir_config 'icu'

$INCFLAGS << " -I#{CWD}/dst/include "
$LDFLAGS << " -L#{CWD}/dst/lib"

# Checked in this order, stopping at the first failure, exactly like the
# original `and` chain.
icu_libs_found = %w[icui18n icudata icutu icuuc].all? { |lib| have_library(lib) }

unless icu_libs_found && have_header('unicode/ucnv.h')
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "********* error compiling and linking icu4c. please report issue on github *********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
49
+
50
##
# libmagic dependency
#
# Unpacks the bundled file(1) tarball, patches it, builds a static libmagic
# under dst/ and copies it next to this script as libmagic_ext.a so the
# extension links against the bundled build under its own name.
#

magic_src = 'file-5.08.tar.gz'
magic_dir = File.basename(magic_src, '.tar.gz')

Dir.chdir("#{CWD}/src") do
  # rm_rf ignores a missing path, so no existence check is needed
  # (File.exists? was removed in Ruby 3.2)
  FileUtils.rm_rf(magic_dir)

  sys("tar zxvf #{magic_src}")
  Dir.chdir(magic_dir) do
    sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
    sys("patch -p0 < ../file-soft-check.patch")
    sys("make -C src install")
    sys("make -C magic install")
  end
end

FileUtils.cp "#{CWD}/dst/lib/libmagic.a", "#{CWD}/libmagic_ext.a"

# Prepended so the bundled headers are found before any others
$INCFLAGS[0,0] = " -I#{CWD}/dst/include "
$LDFLAGS << " -L#{CWD} "

dir_config 'magic'
unless have_library('magic_ext') && have_header('magic.h')
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "********* error compiling and linking libmagic. please report issue on github *********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
82
+
83
# Final compiler and linker flags, then generate the Makefile.
$CFLAGS << ' -Wall -funroll-loops'

# Setting DEBUG in the environment adds extra warnings and an
# unoptimised build with debug info.
if ENV['DEBUG']
  $CFLAGS << ' -Wextra -O0 -ggdb3'
end

$LIBS << " -lstdc++"

create_makefile 'charlock_holmes/charlock_holmes'