charlock_holmes 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
# encoding: utf-8

# Gem specification for charlock_holmes.
#
# The version file is located relative to this gemspec (not the current
# working directory) so the spec can be evaluated from any directory.
require File.expand_path('../lib/charlock_holmes/version', __FILE__) unless defined? CharlockHolmes::VERSION

Gem::Specification.new do |s|
  s.name = %q{charlock_holmes}
  s.version = CharlockHolmes::VERSION
  s.authors = ["Brian Lopez", "Vicent Martí"]
  s.date = Time.now.utc.strftime("%Y-%m-%d")
  s.email = %q{seniorlopez@gmail.com}
  # Native extension: built through ext/charlock_holmes/extconf.rb at install time.
  s.extensions = ["ext/charlock_holmes/extconf.rb"]
  # NOTE: the file lists come from git, so the gem must be built from a git checkout.
  s.files = `git ls-files`.split("\n")
  s.homepage = %q{http://github.com/brianmario/charlock_holmes}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib", "ext"]
  s.rubygems_version = %q{1.4.2}
  s.summary = %q{Character encoding detection, brought to you by ICU}
  s.test_files = `git ls-files spec`.split("\n")

  # tests
  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
  s.add_development_dependency 'rspec', ">= 2.0.0"
  # benchmarks
  s.add_development_dependency 'chardet'
end
@@ -0,0 +1,119 @@
1
+ #include "unicode/ucsdet.h"
2
+
3
+ #include <ruby.h>
4
+ #ifdef HAVE_RUBY_ENCODING_H
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ static VALUE rb_mCharlockHolmes;
9
+ static VALUE rb_cEncodingDetector;
10
+
11
+ static VALUE charlock_new_str2(const char *str)
12
+ {
13
+ #ifdef HAVE_RUBY_ENCODING_H
14
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
15
+ #else
16
+ return rb_str_new2(str);
17
+ #endif
18
+ }
19
+
20
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
21
+ {
22
+ UErrorCode status = U_ZERO_ERROR;
23
+ const char *mname;
24
+ const char *mlang;
25
+ int mconfidence;
26
+ VALUE rb_match;
27
+
28
+ if (!match)
29
+ return Qnil;
30
+
31
+ mname = ucsdet_getName(match, &status);
32
+ mlang = ucsdet_getLanguage(match, &status);
33
+ mconfidence = ucsdet_getConfidence(match, &status);
34
+
35
+ rb_match = rb_hash_new();
36
+
37
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
38
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
39
+
40
+ if (mlang && mlang[0])
41
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
42
+
43
+ return rb_match;
44
+ }
45
+
46
+ /*
47
+ * call-seq: detection_hash = EncodingDetector.detect "some string"
48
+ *
49
+ * Attempt to detect the encoding of this string
50
+ *
51
+ * Returns: a Hash with :encoding, :language and :confidence
52
+ */
53
+ static VALUE rb_encdec_detect(VALUE self, VALUE rb_str)
54
+ {
55
+ UErrorCode status = U_ZERO_ERROR;
56
+ UCharsetDetector *csd;
57
+
58
+ Check_Type(rb_str, T_STRING);
59
+ Data_Get_Struct(self, UCharsetDetector, csd);
60
+
61
+ ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
62
+ return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
63
+ }
64
+
65
+
66
+ /*
67
+ * call-seq: detection_hash_array = EncodingDetector.detect_all "some string"
68
+ *
69
+ * Attempt to detect the encoding of this string, and return
70
+ * a list with all the possible encodings that match it.
71
+ *
72
+ * Returns: a List with zero or more Hashes,
73
+ * each one of them with with :encoding, :language and :confidence
74
+ */
75
+ static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
76
+ {
77
+ UErrorCode status = U_ZERO_ERROR;
78
+ UCharsetDetector *csd;
79
+ const UCharsetMatch **csm;
80
+ VALUE rb_ret;
81
+ int i, match_count;
82
+
83
+ Check_Type(rb_str, T_STRING);
84
+ Data_Get_Struct(self, UCharsetDetector, csd);
85
+
86
+ rb_ret = rb_ary_new();
87
+
88
+ ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
89
+ csm = ucsdet_detectAll(csd, &match_count, &status);
90
+
91
+ for (i = 0; i < match_count; ++i) {
92
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
93
+ }
94
+
95
+ return rb_ret;
96
+ }
97
+
98
+
99
+ static void rb_encdec__free(void *csd)
100
+ {
101
+ ucsdet_close((UCharsetDetector *)csd);
102
+ }
103
+
104
+ static VALUE rb_encdec__alloc(VALUE klass)
105
+ {
106
+ UErrorCode status = U_ZERO_ERROR;
107
+ UCharsetDetector *csd = ucsdet_open(&status);
108
+ return Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)csd);
109
+ }
110
+
111
+ void Init_charlock_holmes()
112
+ {
113
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
114
+
115
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
116
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
117
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, 1);
118
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, 1);
119
+ }
@@ -0,0 +1,10 @@
1
# Build configuration for the charlock_holmes native extension.
require 'mkmf'

$CFLAGS << ' -Wall -funroll-loops'
# Extra warnings, no optimisation and full debug info when DEBUG is set.
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']

# Adds the --with-icu-dir / --with-icu-include / --with-icu-lib options.
dir_config 'icu'

# Stop right away with a readable message when ICU is missing, instead of
# generating a Makefile that can only fail later at compile or link time.
unless have_library('icui18n') && have_header('unicode/ucsdet.h')
  abort "ICU is required to build charlock_holmes: " \
        "could not find the icui18n library or the unicode/ucsdet.h header"
end

create_makefile 'charlock_holmes'
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
@@ -0,0 +1,12 @@
1
module CharlockHolmes
  class EncodingDetector
    # Attempt to detect the encoding of this string
    #
    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
    #
    # str - the String whose encoding should be guessed
    #
    # Returns: a Hash with :encoding, :language and :confidence
    def self.detect(str)
      new.detect(str)
    end

    # Attempt to detect the encoding of this string, and return
    # a list with all the possible encodings that match it.
    #
    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
    #
    # str - the String whose encoding should be guessed
    #
    # Returns: an Array with zero or more Hashes,
    # each one of them with :encoding, :language and :confidence
    def self.detect_all(str)
      new.detect_all(str)
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string
  #
  # Returns: a Hash with :encoding, :language and :confidence
  def detect_encoding
    encoding_detector.detect(self)
  end

  # Feature-detect String#force_encoding rather than matching RUBY_VERSION
  # against /1.9/, which skips this method on every later Ruby release.
  if method_defined?(:force_encoding)
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected ala `force_encoding`
    #
    # Returns: a Hash with :encoding, :language and :confidence
    # (nil when nothing was detected, in which case the string is untouched)
    def detect_encoding!
      if detected = detect_encoding
        force_encoding detected[:encoding]
        detected
      end
    end
  end

  protected
  # Detector used by the methods above. It is memoized on the string itself,
  # except for frozen strings, which cannot store instance variables and
  # therefore get a fresh detector on every call.
  def encoding_detector
    return CharlockHolmes::EncodingDetector.new if frozen?
    @encoding_detector ||= CharlockHolmes::EncodingDetector.new
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the charlock_holmes gem.
module CharlockHolmes
  # Version string of this release.
  VERSION = '0.2.0'
end
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
describe CharlockHolmes::EncodingDetector do
  before :all do
    @detector = CharlockHolmes::EncodingDetector.new
  end

  test 'has a detect class-level method' do
    # The respond_to? result has to be asserted, otherwise the check is a no-op.
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect method' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect_all method' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test'
    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  context 'encoding detection' do
    # Fixture file name (under spec/fixtures) => encoding the detector must report.
    MAPPING = [
      ['repl2.cljs', 'ISO-8859-1'],
      ['core.rkt', 'UTF-8'],
      ['cl-messagepack.lisp', 'ISO-8859-1'],
      ['TwigExtensionsDate.es.yml', 'UTF-8'],
      ['AnsiGraph.psm1', 'UTF-16LE'],
      ['laholator.py', 'UTF-8']
    ]

    MAPPING.each do |mapping|
      file, encoding = mapping

      test "#{file} should be detected as #{encoding}" do
        path = File.expand_path "../fixtures/#{file}", __FILE__
        # Read the raw bytes: text mode may translate line endings on some
        # platforms, which would alter multi-byte fixtures such as UTF-16.
        content = File.open(path, 'rb') { |f| f.read }
        guessed = @detector.detect content

        assert_equal encoding, guessed[:encoding]

        # Only on Rubies with encoding support: the guess must be a valid tag for the bytes.
        if content.respond_to? :force_encoding
          content.force_encoding guessed[:encoding]
          assert content.valid_encoding?
        end
      end
    end
  end
end
Binary file
@@ -0,0 +1,8 @@
1
+ date.year: '%year% año|%year% años'
2
+ date.month: '%month% mes|%month% meses'
3
+ date.day: '%day% día|%day% días'
4
+ date.hour: '%hour% hora|%hour% horas'
5
+ date.minute: '%minute% minuto|%minute% minutos'
6
+ date.second: '%second% segundo|%second% segundos'
7
+ date.new: 'menos de un minuto'
8
+ date.and: ' y '
@@ -0,0 +1,264 @@
1
+ ;;;; cl-messagepack.lisp
2
+
3
+ (in-package #:messagepack)
4
+
5
+ (declaim (optimize (debug 3)))
6
+
7
;;; Symbol-building helpers. They are wrapped in EVAL-WHEN so that they
;;; are also defined at compile time, where macros may call them.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (defun mkstr (&rest args)
    "Return one string made of the PRINC representation of every element of ARGS."
    (with-output-to-string (out)
      (dolist (arg args)
        (princ arg out))))
  (defun mksymb (&rest args)
    "Intern, in the current package, the symbol named by (apply #'mkstr ARGS)."
    (let ((name (apply #'mkstr args)))
      (intern name))))
12
+
13
(defmacro signed-unsigned-convertors (size)
  "Define SB<size>->UB<size> and UB<size>->SB<size>, which convert between a
signed SIZE-bit integer and the unsigned integer with the same two's
complement bit pattern.  The generated functions are compiled with safety 0,
and with speed 3 only when SIZE is below 32."
  (let ((speed (if (< size 32) 3 0))
        (to-unsigned (mksymb 'sb size '-> 'ub size))
        (to-signed (mksymb 'ub size '-> 'sb size))
        (signed-min (- (expt 2 (1- size))))
        (signed-max (1- (expt 2 (1- size)))))
    `(progn
       ;; signed -> unsigned: keep the low SIZE bits of a negative value.
       (defun ,to-unsigned (sb)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (integer ,signed-min ,signed-max) sb))
         (if (< sb 0)
             (ldb (byte ,size 0) sb)
             sb))
       ;; unsigned -> signed: a set top bit means the value is negative.
       (defun ,to-signed (sb)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (mod ,(expt 2 size)) sb))
         (if (logbitp (1- ,size) sb)
             (- (1+ (logxor (1- (expt 2 ,size)) sb)))
             sb)))))

;;; Instantiate the convertors for every integer width used by the format.
(signed-unsigned-convertors 8)
(signed-unsigned-convertors 16)
(signed-unsigned-convertors 32)
(signed-unsigned-convertors 64)
33
+
34
(defun write-hex (data)
  "Print the elements of the sequence DATA to standard output as two-digit
hexadecimal numbers, sixteen per line.  Returns NIL."
  (let ((pending '()))
    (flet ((flush-line ()
             ;; PENDING is accumulated in reverse order.
             (format t "~{~2,'0x ~}~%" (nreverse pending))
             (setf pending nil)))
      (dotimes (index (length data))
        (push (elt data index) pending)
        (when (= (length pending) 16)
          (flush-line)))
      ;; Emit the last, incomplete line if there is one.
      (when pending
        (flush-line)))))
45
+
46
(defun encode (data)
  "Serialize DATA with ENCODE-STREAM into an in-memory flexi-streams output
sequence and return that sequence."
  (flexi-streams:with-output-to-sequence (out)
    (encode-stream data out)))
49
+
50
(defun make-hash (data)
  "Build a hash table (default test) from the list DATA.  Each entry is either
a dotted pair (KEY . VALUE) or a list (KEY VALUE ...), in which case the
second element is used as the value."
  (let ((table (make-hash-table)))
    (loop for entry in data
          for key = (car entry)
          for tail = (cdr entry)
          do (setf (gethash key table)
                   (if (consp tail)
                       (car tail)
                       tail)))
    table))
58
+
59
(defun is-byte-array (data-type)
  "True when DATA-TYPE is a vector whose element type is (UNSIGNED-BYTE 8)."
  (when (vectorp data-type)
    (equal (array-element-type data-type) '(unsigned-byte 8))))
62
+
63
(defun encode-stream (data stream)
  "Write the encoding of DATA to the binary output STREAM, dispatching on the
type of DATA.  The clauses are tried in order, so floats win over other
numbers, strings and byte arrays win over generic vectors, and NIL / T win
over other symbols.  Signals an error for anything else."
  (typecase data
    (float (encode-float data stream))
    (number (encode-integer data stream))
    (null (write-byte #xc0 stream))
    ((eql t) (write-byte #xc3 stream))
    (string (encode-string data stream))
    ((satisfies is-byte-array) (encode-raw-bytes data stream))
    ((or cons vector) (encode-array data stream))
    (hash-table (encode-hash data stream))
    ;; Remaining symbols are written as their name.
    (symbol (encode-string (symbol-name data) stream))
    (t (error "Cannot encode data."))))
79
+
80
(defun encode-string (data stream)
  "Convert the string DATA to octets with babel and write them to STREAM as a
raw byte sequence."
  (let ((octets (babel:string-to-octets data)))
    (encode-raw-bytes octets stream)))
82
+
83
#+sbcl (defun sbcl-encode-float (data stream)
  "SBCL-only float encoder.  Writes type byte #xca followed by 4 big-endian
bytes for a SINGLE-FLOAT, or #xcb followed by 8 big-endian bytes for a
DOUBLE-FLOAT.  Always returns T."
  (cond ((typep data 'single-float)
         (write-byte #xca stream)
         ;; SINGLE-FLOAT-BITS is signed; reduce it to its unsigned 32-bit
         ;; pattern so that negative floats keep their sign bit on the wire.
         (store-big-endian (ldb (byte 32 0) (sb-kernel:single-float-bits data))
                           stream 4))
        ((typep data 'double-float)
         (write-byte #xcb stream)
         ;; Same for the (signed) high word; the low word is already unsigned.
         (store-big-endian (ldb (byte 32 0) (sb-kernel:double-float-high-bits data))
                           stream 4)
         (store-big-endian (sb-kernel:double-float-low-bits data) stream 4)))
  t)
92
+
93
(defun encode-float (data stream)
  "Write the float DATA to STREAM.  Only implemented on SBCL; any other
implementation signals an error."
  #+sbcl (sbcl-encode-float data stream)
  #-sbcl (error "No floating point support yet."))
96
+
97
(defun encode-each (data stream &optional (encoder #'encode-stream))
  "Call ENCODER on every element of DATA with STREAM as second argument.
For a hash table each key is encoded right before its value; vectors and
lists are traversed in order.  Signals an error for any other DATA."
  (typecase data
    (hash-table
     (flet ((emit-entry (key value)
              (funcall encoder key stream)
              (funcall encoder value stream)))
       (maphash #'emit-entry data)))
    ((or vector cons)
     (let ((items (coerce data 'list)))
       (dolist (item items items)
         (funcall encoder item stream))))
    (t (error "Not sequence or hash table."))))
108
+
109
(defun encode-sequence (data stream
                        short-prefix short-length
                        typecode-16 typecode-32
                        &optional (encoder #'encode-stream))
  "Write a length header for DATA followed by its elements.

DATA is a sequence or a hash table (whose length is its entry count).
Up to SHORT-LENGTH elements the header is the single byte SHORT-PREFIX + length.
Up to 65535 elements it is TYPECODE-16 followed by a 2-byte big-endian length.
Up to 2^32 - 1 elements it is TYPECODE-32 followed by a 4-byte big-endian length.
Elements are written with ENCODER through ENCODE-EACH.
Signals an error when DATA is longer than that, instead of silently
writing nothing."
  (let ((len (if (hash-table-p data)
                 (hash-table-count data)
                 (length data))))
    (cond ((<= 0 len short-length)
           (write-byte (+ short-prefix len) stream)
           (encode-each data stream encoder))
          ((<= 0 len 65535)
           (write-byte typecode-16 stream)
           (store-big-endian len stream 2)
           (encode-each data stream encoder))
          ((<= 0 len (1- (expt 2 32)))
           (write-byte typecode-32 stream)
           (store-big-endian len stream 4)
           (encode-each data stream encoder))
          (t (error "Sequence too long.")))))
127
+
128
(defun encode-hash (data stream)
  "Write the hash table DATA to STREAM as a map: prefix #x80 + count for up to
15 entries, otherwise type byte #xde (16-bit count) or #xdf (32-bit count).
These are the map type bytes that DECODE-STREAM dispatches to DECODE-MAP;
#xdc / #xdd would be read back as arrays."
  (encode-sequence data stream #x80 15 #xde #xdf))
130
+
131
(defun encode-array (data stream)
  "Write the list or vector DATA to STREAM as an array: prefix #x90 + length
for up to 15 elements, otherwise type byte #xdc or #xdd."
  (let ((short-prefix #x90)
        (short-max 15)
        (array-16 #xdc)
        (array-32 #xdd))
    (encode-sequence data stream short-prefix short-max array-16 array-32)))
133
+
134
(defun encode-raw-bytes (data stream)
  "Write the byte sequence DATA to STREAM as raw bytes: prefix #xa0 + length
for up to 31 bytes, otherwise type byte #xda or #xdb.  Each element is
written verbatim with WRITE-BYTE."
  (let ((short-prefix #xa0)
        (short-max 31)
        (raw-16 #xda)
        (raw-32 #xdb))
    (encode-sequence data stream short-prefix short-max raw-16 raw-32 #'write-byte)))
136
+
137
(defun encode-integer (data stream)
  "Write the integer DATA to STREAM using the first form below that fits:

  0 .. 127          the value itself (one byte)
  -32 .. -1         the value's 8-bit two's complement (one byte)
  unsigned          #xcc / #xcd / #xce / #xcf + 1 / 2 / 4 / 8 big-endian bytes
  signed            #xd0 / #xd1 / #xd2 / #xd3 + 1 / 2 / 4 / 8 big-endian bytes

Signals an error when DATA does not fit in 64 bits."
  (cond ((<= 0 data 127) (write-byte data stream))
        ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
        ((<= 0 data 255)
         (write-byte #xcc stream)
         (write-byte data stream))
        ((<= 0 data 65535)
         (write-byte #xcd stream)
         (store-big-endian data stream 2))
        ((<= 0 data (1- (expt 2 32)))
         (write-byte #xce stream)
         (store-big-endian data stream 4))
        ((<= 0 data (1- (expt 2 64)))
         (write-byte #xcf stream)
         (store-big-endian data stream 8))
        ;; Only negative values reach the clauses below.
        ((<= -128 data 127)
         (write-byte #xd0 stream)
         (write-byte (sb8->ub8 data) stream))
        ;; Multi-byte payloads must go through STORE-BIG-ENDIAN: a single
        ;; WRITE-BYTE cannot hold a 16/32/64-bit value.
        ((<= -32768 data 32767)
         (write-byte #xd1 stream)
         (store-big-endian (sb16->ub16 data) stream 2))
        ((<= (- (expt 2 31)) data (1- (expt 2 31)))
         (write-byte #xd2 stream)
         (store-big-endian (sb32->ub32 data) stream 4))
        ((<= (- (expt 2 63)) data (1- (expt 2 63)))
         (write-byte #xd3 stream)
         (store-big-endian (sb64->ub64 data) stream 8))
        (t (error "Integer too large or too small."))))
165
+
166
(defun store-big-endian (number stream byte-count)
  "Write the integer NUMBER to STREAM as exactly BYTE-COUNT bytes, most
significant byte first.  A negative NUMBER is written as its two's
complement (previously it was silently written as zeros).
Signals an error when NUMBER does not fit in BYTE-COUNT bytes.
Returns the list of bytes that was written."
  (let ((bits (* 8 byte-count)))
    ;; Accept anything representable in BITS bits, signed or unsigned.
    (unless (<= (- (ash 1 (1- bits))) number (1- (ash 1 bits)))
      (error "Number too large."))
    (write-sequence
     (loop for shift from (- bits 8) downto 0 by 8
           collect (ldb (byte 8 shift) number))
     stream)))
180
+
181
(defun decode (byte-array)
  "Decode and return the first value found in BYTE-ARRAY, by reading it
through an in-memory flexi-streams input stream with DECODE-STREAM."
  (flexi-streams:with-input-from-sequence (in byte-array)
    (decode-stream in)))
184
+
185
(defun decode-stream (stream)
  "Read one encoded value from the binary input STREAM and return it.

The first byte selects the type: small integers are stored in the byte
itself, other integers / floats / raw bytes / arrays / maps are introduced
by a type byte, possibly followed by a big-endian length.  #xc0 and #xc2
both decode to NIL, #xc3 to T.  Floats are only supported on SBCL.
Signals an error for a type byte that matches none of the known forms
(previously such a byte was silently decoded as NIL)."
  (let ((byte (read-byte stream)))
    (cond ((= 0 (ldb (byte 1 7) byte))  ; top bit clear: 0 .. 127
           byte)
          ((= 7 (ldb (byte 3 5) byte))  ; top three bits set: -32 .. -1
           (ub8->sb8 byte))
          ((= #xcc byte)
           (read-byte stream))
          ((= #xcd byte)
           (load-big-endian stream 2))
          ((= #xce byte)
           (load-big-endian stream 4))
          ((= #xcf byte)
           (load-big-endian stream 8))
          ((= #xd0 byte)
           (ub8->sb8 (read-byte stream)))
          ((= #xd1 byte)
           (ub16->sb16 (load-big-endian stream 2)))
          ((= #xd2 byte)
           (ub32->sb32 (load-big-endian stream 4)))
          ((= #xd3 byte)
           (ub64->sb64 (load-big-endian stream 8)))
          ((= #xc0 byte)
           nil)
          ((= #xc3 byte)
           t)
          ((= #xc2 byte)
           nil)
          ((= #xca byte)
           ;; MAKE-SINGLE-FLOAT expects the bits as a signed 32-bit integer.
           (or #+sbcl (sb-kernel:make-single-float
                       (ub32->sb32 (load-big-endian stream 4)))
               #-(or sbcl) (error "No floating point support yet.")))
          ((= #xcb byte)
           ;; High word is signed, low word unsigned; the high word is read first.
           (or #+sbcl (sb-kernel:make-double-float
                       (ub32->sb32 (load-big-endian stream 4))
                       (load-big-endian stream 4))
               #-(or sbcl) (error "No floating point support yet.")))
          ((= 5 (ldb (byte 3 5) byte))  ; short raw bytes, length in the low 5 bits
           (decode-raw-sequence (ldb (byte 5 0) byte) stream))
          ((= #xda byte)
           (decode-raw-sequence (load-big-endian stream 2) stream))
          ((= #xdb byte)
           (decode-raw-sequence (load-big-endian stream 4) stream))
          ((= 9 (ldb (byte 4 4) byte))  ; short array, length in the low 4 bits
           (decode-array (- byte #x90) stream))
          ((= #xdc byte)
           (decode-array (load-big-endian stream 2) stream))
          ((= #xdd byte)
           (decode-array (load-big-endian stream 4) stream))
          ((= 8 (ldb (byte 4 4) byte))  ; short map, entry count in the low 4 bits
           (decode-map (- byte #x80) stream))
          ((= #xde byte)
           (decode-map (load-big-endian stream 2) stream))
          ((= #xdf byte)
           (decode-map (load-big-endian stream 4) stream))
          (t (error "Cannot decode type byte ~2,'0x." byte)))))
238
+
239
(defun decode-map (length stream)
  "Read LENGTH key/value pairs from STREAM (each key before its value) and
return them in a fresh EQUAL hash table."
  (let ((table (make-hash-table :test #'equal)))
    (dotimes (n length table)
      (declare (ignorable n))
      (let* ((key (decode-stream stream))
             (value (decode-stream stream)))
        (setf (gethash key table) value)))))
246
+
247
(defun decode-array (length stream)
  "Read LENGTH values from STREAM and return them, in order, in a fresh
general vector."
  (loop with elements = (make-array length)
        for index from 0 below length
        do (setf (aref elements index) (decode-stream stream))
        finally (return elements)))
252
+
253
(defun decode-raw-sequence (length stream)
  "Read LENGTH bytes from STREAM into a byte vector and return them decoded
as a string by babel."
  (let ((octets (make-array length :element-type '(mod 256))))
    (read-sequence octets stream)
    (babel:octets-to-string octets)))
257
+
258
(defun load-big-endian (stream byte-count)
  "Read BYTE-COUNT bytes from STREAM and return the non-negative integer they
form, the first byte read being the most significant one."
  (let ((accumulator 0))
    (dotimes (n byte-count accumulator)
      (declare (ignorable n))
      ;; Shift what was read so far one byte to the left, then add the new byte.
      (setf accumulator (+ (* accumulator 256)
                           (read-byte stream))))))