charlock_holmes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
+ require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
# Packaging metadata for the charlock_holmes gem.
#
# File lists are taken from `git ls-files`, so the gem must be built from a
# git checkout; the date is stamped with the current UTC day at build time.
Gem::Specification.new do |spec|
  spec.name     = 'charlock_holmes'
  spec.version  = CharlockHolmes::VERSION
  spec.summary  = 'Character encoding detection, brought to you by ICU'
  spec.homepage = 'http://github.com/brianmario/charlock_holmes'
  spec.authors  = ['Brian Lopez', 'Vicent Martí']
  spec.email    = 'seniorlopez@gmail.com'
  spec.date     = Time.now.utc.strftime('%Y-%m-%d')

  spec.rubygems_version = '1.4.2'
  spec.rdoc_options     = ['--charset=UTF-8']

  # native extension entry point and load paths
  spec.extensions    = ['ext/charlock_holmes/extconf.rb']
  spec.require_paths = %w[lib ext]

  # packaged files
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files spec`.split("\n")

  # tests
  spec.add_development_dependency 'rake-compiler', '>= 0.7.5'
  spec.add_development_dependency 'rspec', '>= 2.0.0'
  # benchmarks
  spec.add_development_dependency 'chardet'
end
@@ -0,0 +1,119 @@
1
+ #include "unicode/ucsdet.h"
2
+
3
+ #include <ruby.h>
4
+ #ifdef HAVE_RUBY_ENCODING_H
5
+ #include <ruby/encoding.h>
6
+ #endif
7
+
8
+ static VALUE rb_mCharlockHolmes;
9
+ static VALUE rb_cEncodingDetector;
10
+
11
+ static VALUE charlock_new_str2(const char *str)
12
+ {
13
+ #ifdef HAVE_RUBY_ENCODING_H
14
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
15
+ #else
16
+ return rb_str_new2(str);
17
+ #endif
18
+ }
19
+
20
+ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
21
+ {
22
+ UErrorCode status = U_ZERO_ERROR;
23
+ const char *mname;
24
+ const char *mlang;
25
+ int mconfidence;
26
+ VALUE rb_match;
27
+
28
+ if (!match)
29
+ return Qnil;
30
+
31
+ mname = ucsdet_getName(match, &status);
32
+ mlang = ucsdet_getLanguage(match, &status);
33
+ mconfidence = ucsdet_getConfidence(match, &status);
34
+
35
+ rb_match = rb_hash_new();
36
+
37
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
38
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
39
+
40
+ if (mlang && mlang[0])
41
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
42
+
43
+ return rb_match;
44
+ }
45
+
46
+ /*
47
+ * call-seq: detection_hash = EncodingDetector.detect "some string"
48
+ *
49
+ * Attempt to detect the encoding of this string
50
+ *
51
+ * Returns: a Hash with :encoding, :language and :confidence
52
+ */
53
+ static VALUE rb_encdec_detect(VALUE self, VALUE rb_str)
54
+ {
55
+ UErrorCode status = U_ZERO_ERROR;
56
+ UCharsetDetector *csd;
57
+
58
+ Check_Type(rb_str, T_STRING);
59
+ Data_Get_Struct(self, UCharsetDetector, csd);
60
+
61
+ ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
62
+ return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
63
+ }
64
+
65
+
66
+ /*
67
+ * call-seq: detection_hash_array = EncodingDetector.detect_all "some string"
68
+ *
69
+ * Attempt to detect the encoding of this string, and return
70
+ * a list with all the possible encodings that match it.
71
+ *
72
+ * Returns: a List with zero or more Hashes,
73
+ * each one of them with with :encoding, :language and :confidence
74
+ */
75
+ static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
76
+ {
77
+ UErrorCode status = U_ZERO_ERROR;
78
+ UCharsetDetector *csd;
79
+ const UCharsetMatch **csm;
80
+ VALUE rb_ret;
81
+ int i, match_count;
82
+
83
+ Check_Type(rb_str, T_STRING);
84
+ Data_Get_Struct(self, UCharsetDetector, csd);
85
+
86
+ rb_ret = rb_ary_new();
87
+
88
+ ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
89
+ csm = ucsdet_detectAll(csd, &match_count, &status);
90
+
91
+ for (i = 0; i < match_count; ++i) {
92
+ rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
93
+ }
94
+
95
+ return rb_ret;
96
+ }
97
+
98
+
99
+ static void rb_encdec__free(void *csd)
100
+ {
101
+ ucsdet_close((UCharsetDetector *)csd);
102
+ }
103
+
104
+ static VALUE rb_encdec__alloc(VALUE klass)
105
+ {
106
+ UErrorCode status = U_ZERO_ERROR;
107
+ UCharsetDetector *csd = ucsdet_open(&status);
108
+ return Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)csd);
109
+ }
110
+
111
+ void Init_charlock_holmes()
112
+ {
113
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
114
+
115
+ rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
116
+ rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
117
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, 1);
118
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, 1);
119
+ }
@@ -0,0 +1,10 @@
1
+ require 'mkmf'
2
+
3
# Warnings are always on; setting DEBUG in the environment additionally
# disables optimisation and emits full debug info.
$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']

# Adds the --with-icu-dir / --with-icu-include / --with-icu-lib options.
dir_config 'icu'

# Stop at configure time with an actionable message rather than letting the
# build fail later inside the compiler or linker.
unless have_library 'icui18n'
  abort "\nICU (libicui18n) is required to build charlock_holmes but was not found.\n" \
        "Install the ICU development package, or point the build at it with --with-icu-dir=/path/to/icu\n\n"
end

create_makefile 'charlock_holmes'
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
@@ -0,0 +1,12 @@
1
module CharlockHolmes
  class EncodingDetector
    # Attempt to detect the encoding of this string
    #
    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
    #
    # str - the String whose encoding should be detected
    #
    # Returns: a Hash with :encoding, :language and :confidence
    def self.detect(str)
      new.detect(str)
    end

    # Attempt to detect the encoding of this string, and return
    # a list with all the possible encodings that match it.
    #
    # Class-level counterpart of the instance method #detect_all, mirroring
    # what .detect does for #detect.
    #
    # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
    #
    # str - the String whose encoding should be detected
    #
    # Returns: an Array with zero or more Hashes,
    # each one of them with :encoding, :language and :confidence
    def self.detect_all(str)
      new.detect_all(str)
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string
  #
  # Returns: a Hash with :encoding, :language and :confidence
  def detect_encoding
    encoding_detector.detect(self)
  end

  # Feature-test for String#force_encoding instead of matching RUBY_VERSION
  # against /1.9/: the version regexp hid this method on every later Ruby
  # (and its unescaped dot matched unrelated version strings).
  if method_defined? :force_encoding
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected ala `force_encoding`
    #
    # Returns: a Hash with :encoding, :language and :confidence,
    # or nil when nothing was detected (the string is then left untouched)
    def detect_encoding!
      if detected = self.detect_encoding
        self.force_encoding detected[:encoding]
        detected
      end
    end
  end

  protected
  # The detector used by #detect_encoding. It is memoized per string, except
  # for frozen strings: assigning an instance variable on a frozen object
  # raises, so those get a fresh detector on every call.
  def encoding_detector
    return CharlockHolmes::EncodingDetector.new if frozen?
    @encoding_detector ||= CharlockHolmes::EncodingDetector.new
  end
end
@@ -0,0 +1,3 @@
1
module CharlockHolmes
  # Release number of the gem; the gemspec reads it as the gem version.
  VERSION = '0.2.0'
end
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
# Exercises CharlockHolmes::EncodingDetector: presence of the public API and
# detection results for the fixture files under spec/fixtures.
describe CharlockHolmes::EncodingDetector do
  before :all do
    @detector = CharlockHolmes::EncodingDetector.new
  end

  test 'has a detect class-level method' do
    # the respond_to? result has to be asserted, otherwise the check is a no-op
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect method' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect_all method' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test'
    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  context 'encoding detection' do
    # fixture file name => encoding the detector is expected to report
    MAPPING = [
      ['repl2.cljs',                'ISO-8859-1'],
      ['core.rkt',                  'UTF-8'],
      ['cl-messagepack.lisp',       'ISO-8859-1'],
      ['TwigExtensionsDate.es.yml', 'UTF-8'],
      ['AnsiGraph.psm1',            'UTF-16LE'],
      ['laholator.py',              'UTF-8']
    ]

    MAPPING.each do |mapping|
      file, encoding = mapping

      test "#{file} should be detected as #{encoding}" do
        path = File.expand_path "../fixtures/#{file}", __FILE__
        # read raw bytes: text mode may translate line endings or transcode,
        # which would change what the detector sees (notably for UTF-16)
        content = File.open(path, 'rb') { |f| f.read }
        guessed = @detector.detect content

        assert_equal encoding, guessed[:encoding]

        if content.respond_to? :force_encoding
          content.force_encoding guessed[:encoding]
          assert content.valid_encoding?
        end
      end
    end
  end
end
Binary file
@@ -0,0 +1,8 @@
1
+ date.year: '%year% año|%year% años'
2
+ date.month: '%month% mes|%month% meses'
3
+ date.day: '%day% día|%day% días'
4
+ date.hour: '%hour% hora|%hour% horas'
5
+ date.minute: '%minute% minuto|%minute% minutos'
6
+ date.second: '%second% segundo|%second% segundos'
7
+ date.new: 'menos de un minuto'
8
+ date.and: ' y '
@@ -0,0 +1,264 @@
1
+ ;;;; cl-messagepack.lisp
2
+
3
+ (in-package #:messagepack)
4
+
5
+ (declaim (optimize (debug 3)))
6
+
7
;; Helpers needed at macroexpansion time, hence the eval-when covering
;; compile, load and execute.
(eval-when (:compile-toplevel :load-toplevel :execute)
  ;; Concatenate the ~a-printed representations of ARGS into one string.
  (defun mkstr (&rest args)
    (format nil "~{~a~}" args))
  ;; Intern the symbol whose name is (mkstr ARGS...).
  (defun mksymb (&rest args)
    (intern (apply #'mkstr args))))
12
+
13
;; Expands into two conversion functions for integers SIZE bits wide:
;;   sb<SIZE>->ub<SIZE>  signed value -> its unsigned two's-complement form
;;   ub<SIZE>->sb<SIZE>  unsigned value -> the signed value it represents
;; Both are declared with (safety 0) and (debug 0); the speed quality is 3
;; for sizes below 32 and 0 otherwise.
(defmacro signed-unsigned-convertors (size)
  (let ((speed (if (< size 32) 3 0)))
    `(progn
       (defun ,(mksymb 'sb size '-> 'ub size) (sb)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (integer ,(- (expt 2 (1- size))) ,(1- (expt 2 (1- size)))) sb))
         ;; negative values keep only their low SIZE bits
         (if (< sb 0)
             (ldb (byte ,size 0) sb)
             sb))
       (defun ,(mksymb 'ub size '-> 'sb size) (sb)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (mod ,(expt 2 size)) sb))
         ;; top bit set: the value stands for a negative number
         (if (logbitp (1- ,size) sb)
             (- (1+ (logxor (1- (expt 2 ,size)) sb)))
             sb)))))
28
+
29
;; Define the converter pairs for 8, 16, 32 and 64 bit integers.
(signed-unsigned-convertors 8)
(signed-unsigned-convertors 16)
(signed-unsigned-convertors 32)
(signed-unsigned-convertors 64)
33
+
34
;; Print the elements of the sequence DATA to standard output as two-digit
;; hexadecimal numbers, each followed by a space, 16 per line.
(defun write-hex (data)
  (let ((row '()))
    (flet ((flush ()
             (format t "~{~2,'0x ~}~%" (nreverse row))
             (setf row nil)))
      (dotimes (i (length data))
        (push (elt data i) row)
        (when (= (length row) 16)
          (flush)))
      ;; trailing partial line
      (when row
        (flush)))))
45
+
46
;; Encode DATA with encode-stream and return what
;; flexi-streams:with-output-to-sequence collected.
(defun encode (data)
  (flexi-streams:with-output-to-sequence (stream)
    (encode-stream data stream)))
49
+
50
;; Build a hash table (default test) from DATA, a list of entries.
;; An entry is either a dotted pair (key . value) or a list (key value ...),
;; in which case the second element is used as the value.
(defun make-hash (data)
  (let ((table (make-hash-table)))
    (dolist (entry data table)
      (let ((key (car entry))
            (tail (cdr entry)))
        (setf (gethash key table)
              (if (consp tail)
                  (car tail)
                  tail))))))
58
+
59
;; True when the argument is a vector whose array-element-type is exactly
;; (unsigned-byte 8).  Despite its name, DATA-TYPE is the object being
;; tested, not a type specifier.
(defun is-byte-array (data-type)
  (and (vectorp data-type)
       (equal '(unsigned-byte 8) (array-element-type data-type))))
62
+
63
;; Write the encoding of DATA to STREAM, dispatching on its Lisp type.
;; Clause order matters: floats are tested before other numbers, NIL and T
;; before symbols, strings and byte vectors before generic vectors.
;; NIL is written as #xc0 and T as #xc3; any other symbol is encoded by its
;; name.  Signals an error for objects matching none of the clauses.
(defun encode-stream (data stream)
  (cond ((floatp data) (encode-float data stream))
        ((numberp data) (encode-integer data stream))
        ((null data) (write-byte #xc0 stream))
        ((eq data t) (write-byte #xc3 stream))
        ((stringp data)
         (encode-string data stream))
        ((is-byte-array data)
         (encode-raw-bytes data stream))
        ((or (consp data) (vectorp data))
         (encode-array data stream))
        ((hash-table-p data)
         (encode-hash data stream))
        ((symbolp data)
         (encode-string (symbol-name data) stream))
        (t (error "Cannot encode data."))))
79
+
80
;; Encode the string DATA as raw bytes, converting it with
;; babel:string-to-octets first.
(defun encode-string (data stream)
  (encode-raw-bytes (babel:string-to-octets data) stream))
82
+
83
;; SBCL only: write the float DATA to STREAM as typecode #xca followed by 4
;; bytes (single-float) or #xcb followed by 8 bytes (double-float), big
;; endian.  Always returns T.
;;
;; The SBCL accessors single-float-bits and double-float-high-bits return
;; signed 32-bit integers, while store-big-endian only emits the bits of
;; positive numbers (it writes all zeros for anything <= 0).  The signed
;; values are therefore reduced to their unsigned 32-bit form first, so that
;; negative floats keep their sign and magnitude.
#+sbcl (defun sbcl-encode-float (data stream)
  (cond ((equal (type-of data) 'single-float)
         (write-byte #xca stream)
         (store-big-endian (ldb (byte 32 0) (sb-kernel:single-float-bits data))
                           stream 4))
        ((equal (type-of data) 'double-float)
         (write-byte #xcb stream)
         (store-big-endian (ldb (byte 32 0) (sb-kernel:double-float-high-bits data))
                           stream 4)
         (store-big-endian (sb-kernel:double-float-low-bits data) stream 4)))
  t)
92
+
93
;; Encode the float DATA to STREAM.  On SBCL this delegates to
;; sbcl-encode-float; on any other implementation it signals an error.
(defun encode-float (data stream)
  (or #+sbcl (sbcl-encode-float data stream)
      #-(or sbcl) (error "No floating point support yet.")))
96
+
97
;; Call ENCODER (default #'encode-stream) with STREAM on every element of
;; DATA: for a hash table on each key followed by its value, for a vector or
;; cons on each element in order.  Signals an error for any other input.
(defun encode-each (data stream &optional (encoder #'encode-stream))
  (cond ((hash-table-p data)
         (maphash (lambda (key value)
                    (funcall encoder key stream)
                    (funcall encoder value stream))
                  data))
        ((or (vectorp data) (consp data))
         (mapc (lambda (subdata)
                 (funcall encoder subdata stream))
               (coerce data 'list)))
        (t (error "Not sequence or hash table."))))
108
+
109
;; Write a length header for DATA (a sequence or hash table) to STREAM and
;; then encode its elements with ENCODER via encode-each.
;;
;; The header is one of:
;;   - a single byte SHORT-PREFIX + length, when length <= SHORT-LENGTH
;;   - TYPECODE-16 followed by a 2-byte big-endian length, up to 65535
;;   - TYPECODE-32 followed by a 4-byte big-endian length, up to 2^32 - 1
;; Longer input cannot be represented and signals an error instead of
;; silently writing nothing.
(defun encode-sequence (data stream
                        short-prefix short-length
                        typecode-16 typecode-32
                        &optional (encoder #'encode-stream))
  (let ((len (if (hash-table-p data)
                 (hash-table-count data)
                 (length data))))
    (cond ((<= 0 len short-length)
           (write-byte (+ short-prefix len) stream))
          ((<= 0 len 65535)
           (write-byte typecode-16 stream)
           (store-big-endian len stream 2))
          ((<= 0 len (1- (expt 2 32)))
           (write-byte typecode-32 stream)
           (store-big-endian len stream 4))
          (t (error "Sequence too long to encode.")))
    (encode-each data stream encoder)))
127
+
128
;; Encode the hash table DATA as a map: short form #x80 + count for up to 15
;; entries, otherwise typecode #xde (16-bit count) or #xdf (32-bit count).
;; These are the map typecodes decode-stream dispatches to decode-map;
;; #xdc/#xdd are the array typecodes and would be decoded as arrays.
(defun encode-hash (data stream)
  (encode-sequence data stream #x80 15 #xde #xdf))
130
+
131
;; Encode the list or vector DATA as an array: short form #x90 + length for
;; up to 15 elements, otherwise typecode #xdc (16-bit length) or #xdd
;; (32-bit length).
(defun encode-array (data stream)
  (encode-sequence data stream #x90 15 #xdc #xdd))
133
+
134
;; Encode the byte sequence DATA as raw bytes: short form #xa0 + length for
;; up to 31 bytes, otherwise typecode #xda (16-bit length) or #xdb (32-bit
;; length).  Each element is written verbatim with write-byte.
(defun encode-raw-bytes (data stream)
  (encode-sequence data stream #xa0 31 #xda #xdb #'write-byte))
136
+
137
;; Write the integer DATA to STREAM using the smallest representation that
;; fits:
;;   0..127 and -32..-1    a single byte
;;   other non-negatives   #xcc/#xcd/#xce/#xcf + 1/2/4/8 bytes, big endian
;;   other negatives       #xd0/#xd1/#xd2/#xd3 + 1/2/4/8 bytes, big endian
;; Signals an error when DATA fits neither 64 unsigned nor 64 signed bits.
;;
;; The signed 16/32/64-bit payloads are multi-byte values and therefore go
;; through store-big-endian; a single write-byte cannot carry them.
(defun encode-integer (data stream)
  (cond ((<= 0 data 127) (write-byte data stream))
        ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
        ((<= 0 data 255)
         (write-byte #xcc stream)
         (write-byte data stream))
        ((<= 0 data 65535)
         (write-byte #xcd stream)
         (store-big-endian data stream 2))
        ((<= 0 data (1- (expt 2 32)))
         (write-byte #xce stream)
         (store-big-endian data stream 4))
        ((<= 0 data (1- (expt 2 64)))
         (write-byte #xcf stream)
         (store-big-endian data stream 8))
        ((<= -128 data 127)
         (write-byte #xd0 stream)
         (write-byte (sb8->ub8 data) stream))
        ((<= -32768 data 32767)
         (write-byte #xd1 stream)
         (store-big-endian (sb16->ub16 data) stream 2))
        ((<= (- (expt 2 31)) data (1- (expt 2 31)))
         (write-byte #xd2 stream)
         (store-big-endian (sb32->ub32 data) stream 4))
        ((<= (- (expt 2 63)) data (1- (expt 2 63)))
         (write-byte #xd3 stream)
         (store-big-endian (sb64->ub64 data) stream 8))
        (t (error "Integer too large or too small."))))
165
+
166
;; Write NUMBER to STREAM as exactly BYTE-COUNT bytes, most significant
;; first, padding with leading zero bytes.  Signals an error when NUMBER
;; needs more than BYTE-COUNT bytes.  Only the bits of positive numbers are
;; emitted: zero and negative numbers come out as BYTE-COUNT zero bytes.
(defun store-big-endian (number stream byte-count)
  (let ((octets '()))
    ;; peel off the low byte until nothing is left; pushing reverses the
    ;; order, leaving the most significant byte first
    (loop until (<= number 0)
          do (push (rem number 256) octets)
             (setf number (ash number -8)))
    (when (> (length octets) byte-count)
      (error "Number too large."))
    (loop repeat (- byte-count (length octets))
          do (push 0 octets))
    (write-sequence octets stream)))
180
+
181
;; Decode one object from BYTE-ARRAY by reading it through
;; flexi-streams:with-input-from-sequence with decode-stream.
(defun decode (byte-array)
  (flexi-streams:with-input-from-sequence (stream byte-array)
    (decode-stream stream)))
184
+
185
;; Read one encoded object from STREAM and return it.
;;
;; The first byte selects the representation: small integers are stored in
;; the byte itself, the other typecodes are followed by a big-endian payload
;; and/or a length.  #xc0 and #xc2 both decode to NIL, #xc3 to T.
;; Floats are only supported on SBCL.  Signals an error for a type byte that
;; matches no clause, rather than returning NIL (which is indistinguishable
;; from a decoded #xc0).
;;
;; sb-kernel:make-single-float and the high word of
;; sb-kernel:make-double-float take signed 32-bit integers, so the unsigned
;; words read from the stream are converted with ub32->sb32 first.
(defun decode-stream (stream)
  (let ((byte (read-byte stream)))
    (cond ((= 0 (ldb (byte 1 7) byte))
           byte)
          ((= 7 (ldb (byte 3 5) byte))
           (ub8->sb8 byte))
          ((= #xcc byte)
           (read-byte stream))
          ((= #xcd byte)
           (load-big-endian stream 2))
          ((= #xce byte)
           (load-big-endian stream 4))
          ((= #xcf byte)
           (load-big-endian stream 8))
          ((= #xd0 byte)
           (ub8->sb8 (read-byte stream)))
          ((= #xd1 byte)
           (ub16->sb16 (load-big-endian stream 2)))
          ((= #xd2 byte)
           (ub32->sb32 (load-big-endian stream 4)))
          ((= #xd3 byte)
           (ub64->sb64 (load-big-endian stream 8)))
          ((= #xc0 byte)
           nil)
          ((= #xc3 byte)
           t)
          ((= #xc2 byte)
           nil)
          ((= #xca byte)
           (or #+sbcl (sb-kernel:make-single-float
                       (ub32->sb32 (load-big-endian stream 4)))
               #-(or sbcl) (error "No floating point support yet.")))
          ((= #xcb byte)
           ;; high word first (signed), then low word (unsigned)
           (or #+sbcl (sb-kernel:make-double-float
                       (ub32->sb32 (load-big-endian stream 4))
                       (load-big-endian stream 4))
               #-(or sbcl) (error "No floating point support yet.")))
          ((= 5 (ldb (byte 3 5) byte))
           (decode-raw-sequence (ldb (byte 5 0) byte) stream))
          ((= #xda byte)
           (decode-raw-sequence (load-big-endian stream 2) stream))
          ((= #xdb byte)
           (decode-raw-sequence (load-big-endian stream 4) stream))
          ((= 9 (ldb (byte 4 4) byte))
           (decode-array (- byte #x90) stream))
          ((= #xdc byte)
           (decode-array (load-big-endian stream 2) stream))
          ((= #xdd byte)
           (decode-array (load-big-endian stream 4) stream))
          ((= 8 (ldb (byte 4 4) byte))
           (decode-map (- byte #x80) stream))
          ((= #xde byte)
           (decode-map (load-big-endian stream 2) stream))
          ((= #xdf byte)
           (decode-map (load-big-endian stream 4) stream))
          (t (error "Cannot decode type byte #x~2,'0x." byte)))))
238
+
239
;; Read LENGTH key/value pairs from STREAM (key first, then value) and
;; return them in a new EQUAL hash table.
(defun decode-map (length stream)
  (let ((table (make-hash-table :test #'equal)))
    (dotimes (n length table)
      (declare (ignorable n))
      (let* ((key (decode-stream stream))
             (value (decode-stream stream)))
        (setf (gethash key table) value)))))
246
+
247
;; Read LENGTH objects from STREAM and return them, in order, in a new
;; vector of that length.
(defun decode-array (length stream)
  (loop with items = (make-array length)
        for index from 0 below length
        do (setf (aref items index) (decode-stream stream))
        finally (return items)))
252
+
253
;; Fill a LENGTH-element (mod 256) array from STREAM with read-sequence and
;; return it decoded to a string by babel:octets-to-string.
(defun decode-raw-sequence (length stream)
  (let ((seq (make-array length :element-type '(mod 256))))
    (read-sequence seq stream)
    (babel:octets-to-string seq)))
257
+
258
;; Read BYTE-COUNT bytes from STREAM and combine them, most significant byte
;; first, into a non-negative integer.  Returns 0 when BYTE-COUNT is 0.
(defun load-big-endian (stream byte-count)
  (let ((accumulator 0))
    (dotimes (n byte-count accumulator)
      (declare (ignorable n))
      (setf accumulator (+ (ash accumulator 8)
                           (read-byte stream))))))