charlock_holmes_heroku 0.6.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
@@ -0,0 +1,33 @@
1
module CharlockHolmes
  class EncodingDetector
    # Predicate-style name for the existing strip_tags reader.
    # NOTE(review): strip_tags itself is not defined in this file; it is
    # presumably provided by the native extension - confirm.
    alias_method :strip_tags?, :strip_tags

    class << self
      # Attempt to detect the encoding of a string.
      #
      # NOTE: A new CharlockHolmes::EncodingDetector instance is created on
      # every call.
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which
      #            will be used as an additional hint to the charset detector
      #
      # Returns: a Hash with :encoding, :language, :type and :confidence
      def detect(str, hint_enc=nil)
        new.detect(str, hint_enc)
      end

      # Attempt to detect the encoding of a string, returning every
      # candidate encoding that matches it.
      #
      # NOTE: A new CharlockHolmes::EncodingDetector instance is created on
      # every call.
      #
      # str      - a String, what you want to detect the encoding of
      # hint_enc - an optional String (like "UTF-8"), the encoding name which
      #            will be used as an additional hint to the charset detector
      #
      # Returns: an Array with zero or more Hashes,
      # each one of them with :encoding, :language, :type and :confidence
      def detect_all(str, hint_enc=nil)
        new.detect_all(str, hint_enc)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string
  #
  # hint_enc - an optional String (like "UTF-8"), the encoding name which will
  #            be used as an additional hint to the charset detector
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc=nil)
    detector = CharlockHolmes::EncodingDetector.new
    detector.detect(self, hint_enc)
  end

  # Attempt to detect the encoding of this string, and return
  # a list with all the possible encodings that match it.
  #
  # hint_enc - an optional String (like "UTF-8"), the encoding name which will
  #            be used as an additional hint to the charset detector
  #
  # Returns: an Array with zero or more Hashes,
  # each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc=nil)
    detector = CharlockHolmes::EncodingDetector.new
    detector.detect_all(self, hint_enc)
  end

  # Feature-detect String#force_encoding instead of pattern-matching
  # RUBY_VERSION: a version check for "1.9" leaves detect_encoding! undefined
  # on every later Ruby that still supports force_encoding.
  if method_defined?(:force_encoding)
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected ala `force_encoding`
    #
    # The string is left untouched when detection yields nothing or no
    # :encoding entry.
    #
    # hint_enc - an optional String (like "UTF-8"), the encoding name which
    #            will be used as an additional hint to the charset detector
    #
    # Returns: self
    def detect_encoding!(hint_enc=nil)
      detected = detect_encoding(hint_enc)
      force_encoding(detected[:encoding]) if detected && detected[:encoding]
      self
    end
  end
end
@@ -0,0 +1,3 @@
1
module CharlockHolmes
  # Version string of this release.
  VERSION = '0.6.13'
end
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
describe CharlockHolmes::Converter do
  # Round-trip cases: label used in the test name, the sample text, and the
  # encoding the sample is written in.
  [
    ['regular ascii', 'test',    'ISO-8859-1'],
    ['UTF-8',         'λ, λ, λ', 'UTF-8']
  ].each do |label, input, source_encoding|
    test "is able to convert #{label} content from #{source_encoding} to UTF-16, and back again" do
      # Converting to UTF-16 must change the bytes and make the string longer.
      widened = CharlockHolmes::Converter.convert input, source_encoding, 'UTF-16'
      assert input.bytesize < widened.bytesize
      assert input != widened

      # Converting back must reproduce the original content exactly.
      restored = CharlockHolmes::Converter.convert widened, 'UTF-16', source_encoding
      assert input.bytesize == restored.bytesize
      assert input == restored
    end
  end
end
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
# Specs for CharlockHolmes::EncodingDetector: class-level and instance-level
# detect / detect_all, the strip_tags flag, the supported encodings list and
# detection against the fixture files.
#
# Every respond_to? check is wrapped in assert; as a bare expression its
# result would be discarded and the test could never fail on a missing method.
describe CharlockHolmes::EncodingDetector do
  before :all do
    @detector = CharlockHolmes::EncodingDetector.new
  end

  test 'has a class-level detect method' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a class-level detect method that accepts an encoding hint' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a class-level detect_all method' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect_all)
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a class-level detect_all method that accepts an encoding hint' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect_all)
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a detect method' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect method that accepts an encoding hint' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect_all method' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a detect_all method that accepts an encoding hint' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a strip_tags flag' do
    detector = CharlockHolmes::EncodingDetector.new
    detector.strip_tags = true
    assert detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]

    detector.strip_tags = false
    assert !detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]
  end

  test 'has a list of supported encodings' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:supported_encodings)
    supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings

    assert supported_encodings.is_a?(Array)
    assert supported_encodings.include? 'UTF-8'
  end

  context 'encoding detection' do
    # Each entry: fixture file name, expected :encoding (nil for binary
    # content) and expected :type.
    MAPPING = [
      ['repl2.cljs',                'ISO-8859-1', :text],
      ['core.rkt',                  'UTF-8',      :text],
      ['cl-messagepack.lisp',       'ISO-8859-1', :text],
      ['TwigExtensionsDate.es.yml', 'UTF-8',      :text],
      ['AnsiGraph.psm1',            'UTF-16LE',   :text],
      ['laholator.py',              'UTF-8',      :text],
      ['hello_world',               nil,          :binary]
    ]

    MAPPING.each do |mapping|
      file, encoding, type = mapping

      test "#{file} should be detected as #{encoding || 'binary'}" do
        path = File.expand_path "../fixtures/#{file}", __FILE__
        content = File.read path
        guessed = @detector.detect content

        assert_equal encoding, guessed[:encoding]
        assert_equal type, guessed[:type]

        # Where String#force_encoding exists, the guessed encoding must also
        # be a valid interpretation of the fixture's bytes.
        if content.respond_to?(:force_encoding) && guessed[:type] == :text
          content.force_encoding guessed[:encoding]
          assert content.valid_encoding?
        end
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ date.year: '%year% año|%year% años'
2
+ date.month: '%month% mes|%month% meses'
3
+ date.day: '%day% día|%day% días'
4
+ date.hour: '%hour% hora|%hour% horas'
5
+ date.minute: '%minute% minuto|%minute% minutos'
6
+ date.second: '%second% segundo|%second% segundos'
7
+ date.new: 'menos de un minuto'
8
+ date.and: ' y '
@@ -0,0 +1,264 @@
1
+ ;;;; cl-messagepack.lisp
2
+
3
+ (in-package #:messagepack)
4
+
5
+ (declaim (optimize (debug 3)))
6
+
7
+ (eval-when (:compile-toplevel :load-toplevel :execute)
8
+ (defun mkstr (&rest args)
9
+ (format nil "~{~a~}" args))
10
+ (defun mksymb (&rest args)
11
+ (intern (apply #'mkstr args))))
12
+
13
+ (defmacro signed-unsigned-convertors (size)
14
+ (let ((speed (if (< size 32) 3 0)))
15
+ `(progn
16
+ (defun ,(mksymb 'sb size '-> 'ub size) (sb)
17
+ (declare (optimize (debug 0) (safety 0) (speed ,speed))
18
+ (type (integer ,(- (expt 2 (1- size))) ,(1- (expt 2 (1- size)))) sb))
19
+ (if (< sb 0)
20
+ (ldb (byte ,size 0) sb)
21
+ sb))
22
+ (defun ,(mksymb 'ub size '-> 'sb size) (sb)
23
+ (declare (optimize (debug 0) (safety 0) (speed ,speed))
24
+ (type (mod ,(expt 2 size)) sb))
25
+ (if (logbitp (1- ,size) sb)
26
+ (- (1+ (logxor (1- (expt 2 ,size)) sb)))
27
+ sb)))))
28
+
29
+ (signed-unsigned-convertors 8)
30
+ (signed-unsigned-convertors 16)
31
+ (signed-unsigned-convertors 32)
32
+ (signed-unsigned-convertors 64)
33
+
34
+ (defun write-hex (data)
35
+ (let (line)
36
+ (loop
37
+ for i from 0 to (1- (length data))
38
+ do (push (elt data i) line)
39
+ when (= (length line) 16)
40
+ do
41
+ (format t "~{~2,'0x ~}~%" (nreverse line))
42
+ (setf line nil))
43
+ (when line
44
+ (format t "~{~2,'0x ~}~%" (nreverse line)))))
45
+
46
+ (defun encode (data)
47
+ (flexi-streams:with-output-to-sequence (stream)
48
+ (encode-stream data stream)))
49
+
50
+ (defun make-hash (data)
51
+ (let ((result (make-hash-table)))
52
+ (dolist (kv data)
53
+ (cond ((consp (cdr kv))
54
+ (setf (gethash (first kv) result) (second kv)))
55
+ (t
56
+ (setf (gethash (car kv) result) (cdr kv)))))
57
+ result))
58
+
59
+ (defun is-byte-array (data-type)
60
+ (and (vectorp data-type)
61
+ (equal '(unsigned-byte 8) (array-element-type data-type))))
62
+
63
+ (defun encode-stream (data stream)
64
+ (cond ((floatp data) (encode-float data stream))
65
+ ((numberp data) (encode-integer data stream))
66
+ ((null data) (write-byte #xc0 stream))
67
+ ((eq data t) (write-byte #xc3 stream))
68
+ ((stringp data)
69
+ (encode-string data stream))
70
+ ((is-byte-array data)
71
+ (encode-raw-bytes data stream))
72
+ ((or (consp data) (vectorp data))
73
+ (encode-array data stream))
74
+ ((hash-table-p data)
75
+ (encode-hash data stream))
76
+ ((symbolp data)
77
+ (encode-string (symbol-name data) stream))
78
+ (t (error "Cannot encode data."))))
79
+
80
+ (defun encode-string (data stream)
81
+ (encode-raw-bytes (babel:string-to-octets data) stream))
82
+
83
+ #+sbcl (defun sbcl-encode-float (data stream)
84
+ (cond ((equal (type-of data) 'single-float)
85
+ (write-byte #xca stream)
86
+ (store-big-endian (sb-kernel:single-float-bits data) stream 4))
87
+ ((equal (type-of data) 'double-float)
88
+ (write-byte #xcb stream)
89
+ (store-big-endian (sb-kernel:double-float-high-bits data) stream 4)
90
+ (store-big-endian (sb-kernel:double-float-low-bits data) stream 4)))
91
+ t)
92
+
93
+ (defun encode-float (data stream)
94
+ (or #+sbcl (sbcl-encode-float data stream)
95
+ #-(or sbcl) (error "No floating point support yet.")))
96
+
97
+ (defun encode-each (data stream &optional (encoder #'encode-stream))
98
+ (cond ((hash-table-p data)
99
+ (maphash (lambda (key value)
100
+ (funcall encoder key stream)
101
+ (funcall encoder value stream))
102
+ data))
103
+ ((or (vectorp data) (consp data))
104
+ (mapc (lambda (subdata)
105
+ (funcall encoder subdata stream))
106
+ (coerce data 'list)))
107
+ (t (error "Not sequence or hash table."))))
108
+
109
+ (defun encode-sequence (data stream
110
+ short-prefix short-length
111
+ typecode-16 typecode-32
112
+ &optional (encoder #'encode-stream))
113
+ (let ((len (if (hash-table-p data)
114
+ (hash-table-count data)
115
+ (length data))))
116
+ (cond ((<= 0 len short-length)
117
+ (write-byte (+ short-prefix len) stream)
118
+ (encode-each data stream encoder))
119
+ ((<= 0 len 65535)
120
+ (write-byte typecode-16 stream)
121
+ (store-big-endian len stream 2)
122
+ (encode-each data stream encoder))
123
+ ((<= 0 len (1- (expt 2 32)))
124
+ (write-byte typecode-32 stream)
125
+ (store-big-endian len stream 4)
126
+ (encode-each data stream encoder)))))
127
+
128
(defun encode-hash (data stream)
  "Encode the hash table DATA onto STREAM as a MessagePack map."
  ;; #x80 is the short-map prefix (up to 15 entries); #xde / #xdf are the
  ;; 16-bit and 32-bit map typecodes that DECODE-STREAM hands to DECODE-MAP.
  ;; The array typecodes #xdc / #xdd must not be used here: DECODE-STREAM
  ;; dispatches those to DECODE-ARRAY, so larger maps would not round-trip.
  (encode-sequence data stream #x80 15 #xde #xdf))
130
+
131
+ (defun encode-array (data stream)
132
+ (encode-sequence data stream #x90 15 #xdc #xdd))
133
+
134
+ (defun encode-raw-bytes (data stream)
135
+ (encode-sequence data stream #xa0 31 #xda #xdb #'write-byte))
136
+
137
(defun encode-integer (data stream)
  "Encode the integer DATA onto STREAM using the smallest MessagePack
integer representation that fits. Signals an error when DATA lies outside
the range from -(2^63) to 2^64 - 1."
  (cond ((<= 0 data 127) (write-byte data stream))
        ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
        ((<= 0 data 255)
         (write-byte #xcc stream)
         (write-byte data stream))
        ((<= 0 data 65535)
         (write-byte #xcd stream)
         (store-big-endian data stream 2))
        ((<= 0 data (1- (expt 2 32)))
         (write-byte #xce stream)
         (store-big-endian data stream 4))
        ((<= 0 data (1- (expt 2 64)))
         (write-byte #xcf stream)
         (store-big-endian data stream 8))
        ((<= -128 data 127)
         (write-byte #xd0 stream)
         (write-byte (sb8->ub8 data) stream))
        ;; The signed 16/32/64-bit payloads span several octets, so they go
        ;; through STORE-BIG-ENDIAN; a single WRITE-BYTE cannot emit a value
        ;; wider than one octet. DECODE-STREAM reads them back with
        ;; LOAD-BIG-ENDIAN using the same byte counts.
        ((<= -32768 data 32767)
         (write-byte #xd1 stream)
         (store-big-endian (sb16->ub16 data) stream 2))
        ((<= (- (expt 2 31)) data (1- (expt 2 31)))
         (write-byte #xd2 stream)
         (store-big-endian (sb32->ub32 data) stream 4))
        ((<= (- (expt 2 63)) data (1- (expt 2 63)))
         (write-byte #xd3 stream)
         (store-big-endian (sb64->ub64 data) stream 8))
        (t (error "Integer too large or too small."))))
165
+
166
+ (defun store-big-endian (number stream byte-count)
167
+ (let (byte-list)
168
+ (loop
169
+ while (> number 0)
170
+ do
171
+ (push (rem number 256)
172
+ byte-list)
173
+ (setf number (ash number -8)))
174
+ (loop
175
+ while (< (length byte-list) byte-count)
176
+ do (push 0 byte-list))
177
+ (when (> (length byte-list) byte-count)
178
+ (error "Number too large."))
179
+ (write-sequence byte-list stream)))
180
+
181
+ (defun decode (byte-array)
182
+ (flexi-streams:with-input-from-sequence (stream byte-array)
183
+ (decode-stream stream)))
184
+
185
+ (defun decode-stream (stream)
186
+ (let ((byte (read-byte stream)))
187
+ (cond ((= 0 (ldb (byte 1 7) byte))
188
+ byte)
189
+ ((= 7 (ldb (byte 3 5) byte))
190
+ (ub8->sb8 byte))
191
+ ((= #xcc byte)
192
+ (read-byte stream))
193
+ ((= #xcd byte)
194
+ (load-big-endian stream 2))
195
+ ((= #xce byte)
196
+ (load-big-endian stream 4))
197
+ ((= #xcf byte)
198
+ (load-big-endian stream 8))
199
+ ((= #xd0 byte)
200
+ (ub8->sb8 (read-byte stream)))
201
+ ((= #xd1 byte)
202
+ (ub16->sb16 (load-big-endian stream 2)))
203
+ ((= #xd2 byte)
204
+ (ub32->sb32 (load-big-endian stream 4)))
205
+ ((= #xd3 byte)
206
+ (ub64->sb64 (load-big-endian stream 8)))
207
+ ((= #xc0 byte)
208
+ nil)
209
+ ((= #xc3 byte)
210
+ t)
211
+ ((= #xc2 byte)
212
+ nil)
213
+ ((= #xca byte)
214
+ (or #+sbcl (sb-kernel:make-single-float (load-big-endian stream 4))
215
+ #-(or sbcl) (error "No floating point support yet.")))
216
+ ((= #xcb byte)
217
+ (or #+sbcl (sb-kernel:make-double-float (load-big-endian stream 4)
218
+ (load-big-endian stream 4))
219
+ #-(or sbcl) (error "No floating point support yet.")))
220
+ ((= 5 (ldb (byte 3 5) byte))
221
+ (decode-raw-sequence (ldb (byte 5 0) byte) stream))
222
+ ((= #xda byte)
223
+ (decode-raw-sequence (load-big-endian stream 2) stream))
224
+ ((= #xdb byte)
225
+ (decode-raw-sequence (load-big-endian stream 4) stream))
226
+ ((= 9 (ldb (byte 4 4) byte))
227
+ (decode-array (- byte #x90) stream))
228
+ ((= #xdc byte)
229
+ (decode-array (load-big-endian stream 2) stream))
230
+ ((= #xdd byte)
231
+ (decode-array (load-big-endian stream 4) stream))
232
+ ((= 8 (ldb (byte 4 4) byte))
233
+ (decode-map (- byte #x80) stream))
234
+ ((= #xde byte)
235
+ (decode-map (load-big-endian stream 2) stream))
236
+ ((= #xdf byte)
237
+ (decode-map (load-big-endian stream 4) stream)))))
238
+
239
+ (defun decode-map (length stream)
240
+ (let ((hash-table (make-hash-table :test #'equal)))
241
+ (loop repeat length
242
+ do (let ((key (decode-stream stream))
243
+ (value (decode-stream stream)))
244
+ (setf (gethash key hash-table) value)))
245
+ hash-table))
246
+
247
+ (defun decode-array (length stream)
248
+ (let ((array (make-array length)))
249
+ (dotimes (i length)
250
+ (setf (aref array i) (decode-stream stream)))
251
+ array))
252
+
253
+ (defun decode-raw-sequence (length stream)
254
+ (let ((seq (make-array length :element-type '(mod 256))))
255
+ (read-sequence seq stream)
256
+ (babel:octets-to-string seq)))
257
+
258
+ (defun load-big-endian (stream byte-count)
259
+ (let ((result 0))
260
+ (loop
261
+ repeat byte-count
262
+ do (setf result (+ (ash result 8)
263
+ (read-byte stream))))
264
+ result))