charlock_holmes_heroku 0.6.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ require 'charlock_holmes/charlock_holmes'
2
+ require 'charlock_holmes/encoding_detector'
3
+ require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
+
5
+ # require this if you want the String monkey patches
6
+ # require 'charlock_holmes/string'
@@ -0,0 +1,33 @@
1
module CharlockHolmes
  class EncodingDetector
    # Predicate-style name for the strip_tags reader.
    # NOTE(review): strip_tags itself is not defined in this file; presumably
    # it comes from the native extension -- confirm.
    alias_method :strip_tags?, :strip_tags

    class << self
      # Detect the encoding of a string using a throwaway detector.
      #
      # NOTE: a new CharlockHolmes::EncodingDetector is built on every call.
      #
      # str      - the String whose encoding should be detected
      # hint_enc - an optional encoding name String (like "UTF-8") passed to
      #            the charset detector as an additional hint
      #
      # Returns: a Hash with :encoding, :language, :type and :confidence
      def detect(str, hint_enc = nil)
        new.detect(str, hint_enc)
      end

      # Detect every encoding that could match a string, using a throwaway
      # detector.
      #
      # NOTE: a new CharlockHolmes::EncodingDetector is built on every call.
      #
      # str      - the String whose encoding should be detected
      # hint_enc - an optional encoding name String (like "UTF-8") passed to
      #            the charset detector as an additional hint
      #
      # Returns: an Array with zero or more Hashes, each one of them with
      #          :encoding, :language, :type and :confidence
      def detect_all(str, hint_enc = nil)
        new.detect_all(str, hint_enc)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'charlock_holmes' unless defined? CharlockHolmes
2
+
3
class String
  # Attempt to detect the encoding of this string
  #
  # hint_enc - an optional encoding name String (like "UTF-8") handed to the
  #            detector as a hint
  #
  # Returns: a Hash with :encoding, :language, :type and :confidence
  def detect_encoding(hint_enc = nil)
    CharlockHolmes::EncodingDetector.new.detect(self, hint_enc)
  end

  # Attempt to detect the encoding of this string, and return
  # a list with all the possible encodings that match it.
  #
  # hint_enc - an optional encoding name String (like "UTF-8") handed to the
  #            detector as a hint
  #
  # Returns: an Array with zero or more Hashes,
  # each one of them with :encoding, :language, :type and :confidence
  def detect_encodings(hint_enc = nil)
    CharlockHolmes::EncodingDetector.new.detect_all(self, hint_enc)
  end

  # Feature-detect instead of sniffing RUBY_VERSION: the bang method only
  # needs String#force_encoding, so define it wherever that exists rather
  # than on 1.9.x alone.
  if method_defined?(:force_encoding)
    # Attempt to detect the encoding of this string
    # then set the encoding to what was detected, a la `force_encoding`
    #
    # The string is left untouched when nothing (or no :encoding) is detected.
    #
    # Returns: self
    def detect_encoding!(hint_enc = nil)
      detected = detect_encoding(hint_enc)
      force_encoding(detected[:encoding]) if detected && detected[:encoding]
      self
    end
  end
end
@@ -0,0 +1,3 @@
1
module CharlockHolmes
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.6.13".freeze
end
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
describe CharlockHolmes::Converter do
  # Pure ASCII text: the UTF-16 form must be larger and different, and
  # converting back must reproduce the original.
  test 'is able to convert regular ascii content from ISO-8859-1 to UTF-16, and back again' do
    original = 'test'

    utf16 = CharlockHolmes::Converter.convert(original, 'ISO-8859-1', 'UTF-16')
    assert original.bytesize < utf16.bytesize
    assert original != utf16

    restored = CharlockHolmes::Converter.convert(utf16, 'UTF-16', 'ISO-8859-1')
    assert original.bytesize == restored.bytesize
    assert original == restored
  end

  # Same round trip, starting from multi-byte UTF-8 content.
  test 'is able to convert UTF-8 content from UTF-8 to UTF-16, and back again' do
    original = 'λ, λ, λ'

    utf16 = CharlockHolmes::Converter.convert(original, 'UTF-8', 'UTF-16')
    assert original.bytesize < utf16.bytesize
    assert original != utf16

    restored = CharlockHolmes::Converter.convert(utf16, 'UTF-16', 'UTF-8')
    assert original.bytesize == restored.bytesize
    assert original == restored
  end
end
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
describe CharlockHolmes::EncodingDetector do
  before :all do
    @detector = CharlockHolmes::EncodingDetector.new
  end

  # The respond_to? checks below are wrapped in assert; a bare respond_to?
  # call discards its result and verifies nothing.

  test 'has a class-level detect method' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a class-level detect method that accepts an encoding hint' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect)
    detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a class-level detect_all method' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect_all)
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
    assert detected_list.is_a?(Array)

    encoding_list = detected_list.map { |d| d[:encoding] }.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a class-level detect_all method that accepts an encoding hint' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:detect_all)
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a?(Array)

    encoding_list = detected_list.map { |d| d[:encoding] }.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a detect method' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect method that accepts an encoding hint' do
    assert @detector.respond_to?(:detect)
    detected = @detector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  test 'has a detect_all method' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test'
    assert detected_list.is_a?(Array)

    encoding_list = detected_list.map { |d| d[:encoding] }.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  test 'has a detect_all method that accepts an encoding hint' do
    assert @detector.respond_to?(:detect_all)
    detected_list = @detector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a?(Array)

    encoding_list = detected_list.map { |d| d[:encoding] }.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  # Detection of the tagged sample is expected to be UTF-8 with the flag on
  # and with it off.
  test 'has a strip_tags flag' do
    detector = CharlockHolmes::EncodingDetector.new
    detector.strip_tags = true
    assert detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]

    detector.strip_tags = false
    assert !detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]
  end

  test 'has a list of supported encodings' do
    assert CharlockHolmes::EncodingDetector.respond_to?(:supported_encodings)
    supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings

    assert supported_encodings.is_a?(Array)
    assert supported_encodings.include?('UTF-8')
  end

  context 'encoding detection' do
    # Each row is [fixture file name, expected encoding (nil for binary), expected type].
    MAPPING = [
      ['repl2.cljs',                'ISO-8859-1', :text],
      ['core.rkt',                  'UTF-8',      :text],
      ['cl-messagepack.lisp',       'ISO-8859-1', :text],
      ['TwigExtensionsDate.es.yml', 'UTF-8',      :text],
      ['AnsiGraph.psm1',            'UTF-16LE',   :text],
      ['laholator.py',              'UTF-8',      :text],
      ['hello_world',               nil,          :binary]
    ]

    MAPPING.each do |file, encoding, type|
      test "#{file} should be detected as #{encoding || 'binary'}" do
        path = File.expand_path "../fixtures/#{file}", __FILE__
        content = File.read path
        guessed = @detector.detect content

        assert_equal encoding, guessed[:encoding]
        assert_equal type, guessed[:type]

        # Where strings carry an encoding, the guess must also yield validly
        # encoded text once applied.
        if content.respond_to?(:force_encoding) && guessed[:type] == :text
          content.force_encoding guessed[:encoding]
          assert content.valid_encoding?
        end
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ date.year: '%year% año|%year% años'
2
+ date.month: '%month% mes|%month% meses'
3
+ date.day: '%day% día|%day% días'
4
+ date.hour: '%hour% hora|%hour% horas'
5
+ date.minute: '%minute% minuto|%minute% minutos'
6
+ date.second: '%second% segundo|%second% segundos'
7
+ date.new: 'menos de un minuto'
8
+ date.and: ' y '
@@ -0,0 +1,264 @@
1
+ ;;;; cl-messagepack.lisp
2
+
3
+ (in-package #:messagepack)
4
+
5
+ (declaim (optimize (debug 3)))
6
+
7
;; Wrapped in EVAL-WHEN so the helpers already exist at compile time; the
;; SIGNED-UNSIGNED-CONVERTORS macro calls MKSYMB while it is being expanded.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (defun mkstr (&rest args)
    "Return one string holding the aesthetic (PRINC-style) output of each of ARGS, in order."
    (with-output-to-string (out)
      (dolist (arg args)
        (princ arg out))))
  (defun mksymb (&rest args)
    "Intern, in the current package, the symbol whose name is (mkstr ARGS...)."
    (let ((name (apply #'mkstr args)))
      (intern name))))
12
+
13
(defmacro signed-unsigned-convertors (size)
  "Define SB<size>->UB<size> and UB<size>->SB<size>, which convert between the
signed and unsigned (two's complement) readings of a SIZE-bit integer.
Both functions are compiled with safety 0; speed is 3 for sizes below 32 and
0 otherwise."
  (let ((speed (if (< size 32) 3 0))
        (to-unsigned (mksymb 'sb size '-> 'ub size))
        (to-signed (mksymb 'ub size '-> 'sb size))
        (modulus (expt 2 size))
        (half (expt 2 (1- size))))
    `(progn
       (defun ,to-unsigned (sb)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (integer ,(- half) ,(1- half)) sb))
         ;; Negative values are reduced to their low SIZE bits.
         (if (minusp sb)
             (ldb (byte ,size 0) sb)
             sb))
       (defun ,to-signed (ub)
         (declare (optimize (debug 0) (safety 0) (speed ,speed))
                  (type (mod ,modulus) ub))
         ;; A set top bit means the value stands for UB - 2^SIZE.
         (if (logbitp ,(1- size) ub)
             (- ub ,modulus)
             ub)))))
28
+
29
+ (signed-unsigned-convertors 8)
30
+ (signed-unsigned-convertors 16)
31
+ (signed-unsigned-convertors 32)
32
+ (signed-unsigned-convertors 64)
33
+
34
(defun write-hex (data)
  "Print the elements of the sequence DATA to standard output as two-digit
hexadecimal numbers, each followed by a space, sixteen per line.
Always returns NIL."
  (let ((remaining (coerce data 'list)))
    (loop while remaining
          do (let ((row (loop repeat 16
                              while remaining
                              collect (pop remaining))))
               (format t "~{~2,'0x ~}~%" row)))))
45
+
46
(defun encode (data)
  "Encode DATA with ENCODE-STREAM into an in-memory flexi-streams output
sequence and return what FLEXI-STREAMS:WITH-OUTPUT-TO-SEQUENCE yields."
  (flexi-streams:with-output-to-sequence (out)
    (encode-stream data out)))
49
+
50
(defun make-hash (data)
  "Build a hash table (default test) from DATA, a list of entries.
Each entry is either a proper pair (key value) or a dotted pair (key . value)."
  (let ((table (make-hash-table)))
    (loop for entry in data
          for tail = (cdr entry)
          do (setf (gethash (car entry) table)
                   ;; (key value) stores VALUE; (key . value) stores the cdr itself.
                   (if (consp tail)
                       (car tail)
                       tail)))
    table))
58
+
59
(defun is-byte-array (data-type)
  "True when DATA-TYPE (the object under test, despite the name) is a vector
whose ARRAY-ELEMENT-TYPE is exactly (UNSIGNED-BYTE 8)."
  (when (vectorp data-type)
    (equal (array-element-type data-type) '(unsigned-byte 8))))
62
+
63
(defun encode-stream (data stream)
  "Write the encoding of DATA to the binary STREAM, dispatching on its type.
Clause order matters: strings and byte arrays are vectors, so they must be
tested before the generic sequence clause, and NIL/T before the symbol clause."
  (typecase data
    (float (encode-float data stream))
    (number (encode-integer data stream))
    (null (write-byte #xc0 stream))
    ((eql t) (write-byte #xc3 stream))
    (string (encode-string data stream))
    ((satisfies is-byte-array) (encode-raw-bytes data stream))
    ((or cons vector) (encode-array data stream))
    (hash-table (encode-hash data stream))
    ;; Any other symbol is written as its name.
    (symbol (encode-string (symbol-name data) stream))
    (t (error "Cannot encode data."))))
79
+
80
(defun encode-string (data stream)
  "Convert the string DATA to octets with BABEL:STRING-TO-OCTETS and write
them to STREAM as a raw byte sequence."
  (let ((octets (babel:string-to-octets data)))
    (encode-raw-bytes octets stream)))
82
+
83
#+sbcl
(defun sbcl-encode-float (data stream)
  "SBCL only. Write the float DATA to STREAM: type byte #xca followed by 4
bytes for a single-float, or #xcb followed by the high and low 4-byte words
for a double-float. Other types write nothing. Always returns T."
  (flet ((store-word (bits)
           ;; SB-KERNEL:SINGLE-FLOAT-BITS and DOUBLE-FLOAT-HIGH-BITS return
           ;; signed 32-bit integers, so the word is negative for a negative
           ;; float. Reduce it to its unsigned 32-bit pattern before storing;
           ;; otherwise the sign and magnitude are lost on the wire.
           (store-big-endian (ldb (byte 32 0) bits) stream 4)))
    (cond ((typep data 'single-float)
           (write-byte #xca stream)
           (store-word (sb-kernel:single-float-bits data)))
          ((typep data 'double-float)
           (write-byte #xcb stream)
           (store-word (sb-kernel:double-float-high-bits data))
           (store-word (sb-kernel:double-float-low-bits data))))
    t))
92
+
93
(defun encode-float (data stream)
  "Write the float DATA to STREAM. Only SBCL is supported; every other
implementation signals an error."
  #+sbcl (sbcl-encode-float data stream)
  #-sbcl (error "No floating point support yet."))
96
+
97
(defun encode-each (data stream &optional (encoder #'encode-stream))
  "Call ENCODER with (element STREAM) for every element of DATA.
For a hash table each key is encoded immediately before its value; for a
vector or list the elements are encoded in order. Anything else signals an
error."
  (typecase data
    (hash-table
     (maphash (lambda (key value)
                (funcall encoder key stream)
                (funcall encoder value stream))
              data))
    ((or vector cons)
     (let ((items (coerce data 'list)))
       (dolist (item items)
         (funcall encoder item stream))
       ;; The element list is the return value.
       items))
    (t (error "Not sequence or hash table."))))
108
+
109
(defun encode-sequence (data stream
                        short-prefix short-length
                        typecode-16 typecode-32
                        &optional (encoder #'encode-stream))
  "Write a length header for DATA to STREAM, then its elements via ENCODER.
Lengths up to SHORT-LENGTH are folded into one byte (SHORT-PREFIX + length);
lengths up to 65535 use TYPECODE-16 and a 2-byte length; lengths below 2^32
use TYPECODE-32 and a 4-byte length. Longer data writes nothing and the
function returns NIL."
  (let ((len (if (hash-table-p data)
                 (hash-table-count data)
                 (length data))))
    (flet ((emit-elements ()
             (encode-each data stream encoder)))
      (cond ((<= 0 len short-length)
             (write-byte (+ short-prefix len) stream)
             (emit-elements))
            ((<= 0 len 65535)
             (write-byte typecode-16 stream)
             (store-big-endian len stream 2)
             (emit-elements))
            ((<= 0 len (1- (expt 2 32)))
             (write-byte typecode-32 stream)
             (store-big-endian len stream 4)
             (emit-elements))))))
127
+
128
(defun encode-hash (data stream)
  "Write the hash table DATA to STREAM as a map: short prefix #x80 for up to
15 entries, otherwise type code #xde (2-byte count) or #xdf (4-byte count)."
  ;; #xde/#xdf are the codes DECODE-STREAM reads as maps. #xdc/#xdd are the
  ;; array codes, so using them here made larger tables decode as arrays.
  (encode-sequence data stream #x80 15 #xde #xdf))
130
+
131
(defun encode-array (data stream)
  "Write the list or vector DATA to STREAM as an array: short prefix #x90
for up to 15 elements, otherwise type code #xdc or #xdd."
  (let ((short-prefix #x90)
        (short-max 15)
        (code-16 #xdc)
        (code-32 #xdd))
    (encode-sequence data stream short-prefix short-max code-16 code-32)))
133
+
134
(defun encode-raw-bytes (data stream)
  "Write the byte sequence DATA to STREAM as raw bytes: short prefix #xa0
for up to 31 bytes, otherwise type code #xda or #xdb. Each element is written
with WRITE-BYTE instead of being encoded recursively."
  (let ((short-prefix #xa0)
        (short-max 31)
        (code-16 #xda)
        (code-32 #xdb))
    (encode-sequence data stream short-prefix short-max code-16 code-32
                     #'write-byte)))
136
+
137
(defun encode-integer (data stream)
  "Write the integer DATA to STREAM using the smallest representation tried
in this order: single-byte positive (0..127), single-byte negative (-32..-1),
unsigned 8/16/32/64 bit (#xcc..#xcf), then signed 8/16/32/64 bit (#xd0..#xd3).
Signals an error when DATA fits none of them."
  (cond ((<= 0 data 127) (write-byte data stream))
        ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
        ((<= 0 data 255)
         (write-byte #xcc stream)
         (write-byte data stream))
        ((<= 0 data 65535)
         (write-byte #xcd stream)
         (store-big-endian data stream 2))
        ((<= 0 data (1- (expt 2 32)))
         (write-byte #xce stream)
         (store-big-endian data stream 4))
        ((<= 0 data (1- (expt 2 64)))
         (write-byte #xcf stream)
         (store-big-endian data stream 8))
        ;; Only negative values reach the signed clauses; non-negative ones
        ;; were all claimed above.
        ((<= -128 data 127)
         (write-byte #xd0 stream)
         (write-byte (sb8->ub8 data) stream))
        ;; The wider signed payloads span several bytes, so they are stored
        ;; big-endian after conversion to their unsigned bit pattern. A single
        ;; WRITE-BYTE cannot carry a 16/32/64-bit value, and DECODE-STREAM
        ;; reads these back with LOAD-BIG-ENDIAN 2/4/8.
        ((<= -32768 data 32767)
         (write-byte #xd1 stream)
         (store-big-endian (sb16->ub16 data) stream 2))
        ((<= (- (expt 2 31)) data (1- (expt 2 31)))
         (write-byte #xd2 stream)
         (store-big-endian (sb32->ub32 data) stream 4))
        ((<= (- (expt 2 63)) data (1- (expt 2 63)))
         (write-byte #xd3 stream)
         (store-big-endian (sb64->ub64 data) stream 8))
        (t (error "Integer too large or too small."))))
165
+
166
(defun store-big-endian (number stream byte-count)
  "Write the integer NUMBER to the binary STREAM as exactly BYTE-COUNT bytes,
most significant byte first, and return the list of bytes written.
Non-negative numbers must fit in BYTE-COUNT unsigned bytes. Negative numbers
are written in two's complement and must fit in BYTE-COUNT signed bytes
(previously any negative number silently produced all-zero bytes).
Signals an error when NUMBER does not fit."
  (let ((bits (* 8 byte-count)))
    (unless (<= (- (expt 2 (1- bits))) number (1- (expt 2 bits)))
      (error "Number too large."))
    (write-sequence
     ;; LDB on a negative integer yields its two's-complement bits.
     (loop for shift from (- bits 8) downto 0 by 8
           collect (ldb (byte 8 shift) number))
     stream)))
180
+
181
(defun decode (byte-array)
  "Decode and return the first value stored in BYTE-ARRAY by reading it
through an in-memory flexi-streams input stream."
  (flexi-streams:with-input-from-sequence (in byte-array)
    (decode-stream in)))
184
+
185
(defun decode-stream (stream)
  "Read one encoded value from the binary STREAM and return it.
The first byte selects the representation. #xc0 and #xc2 both decode to NIL
and #xc3 to T. Floats are supported on SBCL only. A leading byte matched by
no clause makes the function return NIL."
  (let ((byte (read-byte stream)))
    (cond ((= 0 (ldb (byte 1 7) byte))  ; top bit clear: the byte is the value
           byte)
          ((= 7 (ldb (byte 3 5) byte))  ; top three bits set: small negative integer
           (ub8->sb8 byte))
          ;; Unsigned integers of 1, 2, 4 and 8 bytes.
          ((= #xcc byte)
           (read-byte stream))
          ((= #xcd byte)
           (load-big-endian stream 2))
          ((= #xce byte)
           (load-big-endian stream 4))
          ((= #xcf byte)
           (load-big-endian stream 8))
          ;; Signed integers of 1, 2, 4 and 8 bytes.
          ((= #xd0 byte)
           (ub8->sb8 (read-byte stream)))
          ((= #xd1 byte)
           (ub16->sb16 (load-big-endian stream 2)))
          ((= #xd2 byte)
           (ub32->sb32 (load-big-endian stream 4)))
          ((= #xd3 byte)
           (ub64->sb64 (load-big-endian stream 8)))
          ((= #xc0 byte)
           nil)
          ((= #xc3 byte)
           t)
          ((= #xc2 byte)
           nil)
          ;; Floats. LOAD-BIG-ENDIAN returns an unsigned word, but SBCL's
          ;; float constructors take the sign-carrying word as a signed
          ;; 32-bit integer, so that word is converted first. Without this a
          ;; negative float could not be rebuilt.
          ((= #xca byte)
           (or #+sbcl (sb-kernel:make-single-float
                       (ub32->sb32 (load-big-endian stream 4)))
               #-(or sbcl) (error "No floating point support yet.")))
          ((= #xcb byte)
           ;; Arguments are evaluated left to right: high word, then low word.
           (or #+sbcl (sb-kernel:make-double-float
                       (ub32->sb32 (load-big-endian stream 4))
                       (load-big-endian stream 4))
               #-(or sbcl) (error "No floating point support yet.")))
          ;; Raw bytes: length in the low five bits, or in 2 or 4 bytes.
          ((= 5 (ldb (byte 3 5) byte))
           (decode-raw-sequence (ldb (byte 5 0) byte) stream))
          ((= #xda byte)
           (decode-raw-sequence (load-big-endian stream 2) stream))
          ((= #xdb byte)
           (decode-raw-sequence (load-big-endian stream 4) stream))
          ;; Arrays: length in the low four bits, or in 2 or 4 bytes.
          ((= 9 (ldb (byte 4 4) byte))
           (decode-array (- byte #x90) stream))
          ((= #xdc byte)
           (decode-array (load-big-endian stream 2) stream))
          ((= #xdd byte)
           (decode-array (load-big-endian stream 4) stream))
          ;; Maps: entry count in the low four bits, or in 2 or 4 bytes.
          ((= 8 (ldb (byte 4 4) byte))
           (decode-map (- byte #x80) stream))
          ((= #xde byte)
           (decode-map (load-big-endian stream 2) stream))
          ((= #xdf byte)
           (decode-map (load-big-endian stream 4) stream)))))
238
+
239
(defun decode-map (length stream)
  "Read LENGTH key/value pairs from STREAM (each key before its value) and
return them in a fresh EQUAL hash table."
  (let ((table (make-hash-table :test #'equal)))
    (dotimes (n length table)
      (declare (ignorable n))
      ;; LET* keeps the read order explicit: key first, then value.
      (let* ((key (decode-stream stream))
             (value (decode-stream stream)))
        (setf (gethash key table) value)))))
246
+
247
(defun decode-array (length stream)
  "Read LENGTH values from STREAM and return them, in order, in a fresh
general vector."
  (let ((result (make-array length)))
    (loop for index from 0 below length
          do (setf (aref result index) (decode-stream stream)))
    result))
252
+
253
(defun decode-raw-sequence (length stream)
  "Read LENGTH bytes from STREAM into a fresh octet buffer and return the
string BABEL:OCTETS-TO-STRING makes of it."
  (let ((octets (make-array length :element-type '(mod 256))))
    (read-sequence octets stream)
    (babel:octets-to-string octets)))
257
+
258
(defun load-big-endian (stream byte-count)
  "Read BYTE-COUNT bytes from the binary STREAM and combine them, most
significant first, into one non-negative integer. Returns 0 when BYTE-COUNT
is 0."
  (let ((value 0))
    (dotimes (n byte-count value)
      (declare (ignorable n))
      ;; Shift what has been read so far up one byte and append the next.
      (setf value (+ (ash value 8)
                     (read-byte stream))))))