charlock_holmes 0.7.3 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,27 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
4
-
5
- Gem::Specification.new do |s|
6
- s.name = %q{charlock_holmes}
7
- s.license = "MIT"
8
- s.version = CharlockHolmes::VERSION
9
- s.authors = ["Brian Lopez", "Vicent Martí"]
10
- s.date = Time.now.utc.strftime("%Y-%m-%d")
11
- s.email = %q{seniorlopez@gmail.com}
12
- s.extensions = ["ext/charlock_holmes/extconf.rb"]
13
- s.files = `git ls-files`.split("\n")
14
- s.homepage = %q{https://github.com/brianmario/charlock_holmes}
15
- s.rdoc_options = ["--charset=UTF-8"]
16
- s.require_paths = ["lib"]
17
- s.rubygems_version = %q{1.4.2}
18
- s.summary = %q{Character encoding detection, brought to you by ICU}
19
- s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu"
20
- s.test_files = `git ls-files spec`.split("\n")
21
-
22
- # tests
23
- s.add_development_dependency 'rake-compiler', ">= 0.7.5"
24
- s.add_development_dependency 'minitest'
25
- # benchmarks
26
- s.add_development_dependency 'chardet'
27
- end
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path("../helper", __FILE__)
3
-
4
- class ConverterTest < MiniTest::Test
5
- def test_convert_ascii_from_iso859_1_to_utf16_and_back
6
- input = 'test'
7
-
8
- output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
9
- assert input.bytesize < output.bytesize
10
- assert input != output
11
-
12
- output = CharlockHolmes::Converter.convert output, 'UTF-16', 'ISO-8859-1'
13
- assert input.bytesize == output.bytesize
14
- assert input == output
15
- end
16
-
17
- def test_convert_utf8_to_utf16_and_back
18
- input = 'λ, λ, λ'
19
-
20
- output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
21
- assert input.bytesize < output.bytesize
22
- assert input != output
23
-
24
- output = CharlockHolmes::Converter.convert output, 'UTF-16', 'UTF-8'
25
- assert input.bytesize == output.bytesize
26
- assert input == output
27
- end
28
-
29
- def test_params_must_be_strings
30
- assert_raises TypeError do
31
- CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16'
32
- end
33
-
34
- assert_raises TypeError do
35
- CharlockHolmes::Converter.convert 'lol', nil, 'UTF-16'
36
- end
37
-
38
- assert_raises TypeError do
39
- CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil
40
- end
41
-
42
- begin
43
- CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16'
44
- rescue Exception => e
45
- assert_nil e, "#{e.class.name} raised, expected nothing"
46
- end
47
- end
48
- end
@@ -1,145 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path("../helper", __FILE__)
3
-
4
- class EncodingDetectorTest < MiniTest::Test
5
- def setup
6
- @detector = CharlockHolmes::EncodingDetector.new
7
- end
8
-
9
- def test_has_class_level_detect_method
10
- CharlockHolmes::EncodingDetector.respond_to? :detect
11
- detected = CharlockHolmes::EncodingDetector.detect 'test'
12
- assert_equal 'ISO-8859-1', detected[:encoding]
13
- end
14
-
15
- def test_class_level_detect_accepts_encoding_hint
16
- CharlockHolmes::EncodingDetector.respond_to? :detect
17
- detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
18
- assert_equal 'ISO-8859-1', detected[:encoding]
19
- end
20
-
21
- def test_has_class_level_detect_all_method
22
- CharlockHolmes::EncodingDetector.respond_to? :detect_all
23
- detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
24
- assert detected_list.is_a? Array
25
-
26
- encoding_list = detected_list.map {|d| d[:encoding]}.sort
27
- assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
28
- end
29
-
30
- def test_class_level_detect_all_method_accepts_encoding_hint
31
- CharlockHolmes::EncodingDetector.respond_to? :detect_all
32
- detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
33
- assert detected_list.is_a? Array
34
-
35
- encoding_list = detected_list.map {|d| d[:encoding]}.sort
36
- assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
37
- end
38
-
39
- def test_has_detect_method
40
- @detector.respond_to? :detect
41
- detected = @detector.detect 'test'
42
- assert_equal 'ISO-8859-1', detected[:encoding]
43
- end
44
-
45
- def test_detect_accepts_encoding_hint
46
- @detector.respond_to? :detect
47
- detected = @detector.detect 'test', 'UTF-8'
48
- assert_equal 'ISO-8859-1', detected[:encoding]
49
- end
50
-
51
- def test_has_detect_all_method
52
- @detector.respond_to? :detect_all
53
- detected_list = @detector.detect_all 'test'
54
- assert detected_list.is_a? Array
55
-
56
- encoding_list = detected_list.map {|d| d[:encoding]}.sort
57
- assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
58
- end
59
-
60
- def test_detect_all_accepts_encoding_hint
61
- @detector.respond_to? :detect_all
62
- detected_list = @detector.detect_all 'test', 'UTF-8'
63
- assert detected_list.is_a? Array
64
-
65
- encoding_list = detected_list.map {|d| d[:encoding]}.sort
66
- assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
67
- end
68
-
69
- def test_strip_tags_flag
70
- detector = CharlockHolmes::EncodingDetector.new
71
- detector.strip_tags = true
72
- assert detector.strip_tags
73
-
74
- detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
75
- assert_equal 'UTF-8', detection[:encoding]
76
-
77
- detector.strip_tags = false
78
- assert !detector.strip_tags
79
-
80
- detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
81
- assert_equal 'UTF-8', detection[:encoding]
82
- end
83
-
84
- def test_has_list_of_supported_encodings
85
- CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
86
- supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
87
-
88
- assert supported_encodings.is_a?(Array)
89
- assert supported_encodings.include? 'UTF-8'
90
- assert supported_encodings.include? 'windows-1250'
91
- assert supported_encodings.include? 'windows-1252'
92
- assert supported_encodings.include? 'windows-1253'
93
- assert supported_encodings.include? 'windows-1254'
94
- assert supported_encodings.include? 'windows-1255'
95
- end
96
-
97
- def test_returns_a_ruby_compatible_encoding_name
98
- detected = @detector.detect 'test'
99
- assert_equal 'ISO-8859-1', detected[:encoding]
100
- assert_equal 'ISO-8859-1', detected[:ruby_encoding]
101
-
102
- not_compat_txt = fixture("ISO-2022-KR.txt").read
103
- detected = @detector.detect not_compat_txt
104
- assert_equal 'ISO-2022-KR', detected[:encoding]
105
- assert_equal 'binary', detected[:ruby_encoding]
106
- end
107
-
108
- MAPPING = [
109
- ['repl2.cljs', 'ISO-8859-1', :text],
110
- ['cl-messagepack.lisp', 'ISO-8859-1', :text],
111
- ['sierpinski.ps', 'ISO-8859-1', :text],
112
- ['core.rkt', 'UTF-8', :text],
113
- ['TwigExtensionsDate.es.yml', 'UTF-8', :text],
114
- ['laholator.py', 'UTF-8', :text],
115
- ['vimrc', 'UTF-8', :text],
116
- ['AnsiGraph.psm1', 'UTF-16LE', :text],
117
- ['utf16be.html', 'UTF-16BE', :text],
118
- ['utf32le.html', 'UTF-32LE', :text],
119
- ['utf32be.html', 'UTF-32BE', :text],
120
- ['hello_world', nil, :binary],
121
- ['octocat.png', nil, :binary],
122
- ['octocat.jpg', nil, :binary],
123
- ['octocat.psd', nil, :binary],
124
- ['octocat.gif', nil, :binary],
125
- ['octocat.ai', nil, :binary],
126
- ['foo.pdf', nil, :binary],
127
- ]
128
-
129
- def test_detection_works_as_expected
130
- MAPPING.each do |mapping|
131
- file, encoding, type = mapping
132
-
133
- content = fixture(file).read
134
- guessed = @detector.detect content
135
-
136
- assert_equal encoding, guessed[:encoding]
137
- assert_equal type, guessed[:type]
138
-
139
- if content.respond_to?(:force_encoding) && guessed[:type] == :text
140
- content.force_encoding guessed[:encoding]
141
- assert content.valid_encoding?
142
- end
143
- end
144
- end
145
- end
Binary file
@@ -1,43 +0,0 @@
1
- $)C#
2
- # Out-AnsiGraph.psm1
3
- # Author: xcud
4
- # History:
5
- # v0.1 September 21, 2009 initial version
6
- #
7
- # PS Example> ps | select -first 5 | sort -property VM |
8
- # Out-AnsiGraph ProcessName, VM
9
- # AEADISRV  14508032
10
- # audiodg  50757632
11
- # conhost  73740288
12
- # AppleMobileDeviceService  92061696
13
- # btdna  126443520
14
- #
15
- function Out-AnsiGraph($Parameter1=$null) {
16
- BEGIN {
17
- $q = new-object Collections.queue
18
- $max = 0; $namewidth = 0;
19
- }
20
-
21
- PROCESS {
22
- if($_) {
23
- $name = $_.($Parameter1[0]);
24
- $val = $_.($Parameter1[1])
25
- if($max -lt $val) { $max = $val}
26
- if($namewidth -lt $name.length) {
27
- $namewidth = $name.length }
28
- $q.enqueue(@($name, $val))
29
- }
30
- }
31
-
32
- END {
33
- $q | %{
34
- $graph = ""; 0..($_[1]/$max*20) |
35
- %{ $graph += "" }
36
- $name = "{0,$namewidth}" -f $_[0]
37
- "$name $graph " + $_[1]
38
- }
39
-
40
- }
41
- }
42
-
43
- Export-ModuleMember Out-AnsiGraph
@@ -1,8 +0,0 @@
1
- date.year: '%year% año|%year% años'
2
- date.month: '%month% mes|%month% meses'
3
- date.day: '%day% día|%day% días'
4
- date.hour: '%hour% hora|%hour% horas'
5
- date.minute: '%minute% minuto|%minute% minutos'
6
- date.second: '%second% segundo|%second% segundos'
7
- date.new: 'menos de un minuto'
8
- date.and: ' y '
@@ -1,264 +0,0 @@
1
- ;;;; cl-messagepack.lisp
2
-
3
- (in-package #:messagepack)
4
-
5
- (declaim (optimize (debug 3)))
6
-
7
- (eval-when (:compile-toplevel :load-toplevel :execute)
8
- (defun mkstr (&rest args)
9
- (format nil "~{~a~}" args))
10
- (defun mksymb (&rest args)
11
- (intern (apply #'mkstr args))))
12
-
13
- (defmacro signed-unsigned-convertors (size)
14
- (let ((speed (if (< size 32) 3 0)))
15
- `(progn
16
- (defun ,(mksymb 'sb size '-> 'ub size) (sb)
17
- (declare (optimize (debug 0) (safety 0) (speed ,speed))
18
- (type (integer ,(- (expt 2 (1- size))) ,(1- (expt 2 (1- size)))) sb))
19
- (if (< sb 0)
20
- (ldb (byte ,size 0) sb)
21
- sb))
22
- (defun ,(mksymb 'ub size '-> 'sb size) (sb)
23
- (declare (optimize (debug 0) (safety 0) (speed ,speed))
24
- (type (mod ,(expt 2 size)) sb))
25
- (if (logbitp (1- ,size) sb)
26
- (- (1+ (logxor (1- (expt 2 ,size)) sb)))
27
- sb)))))
28
-
29
- (signed-unsigned-convertors 8)
30
- (signed-unsigned-convertors 16)
31
- (signed-unsigned-convertors 32)
32
- (signed-unsigned-convertors 64)
33
-
34
- (defun write-hex (data)
35
- (let (line)
36
- (loop
37
- for i from 0 to (1- (length data))
38
- do (push (elt data i) line)
39
- when (= (length line) 16)
40
- do
41
- (format t "~{~2,'0x ~}~%" (nreverse line))
42
- (setf line nil))
43
- (when line
44
- (format t "~{~2,'0x ~}~%" (nreverse line)))))
45
-
46
- (defun encode (data)
47
- (flexi-streams:with-output-to-sequence (stream)
48
- (encode-stream data stream)))
49
-
50
- (defun make-hash (data)
51
- (let ((result (make-hash-table)))
52
- (dolist (kv data)
53
- (cond ((consp (cdr kv))
54
- (setf (gethash (first kv) result) (second kv)))
55
- (t
56
- (setf (gethash (car kv) result) (cdr kv)))))
57
- result))
58
-
59
- (defun is-byte-array (data-type)
60
- (and (vectorp data-type)
61
- (equal '(unsigned-byte 8) (array-element-type data-type))))
62
-
63
- (defun encode-stream (data stream)
64
- (cond ((floatp data) (encode-float data stream))
65
- ((numberp data) (encode-integer data stream))
66
- ((null data) (write-byte #xc0 stream))
67
- ((eq data t) (write-byte #xc3 stream))
68
- ((stringp data)
69
- (encode-string data stream))
70
- ((is-byte-array data)
71
- (encode-raw-bytes data stream))
72
- ((or (consp data) (vectorp data))
73
- (encode-array data stream))
74
- ((hash-table-p data)
75
- (encode-hash data stream))
76
- ((symbolp data)
77
- (encode-string (symbol-name data) stream))
78
- (t (error "Cannot encode data."))))
79
-
80
- (defun encode-string (data stream)
81
- (encode-raw-bytes (babel:string-to-octets data) stream))
82
-
83
- #+sbcl (defun sbcl-encode-float (data stream)
84
- (cond ((equal (type-of data) 'single-float)
85
- (write-byte #xca stream)
86
- (store-big-endian (sb-kernel:single-float-bits data) stream 4))
87
- ((equal (type-of data) 'double-float)
88
- (write-byte #xcb stream)
89
- (store-big-endian (sb-kernel:double-float-high-bits data) stream 4)
90
- (store-big-endian (sb-kernel:double-float-low-bits data) stream 4)))
91
- t)
92
-
93
- (defun encode-float (data stream)
94
- (or #+sbcl (sbcl-encode-float data stream)
95
- #-(or sbcl) (error "No floating point support yet.")))
96
-
97
- (defun encode-each (data stream &optional (encoder #'encode-stream))
98
- (cond ((hash-table-p data)
99
- (maphash (lambda (key value)
100
- (funcall encoder key stream)
101
- (funcall encoder value stream))
102
- data))
103
- ((or (vectorp data) (consp data))
104
- (mapc (lambda (subdata)
105
- (funcall encoder subdata stream))
106
- (coerce data 'list)))
107
- (t (error "Not sequence or hash table."))))
108
-
109
- (defun encode-sequence (data stream
110
- short-prefix short-length
111
- typecode-16 typecode-32
112
- &optional (encoder #'encode-stream))
113
- (let ((len (if (hash-table-p data)
114
- (hash-table-count data)
115
- (length data))))
116
- (cond ((<= 0 len short-length)
117
- (write-byte (+ short-prefix len) stream)
118
- (encode-each data stream encoder))
119
- ((<= 0 len 65535)
120
- (write-byte typecode-16 stream)
121
- (store-big-endian len stream 2)
122
- (encode-each data stream encoder))
123
- ((<= 0 len (1- (expt 2 32)))
124
- (write-byte typecode-32 stream)
125
- (store-big-endian len stream 4)
126
- (encode-each data stream encoder)))))
127
-
128
- (defun encode-hash (data stream)
129
- (encode-sequence data stream #x80 15 #xdc #xdd))
130
-
131
- (defun encode-array (data stream)
132
- (encode-sequence data stream #x90 15 #xdc #xdd))
133
-
134
- (defun encode-raw-bytes (data stream)
135
- (encode-sequence data stream #xa0 31 #xda #xdb #'write-byte))
136
-
137
- (defun encode-integer (data stream)
138
- (cond ((<= 0 data 127) (write-byte data stream))
139
- ((<= -32 data -1) (write-byte (sb8->ub8 data) stream))
140
- ((<= 0 data 255)
141
- (write-byte #xcc stream)
142
- (write-byte data stream))
143
- ((<= 0 data 65535)
144
- (write-byte #xcd stream)
145
- (store-big-endian data stream 2))
146
- ((<= 0 data (1- (expt 2 32)))
147
- (write-byte #xce stream)
148
- (store-big-endian data stream 4))
149
- ((<= 0 data (1- (expt 2 64)))
150
- (write-byte #xcf stream)
151
- (store-big-endian data stream 8))
152
- ((<= -128 data 127)
153
- (write-byte #xd0 stream)
154
- (write-byte (sb8->ub8 data) stream))
155
- ((<= -32768 data 32767)
156
- (write-byte #xd1 stream)
157
- (write-byte (sb16->ub16 data) stream))
158
- ((<= (- (expt 2 31)) data (1- (expt 2 31)))
159
- (write-byte #xd2 stream)
160
- (write-byte (sb32->ub32 data) stream))
161
- ((<= (- (expt 2 63)) data (1- (expt 2 63)))
162
- (write-byte #xd3 stream)
163
- (write-byte (sb64->ub64 data) stream))
164
- (t (error "Integer too large or too small."))))
165
-
166
- (defun store-big-endian (number stream byte-count)
167
- (let (byte-list)
168
- (loop
169
- while (> number 0)
170
- do
171
- (push (rem number 256)
172
- byte-list)
173
- (setf number (ash number -8)))
174
- (loop
175
- while (< (length byte-list) byte-count)
176
- do (push 0 byte-list))
177
- (when (> (length byte-list) byte-count)
178
- (error "Number too large."))
179
- (write-sequence byte-list stream)))
180
-
181
- (defun decode (byte-array)
182
- (flexi-streams:with-input-from-sequence (stream byte-array)
183
- (decode-stream stream)))
184
-
185
- (defun decode-stream (stream)
186
- (let ((byte (read-byte stream)))
187
- (cond ((= 0 (ldb (byte 1 7) byte))
188
- byte)
189
- ((= 7 (ldb (byte 3 5) byte))
190
- (ub8->sb8 byte))
191
- ((= #xcc byte)
192
- (read-byte stream))
193
- ((= #xcd byte)
194
- (load-big-endian stream 2))
195
- ((= #xce byte)
196
- (load-big-endian stream 4))
197
- ((= #xcf byte)
198
- (load-big-endian stream 8))
199
- ((= #xd0 byte)
200
- (ub8->sb8 (read-byte stream)))
201
- ((= #xd1 byte)
202
- (ub16->sb16 (load-big-endian stream 2)))
203
- ((= #xd2 byte)
204
- (ub32->sb32 (load-big-endian stream 4)))
205
- ((= #xd3 byte)
206
- (ub64->sb64 (load-big-endian stream 8)))
207
- ((= #xc0 byte)
208
- nil)
209
- ((= #xc3 byte)
210
- t)
211
- ((= #xc2 byte)
212
- nil)
213
- ((= #xca byte)
214
- (or #+sbcl (sb-kernel:make-single-float (load-big-endian stream 4))
215
- #-(or sbcl) (error "No floating point support yet.")))
216
- ((= #xcb byte)
217
- (or #+sbcl (sb-kernel:make-double-float (load-big-endian stream 4)
218
- (load-big-endian stream 4))
219
- #-(or sbcl) (error "No floating point support yet.")))
220
- ((= 5 (ldb (byte 3 5) byte))
221
- (decode-raw-sequence (ldb (byte 5 0) byte) stream))
222
- ((= #xda byte)
223
- (decode-raw-sequence (load-big-endian stream 2) stream))
224
- ((= #xdb byte)
225
- (decode-raw-sequence (load-big-endian stream 4) stream))
226
- ((= 9 (ldb (byte 4 4) byte))
227
- (decode-array (- byte #x90) stream))
228
- ((= #xdc byte)
229
- (decode-array (load-big-endian stream 2) stream))
230
- ((= #xdd byte)
231
- (decode-array (load-big-endian stream 4) stream))
232
- ((= 8 (ldb (byte 4 4) byte))
233
- (decode-map (- byte #x80) stream))
234
- ((= #xde byte)
235
- (decode-map (load-big-endian stream 2) stream))
236
- ((= #xdf byte)
237
- (decode-map (load-big-endian stream 4) stream)))))
238
-
239
- (defun decode-map (length stream)
240
- (let ((hash-table (make-hash-table :test #'equal)))
241
- (loop repeat length
242
- do (let ((key (decode-stream stream))
243
- (value (decode-stream stream)))
244
- (setf (gethash key hash-table) value)))
245
- hash-table))
246
-
247
- (defun decode-array (length stream)
248
- (let ((array (make-array length)))
249
- (dotimes (i length)
250
- (setf (aref array i) (decode-stream stream)))
251
- array))
252
-
253
- (defun decode-raw-sequence (length stream)
254
- (let ((seq (make-array length :element-type '(mod 256))))
255
- (read-sequence seq stream)
256
- (babel:octets-to-string seq)))
257
-
258
- (defun load-big-endian (stream byte-count)
259
- (let ((result 0))
260
- (loop
261
- repeat byte-count
262
- do (setf result (+ (ash result 8)
263
- (read-byte stream))))
264
- result))