unibits 1.3.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +14 -3
- data/lib/unibits.rb +42 -33
- data/lib/unibits/symbolify.rb +419 -328
- data/lib/unibits/version.rb +1 -1
- data/spec/unibits_spec.rb +18 -0
- data/unibits.gemspec +1 -1
- metadata +5 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d5f49745b4dd8f80a9b4c87c3810b4ddc5dbe66
|
4
|
+
data.tar.gz: a00bc08a018d1c1e40d9560f48cf64e53aa4e20b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 865101498c37b0d480846c6eb9a613942becef9f0f0fa02d01ed78ca1dc8bd4d7d3ae10ce3856ab76ded888278e27da6962657de65ea96c464920c5e3c91866d
|
7
|
+
data.tar.gz: 39362d02cc5f8b8f0c6df3cc3b7f1bb597eef0eb8e2fb828b16f95de8e6a8e8b41faa13d03a33fcb1d2890b7625a855a75924351dca76832994cf50d1af95e9f
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 2.0.0
|
4
|
+
|
5
|
+
* Support more encodings: ISO-8859-X and Windows-125X
|
6
|
+
* Add three HANGUL characters (U+115F, U+1160, U+3164) to the list of possible whitespace characters
|
7
|
+
* Move character handling to separate gem. It is called "characteristics".
|
8
|
+
* Highlight control chars in blue and blanks in light blue
|
9
|
+
* Handle encodings that are not convertible to UTF-8
|
10
|
+
|
3
11
|
### 1.3.0
|
4
12
|
|
5
13
|
* Add variation selectors 17-256 (U+E0100 - U+E01EF)
|
data/README.md
CHANGED
@@ -4,9 +4,19 @@ Ruby library and CLI command that visualizes various Unicode and ASCII encodings
|
|
4
4
|
|
5
5
|
- Makes analyzing encodings easier
|
6
6
|
- Helps you with debugging strings
|
7
|
-
- Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, arbitrary **BINARY** data, and **ASCII**
|
7
|
+
- Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, **ISO-8859-X**, **Windows-125X**, arbitrary **BINARY** data, and **ASCII**
|
8
8
|
- Highlights invalid encodings
|
9
9
|
|
10
|
+
## Color Coding
|
11
|
+
|
12
|
+
Each byte of the given string is highlighted using the following mechanism:
|
13
|
+
|
14
|
+
- Red for invalid bytes
|
15
|
+
- Orange for unassigned bytes/characters
|
16
|
+
- Blue for control characters
|
17
|
+
- Light blue for blanks
|
18
|
+
- Random color for all other characters
|
19
|
+
|
10
20
|
## Setup
|
11
21
|
|
12
22
|
Make sure you have Ruby installed and that installing gems works properly. Then do:
|
@@ -114,16 +124,17 @@ Example in Ruby: `unibits "🌫 Idiosyncrätic ℜսᖯʏ", encoding: 'ascii'
|
|
114
124
|
|
115
125
|
### BINARY
|
116
126
|
|
117
|
-
|
127
|
+
It is not possible to produce invalid binary strings.
|
118
128
|
|
119
129
|
## Notes
|
120
130
|
|
121
131
|
Also see
|
122
132
|
|
133
|
+
- [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
|
134
|
+
- [Characteristics gem](https://github.com/janlelis/characteristics)
|
123
135
|
- [UTF-8 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-8#Description)
|
124
136
|
- [UTF-16 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-16#Description)
|
125
137
|
- [UTF-32 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-32)
|
126
|
-
- [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
|
127
138
|
- [Difference between BINARY and ASCII](http://idiosyncratic-ruby.com/56-us-ascii-8bit.html)
|
128
139
|
- [Unicode Micro Libraries for Ruby](https://github.com/janlelis/unicode-x)
|
129
140
|
|
data/lib/unibits.rb
CHANGED
@@ -4,7 +4,7 @@ require_relative "unibits/symbolify"
|
|
4
4
|
require "io/console"
|
5
5
|
require "paint"
|
6
6
|
require "unicode/display_width"
|
7
|
-
require "
|
7
|
+
require "characteristics"
|
8
8
|
|
9
9
|
module Unibits
|
10
10
|
SUPPORTED_ENCODINGS = [
|
@@ -15,6 +15,8 @@ module Unibits
|
|
15
15
|
'UTF-32BE',
|
16
16
|
'ASCII-8BIT',
|
17
17
|
'US-ASCII',
|
18
|
+
/^ISO-8859-/,
|
19
|
+
/^Windows-125/,
|
18
20
|
].freeze
|
19
21
|
DEFAULT_TERMINAL_WIDTH = 80
|
20
22
|
|
@@ -49,6 +51,9 @@ module Unibits
|
|
49
51
|
|
50
52
|
def self.visualize(string, wide_ambiguous: false, width: nil)
|
51
53
|
cols = width || determine_terminal_cols
|
54
|
+
encoding_name = string.encoding.name
|
55
|
+
|
56
|
+
type = Characteristics.type_from_encoding_name(encoding_name)
|
52
57
|
|
53
58
|
cp_buffer = [" "]
|
54
59
|
enc_buffer = [" "]
|
@@ -59,18 +64,10 @@ module Unibits
|
|
59
64
|
|
60
65
|
puts
|
61
66
|
string.each_char{ |char|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
current_color = "#FF5500"
|
67
|
-
else
|
68
|
-
current_color = random_color
|
69
|
-
end
|
70
|
-
else
|
71
|
-
char_valid = false
|
72
|
-
current_color = :red
|
73
|
-
end
|
67
|
+
char_info = Characteristics.create_for_type(char, type)
|
68
|
+
current_color = determine_char_color(char_info)
|
69
|
+
|
70
|
+
current_encoding_error = nil if char_info.valid?
|
74
71
|
|
75
72
|
char.each_byte.with_index{ |byte, index|
|
76
73
|
if Paint.unpaint(hex_buffer[-1]).bytesize > cols - 12
|
@@ -82,10 +79,10 @@ module Unibits
|
|
82
79
|
end
|
83
80
|
|
84
81
|
if index == 0
|
85
|
-
if
|
82
|
+
if char_info.valid?
|
86
83
|
codepoint = "U+%04X" % char.ord
|
87
84
|
else
|
88
|
-
case
|
85
|
+
case encoding_name
|
89
86
|
when "US-ASCII"
|
90
87
|
codepoint = "invalid"
|
91
88
|
when "UTF-8"
|
@@ -167,7 +164,7 @@ module Unibits
|
|
167
164
|
when 'UTF-16LE', 'UTF-16BE'
|
168
165
|
if char.bytesize.odd?
|
169
166
|
codepoint = "incompl."
|
170
|
-
elsif char.b[
|
167
|
+
elsif char.b[encoding_name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
|
171
168
|
codepoint = "hlf.srg."
|
172
169
|
else
|
173
170
|
codepoint = "invalid"
|
@@ -185,14 +182,14 @@ module Unibits
|
|
185
182
|
codepoint.ljust(10), current_color, :bold
|
186
183
|
]
|
187
184
|
|
188
|
-
|
189
|
-
|
185
|
+
symbolified_char = Symbolify.symbolify(char, char_info)
|
186
|
+
|
187
|
+
if char_info.unicode?
|
188
|
+
padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
|
190
189
|
else
|
191
|
-
|
190
|
+
padding = 10 - symbolified_char.size
|
192
191
|
end
|
193
192
|
|
194
|
-
padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
|
195
|
-
|
196
193
|
enc_buffer[-1] << Paint[
|
197
194
|
symbolified_char, current_color
|
198
195
|
]
|
@@ -208,11 +205,11 @@ module Unibits
|
|
208
205
|
|
209
206
|
bin_byte_complete = byte.to_s(2).rjust(8, "0")
|
210
207
|
|
211
|
-
if !
|
208
|
+
if !char_info.valid?
|
212
209
|
bin_byte_1 = bin_byte_complete
|
213
210
|
bin_byte_2 = ""
|
214
211
|
else
|
215
|
-
case
|
212
|
+
case encoding_name
|
216
213
|
when 'US-ASCII'
|
217
214
|
bin_byte_1 = bin_byte_complete[0...1]
|
218
215
|
bin_byte_2 = bin_byte_complete[1...8]
|
@@ -253,6 +250,9 @@ module Unibits
|
|
253
250
|
when 'UTF-32LE', 'UTF-32BE'
|
254
251
|
bin_byte_1 = ""
|
255
252
|
bin_byte_2 = bin_byte_complete
|
253
|
+
when /^(ISO-8859-|Windows-125)/
|
254
|
+
bin_byte_1 = ""
|
255
|
+
bin_byte_2 = bin_byte_complete
|
256
256
|
end
|
257
257
|
end
|
258
258
|
|
@@ -268,25 +268,34 @@ module Unibits
|
|
268
268
|
}
|
269
269
|
}
|
270
270
|
|
271
|
-
if
|
271
|
+
if type == :unicode
|
272
272
|
enc_buffer.zip(cp_buffer, hex_buffer, bin_buffer, separator).flatten.join("\n")
|
273
273
|
else
|
274
274
|
enc_buffer.zip(hex_buffer, bin_buffer, separator).flatten.join("\n")
|
275
275
|
end
|
276
276
|
end
|
277
277
|
|
278
|
-
def self.random_color
|
279
|
-
"%.2x%.2x%.2x" %[rand(90) + 60, rand(90) + 60, rand(90) + 60]
|
280
|
-
end
|
281
|
-
|
282
|
-
def self.symbolify(char)
|
283
|
-
return char.inspect unless char.encoding.name[0, 3] == "UTF"
|
284
|
-
Symbolify.symbolify(char).encode('UTF-8')
|
285
|
-
end
|
286
|
-
|
287
278
|
def self.determine_terminal_cols
|
288
279
|
STDIN.winsize[1] || DEFAULT_TERMINAL_WIDTH
|
289
280
|
rescue Errno::ENOTTY
|
290
281
|
return DEFAULT_TERMINAL_WIDTH
|
291
282
|
end
|
283
|
+
|
284
|
+
def self.determine_char_color(char_info)
|
285
|
+
if !char_info.valid?
|
286
|
+
"#FF0000"
|
287
|
+
elsif !char_info.assigned?
|
288
|
+
"#FF5500"
|
289
|
+
elsif char_info.control?
|
290
|
+
"#0000FF"
|
291
|
+
elsif char_info.blank?
|
292
|
+
"#33AADD"
|
293
|
+
else
|
294
|
+
random_color
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
def self.random_color
|
299
|
+
"%.2x%.2x%.2x" % [rand(90) + 60, rand(90) + 60, rand(90) + 60]
|
300
|
+
end
|
292
301
|
end
|
data/lib/unibits/symbolify.rb
CHANGED
@@ -1,351 +1,442 @@
|
|
1
|
-
require "unicode/categories"
|
2
|
-
|
3
1
|
module Unibits
|
4
2
|
module Symbolify
|
3
|
+
NO_UTF8_CONVERTER = /^Windows-1258/
|
4
|
+
ASCII_CHARS = "\x20-\x7E".freeze
|
5
5
|
ASCII_CONTROL_CODEPOINTS = "\x00-\x1F\x7F".freeze
|
6
6
|
ASCII_CONTROL_SYMBOLS = "\u{2400}-\u{241F}\u{2421}".freeze
|
7
|
-
ASCII_CHARS = "\x20-\x7E".freeze
|
8
|
-
TAG_START = "\u{E0001}".freeze
|
9
|
-
TAG_START_SYMBOL = "LANG TAG".freeze
|
10
|
-
TAG_SPACE = "\u{E0020}".freeze
|
11
|
-
TAG_SPACE_SYMBOL = "TAG ␠".freeze
|
12
7
|
TAGS = "\u{E0021}-\u{E007E}".freeze
|
13
|
-
|
14
|
-
|
8
|
+
|
9
|
+
CONTROL_C0_SYMBOLS = [
|
10
|
+
"␀",
|
11
|
+
"␁",
|
12
|
+
"␂",
|
13
|
+
"␃",
|
14
|
+
"␄",
|
15
|
+
"␅",
|
16
|
+
"␆",
|
17
|
+
"␇",
|
18
|
+
"␈",
|
19
|
+
"␉",
|
20
|
+
"␊",
|
21
|
+
"␋",
|
22
|
+
"␌",
|
23
|
+
"␍",
|
24
|
+
"␎",
|
25
|
+
"␏",
|
26
|
+
"␐",
|
27
|
+
"␑",
|
28
|
+
"␒",
|
29
|
+
"␓",
|
30
|
+
"␔",
|
31
|
+
"␕",
|
32
|
+
"␖",
|
33
|
+
"␗",
|
34
|
+
"␘",
|
35
|
+
"␙",
|
36
|
+
"␚",
|
37
|
+
"␛",
|
38
|
+
"␜",
|
39
|
+
"␝",
|
40
|
+
"␞",
|
41
|
+
"␟",
|
42
|
+
]
|
43
|
+
|
44
|
+
CONTROL_DELETE_SYMBOL = "␡"
|
45
|
+
|
46
|
+
CONTROL_C1_NAMES = {
|
47
|
+
0x80 => "PAD",
|
48
|
+
0x81 => "HOP",
|
49
|
+
0x82 => "BPH",
|
50
|
+
0x83 => "NBH",
|
51
|
+
0x84 => "IND",
|
52
|
+
0x85 => "NEL",
|
53
|
+
0x86 => "SSA",
|
54
|
+
0x87 => "ESA",
|
55
|
+
0x88 => "HTS",
|
56
|
+
0x89 => "HTJ",
|
57
|
+
0x8A => "VTS",
|
58
|
+
0x8B => "PLD",
|
59
|
+
0x8C => "PLU",
|
60
|
+
0x8D => "RI",
|
61
|
+
0x8E => "SS2",
|
62
|
+
0x8F => "SS3",
|
63
|
+
0x90 => "DCS",
|
64
|
+
0x91 => "PU1",
|
65
|
+
0x92 => "PU2",
|
66
|
+
0x93 => "STS",
|
67
|
+
0x94 => "CCH",
|
68
|
+
0x95 => "MW",
|
69
|
+
0x96 => "SPA",
|
70
|
+
0x97 => "EPA",
|
71
|
+
0x98 => "SOS",
|
72
|
+
0x99 => "SGC",
|
73
|
+
0x9A => "SCI",
|
74
|
+
0x9B => "CSI",
|
75
|
+
0x9C => "ST",
|
76
|
+
0x9D => "OSC",
|
77
|
+
0x9E => "PM",
|
78
|
+
0x9F => "APC",
|
79
|
+
}
|
80
|
+
|
15
81
|
INTERESTING_CODEPOINTS = {
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
"\u{008B}" => "PLD",
|
28
|
-
"\u{008C}" => "PLU",
|
29
|
-
"\u{008D}" => "RI",
|
30
|
-
"\u{008E}" => "SS2",
|
31
|
-
"\u{008F}" => "SS3",
|
32
|
-
"\u{0090}" => "DCS",
|
33
|
-
"\u{0091}" => "PU1",
|
34
|
-
"\u{0092}" => "PU2",
|
35
|
-
"\u{0093}" => "STS",
|
36
|
-
"\u{0094}" => "CCH",
|
37
|
-
"\u{0095}" => "MW",
|
38
|
-
"\u{0096}" => "SPA",
|
39
|
-
"\u{0097}" => "EPA",
|
40
|
-
"\u{0098}" => "SOS",
|
41
|
-
"\u{0099}" => "SGC",
|
42
|
-
"\u{009A}" => "SCI",
|
43
|
-
"\u{009B}" => "CSI",
|
44
|
-
"\u{009C}" => "ST",
|
45
|
-
"\u{009D}" => "OSC",
|
46
|
-
"\u{009E}" => "PM",
|
47
|
-
"\u{009F}" => "APC",
|
82
|
+
0x200E => "LRM",
|
83
|
+
0x200F => "RLM",
|
84
|
+
0x202A => "LRE",
|
85
|
+
0x202B => "RLE",
|
86
|
+
0x202C => "PDF",
|
87
|
+
0x202D => "LRO",
|
88
|
+
0x202E => "RLO",
|
89
|
+
0x2066 => "LRI",
|
90
|
+
0x2067 => "RLI",
|
91
|
+
0x2068 => "FSI",
|
92
|
+
0x2069 => "PDI",
|
48
93
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
94
|
+
0xFE00 => "VS1",
|
95
|
+
0xFE01 => "VS2",
|
96
|
+
0xFE02 => "VS3",
|
97
|
+
0xFE03 => "VS4",
|
98
|
+
0xFE04 => "VS5",
|
99
|
+
0xFE05 => "VS6",
|
100
|
+
0xFE06 => "VS7",
|
101
|
+
0xFE07 => "VS8",
|
102
|
+
0xFE08 => "VS9",
|
103
|
+
0xFE09 => "VS10",
|
104
|
+
0xFE0A => "VS11",
|
105
|
+
0xFE0B => "VS12",
|
106
|
+
0xFE0C => "VS13",
|
107
|
+
0xFE0D => "VS14",
|
108
|
+
0xFE0E => "VS15",
|
109
|
+
0xFE0F => "VS16",
|
60
110
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
"\u{FE03}" => "VS4",
|
65
|
-
"\u{FE04}" => "VS5",
|
66
|
-
"\u{FE05}" => "VS6",
|
67
|
-
"\u{FE06}" => "VS7",
|
68
|
-
"\u{FE07}" => "VS8",
|
69
|
-
"\u{FE08}" => "VS9",
|
70
|
-
"\u{FE09}" => "VS10",
|
71
|
-
"\u{FE0A}" => "VS11",
|
72
|
-
"\u{FE0B}" => "VS12",
|
73
|
-
"\u{FE0C}" => "VS13",
|
74
|
-
"\u{FE0D}" => "VS14",
|
75
|
-
"\u{FE0E}" => "VS15",
|
76
|
-
"\u{FE0F}" => "VS16",
|
111
|
+
0xE0001 => "LANG TAG",
|
112
|
+
0xE0020 => "TAG ␠",
|
113
|
+
0xE007F => "TAG ␡",
|
77
114
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
115
|
+
0xE0100 => "VS17",
|
116
|
+
0xE0101 => "VS18",
|
117
|
+
0xE0102 => "VS19",
|
118
|
+
0xE0103 => "VS20",
|
119
|
+
0xE0104 => "VS21",
|
120
|
+
0xE0105 => "VS22",
|
121
|
+
0xE0106 => "VS23",
|
122
|
+
0xE0107 => "VS24",
|
123
|
+
0xE0108 => "VS25",
|
124
|
+
0xE0109 => "VS26",
|
125
|
+
0xE010A => "VS27",
|
126
|
+
0xE010B => "VS28",
|
127
|
+
0xE010C => "VS29",
|
128
|
+
0xE010D => "VS30",
|
129
|
+
0xE010E => "VS31",
|
130
|
+
0xE010F => "VS32",
|
131
|
+
0xE0110 => "VS33",
|
132
|
+
0xE0111 => "VS34",
|
133
|
+
0xE0112 => "VS35",
|
134
|
+
0xE0113 => "VS36",
|
135
|
+
0xE0114 => "VS37",
|
136
|
+
0xE0115 => "VS38",
|
137
|
+
0xE0116 => "VS39",
|
138
|
+
0xE0117 => "VS40",
|
139
|
+
0xE0118 => "VS41",
|
140
|
+
0xE0119 => "VS42",
|
141
|
+
0xE011A => "VS43",
|
142
|
+
0xE011B => "VS44",
|
143
|
+
0xE011C => "VS45",
|
144
|
+
0xE011D => "VS46",
|
145
|
+
0xE011E => "VS47",
|
146
|
+
0xE011F => "VS48",
|
147
|
+
0xE0120 => "VS49",
|
148
|
+
0xE0121 => "VS50",
|
149
|
+
0xE0122 => "VS51",
|
150
|
+
0xE0123 => "VS52",
|
151
|
+
0xE0124 => "VS53",
|
152
|
+
0xE0125 => "VS54",
|
153
|
+
0xE0126 => "VS55",
|
154
|
+
0xE0127 => "VS56",
|
155
|
+
0xE0128 => "VS57",
|
156
|
+
0xE0129 => "VS58",
|
157
|
+
0xE012A => "VS59",
|
158
|
+
0xE012B => "VS60",
|
159
|
+
0xE012C => "VS61",
|
160
|
+
0xE012D => "VS62",
|
161
|
+
0xE012E => "VS63",
|
162
|
+
0xE012F => "VS64",
|
163
|
+
0xE0130 => "VS65",
|
164
|
+
0xE0131 => "VS66",
|
165
|
+
0xE0132 => "VS67",
|
166
|
+
0xE0133 => "VS68",
|
167
|
+
0xE0134 => "VS69",
|
168
|
+
0xE0135 => "VS70",
|
169
|
+
0xE0136 => "VS71",
|
170
|
+
0xE0137 => "VS72",
|
171
|
+
0xE0138 => "VS73",
|
172
|
+
0xE0139 => "VS74",
|
173
|
+
0xE013A => "VS75",
|
174
|
+
0xE013B => "VS76",
|
175
|
+
0xE013C => "VS77",
|
176
|
+
0xE013D => "VS78",
|
177
|
+
0xE013E => "VS79",
|
178
|
+
0xE013F => "VS80",
|
179
|
+
0xE0140 => "VS81",
|
180
|
+
0xE0141 => "VS82",
|
181
|
+
0xE0142 => "VS83",
|
182
|
+
0xE0143 => "VS84",
|
183
|
+
0xE0144 => "VS85",
|
184
|
+
0xE0145 => "VS86",
|
185
|
+
0xE0146 => "VS87",
|
186
|
+
0xE0147 => "VS88",
|
187
|
+
0xE0148 => "VS89",
|
188
|
+
0xE0149 => "VS90",
|
189
|
+
0xE014A => "VS91",
|
190
|
+
0xE014B => "VS92",
|
191
|
+
0xE014C => "VS93",
|
192
|
+
0xE014D => "VS94",
|
193
|
+
0xE014E => "VS95",
|
194
|
+
0xE014F => "VS96",
|
195
|
+
0xE0150 => "VS97",
|
196
|
+
0xE0151 => "VS98",
|
197
|
+
0xE0152 => "VS99",
|
198
|
+
0xE0153 => "VS100",
|
199
|
+
0xE0154 => "VS101",
|
200
|
+
0xE0155 => "VS102",
|
201
|
+
0xE0156 => "VS103",
|
202
|
+
0xE0157 => "VS104",
|
203
|
+
0xE0158 => "VS105",
|
204
|
+
0xE0159 => "VS106",
|
205
|
+
0xE015A => "VS107",
|
206
|
+
0xE015B => "VS108",
|
207
|
+
0xE015C => "VS109",
|
208
|
+
0xE015D => "VS110",
|
209
|
+
0xE015E => "VS111",
|
210
|
+
0xE015F => "VS112",
|
211
|
+
0xE0160 => "VS113",
|
212
|
+
0xE0161 => "VS114",
|
213
|
+
0xE0162 => "VS115",
|
214
|
+
0xE0163 => "VS116",
|
215
|
+
0xE0164 => "VS117",
|
216
|
+
0xE0165 => "VS118",
|
217
|
+
0xE0166 => "VS119",
|
218
|
+
0xE0167 => "VS120",
|
219
|
+
0xE0168 => "VS121",
|
220
|
+
0xE0169 => "VS122",
|
221
|
+
0xE016A => "VS123",
|
222
|
+
0xE016B => "VS124",
|
223
|
+
0xE016C => "VS125",
|
224
|
+
0xE016D => "VS126",
|
225
|
+
0xE016E => "VS127",
|
226
|
+
0xE016F => "VS128",
|
227
|
+
0xE0170 => "VS129",
|
228
|
+
0xE0171 => "VS130",
|
229
|
+
0xE0172 => "VS131",
|
230
|
+
0xE0173 => "VS132",
|
231
|
+
0xE0174 => "VS133",
|
232
|
+
0xE0175 => "VS134",
|
233
|
+
0xE0176 => "VS135",
|
234
|
+
0xE0177 => "VS136",
|
235
|
+
0xE0178 => "VS137",
|
236
|
+
0xE0179 => "VS138",
|
237
|
+
0xE017A => "VS139",
|
238
|
+
0xE017B => "VS140",
|
239
|
+
0xE017C => "VS141",
|
240
|
+
0xE017D => "VS142",
|
241
|
+
0xE017E => "VS143",
|
242
|
+
0xE017F => "VS144",
|
243
|
+
0xE0180 => "VS145",
|
244
|
+
0xE0181 => "VS146",
|
245
|
+
0xE0182 => "VS147",
|
246
|
+
0xE0183 => "VS148",
|
247
|
+
0xE0184 => "VS149",
|
248
|
+
0xE0185 => "VS150",
|
249
|
+
0xE0186 => "VS151",
|
250
|
+
0xE0187 => "VS152",
|
251
|
+
0xE0188 => "VS153",
|
252
|
+
0xE0189 => "VS154",
|
253
|
+
0xE018A => "VS155",
|
254
|
+
0xE018B => "VS156",
|
255
|
+
0xE018C => "VS157",
|
256
|
+
0xE018D => "VS158",
|
257
|
+
0xE018E => "VS159",
|
258
|
+
0xE018F => "VS160",
|
259
|
+
0xE0190 => "VS161",
|
260
|
+
0xE0191 => "VS162",
|
261
|
+
0xE0192 => "VS163",
|
262
|
+
0xE0193 => "VS164",
|
263
|
+
0xE0194 => "VS165",
|
264
|
+
0xE0195 => "VS166",
|
265
|
+
0xE0196 => "VS167",
|
266
|
+
0xE0197 => "VS168",
|
267
|
+
0xE0198 => "VS169",
|
268
|
+
0xE0199 => "VS170",
|
269
|
+
0xE019A => "VS171",
|
270
|
+
0xE019B => "VS172",
|
271
|
+
0xE019C => "VS173",
|
272
|
+
0xE019D => "VS174",
|
273
|
+
0xE019E => "VS175",
|
274
|
+
0xE019F => "VS176",
|
275
|
+
0xE01A0 => "VS177",
|
276
|
+
0xE01A1 => "VS178",
|
277
|
+
0xE01A2 => "VS179",
|
278
|
+
0xE01A3 => "VS180",
|
279
|
+
0xE01A4 => "VS181",
|
280
|
+
0xE01A5 => "VS182",
|
281
|
+
0xE01A6 => "VS183",
|
282
|
+
0xE01A7 => "VS184",
|
283
|
+
0xE01A8 => "VS185",
|
284
|
+
0xE01A9 => "VS186",
|
285
|
+
0xE01AA => "VS187",
|
286
|
+
0xE01AB => "VS188",
|
287
|
+
0xE01AC => "VS189",
|
288
|
+
0xE01AD => "VS190",
|
289
|
+
0xE01AE => "VS191",
|
290
|
+
0xE01AF => "VS192",
|
291
|
+
0xE01B0 => "VS193",
|
292
|
+
0xE01B1 => "VS194",
|
293
|
+
0xE01B2 => "VS195",
|
294
|
+
0xE01B3 => "VS196",
|
295
|
+
0xE01B4 => "VS197",
|
296
|
+
0xE01B5 => "VS198",
|
297
|
+
0xE01B6 => "VS199",
|
298
|
+
0xE01B7 => "VS200",
|
299
|
+
0xE01B8 => "VS201",
|
300
|
+
0xE01B9 => "VS202",
|
301
|
+
0xE01BA => "VS203",
|
302
|
+
0xE01BB => "VS204",
|
303
|
+
0xE01BC => "VS205",
|
304
|
+
0xE01BD => "VS206",
|
305
|
+
0xE01BE => "VS207",
|
306
|
+
0xE01BF => "VS208",
|
307
|
+
0xE01C0 => "VS209",
|
308
|
+
0xE01C1 => "VS210",
|
309
|
+
0xE01C2 => "VS211",
|
310
|
+
0xE01C3 => "VS212",
|
311
|
+
0xE01C4 => "VS213",
|
312
|
+
0xE01C5 => "VS214",
|
313
|
+
0xE01C6 => "VS215",
|
314
|
+
0xE01C7 => "VS216",
|
315
|
+
0xE01C8 => "VS217",
|
316
|
+
0xE01C9 => "VS218",
|
317
|
+
0xE01CA => "VS219",
|
318
|
+
0xE01CB => "VS220",
|
319
|
+
0xE01CC => "VS221",
|
320
|
+
0xE01CD => "VS222",
|
321
|
+
0xE01CE => "VS223",
|
322
|
+
0xE01CF => "VS224",
|
323
|
+
0xE01D0 => "VS225",
|
324
|
+
0xE01D1 => "VS226",
|
325
|
+
0xE01D2 => "VS227",
|
326
|
+
0xE01D3 => "VS228",
|
327
|
+
0xE01D4 => "VS229",
|
328
|
+
0xE01D5 => "VS230",
|
329
|
+
0xE01D6 => "VS231",
|
330
|
+
0xE01D7 => "VS232",
|
331
|
+
0xE01D8 => "VS233",
|
332
|
+
0xE01D9 => "VS234",
|
333
|
+
0xE01DA => "VS235",
|
334
|
+
0xE01DB => "VS236",
|
335
|
+
0xE01DC => "VS237",
|
336
|
+
0xE01DD => "VS238",
|
337
|
+
0xE01DE => "VS239",
|
338
|
+
0xE01DF => "VS240",
|
339
|
+
0xE01E0 => "VS241",
|
340
|
+
0xE01E1 => "VS242",
|
341
|
+
0xE01E2 => "VS243",
|
342
|
+
0xE01E3 => "VS244",
|
343
|
+
0xE01E4 => "VS245",
|
344
|
+
0xE01E5 => "VS246",
|
345
|
+
0xE01E6 => "VS247",
|
346
|
+
0xE01E7 => "VS248",
|
347
|
+
0xE01E8 => "VS249",
|
348
|
+
0xE01E9 => "VS250",
|
349
|
+
0xE01EA => "VS251",
|
350
|
+
0xE01EB => "VS252",
|
351
|
+
0xE01EC => "VS253",
|
352
|
+
0xE01ED => "VS254",
|
353
|
+
0xE01EE => "VS255",
|
354
|
+
0xE01EF => "VS256",
|
318
355
|
}.freeze
|
319
|
-
COULD_BE_WHITESPACE = '[\p{Space}⠀𝅙]'.freeze
|
320
356
|
|
321
|
-
def self.symbolify(char,
|
322
|
-
|
357
|
+
def self.symbolify(char, char_info)
|
358
|
+
if !char_info.valid?
|
359
|
+
"�"
|
360
|
+
else
|
361
|
+
case char_info
|
362
|
+
when UnicodeCharacteristics
|
363
|
+
Symbolify.unicode(char, char_info)
|
364
|
+
when ByteCharacteristics
|
365
|
+
Symbolify.byte(char, char_info)
|
366
|
+
when AsciiCharacteristics
|
367
|
+
Symbolify.ascii(char, char_info)
|
368
|
+
else
|
369
|
+
Symbolify.binary(char)
|
370
|
+
end
|
371
|
+
end
|
372
|
+
end
|
373
|
+
|
374
|
+
def self.unicode(char, char_info)
|
375
|
+
return "n/a" if !char_info.assigned?
|
323
376
|
|
324
377
|
char = char.dup
|
378
|
+
ord = char.ord
|
379
|
+
encoding = char_info.encoding
|
325
380
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
381
|
+
if char_info.delete?
|
382
|
+
char = CONTROL_DELETE_SYMBOL
|
383
|
+
elsif char_info.c0?
|
384
|
+
char = CONTROL_C0_SYMBOLS[ord]
|
385
|
+
elsif char_info.c1?
|
386
|
+
char = CONTROL_C1_NAMES[ord]
|
387
|
+
elsif char_info.blank?
|
388
|
+
char = "]".encode(encoding) + char + "[".encode(encoding)
|
389
|
+
elsif ord > 917536 && ord < 917631
|
390
|
+
char = "TAG ".encode(encoding) +
|
391
|
+
char.tr(TAGS.encode(encoding), ASCII_CHARS.encode(encoding))
|
392
|
+
else
|
393
|
+
char = INTERESTING_CODEPOINTS[char.ord] || char
|
394
|
+
end
|
395
|
+
|
396
|
+
char.encode("UTF-8")
|
397
|
+
end
|
334
398
|
|
335
|
-
|
336
|
-
|
337
|
-
}
|
338
|
-
char.gsub! TAG_START.encode(encoding), TAG_START_SYMBOL.encode(encoding)
|
339
|
-
char.gsub! TAG_SPACE.encode(encoding), TAG_SPACE_SYMBOL.encode(encoding)
|
340
|
-
char.gsub! TAG_DELETE.encode(encoding), TAG_DELETE_SYMBOL.encode(encoding)
|
399
|
+
def self.byte(char, char_info)
|
400
|
+
return "n/a" if !char_info.assigned?
|
341
401
|
|
342
402
|
ord = char.ord
|
343
|
-
|
344
|
-
|
345
|
-
|
403
|
+
encoding = char_info.encoding
|
404
|
+
no_converter = !!(NO_UTF8_CONVERTER =~ encoding.name)
|
405
|
+
treat_char_unconverted = false
|
406
|
+
|
407
|
+
if char_info.delete?
|
408
|
+
char = CONTROL_DELETE_SYMBOL
|
409
|
+
elsif char_info.c0?
|
410
|
+
char = CONTROL_C0_SYMBOLS[ord]
|
411
|
+
elsif char_info.c1?
|
412
|
+
char = CONTROL_C1_NAMES[ord]
|
413
|
+
elsif no_converter
|
414
|
+
treat_char_unconverted = true
|
415
|
+
elsif char_info.blank?
|
416
|
+
char = "]".encode(encoding) + char + "[".encode(encoding)
|
417
|
+
end
|
418
|
+
|
419
|
+
if no_converter && treat_char_unconverted
|
420
|
+
char.inspect
|
421
|
+
else
|
422
|
+
char.encode("UTF-8")
|
423
|
+
end
|
424
|
+
end
|
425
|
+
|
426
|
+
def self.ascii(char, char_info)
|
427
|
+
if char_info.delete?
|
428
|
+
char = CONTROL_DELETE_SYMBOL
|
429
|
+
elsif char_info.c0?
|
430
|
+
char = CONTROL_C0_SYMBOLS[char.ord]
|
431
|
+
elsif char_info.blank?
|
432
|
+
char = "]" + char + "["
|
346
433
|
end
|
347
434
|
|
348
435
|
char
|
349
436
|
end
|
437
|
+
|
438
|
+
def self.binary(char)
|
439
|
+
char.inspect
|
440
|
+
end
|
350
441
|
end
|
351
442
|
end
|
data/lib/unibits/version.rb
CHANGED
data/spec/unibits_spec.rb
CHANGED
@@ -67,6 +67,24 @@ describe Unibits do
|
|
67
67
|
result.must_match "01000011"
|
68
68
|
end
|
69
69
|
|
70
|
+
it "works with 'ISO-8859-' encodings" do
|
71
|
+
string = "\xBC Idiosyncr\xE4tic\n\x91".force_encoding("ISO-8859-1")
|
72
|
+
result = Paint.unpaint(Unibits.visualize(string))
|
73
|
+
result.must_match "BC" # ¼
|
74
|
+
result.must_match "E4" # ä
|
75
|
+
result.must_match "␊" # \n
|
76
|
+
result.must_match "PU1" # C1 name for \x91
|
77
|
+
end
|
78
|
+
|
79
|
+
it "works with 'Windows-125' encodings" do
|
80
|
+
string = "\xBC Idiosyncr\xE4tic\n\x81".force_encoding("Windows-1252")
|
81
|
+
result = Paint.unpaint(Unibits.visualize(string))
|
82
|
+
result.must_match "BC" # ¼
|
83
|
+
result.must_match "E4" # ä
|
84
|
+
result.must_match "␊" # \n
|
85
|
+
result.must_match "n/a" # \x81 is not assigned
|
86
|
+
end
|
87
|
+
|
70
88
|
describe "invalid UTF-8 encodings" do
|
71
89
|
it "- unexpected continuation byte (1/2)" do
|
72
90
|
string = "abc\x80efg"
|
data/unibits.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
|
20
20
|
gem.add_dependency 'paint', '>= 0.9', '< 3.0'
|
21
21
|
gem.add_dependency 'unicode-display_width', '~> 1.1'
|
22
|
-
gem.add_dependency '
|
22
|
+
gem.add_dependency 'characteristics', '~> 0.2.0'
|
23
23
|
gem.add_dependency 'rationalist', '~> 2.0'
|
24
24
|
|
25
25
|
gem.required_ruby_version = "~> 2.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unibits
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: paint
|
@@ -45,25 +45,19 @@ dependencies:
|
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '1.1'
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
|
-
name:
|
48
|
+
name: characteristics
|
49
49
|
requirement: !ruby/object:Gem::Requirement
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 1.1.2
|
53
|
+
version: 0.2.0
|
57
54
|
type: :runtime
|
58
55
|
prerelease: false
|
59
56
|
version_requirements: !ruby/object:Gem::Requirement
|
60
57
|
requirements:
|
61
58
|
- - "~>"
|
62
59
|
- !ruby/object:Gem::Version
|
63
|
-
version:
|
64
|
-
- - ">="
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: 1.1.2
|
60
|
+
version: 0.2.0
|
67
61
|
- !ruby/object:Gem::Dependency
|
68
62
|
name: rationalist
|
69
63
|
requirement: !ruby/object:Gem::Requirement
|