unibits 1.3.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b811d8983c85bc87c33a59b4cb7cd5144104a994
4
- data.tar.gz: 63cf6f3b91f26b8ac8b0fca9d90a595662a9f0f4
3
+ metadata.gz: 0d5f49745b4dd8f80a9b4c87c3810b4ddc5dbe66
4
+ data.tar.gz: a00bc08a018d1c1e40d9560f48cf64e53aa4e20b
5
5
  SHA512:
6
- metadata.gz: 65979ed1477c6f5bcbed7b6f74474c1958e582614d91ae0a2ebf5cdf605b31f4263b1cc68e1ff113b21ce572c7025bfacf85db5353b9108e07cf352a3c60e052
7
- data.tar.gz: b4cfbad99ad6257616b96e95f53f21696e545b4b29fafd6c3e212e03bc51f52eddb66c06ddb0b1fe2842a42cb5553f5d1a5ca32b7320aa463cd7bbf9de4f0ad3
6
+ metadata.gz: 865101498c37b0d480846c6eb9a613942becef9f0f0fa02d01ed78ca1dc8bd4d7d3ae10ce3856ab76ded888278e27da6962657de65ea96c464920c5e3c91866d
7
+ data.tar.gz: 39362d02cc5f8b8f0c6df3cc3b7f1bb597eef0eb8e2fb828b16f95de8e6a8e8b41faa13d03a33fcb1d2890b7625a855a75924351dca76832994cf50d1af95e9f
@@ -1,5 +1,13 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 2.0.0
4
+
5
+ * Support more encodings: ISO-8859-X and Windows-125X
6
+ * Add three Hangul filler characters (U+115F, U+1160, U+3164) to the list of possible whitespace characters
7
+ * Move character handling to a separate gem called "characteristics"
8
+ * Highlight control chars in blue and blanks in light blue
9
+ * Handle encodings that are not convertible to UTF-8
10
+
3
11
  ### 1.3.0
4
12
 
5
13
  * Add variation selectors 17-256 (U+E0100 - U+E01EF)
data/README.md CHANGED
@@ -4,9 +4,19 @@ Ruby library and CLI command that visualizes various Unicode and ASCII encodings
4
4
 
5
5
  - Makes analyzing encodings easier
6
6
  - Helps you with debugging strings
7
- - Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, arbitrary **BINARY** data, and **ASCII**
7
+ - Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, **ISO-8859-X**, **Windows-125X**, arbitrary **BINARY** data, and **ASCII**
8
8
  - Highlights invalid encodings
9
9
 
10
+ ## Color Coding
11
+
12
+ Each byte of the given string is highlighted using the following mechanism:
13
+
14
+ - Red for invalid bytes
15
+ - Orange for unassigned bytes/characters
16
+ - Blue for control characters
17
+ - Light blue for blanks
18
+ - Random color for all other characters
19
+
10
20
  ## Setup
11
21
 
12
22
  Make sure you have Ruby installed and installing gems works properly. Then do:
@@ -114,16 +124,17 @@ Example in Ruby: `unibits "🌫 Idiosyncrätic ℜսᖯʏ", encoding: 'ascii'
114
124
 
115
125
  ### BINARY
116
126
 
117
- (not possible to produce invalid binary strings)
127
+ Not possible to produce invalid binary strings
118
128
 
119
129
  ## Notes
120
130
 
121
131
  Also see
122
132
 
133
+ - [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
134
+ - [Characteristics gem](https://github.com/janlelis/characteristics)
123
135
  - [UTF-8 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-8#Description)
124
136
  - [UTF-16 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-16#Description)
125
137
  - [UTF-32 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-32)
126
- - [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
127
138
  - [Difference between BINARY and ASCII](http://idiosyncratic-ruby.com/56-us-ascii-8bit.html)
128
139
  - [Unicode Micro Libraries for Ruby](https://github.com/janlelis/unicode-x)
129
140
 
@@ -4,7 +4,7 @@ require_relative "unibits/symbolify"
4
4
  require "io/console"
5
5
  require "paint"
6
6
  require "unicode/display_width"
7
- require "unicode/categories"
7
+ require "characteristics"
8
8
 
9
9
  module Unibits
10
10
  SUPPORTED_ENCODINGS = [
@@ -15,6 +15,8 @@ module Unibits
15
15
  'UTF-32BE',
16
16
  'ASCII-8BIT',
17
17
  'US-ASCII',
18
+ /^ISO-8859-/,
19
+ /^Windows-125/,
18
20
  ].freeze
19
21
  DEFAULT_TERMINAL_WIDTH = 80
20
22
 
@@ -49,6 +51,9 @@ module Unibits
49
51
 
50
52
  def self.visualize(string, wide_ambiguous: false, width: nil)
51
53
  cols = width || determine_terminal_cols
54
+ encoding_name = string.encoding.name
55
+
56
+ type = Characteristics.type_from_encoding_name(encoding_name)
52
57
 
53
58
  cp_buffer = [" "]
54
59
  enc_buffer = [" "]
@@ -59,18 +64,10 @@ module Unibits
59
64
 
60
65
  puts
61
66
  string.each_char{ |char|
62
- if char.valid_encoding?
63
- char_valid = true
64
- current_encoding_error = nil
65
- if Unicode::Categories.category(char) == "Cn"
66
- current_color = "#FF5500"
67
- else
68
- current_color = random_color
69
- end
70
- else
71
- char_valid = false
72
- current_color = :red
73
- end
67
+ char_info = Characteristics.create_for_type(char, type)
68
+ current_color = determine_char_color(char_info)
69
+
70
+ current_encoding_error = nil if char_info.valid?
74
71
 
75
72
  char.each_byte.with_index{ |byte, index|
76
73
  if Paint.unpaint(hex_buffer[-1]).bytesize > cols - 12
@@ -82,10 +79,10 @@ module Unibits
82
79
  end
83
80
 
84
81
  if index == 0
85
- if char_valid
82
+ if char_info.valid?
86
83
  codepoint = "U+%04X" % char.ord
87
84
  else
88
- case string.encoding.name
85
+ case encoding_name
89
86
  when "US-ASCII"
90
87
  codepoint = "invalid"
91
88
  when "UTF-8"
@@ -167,7 +164,7 @@ module Unibits
167
164
  when 'UTF-16LE', 'UTF-16BE'
168
165
  if char.bytesize.odd?
169
166
  codepoint = "incompl."
170
- elsif char.b[string.encoding.name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
167
+ elsif char.b[encoding_name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
171
168
  codepoint = "hlf.srg."
172
169
  else
173
170
  codepoint = "invalid"
@@ -185,14 +182,14 @@ module Unibits
185
182
  codepoint.ljust(10), current_color, :bold
186
183
  ]
187
184
 
188
- if char_valid
189
- symbolified_char = symbolify(char)
185
+ symbolified_char = Symbolify.symbolify(char, char_info)
186
+
187
+ if char_info.unicode?
188
+ padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
190
189
  else
191
- symbolified_char = "�"
190
+ padding = 10 - symbolified_char.size
192
191
  end
193
192
 
194
- padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
195
-
196
193
  enc_buffer[-1] << Paint[
197
194
  symbolified_char, current_color
198
195
  ]
@@ -208,11 +205,11 @@ module Unibits
208
205
 
209
206
  bin_byte_complete = byte.to_s(2).rjust(8, "0")
210
207
 
211
- if !char_valid
208
+ if !char_info.valid?
212
209
  bin_byte_1 = bin_byte_complete
213
210
  bin_byte_2 = ""
214
211
  else
215
- case string.encoding.name
212
+ case encoding_name
216
213
  when 'US-ASCII'
217
214
  bin_byte_1 = bin_byte_complete[0...1]
218
215
  bin_byte_2 = bin_byte_complete[1...8]
@@ -253,6 +250,9 @@ module Unibits
253
250
  when 'UTF-32LE', 'UTF-32BE'
254
251
  bin_byte_1 = ""
255
252
  bin_byte_2 = bin_byte_complete
253
+ when /^(ISO-8859-|Windows-125)/
254
+ bin_byte_1 = ""
255
+ bin_byte_2 = bin_byte_complete
256
256
  end
257
257
  end
258
258
 
@@ -268,25 +268,34 @@ module Unibits
268
268
  }
269
269
  }
270
270
 
271
- if string.encoding.name[0, 3] == "UTF"
271
+ if type == :unicode
272
272
  enc_buffer.zip(cp_buffer, hex_buffer, bin_buffer, separator).flatten.join("\n")
273
273
  else
274
274
  enc_buffer.zip(hex_buffer, bin_buffer, separator).flatten.join("\n")
275
275
  end
276
276
  end
277
277
 
278
- def self.random_color
279
- "%.2x%.2x%.2x" %[rand(90) + 60, rand(90) + 60, rand(90) + 60]
280
- end
281
-
282
- def self.symbolify(char)
283
- return char.inspect unless char.encoding.name[0, 3] == "UTF"
284
- Symbolify.symbolify(char).encode('UTF-8')
285
- end
286
-
287
278
  def self.determine_terminal_cols
288
279
  STDIN.winsize[1] || DEFAULT_TERMINAL_WIDTH
289
280
  rescue Errno::ENOTTY
290
281
  return DEFAULT_TERMINAL_WIDTH
291
282
  end
283
+
284
+ def self.determine_char_color(char_info)
285
+ if !char_info.valid?
286
+ "#FF0000"
287
+ elsif !char_info.assigned?
288
+ "#FF5500"
289
+ elsif char_info.control?
290
+ "#0000FF"
291
+ elsif char_info.blank?
292
+ "#33AADD"
293
+ else
294
+ random_color
295
+ end
296
+ end
297
+
298
+ def self.random_color
299
+ "%.2x%.2x%.2x" % [rand(90) + 60, rand(90) + 60, rand(90) + 60]
300
+ end
292
301
  end
@@ -1,351 +1,442 @@
1
- require "unicode/categories"
2
-
3
1
  module Unibits
4
2
  module Symbolify
3
+ NO_UTF8_CONVERTER = /^Windows-1258/
4
+ ASCII_CHARS = "\x20-\x7E".freeze
5
5
  ASCII_CONTROL_CODEPOINTS = "\x00-\x1F\x7F".freeze
6
6
  ASCII_CONTROL_SYMBOLS = "\u{2400}-\u{241F}\u{2421}".freeze
7
- ASCII_CHARS = "\x20-\x7E".freeze
8
- TAG_START = "\u{E0001}".freeze
9
- TAG_START_SYMBOL = "LANG TAG".freeze
10
- TAG_SPACE = "\u{E0020}".freeze
11
- TAG_SPACE_SYMBOL = "TAG ␠".freeze
12
7
  TAGS = "\u{E0021}-\u{E007E}".freeze
13
- TAG_DELETE = "\u{E007F}".freeze
14
- TAG_DELETE_SYMBOL = "TAG ␡".freeze
8
+
9
+ CONTROL_C0_SYMBOLS = [
10
+ "␀",
11
+ "␁",
12
+ "␂",
13
+ "␃",
14
+ "␄",
15
+ "␅",
16
+ "␆",
17
+ "␇",
18
+ "␈",
19
+ "␉",
20
+ "␊",
21
+ "␋",
22
+ "␌",
23
+ "␍",
24
+ "␎",
25
+ "␏",
26
+ "␐",
27
+ "␑",
28
+ "␒",
29
+ "␓",
30
+ "␔",
31
+ "␕",
32
+ "␖",
33
+ "␗",
34
+ "␘",
35
+ "␙",
36
+ "␚",
37
+ "␛",
38
+ "␜",
39
+ "␝",
40
+ "␞",
41
+ "␟",
42
+ ]
43
+
44
+ CONTROL_DELETE_SYMBOL = "␡"
45
+
46
+ CONTROL_C1_NAMES = {
47
+ 0x80 => "PAD",
48
+ 0x81 => "HOP",
49
+ 0x82 => "BPH",
50
+ 0x83 => "NBH",
51
+ 0x84 => "IND",
52
+ 0x85 => "NEL",
53
+ 0x86 => "SSA",
54
+ 0x87 => "ESA",
55
+ 0x88 => "HTS",
56
+ 0x89 => "HTJ",
57
+ 0x8A => "VTS",
58
+ 0x8B => "PLD",
59
+ 0x8C => "PLU",
60
+ 0x8D => "RI",
61
+ 0x8E => "SS2",
62
+ 0x8F => "SS3",
63
+ 0x90 => "DCS",
64
+ 0x91 => "PU1",
65
+ 0x92 => "PU2",
66
+ 0x93 => "STS",
67
+ 0x94 => "CCH",
68
+ 0x95 => "MW",
69
+ 0x96 => "SPA",
70
+ 0x97 => "EPA",
71
+ 0x98 => "SOS",
72
+ 0x99 => "SGC",
73
+ 0x9A => "SCI",
74
+ 0x9B => "CSI",
75
+ 0x9C => "ST",
76
+ 0x9D => "OSC",
77
+ 0x9E => "PM",
78
+ 0x9F => "APC",
79
+ }
80
+
15
81
  INTERESTING_CODEPOINTS = {
16
- "\u{0080}" => "PAD",
17
- "\u{0081}" => "HOP",
18
- "\u{0082}" => "BPH",
19
- "\u{0083}" => "NBH",
20
- "\u{0084}" => "IND",
21
- "\u{0085}" => "NEL",
22
- "\u{0086}" => "SSA",
23
- "\u{0087}" => "ESA",
24
- "\u{0088}" => "HTS",
25
- "\u{0089}" => "HTJ",
26
- "\u{008A}" => "VTS",
27
- "\u{008B}" => "PLD",
28
- "\u{008C}" => "PLU",
29
- "\u{008D}" => "RI",
30
- "\u{008E}" => "SS2",
31
- "\u{008F}" => "SS3",
32
- "\u{0090}" => "DCS",
33
- "\u{0091}" => "PU1",
34
- "\u{0092}" => "PU2",
35
- "\u{0093}" => "STS",
36
- "\u{0094}" => "CCH",
37
- "\u{0095}" => "MW",
38
- "\u{0096}" => "SPA",
39
- "\u{0097}" => "EPA",
40
- "\u{0098}" => "SOS",
41
- "\u{0099}" => "SGC",
42
- "\u{009A}" => "SCI",
43
- "\u{009B}" => "CSI",
44
- "\u{009C}" => "ST",
45
- "\u{009D}" => "OSC",
46
- "\u{009E}" => "PM",
47
- "\u{009F}" => "APC",
82
+ 0x200E => "LRM",
83
+ 0x200F => "RLM",
84
+ 0x202A => "LRE",
85
+ 0x202B => "RLE",
86
+ 0x202C => "PDF",
87
+ 0x202D => "LRO",
88
+ 0x202E => "RLO",
89
+ 0x2066 => "LRI",
90
+ 0x2067 => "RLI",
91
+ 0x2068 => "FSI",
92
+ 0x2069 => "PDI",
48
93
 
49
- "\u{200E}" => "LRM",
50
- "\u{200F}" => "RLM",
51
- "\u{202A}" => "LRE",
52
- "\u{202B}" => "RLE",
53
- "\u{202C}" => "PDF",
54
- "\u{202D}" => "LRO",
55
- "\u{202E}" => "RLO",
56
- "\u{2066}" => "LRI",
57
- "\u{2067}" => "RLI",
58
- "\u{2068}" => "FSI",
59
- "\u{2069}" => "PDI",
94
+ 0xFE00 => "VS1",
95
+ 0xFE01 => "VS2",
96
+ 0xFE02 => "VS3",
97
+ 0xFE03 => "VS4",
98
+ 0xFE04 => "VS5",
99
+ 0xFE05 => "VS6",
100
+ 0xFE06 => "VS7",
101
+ 0xFE07 => "VS8",
102
+ 0xFE08 => "VS9",
103
+ 0xFE09 => "VS10",
104
+ 0xFE0A => "VS11",
105
+ 0xFE0B => "VS12",
106
+ 0xFE0C => "VS13",
107
+ 0xFE0D => "VS14",
108
+ 0xFE0E => "VS15",
109
+ 0xFE0F => "VS16",
60
110
 
61
- "\u{FE00}" => "VS1",
62
- "\u{FE01}" => "VS2",
63
- "\u{FE02}" => "VS3",
64
- "\u{FE03}" => "VS4",
65
- "\u{FE04}" => "VS5",
66
- "\u{FE05}" => "VS6",
67
- "\u{FE06}" => "VS7",
68
- "\u{FE07}" => "VS8",
69
- "\u{FE08}" => "VS9",
70
- "\u{FE09}" => "VS10",
71
- "\u{FE0A}" => "VS11",
72
- "\u{FE0B}" => "VS12",
73
- "\u{FE0C}" => "VS13",
74
- "\u{FE0D}" => "VS14",
75
- "\u{FE0E}" => "VS15",
76
- "\u{FE0F}" => "VS16",
111
+ 0xE0001 => "LANG TAG",
112
+ 0xE0020 => "TAG ␠",
113
+ 0xE007F => "TAG ␡",
77
114
 
78
- "\u{E0100}" => "VS17",
79
- "\u{E0101}" => "VS18",
80
- "\u{E0102}" => "VS19",
81
- "\u{E0103}" => "VS20",
82
- "\u{E0104}" => "VS21",
83
- "\u{E0105}" => "VS22",
84
- "\u{E0106}" => "VS23",
85
- "\u{E0107}" => "VS24",
86
- "\u{E0108}" => "VS25",
87
- "\u{E0109}" => "VS26",
88
- "\u{E010A}" => "VS27",
89
- "\u{E010B}" => "VS28",
90
- "\u{E010C}" => "VS29",
91
- "\u{E010D}" => "VS30",
92
- "\u{E010E}" => "VS31",
93
- "\u{E010F}" => "VS32",
94
- "\u{E0110}" => "VS33",
95
- "\u{E0111}" => "VS34",
96
- "\u{E0112}" => "VS35",
97
- "\u{E0113}" => "VS36",
98
- "\u{E0114}" => "VS37",
99
- "\u{E0115}" => "VS38",
100
- "\u{E0116}" => "VS39",
101
- "\u{E0117}" => "VS40",
102
- "\u{E0118}" => "VS41",
103
- "\u{E0119}" => "VS42",
104
- "\u{E011A}" => "VS43",
105
- "\u{E011B}" => "VS44",
106
- "\u{E011C}" => "VS45",
107
- "\u{E011D}" => "VS46",
108
- "\u{E011E}" => "VS47",
109
- "\u{E011F}" => "VS48",
110
- "\u{E0120}" => "VS49",
111
- "\u{E0121}" => "VS50",
112
- "\u{E0122}" => "VS51",
113
- "\u{E0123}" => "VS52",
114
- "\u{E0124}" => "VS53",
115
- "\u{E0125}" => "VS54",
116
- "\u{E0126}" => "VS55",
117
- "\u{E0127}" => "VS56",
118
- "\u{E0128}" => "VS57",
119
- "\u{E0129}" => "VS58",
120
- "\u{E012A}" => "VS59",
121
- "\u{E012B}" => "VS60",
122
- "\u{E012C}" => "VS61",
123
- "\u{E012D}" => "VS62",
124
- "\u{E012E}" => "VS63",
125
- "\u{E012F}" => "VS64",
126
- "\u{E0130}" => "VS65",
127
- "\u{E0131}" => "VS66",
128
- "\u{E0132}" => "VS67",
129
- "\u{E0133}" => "VS68",
130
- "\u{E0134}" => "VS69",
131
- "\u{E0135}" => "VS70",
132
- "\u{E0136}" => "VS71",
133
- "\u{E0137}" => "VS72",
134
- "\u{E0138}" => "VS73",
135
- "\u{E0139}" => "VS74",
136
- "\u{E013A}" => "VS75",
137
- "\u{E013B}" => "VS76",
138
- "\u{E013C}" => "VS77",
139
- "\u{E013D}" => "VS78",
140
- "\u{E013E}" => "VS79",
141
- "\u{E013F}" => "VS80",
142
- "\u{E0140}" => "VS81",
143
- "\u{E0141}" => "VS82",
144
- "\u{E0142}" => "VS83",
145
- "\u{E0143}" => "VS84",
146
- "\u{E0144}" => "VS85",
147
- "\u{E0145}" => "VS86",
148
- "\u{E0146}" => "VS87",
149
- "\u{E0147}" => "VS88",
150
- "\u{E0148}" => "VS89",
151
- "\u{E0149}" => "VS90",
152
- "\u{E014A}" => "VS91",
153
- "\u{E014B}" => "VS92",
154
- "\u{E014C}" => "VS93",
155
- "\u{E014D}" => "VS94",
156
- "\u{E014E}" => "VS95",
157
- "\u{E014F}" => "VS96",
158
- "\u{E0150}" => "VS97",
159
- "\u{E0151}" => "VS98",
160
- "\u{E0152}" => "VS99",
161
- "\u{E0153}" => "VS100",
162
- "\u{E0154}" => "VS101",
163
- "\u{E0155}" => "VS102",
164
- "\u{E0156}" => "VS103",
165
- "\u{E0157}" => "VS104",
166
- "\u{E0158}" => "VS105",
167
- "\u{E0159}" => "VS106",
168
- "\u{E015A}" => "VS107",
169
- "\u{E015B}" => "VS108",
170
- "\u{E015C}" => "VS109",
171
- "\u{E015D}" => "VS110",
172
- "\u{E015E}" => "VS111",
173
- "\u{E015F}" => "VS112",
174
- "\u{E0160}" => "VS113",
175
- "\u{E0161}" => "VS114",
176
- "\u{E0162}" => "VS115",
177
- "\u{E0163}" => "VS116",
178
- "\u{E0164}" => "VS117",
179
- "\u{E0165}" => "VS118",
180
- "\u{E0166}" => "VS119",
181
- "\u{E0167}" => "VS120",
182
- "\u{E0168}" => "VS121",
183
- "\u{E0169}" => "VS122",
184
- "\u{E016A}" => "VS123",
185
- "\u{E016B}" => "VS124",
186
- "\u{E016C}" => "VS125",
187
- "\u{E016D}" => "VS126",
188
- "\u{E016E}" => "VS127",
189
- "\u{E016F}" => "VS128",
190
- "\u{E0170}" => "VS129",
191
- "\u{E0171}" => "VS130",
192
- "\u{E0172}" => "VS131",
193
- "\u{E0173}" => "VS132",
194
- "\u{E0174}" => "VS133",
195
- "\u{E0175}" => "VS134",
196
- "\u{E0176}" => "VS135",
197
- "\u{E0177}" => "VS136",
198
- "\u{E0178}" => "VS137",
199
- "\u{E0179}" => "VS138",
200
- "\u{E017A}" => "VS139",
201
- "\u{E017B}" => "VS140",
202
- "\u{E017C}" => "VS141",
203
- "\u{E017D}" => "VS142",
204
- "\u{E017E}" => "VS143",
205
- "\u{E017F}" => "VS144",
206
- "\u{E0180}" => "VS145",
207
- "\u{E0181}" => "VS146",
208
- "\u{E0182}" => "VS147",
209
- "\u{E0183}" => "VS148",
210
- "\u{E0184}" => "VS149",
211
- "\u{E0185}" => "VS150",
212
- "\u{E0186}" => "VS151",
213
- "\u{E0187}" => "VS152",
214
- "\u{E0188}" => "VS153",
215
- "\u{E0189}" => "VS154",
216
- "\u{E018A}" => "VS155",
217
- "\u{E018B}" => "VS156",
218
- "\u{E018C}" => "VS157",
219
- "\u{E018D}" => "VS158",
220
- "\u{E018E}" => "VS159",
221
- "\u{E018F}" => "VS160",
222
- "\u{E0190}" => "VS161",
223
- "\u{E0191}" => "VS162",
224
- "\u{E0192}" => "VS163",
225
- "\u{E0193}" => "VS164",
226
- "\u{E0194}" => "VS165",
227
- "\u{E0195}" => "VS166",
228
- "\u{E0196}" => "VS167",
229
- "\u{E0197}" => "VS168",
230
- "\u{E0198}" => "VS169",
231
- "\u{E0199}" => "VS170",
232
- "\u{E019A}" => "VS171",
233
- "\u{E019B}" => "VS172",
234
- "\u{E019C}" => "VS173",
235
- "\u{E019D}" => "VS174",
236
- "\u{E019E}" => "VS175",
237
- "\u{E019F}" => "VS176",
238
- "\u{E01A0}" => "VS177",
239
- "\u{E01A1}" => "VS178",
240
- "\u{E01A2}" => "VS179",
241
- "\u{E01A3}" => "VS180",
242
- "\u{E01A4}" => "VS181",
243
- "\u{E01A5}" => "VS182",
244
- "\u{E01A6}" => "VS183",
245
- "\u{E01A7}" => "VS184",
246
- "\u{E01A8}" => "VS185",
247
- "\u{E01A9}" => "VS186",
248
- "\u{E01AA}" => "VS187",
249
- "\u{E01AB}" => "VS188",
250
- "\u{E01AC}" => "VS189",
251
- "\u{E01AD}" => "VS190",
252
- "\u{E01AE}" => "VS191",
253
- "\u{E01AF}" => "VS192",
254
- "\u{E01B0}" => "VS193",
255
- "\u{E01B1}" => "VS194",
256
- "\u{E01B2}" => "VS195",
257
- "\u{E01B3}" => "VS196",
258
- "\u{E01B4}" => "VS197",
259
- "\u{E01B5}" => "VS198",
260
- "\u{E01B6}" => "VS199",
261
- "\u{E01B7}" => "VS200",
262
- "\u{E01B8}" => "VS201",
263
- "\u{E01B9}" => "VS202",
264
- "\u{E01BA}" => "VS203",
265
- "\u{E01BB}" => "VS204",
266
- "\u{E01BC}" => "VS205",
267
- "\u{E01BD}" => "VS206",
268
- "\u{E01BE}" => "VS207",
269
- "\u{E01BF}" => "VS208",
270
- "\u{E01C0}" => "VS209",
271
- "\u{E01C1}" => "VS210",
272
- "\u{E01C2}" => "VS211",
273
- "\u{E01C3}" => "VS212",
274
- "\u{E01C4}" => "VS213",
275
- "\u{E01C5}" => "VS214",
276
- "\u{E01C6}" => "VS215",
277
- "\u{E01C7}" => "VS216",
278
- "\u{E01C8}" => "VS217",
279
- "\u{E01C9}" => "VS218",
280
- "\u{E01CA}" => "VS219",
281
- "\u{E01CB}" => "VS220",
282
- "\u{E01CC}" => "VS221",
283
- "\u{E01CD}" => "VS222",
284
- "\u{E01CE}" => "VS223",
285
- "\u{E01CF}" => "VS224",
286
- "\u{E01D0}" => "VS225",
287
- "\u{E01D1}" => "VS226",
288
- "\u{E01D2}" => "VS227",
289
- "\u{E01D3}" => "VS228",
290
- "\u{E01D4}" => "VS229",
291
- "\u{E01D5}" => "VS230",
292
- "\u{E01D6}" => "VS231",
293
- "\u{E01D7}" => "VS232",
294
- "\u{E01D8}" => "VS233",
295
- "\u{E01D9}" => "VS234",
296
- "\u{E01DA}" => "VS235",
297
- "\u{E01DB}" => "VS236",
298
- "\u{E01DC}" => "VS237",
299
- "\u{E01DD}" => "VS238",
300
- "\u{E01DE}" => "VS239",
301
- "\u{E01DF}" => "VS240",
302
- "\u{E01E0}" => "VS241",
303
- "\u{E01E1}" => "VS242",
304
- "\u{E01E2}" => "VS243",
305
- "\u{E01E3}" => "VS244",
306
- "\u{E01E4}" => "VS245",
307
- "\u{E01E5}" => "VS246",
308
- "\u{E01E6}" => "VS247",
309
- "\u{E01E7}" => "VS248",
310
- "\u{E01E8}" => "VS249",
311
- "\u{E01E9}" => "VS250",
312
- "\u{E01EA}" => "VS251",
313
- "\u{E01EB}" => "VS252",
314
- "\u{E01EC}" => "VS253",
315
- "\u{E01ED}" => "VS254",
316
- "\u{E01EE}" => "VS255",
317
- "\u{E01EF}" => "VS256",
115
+ 0xE0100 => "VS17",
116
+ 0xE0101 => "VS18",
117
+ 0xE0102 => "VS19",
118
+ 0xE0103 => "VS20",
119
+ 0xE0104 => "VS21",
120
+ 0xE0105 => "VS22",
121
+ 0xE0106 => "VS23",
122
+ 0xE0107 => "VS24",
123
+ 0xE0108 => "VS25",
124
+ 0xE0109 => "VS26",
125
+ 0xE010A => "VS27",
126
+ 0xE010B => "VS28",
127
+ 0xE010C => "VS29",
128
+ 0xE010D => "VS30",
129
+ 0xE010E => "VS31",
130
+ 0xE010F => "VS32",
131
+ 0xE0110 => "VS33",
132
+ 0xE0111 => "VS34",
133
+ 0xE0112 => "VS35",
134
+ 0xE0113 => "VS36",
135
+ 0xE0114 => "VS37",
136
+ 0xE0115 => "VS38",
137
+ 0xE0116 => "VS39",
138
+ 0xE0117 => "VS40",
139
+ 0xE0118 => "VS41",
140
+ 0xE0119 => "VS42",
141
+ 0xE011A => "VS43",
142
+ 0xE011B => "VS44",
143
+ 0xE011C => "VS45",
144
+ 0xE011D => "VS46",
145
+ 0xE011E => "VS47",
146
+ 0xE011F => "VS48",
147
+ 0xE0120 => "VS49",
148
+ 0xE0121 => "VS50",
149
+ 0xE0122 => "VS51",
150
+ 0xE0123 => "VS52",
151
+ 0xE0124 => "VS53",
152
+ 0xE0125 => "VS54",
153
+ 0xE0126 => "VS55",
154
+ 0xE0127 => "VS56",
155
+ 0xE0128 => "VS57",
156
+ 0xE0129 => "VS58",
157
+ 0xE012A => "VS59",
158
+ 0xE012B => "VS60",
159
+ 0xE012C => "VS61",
160
+ 0xE012D => "VS62",
161
+ 0xE012E => "VS63",
162
+ 0xE012F => "VS64",
163
+ 0xE0130 => "VS65",
164
+ 0xE0131 => "VS66",
165
+ 0xE0132 => "VS67",
166
+ 0xE0133 => "VS68",
167
+ 0xE0134 => "VS69",
168
+ 0xE0135 => "VS70",
169
+ 0xE0136 => "VS71",
170
+ 0xE0137 => "VS72",
171
+ 0xE0138 => "VS73",
172
+ 0xE0139 => "VS74",
173
+ 0xE013A => "VS75",
174
+ 0xE013B => "VS76",
175
+ 0xE013C => "VS77",
176
+ 0xE013D => "VS78",
177
+ 0xE013E => "VS79",
178
+ 0xE013F => "VS80",
179
+ 0xE0140 => "VS81",
180
+ 0xE0141 => "VS82",
181
+ 0xE0142 => "VS83",
182
+ 0xE0143 => "VS84",
183
+ 0xE0144 => "VS85",
184
+ 0xE0145 => "VS86",
185
+ 0xE0146 => "VS87",
186
+ 0xE0147 => "VS88",
187
+ 0xE0148 => "VS89",
188
+ 0xE0149 => "VS90",
189
+ 0xE014A => "VS91",
190
+ 0xE014B => "VS92",
191
+ 0xE014C => "VS93",
192
+ 0xE014D => "VS94",
193
+ 0xE014E => "VS95",
194
+ 0xE014F => "VS96",
195
+ 0xE0150 => "VS97",
196
+ 0xE0151 => "VS98",
197
+ 0xE0152 => "VS99",
198
+ 0xE0153 => "VS100",
199
+ 0xE0154 => "VS101",
200
+ 0xE0155 => "VS102",
201
+ 0xE0156 => "VS103",
202
+ 0xE0157 => "VS104",
203
+ 0xE0158 => "VS105",
204
+ 0xE0159 => "VS106",
205
+ 0xE015A => "VS107",
206
+ 0xE015B => "VS108",
207
+ 0xE015C => "VS109",
208
+ 0xE015D => "VS110",
209
+ 0xE015E => "VS111",
210
+ 0xE015F => "VS112",
211
+ 0xE0160 => "VS113",
212
+ 0xE0161 => "VS114",
213
+ 0xE0162 => "VS115",
214
+ 0xE0163 => "VS116",
215
+ 0xE0164 => "VS117",
216
+ 0xE0165 => "VS118",
217
+ 0xE0166 => "VS119",
218
+ 0xE0167 => "VS120",
219
+ 0xE0168 => "VS121",
220
+ 0xE0169 => "VS122",
221
+ 0xE016A => "VS123",
222
+ 0xE016B => "VS124",
223
+ 0xE016C => "VS125",
224
+ 0xE016D => "VS126",
225
+ 0xE016E => "VS127",
226
+ 0xE016F => "VS128",
227
+ 0xE0170 => "VS129",
228
+ 0xE0171 => "VS130",
229
+ 0xE0172 => "VS131",
230
+ 0xE0173 => "VS132",
231
+ 0xE0174 => "VS133",
232
+ 0xE0175 => "VS134",
233
+ 0xE0176 => "VS135",
234
+ 0xE0177 => "VS136",
235
+ 0xE0178 => "VS137",
236
+ 0xE0179 => "VS138",
237
+ 0xE017A => "VS139",
238
+ 0xE017B => "VS140",
239
+ 0xE017C => "VS141",
240
+ 0xE017D => "VS142",
241
+ 0xE017E => "VS143",
242
+ 0xE017F => "VS144",
243
+ 0xE0180 => "VS145",
244
+ 0xE0181 => "VS146",
245
+ 0xE0182 => "VS147",
246
+ 0xE0183 => "VS148",
247
+ 0xE0184 => "VS149",
248
+ 0xE0185 => "VS150",
249
+ 0xE0186 => "VS151",
250
+ 0xE0187 => "VS152",
251
+ 0xE0188 => "VS153",
252
+ 0xE0189 => "VS154",
253
+ 0xE018A => "VS155",
254
+ 0xE018B => "VS156",
255
+ 0xE018C => "VS157",
256
+ 0xE018D => "VS158",
257
+ 0xE018E => "VS159",
258
+ 0xE018F => "VS160",
259
+ 0xE0190 => "VS161",
260
+ 0xE0191 => "VS162",
261
+ 0xE0192 => "VS163",
262
+ 0xE0193 => "VS164",
263
+ 0xE0194 => "VS165",
264
+ 0xE0195 => "VS166",
265
+ 0xE0196 => "VS167",
266
+ 0xE0197 => "VS168",
267
+ 0xE0198 => "VS169",
268
+ 0xE0199 => "VS170",
269
+ 0xE019A => "VS171",
270
+ 0xE019B => "VS172",
271
+ 0xE019C => "VS173",
272
+ 0xE019D => "VS174",
273
+ 0xE019E => "VS175",
274
+ 0xE019F => "VS176",
275
+ 0xE01A0 => "VS177",
276
+ 0xE01A1 => "VS178",
277
+ 0xE01A2 => "VS179",
278
+ 0xE01A3 => "VS180",
279
+ 0xE01A4 => "VS181",
280
+ 0xE01A5 => "VS182",
281
+ 0xE01A6 => "VS183",
282
+ 0xE01A7 => "VS184",
283
+ 0xE01A8 => "VS185",
284
+ 0xE01A9 => "VS186",
285
+ 0xE01AA => "VS187",
286
+ 0xE01AB => "VS188",
287
+ 0xE01AC => "VS189",
288
+ 0xE01AD => "VS190",
289
+ 0xE01AE => "VS191",
290
+ 0xE01AF => "VS192",
291
+ 0xE01B0 => "VS193",
292
+ 0xE01B1 => "VS194",
293
+ 0xE01B2 => "VS195",
294
+ 0xE01B3 => "VS196",
295
+ 0xE01B4 => "VS197",
296
+ 0xE01B5 => "VS198",
297
+ 0xE01B6 => "VS199",
298
+ 0xE01B7 => "VS200",
299
+ 0xE01B8 => "VS201",
300
+ 0xE01B9 => "VS202",
301
+ 0xE01BA => "VS203",
302
+ 0xE01BB => "VS204",
303
+ 0xE01BC => "VS205",
304
+ 0xE01BD => "VS206",
305
+ 0xE01BE => "VS207",
306
+ 0xE01BF => "VS208",
307
+ 0xE01C0 => "VS209",
308
+ 0xE01C1 => "VS210",
309
+ 0xE01C2 => "VS211",
310
+ 0xE01C3 => "VS212",
311
+ 0xE01C4 => "VS213",
312
+ 0xE01C5 => "VS214",
313
+ 0xE01C6 => "VS215",
314
+ 0xE01C7 => "VS216",
315
+ 0xE01C8 => "VS217",
316
+ 0xE01C9 => "VS218",
317
+ 0xE01CA => "VS219",
318
+ 0xE01CB => "VS220",
319
+ 0xE01CC => "VS221",
320
+ 0xE01CD => "VS222",
321
+ 0xE01CE => "VS223",
322
+ 0xE01CF => "VS224",
323
+ 0xE01D0 => "VS225",
324
+ 0xE01D1 => "VS226",
325
+ 0xE01D2 => "VS227",
326
+ 0xE01D3 => "VS228",
327
+ 0xE01D4 => "VS229",
328
+ 0xE01D5 => "VS230",
329
+ 0xE01D6 => "VS231",
330
+ 0xE01D7 => "VS232",
331
+ 0xE01D8 => "VS233",
332
+ 0xE01D9 => "VS234",
333
+ 0xE01DA => "VS235",
334
+ 0xE01DB => "VS236",
335
+ 0xE01DC => "VS237",
336
+ 0xE01DD => "VS238",
337
+ 0xE01DE => "VS239",
338
+ 0xE01DF => "VS240",
339
+ 0xE01E0 => "VS241",
340
+ 0xE01E1 => "VS242",
341
+ 0xE01E2 => "VS243",
342
+ 0xE01E3 => "VS244",
343
+ 0xE01E4 => "VS245",
344
+ 0xE01E5 => "VS246",
345
+ 0xE01E6 => "VS247",
346
+ 0xE01E7 => "VS248",
347
+ 0xE01E8 => "VS249",
348
+ 0xE01E9 => "VS250",
349
+ 0xE01EA => "VS251",
350
+ 0xE01EB => "VS252",
351
+ 0xE01EC => "VS253",
352
+ 0xE01ED => "VS254",
353
+ 0xE01EE => "VS255",
354
+ 0xE01EF => "VS256",
318
355
  }.freeze
319
- COULD_BE_WHITESPACE = '[\p{Space}­᠎​‌‍⁠⁡⁢⁣⁤⠀𛲠𛲡𛲢𛲣𝅙𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺]'.freeze
320
356
 
321
- def self.symbolify(char, encoding = char.encoding)
322
- return "n/a" if Unicode::Categories.category(char) == "Cn"
357
+ def self.symbolify(char, char_info)
358
+ if !char_info.valid?
359
+ "�"
360
+ else
361
+ case char_info
362
+ when UnicodeCharacteristics
363
+ Symbolify.unicode(char, char_info)
364
+ when ByteCharacteristics
365
+ Symbolify.byte(char, char_info)
366
+ when AsciiCharacteristics
367
+ Symbolify.ascii(char, char_info)
368
+ else
369
+ Symbolify.binary(char)
370
+ end
371
+ end
372
+ end
373
+
374
+ def self.unicode(char, char_info)
375
+ return "n/a" if !char_info.assigned?
323
376
 
324
377
  char = char.dup
378
+ ord = char.ord
379
+ encoding = char_info.encoding
325
380
 
326
- char.tr!(
327
- ASCII_CONTROL_CODEPOINTS.encode(encoding),
328
- ASCII_CONTROL_SYMBOLS.encode(encoding)
329
- )
330
- char.gsub!(
331
- Regexp.compile(COULD_BE_WHITESPACE.encode(encoding)),
332
- ']\0['.encode(encoding)
333
- )
381
+ if char_info.delete?
382
+ char = CONTROL_DELETE_SYMBOL
383
+ elsif char_info.c0?
384
+ char = CONTROL_C0_SYMBOLS[ord]
385
+ elsif char_info.c1?
386
+ char = CONTROL_C1_NAMES[ord]
387
+ elsif char_info.blank?
388
+ char = "]".encode(encoding) + char + "[".encode(encoding)
389
+ elsif ord > 917536 && ord < 917631
390
+ char = "TAG ".encode(encoding) +
391
+ char.tr(TAGS.encode(encoding), ASCII_CHARS.encode(encoding))
392
+ else
393
+ char = INTERESTING_CODEPOINTS[char.ord] || char
394
+ end
395
+
396
+ char.encode("UTF-8")
397
+ end
334
398
 
335
- INTERESTING_CODEPOINTS.each{ |cp, desc|
336
- char.gsub! Regexp.compile(cp.encode(encoding)), desc.encode(encoding)
337
- }
338
- char.gsub! TAG_START.encode(encoding), TAG_START_SYMBOL.encode(encoding)
339
- char.gsub! TAG_SPACE.encode(encoding), TAG_SPACE_SYMBOL.encode(encoding)
340
- char.gsub! TAG_DELETE.encode(encoding), TAG_DELETE_SYMBOL.encode(encoding)
399
+ def self.byte(char, char_info)
400
+ return "n/a" if !char_info.assigned?
341
401
 
342
402
  ord = char.ord
343
- if ord > 917536 && ord < 917631
344
- char.tr!(TAGS.encode(encoding), ASCII_CHARS.encode(encoding))
345
- char = "TAG ".encode(encoding) + char
403
+ encoding = char_info.encoding
404
+ no_converter = !!(NO_UTF8_CONVERTER =~ encoding.name)
405
+ treat_char_unconverted = false
406
+
407
+ if char_info.delete?
408
+ char = CONTROL_DELETE_SYMBOL
409
+ elsif char_info.c0?
410
+ char = CONTROL_C0_SYMBOLS[ord]
411
+ elsif char_info.c1?
412
+ char = CONTROL_C1_NAMES[ord]
413
+ elsif no_converter
414
+ treat_char_unconverted = true
415
+ elsif char_info.blank?
416
+ char = "]".encode(encoding) + char + "[".encode(encoding)
417
+ end
418
+
419
+ if no_converter && treat_char_unconverted
420
+ char.inspect
421
+ else
422
+ char.encode("UTF-8")
423
+ end
424
+ end
425
+
426
+ def self.ascii(char, char_info)
427
+ if char_info.delete?
428
+ char = CONTROL_DELETE_SYMBOL
429
+ elsif char_info.c0?
430
+ char = CONTROL_C0_SYMBOLS[char.ord]
431
+ elsif char_info.blank?
432
+ char = "]" + char + "["
346
433
  end
347
434
 
348
435
  char
349
436
  end
437
+
438
+ def self.binary(char)
439
+ char.inspect
440
+ end
350
441
  end
351
442
  end
@@ -1,3 +1,3 @@
1
1
  module Unibits
2
- VERSION = "1.3.0".freeze
2
+ VERSION = "2.0.0".freeze
3
3
  end
@@ -67,6 +67,24 @@ describe Unibits do
67
67
  result.must_match "01000011"
68
68
  end
69
69
 
70
+ it "works with 'ISO-8859-' encodings" do
71
+ string = "\xBC Idiosyncr\xE4tic\n\x91".force_encoding("ISO-8859-1")
72
+ result = Paint.unpaint(Unibits.visualize(string))
73
+ result.must_match "BC" # ¼
74
+ result.must_match "E4" # ä
75
+ result.must_match "␊" # \n
76
+ result.must_match "PU1" # C1 name for \x91
77
+ end
78
+
79
+ it "works with 'Windows-125' encodings" do
80
+ string = "\xBC Idiosyncr\xE4tic\n\x81".force_encoding("Windows-1252")
81
+ result = Paint.unpaint(Unibits.visualize(string))
82
+ result.must_match "BC" # ¼
83
+ result.must_match "E4" # ä
84
+ result.must_match "␊" # \n
85
+ result.must_match "n/a" # \x81 is not assigned
86
+ end
87
+
70
88
  describe "invalid UTF-8 encodings" do
71
89
  it "- unexpected continuation byte (1/2)" do
72
90
  string = "abc\x80efg"
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
19
19
 
20
20
  gem.add_dependency 'paint', '>= 0.9', '< 3.0'
21
21
  gem.add_dependency 'unicode-display_width', '~> 1.1'
22
- gem.add_dependency 'unicode-categories', '~> 1.1', '>= 1.1.2'
22
+ gem.add_dependency 'characteristics', '~> 0.2.0'
23
23
  gem.add_dependency 'rationalist', '~> 2.0'
24
24
 
25
25
  gem.required_ruby_version = "~> 2.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unibits
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-09 00:00:00.000000000 Z
11
+ date: 2017-03-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: paint
@@ -45,25 +45,19 @@ dependencies:
45
45
  - !ruby/object:Gem::Version
46
46
  version: '1.1'
47
47
  - !ruby/object:Gem::Dependency
48
- name: unicode-categories
48
+ name: characteristics
49
49
  requirement: !ruby/object:Gem::Requirement
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '1.1'
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 1.1.2
53
+ version: 0.2.0
57
54
  type: :runtime
58
55
  prerelease: false
59
56
  version_requirements: !ruby/object:Gem::Requirement
60
57
  requirements:
61
58
  - - "~>"
62
59
  - !ruby/object:Gem::Version
63
- version: '1.1'
64
- - - ">="
65
- - !ruby/object:Gem::Version
66
- version: 1.1.2
60
+ version: 0.2.0
67
61
  - !ruby/object:Gem::Dependency
68
62
  name: rationalist
69
63
  requirement: !ruby/object:Gem::Requirement