unibits 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b811d8983c85bc87c33a59b4cb7cd5144104a994
4
- data.tar.gz: 63cf6f3b91f26b8ac8b0fca9d90a595662a9f0f4
3
+ metadata.gz: 0d5f49745b4dd8f80a9b4c87c3810b4ddc5dbe66
4
+ data.tar.gz: a00bc08a018d1c1e40d9560f48cf64e53aa4e20b
5
5
  SHA512:
6
- metadata.gz: 65979ed1477c6f5bcbed7b6f74474c1958e582614d91ae0a2ebf5cdf605b31f4263b1cc68e1ff113b21ce572c7025bfacf85db5353b9108e07cf352a3c60e052
7
- data.tar.gz: b4cfbad99ad6257616b96e95f53f21696e545b4b29fafd6c3e212e03bc51f52eddb66c06ddb0b1fe2842a42cb5553f5d1a5ca32b7320aa463cd7bbf9de4f0ad3
6
+ metadata.gz: 865101498c37b0d480846c6eb9a613942becef9f0f0fa02d01ed78ca1dc8bd4d7d3ae10ce3856ab76ded888278e27da6962657de65ea96c464920c5e3c91866d
7
+ data.tar.gz: 39362d02cc5f8b8f0c6df3cc3b7f1bb597eef0eb8e2fb828b16f95de8e6a8e8b41faa13d03a33fcb1d2890b7625a855a75924351dca76832994cf50d1af95e9f
@@ -1,5 +1,13 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 2.0.0
4
+
5
+ * Support more encodings: ISO-8859-X and Windows-125X
6
+ * Add three HANGUL characters (U+115F, U+1160, U+3164) to the list of possible whitespace characters
7
+ * Move character handling to separate gem. It is called "characteristics".
8
+ * Highlight control chars in blue and blanks in light blue
9
+ * Handle encodings that are not convertible to UTF-8
10
+
3
11
  ### 1.3.0
4
12
 
5
13
  * Add variation selectors 17-256 (U+E0100 - U+E01EF)
data/README.md CHANGED
@@ -4,9 +4,19 @@ Ruby library and CLI command that visualizes various Unicode and ASCII encodings
4
4
 
5
5
  - Makes analyzing encodings easier
6
6
  - Helps you with debugging strings
7
- - Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, arbitrary **BINARY** data, and **ASCII**
7
+ - Supports **UTF-8**, **UTF-16LE**/**UTF-16BE**, **UTF-32LE**/**UTF-32BE**, **ISO-8859-X**, **Windows-125X**, arbitrary **BINARY** data, and **ASCII**
8
8
  - Highlights invalid encodings
9
9
 
10
+ ## Color Coding
11
+
12
+ Each byte of the given string is highlighted using the following mechanism:
13
+
14
+ - Red for invalid bytes
15
+ - Orange for unassigned bytes/characters
16
+ - Blue for control characters
17
+ - Light blue for blanks
18
+ - Random color for all other characters
19
+
10
20
  ## Setup
11
21
 
12
22
  Make sure you have Ruby installed and installing gems works properly. Then do:
@@ -114,16 +124,17 @@ Example in Ruby: `unibits "🌫 Idiosyncrätic ℜսᖯʏ", encoding: 'ascii'
114
124
 
115
125
  ### BINARY
116
126
 
117
- (not possible to produce invalid binary strings)
127
+ It is not possible to produce invalid binary strings.
118
128
 
119
129
  ## Notes
120
130
 
121
131
  Also see
122
132
 
133
+ - [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
134
+ - [Characteristics gem](https://github.com/janlelis/characteristics)
123
135
  - [UTF-8 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-8#Description)
124
136
  - [UTF-16 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-16#Description)
125
137
  - [UTF-32 (Wikipedia)](https://en.wikipedia.org/wiki/UTF-32)
126
- - [Ruby's Encoding class](https://ruby-doc.org/core/Encoding.html)
127
138
  - [Difference between BINARY and ASCII](http://idiosyncratic-ruby.com/56-us-ascii-8bit.html)
128
139
  - [Unicode Micro Libraries for Ruby](https://github.com/janlelis/unicode-x)
129
140
 
@@ -4,7 +4,7 @@ require_relative "unibits/symbolify"
4
4
  require "io/console"
5
5
  require "paint"
6
6
  require "unicode/display_width"
7
- require "unicode/categories"
7
+ require "characteristics"
8
8
 
9
9
  module Unibits
10
10
  SUPPORTED_ENCODINGS = [
@@ -15,6 +15,8 @@ module Unibits
15
15
  'UTF-32BE',
16
16
  'ASCII-8BIT',
17
17
  'US-ASCII',
18
+ /^ISO-8859-/,
19
+ /^Windows-125/,
18
20
  ].freeze
19
21
  DEFAULT_TERMINAL_WIDTH = 80
20
22
 
@@ -49,6 +51,9 @@ module Unibits
49
51
 
50
52
  def self.visualize(string, wide_ambiguous: false, width: nil)
51
53
  cols = width || determine_terminal_cols
54
+ encoding_name = string.encoding.name
55
+
56
+ type = Characteristics.type_from_encoding_name(encoding_name)
52
57
 
53
58
  cp_buffer = [" "]
54
59
  enc_buffer = [" "]
@@ -59,18 +64,10 @@ module Unibits
59
64
 
60
65
  puts
61
66
  string.each_char{ |char|
62
- if char.valid_encoding?
63
- char_valid = true
64
- current_encoding_error = nil
65
- if Unicode::Categories.category(char) == "Cn"
66
- current_color = "#FF5500"
67
- else
68
- current_color = random_color
69
- end
70
- else
71
- char_valid = false
72
- current_color = :red
73
- end
67
+ char_info = Characteristics.create_for_type(char, type)
68
+ current_color = determine_char_color(char_info)
69
+
70
+ current_encoding_error = nil if char_info.valid?
74
71
 
75
72
  char.each_byte.with_index{ |byte, index|
76
73
  if Paint.unpaint(hex_buffer[-1]).bytesize > cols - 12
@@ -82,10 +79,10 @@ module Unibits
82
79
  end
83
80
 
84
81
  if index == 0
85
- if char_valid
82
+ if char_info.valid?
86
83
  codepoint = "U+%04X" % char.ord
87
84
  else
88
- case string.encoding.name
85
+ case encoding_name
89
86
  when "US-ASCII"
90
87
  codepoint = "invalid"
91
88
  when "UTF-8"
@@ -167,7 +164,7 @@ module Unibits
167
164
  when 'UTF-16LE', 'UTF-16BE'
168
165
  if char.bytesize.odd?
169
166
  codepoint = "incompl."
170
- elsif char.b[string.encoding.name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
167
+ elsif char.b[encoding_name == 'UTF-16LE' ? 1 : 0].unpack("B*")[0][0, 5] == "11011"
171
168
  codepoint = "hlf.srg."
172
169
  else
173
170
  codepoint = "invalid"
@@ -185,14 +182,14 @@ module Unibits
185
182
  codepoint.ljust(10), current_color, :bold
186
183
  ]
187
184
 
188
- if char_valid
189
- symbolified_char = symbolify(char)
185
+ symbolified_char = Symbolify.symbolify(char, char_info)
186
+
187
+ if char_info.unicode?
188
+ padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
190
189
  else
191
- symbolified_char = "�"
190
+ padding = 10 - symbolified_char.size
192
191
  end
193
192
 
194
- padding = 10 - Unicode::DisplayWidth.of(symbolified_char, wide_ambiguous ? 2 : 1)
195
-
196
193
  enc_buffer[-1] << Paint[
197
194
  symbolified_char, current_color
198
195
  ]
@@ -208,11 +205,11 @@ module Unibits
208
205
 
209
206
  bin_byte_complete = byte.to_s(2).rjust(8, "0")
210
207
 
211
- if !char_valid
208
+ if !char_info.valid?
212
209
  bin_byte_1 = bin_byte_complete
213
210
  bin_byte_2 = ""
214
211
  else
215
- case string.encoding.name
212
+ case encoding_name
216
213
  when 'US-ASCII'
217
214
  bin_byte_1 = bin_byte_complete[0...1]
218
215
  bin_byte_2 = bin_byte_complete[1...8]
@@ -253,6 +250,9 @@ module Unibits
253
250
  when 'UTF-32LE', 'UTF-32BE'
254
251
  bin_byte_1 = ""
255
252
  bin_byte_2 = bin_byte_complete
253
+ when /^(ISO-8859-|Windows-125)/
254
+ bin_byte_1 = ""
255
+ bin_byte_2 = bin_byte_complete
256
256
  end
257
257
  end
258
258
 
@@ -268,25 +268,34 @@ module Unibits
268
268
  }
269
269
  }
270
270
 
271
- if string.encoding.name[0, 3] == "UTF"
271
+ if type == :unicode
272
272
  enc_buffer.zip(cp_buffer, hex_buffer, bin_buffer, separator).flatten.join("\n")
273
273
  else
274
274
  enc_buffer.zip(hex_buffer, bin_buffer, separator).flatten.join("\n")
275
275
  end
276
276
  end
277
277
 
278
- def self.random_color
279
- "%.2x%.2x%.2x" %[rand(90) + 60, rand(90) + 60, rand(90) + 60]
280
- end
281
-
282
- def self.symbolify(char)
283
- return char.inspect unless char.encoding.name[0, 3] == "UTF"
284
- Symbolify.symbolify(char).encode('UTF-8')
285
- end
286
-
287
278
  def self.determine_terminal_cols
288
279
  STDIN.winsize[1] || DEFAULT_TERMINAL_WIDTH
289
280
  rescue Errno::ENOTTY
290
281
  return DEFAULT_TERMINAL_WIDTH
291
282
  end
283
+
284
+ def self.determine_char_color(char_info)
285
+ if !char_info.valid?
286
+ "#FF0000"
287
+ elsif !char_info.assigned?
288
+ "#FF5500"
289
+ elsif char_info.control?
290
+ "#0000FF"
291
+ elsif char_info.blank?
292
+ "#33AADD"
293
+ else
294
+ random_color
295
+ end
296
+ end
297
+
298
+ def self.random_color
299
+ "%.2x%.2x%.2x" % [rand(90) + 60, rand(90) + 60, rand(90) + 60]
300
+ end
292
301
  end
@@ -1,351 +1,442 @@
1
- require "unicode/categories"
2
-
3
1
  module Unibits
4
2
  module Symbolify
3
+ NO_UTF8_CONVERTER = /^Windows-1258/
4
+ ASCII_CHARS = "\x20-\x7E".freeze
5
5
  ASCII_CONTROL_CODEPOINTS = "\x00-\x1F\x7F".freeze
6
6
  ASCII_CONTROL_SYMBOLS = "\u{2400}-\u{241F}\u{2421}".freeze
7
- ASCII_CHARS = "\x20-\x7E".freeze
8
- TAG_START = "\u{E0001}".freeze
9
- TAG_START_SYMBOL = "LANG TAG".freeze
10
- TAG_SPACE = "\u{E0020}".freeze
11
- TAG_SPACE_SYMBOL = "TAG ␠".freeze
12
7
  TAGS = "\u{E0021}-\u{E007E}".freeze
13
- TAG_DELETE = "\u{E007F}".freeze
14
- TAG_DELETE_SYMBOL = "TAG ␡".freeze
8
+
9
+ CONTROL_C0_SYMBOLS = [
10
+ "␀",
11
+ "␁",
12
+ "␂",
13
+ "␃",
14
+ "␄",
15
+ "␅",
16
+ "␆",
17
+ "␇",
18
+ "␈",
19
+ "␉",
20
+ "␊",
21
+ "␋",
22
+ "␌",
23
+ "␍",
24
+ "␎",
25
+ "␏",
26
+ "␐",
27
+ "␑",
28
+ "␒",
29
+ "␓",
30
+ "␔",
31
+ "␕",
32
+ "␖",
33
+ "␗",
34
+ "␘",
35
+ "␙",
36
+ "␚",
37
+ "␛",
38
+ "␜",
39
+ "␝",
40
+ "␞",
41
+ "␟",
42
+ ]
43
+
44
+ CONTROL_DELETE_SYMBOL = "␡"
45
+
46
+ CONTROL_C1_NAMES = {
47
+ 0x80 => "PAD",
48
+ 0x81 => "HOP",
49
+ 0x82 => "BPH",
50
+ 0x83 => "NBH",
51
+ 0x84 => "IND",
52
+ 0x85 => "NEL",
53
+ 0x86 => "SSA",
54
+ 0x87 => "ESA",
55
+ 0x88 => "HTS",
56
+ 0x89 => "HTJ",
57
+ 0x8A => "VTS",
58
+ 0x8B => "PLD",
59
+ 0x8C => "PLU",
60
+ 0x8D => "RI",
61
+ 0x8E => "SS2",
62
+ 0x8F => "SS3",
63
+ 0x90 => "DCS",
64
+ 0x91 => "PU1",
65
+ 0x92 => "PU2",
66
+ 0x93 => "STS",
67
+ 0x94 => "CCH",
68
+ 0x95 => "MW",
69
+ 0x96 => "SPA",
70
+ 0x97 => "EPA",
71
+ 0x98 => "SOS",
72
+ 0x99 => "SGC",
73
+ 0x9A => "SCI",
74
+ 0x9B => "CSI",
75
+ 0x9C => "ST",
76
+ 0x9D => "OSC",
77
+ 0x9E => "PM",
78
+ 0x9F => "APC",
79
+ }
80
+
15
81
  INTERESTING_CODEPOINTS = {
16
- "\u{0080}" => "PAD",
17
- "\u{0081}" => "HOP",
18
- "\u{0082}" => "BPH",
19
- "\u{0083}" => "NBH",
20
- "\u{0084}" => "IND",
21
- "\u{0085}" => "NEL",
22
- "\u{0086}" => "SSA",
23
- "\u{0087}" => "ESA",
24
- "\u{0088}" => "HTS",
25
- "\u{0089}" => "HTJ",
26
- "\u{008A}" => "VTS",
27
- "\u{008B}" => "PLD",
28
- "\u{008C}" => "PLU",
29
- "\u{008D}" => "RI",
30
- "\u{008E}" => "SS2",
31
- "\u{008F}" => "SS3",
32
- "\u{0090}" => "DCS",
33
- "\u{0091}" => "PU1",
34
- "\u{0092}" => "PU2",
35
- "\u{0093}" => "STS",
36
- "\u{0094}" => "CCH",
37
- "\u{0095}" => "MW",
38
- "\u{0096}" => "SPA",
39
- "\u{0097}" => "EPA",
40
- "\u{0098}" => "SOS",
41
- "\u{0099}" => "SGC",
42
- "\u{009A}" => "SCI",
43
- "\u{009B}" => "CSI",
44
- "\u{009C}" => "ST",
45
- "\u{009D}" => "OSC",
46
- "\u{009E}" => "PM",
47
- "\u{009F}" => "APC",
82
+ 0x200E => "LRM",
83
+ 0x200F => "RLM",
84
+ 0x202A => "LRE",
85
+ 0x202B => "RLE",
86
+ 0x202C => "PDF",
87
+ 0x202D => "LRO",
88
+ 0x202E => "RLO",
89
+ 0x2066 => "LRI",
90
+ 0x2067 => "RLI",
91
+ 0x2068 => "FSI",
92
+ 0x2069 => "PDI",
48
93
 
49
- "\u{200E}" => "LRM",
50
- "\u{200F}" => "RLM",
51
- "\u{202A}" => "LRE",
52
- "\u{202B}" => "RLE",
53
- "\u{202C}" => "PDF",
54
- "\u{202D}" => "LRO",
55
- "\u{202E}" => "RLO",
56
- "\u{2066}" => "LRI",
57
- "\u{2067}" => "RLI",
58
- "\u{2068}" => "FSI",
59
- "\u{2069}" => "PDI",
94
+ 0xFE00 => "VS1",
95
+ 0xFE01 => "VS2",
96
+ 0xFE02 => "VS3",
97
+ 0xFE03 => "VS4",
98
+ 0xFE04 => "VS5",
99
+ 0xFE05 => "VS6",
100
+ 0xFE06 => "VS7",
101
+ 0xFE07 => "VS8",
102
+ 0xFE08 => "VS9",
103
+ 0xFE09 => "VS10",
104
+ 0xFE0A => "VS11",
105
+ 0xFE0B => "VS12",
106
+ 0xFE0C => "VS13",
107
+ 0xFE0D => "VS14",
108
+ 0xFE0E => "VS15",
109
+ 0xFE0F => "VS16",
60
110
 
61
- "\u{FE00}" => "VS1",
62
- "\u{FE01}" => "VS2",
63
- "\u{FE02}" => "VS3",
64
- "\u{FE03}" => "VS4",
65
- "\u{FE04}" => "VS5",
66
- "\u{FE05}" => "VS6",
67
- "\u{FE06}" => "VS7",
68
- "\u{FE07}" => "VS8",
69
- "\u{FE08}" => "VS9",
70
- "\u{FE09}" => "VS10",
71
- "\u{FE0A}" => "VS11",
72
- "\u{FE0B}" => "VS12",
73
- "\u{FE0C}" => "VS13",
74
- "\u{FE0D}" => "VS14",
75
- "\u{FE0E}" => "VS15",
76
- "\u{FE0F}" => "VS16",
111
+ 0xE0001 => "LANG TAG",
112
+ 0xE0020 => "TAG ␠",
113
+ 0xE007F => "TAG ␡",
77
114
 
78
- "\u{E0100}" => "VS17",
79
- "\u{E0101}" => "VS18",
80
- "\u{E0102}" => "VS19",
81
- "\u{E0103}" => "VS20",
82
- "\u{E0104}" => "VS21",
83
- "\u{E0105}" => "VS22",
84
- "\u{E0106}" => "VS23",
85
- "\u{E0107}" => "VS24",
86
- "\u{E0108}" => "VS25",
87
- "\u{E0109}" => "VS26",
88
- "\u{E010A}" => "VS27",
89
- "\u{E010B}" => "VS28",
90
- "\u{E010C}" => "VS29",
91
- "\u{E010D}" => "VS30",
92
- "\u{E010E}" => "VS31",
93
- "\u{E010F}" => "VS32",
94
- "\u{E0110}" => "VS33",
95
- "\u{E0111}" => "VS34",
96
- "\u{E0112}" => "VS35",
97
- "\u{E0113}" => "VS36",
98
- "\u{E0114}" => "VS37",
99
- "\u{E0115}" => "VS38",
100
- "\u{E0116}" => "VS39",
101
- "\u{E0117}" => "VS40",
102
- "\u{E0118}" => "VS41",
103
- "\u{E0119}" => "VS42",
104
- "\u{E011A}" => "VS43",
105
- "\u{E011B}" => "VS44",
106
- "\u{E011C}" => "VS45",
107
- "\u{E011D}" => "VS46",
108
- "\u{E011E}" => "VS47",
109
- "\u{E011F}" => "VS48",
110
- "\u{E0120}" => "VS49",
111
- "\u{E0121}" => "VS50",
112
- "\u{E0122}" => "VS51",
113
- "\u{E0123}" => "VS52",
114
- "\u{E0124}" => "VS53",
115
- "\u{E0125}" => "VS54",
116
- "\u{E0126}" => "VS55",
117
- "\u{E0127}" => "VS56",
118
- "\u{E0128}" => "VS57",
119
- "\u{E0129}" => "VS58",
120
- "\u{E012A}" => "VS59",
121
- "\u{E012B}" => "VS60",
122
- "\u{E012C}" => "VS61",
123
- "\u{E012D}" => "VS62",
124
- "\u{E012E}" => "VS63",
125
- "\u{E012F}" => "VS64",
126
- "\u{E0130}" => "VS65",
127
- "\u{E0131}" => "VS66",
128
- "\u{E0132}" => "VS67",
129
- "\u{E0133}" => "VS68",
130
- "\u{E0134}" => "VS69",
131
- "\u{E0135}" => "VS70",
132
- "\u{E0136}" => "VS71",
133
- "\u{E0137}" => "VS72",
134
- "\u{E0138}" => "VS73",
135
- "\u{E0139}" => "VS74",
136
- "\u{E013A}" => "VS75",
137
- "\u{E013B}" => "VS76",
138
- "\u{E013C}" => "VS77",
139
- "\u{E013D}" => "VS78",
140
- "\u{E013E}" => "VS79",
141
- "\u{E013F}" => "VS80",
142
- "\u{E0140}" => "VS81",
143
- "\u{E0141}" => "VS82",
144
- "\u{E0142}" => "VS83",
145
- "\u{E0143}" => "VS84",
146
- "\u{E0144}" => "VS85",
147
- "\u{E0145}" => "VS86",
148
- "\u{E0146}" => "VS87",
149
- "\u{E0147}" => "VS88",
150
- "\u{E0148}" => "VS89",
151
- "\u{E0149}" => "VS90",
152
- "\u{E014A}" => "VS91",
153
- "\u{E014B}" => "VS92",
154
- "\u{E014C}" => "VS93",
155
- "\u{E014D}" => "VS94",
156
- "\u{E014E}" => "VS95",
157
- "\u{E014F}" => "VS96",
158
- "\u{E0150}" => "VS97",
159
- "\u{E0151}" => "VS98",
160
- "\u{E0152}" => "VS99",
161
- "\u{E0153}" => "VS100",
162
- "\u{E0154}" => "VS101",
163
- "\u{E0155}" => "VS102",
164
- "\u{E0156}" => "VS103",
165
- "\u{E0157}" => "VS104",
166
- "\u{E0158}" => "VS105",
167
- "\u{E0159}" => "VS106",
168
- "\u{E015A}" => "VS107",
169
- "\u{E015B}" => "VS108",
170
- "\u{E015C}" => "VS109",
171
- "\u{E015D}" => "VS110",
172
- "\u{E015E}" => "VS111",
173
- "\u{E015F}" => "VS112",
174
- "\u{E0160}" => "VS113",
175
- "\u{E0161}" => "VS114",
176
- "\u{E0162}" => "VS115",
177
- "\u{E0163}" => "VS116",
178
- "\u{E0164}" => "VS117",
179
- "\u{E0165}" => "VS118",
180
- "\u{E0166}" => "VS119",
181
- "\u{E0167}" => "VS120",
182
- "\u{E0168}" => "VS121",
183
- "\u{E0169}" => "VS122",
184
- "\u{E016A}" => "VS123",
185
- "\u{E016B}" => "VS124",
186
- "\u{E016C}" => "VS125",
187
- "\u{E016D}" => "VS126",
188
- "\u{E016E}" => "VS127",
189
- "\u{E016F}" => "VS128",
190
- "\u{E0170}" => "VS129",
191
- "\u{E0171}" => "VS130",
192
- "\u{E0172}" => "VS131",
193
- "\u{E0173}" => "VS132",
194
- "\u{E0174}" => "VS133",
195
- "\u{E0175}" => "VS134",
196
- "\u{E0176}" => "VS135",
197
- "\u{E0177}" => "VS136",
198
- "\u{E0178}" => "VS137",
199
- "\u{E0179}" => "VS138",
200
- "\u{E017A}" => "VS139",
201
- "\u{E017B}" => "VS140",
202
- "\u{E017C}" => "VS141",
203
- "\u{E017D}" => "VS142",
204
- "\u{E017E}" => "VS143",
205
- "\u{E017F}" => "VS144",
206
- "\u{E0180}" => "VS145",
207
- "\u{E0181}" => "VS146",
208
- "\u{E0182}" => "VS147",
209
- "\u{E0183}" => "VS148",
210
- "\u{E0184}" => "VS149",
211
- "\u{E0185}" => "VS150",
212
- "\u{E0186}" => "VS151",
213
- "\u{E0187}" => "VS152",
214
- "\u{E0188}" => "VS153",
215
- "\u{E0189}" => "VS154",
216
- "\u{E018A}" => "VS155",
217
- "\u{E018B}" => "VS156",
218
- "\u{E018C}" => "VS157",
219
- "\u{E018D}" => "VS158",
220
- "\u{E018E}" => "VS159",
221
- "\u{E018F}" => "VS160",
222
- "\u{E0190}" => "VS161",
223
- "\u{E0191}" => "VS162",
224
- "\u{E0192}" => "VS163",
225
- "\u{E0193}" => "VS164",
226
- "\u{E0194}" => "VS165",
227
- "\u{E0195}" => "VS166",
228
- "\u{E0196}" => "VS167",
229
- "\u{E0197}" => "VS168",
230
- "\u{E0198}" => "VS169",
231
- "\u{E0199}" => "VS170",
232
- "\u{E019A}" => "VS171",
233
- "\u{E019B}" => "VS172",
234
- "\u{E019C}" => "VS173",
235
- "\u{E019D}" => "VS174",
236
- "\u{E019E}" => "VS175",
237
- "\u{E019F}" => "VS176",
238
- "\u{E01A0}" => "VS177",
239
- "\u{E01A1}" => "VS178",
240
- "\u{E01A2}" => "VS179",
241
- "\u{E01A3}" => "VS180",
242
- "\u{E01A4}" => "VS181",
243
- "\u{E01A5}" => "VS182",
244
- "\u{E01A6}" => "VS183",
245
- "\u{E01A7}" => "VS184",
246
- "\u{E01A8}" => "VS185",
247
- "\u{E01A9}" => "VS186",
248
- "\u{E01AA}" => "VS187",
249
- "\u{E01AB}" => "VS188",
250
- "\u{E01AC}" => "VS189",
251
- "\u{E01AD}" => "VS190",
252
- "\u{E01AE}" => "VS191",
253
- "\u{E01AF}" => "VS192",
254
- "\u{E01B0}" => "VS193",
255
- "\u{E01B1}" => "VS194",
256
- "\u{E01B2}" => "VS195",
257
- "\u{E01B3}" => "VS196",
258
- "\u{E01B4}" => "VS197",
259
- "\u{E01B5}" => "VS198",
260
- "\u{E01B6}" => "VS199",
261
- "\u{E01B7}" => "VS200",
262
- "\u{E01B8}" => "VS201",
263
- "\u{E01B9}" => "VS202",
264
- "\u{E01BA}" => "VS203",
265
- "\u{E01BB}" => "VS204",
266
- "\u{E01BC}" => "VS205",
267
- "\u{E01BD}" => "VS206",
268
- "\u{E01BE}" => "VS207",
269
- "\u{E01BF}" => "VS208",
270
- "\u{E01C0}" => "VS209",
271
- "\u{E01C1}" => "VS210",
272
- "\u{E01C2}" => "VS211",
273
- "\u{E01C3}" => "VS212",
274
- "\u{E01C4}" => "VS213",
275
- "\u{E01C5}" => "VS214",
276
- "\u{E01C6}" => "VS215",
277
- "\u{E01C7}" => "VS216",
278
- "\u{E01C8}" => "VS217",
279
- "\u{E01C9}" => "VS218",
280
- "\u{E01CA}" => "VS219",
281
- "\u{E01CB}" => "VS220",
282
- "\u{E01CC}" => "VS221",
283
- "\u{E01CD}" => "VS222",
284
- "\u{E01CE}" => "VS223",
285
- "\u{E01CF}" => "VS224",
286
- "\u{E01D0}" => "VS225",
287
- "\u{E01D1}" => "VS226",
288
- "\u{E01D2}" => "VS227",
289
- "\u{E01D3}" => "VS228",
290
- "\u{E01D4}" => "VS229",
291
- "\u{E01D5}" => "VS230",
292
- "\u{E01D6}" => "VS231",
293
- "\u{E01D7}" => "VS232",
294
- "\u{E01D8}" => "VS233",
295
- "\u{E01D9}" => "VS234",
296
- "\u{E01DA}" => "VS235",
297
- "\u{E01DB}" => "VS236",
298
- "\u{E01DC}" => "VS237",
299
- "\u{E01DD}" => "VS238",
300
- "\u{E01DE}" => "VS239",
301
- "\u{E01DF}" => "VS240",
302
- "\u{E01E0}" => "VS241",
303
- "\u{E01E1}" => "VS242",
304
- "\u{E01E2}" => "VS243",
305
- "\u{E01E3}" => "VS244",
306
- "\u{E01E4}" => "VS245",
307
- "\u{E01E5}" => "VS246",
308
- "\u{E01E6}" => "VS247",
309
- "\u{E01E7}" => "VS248",
310
- "\u{E01E8}" => "VS249",
311
- "\u{E01E9}" => "VS250",
312
- "\u{E01EA}" => "VS251",
313
- "\u{E01EB}" => "VS252",
314
- "\u{E01EC}" => "VS253",
315
- "\u{E01ED}" => "VS254",
316
- "\u{E01EE}" => "VS255",
317
- "\u{E01EF}" => "VS256",
115
+ 0xE0100 => "VS17",
116
+ 0xE0101 => "VS18",
117
+ 0xE0102 => "VS19",
118
+ 0xE0103 => "VS20",
119
+ 0xE0104 => "VS21",
120
+ 0xE0105 => "VS22",
121
+ 0xE0106 => "VS23",
122
+ 0xE0107 => "VS24",
123
+ 0xE0108 => "VS25",
124
+ 0xE0109 => "VS26",
125
+ 0xE010A => "VS27",
126
+ 0xE010B => "VS28",
127
+ 0xE010C => "VS29",
128
+ 0xE010D => "VS30",
129
+ 0xE010E => "VS31",
130
+ 0xE010F => "VS32",
131
+ 0xE0110 => "VS33",
132
+ 0xE0111 => "VS34",
133
+ 0xE0112 => "VS35",
134
+ 0xE0113 => "VS36",
135
+ 0xE0114 => "VS37",
136
+ 0xE0115 => "VS38",
137
+ 0xE0116 => "VS39",
138
+ 0xE0117 => "VS40",
139
+ 0xE0118 => "VS41",
140
+ 0xE0119 => "VS42",
141
+ 0xE011A => "VS43",
142
+ 0xE011B => "VS44",
143
+ 0xE011C => "VS45",
144
+ 0xE011D => "VS46",
145
+ 0xE011E => "VS47",
146
+ 0xE011F => "VS48",
147
+ 0xE0120 => "VS49",
148
+ 0xE0121 => "VS50",
149
+ 0xE0122 => "VS51",
150
+ 0xE0123 => "VS52",
151
+ 0xE0124 => "VS53",
152
+ 0xE0125 => "VS54",
153
+ 0xE0126 => "VS55",
154
+ 0xE0127 => "VS56",
155
+ 0xE0128 => "VS57",
156
+ 0xE0129 => "VS58",
157
+ 0xE012A => "VS59",
158
+ 0xE012B => "VS60",
159
+ 0xE012C => "VS61",
160
+ 0xE012D => "VS62",
161
+ 0xE012E => "VS63",
162
+ 0xE012F => "VS64",
163
+ 0xE0130 => "VS65",
164
+ 0xE0131 => "VS66",
165
+ 0xE0132 => "VS67",
166
+ 0xE0133 => "VS68",
167
+ 0xE0134 => "VS69",
168
+ 0xE0135 => "VS70",
169
+ 0xE0136 => "VS71",
170
+ 0xE0137 => "VS72",
171
+ 0xE0138 => "VS73",
172
+ 0xE0139 => "VS74",
173
+ 0xE013A => "VS75",
174
+ 0xE013B => "VS76",
175
+ 0xE013C => "VS77",
176
+ 0xE013D => "VS78",
177
+ 0xE013E => "VS79",
178
+ 0xE013F => "VS80",
179
+ 0xE0140 => "VS81",
180
+ 0xE0141 => "VS82",
181
+ 0xE0142 => "VS83",
182
+ 0xE0143 => "VS84",
183
+ 0xE0144 => "VS85",
184
+ 0xE0145 => "VS86",
185
+ 0xE0146 => "VS87",
186
+ 0xE0147 => "VS88",
187
+ 0xE0148 => "VS89",
188
+ 0xE0149 => "VS90",
189
+ 0xE014A => "VS91",
190
+ 0xE014B => "VS92",
191
+ 0xE014C => "VS93",
192
+ 0xE014D => "VS94",
193
+ 0xE014E => "VS95",
194
+ 0xE014F => "VS96",
195
+ 0xE0150 => "VS97",
196
+ 0xE0151 => "VS98",
197
+ 0xE0152 => "VS99",
198
+ 0xE0153 => "VS100",
199
+ 0xE0154 => "VS101",
200
+ 0xE0155 => "VS102",
201
+ 0xE0156 => "VS103",
202
+ 0xE0157 => "VS104",
203
+ 0xE0158 => "VS105",
204
+ 0xE0159 => "VS106",
205
+ 0xE015A => "VS107",
206
+ 0xE015B => "VS108",
207
+ 0xE015C => "VS109",
208
+ 0xE015D => "VS110",
209
+ 0xE015E => "VS111",
210
+ 0xE015F => "VS112",
211
+ 0xE0160 => "VS113",
212
+ 0xE0161 => "VS114",
213
+ 0xE0162 => "VS115",
214
+ 0xE0163 => "VS116",
215
+ 0xE0164 => "VS117",
216
+ 0xE0165 => "VS118",
217
+ 0xE0166 => "VS119",
218
+ 0xE0167 => "VS120",
219
+ 0xE0168 => "VS121",
220
+ 0xE0169 => "VS122",
221
+ 0xE016A => "VS123",
222
+ 0xE016B => "VS124",
223
+ 0xE016C => "VS125",
224
+ 0xE016D => "VS126",
225
+ 0xE016E => "VS127",
226
+ 0xE016F => "VS128",
227
+ 0xE0170 => "VS129",
228
+ 0xE0171 => "VS130",
229
+ 0xE0172 => "VS131",
230
+ 0xE0173 => "VS132",
231
+ 0xE0174 => "VS133",
232
+ 0xE0175 => "VS134",
233
+ 0xE0176 => "VS135",
234
+ 0xE0177 => "VS136",
235
+ 0xE0178 => "VS137",
236
+ 0xE0179 => "VS138",
237
+ 0xE017A => "VS139",
238
+ 0xE017B => "VS140",
239
+ 0xE017C => "VS141",
240
+ 0xE017D => "VS142",
241
+ 0xE017E => "VS143",
242
+ 0xE017F => "VS144",
243
+ 0xE0180 => "VS145",
244
+ 0xE0181 => "VS146",
245
+ 0xE0182 => "VS147",
246
+ 0xE0183 => "VS148",
247
+ 0xE0184 => "VS149",
248
+ 0xE0185 => "VS150",
249
+ 0xE0186 => "VS151",
250
+ 0xE0187 => "VS152",
251
+ 0xE0188 => "VS153",
252
+ 0xE0189 => "VS154",
253
+ 0xE018A => "VS155",
254
+ 0xE018B => "VS156",
255
+ 0xE018C => "VS157",
256
+ 0xE018D => "VS158",
257
+ 0xE018E => "VS159",
258
+ 0xE018F => "VS160",
259
+ 0xE0190 => "VS161",
260
+ 0xE0191 => "VS162",
261
+ 0xE0192 => "VS163",
262
+ 0xE0193 => "VS164",
263
+ 0xE0194 => "VS165",
264
+ 0xE0195 => "VS166",
265
+ 0xE0196 => "VS167",
266
+ 0xE0197 => "VS168",
267
+ 0xE0198 => "VS169",
268
+ 0xE0199 => "VS170",
269
+ 0xE019A => "VS171",
270
+ 0xE019B => "VS172",
271
+ 0xE019C => "VS173",
272
+ 0xE019D => "VS174",
273
+ 0xE019E => "VS175",
274
+ 0xE019F => "VS176",
275
+ 0xE01A0 => "VS177",
276
+ 0xE01A1 => "VS178",
277
+ 0xE01A2 => "VS179",
278
+ 0xE01A3 => "VS180",
279
+ 0xE01A4 => "VS181",
280
+ 0xE01A5 => "VS182",
281
+ 0xE01A6 => "VS183",
282
+ 0xE01A7 => "VS184",
283
+ 0xE01A8 => "VS185",
284
+ 0xE01A9 => "VS186",
285
+ 0xE01AA => "VS187",
286
+ 0xE01AB => "VS188",
287
+ 0xE01AC => "VS189",
288
+ 0xE01AD => "VS190",
289
+ 0xE01AE => "VS191",
290
+ 0xE01AF => "VS192",
291
+ 0xE01B0 => "VS193",
292
+ 0xE01B1 => "VS194",
293
+ 0xE01B2 => "VS195",
294
+ 0xE01B3 => "VS196",
295
+ 0xE01B4 => "VS197",
296
+ 0xE01B5 => "VS198",
297
+ 0xE01B6 => "VS199",
298
+ 0xE01B7 => "VS200",
299
+ 0xE01B8 => "VS201",
300
+ 0xE01B9 => "VS202",
301
+ 0xE01BA => "VS203",
302
+ 0xE01BB => "VS204",
303
+ 0xE01BC => "VS205",
304
+ 0xE01BD => "VS206",
305
+ 0xE01BE => "VS207",
306
+ 0xE01BF => "VS208",
307
+ 0xE01C0 => "VS209",
308
+ 0xE01C1 => "VS210",
309
+ 0xE01C2 => "VS211",
310
+ 0xE01C3 => "VS212",
311
+ 0xE01C4 => "VS213",
312
+ 0xE01C5 => "VS214",
313
+ 0xE01C6 => "VS215",
314
+ 0xE01C7 => "VS216",
315
+ 0xE01C8 => "VS217",
316
+ 0xE01C9 => "VS218",
317
+ 0xE01CA => "VS219",
318
+ 0xE01CB => "VS220",
319
+ 0xE01CC => "VS221",
320
+ 0xE01CD => "VS222",
321
+ 0xE01CE => "VS223",
322
+ 0xE01CF => "VS224",
323
+ 0xE01D0 => "VS225",
324
+ 0xE01D1 => "VS226",
325
+ 0xE01D2 => "VS227",
326
+ 0xE01D3 => "VS228",
327
+ 0xE01D4 => "VS229",
328
+ 0xE01D5 => "VS230",
329
+ 0xE01D6 => "VS231",
330
+ 0xE01D7 => "VS232",
331
+ 0xE01D8 => "VS233",
332
+ 0xE01D9 => "VS234",
333
+ 0xE01DA => "VS235",
334
+ 0xE01DB => "VS236",
335
+ 0xE01DC => "VS237",
336
+ 0xE01DD => "VS238",
337
+ 0xE01DE => "VS239",
338
+ 0xE01DF => "VS240",
339
+ 0xE01E0 => "VS241",
340
+ 0xE01E1 => "VS242",
341
+ 0xE01E2 => "VS243",
342
+ 0xE01E3 => "VS244",
343
+ 0xE01E4 => "VS245",
344
+ 0xE01E5 => "VS246",
345
+ 0xE01E6 => "VS247",
346
+ 0xE01E7 => "VS248",
347
+ 0xE01E8 => "VS249",
348
+ 0xE01E9 => "VS250",
349
+ 0xE01EA => "VS251",
350
+ 0xE01EB => "VS252",
351
+ 0xE01EC => "VS253",
352
+ 0xE01ED => "VS254",
353
+ 0xE01EE => "VS255",
354
+ 0xE01EF => "VS256",
318
355
  }.freeze
319
- COULD_BE_WHITESPACE = '[\p{Space}­᠎​‌‍⁠⁡⁢⁣⁤⠀𛲠𛲡𛲢𛲣𝅙𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺]'.freeze
320
356
 
321
- def self.symbolify(char, encoding = char.encoding)
322
- return "n/a" if Unicode::Categories.category(char) == "Cn"
357
+ def self.symbolify(char, char_info)
358
+ if !char_info.valid?
359
+ "�"
360
+ else
361
+ case char_info
362
+ when UnicodeCharacteristics
363
+ Symbolify.unicode(char, char_info)
364
+ when ByteCharacteristics
365
+ Symbolify.byte(char, char_info)
366
+ when AsciiCharacteristics
367
+ Symbolify.ascii(char, char_info)
368
+ else
369
+ Symbolify.binary(char)
370
+ end
371
+ end
372
+ end
373
+
374
+ def self.unicode(char, char_info)
375
+ return "n/a" if !char_info.assigned?
323
376
 
324
377
  char = char.dup
378
+ ord = char.ord
379
+ encoding = char_info.encoding
325
380
 
326
- char.tr!(
327
- ASCII_CONTROL_CODEPOINTS.encode(encoding),
328
- ASCII_CONTROL_SYMBOLS.encode(encoding)
329
- )
330
- char.gsub!(
331
- Regexp.compile(COULD_BE_WHITESPACE.encode(encoding)),
332
- ']\0['.encode(encoding)
333
- )
381
+ if char_info.delete?
382
+ char = CONTROL_DELETE_SYMBOL
383
+ elsif char_info.c0?
384
+ char = CONTROL_C0_SYMBOLS[ord]
385
+ elsif char_info.c1?
386
+ char = CONTROL_C1_NAMES[ord]
387
+ elsif char_info.blank?
388
+ char = "]".encode(encoding) + char + "[".encode(encoding)
389
+ elsif ord > 917536 && ord < 917631
390
+ char = "TAG ".encode(encoding) +
391
+ char.tr(TAGS.encode(encoding), ASCII_CHARS.encode(encoding))
392
+ else
393
+ char = INTERESTING_CODEPOINTS[char.ord] || char
394
+ end
395
+
396
+ char.encode("UTF-8")
397
+ end
334
398
 
335
- INTERESTING_CODEPOINTS.each{ |cp, desc|
336
- char.gsub! Regexp.compile(cp.encode(encoding)), desc.encode(encoding)
337
- }
338
- char.gsub! TAG_START.encode(encoding), TAG_START_SYMBOL.encode(encoding)
339
- char.gsub! TAG_SPACE.encode(encoding), TAG_SPACE_SYMBOL.encode(encoding)
340
- char.gsub! TAG_DELETE.encode(encoding), TAG_DELETE_SYMBOL.encode(encoding)
399
+ def self.byte(char, char_info)
400
+ return "n/a" if !char_info.assigned?
341
401
 
342
402
  ord = char.ord
343
- if ord > 917536 && ord < 917631
344
- char.tr!(TAGS.encode(encoding), ASCII_CHARS.encode(encoding))
345
- char = "TAG ".encode(encoding) + char
403
+ encoding = char_info.encoding
404
+ no_converter = !!(NO_UTF8_CONVERTER =~ encoding.name)
405
+ treat_char_unconverted = false
406
+
407
+ if char_info.delete?
408
+ char = CONTROL_DELETE_SYMBOL
409
+ elsif char_info.c0?
410
+ char = CONTROL_C0_SYMBOLS[ord]
411
+ elsif char_info.c1?
412
+ char = CONTROL_C1_NAMES[ord]
413
+ elsif no_converter
414
+ treat_char_unconverted = true
415
+ elsif char_info.blank?
416
+ char = "]".encode(encoding) + char + "[".encode(encoding)
417
+ end
418
+
419
+ if no_converter && treat_char_unconverted
420
+ char.inspect
421
+ else
422
+ char.encode("UTF-8")
423
+ end
424
+ end
425
+
426
+ def self.ascii(char, char_info)
427
+ if char_info.delete?
428
+ char = CONTROL_DELETE_SYMBOL
429
+ elsif char_info.c0?
430
+ char = CONTROL_C0_SYMBOLS[char.ord]
431
+ elsif char_info.blank?
432
+ char = "]" + char + "["
346
433
  end
347
434
 
348
435
  char
349
436
  end
437
+
438
+ def self.binary(char)
439
+ char.inspect
440
+ end
350
441
  end
351
442
  end
@@ -1,3 +1,3 @@
1
1
  module Unibits
2
- VERSION = "1.3.0".freeze
2
+ VERSION = "2.0.0".freeze
3
3
  end
@@ -67,6 +67,24 @@ describe Unibits do
67
67
  result.must_match "01000011"
68
68
  end
69
69
 
70
+ it "works with 'ISO-8859-' encodings" do
71
+ string = "\xBC Idiosyncr\xE4tic\n\x91".force_encoding("ISO-8859-1")
72
+ result = Paint.unpaint(Unibits.visualize(string))
73
+ result.must_match "BC" # ¼
74
+ result.must_match "E4" # ä
75
+ result.must_match "␊" # \n
76
+ result.must_match "PU1" # C1 name for \x91
77
+ end
78
+
79
+ it "works with 'Windows-125' encodings" do
80
+ string = "\xBC Idiosyncr\xE4tic\n\x81".force_encoding("Windows-1252")
81
+ result = Paint.unpaint(Unibits.visualize(string))
82
+ result.must_match "BC" # ¼
83
+ result.must_match "E4" # ä
84
+ result.must_match "␊" # \n
85
+ result.must_match "n/a" # \x81 is not assigned
86
+ end
87
+
70
88
  describe "invalid UTF-8 encodings" do
71
89
  it "- unexpected continuation byte (1/2)" do
72
90
  string = "abc\x80efg"
@@ -19,7 +19,7 @@ Gem::Specification.new do |gem|
19
19
 
20
20
  gem.add_dependency 'paint', '>= 0.9', '< 3.0'
21
21
  gem.add_dependency 'unicode-display_width', '~> 1.1'
22
- gem.add_dependency 'unicode-categories', '~> 1.1', '>= 1.1.2'
22
+ gem.add_dependency 'characteristics', '~> 0.2.0'
23
23
  gem.add_dependency 'rationalist', '~> 2.0'
24
24
 
25
25
  gem.required_ruby_version = "~> 2.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unibits
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-09 00:00:00.000000000 Z
11
+ date: 2017-03-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: paint
@@ -45,25 +45,19 @@ dependencies:
45
45
  - !ruby/object:Gem::Version
46
46
  version: '1.1'
47
47
  - !ruby/object:Gem::Dependency
48
- name: unicode-categories
48
+ name: characteristics
49
49
  requirement: !ruby/object:Gem::Requirement
50
50
  requirements:
51
51
  - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '1.1'
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 1.1.2
53
+ version: 0.2.0
57
54
  type: :runtime
58
55
  prerelease: false
59
56
  version_requirements: !ruby/object:Gem::Requirement
60
57
  requirements:
61
58
  - - "~>"
62
59
  - !ruby/object:Gem::Version
63
- version: '1.1'
64
- - - ">="
65
- - !ruby/object:Gem::Version
66
- version: 1.1.2
60
+ version: 0.2.0
67
61
  - !ruby/object:Gem::Dependency
68
62
  name: rationalist
69
63
  requirement: !ruby/object:Gem::Requirement