unicode-display_width 3.1.0 → 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01657362aaf60cf79bb03c63bb96e01914139c7bb965dc9bed18e7988b8c6709
4
- data.tar.gz: 297cc1ab03e72a02e9f33eb4eec2dea2006f23987818083c4bc12aa168e437c3
3
+ metadata.gz: a85ca57ca5e291c17993e526d222dda44b884286484b3831bb8173ce92aafb1a
4
+ data.tar.gz: d1036dfc6464459de04a713e273d09dea767a3b9a9629d9e491052c2ffe97c23
5
5
  SHA512:
6
- metadata.gz: a3878d504a273e44268762fca4857bf26a9322e0e54c0afc437d953dca675822262c9aec54cb5de3d23390b4b778403b36ce0f73ba5b0f1d2c8554a1f796d210
7
- data.tar.gz: 00de0d22f3b245f16de15b3b4864ff754da04ea94eafdeaf06c0e38fec8cfb2559fbeaafc17f165534e70a386e154f70d7b071f5a226c9f64d7088bbb408cabb
6
+ metadata.gz: d669e8a2866b56a78bafb3fff6d2d6430fab6bb1ca2633aeaac68e0634ca14374ac0b325bc7159ef90afe0bdffd9c154700cae1fc3183b1d74281ff4b5024e1b
7
+ data.tar.gz: 5f319484d27dad70b3851398e11cd3cb93b5c4f41a6c3a76c958d505d8357f9e303b661fd7a0339262d1458b82cb8619e6682ee2dbf8c583d33fbde4fd1a8680
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.1.2
4
+
5
+ - Performance improvements
6
+
7
+ ## 3.1.1
8
+
9
+ - Performance improvements
10
+
3
11
  ## 3.1.0
4
12
 
5
13
  **Improve Emoji support:**
@@ -7,7 +15,7 @@
7
15
  - Emoji modes: Differentiate between well-formed Emoji (`:possible`) and any
8
16
  ZWJ/modifier sequence (`:all`). The latter is more common and more efficient
9
17
  to implement.
10
- - Unify `rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
18
+ - Unify `:rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
11
19
  the former `:rgi_uqe` option). Most terminals that want to support the RGI set
12
20
  will probably want to catch Emoji sequences with missing VS16s.
13
21
  - Add new `:all_no_vs16` and `:rgi_at` modes to be able to support some terminals
data/README.md CHANGED
@@ -114,10 +114,11 @@ The `emoji:` option can be used to configure which type of Emoji should be consi
114
114
  `:all_no_vs16` | EAW (1 or 2) | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | WezTerm
115
115
  `:possible`| 2 | 2 for all possible/well-formed Emoji sequences | ?
116
116
  `:rgi` | 2 | 2 for all [RGI Emoji](https://www.unicode.org/reports/tr51/#def_rgi_set) sequences | ?
117
- `:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have width 1 | Apple Terminal
117
+ `:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have EAW | Apple Terminal
118
118
  `:vs16` | 2 | 2 * number of partial Emoji (sequences never considered to represent a combined Emoji) | kitty?
119
119
  `false` or `:none` | EAW (1 or 2) | No Emoji adjustments | gnome-terminal, many older terminals
120
120
 
121
+ - *EAW:* East Asian Width
121
122
  - *RGI Emoji:* Emoji Recommended for General Interchange
122
123
  - *ZWJ:* Zero-width Joiner: Codepoint `U+200D`, used in many Emoji sequences
123
124
 
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Unicode
4
4
  class DisplayWidth
5
- VERSION = "3.1.0"
5
+ VERSION = "3.1.2"
6
6
  UNICODE_VERSION = "16.0.0"
7
7
  DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/")
8
8
  INDEX_FILENAME = DATA_DIRECTORY + "/display_width.marshal.gz"
@@ -10,8 +10,8 @@ module Unicode
10
10
  class DisplayWidth
11
11
  DEFAULT_AMBIGUOUS = 1
12
12
  INITIAL_DEPTH = 0x10000
13
- ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
14
- ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
13
+ ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
14
+ ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
15
15
  ASCII_BACKSPACE = "\b"
16
16
  AMBIGUOUS_MAP = {
17
17
  1 => :WIDTH_ONE,
@@ -21,6 +21,10 @@ module Unicode
21
21
  WIDTH_ONE: 768,
22
22
  WIDTH_TWO: 161,
23
23
  }
24
+ NOT_COMMON_NARROW_REGEX = {
25
+ WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
26
+ WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
27
+ }
24
28
  FIRST_4096 = {
25
29
  WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
26
30
  WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
@@ -30,126 +34,61 @@ module Unicode
30
34
  rgi_at: :REGEX_INCLUDE_MQE_UQE,
31
35
  possible: :REGEX_WELL_FORMED,
32
36
  }
33
- REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
37
+ REGEX_EMOJI_VS16 = Regexp.union(
38
+ Regexp.compile(
39
+ Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
40
+ "(?<![#*0-9])" +
41
+ "\u{FE0F}"
42
+ ),
43
+ Unicode::Emoji::REGEX_EMOJI_KEYCAP
44
+ )
34
45
  REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
35
- REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
46
+ REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)
36
47
 
37
48
  # Returns monospace display width of string
38
49
  def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
39
- unless old_options.empty?
40
- warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
41
- options.merge! old_options
42
- end
43
-
44
- options[:ambiguous] = ambiguous if ambiguous
45
- options[:ambiguous] ||= DEFAULT_AMBIGUOUS
46
-
47
- if options[:ambiguous] != 1 && options[:ambiguous] != 2
48
- raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
49
- end
50
-
51
- if overwrite && !overwrite.empty?
52
- warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
53
- options[:overwrite] = overwrite
54
- end
55
- options[:overwrite] ||= {}
56
-
57
- if [nil, true, :auto].include?(options[:emoji])
58
- options[:emoji] = EmojiSupport.recommended
59
- end
50
+ string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
51
+ options = normalize_options(string, ambiguous, overwrite, old_options, **options)
60
52
 
61
- # # #
53
+ width = 0
62
54
 
63
- if !options[:overwrite].empty?
64
- return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
65
- width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
66
- end
55
+ unless options[:overwrite].empty?
56
+ width, string = width_custom(string, options[:overwrite])
67
57
  end
68
58
 
69
- if !string.ascii_only?
70
- return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
71
- width_no_overwrite(string, index_full, index_low, first_ambiguous)
72
- end
59
+ if string.ascii_only?
60
+ return width + width_ascii(string)
73
61
  end
74
62
 
75
- width_ascii(string)
76
- end
63
+ ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
77
64
 
78
- def self.width_ascii(string)
79
- # Optimization for ASCII-only strings without certain control symbols
80
- if string.match?(ASCII_NON_ZERO_REGEX)
81
- res = string.delete(ASCII_NON_ZERO_STRING).size - string.count(ASCII_BACKSPACE)
82
- return res < 0 ? 0 : res
65
+ unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
66
+ return width + string.size
83
67
  end
84
68
 
85
- # Pure ASCII
86
- string.size
87
- end
88
-
89
- def self.width_frame(string, options)
90
69
  # Retrieve Emoji width
91
- if options[:emoji] == false || options[:emoji] == :none
92
- res = 0
93
- else
94
- res, string = emoji_width(
70
+ if options[:emoji] != :none
71
+ e_width, string = emoji_width(
95
72
  string,
96
73
  options[:emoji],
97
74
  options[:ambiguous],
98
75
  )
99
- end
100
-
101
- # Prepare indexes
102
- ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
103
-
104
- # Get general width
105
- res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
106
-
107
- # Return result + prevent negative lengths
108
- res < 0 ? 0 : res
109
- end
110
-
111
- def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ = {})
112
- res = 0
113
-
114
- # Make sure we have UTF-8
115
- string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
76
+ width += e_width
116
77
 
117
- string.scan(/.{,80}/m){ |batch|
118
- if batch.ascii_only?
119
- res += batch.size
120
- else
121
- batch.each_codepoint{ |codepoint|
122
- if codepoint > 15 && codepoint < first_ambiguous
123
- res += 1
124
- elsif codepoint < 0x1001
125
- res += index_low[codepoint] || 1
126
- else
127
- d = INITIAL_DEPTH
128
- w = index_full[codepoint / d]
129
- while w.instance_of? Array
130
- w = w[(codepoint %= d) / (d /= 16)]
131
- end
132
-
133
- res += w || 1
134
- end
135
- }
78
+ unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
79
+ return width + string.size
136
80
  end
137
- }
138
-
139
- res
140
- end
81
+ end
141
82
 
142
- # Same as .width_no_overwrite - but with applying overwrites for each char
143
- def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
144
- res = 0
83
+ index_full = INDEX[ambiguous_index_name]
84
+ index_low = FIRST_4096[ambiguous_index_name]
85
+ first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]
145
86
 
146
87
  string.each_codepoint{ |codepoint|
147
- if overwrite[codepoint]
148
- res += overwrite[codepoint]
149
- elsif codepoint > 15 && codepoint < first_ambiguous
150
- res += 1
88
+ if codepoint > 15 && codepoint < first_ambiguous
89
+ width += 1
151
90
  elsif codepoint < 0x1001
152
- res += index_low[codepoint] || 1
91
+ width += index_low[codepoint] || 1
153
92
  else
154
93
  d = INITIAL_DEPTH
155
94
  w = index_full[codepoint / d]
@@ -157,19 +96,44 @@ module Unicode
157
96
  w = w[(codepoint %= d) / (d /= 16)]
158
97
  end
159
98
 
160
- res += w || 1
99
+ width += w || 1
161
100
  end
162
101
  }
163
102
 
164
- res
103
+ # Return result + prevent negative lengths
104
+ width < 0 ? 0 : width
105
+ end
106
+
107
+ # Returns width of custom overwrites and remaining string
108
+ def self.width_custom(string, overwrite)
109
+ width = 0
110
+
111
+ string = string.each_codepoint.select{ |codepoint|
112
+ if overwrite[codepoint]
113
+ width += overwrite[codepoint]
114
+ nil
115
+ else
116
+ codepoint
117
+ end
118
+ }.pack("U*")
119
+
120
+ [width, string]
165
121
  end
166
122
 
123
+ # Returns width for ASCII-only strings. Will consider zero-width control symbols.
124
+ def self.width_ascii(string)
125
+ if string.match?(ASCII_NON_ZERO_REGEX)
126
+ res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
127
+ return res < 0 ? 0 : res
128
+ end
129
+
130
+ string.bytesize
131
+ end
167
132
 
133
+ # Returns width of all considered Emoji and remaining string
168
134
  def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
169
135
  res = 0
170
136
 
171
- string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
172
-
173
137
  if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
174
138
  emoji_width_via_possible(
175
139
  string,
@@ -177,45 +141,23 @@ module Unicode
177
141
  mode == :rgi_at,
178
142
  ambiguous,
179
143
  )
144
+
180
145
  elsif mode == :all_no_vs16
181
- emoji_width_all(string)
146
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
147
+ [res, no_emoji_string]
148
+
182
149
  elsif mode == :vs16
183
- emoji_width_basic(string)
150
+ no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
151
+ [res, no_emoji_string]
152
+
184
153
  elsif mode == :all
185
- res_all, string = emoji_width_all(string)
186
- res_basic, string = emoji_width_basic(string)
187
- [res_all + res_basic, string]
154
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
155
+ [res, no_emoji_string]
156
+
188
157
  else
189
158
  [0, string]
190
- end
191
- end
192
-
193
- # Ensure all explicit VS16 sequences have width 2
194
- def self.emoji_width_basic(string)
195
- res = 0
196
-
197
- no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
198
- if basic_emoji.size >= 2 # VS16 present
199
- res += 2
200
- ""
201
- else
202
- basic_emoji
203
- end
204
- }
205
-
206
- [res, no_emoji_string]
207
- end
208
-
209
- # Use simplistic ZWJ/modifier/kecap sequence matching
210
- def self.emoji_width_all(string)
211
- res = 0
212
-
213
- no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){
214
- res += 2
215
- ""
216
- }
217
159
 
218
- [res, no_emoji_string]
160
+ end
219
161
  end
220
162
 
221
163
  # Match possible Emoji first, then refine
@@ -223,13 +165,9 @@ module Unicode
223
165
  res = 0
224
166
 
225
167
  # For each string possibly an emoji
226
- no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
227
- # Skip notorious false positives
228
- if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
229
- emoji_candidate
230
-
168
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
231
169
  # Check if we have a combined Emoji with width 2 (or EAW an Apple Terminal)
232
- elsif emoji_candidate == emoji_candidate[emoji_set_regex]
170
+ if emoji_candidate == emoji_candidate[emoji_set_regex]
233
171
  if strict_eaw
234
172
  res += self.of(emoji_candidate[0], ambiguous, emoji: false)
235
173
  else
@@ -241,14 +179,7 @@ module Unicode
241
179
  else
242
180
  if !strict_eaw
243
181
  # Ensure all explicit VS16 sequences have width 2
244
- emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
245
- if basic_emoji.size == 2 # VS16 present
246
- res += 2
247
- ""
248
- else
249
- basic_emoji
250
- end
251
- }
182
+ emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
252
183
  end
253
184
 
254
185
  emoji_candidate
@@ -258,6 +189,34 @@ module Unicode
258
189
  [res, no_emoji_string]
259
190
  end
260
191
 
192
+ def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
193
+ unless old_options.empty?
194
+ warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
195
+ options.merge! old_options
196
+ end
197
+
198
+ options[:ambiguous] = ambiguous if ambiguous
199
+ options[:ambiguous] ||= DEFAULT_AMBIGUOUS
200
+
201
+ if options[:ambiguous] != 1 && options[:ambiguous] != 2
202
+ raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
203
+ end
204
+
205
+ if overwrite && !overwrite.empty?
206
+ warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
207
+ options[:overwrite] = overwrite
208
+ end
209
+ options[:overwrite] ||= {}
210
+
211
+ if [nil, true, :auto].include?(options[:emoji])
212
+ options[:emoji] = EmojiSupport.recommended
213
+ elsif options[:emoji] == false
214
+ options[:emoji] = :none
215
+ end
216
+
217
+ options
218
+ end
219
+
261
220
  def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
262
221
  @ambiguous = ambiguous
263
222
  @overwrite = overwrite
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode-display_width
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-11-18 00:00:00.000000000 Z
11
+ date: 2024-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode-emoji
@@ -17,6 +17,9 @@ dependencies:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '4.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 4.0.4
20
23
  type: :runtime
21
24
  prerelease: false
22
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -24,6 +27,9 @@ dependencies:
24
27
  - - "~>"
25
28
  - !ruby/object:Gem::Version
26
29
  version: '4.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 4.0.4
27
33
  - !ruby/object:Gem::Dependency
28
34
  name: rspec
29
35
  requirement: !ruby/object:Gem::Requirement