unicode-display_width 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01657362aaf60cf79bb03c63bb96e01914139c7bb965dc9bed18e7988b8c6709
4
- data.tar.gz: 297cc1ab03e72a02e9f33eb4eec2dea2006f23987818083c4bc12aa168e437c3
3
+ metadata.gz: a85ca57ca5e291c17993e526d222dda44b884286484b3831bb8173ce92aafb1a
4
+ data.tar.gz: d1036dfc6464459de04a713e273d09dea767a3b9a9629d9e491052c2ffe97c23
5
5
  SHA512:
6
- metadata.gz: a3878d504a273e44268762fca4857bf26a9322e0e54c0afc437d953dca675822262c9aec54cb5de3d23390b4b778403b36ce0f73ba5b0f1d2c8554a1f796d210
7
- data.tar.gz: 00de0d22f3b245f16de15b3b4864ff754da04ea94eafdeaf06c0e38fec8cfb2559fbeaafc17f165534e70a386e154f70d7b071f5a226c9f64d7088bbb408cabb
6
+ metadata.gz: d669e8a2866b56a78bafb3fff6d2d6430fab6bb1ca2633aeaac68e0634ca14374ac0b325bc7159ef90afe0bdffd9c154700cae1fc3183b1d74281ff4b5024e1b
7
+ data.tar.gz: 5f319484d27dad70b3851398e11cd3cb93b5c4f41a6c3a76c958d505d8357f9e303b661fd7a0339262d1458b82cb8619e6682ee2dbf8c583d33fbde4fd1a8680
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.1.2
4
+
5
+ - Performance improvements
6
+
7
+ ## 3.1.1
8
+
9
+ - Performance improvements
10
+
3
11
  ## 3.1.0
4
12
 
5
13
  **Improve Emoji support:**
@@ -7,7 +15,7 @@
7
15
  - Emoji modes: Differentiate between well-formed Emoji (`:possible`) and any
8
16
  ZWJ/modifier sequence (`:all`). The latter is more common and more efficient
9
17
  to implement.
10
- - Unify `rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
18
+ - Unify `:rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
11
19
  the former `:rgi_uqe` option). Most terminals that want to support the RGI set
12
20
  will probably want to catch Emoji sequences with missing VS16s.
13
21
  - Add new `:all_no_vs16` and `:rgi_at` modes to be able to support some terminals
data/README.md CHANGED
@@ -114,10 +114,11 @@ The `emoji:` option can be used to configure which type of Emoji should be consi
114
114
  `:all_no_vs16` | EAW (1 or 2) | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | WezTerm
115
115
  `:possible`| 2 | 2 for all possible/well-formed Emoji sequences | ?
116
116
  `:rgi` | 2 | 2 for all [RGI Emoji](https://www.unicode.org/reports/tr51/#def_rgi_set) sequences | ?
117
- `:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have width 1 | Apple Terminal
117
+ `:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have EAW | Apple Terminal
118
118
  `:vs16` | 2 | 2 * number of partial Emoji (sequences never considered to represent a combined Emoji) | kitty?
119
119
  `false` or `:none` | EAW (1 or 2) | No Emoji adjustments | gnome-terminal, many older terminals
120
120
 
121
+ - *EAW:* East Asian Width
121
122
  - *RGI Emoji:* Emoji Recommended for General Interchange
122
123
  - *ZWJ:* Zero-width Joiner: Codepoint `U+200D`, used in many Emoji sequences
123
124
 
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Unicode
4
4
  class DisplayWidth
5
- VERSION = "3.1.0"
5
+ VERSION = "3.1.2"
6
6
  UNICODE_VERSION = "16.0.0"
7
7
  DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/")
8
8
  INDEX_FILENAME = DATA_DIRECTORY + "/display_width.marshal.gz"
@@ -10,8 +10,8 @@ module Unicode
10
10
  class DisplayWidth
11
11
  DEFAULT_AMBIGUOUS = 1
12
12
  INITIAL_DEPTH = 0x10000
13
- ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
14
- ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
13
+ ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
14
+ ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
15
15
  ASCII_BACKSPACE = "\b"
16
16
  AMBIGUOUS_MAP = {
17
17
  1 => :WIDTH_ONE,
@@ -21,6 +21,10 @@ module Unicode
21
21
  WIDTH_ONE: 768,
22
22
  WIDTH_TWO: 161,
23
23
  }
24
+ NOT_COMMON_NARROW_REGEX = {
25
+ WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
26
+ WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
27
+ }
24
28
  FIRST_4096 = {
25
29
  WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
26
30
  WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
@@ -30,126 +34,61 @@ module Unicode
30
34
  rgi_at: :REGEX_INCLUDE_MQE_UQE,
31
35
  possible: :REGEX_WELL_FORMED,
32
36
  }
33
- REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
37
+ REGEX_EMOJI_VS16 = Regexp.union(
38
+ Regexp.compile(
39
+ Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
40
+ "(?<![#*0-9])" +
41
+ "\u{FE0F}"
42
+ ),
43
+ Unicode::Emoji::REGEX_EMOJI_KEYCAP
44
+ )
34
45
  REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
35
- REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
46
+ REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)
36
47
 
37
48
  # Returns monospace display width of string
38
49
  def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
39
- unless old_options.empty?
40
- warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
41
- options.merge! old_options
42
- end
43
-
44
- options[:ambiguous] = ambiguous if ambiguous
45
- options[:ambiguous] ||= DEFAULT_AMBIGUOUS
46
-
47
- if options[:ambiguous] != 1 && options[:ambiguous] != 2
48
- raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
49
- end
50
-
51
- if overwrite && !overwrite.empty?
52
- warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
53
- options[:overwrite] = overwrite
54
- end
55
- options[:overwrite] ||= {}
56
-
57
- if [nil, true, :auto].include?(options[:emoji])
58
- options[:emoji] = EmojiSupport.recommended
59
- end
50
+ string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
51
+ options = normalize_options(string, ambiguous, overwrite, old_options, **options)
60
52
 
61
- # # #
53
+ width = 0
62
54
 
63
- if !options[:overwrite].empty?
64
- return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
65
- width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
66
- end
55
+ unless options[:overwrite].empty?
56
+ width, string = width_custom(string, options[:overwrite])
67
57
  end
68
58
 
69
- if !string.ascii_only?
70
- return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
71
- width_no_overwrite(string, index_full, index_low, first_ambiguous)
72
- end
59
+ if string.ascii_only?
60
+ return width + width_ascii(string)
73
61
  end
74
62
 
75
- width_ascii(string)
76
- end
63
+ ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
77
64
 
78
- def self.width_ascii(string)
79
- # Optimization for ASCII-only strings without certain control symbols
80
- if string.match?(ASCII_NON_ZERO_REGEX)
81
- res = string.delete(ASCII_NON_ZERO_STRING).size - string.count(ASCII_BACKSPACE)
82
- return res < 0 ? 0 : res
65
+ unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
66
+ return width + string.size
83
67
  end
84
68
 
85
- # Pure ASCII
86
- string.size
87
- end
88
-
89
- def self.width_frame(string, options)
90
69
  # Retrieve Emoji width
91
- if options[:emoji] == false || options[:emoji] == :none
92
- res = 0
93
- else
94
- res, string = emoji_width(
70
+ if options[:emoji] != :none
71
+ e_width, string = emoji_width(
95
72
  string,
96
73
  options[:emoji],
97
74
  options[:ambiguous],
98
75
  )
99
- end
100
-
101
- # Prepare indexes
102
- ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
103
-
104
- # Get general width
105
- res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
106
-
107
- # Return result + prevent negative lengths
108
- res < 0 ? 0 : res
109
- end
110
-
111
- def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ = {})
112
- res = 0
113
-
114
- # Make sure we have UTF-8
115
- string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
76
+ width += e_width
116
77
 
117
- string.scan(/.{,80}/m){ |batch|
118
- if batch.ascii_only?
119
- res += batch.size
120
- else
121
- batch.each_codepoint{ |codepoint|
122
- if codepoint > 15 && codepoint < first_ambiguous
123
- res += 1
124
- elsif codepoint < 0x1001
125
- res += index_low[codepoint] || 1
126
- else
127
- d = INITIAL_DEPTH
128
- w = index_full[codepoint / d]
129
- while w.instance_of? Array
130
- w = w[(codepoint %= d) / (d /= 16)]
131
- end
132
-
133
- res += w || 1
134
- end
135
- }
78
+ unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
79
+ return width + string.size
136
80
  end
137
- }
138
-
139
- res
140
- end
81
+ end
141
82
 
142
- # Same as .width_no_overwrite - but with applying overwrites for each char
143
- def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
144
- res = 0
83
+ index_full = INDEX[ambiguous_index_name]
84
+ index_low = FIRST_4096[ambiguous_index_name]
85
+ first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]
145
86
 
146
87
  string.each_codepoint{ |codepoint|
147
- if overwrite[codepoint]
148
- res += overwrite[codepoint]
149
- elsif codepoint > 15 && codepoint < first_ambiguous
150
- res += 1
88
+ if codepoint > 15 && codepoint < first_ambiguous
89
+ width += 1
151
90
  elsif codepoint < 0x1001
152
- res += index_low[codepoint] || 1
91
+ width += index_low[codepoint] || 1
153
92
  else
154
93
  d = INITIAL_DEPTH
155
94
  w = index_full[codepoint / d]
@@ -157,19 +96,44 @@ module Unicode
157
96
  w = w[(codepoint %= d) / (d /= 16)]
158
97
  end
159
98
 
160
- res += w || 1
99
+ width += w || 1
161
100
  end
162
101
  }
163
102
 
164
- res
103
+ # Return result + prevent negative lengths
104
+ width < 0 ? 0 : width
105
+ end
106
+
107
+ # Returns width of custom overwrites and remaining string
108
+ def self.width_custom(string, overwrite)
109
+ width = 0
110
+
111
+ string = string.each_codepoint.select{ |codepoint|
112
+ if overwrite[codepoint]
113
+ width += overwrite[codepoint]
114
+ nil
115
+ else
116
+ codepoint
117
+ end
118
+ }.pack("U*")
119
+
120
+ [width, string]
165
121
  end
166
122
 
123
+ # Returns width for ASCII-only strings. Will consider zero-width control symbols.
124
+ def self.width_ascii(string)
125
+ if string.match?(ASCII_NON_ZERO_REGEX)
126
+ res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
127
+ return res < 0 ? 0 : res
128
+ end
129
+
130
+ string.bytesize
131
+ end
167
132
 
133
+ # Returns width of all considered Emoji and remaining string
168
134
  def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
169
135
  res = 0
170
136
 
171
- string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
172
-
173
137
  if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
174
138
  emoji_width_via_possible(
175
139
  string,
@@ -177,45 +141,23 @@ module Unicode
177
141
  mode == :rgi_at,
178
142
  ambiguous,
179
143
  )
144
+
180
145
  elsif mode == :all_no_vs16
181
- emoji_width_all(string)
146
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
147
+ [res, no_emoji_string]
148
+
182
149
  elsif mode == :vs16
183
- emoji_width_basic(string)
150
+ no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
151
+ [res, no_emoji_string]
152
+
184
153
  elsif mode == :all
185
- res_all, string = emoji_width_all(string)
186
- res_basic, string = emoji_width_basic(string)
187
- [res_all + res_basic, string]
154
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
155
+ [res, no_emoji_string]
156
+
188
157
  else
189
158
  [0, string]
190
- end
191
- end
192
-
193
- # Ensure all explicit VS16 sequences have width 2
194
- def self.emoji_width_basic(string)
195
- res = 0
196
-
197
- no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
198
- if basic_emoji.size >= 2 # VS16 present
199
- res += 2
200
- ""
201
- else
202
- basic_emoji
203
- end
204
- }
205
-
206
- [res, no_emoji_string]
207
- end
208
-
209
- # Use simplistic ZWJ/modifier/kecap sequence matching
210
- def self.emoji_width_all(string)
211
- res = 0
212
-
213
- no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){
214
- res += 2
215
- ""
216
- }
217
159
 
218
- [res, no_emoji_string]
160
+ end
219
161
  end
220
162
 
221
163
  # Match possible Emoji first, then refine
@@ -223,13 +165,9 @@ module Unicode
223
165
  res = 0
224
166
 
225
167
  # For each string possibly an emoji
226
- no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
227
- # Skip notorious false positives
228
- if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
229
- emoji_candidate
230
-
168
+ no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
231
169
  # Check if we have a combined Emoji with width 2 (or EAW on Apple Terminal)
232
- elsif emoji_candidate == emoji_candidate[emoji_set_regex]
170
+ if emoji_candidate == emoji_candidate[emoji_set_regex]
233
171
  if strict_eaw
234
172
  res += self.of(emoji_candidate[0], ambiguous, emoji: false)
235
173
  else
@@ -241,14 +179,7 @@ module Unicode
241
179
  else
242
180
  if !strict_eaw
243
181
  # Ensure all explicit VS16 sequences have width 2
244
- emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
245
- if basic_emoji.size == 2 # VS16 present
246
- res += 2
247
- ""
248
- else
249
- basic_emoji
250
- end
251
- }
182
+ emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
252
183
  end
253
184
 
254
185
  emoji_candidate
@@ -258,6 +189,34 @@ module Unicode
258
189
  [res, no_emoji_string]
259
190
  end
260
191
 
192
+ def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
193
+ unless old_options.empty?
194
+ warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
195
+ options.merge! old_options
196
+ end
197
+
198
+ options[:ambiguous] = ambiguous if ambiguous
199
+ options[:ambiguous] ||= DEFAULT_AMBIGUOUS
200
+
201
+ if options[:ambiguous] != 1 && options[:ambiguous] != 2
202
+ raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
203
+ end
204
+
205
+ if overwrite && !overwrite.empty?
206
+ warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
207
+ options[:overwrite] = overwrite
208
+ end
209
+ options[:overwrite] ||= {}
210
+
211
+ if [nil, true, :auto].include?(options[:emoji])
212
+ options[:emoji] = EmojiSupport.recommended
213
+ elsif options[:emoji] == false
214
+ options[:emoji] = :none
215
+ end
216
+
217
+ options
218
+ end
219
+
261
220
  def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
262
221
  @ambiguous = ambiguous
263
222
  @overwrite = overwrite
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode-display_width
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-11-18 00:00:00.000000000 Z
11
+ date: 2024-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode-emoji
@@ -17,6 +17,9 @@ dependencies:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '4.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 4.0.4
20
23
  type: :runtime
21
24
  prerelease: false
22
25
  version_requirements: !ruby/object:Gem::Requirement
@@ -24,6 +27,9 @@ dependencies:
24
27
  - - "~>"
25
28
  - !ruby/object:Gem::Version
26
29
  version: '4.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 4.0.4
27
33
  - !ruby/object:Gem::Dependency
28
34
  name: rspec
29
35
  requirement: !ruby/object:Gem::Requirement