RubyGems - unicode-display_width - Versions diffs - 3.1.0 → 3.1.2 - Mend

unicode-display_width 3.1.0 → 3.1.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/CHANGELOG.md +9 -1
data/README.md +2 -1
data/data/display_width.marshal.gz +0 -0
data/lib/unicode/display_width/constants.rb +1 -1
data/lib/unicode/display_width.rb +108 -149
metadata +8 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 01657362aaf60cf79bb03c63bb96e01914139c7bb965dc9bed18e7988b8c6709
-  data.tar.gz: 297cc1ab03e72a02e9f33eb4eec2dea2006f23987818083c4bc12aa168e437c3
+  metadata.gz: a85ca57ca5e291c17993e526d222dda44b884286484b3831bb8173ce92aafb1a
+  data.tar.gz: d1036dfc6464459de04a713e273d09dea767a3b9a9629d9e491052c2ffe97c23
 SHA512:
-  metadata.gz: a3878d504a273e44268762fca4857bf26a9322e0e54c0afc437d953dca675822262c9aec54cb5de3d23390b4b778403b36ce0f73ba5b0f1d2c8554a1f796d210
-  data.tar.gz: 00de0d22f3b245f16de15b3b4864ff754da04ea94eafdeaf06c0e38fec8cfb2559fbeaafc17f165534e70a386e154f70d7b071f5a226c9f64d7088bbb408cabb
+  metadata.gz: d669e8a2866b56a78bafb3fff6d2d6430fab6bb1ca2633aeaac68e0634ca14374ac0b325bc7159ef90afe0bdffd9c154700cae1fc3183b1d74281ff4b5024e1b
+  data.tar.gz: 5f319484d27dad70b3851398e11cd3cb93b5c4f41a6c3a76c958d505d8357f9e303b661fd7a0339262d1458b82cb8619e6682ee2dbf8c583d33fbde4fd1a8680

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,13 @@
 # CHANGELOG
+## 3.1.2
+- Performance improvements
+## 3.1.1
+- Performance improvements
 ## 3.1.0
 **Improve Emoji support:**
@@ -7,7 +15,7 @@
 - Emoji modes: Differentiate between well-formed Emoji (`:possible`) and any
   ZWJ/modifier sequence (`:all`). The latter is more common and more efficient
   to implement.
-- Unify `rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
+- Unify `:rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
   the former `:rgi_uqe` option). Most terminals that want to support the RGI set
   will probably want to catch Emoji sequences with missing VS16s.
 - Add new `:all_no_vs16` and `:rgi_at` modes to be able to support some terminals

data/README.md CHANGED Viewed

@@ -114,10 +114,11 @@ The `emoji:` option can be used to configure which type of Emoji should be consi
 `:all_no_vs16` | EAW (1 or 2) | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | WezTerm
 `:possible`| 2                | 2 for all possible/well-formed Emoji sequences | ?
 `:rgi`     | 2                | 2 for all [RGI Emoji](https://www.unicode.org/reports/tr51/#def_rgi_set) sequences | ?
-`:rgi_at`  | EAW (1 or 2)     | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have width 1 | Apple Terminal
+`:rgi_at`  | EAW (1 or 2)     | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have EAW | Apple Terminal
 `:vs16`    | 2                | 2 * number of partial Emoji (sequences never considered to represent a combined Emoji) | kitty?
 `false` or  `:none` | EAW (1 or 2) | No Emoji adjustments | gnome-terminal, many older terminals
+- *EAW:* East Asian Width
 - *RGI Emoji:* Emoji Recommended for General Interchange
 - *ZWJ:* Zero-width Joiner: Codepoint `U+200D`,used in many Emoji sequences

data/data/display_width.marshal.gz CHANGED Viewed

Binary file

data/lib/unicode/display_width/constants.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Unicode
   class DisplayWidth
-    VERSION = "3.1.0"
+    VERSION = "3.1.2"
     UNICODE_VERSION = "16.0.0"
     DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/")
     INDEX_FILENAME = DATA_DIRECTORY + "/display_width.marshal.gz"

data/lib/unicode/display_width.rb CHANGED Viewed

@@ -10,8 +10,8 @@ module Unicode
   class DisplayWidth
     DEFAULT_AMBIGUOUS = 1
     INITIAL_DEPTH = 0x10000
-    ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
-    ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
+    ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
+    ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
     ASCII_BACKSPACE = "\b"
     AMBIGUOUS_MAP = {
       1 => :WIDTH_ONE,
@@ -21,6 +21,10 @@ module Unicode
       WIDTH_ONE: 768,
       WIDTH_TWO: 161,
     }
+    NOT_COMMON_NARROW_REGEX = {
+     WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
+     WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
+    }
     FIRST_4096 = {
       WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
       WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
@@ -30,126 +34,61 @@ module Unicode
       rgi_at: :REGEX_INCLUDE_MQE_UQE,
       possible: :REGEX_WELL_FORMED,
     }
-    REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
+    REGEX_EMOJI_VS16 = Regexp.union(
+      Regexp.compile(
+        Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
+        "(?<![#*0-9])" +
+        "\u{FE0F}"
+      ),
+      Unicode::Emoji::REGEX_EMOJI_KEYCAP
+    )
     REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
-    REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
+    REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)
     # Returns monospace display width of string
     def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
-      unless old_options.empty?
-        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
-        options.merge! old_options
-      end
-      options[:ambiguous] = ambiguous if ambiguous
-      options[:ambiguous] ||= DEFAULT_AMBIGUOUS
-      if options[:ambiguous] != 1 && options[:ambiguous] != 2
-        raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
-      end
-      if overwrite && !overwrite.empty?
-        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
-        options[:overwrite] = overwrite
-      end
-      options[:overwrite] ||= {}
-      if [nil, true, :auto].include?(options[:emoji])
-        options[:emoji] = EmojiSupport.recommended
-      end
+      string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
+      options = normalize_options(string, ambiguous, overwrite, old_options, **options)
-      # # #
+      width = 0
-      if !options[:overwrite].empty?
-        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
-          width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
-        end
+      unless options[:overwrite].empty?
+        width, string = width_custom(string, options[:overwrite])
       end
-      if !string.ascii_only?
-        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
-          width_no_overwrite(string, index_full, index_low, first_ambiguous)
-        end
+      if string.ascii_only?
+        return width + width_ascii(string)
       end
-      width_ascii(string)
-    end
+      ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
-    def self.width_ascii(string)
-      # Optimization for ASCII-only strings without certain control symbols
-      if string.match?(ASCII_NON_ZERO_REGEX)
-        res = string.delete(ASCII_NON_ZERO_STRING).size - string.count(ASCII_BACKSPACE)
-        return res < 0 ? 0 : res
+      unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
+        return width + string.size
       end
-      # Pure ASCII
-      string.size
-    end
-    def self.width_frame(string, options)
       # Retrieve Emoji width
-      if options[:emoji] == false || options[:emoji] == :none
-        res = 0
-      else
-        res, string = emoji_width(
+      if options[:emoji] != :none
+        e_width, string = emoji_width(
           string,
           options[:emoji],
           options[:ambiguous],
         )
-      end
-      # Prepare indexes
-      ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
-      # Get general width
-      res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
-      # Return result + prevent negative lengths
-      res < 0 ? 0 : res
-    end
-    def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ = {})
-      res = 0
-      # Make sure we have UTF-8
-      string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
+        width += e_width
-      string.scan(/.{,80}/m){ |batch|
-        if batch.ascii_only?
-          res += batch.size
-        else
-          batch.each_codepoint{ |codepoint|
-            if codepoint > 15 && codepoint < first_ambiguous
-              res += 1
-            elsif codepoint < 0x1001
-              res += index_low[codepoint] || 1
-            else
-              d = INITIAL_DEPTH
-              w = index_full[codepoint / d]
-              while w.instance_of? Array
-                w = w[(codepoint %= d) / (d /= 16)]
-              end
-              res += w || 1
-            end
-          }
+        unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
+          return width + string.size
         end
-      }
-      res
-    end
+      end
-    # Same as .width_no_overwrite - but with applying overwrites for each char
-    def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
-      res = 0
+      index_full = INDEX[ambiguous_index_name]
+      index_low = FIRST_4096[ambiguous_index_name]
+      first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]
       string.each_codepoint{ |codepoint|
-        if overwrite[codepoint]
-          res += overwrite[codepoint]
-        elsif codepoint > 15 && codepoint < first_ambiguous
-          res += 1
+        if codepoint > 15 && codepoint < first_ambiguous
+          width += 1
         elsif codepoint < 0x1001
-          res += index_low[codepoint] || 1
+          width += index_low[codepoint] || 1
         else
           d = INITIAL_DEPTH
           w = index_full[codepoint / d]
@@ -157,19 +96,44 @@ module Unicode
             w = w[(codepoint %= d) / (d /= 16)]
           end
-          res += w || 1
+          width += w || 1
         end
       }
-      res
+      # Return result + prevent negative lengths
+      width < 0 ? 0 : width
+    end
+    # Returns width of custom overwrites and remaining string
+    def self.width_custom(string, overwrite)
+      width = 0
+      string = string.each_codepoint.select{ |codepoint|
+        if overwrite[codepoint]
+          width += overwrite[codepoint]
+          nil
+        else
+          codepoint
+        end
+      }.pack("U*")
+      [width, string]
     end
+    # Returns width for ASCII-only strings. Will consider zero-width control symbols.
+    def self.width_ascii(string)
+      if string.match?(ASCII_NON_ZERO_REGEX)
+        res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
+        return res < 0 ? 0 : res
+      end
+      string.bytesize
+    end
+    # Returns width of all considered Emoji and remaining string
     def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
       res = 0
-      string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
       if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
         emoji_width_via_possible(
           string,
@@ -177,45 +141,23 @@ module Unicode
           mode == :rgi_at,
           ambiguous,
         )
       elsif mode == :all_no_vs16
-        emoji_width_all(string)
+        no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
+        [res, no_emoji_string]
       elsif mode == :vs16
-        emoji_width_basic(string)
+        no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
+        [res, no_emoji_string]
       elsif mode == :all
-        res_all, string = emoji_width_all(string)
-        res_basic, string = emoji_width_basic(string)
-        [res_all + res_basic, string]
+        no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
+        [res, no_emoji_string]
       else
         [0, string]
-      end
-    end
-    # Ensure all explicit VS16 sequences have width 2
-    def self.emoji_width_basic(string)
-      res = 0
-      no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
-        if basic_emoji.size >= 2 # VS16 present
-          res += 2
-          ""
-        else
-          basic_emoji
-        end
-      }
-      [res, no_emoji_string]
-    end
-    # Use simplistic ZWJ/modifier/kecap sequence matching
-    def self.emoji_width_all(string)
-      res = 0
-      no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){
-        res += 2
-        ""
-      }
-      [res, no_emoji_string]
+      end
     end
     # Match possible Emoji first, then refine
@@ -223,13 +165,9 @@ module Unicode
       res = 0
       # For each string possibly an emoji
-      no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
-        # Skip notorious false positives
-        if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
-          emoji_candidate
+      no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
         # Check if we have a combined Emoji with width 2 (or EAW an Apple Terminal)
-        elsif emoji_candidate == emoji_candidate[emoji_set_regex]
+        if emoji_candidate == emoji_candidate[emoji_set_regex]
           if strict_eaw
             res += self.of(emoji_candidate[0], ambiguous, emoji: false)
           else
@@ -241,14 +179,7 @@ module Unicode
         else
           if !strict_eaw
             # Ensure all explicit VS16 sequences have width 2
-            emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
-              if basic_emoji.size == 2 # VS16 present
-                res += 2
-                ""
-              else
-                basic_emoji
-              end
-            }
+            emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
           end
           emoji_candidate
@@ -258,6 +189,34 @@ module Unicode
       [res, no_emoji_string]
     end
+    def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
+      unless old_options.empty?
+        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
+        options.merge! old_options
+      end
+      options[:ambiguous] = ambiguous if ambiguous
+      options[:ambiguous] ||= DEFAULT_AMBIGUOUS
+      if options[:ambiguous] != 1 && options[:ambiguous] != 2
+        raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
+      end
+      if overwrite && !overwrite.empty?
+        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
+        options[:overwrite] = overwrite
+      end
+      options[:overwrite] ||= {}
+      if [nil, true, :auto].include?(options[:emoji])
+        options[:emoji] = EmojiSupport.recommended
+      elsif options[:emoji] == false
+        options[:emoji] = :none
+      end
+      options
+    end
     def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
       @ambiguous = ambiguous
       @overwrite = overwrite

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicode-display_width
 version: !ruby/object:Gem::Version
-  version: 3.1.0
+  version: 3.1.2
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-18 00:00:00.000000000 Z
+date: 2024-11-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode-emoji
@@ -17,6 +17,9 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4.0'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 4.0.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -24,6 +27,9 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4.0'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 4.0.4
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement