RubyGems - character_set - Versions diffs - 1.3.0-java → 1.4.0-java - Mend

character_set 1.3.0-java → 1.4.0-java

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +10 -1
data/README.md +13 -3
data/Rakefile +1 -1
data/lib/character_set/character.rb +1 -1
data/lib/character_set/shared_methods.rb +4 -0
data/lib/character_set/version.rb +1 -1
data/lib/character_set/writer.rb +98 -27
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cd336b705f4a2c9dc5af1fc0d841bbfb6324a5c8eb955bb72747c9c4c8a8431d
-  data.tar.gz: 965a1f84fe364d1e0d44039f2947f5c91dae2bcd485201fa262d1fbd41ba7dea
+  metadata.gz: 72bf4e9262b86b5d729e7e526e311a8617e70dff50f0f2c70a2081363c2bebb7
+  data.tar.gz: 26f6ff53583ebf1dae307076682df54e3c7bc365022abfc13b988131a0aecff6
 SHA512:
-  metadata.gz: dbef79700f9cc6d00387d373fdd0d307c1b2ebbcdd78d3efd25cbdc54e067b0576c53ec2b7b46eacde8def6a0feb2acd58396af62b462666dde213664c400d73
-  data.tar.gz: f66e839c472188f52511a4ff3a4fda3becbe0af177c8cc4c4d7aeeaf65bb55b256f10a03a4aec453c6800ccd0bd697b4adb514f6621177f08e750d448150333b
+  metadata.gz: '0392f8f135133ed9edd720633694a74df1ee6fcd0fcf806c1a4ecd26314976f129941efee839f9b1a9d2be95bc4d5f0964c36de2b15f309b10953628dc4a756f'
+  data.tar.gz: c52d3928e5c9dba3a130a0bd2e1c5b191ebd6211cc88cb9f5960fc389dea0447897633a4c5893f3ce457c76d9c1a518e11d8f4e40116cab7f728ccae972a3fd0

data/CHANGELOG.md CHANGED Viewed

@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## UNRELEASED
+## [1.4.0] - 2019-06-07
+### Added
+- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
+  - allows for much shorter astral plane representations e.g. in JavaScript
+  - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
+- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
+### Fixed
+- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
 ## [1.3.0] - 2019-04-26

data/README.md CHANGED Viewed

@@ -40,7 +40,7 @@ CharacterSet.parse('[a-c]')
 CharacterSet.parse('\U00000061-\U00000063')
 ```
-If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
+If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
 ```ruby
 CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -167,8 +167,18 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
 # disable abbreviation (grouping of codepoints in ranges)
 set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
-# for full js regex compatibility in case of astral members:
-set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
+# astral members require some trickery if we want to target environments
+# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
+set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
+# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
+set.to_s_with_surrogate_ranges
+# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
+# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
+# don't work in your target environment:
+set.to_s_with_surrogate_alternation
+# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
 ```
 ### Unicode plane methods

data/Rakefile CHANGED Viewed

@@ -126,7 +126,7 @@ task :sync_predefined_sets do
   %w[assigned emoji whitespace].each do |prop|
     require 'regexp_property_values'
     ranges = RegexpPropertyValues[prop].matched_ranges
-    str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
+    str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
     File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
   end
 end

data/lib/character_set/character.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 class CharacterSet
   class Character
     ENCODING = 'utf-8'.freeze
-    SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
+    SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
     attr_accessor :codepoint

data/lib/character_set/shared_methods.rb CHANGED Viewed

@@ -86,6 +86,10 @@ class CharacterSet
           Writer.write(ranges, opts, &block)
         end
+        def to_s_with_surrogate_ranges
+          Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
+        end
         def to_s_with_surrogate_alternation
           Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
         end

data/lib/character_set/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class CharacterSet
-  VERSION = '1.3.0'
+  VERSION = '1.4.0'
 end

data/lib/character_set/writer.rb CHANGED Viewed

@@ -1,37 +1,108 @@
 class CharacterSet
   module Writer
-    module_function
-    def write(codepoint_ranges, opts = {}, &block)
-      content = codepoint_ranges.map do |range|
-        if range.size > 2 && opts[:abbreviate] != false
-          range.minmax.map { |cp| Character.new(cp).escape(opts, &block) }.join('-')
-        else
-          range.map { |cp| Character.new(cp).escape(opts, &block) }.join
+    class << self
+      def write(codepoint_ranges, opts = {}, &block)
+        content = codepoint_ranges.map do |range|
+          if range.size > 2 && opts[:abbreviate] != false
+            bounds = [range.min, range.max]
+            bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
+          else
+            range.map { |cp| write_codepoint(cp, opts, &block) }.join
+          end
+        end.join
+        opts[:in_brackets] ? "[#{content}]" : content
+      end
+      def write_codepoint(codepoint, opts = {}, &block)
+        Character.new(codepoint).escape(opts, &block)
+      end
+      def write_surrogate_ranges(bmp_ranges, astral_ranges)
+        astral_branches = surrogate_range_expressions(astral_ranges)
+        bmp_set_with_alternatives(bmp_ranges, astral_branches)
+      end
+      def write_surrogate_alternation(bmp_ranges, astral_ranges)
+        astral_branches = surrogate_pairs(astral_ranges)
+        bmp_set_with_alternatives(bmp_ranges, astral_branches)
+      end
+      private
+      def surrogate_range_expressions(astral_ranges)
+        compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
+          [hi_ranges, lo_ranges].map do |ranges|
+            use_brackets = ranges.size > 1 || ranges.first.size > 1
+            write(ranges, format: :js, in_brackets: use_brackets)
+          end.join
         end
-      end.join
-      opts[:in_brackets] ? "[#{content}]" : content
-    end
+      end
+      def compressed_surrogate_range_pairs(astral_ranges)
+        halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
-    def write_surrogate_alternation(bmp_ranges, astral_ranges)
-      bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
-      if astral_ranges.empty?
-        bmp_set
-      else
-        surrogate_pairs = surrogate_pairs(astral_ranges)
-        "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + surrogate_pairs) * '|'})"
+        # compress high surrogate codepoint ranges with common low range half
+        with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
+          hi_ranges = pairs.map(&:first)
+          compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
+            prev = arr.last
+            if prev.nil? || prev.max + 1 < range.min # first or gap
+              arr << range
+            else # continuous codepoints, expand previous range
+              arr[-1] = (prev.min)..(range.max)
+            end
+          end
+          [compressed_hi_ranges, lo_range]
+        end
+        # compress low surrogate codepoint ranges with common high ranges
+        with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
+          (hash[hi_ranges] ||= []) << lo_range
+        end
       end
-    end
-    def surrogate_pairs(astral_ranges)
-      astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
-    end
+      def surrogate_half_ranges(astral_range)
+        hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
+        hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
+        hi_count = 1 + hi_max - hi_min
+        return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
+        ranges = []
+        # first high surrogate might be partially covered (if lo_min > 0xDC00)
+        ranges << [hi_min..hi_min, lo_min..0xDFFF]
+        # any high surrogates in between are fully covered
+        ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
-    def surrogate_pair(astral_codepoint)
-      base = astral_codepoint - 0x10000
-      high = ((base / 1024).floor + 0xD800).to_s(16)
-      low  = (base % 1024 + 0xDC00).to_s(16)
-      "\\u#{high}\\u#{low}"
+        # last high surrogate might be partially covered (if lo_max < 0xDFFF)
+        ranges << [hi_max..hi_max, 0xDC00..lo_max]
+        ranges
+      end
+      def surrogate_pair_codepoints(astral_codepoint)
+        base = astral_codepoint - 0x10000
+        high = base / 1024 + 0xD800
+        low  = base % 1024 + 0xDC00
+        [high, low]
+      end
+      def bmp_set_with_alternatives(bmp_ranges, alternatives)
+        bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
+        return bmp_set if alternatives.empty? && bmp_ranges.any?
+        "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
+      end
+      def surrogate_pairs(astral_ranges)
+        astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
+      end
+      def surrogate_pair(astral_codepoint)
+        surrogate_pair_codepoints(astral_codepoint)
+          .map { |half| write_codepoint(half, format: :js) }.join
+      end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: character_set
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: java
 authors:
 - Janosch Müller
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-05-26 00:00:00.000000000 Z
+date: 2019-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: benchmark-ips