RubyGems - character_set - Versions diffs - 1.6.0-java → 1.8.0-java - Mend

character_set 1.6.0-java → 1.8.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +4 -4
data/.github/workflows/gouteur.yml +1 -1
data/.github/workflows/lint.yml +1 -1
data/.github/workflows/tests.yml +3 -1
data/.rubocop.yml +3 -0
data/BENCHMARK.md +32 -32
data/CHANGELOG.md +24 -1
data/Gemfile +7 -6
data/LICENSE.txt +1 -1
data/README.md +3 -3
data/Rakefile +2 -123
data/character_set.gemspec +0 -7
data/ext/character_set/character_set.c +77 -43
data/lib/character_set/core_ext/regexp_ext.rb +8 -0
data/lib/character_set/expression_converter.rb +37 -54
data/lib/character_set/parser.rb +8 -4
data/lib/character_set/predefined_sets/assigned.cps +73 -52
data/lib/character_set/predefined_sets/emoji.cps +10 -9
data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
data/lib/character_set/ruby_fallback.rb +18 -6
data/lib/character_set/set_method_adapters.rb +1 -1
data/lib/character_set/shared_methods.rb +6 -2
data/lib/character_set/version.rb +1 -1
data/tasks/benchmark.rake +20 -0
data/tasks/benchmarks/shared.rb +28 -0
data/tasks/sync_casefold_data.rake +20 -0
data/tasks/sync_predefined_sets.rake +9 -0
data/tasks/sync_ruby_spec.rake +65 -0
metadata +19 -28
data/benchmarks/shared.rb +0 -30
/data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0

data/ext/character_set/character_set.c CHANGED Viewed

@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
   cps = data->cps;
   len = data->len;
   cp = FIX2ULONG(cp_num);
-  if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
+  if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
   {
     return Qnil;
   }
+  if (on)
+  {
+    set_cp(data, cp);
+  }
   else
   {
-    if (on)
-    {
-      set_cp(data, cp);
-    }
-    else
-    {
-      clr_cp(cps, len, cp);
-    }
-    return cs;
+    clr_cp(cps, len, cp);
   }
+  return cs;
 }
 static VALUE
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
   {
     return cs_merge_cs(self, other);
   }
-  else if (TYPE(other) == T_ARRAY)
+  if (TYPE(other) == T_ARRAY)
   {
     return cs_merge_rb_array(self, other);
   }
@@ -677,6 +675,18 @@ cs_method_proper_superset_p(VALUE self, VALUE other)
   return (is_superset && is_proper) ? Qtrue : Qfalse;
 }
+static VALUE
+cs_method_spaceship_operator(VALUE self, VALUE other)
+{
+  if (cs_method_eql_p(self, other))
+    return INT2FIX(0);
+  if (cs_method_proper_subset_p(self, other))
+    return INT2FIX(-1);
+  if (cs_method_proper_superset_p(self, other))
+    return INT2FIX(1);
+  return Qnil;
+}
 // *******************************
 // `CharacterSet`-specific methods
 // *******************************
@@ -917,10 +927,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
   return new_cs;
 }
-typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
+typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
 static inline int
-add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   set_cp(data, str_cp);
   return 1;
@@ -967,7 +977,7 @@ cs_method_case_insensitive(VALUE self)
 }
 static inline VALUE
-each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   long i, str_len;
   unsigned int str_cp;
@@ -986,21 +996,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
 }
 static inline VALUE
-each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   int n;
   unsigned int str_cp;
   const char *ptr, *end;
-  rb_encoding *enc;
+  rb_encoding *utf8;
+  utf8 = rb_utf8_encoding();
+  if (rb_enc_get(str) == utf8)
+  {
+    str = rb_str_new_frozen(str);
+  }
+  else
+  {
+    str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
+  }
-  str = rb_str_new_frozen(str);
   ptr = RSTRING_PTR(str);
   end = RSTRING_END(str);
-  enc = rb_enc_get(str);
   while (ptr < end)
   {
-    str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
+    str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
     if (!(*func)(str_cp, cp_arr, len, data, memo))
     {
       return Qfalse;
@@ -1031,12 +1049,13 @@ single_byte_optimizable(VALUE str)
 }
 static inline VALUE
-each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (single_byte_optimizable(str))
   {
     return each_sb_cp(str, func, cp_arr, len, data, memo);
   }
   return each_mb_cp(str, func, cp_arr, len, data, memo);
 }
@@ -1062,11 +1081,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
 }
 static inline int
-count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (tst_cp(cp_arr, len, str_cp))
   {
-    *memo += 1;
+    *((VALUE *)memo) += 1;
   }
   return 1;
 }
@@ -1074,17 +1093,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
 static VALUE
 cs_method_count_in(VALUE self, VALUE str)
 {
-  VALUE count;
+  long count;
   struct cs_data *data;
   raise_arg_err_unless_string(str);
   data = cs_fetch_data(self);
   count = 0;
-  each_cp(str, count_str_cp, data->cps, data->len, data, &count);
-  return INT2NUM((int)count);
+  each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
+  return LONG2FIX(count);
 }
 static inline int
-str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   return tst_cp(cp_arr, len, str_cp);
 }
@@ -1099,11 +1118,11 @@ cs_method_cover_p(VALUE self, VALUE str)
 }
 static inline int
-add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (tst_cp(cp_arr, len, str_cp))
   {
-    rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
+    rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
   }
   return 1;
 }
@@ -1111,18 +1130,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
 static VALUE
 cs_method_scan(VALUE self, VALUE str)
 {
-  VALUE memo[2];
+  VALUE memo;
   struct cs_data *data;
   raise_arg_err_unless_string(str);
   data = cs_fetch_data(self);
-  memo[0] = rb_ary_new();
-  memo[1] = (VALUE)rb_enc_get(str);
+  memo = rb_ary_new();
   each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
-  return memo[0];
+  return memo;
 }
 static inline int
-str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   return !tst_cp(cp_arr, len, str_cp);
 }
@@ -1146,9 +1164,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
   cs_cp cs_len;
   VALUE orig_str_len;
-  rb_encoding *enc;
+  rb_encoding *orig_enc, *utf8;
   char *s, *send, *t;
-  int ascompat, cr;
+  int orig_was_utf8, cr;
   raise_arg_err_unless_string(str);
@@ -1159,24 +1177,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     return bang ? Qnil : str;
   }
-  if (!bang)
+  orig_enc = rb_enc_get(str);
+  utf8 = rb_utf8_encoding();
+  orig_was_utf8 = orig_enc == utf8;
+  if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
+  {
+    str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
+  }
+  else
   {
-    str = rb_str_dup(str);
+    if (!bang)
+    {
+      str = rb_str_dup(str);
+    }
   }
   cps = cs_fetch_cps(set, &cs_len);
   rb_str_modify(str);
-  enc = rb_enc_get(str);
-  ascompat = rb_enc_asciicompat(enc);
   s = t = RSTRING_PTR(str);
   send = RSTRING_END(str);
-  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+  cr = ENC_CODERANGE_7BIT;
   while (s < send)
   {
     unsigned int c;
     int clen;
-    if (ascompat && (c = *(unsigned char *)s) < 0x80)
+    if ((c = *(unsigned char *)s) < 0x80)
     {
       if (tst_cp(cps, cs_len, c) != delete)
       {
@@ -1188,12 +1216,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     }
     else
     {
-      c = rb_enc_codepoint_len(s, send, &clen, enc);
+      c = rb_enc_codepoint_len(s, send, &clen, utf8);
       if (tst_cp(cps, cs_len, c) != delete)
       {
         if (t != s)
-          rb_enc_mbcput(c, t, enc);
+          rb_enc_mbcput(c, t, utf8);
         t += clen;
         if (cr == ENC_CODERANGE_7BIT)
           cr = ENC_CODERANGE_VALID;
@@ -1210,6 +1238,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     return Qnil;
   }
+  if (!orig_was_utf8)
+  {
+    return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
+  }
   return str;
 }
@@ -1303,6 +1336,7 @@ void Init_character_set()
   rb_define_method(cs, ">=", cs_method_superset_p, 1);
   rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
   rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
+  rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1);
   // `CharacterSet`-specific methods

data/lib/character_set/core_ext/regexp_ext.rb CHANGED Viewed

@@ -4,6 +4,14 @@ class CharacterSet
       def character_set
         CharacterSet.of_regexp(self)
       end
+      def covered_by_character_set?(other)
+        other.superset?(character_set)
+      end
+      def uses_character_set?(other)
+        other.intersect?(character_set)
+      end
     end
   end
 end

data/lib/character_set/expression_converter.rb CHANGED Viewed

@@ -4,86 +4,61 @@ class CharacterSet
     Error = Class.new(ArgumentError)
-    def convert(expression, to = CharacterSet)
+    def convert(expression, to = CharacterSet, acc = [])
       CharacterSet.require_optional_dependency('regexp_parser', __method__)
       case expression
-      when Regexp::Expression::Root
-        if expression.count != 1
-          raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
-        end
-        convert(expression[0], to)
       when Regexp::Expression::CharacterSet
-        content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
-        content ||= to[]
-        expression.negative? ? content.inversion : content
+        content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
+        acc << (expression.negative? ? content.inversion : content)
       when Regexp::Expression::CharacterSet::Intersection
-        expression.map { |subexp| convert(subexp, to) }.reduce(:&)
-      when Regexp::Expression::CharacterSet::IntersectedSequence
-        expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
+        acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)
       when Regexp::Expression::CharacterSet::Range
         start, finish = expression.map { |subexp| convert(subexp, to) }
-        to.new((start.min)..(finish.max))
+        acc << to.new((start.min)..(finish.max))
+      when Regexp::Expression::Subexpression # root, group, alternation, etc.
+        expression.each { |subexp| convert(subexp, to, acc) }
       when Regexp::Expression::CharacterType::Any
-        to.unicode
+        acc << to.unicode
       when Regexp::Expression::CharacterType::Base
         /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
         content =
           if expression.unicode_classes?
-            # in u-mode, type shortcuts match the same as \p{<long type name>}
-            to.of_property(base_name)
+            # in u-mode, most type shortcuts match the same as \p{<long type name>}
+            if base_name == 'linebreak'
+              to.from_ranges(10..13, 133..133, 8232..8233)
+            else
+              to.of_property(base_name)
+            end
           else
             # in normal mode, types match only ascii chars
             case base_name.to_sym
-            when :digit then to.from_ranges(48..57)
-            when :hex   then to.from_ranges(48..57, 65..70, 97..102)
-            when :space then to.from_ranges(9..13, 32..32)
-            when :word  then to.from_ranges(48..57, 65..90, 95..95, 97..122)
+            when :digit     then to.from_ranges(48..57)
+            when :hex       then to.from_ranges(48..57, 65..70, 97..102)
+            when :linebreak then to.from_ranges(10..13)
+            when :space     then to.from_ranges(9..13, 32..32)
+            when :word      then to.from_ranges(48..57, 65..90, 95..95, 97..122)
             else raise Error, "Unsupported CharacterType #{base_name}"
             end
           end
-        negative ? content.inversion : content
+        acc << (negative ? content.inversion : content)
       when Regexp::Expression::EscapeSequence::CodepointList
-        to.new(expression.codepoints)
+        content = to.new(expression.codepoints)
+        acc << (expression.i? ? content.case_insensitive : content)
       when Regexp::Expression::EscapeSequence::Base
-        to[expression.codepoint]
-      when Regexp::Expression::Group::Capture,
-           Regexp::Expression::Group::Passive,
-           Regexp::Expression::Group::Named,
-           Regexp::Expression::Group::Atomic,
-           Regexp::Expression::Group::Options
-        case expression.count
-        when 0 then to[]
-        when 1 then convert(expression.first, to)
-        else
-          raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
-        end
-      when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
-        expression.map { |subexp| convert(subexp, to) }.reduce(:+)
-      when Regexp::Expression::Alternative
-        case expression.count
-        when 0 then to[]
-        when 1 then convert(expression.first, to)
-        else
-          raise Error, 'Alternatives must contain exactly one expression'
-        end
+        content = to[expression.codepoint]
+        acc << (expression.i? ? content.case_insensitive : content)
       when Regexp::Expression::Literal
-        if expression.set_level == 0 && expression.text.size != 1
-          raise Error, 'Literal runs outside of sets are codepoint *sequences*'
-        end
-        to[expression.text.ord]
+        content = to[*expression.text.chars]
+        acc << (expression.i? ? content.case_insensitive : content)
       when Regexp::Expression::UnicodeProperty::Base,
            Regexp::Expression::PosixClass
@@ -91,14 +66,22 @@ class CharacterSet
         if expression.type == :posixclass && expression.ascii_classes?
           content = content.ascii_part
         end
-        expression.negative? ? content.inversion : content
+        acc << (expression.negative? ? content.inversion : content)
+      when Regexp::Expression::Anchor::Base,
+           Regexp::Expression::Backreference::Base,
+           Regexp::Expression::Keep::Mark,
+           Regexp::Expression::Quantifier
+        # ignore zero-length and repeat expressions
       when Regexp::Expression::Base
         raise Error, "Unsupported expression class `#{expression.class}`"
       else
-        raise Error, "Pass an expression (result of Regexp::Parser.parse)"
+        raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
       end
+      acc.reduce(:+) || to[]
     end
   end
 end

data/lib/character_set/parser.rb CHANGED Viewed

@@ -4,11 +4,15 @@ class CharacterSet
     def codepoints_from_enumerable(object)
       raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
       # Use #each to check first element (only this works for all Enumerables)
-      object.each do |e| # rubocop:disable Lint/UnreachableLoop
-        return object            if e.is_a?(Integer) && e >= 0 && e < 0x110000
-        return object.map(&:ord) if e.is_a?(String)  && e.length == 1
-        raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
+      object.each do |el| # rubocop:disable Lint/UnreachableLoop
+        if el.is_a?(Integer) && el >= 0 && el < 0x110000
+          return object
+        elsif el.is_a?(String) && el.length == 1
+          return object.to_a.join.encode('utf-8').codepoints
+        end
+        raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
       end
     end