RubyGems - character_set - Versions diffs - 1.6.0-java → 1.7.0-java - Mend

character_set 1.6.0-java → 1.7.0-java

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/BENCHMARK.md +32 -32
data/CHANGELOG.md +15 -1
data/README.md +1 -1
data/Rakefile +2 -123
data/character_set.gemspec +0 -7
data/ext/character_set/character_set.c +64 -43
data/lib/character_set/parser.rb +8 -4
data/lib/character_set/predefined_sets/assigned.cps +73 -52
data/lib/character_set/predefined_sets/emoji.cps +10 -9
data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
data/lib/character_set/ruby_fallback.rb +2 -6
data/lib/character_set/shared_methods.rb +2 -2
data/lib/character_set/version.rb +1 -1
data/tasks/benchmark.rake +20 -0
data/tasks/benchmarks/shared.rb +28 -0
data/tasks/sync_casefold_data.rake +20 -0
data/tasks/sync_predefined_sets.rake +9 -0
data/tasks/sync_ruby_spec.rake +65 -0
metadata +19 -28
data/benchmarks/shared.rb +0 -30
/data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5ccdb4b8c43fdf0e3ed297415810a92805ee61324d91c464ccc7bee2575cb14b
-  data.tar.gz: cc36d5c8d65c036981bd368577bf49eb59d2d91f5fb779cae2e42aa991c94610
+  metadata.gz: 36050dd00f44b6efc26567bfd867ff21535fe1e35c9a8018d00f2145b27bfd37
+  data.tar.gz: d88d01cae2f5650271d73c654877b6cf62cf87acba9b8e699677b569c514b0e9
 SHA512:
-  metadata.gz: 35db3bcae78967f9d74beebccb3412a08ff505ead29fb3ef2c1bef05a5ebc5815c16ea948a3c831bb92041493f12502e86c3e01b64d19bfe8baba6dfb89c88e4
-  data.tar.gz: de08614dabac51cc9c71b5ed8a96d2865cab747fcb27ca3b00b0d75efcedf7f7f0f81bdd0472c275901d09258af137ddaf5150d9fea067a79f5fd6b526aa2ac5
+  metadata.gz: 646450cc07172ffdbceaf6cf215c03a60487ced2fdf578c4467f08374b77f6a8d4e043cbb92c89ea2ebc39c5b5adf38f8d74502632e033dbc8982928d6002f99
+  data.tar.gz: 1d77ccb0abef9c591189a77ed862657a04020f6bf9b3f31b7760ced20cdbee962411e29375b7d3883e95d8c97cac992afc89b17dbdaafbe99f0af02cfa22a0e1

data/BENCHMARK.md CHANGED Viewed

@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
 ```
 Counting non-letters
-CharacterSet#count_in: 14794607.9 i/s
-        String#count:  3875939.3 i/s - 3.82x slower
+CharacterSet#count_in: 14627506.2 i/s
+        String#count:  3859777.0 i/s - 3.79x slower
 ```
 ```
 Detecting non-whitespace
- CharacterSet#cover?: 17448329.0 i/s
-       Regexp#match?: 13089358.1 i/s - 1.33x slower
+ CharacterSet#cover?: 17241902.8 i/s
+       Regexp#match?: 12971122.6 i/s - 1.33x slower
 ```
 ```
 Detecting non-letters
- CharacterSet#cover?: 17565596.9 i/s
-       Regexp#match?:  7951108.0 i/s - 2.21x slower
+ CharacterSet#cover?: 17243472.3 i/s
+       Regexp#match?:  7957626.9 i/s - 2.17x slower
 ```
 ```
 Removing ASCII whitespace
-CharacterSet#delete_in:  6306078.2 i/s
-           String#tr:  4734401.0 i/s - 1.33x slower
-         String#gsub:   211631.8 i/s - 29.80x slower
+CharacterSet#delete_in:  6190975.7 i/s
+           String#tr:  4722716.6 i/s - 1.31x slower
+         String#gsub:   214239.5 i/s - 28.90x slower
 ```
 ```
 Removing whitespace, emoji and umlauts
-CharacterSet#delete_in:  5984149.6 i/s
-           String#tr:   363643.1 i/s - 16.46x slower
-         String#gsub:   317201.7 i/s - 18.87x slower
+CharacterSet#delete_in:  5890471.8 i/s
+           String#tr:   348506.8 i/s - 16.90x slower
+         String#gsub:   318268.3 i/s - 18.51x slower
 ```
 ```
 Removing non-whitespace
-CharacterSet#keep_in:  7650925.6 i/s
-         String#gsub:   207374.6 i/s - 36.89x slower
-           String#tr:       12.3 i/s - 619745.60x slower
+CharacterSet#keep_in:  7396898.0 i/s
+         String#gsub:   208809.7 i/s - 35.42x slower
+           String#tr:       13.1 i/s - 564682.50x slower
 ```
 ```
 Keeping only emoji
-CharacterSet#keep_in:  7272940.1 i/s
-         String#gsub:   177993.8 i/s - 40.86x slower
-           String#tr:       12.3 i/s - 590222.71x slower
+CharacterSet#keep_in:  7022741.1 i/s
+         String#gsub:   180939.6 i/s - 38.81x slower
+           String#tr:       13.1 i/s - 536724.50x slower
 ```
 ```
 Extracting emoji to an Array
-   CharacterSet#scan:  2978285.0 i/s
-         String#scan:   865793.8 i/s - 3.44x slower
+   CharacterSet#scan:  3023176.8 i/s
+         String#scan:   893225.8 i/s - 3.38x slower
 ```
 ```
 Detecting whitespace
-CharacterSet#used_by?: 17292338.4 i/s
-       Regexp#match?: 11705563.9 i/s - 1.48x slower
+CharacterSet#used_by?: 17284025.9 i/s
+       Regexp#match?: 11847064.5 i/s - 1.46x slower
 ```
 ```
 Detecting emoji in a large string
-CharacterSet#used_by?:   340444.1 i/s
-       Regexp#match?:   180549.8 i/s - 1.89x slower
+CharacterSet#used_by?:   341386.1 i/s
+       Regexp#match?:   183121.6 i/s - 1.86x slower
 ```
 ```
 Adding entries
-    CharacterSet#add:  4951781.4 i/s
-       SortedSet#add:  1019637.9 i/s - 4.86x slower
+    CharacterSet#add:  4989762.3 i/s
+       SortedSet#add:  1157911.7 i/s - 4.31x slower
 ```
 ```
 Removing entries
- CharacterSet#delete:  5006337.6 i/s
-    SortedSet#delete:  3922752.2 i/s - same-ish
+ CharacterSet#delete:  4996703.6 i/s
+    SortedSet#delete:  4177401.5 i/s - same-ish
 ```
 ```
 Merging entries
-  CharacterSet#merge:      661.8 i/s
-     SortedSet#merge:        3.9 i/s - 167.82x slower
+  CharacterSet#merge:      666.7 i/s
+     SortedSet#merge:        4.0 i/s - 167.84x slower
 ```
 ```
 Getting the min and max
- CharacterSet#minmax:  1212462.2 i/s
-    SortedSet#minmax:      844.4 i/s - 1435.93x slower
+ CharacterSet#minmax:  1596470.9 i/s
+    SortedSet#minmax:      866.4 i/s - 1842.74x slower
 ```

data/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [1.7.0] - 2023-05-12
+### Added
+- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
+### Fixed
+- fixed processing of Strings that are not ASCII- or UTF8-encoded
+- removed dependency on `set` and `sorted_set`
+  - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
 ## [1.6.0] - 2022-02-16
 ### Added
@@ -63,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - reduced memory consumption by > 90% for most use cases via dynamic resizing
   - before, every set instance required 136 KB for codepoints
   - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
-- `#count_in` and `#scan_in` methods for `String` interaction
+- `#count_in` and `#scan` methods for `String` interaction
 - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
 - conversion methods `#assigned_part`, `#valid_part`
 - sectioning methods `#ascii_part`, `#plane(n)`

data/README.md CHANGED Viewed

@@ -96,7 +96,7 @@ string # => ''
 ```ruby
 CharacterSet.non_ascii.count_in('Tüür') # => 2
-CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
+CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
 ```
 There is also a core extension for String interaction.

data/Rakefile CHANGED Viewed

@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
 require 'rubygems/package_task'
 require 'rake/extensiontask'
+Dir['tasks/**/*.rake'].each { |file| load(file) }
 RSpec::Core::RakeTask.new(:spec)
 task default: :spec
@@ -34,129 +36,6 @@ end
 task package: 'java:gem'
-desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
-task :sync_ruby_spec do
-  require 'fileutils'
-  variants = {
-    'CharacterSet'       => './spec/ruby-spec/library/character_set',
-    'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
-  }
-  # download fresh specs from ruby/spec repository
-  variants.each do |_, dir|
-    FileUtils.rm_rf(dir) if File.exist?(dir)
-    `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
-  end
-  # make copies for each CharacterSet variant
-  base = variants.first[1]
-  variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
-  # adapt specs to work with CharacterSet
-  variants.each do |class_name, dir|
-    Dir["#{dir}/**/*.rb"].each do |spec|
-      # ignore some tests that do not apply or are covered otherwise
-      if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
-        File.delete(spec)
-        next
-      end
-      adapted_content =
-        File.read(spec).
-        # adapt class name
-        gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
-        gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
-        # get shared specs from a single shared dir at the parent level
-        gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
-        # make 'mspec' syntax rspec-compatible
-        gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
-        gsub(/be_(false|true)/, 'be \1').
-        gsub('stub!', 'stub').
-        gsub('mock', 'double').
-        gsub('@method', 'method').
-        # remove unneeded requires
-        gsub(/require 'set'\n/, '').
-        gsub(/require.*spec_helper.*\n/, '').
-        gsub(/\A\n+/, '').
-        # make examples use Integers/codepoints
-        gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
-        gsub('"one"', '1').
-        gsub('"two"', '2').
-        gsub('"three"', '3').
-        gsub('"four"', '4').
-        gsub('"five"', '5').
-        gsub(/x.(size|length) == 3/, 'x != 3').
-        gsub(/x.(size|length) != 3/, 'x == 3').
-        gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
-      File.open(spec, 'w') { |f| f.puts adapted_content }
-    end
-  end
-  # keep only one copy of the shared specs, at the parent level
-  FileUtils.rm_rf(base + '/../shared')
-  FileUtils.mv(base + '/shared', base + '/../')
-  variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
-end
-desc 'Download unicode casefold data and write new C header file'
-task :sync_casefold_data do
-  src_path = './CaseFolding.txt'
-  dst_path = './ext/character_set/unicode_casefold_table.h'
-  `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
-  mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
-    from, type, to = line.split(/\s*;\s*/).first(3)
-    # type 'C' stands for 'common', excludes mappings to multiple chars
-    hash[from] = to if type == 'C'
-  end.sort
-  content = File.read(dst_path + '.tmpl')
-    .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
-    .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
-  File.write(dst_path, content)
-  File.unlink(src_path)
-end
-desc 'Update codepoint data for predefined sets, based on Onigmo'
-task :sync_predefined_sets do
-  %w[assigned emoji whitespace].each do |prop|
-    require 'regexp_property_values'
-    ranges = RegexpPropertyValues[prop].matched_ranges
-    str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
-    File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
-  end
-end
-desc 'Run all IPS benchmarks'
-task :benchmark do
-  Dir['./benchmarks/*.rb'].sort.each { |file| require file }
-end
-namespace :benchmark do
-  desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
-  task :write_to_file do
-    $store_comparison_results = {}
-    Rake.application[:benchmark].invoke
-    File.open('BENCHMARK.md', 'w') do |f|
-      f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
-      $store_comparison_results.each do |caption, result|
-        f.puts '```',
-               caption,
-               '',
-               result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
-               '```'
-      end
-    end
-  end
-end
 unless RUBY_PLATFORM =~ /java/
   # recompile before benchmarking or running specs
   task(:benchmark).enhance([:compile])

data/character_set.gemspec CHANGED Viewed

@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
   s.extensions  = %w[ext/character_set/extconf.rb]
   s.required_ruby_version = '>= 2.1.0'
-  # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
-  # This dependency is only used if the C extension is unavailable.
-  # JRuby has it in the stdlib.
-  if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
-    s.add_dependency 'sorted_set', '~> 1.0'
-  end
 end

data/ext/character_set/character_set.c CHANGED Viewed

@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
   cps = data->cps;
   len = data->len;
   cp = FIX2ULONG(cp_num);
-  if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
+  if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
   {
     return Qnil;
   }
+  if (on)
+  {
+    set_cp(data, cp);
+  }
   else
   {
-    if (on)
-    {
-      set_cp(data, cp);
-    }
-    else
-    {
-      clr_cp(cps, len, cp);
-    }
-    return cs;
+    clr_cp(cps, len, cp);
   }
+  return cs;
 }
 static VALUE
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
   {
     return cs_merge_cs(self, other);
   }
-  else if (TYPE(other) == T_ARRAY)
+  if (TYPE(other) == T_ARRAY)
   {
     return cs_merge_rb_array(self, other);
   }
@@ -917,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
   return new_cs;
 }
-typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
+typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
 static inline int
-add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   set_cp(data, str_cp);
   return 1;
@@ -967,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
 }
 static inline VALUE
-each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   long i, str_len;
   unsigned int str_cp;
@@ -986,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
 }
 static inline VALUE
-each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   int n;
   unsigned int str_cp;
   const char *ptr, *end;
-  rb_encoding *enc;
+  rb_encoding *utf8;
+  utf8 = rb_utf8_encoding();
+  if (rb_enc_get(str) == utf8)
+  {
+    str = rb_str_new_frozen(str);
+  }
+  else
+  {
+    str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
+  }
-  str = rb_str_new_frozen(str);
   ptr = RSTRING_PTR(str);
   end = RSTRING_END(str);
-  enc = rb_enc_get(str);
   while (ptr < end)
   {
-    str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
+    str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
     if (!(*func)(str_cp, cp_arr, len, data, memo))
     {
       return Qfalse;
@@ -1031,12 +1037,13 @@ single_byte_optimizable(VALUE str)
 }
 static inline VALUE
-each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (single_byte_optimizable(str))
   {
     return each_sb_cp(str, func, cp_arr, len, data, memo);
   }
   return each_mb_cp(str, func, cp_arr, len, data, memo);
 }
@@ -1062,11 +1069,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
 }
 static inline int
-count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (tst_cp(cp_arr, len, str_cp))
   {
-    *memo += 1;
+    *((VALUE *)memo) += 1;
   }
   return 1;
 }
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
 static VALUE
 cs_method_count_in(VALUE self, VALUE str)
 {
-  VALUE count;
+  long count;
   struct cs_data *data;
   raise_arg_err_unless_string(str);
   data = cs_fetch_data(self);
   count = 0;
-  each_cp(str, count_str_cp, data->cps, data->len, data, &count);
-  return INT2NUM((int)count);
+  each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
+  return LONG2FIX(count);
 }
 static inline int
-str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   return tst_cp(cp_arr, len, str_cp);
 }
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
 }
 static inline int
-add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   if (tst_cp(cp_arr, len, str_cp))
   {
-    rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
+    rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
   }
   return 1;
 }
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
 static VALUE
 cs_method_scan(VALUE self, VALUE str)
 {
-  VALUE memo[2];
+  VALUE memo;
   struct cs_data *data;
   raise_arg_err_unless_string(str);
   data = cs_fetch_data(self);
-  memo[0] = rb_ary_new();
-  memo[1] = (VALUE)rb_enc_get(str);
+  memo = rb_ary_new();
   each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
-  return memo[0];
+  return memo;
 }
 static inline int
-str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
+str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
 {
   return !tst_cp(cp_arr, len, str_cp);
 }
@@ -1146,9 +1152,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
   cs_cp cs_len;
   VALUE orig_str_len;
-  rb_encoding *enc;
+  rb_encoding *orig_enc, *utf8;
   char *s, *send, *t;
-  int ascompat, cr;
+  int orig_was_utf8, cr;
   raise_arg_err_unless_string(str);
@@ -1159,24 +1165,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     return bang ? Qnil : str;
   }
-  if (!bang)
+  orig_enc = rb_enc_get(str);
+  utf8 = rb_utf8_encoding();
+  orig_was_utf8 = orig_enc == utf8;
+  if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
+  {
+    str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
+  }
+  else
   {
-    str = rb_str_dup(str);
+    if (!bang)
+    {
+      str = rb_str_dup(str);
+    }
   }
   cps = cs_fetch_cps(set, &cs_len);
   rb_str_modify(str);
-  enc = rb_enc_get(str);
-  ascompat = rb_enc_asciicompat(enc);
   s = t = RSTRING_PTR(str);
   send = RSTRING_END(str);
-  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+  cr = ENC_CODERANGE_7BIT;
   while (s < send)
   {
     unsigned int c;
     int clen;
-    if (ascompat && (c = *(unsigned char *)s) < 0x80)
+    if ((c = *(unsigned char *)s) < 0x80)
     {
       if (tst_cp(cps, cs_len, c) != delete)
       {
@@ -1188,12 +1204,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     }
     else
     {
-      c = rb_enc_codepoint_len(s, send, &clen, enc);
+      c = rb_enc_codepoint_len(s, send, &clen, utf8);
       if (tst_cp(cps, cs_len, c) != delete)
       {
         if (t != s)
-          rb_enc_mbcput(c, t, enc);
+          rb_enc_mbcput(c, t, utf8);
         t += clen;
         if (cr == ENC_CODERANGE_7BIT)
           cr = ENC_CODERANGE_VALID;
@@ -1210,6 +1226,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
     return Qnil;
   }
+  if (!orig_was_utf8)
+  {
+    return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
+  }
   return str;
 }

data/lib/character_set/parser.rb CHANGED Viewed

@@ -4,11 +4,15 @@ class CharacterSet
     def codepoints_from_enumerable(object)
       raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
       # Use #each to check first element (only this works for all Enumerables)
-      object.each do |e| # rubocop:disable Lint/UnreachableLoop
-        return object            if e.is_a?(Integer) && e >= 0 && e < 0x110000
-        return object.map(&:ord) if e.is_a?(String)  && e.length == 1
-        raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
+      object.each do |el| # rubocop:disable Lint/UnreachableLoop
+        if el.is_a?(Integer) && el >= 0 && el < 0x110000
+          return object
+        elsif el.is_a?(String) && el.length == 1
+          return object.to_a.join.encode('utf-8').codepoints
+        end
+        raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
       end
     end