character_set 1.6.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +20 -29
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
4
- data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
3
+ metadata.gz: 778cea0208adb290e09f454e3f88c021531decede99604cf2b67f48c9ab3bcd8
4
+ data.tar.gz: 43672f0afce2846bec846791e7445c9614605194967e061b1d9f0be305298be4
5
5
  SHA512:
6
- metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
7
- data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
6
+ metadata.gz: 635f9fb21c973b03b9a0556f2b6cf2c608753acb73616aa8681a5bada3418f955ec887164a03fd2add4edae8a60292eb6e4b681d11a8cf33d1083499afe83815
7
+ data.tar.gz: 6a80f4f7f3f6c2357d84dc71bb5086f273471a8ae5af4e09abc502dc9893da90962d49527b40564de664af0a910f2e99720062dd41ba53427882ecb36e1a40a0
data/BENCHMARK.md CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 14794607.9 i/s
7
- String#count: 3875939.3 i/s - 3.82x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 17448329.0 i/s
13
- Regexp#match?: 13089358.1 i/s - 1.33x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 17565596.9 i/s
19
- Regexp#match?: 7951108.0 i/s - 2.21x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
22
  Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 6306078.2 i/s
25
- String#tr: 4734401.0 i/s - 1.33x slower
26
- String#gsub: 211631.8 i/s - 29.80x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
27
27
  ```
28
28
  ```
29
29
  Removing whitespace, emoji and umlauts
30
30
 
31
- CharacterSet#delete_in: 5984149.6 i/s
32
- String#tr: 363643.1 i/s - 16.46x slower
33
- String#gsub: 317201.7 i/s - 18.87x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
34
34
  ```
35
35
  ```
36
36
  Removing non-whitespace
37
37
 
38
- CharacterSet#keep_in: 7650925.6 i/s
39
- String#gsub: 207374.6 i/s - 36.89x slower
40
- String#tr: 12.3 i/s - 619745.60x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
41
41
  ```
42
42
  ```
43
43
  Keeping only emoji
44
44
 
45
- CharacterSet#keep_in: 7272940.1 i/s
46
- String#gsub: 177993.8 i/s - 40.86x slower
47
- String#tr: 12.3 i/s - 590222.71x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
48
48
  ```
49
49
  ```
50
50
  Extracting emoji to an Array
51
51
 
52
- CharacterSet#scan: 2978285.0 i/s
53
- String#scan: 865793.8 i/s - 3.44x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
54
54
  ```
55
55
  ```
56
56
  Detecting whitespace
57
57
 
58
- CharacterSet#used_by?: 17292338.4 i/s
59
- Regexp#match?: 11705563.9 i/s - 1.48x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
60
60
  ```
61
61
  ```
62
62
  Detecting emoji in a large string
63
63
 
64
- CharacterSet#used_by?: 340444.1 i/s
65
- Regexp#match?: 180549.8 i/s - 1.89x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
66
66
  ```
67
67
  ```
68
68
  Adding entries
69
69
 
70
- CharacterSet#add: 4951781.4 i/s
71
- SortedSet#add: 1019637.9 i/s - 4.86x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
72
72
  ```
73
73
  ```
74
74
  Removing entries
75
75
 
76
- CharacterSet#delete: 5006337.6 i/s
77
- SortedSet#delete: 3922752.2 i/s - same-ish
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
78
78
  ```
79
79
  ```
80
80
  Merging entries
81
81
 
82
- CharacterSet#merge: 661.8 i/s
83
- SortedSet#merge: 3.9 i/s - 167.82x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
84
84
  ```
85
85
  ```
86
86
  Getting the min and max
87
87
 
88
- CharacterSet#minmax: 1212462.2 i/s
89
- SortedSet#minmax: 844.4 i/s - 1435.93x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
90
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.7.0] - 2023-05-12
10
+
11
+ ### Added
12
+
13
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
14
+
15
+ ### Fixed
16
+
17
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
18
+ - removed dependency on `set` and `sorted_set`
19
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
20
+
7
21
  ## [1.6.0] - 2022-02-16
8
22
 
9
23
  ### Added
@@ -63,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
63
77
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
64
78
  - before, every set instance required 136 KB for codepoints
65
79
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
66
- - `#count_in` and `#scan_in` methods for `String` interaction
80
+ - `#count_in` and `#scan` methods for `String` interaction
67
81
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
68
82
  - conversion methods `#assigned_part`, `#valid_part`
69
83
  - sectioning methods `#ascii_part`, `#plane(n)`
data/README.md CHANGED
@@ -96,7 +96,7 @@ string # => ''
96
96
 
97
97
  ```ruby
98
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
99
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
100
100
  ```
101
101
 
102
102
  There is also a core extension for String interaction.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,129 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```',
151
- caption,
152
- '',
153
- result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
- '```'
155
- end
156
- end
157
- end
158
- end
159
-
160
39
  unless RUBY_PLATFORM =~ /java/
161
40
  # recompile before benchmarking or running specs
162
41
  task(:benchmark).enhance([:compile])
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
- # This dependency is only used if the C extension is unavailable.
27
- # JRuby has it in the stdlib.
28
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
- s.add_dependency 'sorted_set', '~> 1.0'
30
- end
31
24
  end
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
376
376
  cps = data->cps;
377
377
  len = data->len;
378
378
  cp = FIX2ULONG(cp_num);
379
- if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
379
+ if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
380
380
  {
381
381
  return Qnil;
382
382
  }
383
+
384
+ if (on)
385
+ {
386
+ set_cp(data, cp);
387
+ }
383
388
  else
384
389
  {
385
- if (on)
386
- {
387
- set_cp(data, cp);
388
- }
389
- else
390
- {
391
- clr_cp(cps, len, cp);
392
- }
393
- return cs;
390
+ clr_cp(cps, len, cp);
394
391
  }
392
+ return cs;
395
393
  }
396
394
 
397
395
  static VALUE
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
575
573
  {
576
574
  return cs_merge_cs(self, other);
577
575
  }
578
- else if (TYPE(other) == T_ARRAY)
576
+ if (TYPE(other) == T_ARRAY)
579
577
  {
580
578
  return cs_merge_rb_array(self, other);
581
579
  }
@@ -917,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
917
915
  return new_cs;
918
916
  }
919
917
 
920
- typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
918
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
921
919
 
922
920
  static inline int
923
- add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
921
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
924
922
  {
925
923
  set_cp(data, str_cp);
926
924
  return 1;
@@ -967,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
967
965
  }
968
966
 
969
967
  static inline VALUE
970
- each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
968
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
971
969
  {
972
970
  long i, str_len;
973
971
  unsigned int str_cp;
@@ -986,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
986
984
  }
987
985
 
988
986
  static inline VALUE
989
- each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
987
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
990
988
  {
991
989
  int n;
992
990
  unsigned int str_cp;
993
991
  const char *ptr, *end;
994
- rb_encoding *enc;
992
+ rb_encoding *utf8;
993
+
994
+ utf8 = rb_utf8_encoding();
995
+ if (rb_enc_get(str) == utf8)
996
+ {
997
+ str = rb_str_new_frozen(str);
998
+ }
999
+ else
1000
+ {
1001
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1002
+ }
995
1003
 
996
- str = rb_str_new_frozen(str);
997
1004
  ptr = RSTRING_PTR(str);
998
1005
  end = RSTRING_END(str);
999
- enc = rb_enc_get(str);
1000
1006
 
1001
1007
  while (ptr < end)
1002
1008
  {
1003
- str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
1009
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
1004
1010
  if (!(*func)(str_cp, cp_arr, len, data, memo))
1005
1011
  {
1006
1012
  return Qfalse;
@@ -1031,12 +1037,13 @@ single_byte_optimizable(VALUE str)
1031
1037
  }
1032
1038
 
1033
1039
  static inline VALUE
1034
- each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1040
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1035
1041
  {
1036
1042
  if (single_byte_optimizable(str))
1037
1043
  {
1038
1044
  return each_sb_cp(str, func, cp_arr, len, data, memo);
1039
1045
  }
1046
+
1040
1047
  return each_mb_cp(str, func, cp_arr, len, data, memo);
1041
1048
  }
1042
1049
 
@@ -1062,11 +1069,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
1062
1069
  }
1063
1070
 
1064
1071
  static inline int
1065
- count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1072
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1066
1073
  {
1067
1074
  if (tst_cp(cp_arr, len, str_cp))
1068
1075
  {
1069
- *memo += 1;
1076
+ *((VALUE *)memo) += 1;
1070
1077
  }
1071
1078
  return 1;
1072
1079
  }
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
1074
1081
  static VALUE
1075
1082
  cs_method_count_in(VALUE self, VALUE str)
1076
1083
  {
1077
- VALUE count;
1084
+ long count;
1078
1085
  struct cs_data *data;
1079
1086
  raise_arg_err_unless_string(str);
1080
1087
  data = cs_fetch_data(self);
1081
1088
  count = 0;
1082
- each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1083
- return INT2NUM((int)count);
1089
+ each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
1090
+ return LONG2FIX(count);
1084
1091
  }
1085
1092
 
1086
1093
  static inline int
1087
- str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1094
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1088
1095
  {
1089
1096
  return tst_cp(cp_arr, len, str_cp);
1090
1097
  }
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
1099
1106
  }
1100
1107
 
1101
1108
  static inline int
1102
- add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1109
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1103
1110
  {
1104
1111
  if (tst_cp(cp_arr, len, str_cp))
1105
1112
  {
1106
- rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1113
+ rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
1107
1114
  }
1108
1115
  return 1;
1109
1116
  }
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
1111
1118
  static VALUE
1112
1119
  cs_method_scan(VALUE self, VALUE str)
1113
1120
  {
1114
- VALUE memo[2];
1121
+ VALUE memo;
1115
1122
  struct cs_data *data;
1116
1123
  raise_arg_err_unless_string(str);
1117
1124
  data = cs_fetch_data(self);
1118
- memo[0] = rb_ary_new();
1119
- memo[1] = (VALUE)rb_enc_get(str);
1125
+ memo = rb_ary_new();
1120
1126
  each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1121
- return memo[0];
1127
+ return memo;
1122
1128
  }
1123
1129
 
1124
1130
  static inline int
1125
- str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1131
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1126
1132
  {
1127
1133
  return !tst_cp(cp_arr, len, str_cp);
1128
1134
  }
@@ -1146,9 +1152,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1146
1152
  cs_cp cs_len;
1147
1153
  VALUE orig_str_len;
1148
1154
 
1149
- rb_encoding *enc;
1155
+ rb_encoding *orig_enc, *utf8;
1150
1156
  char *s, *send, *t;
1151
- int ascompat, cr;
1157
+ int orig_was_utf8, cr;
1152
1158
 
1153
1159
  raise_arg_err_unless_string(str);
1154
1160
 
@@ -1159,24 +1165,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1159
1165
  return bang ? Qnil : str;
1160
1166
  }
1161
1167
 
1162
- if (!bang)
1168
+ orig_enc = rb_enc_get(str);
1169
+ utf8 = rb_utf8_encoding();
1170
+ orig_was_utf8 = orig_enc == utf8;
1171
+
1172
+ if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
1173
+ {
1174
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1175
+ }
1176
+ else
1163
1177
  {
1164
- str = rb_str_dup(str);
1178
+ if (!bang)
1179
+ {
1180
+ str = rb_str_dup(str);
1181
+ }
1165
1182
  }
1166
1183
 
1167
1184
  cps = cs_fetch_cps(set, &cs_len);
1168
1185
  rb_str_modify(str);
1169
- enc = rb_enc_get(str);
1170
- ascompat = rb_enc_asciicompat(enc);
1171
1186
  s = t = RSTRING_PTR(str);
1172
1187
  send = RSTRING_END(str);
1173
- cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1188
+ cr = ENC_CODERANGE_7BIT;
1189
+
1174
1190
  while (s < send)
1175
1191
  {
1176
1192
  unsigned int c;
1177
1193
  int clen;
1178
1194
 
1179
- if (ascompat && (c = *(unsigned char *)s) < 0x80)
1195
+ if ((c = *(unsigned char *)s) < 0x80)
1180
1196
  {
1181
1197
  if (tst_cp(cps, cs_len, c) != delete)
1182
1198
  {
@@ -1188,12 +1204,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1188
1204
  }
1189
1205
  else
1190
1206
  {
1191
- c = rb_enc_codepoint_len(s, send, &clen, enc);
1207
+ c = rb_enc_codepoint_len(s, send, &clen, utf8);
1192
1208
 
1193
1209
  if (tst_cp(cps, cs_len, c) != delete)
1194
1210
  {
1195
1211
  if (t != s)
1196
- rb_enc_mbcput(c, t, enc);
1212
+ rb_enc_mbcput(c, t, utf8);
1197
1213
  t += clen;
1198
1214
  if (cr == ENC_CODERANGE_7BIT)
1199
1215
  cr = ENC_CODERANGE_VALID;
@@ -1210,6 +1226,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1210
1226
  return Qnil;
1211
1227
  }
1212
1228
 
1229
+ if (!orig_was_utf8)
1230
+ {
1231
+ return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
1232
+ }
1233
+
1213
1234
  return str;
1214
1235
  }
1215
1236
 
@@ -4,11 +4,15 @@ class CharacterSet
4
4
 
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
+
7
8
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
- return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
- return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
- raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
9
+ object.each do |el| # rubocop:disable Lint/UnreachableLoop
10
+ if el.is_a?(Integer) && el >= 0 && el < 0x110000
11
+ return object
12
+ elsif el.is_a?(String) && el.length == 1
13
+ return object.to_a.join.encode('utf-8').codepoints
14
+ end
15
+ raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
12
16
  end
13
17
  end
14
18