character_set 1.6.0-java → 1.7.0-java

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +19 -28
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ccdb4b8c43fdf0e3ed297415810a92805ee61324d91c464ccc7bee2575cb14b
4
- data.tar.gz: cc36d5c8d65c036981bd368577bf49eb59d2d91f5fb779cae2e42aa991c94610
3
+ metadata.gz: 36050dd00f44b6efc26567bfd867ff21535fe1e35c9a8018d00f2145b27bfd37
4
+ data.tar.gz: d88d01cae2f5650271d73c654877b6cf62cf87acba9b8e699677b569c514b0e9
5
5
  SHA512:
6
- metadata.gz: 35db3bcae78967f9d74beebccb3412a08ff505ead29fb3ef2c1bef05a5ebc5815c16ea948a3c831bb92041493f12502e86c3e01b64d19bfe8baba6dfb89c88e4
7
- data.tar.gz: de08614dabac51cc9c71b5ed8a96d2865cab747fcb27ca3b00b0d75efcedf7f7f0f81bdd0472c275901d09258af137ddaf5150d9fea067a79f5fd6b526aa2ac5
6
+ metadata.gz: 646450cc07172ffdbceaf6cf215c03a60487ced2fdf578c4467f08374b77f6a8d4e043cbb92c89ea2ebc39c5b5adf38f8d74502632e033dbc8982928d6002f99
7
+ data.tar.gz: 1d77ccb0abef9c591189a77ed862657a04020f6bf9b3f31b7760ced20cdbee962411e29375b7d3883e95d8c97cac992afc89b17dbdaafbe99f0af02cfa22a0e1
data/BENCHMARK.md CHANGED
@@ -3,88 +3,88 @@ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 14794607.9 i/s
7
- String#count: 3875939.3 i/s - 3.82x slower
6
+ CharacterSet#count_in: 14627506.2 i/s
7
+ String#count: 3859777.0 i/s - 3.79x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 17448329.0 i/s
13
- Regexp#match?: 13089358.1 i/s - 1.33x slower
12
+ CharacterSet#cover?: 17241902.8 i/s
13
+ Regexp#match?: 12971122.6 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 17565596.9 i/s
19
- Regexp#match?: 7951108.0 i/s - 2.21x slower
18
+ CharacterSet#cover?: 17243472.3 i/s
19
+ Regexp#match?: 7957626.9 i/s - 2.17x slower
20
20
  ```
21
21
  ```
22
22
  Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 6306078.2 i/s
25
- String#tr: 4734401.0 i/s - 1.33x slower
26
- String#gsub: 211631.8 i/s - 29.80x slower
24
+ CharacterSet#delete_in: 6190975.7 i/s
25
+ String#tr: 4722716.6 i/s - 1.31x slower
26
+ String#gsub: 214239.5 i/s - 28.90x slower
27
27
  ```
28
28
  ```
29
29
  Removing whitespace, emoji and umlauts
30
30
 
31
- CharacterSet#delete_in: 5984149.6 i/s
32
- String#tr: 363643.1 i/s - 16.46x slower
33
- String#gsub: 317201.7 i/s - 18.87x slower
31
+ CharacterSet#delete_in: 5890471.8 i/s
32
+ String#tr: 348506.8 i/s - 16.90x slower
33
+ String#gsub: 318268.3 i/s - 18.51x slower
34
34
  ```
35
35
  ```
36
36
  Removing non-whitespace
37
37
 
38
- CharacterSet#keep_in: 7650925.6 i/s
39
- String#gsub: 207374.6 i/s - 36.89x slower
40
- String#tr: 12.3 i/s - 619745.60x slower
38
+ CharacterSet#keep_in: 7396898.0 i/s
39
+ String#gsub: 208809.7 i/s - 35.42x slower
40
+ String#tr: 13.1 i/s - 564682.50x slower
41
41
  ```
42
42
  ```
43
43
  Keeping only emoji
44
44
 
45
- CharacterSet#keep_in: 7272940.1 i/s
46
- String#gsub: 177993.8 i/s - 40.86x slower
47
- String#tr: 12.3 i/s - 590222.71x slower
45
+ CharacterSet#keep_in: 7022741.1 i/s
46
+ String#gsub: 180939.6 i/s - 38.81x slower
47
+ String#tr: 13.1 i/s - 536724.50x slower
48
48
  ```
49
49
  ```
50
50
  Extracting emoji to an Array
51
51
 
52
- CharacterSet#scan: 2978285.0 i/s
53
- String#scan: 865793.8 i/s - 3.44x slower
52
+ CharacterSet#scan: 3023176.8 i/s
53
+ String#scan: 893225.8 i/s - 3.38x slower
54
54
  ```
55
55
  ```
56
56
  Detecting whitespace
57
57
 
58
- CharacterSet#used_by?: 17292338.4 i/s
59
- Regexp#match?: 11705563.9 i/s - 1.48x slower
58
+ CharacterSet#used_by?: 17284025.9 i/s
59
+ Regexp#match?: 11847064.5 i/s - 1.46x slower
60
60
  ```
61
61
  ```
62
62
  Detecting emoji in a large string
63
63
 
64
- CharacterSet#used_by?: 340444.1 i/s
65
- Regexp#match?: 180549.8 i/s - 1.89x slower
64
+ CharacterSet#used_by?: 341386.1 i/s
65
+ Regexp#match?: 183121.6 i/s - 1.86x slower
66
66
  ```
67
67
  ```
68
68
  Adding entries
69
69
 
70
- CharacterSet#add: 4951781.4 i/s
71
- SortedSet#add: 1019637.9 i/s - 4.86x slower
70
+ CharacterSet#add: 4989762.3 i/s
71
+ SortedSet#add: 1157911.7 i/s - 4.31x slower
72
72
  ```
73
73
  ```
74
74
  Removing entries
75
75
 
76
- CharacterSet#delete: 5006337.6 i/s
77
- SortedSet#delete: 3922752.2 i/s - same-ish
76
+ CharacterSet#delete: 4996703.6 i/s
77
+ SortedSet#delete: 4177401.5 i/s - same-ish
78
78
  ```
79
79
  ```
80
80
  Merging entries
81
81
 
82
- CharacterSet#merge: 661.8 i/s
83
- SortedSet#merge: 3.9 i/s - 167.82x slower
82
+ CharacterSet#merge: 666.7 i/s
83
+ SortedSet#merge: 4.0 i/s - 167.84x slower
84
84
  ```
85
85
  ```
86
86
  Getting the min and max
87
87
 
88
- CharacterSet#minmax: 1212462.2 i/s
89
- SortedSet#minmax: 844.4 i/s - 1435.93x slower
88
+ CharacterSet#minmax: 1596470.9 i/s
89
+ SortedSet#minmax: 866.4 i/s - 1842.74x slower
90
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [Unreleased]
8
+
9
+ ## [1.7.0] - 2023-05-12
10
+
11
+ ### Added
12
+
13
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
14
+
15
+ ### Fixed
16
+
17
+ - fixed processing of Strings that are not ASCII- or UTF8-encoded
18
+ - removed dependency on `set` and `sorted_set`
19
+ - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
20
+
7
21
  ## [1.6.0] - 2022-02-16
8
22
 
9
23
  ### Added
@@ -63,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
63
77
  - reduced memory consumption by > 90% for most use cases via dynamic resizing
64
78
  - before, every set instance required 136 KB for codepoints
65
79
  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
66
- - `#count_in` and `#scan_in` methods for `String` interaction
80
+ - `#count_in` and `#scan` methods for `String` interaction
67
81
  - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
68
82
  - conversion methods `#assigned_part`, `#valid_part`
69
83
  - sectioning methods `#ascii_part`, `#plane(n)`
data/README.md CHANGED
@@ -96,7 +96,7 @@ string # => ''
96
96
 
97
97
  ```ruby
98
98
  CharacterSet.non_ascii.count_in('Tüür') # => 2
99
- CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
99
+ CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
100
100
  ```
101
101
 
102
102
  There is also a core extension for String interaction.
data/Rakefile CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
3
3
  require 'rubygems/package_task'
4
4
  require 'rake/extensiontask'
5
5
 
6
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
7
+
6
8
  RSpec::Core::RakeTask.new(:spec)
7
9
 
8
10
  task default: :spec
@@ -34,129 +36,6 @@ end
34
36
 
35
37
  task package: 'java:gem'
36
38
 
37
- desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
38
- task :sync_ruby_spec do
39
- require 'fileutils'
40
-
41
- variants = {
42
- 'CharacterSet' => './spec/ruby-spec/library/character_set',
43
- 'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
44
- }
45
-
46
- # download fresh specs from ruby/spec repository
47
- variants.each do |_, dir|
48
- FileUtils.rm_rf(dir) if File.exist?(dir)
49
- `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
50
- end
51
-
52
- # make copies for each CharacterSet variant
53
- base = variants.first[1]
54
- variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
55
-
56
- # adapt specs to work with CharacterSet
57
- variants.each do |class_name, dir|
58
- Dir["#{dir}/**/*.rb"].each do |spec|
59
- # ignore some tests that do not apply or are covered otherwise
60
- if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
61
- File.delete(spec)
62
- next
63
- end
64
-
65
- adapted_content =
66
- File.read(spec).
67
- # adapt class name
68
- gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
69
- gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
70
- # get shared specs from a single shared dir at the parent level
71
- gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
72
- # make 'mspec' syntax rspec-compatible
73
- gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
74
- gsub(/be_(false|true)/, 'be \1').
75
- gsub('stub!', 'stub').
76
- gsub('mock', 'double').
77
- gsub('@method', 'method').
78
- # remove unneeded requires
79
- gsub(/require 'set'\n/, '').
80
- gsub(/require.*spec_helper.*\n/, '').
81
- gsub(/\A\n+/, '').
82
- # make examples use Integers/codepoints
83
- gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
84
- gsub('"one"', '1').
85
- gsub('"two"', '2').
86
- gsub('"three"', '3').
87
- gsub('"four"', '4').
88
- gsub('"five"', '5').
89
- gsub(/x.(size|length) == 3/, 'x != 3').
90
- gsub(/x.(size|length) != 3/, 'x == 3').
91
- gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
92
-
93
- File.open(spec, 'w') { |f| f.puts adapted_content }
94
- end
95
- end
96
-
97
- # keep only one copy of the shared specs, at the parent level
98
- FileUtils.rm_rf(base + '/../shared')
99
- FileUtils.mv(base + '/shared', base + '/../')
100
- variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
101
- end
102
-
103
- desc 'Download unicode casefold data and write new C header file'
104
- task :sync_casefold_data do
105
- src_path = './CaseFolding.txt'
106
- dst_path = './ext/character_set/unicode_casefold_table.h'
107
-
108
- `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
109
-
110
- mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
111
- from, type, to = line.split(/\s*;\s*/).first(3)
112
- # type 'C' stands for 'common', excludes mappings to multiple chars
113
- hash[from] = to if type == 'C'
114
- end.sort
115
-
116
- content = File.read(dst_path + '.tmpl')
117
- .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
- .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
119
-
120
- File.write(dst_path, content)
121
- File.unlink(src_path)
122
- end
123
-
124
- desc 'Update codepoint data for predefined sets, based on Onigmo'
125
- task :sync_predefined_sets do
126
- %w[assigned emoji whitespace].each do |prop|
127
- require 'regexp_property_values'
128
- ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
- File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
- end
132
- end
133
-
134
- desc 'Run all IPS benchmarks'
135
- task :benchmark do
136
- Dir['./benchmarks/*.rb'].sort.each { |file| require file }
137
- end
138
-
139
- namespace :benchmark do
140
- desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
141
- task :write_to_file do
142
- $store_comparison_results = {}
143
-
144
- Rake.application[:benchmark].invoke
145
-
146
- File.open('BENCHMARK.md', 'w') do |f|
147
- f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
-
149
- $store_comparison_results.each do |caption, result|
150
- f.puts '```',
151
- caption,
152
- '',
153
- result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
- '```'
155
- end
156
- end
157
- end
158
- end
159
-
160
39
  unless RUBY_PLATFORM =~ /java/
161
40
  # recompile before benchmarking or running specs
162
41
  task(:benchmark).enhance([:compile])
@@ -21,11 +21,4 @@ Gem::Specification.new do |s|
21
21
  s.extensions = %w[ext/character_set/extconf.rb]
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
-
25
- # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
- # This dependency is only used if the C extension is unavailable.
27
- # JRuby has it in the stdlib.
28
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
- s.add_dependency 'sorted_set', '~> 1.0'
30
- end
31
24
  end
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
376
376
  cps = data->cps;
377
377
  len = data->len;
378
378
  cp = FIX2ULONG(cp_num);
379
- if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
379
+ if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
380
380
  {
381
381
  return Qnil;
382
382
  }
383
+
384
+ if (on)
385
+ {
386
+ set_cp(data, cp);
387
+ }
383
388
  else
384
389
  {
385
- if (on)
386
- {
387
- set_cp(data, cp);
388
- }
389
- else
390
- {
391
- clr_cp(cps, len, cp);
392
- }
393
- return cs;
390
+ clr_cp(cps, len, cp);
394
391
  }
392
+ return cs;
395
393
  }
396
394
 
397
395
  static VALUE
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
575
573
  {
576
574
  return cs_merge_cs(self, other);
577
575
  }
578
- else if (TYPE(other) == T_ARRAY)
576
+ if (TYPE(other) == T_ARRAY)
579
577
  {
580
578
  return cs_merge_rb_array(self, other);
581
579
  }
@@ -917,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
917
915
  return new_cs;
918
916
  }
919
917
 
920
- typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
918
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
921
919
 
922
920
  static inline int
923
- add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
921
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
924
922
  {
925
923
  set_cp(data, str_cp);
926
924
  return 1;
@@ -967,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
967
965
  }
968
966
 
969
967
  static inline VALUE
970
- each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
968
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
971
969
  {
972
970
  long i, str_len;
973
971
  unsigned int str_cp;
@@ -986,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
986
984
  }
987
985
 
988
986
  static inline VALUE
989
- each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
987
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
990
988
  {
991
989
  int n;
992
990
  unsigned int str_cp;
993
991
  const char *ptr, *end;
994
- rb_encoding *enc;
992
+ rb_encoding *utf8;
993
+
994
+ utf8 = rb_utf8_encoding();
995
+ if (rb_enc_get(str) == utf8)
996
+ {
997
+ str = rb_str_new_frozen(str);
998
+ }
999
+ else
1000
+ {
1001
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1002
+ }
995
1003
 
996
- str = rb_str_new_frozen(str);
997
1004
  ptr = RSTRING_PTR(str);
998
1005
  end = RSTRING_END(str);
999
- enc = rb_enc_get(str);
1000
1006
 
1001
1007
  while (ptr < end)
1002
1008
  {
1003
- str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
1009
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
1004
1010
  if (!(*func)(str_cp, cp_arr, len, data, memo))
1005
1011
  {
1006
1012
  return Qfalse;
@@ -1031,12 +1037,13 @@ single_byte_optimizable(VALUE str)
1031
1037
  }
1032
1038
 
1033
1039
  static inline VALUE
1034
- each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1040
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1035
1041
  {
1036
1042
  if (single_byte_optimizable(str))
1037
1043
  {
1038
1044
  return each_sb_cp(str, func, cp_arr, len, data, memo);
1039
1045
  }
1046
+
1040
1047
  return each_mb_cp(str, func, cp_arr, len, data, memo);
1041
1048
  }
1042
1049
 
@@ -1062,11 +1069,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
1062
1069
  }
1063
1070
 
1064
1071
  static inline int
1065
- count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1072
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1066
1073
  {
1067
1074
  if (tst_cp(cp_arr, len, str_cp))
1068
1075
  {
1069
- *memo += 1;
1076
+ *((VALUE *)memo) += 1;
1070
1077
  }
1071
1078
  return 1;
1072
1079
  }
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
1074
1081
  static VALUE
1075
1082
  cs_method_count_in(VALUE self, VALUE str)
1076
1083
  {
1077
- VALUE count;
1084
+ long count;
1078
1085
  struct cs_data *data;
1079
1086
  raise_arg_err_unless_string(str);
1080
1087
  data = cs_fetch_data(self);
1081
1088
  count = 0;
1082
- each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1083
- return INT2NUM((int)count);
1089
+ each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
1090
+ return LONG2FIX(count);
1084
1091
  }
1085
1092
 
1086
1093
  static inline int
1087
- str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1094
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1088
1095
  {
1089
1096
  return tst_cp(cp_arr, len, str_cp);
1090
1097
  }
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
1099
1106
  }
1100
1107
 
1101
1108
  static inline int
1102
- add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1109
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1103
1110
  {
1104
1111
  if (tst_cp(cp_arr, len, str_cp))
1105
1112
  {
1106
- rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1113
+ rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
1107
1114
  }
1108
1115
  return 1;
1109
1116
  }
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
1111
1118
  static VALUE
1112
1119
  cs_method_scan(VALUE self, VALUE str)
1113
1120
  {
1114
- VALUE memo[2];
1121
+ VALUE memo;
1115
1122
  struct cs_data *data;
1116
1123
  raise_arg_err_unless_string(str);
1117
1124
  data = cs_fetch_data(self);
1118
- memo[0] = rb_ary_new();
1119
- memo[1] = (VALUE)rb_enc_get(str);
1125
+ memo = rb_ary_new();
1120
1126
  each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1121
- return memo[0];
1127
+ return memo;
1122
1128
  }
1123
1129
 
1124
1130
  static inline int
1125
- str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1131
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1126
1132
  {
1127
1133
  return !tst_cp(cp_arr, len, str_cp);
1128
1134
  }
@@ -1146,9 +1152,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1146
1152
  cs_cp cs_len;
1147
1153
  VALUE orig_str_len;
1148
1154
 
1149
- rb_encoding *enc;
1155
+ rb_encoding *orig_enc, *utf8;
1150
1156
  char *s, *send, *t;
1151
- int ascompat, cr;
1157
+ int orig_was_utf8, cr;
1152
1158
 
1153
1159
  raise_arg_err_unless_string(str);
1154
1160
 
@@ -1159,24 +1165,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1159
1165
  return bang ? Qnil : str;
1160
1166
  }
1161
1167
 
1162
- if (!bang)
1168
+ orig_enc = rb_enc_get(str);
1169
+ utf8 = rb_utf8_encoding();
1170
+ orig_was_utf8 = orig_enc == utf8;
1171
+
1172
+ if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
1173
+ {
1174
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1175
+ }
1176
+ else
1163
1177
  {
1164
- str = rb_str_dup(str);
1178
+ if (!bang)
1179
+ {
1180
+ str = rb_str_dup(str);
1181
+ }
1165
1182
  }
1166
1183
 
1167
1184
  cps = cs_fetch_cps(set, &cs_len);
1168
1185
  rb_str_modify(str);
1169
- enc = rb_enc_get(str);
1170
- ascompat = rb_enc_asciicompat(enc);
1171
1186
  s = t = RSTRING_PTR(str);
1172
1187
  send = RSTRING_END(str);
1173
- cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1188
+ cr = ENC_CODERANGE_7BIT;
1189
+
1174
1190
  while (s < send)
1175
1191
  {
1176
1192
  unsigned int c;
1177
1193
  int clen;
1178
1194
 
1179
- if (ascompat && (c = *(unsigned char *)s) < 0x80)
1195
+ if ((c = *(unsigned char *)s) < 0x80)
1180
1196
  {
1181
1197
  if (tst_cp(cps, cs_len, c) != delete)
1182
1198
  {
@@ -1188,12 +1204,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1188
1204
  }
1189
1205
  else
1190
1206
  {
1191
- c = rb_enc_codepoint_len(s, send, &clen, enc);
1207
+ c = rb_enc_codepoint_len(s, send, &clen, utf8);
1192
1208
 
1193
1209
  if (tst_cp(cps, cs_len, c) != delete)
1194
1210
  {
1195
1211
  if (t != s)
1196
- rb_enc_mbcput(c, t, enc);
1212
+ rb_enc_mbcput(c, t, utf8);
1197
1213
  t += clen;
1198
1214
  if (cr == ENC_CODERANGE_7BIT)
1199
1215
  cr = ENC_CODERANGE_VALID;
@@ -1210,6 +1226,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1210
1226
  return Qnil;
1211
1227
  }
1212
1228
 
1229
+ if (!orig_was_utf8)
1230
+ {
1231
+ return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
1232
+ }
1233
+
1213
1234
  return str;
1214
1235
  }
1215
1236
 
@@ -4,11 +4,15 @@ class CharacterSet
4
4
 
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
+
7
8
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
- return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
- return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
- raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
9
+ object.each do |el| # rubocop:disable Lint/UnreachableLoop
10
+ if el.is_a?(Integer) && el >= 0 && el < 0x110000
11
+ return object
12
+ elsif el.is_a?(String) && el.length == 1
13
+ return object.to_a.join.encode('utf-8').codepoints
14
+ end
15
+ raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
12
16
  end
13
17
  end
14
18