character_set 1.1.1-java → 1.4.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +102 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
7
+ require 'sorted_set'
8
+ else
9
+ require 'set'
10
+ end
6
11
 
7
12
  def benchmark(caption: nil, cases: {})
8
13
  puts caption
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
@@ -10,7 +10,7 @@ Gem::Specification.new do |s|
10
10
  s.email = ['janosch84@gmail.com']
11
11
 
12
12
  s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
13
- s.homepage = 'https://github.com/janosch-x/character_set'
13
+ s.homepage = 'https://github.com/jaynetics/character_set'
14
14
  s.license = 'MIT'
15
15
 
16
16
  s.files = `git ls-files -z`.split("\x0").reject do |f|
@@ -22,12 +22,23 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
+ # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
+ # This dependency is only used if the C extension is unavailable.
27
+ # JRuby has it in the stdlib.
28
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
+ s.add_dependency 'sorted_set', '~> 1.0'
30
+ end
31
+
25
32
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'bundler', '~> 1.16'
27
- s.add_development_dependency 'rake', '~> 12.0'
28
- s.add_development_dependency 'rake-compiler', '~> 1.0'
33
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
+ s.add_development_dependency 'rake', '~> 13.0'
35
+ s.add_development_dependency 'rake-compiler', '~> 1.1'
29
36
  s.add_development_dependency 'range_compressor', '~> 1.0'
30
- s.add_development_dependency 'regexp_parser', '~> 1.1'
31
- s.add_development_dependency 'regexp_property_values', '~> 0.3.4'
37
+ s.add_development_dependency 'regexp_parser', '~> 1.6'
38
+ s.add_development_dependency 'regexp_property_values', '~> 1.0'
32
39
  s.add_development_dependency 'rspec', '~> 3.8'
40
+ if RUBY_VERSION.to_f >= 2.7
41
+ s.add_development_dependency 'codecov', '~> 0.2.12'
42
+ s.add_development_dependency 'rubocop', '~> 1.8'
43
+ end
33
44
  end
@@ -2,81 +2,180 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "unicode_casefold_table.h"
4
4
 
5
- #define SETBIT(byte_arr, bit) (byte_arr[bit >> 3] |= (1 << (bit & 0x07)))
6
- #define CLRBIT(byte_arr, bit) (byte_arr[bit >> 3] &= ~(1 << (bit & 0x07)))
7
- #define TSTBIT(byte_arr, bit) (byte_arr[bit >> 3] & (1 << (bit & 0x07)))
5
+ #define UNICODE_PLANE_SIZE 0x10000
6
+ #define UNICODE_PLANE_COUNT 17
7
+ #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
8
8
 
9
- typedef char cp_byte;
10
- typedef unsigned long cp_index;
9
+ // start at ascii size
10
+ #define CS_DEFAULT_INITIAL_LEN 128
11
11
 
12
- #define UNICODE_CP_COUNT 0x110000
13
- #define UNICODE_BYTES UNICODE_CP_COUNT / 8
14
- #define UNICODE_PLANE_SIZE 0x10000
15
- #define UNICODE_PLANE_COUNT UNICODE_CP_COUNT / UNICODE_PLANE_SIZE
12
+ typedef char cs_ar;
13
+ typedef unsigned long cs_cp;
14
+
15
+ struct cs_data
16
+ {
17
+ cs_ar *cps;
18
+ cs_cp len;
19
+ };
20
+
21
+ #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
22
+
23
+ static inline void
24
+ add_memspace_for_another_plane(struct cs_data *data)
25
+ {
26
+ data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
27
+ memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
28
+ data->len += UNICODE_PLANE_SIZE;
29
+ }
30
+
31
+ static inline void
32
+ ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
33
+ {
34
+ while (target_cp >= data->len)
35
+ {
36
+ add_memspace_for_another_plane(data);
37
+ }
38
+ }
39
+
40
+ static inline void
41
+ set_cp(struct cs_data *data, cs_cp cp)
42
+ {
43
+ ensure_memsize_fits(data, cp);
44
+ data->cps[cp >> 3] |= (1 << (cp & 0x07));
45
+ }
46
+
47
+ static inline int
48
+ tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
49
+ {
50
+ return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
51
+ }
52
+
53
+ static inline void
54
+ clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
55
+ {
56
+ if (cp < len)
57
+ {
58
+ cps[cp >> 3] &= ~(1 << (cp & 0x07));
59
+ }
60
+ }
16
61
 
17
62
  static void
18
- free_character_set(void* codepoints) {
19
- free(codepoints);
63
+ cs_free(void *ptr)
64
+ {
65
+ struct cs_data *data = ptr;
66
+ ruby_xfree(data->cps);
67
+ ruby_xfree(data);
20
68
  }
21
69
 
22
70
  static size_t
23
- memsize_character_set(const void* codepoints) {
24
- return sizeof(cp_byte) * UNICODE_BYTES;
25
- }
26
-
27
- static const rb_data_type_t
28
- character_set_type = {
29
- .wrap_struct_name = "character_set",
30
- .function = {
31
- .dmark = NULL,
32
- .dfree = free_character_set,
33
- .dsize = memsize_character_set,
34
- },
35
- .data = NULL,
36
- .flags = RUBY_TYPED_FREE_IMMEDIATELY,
71
+ cs_memsize(const void *ptr)
72
+ {
73
+ const struct cs_data *data = ptr;
74
+ return sizeof(*data) + CS_MSIZE(data->len);
75
+ }
76
+
77
+ static const rb_data_type_t cs_type = {
78
+ .wrap_struct_name = "character_set",
79
+ .function = {
80
+ .dmark = NULL,
81
+ .dfree = cs_free,
82
+ .dsize = cs_memsize,
83
+ },
84
+ .data = NULL,
85
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
86
  };
38
87
 
39
- #define FETCH_CODEPOINTS(set, cps)\
40
- TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
88
+ static inline VALUE
89
+ cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
90
+ {
91
+ VALUE cs;
92
+ struct cs_data *data;
93
+ cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
94
+ data->cps = ruby_xmalloc(CS_MSIZE(len));
95
+ memset(data->cps, 0, CS_MSIZE(len));
96
+ data->len = len;
97
+
98
+ if (data_ptr)
99
+ {
100
+ *data_ptr = data;
101
+ }
41
102
 
42
- #define NEW_CHARACTER_SET(klass, cps)\
43
- TypedData_Wrap_Struct(klass, &character_set_type, cps)
103
+ return cs;
104
+ }
44
105
 
45
- static VALUE
46
- method_allocate(VALUE self) {
47
- cp_byte *cp_arr;
48
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
- return NEW_CHARACTER_SET(self, cp_arr);
106
+ static inline VALUE
107
+ cs_alloc(VALUE klass, struct cs_data **data_ptr)
108
+ {
109
+ return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
50
110
  }
51
111
 
52
- #define FOR_EACH_ACTIVE_CODEPOINT(action)\
53
- cp_index cp;\
54
- cp_byte *cps;\
55
- FETCH_CODEPOINTS(self, cps);\
56
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
57
- if (TSTBIT(cps, cp)) { action; }\
58
- }
112
+ static inline struct cs_data *
113
+ cs_fetch_data(VALUE cs)
114
+ {
115
+ struct cs_data *data;
116
+ TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
117
+ return data;
118
+ }
119
+
120
+ static inline cs_ar *
121
+ cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
122
+ {
123
+ struct cs_data *data;
124
+ data = cs_fetch_data(cs);
125
+ *len_ptr = data->len;
126
+ return data->cps;
127
+ }
128
+
129
+ static VALUE
130
+ cs_method_allocate(VALUE self)
131
+ {
132
+ return cs_alloc(self, 0);
133
+ }
134
+
135
+ #define FOR_EACH_ACTIVE_CODEPOINT(action) \
136
+ do \
137
+ { \
138
+ cs_cp cp, len; \
139
+ cs_ar *cps; \
140
+ cps = cs_fetch_cps(self, &len); \
141
+ for (cp = 0; cp < len; cp++) \
142
+ { \
143
+ if (tst_cp(cps, len, cp)) \
144
+ { \
145
+ action; \
146
+ } \
147
+ } \
148
+ } while (0)
59
149
 
60
150
  // ***************************
61
151
  // `Set` compatibility methods
62
152
  // ***************************
63
153
 
64
- static inline VALUE
65
- enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
- cp_index count;
154
+ static inline cs_cp
155
+ cs_active_cp_count(VALUE self)
156
+ {
157
+ cs_cp count;
67
158
  count = 0;
68
159
  FOR_EACH_ACTIVE_CODEPOINT(count++);
69
- return LONG2FIX(count);
160
+ return count;
70
161
  }
71
162
 
72
163
  static VALUE
73
- method_length(VALUE self) {
74
- return enumerator_length(self, 0, 0);
164
+ cs_method_length(VALUE self)
165
+ {
166
+ return LONG2FIX(cs_active_cp_count(self));
167
+ }
168
+
169
+ static inline VALUE
170
+ cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
171
+ {
172
+ return LONG2FIX(cs_active_cp_count(self));
75
173
  }
76
174
 
77
175
  static VALUE
78
- method_each(VALUE self) {
79
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
176
+ cs_method_each(VALUE self)
177
+ {
178
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
80
179
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
180
  return self;
82
181
  }
@@ -84,16 +183,19 @@ method_each(VALUE self) {
84
183
  // returns an Array of codepoint Integers by default.
85
184
  // returns an Array of Strings of length 1 if passed `true`.
86
185
  static VALUE
87
- method_to_a(int argc, VALUE *argv, VALUE self) {
186
+ cs_method_to_a(int argc, VALUE *argv, VALUE self)
187
+ {
88
188
  VALUE arr;
89
189
  rb_encoding *enc;
90
190
  rb_check_arity(argc, 0, 1);
91
191
 
92
192
  arr = rb_ary_new();
93
- if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
193
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
194
+ {
94
195
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
196
  }
96
- else {
197
+ else
198
+ {
97
199
  enc = rb_utf8_encoding();
98
200
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
201
  }
@@ -102,302 +204,473 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
102
204
  }
103
205
 
104
206
  static VALUE
105
- method_empty_p(VALUE self) {
207
+ cs_method_empty_p(VALUE self)
208
+ {
106
209
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
210
  return Qtrue;
108
211
  }
109
212
 
110
213
  static VALUE
111
- method_hash(VALUE self) {
112
- cp_index cp, hash, four_byte_value;
113
- cp_byte *cps;
114
- FETCH_CODEPOINTS(self, cps);
214
+ cs_method_hash(VALUE self)
215
+ {
216
+ cs_cp cp, len, hash, four_byte_value;
217
+ cs_ar *cps;
218
+ cps = cs_fetch_cps(self, &len);
219
+ four_byte_value = 0;
115
220
 
116
221
  hash = 17;
117
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
- if (cp % 32 == 0) {
119
- if (cp != 0) { hash = hash * 23 + four_byte_value; }
222
+ for (cp = 0; cp < len; cp++)
223
+ {
224
+ if (cp % 32 == 0)
225
+ {
226
+ if (cp != 0)
227
+ {
228
+ hash = hash * 23 + four_byte_value;
229
+ }
120
230
  four_byte_value = 0;
121
231
  }
122
- if (TSTBIT(cps, cp)) four_byte_value++;
232
+ if (tst_cp(cps, len, cp))
233
+ {
234
+ four_byte_value++;
235
+ }
123
236
  }
124
237
 
125
238
  return LONG2FIX(hash);
126
239
  }
127
240
 
128
241
  static inline VALUE
129
- delete_if_block_result(VALUE self, int truthy) {
242
+ cs_delete_if_block_result(VALUE self, int truthy)
243
+ {
130
244
  VALUE result;
131
245
  rb_need_block();
132
246
  rb_check_frozen(self);
133
247
  FOR_EACH_ACTIVE_CODEPOINT(
134
- result = rb_yield(LONG2FIX(cp));
135
- if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
- );
248
+ result = rb_yield(LONG2FIX(cp));
249
+ if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
137
250
  return self;
138
251
  }
139
252
 
140
253
  static VALUE
141
- method_delete_if(VALUE self) {
142
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
- return delete_if_block_result(self, 1);
254
+ cs_method_delete_if(VALUE self)
255
+ {
256
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
257
+ return cs_delete_if_block_result(self, 1);
144
258
  }
145
259
 
146
260
  static VALUE
147
- method_keep_if(VALUE self) {
148
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
- return delete_if_block_result(self, 0);
261
+ cs_method_keep_if(VALUE self)
262
+ {
263
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
264
+ return cs_delete_if_block_result(self, 0);
150
265
  }
151
266
 
152
267
  static VALUE
153
- method_clear(VALUE self) {
154
- cp_index cp;
155
- cp_byte *cps;
268
+ cs_method_clear(VALUE self)
269
+ {
270
+ struct cs_data *data;
156
271
  rb_check_frozen(self);
157
- FETCH_CODEPOINTS(self, cps);
158
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
- CLRBIT(cps, cp);
160
- }
272
+ data = cs_fetch_data(self);
273
+ memset(data->cps, 0, CS_MSIZE(data->len));
161
274
  return self;
162
275
  }
163
276
 
164
- #define RETURN_NEW_SET_BASED_ON(condition)\
165
- cp_index cp;\
166
- cp_byte *a, *b, *new_cps;\
167
- FETCH_CODEPOINTS(self, a);\
168
- if (other) FETCH_CODEPOINTS(other, b);\
169
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
170
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
171
- if (condition) SETBIT(new_cps, cp);\
172
- }\
173
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\
277
+ static VALUE
278
+ cs_method_min(VALUE self)
279
+ {
280
+ FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
281
+ return Qnil;
282
+ }
283
+
284
+ static VALUE
285
+ cs_method_max(VALUE self)
286
+ {
287
+ cs_cp len;
288
+ long reverse_idx;
289
+ cs_ar *cps;
290
+ cps = cs_fetch_cps(self, &len);
291
+ for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
292
+ {
293
+ if (tst_cp(cps, len, reverse_idx))
294
+ {
295
+ return LONG2FIX(reverse_idx);
296
+ }
297
+ }
298
+ return Qnil;
299
+ }
300
+
301
+ static VALUE
302
+ cs_method_minmax(VALUE self)
303
+ {
304
+ VALUE arr;
305
+ arr = rb_ary_new2(2);
306
+ rb_ary_push(arr, cs_method_min(self));
307
+ rb_ary_push(arr, cs_method_max(self));
308
+ return arr;
309
+ }
310
+
311
+ #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
312
+ do \
313
+ { \
314
+ VALUE new_cs; \
315
+ cs_cp cp, alen, blen; \
316
+ cs_ar *acps, *bcps; \
317
+ struct cs_data *new_data; \
318
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
+ acps = cs_fetch_cps(cs_a, &alen); \
320
+ bcps = cs_fetch_cps(cs_b, &blen); \
321
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
+ { \
323
+ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
324
+ { \
325
+ set_cp(new_data, cp); \
326
+ } \
327
+ } \
328
+ return new_cs; \
329
+ } while (0)
174
330
 
175
331
  static VALUE
176
- method_intersection(VALUE self, VALUE other) {
177
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
332
+ cs_method_intersection(VALUE self, VALUE other)
333
+ {
334
+ RETURN_COMBINED_CS(self, other, &&);
178
335
  }
179
336
 
180
337
  static VALUE
181
- method_exclusion(VALUE self, VALUE other) {
182
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
338
+ cs_method_exclusion(VALUE self, VALUE other)
339
+ {
340
+ RETURN_COMBINED_CS(self, other, ^);
183
341
  }
184
342
 
185
343
  static VALUE
186
- method_union(VALUE self, VALUE other) {
187
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
344
+ cs_method_union(VALUE self, VALUE other)
345
+ {
346
+ RETURN_COMBINED_CS(self, other, ||);
188
347
  }
189
348
 
190
349
  static VALUE
191
- method_difference(VALUE self, VALUE other) {
192
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
350
+ cs_method_difference(VALUE self, VALUE other)
351
+ {
352
+ RETURN_COMBINED_CS(self, other, >);
193
353
  }
194
354
 
195
355
  static VALUE
196
- method_include_p(VALUE self, VALUE num) {
197
- cp_byte *cps;
198
- FETCH_CODEPOINTS(self, cps);
199
- return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
356
+ cs_method_include_p(VALUE self, VALUE num)
357
+ {
358
+ cs_ar *cps;
359
+ cs_cp len;
360
+ cps = cs_fetch_cps(self, &len);
361
+ return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
362
  }
201
363
 
202
- static inline int
203
- toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
- cp_index cp;
205
- cp_byte *cps;
206
- rb_check_frozen(set);
207
- FETCH_CODEPOINTS(set, cps);
364
+ static inline VALUE
365
+ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
366
+ {
367
+ cs_cp cp, len;
368
+ cs_ar *cps;
369
+ struct cs_data *data;
370
+ rb_check_frozen(cs);
371
+ data = cs_fetch_data(cs);
372
+ cps = data->cps;
373
+ len = data->len;
208
374
  cp = FIX2ULONG(cp_num);
209
- if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
- return 0;
375
+ if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
376
+ {
377
+ return Qnil;
211
378
  }
212
- else {
213
- if (on) { SETBIT(cps, cp); }
214
- else { CLRBIT(cps, cp); }
215
- return 1;
379
+ else
380
+ {
381
+ if (on)
382
+ {
383
+ set_cp(data, cp);
384
+ }
385
+ else
386
+ {
387
+ clr_cp(cps, len, cp);
388
+ }
389
+ return cs;
216
390
  }
217
391
  }
218
392
 
219
393
  static VALUE
220
- method_add(VALUE self, VALUE cp_num) {
221
- return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
394
+ cs_method_add(VALUE self, VALUE cp_num)
395
+ {
396
+ return cs_toggle_codepoint(self, cp_num, 1, 0);
222
397
  }
223
398
 
224
399
  static VALUE
225
- method_add_p(VALUE self, VALUE cp_num) {
226
- return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
400
+ cs_method_add_p(VALUE self, VALUE cp_num)
401
+ {
402
+ return cs_toggle_codepoint(self, cp_num, 1, 1);
227
403
  }
228
404
 
229
405
  static VALUE
230
- method_delete(VALUE self, VALUE cp_num) {
231
- return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
406
+ cs_method_delete(VALUE self, VALUE cp_num)
407
+ {
408
+ return cs_toggle_codepoint(self, cp_num, 0, 0);
232
409
  }
233
410
 
234
411
  static VALUE
235
- method_delete_p(VALUE self, VALUE cp_num) {
236
- return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
412
+ cs_method_delete_p(VALUE self, VALUE cp_num)
413
+ {
414
+ return cs_toggle_codepoint(self, cp_num, 0, 1);
237
415
  }
238
416
 
239
- #define COMPARE_SETS(action)\
240
- cp_index cp;\
241
- cp_byte *cps, *other_cps;\
242
- FETCH_CODEPOINTS(self, cps);\
243
- FETCH_CODEPOINTS(other, other_cps);\
244
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
245
-
246
417
  static VALUE
247
- method_intersect_p(VALUE self, VALUE other) {
248
- COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
418
+ cs_method_intersect_p(VALUE self, VALUE other)
419
+ {
420
+ cs_cp cp, alen, blen;
421
+ cs_ar *acps, *bcps;
422
+ acps = cs_fetch_cps(self, &alen);
423
+ bcps = cs_fetch_cps(other, &blen);
424
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
425
+ {
426
+ if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
427
+ {
428
+ return Qtrue;
429
+ }
430
+ }
249
431
  return Qfalse;
250
432
  }
251
433
 
252
434
  static VALUE
253
- method_disjoint_p(VALUE self, VALUE other) {
254
- return method_intersect_p(self, other) ? Qfalse : Qtrue;
435
+ cs_method_disjoint_p(VALUE self, VALUE other)
436
+ {
437
+ return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
255
438
  }
256
439
 
257
440
  static inline int
258
- is_character_set(VALUE obj) {
259
- return rb_typeddata_is_kind_of(obj, &character_set_type);
441
+ cs_check_type(VALUE obj)
442
+ {
443
+ return rb_typeddata_is_kind_of(obj, &cs_type);
260
444
  }
261
445
 
262
446
  static VALUE
263
- method_eql_p(VALUE self, VALUE other) {
264
- if (!is_character_set(other)) return Qfalse;
265
- if (self == other) return Qtrue; // same object_id
266
-
267
- COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
-
447
+ cs_cps_eql(VALUE cs_a, VALUE cs_b)
448
+ {
449
+ cs_cp cp, alen, blen;
450
+ cs_ar *acps, *bcps;
451
+ acps = cs_fetch_cps(cs_a, &alen);
452
+ bcps = cs_fetch_cps(cs_b, &blen);
453
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
454
+ {
455
+ if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
456
+ {
457
+ return Qfalse;
458
+ }
459
+ }
269
460
  return Qtrue;
270
461
  }
271
462
 
463
+ static VALUE
464
+ cs_method_eql_p(VALUE self, VALUE other)
465
+ {
466
+ if (!cs_check_type(other))
467
+ {
468
+ return Qfalse;
469
+ }
470
+ if (self == other) // same object_id
471
+ {
472
+ return Qtrue;
473
+ }
474
+ return cs_cps_eql(self, other);
475
+ }
476
+
272
477
  static inline VALUE
273
- merge_character_set(VALUE self, VALUE other) {
274
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
- return self;
478
+ cs_merge_cs(VALUE recipient, VALUE source)
479
+ {
480
+ cs_cp cp, source_len;
481
+ struct cs_data *data;
482
+ cs_ar *source_cps;
483
+ data = cs_fetch_data(recipient);
484
+ source_cps = cs_fetch_cps(source, &source_len);
485
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
486
+ {
487
+ if (tst_cp(source_cps, source_len, cp))
488
+ {
489
+ set_cp(data, cp);
490
+ }
491
+ }
492
+ return recipient;
276
493
  }
277
494
 
278
- static inline void
279
- raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
- if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
495
+ static inline cs_cp
496
+ cs_checked_cp(VALUE object_id)
497
+ {
498
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
499
+ {
500
+ return FIX2ULONG(object_id);
501
+ }
281
502
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
503
  }
283
504
 
284
505
  static inline VALUE
285
- merge_rb_range(VALUE self, VALUE rb_range) {
506
+ cs_merge_rb_range(VALUE self, VALUE rb_range)
507
+ {
286
508
  VALUE from_id, upto_id;
509
+ cs_cp from_cp, upto_cp, cont_len, rem;
287
510
  int excl;
288
- cp_index cp;
289
- cp_byte *cps;
290
- FETCH_CODEPOINTS(self, cps);
511
+ struct cs_data *data;
512
+ data = cs_fetch_data(self);
291
513
 
292
- if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
514
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
515
+ {
293
516
  rb_raise(rb_eArgError, "pass a Range");
294
517
  }
295
- if (excl) upto_id -= 2;
518
+ if (excl)
519
+ {
520
+ upto_id -= 2;
521
+ }
522
+
523
+ from_cp = cs_checked_cp(from_id);
524
+ upto_cp = cs_checked_cp(upto_id);
296
525
 
297
- raise_arg_err_unless_valid_as_cp(from_id);
298
- raise_arg_err_unless_valid_as_cp(upto_id);
526
+ if (upto_cp > from_cp && (upto_cp - from_cp > 6))
527
+ {
528
+ // set bits in preceding partially toggled bytes individually
529
+ for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
530
+ {
531
+ set_cp(data, from_cp);
532
+ }
533
+ // memset contiguous bits directly
534
+ cont_len = upto_cp - from_cp + 1;
535
+ rem = cont_len % 8;
536
+ ensure_memsize_fits(data, upto_cp);
537
+ memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
538
+ from_cp = upto_cp - rem + 1;
539
+ }
299
540
 
300
- for (/* */; from_id <= upto_id; from_id += 2) {
301
- cp = FIX2ULONG(from_id);
302
- SETBIT(cps, cp);
541
+ // set bits in partially toggled bytes individually
542
+ for (/* */; from_cp <= upto_cp; from_cp++)
543
+ {
544
+ set_cp(data, from_cp);
303
545
  }
546
+
304
547
  return self;
305
548
  }
306
549
 
307
550
  static inline VALUE
308
- merge_rb_array(VALUE self, VALUE rb_array) {
309
- VALUE el;
310
- cp_byte *cps;
311
- VALUE array_length, i;
312
- FETCH_CODEPOINTS(self, cps);
551
+ cs_merge_rb_array(VALUE self, VALUE rb_array)
552
+ {
553
+ VALUE el, array_length, i;
554
+ struct cs_data *data;
313
555
  Check_Type(rb_array, T_ARRAY);
556
+ data = cs_fetch_data(self);
314
557
  array_length = RARRAY_LEN(rb_array);
315
- for (i = 0; i < array_length; i++) {
558
+ for (i = 0; i < array_length; i++)
559
+ {
316
560
  el = RARRAY_AREF(rb_array, i);
317
- raise_arg_err_unless_valid_as_cp(el);
318
- SETBIT(cps, FIX2ULONG(el));
561
+ set_cp(data, cs_checked_cp(el));
319
562
  }
320
563
  return self;
321
564
  }
322
565
 
323
566
  static VALUE
324
- method_merge(VALUE self, VALUE other) {
567
+ cs_method_merge(VALUE self, VALUE other)
568
+ {
325
569
  rb_check_frozen(self);
326
- if (is_character_set(other)) {
327
- return merge_character_set(self, other);
570
+ if (cs_check_type(other))
571
+ {
572
+ return cs_merge_cs(self, other);
328
573
  }
329
- else if (TYPE(other) == T_ARRAY) {
330
- return merge_rb_array(self, other);
574
+ else if (TYPE(other) == T_ARRAY)
575
+ {
576
+ return cs_merge_rb_array(self, other);
331
577
  }
332
- return merge_rb_range(self, other);
578
+ return cs_merge_rb_range(self, other);
333
579
  }
334
580
 
335
581
  static VALUE
336
- method_initialize_copy(VALUE self, VALUE other) {
337
- merge_character_set(self, other);
338
- return other;
582
+ cs_method_initialize_copy(VALUE self, VALUE orig)
583
+ {
584
+ cs_merge_cs(self, orig);
585
+ return self;
339
586
  }
340
587
 
341
588
  static VALUE
342
- method_subtract(VALUE self, VALUE other) {
589
+ cs_method_subtract(VALUE self, VALUE other)
590
+ {
591
+ cs_cp cp, len, other_len;
592
+ cs_ar *cps, *other_cps;
343
593
  rb_check_frozen(self);
344
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
594
+ cps = cs_fetch_cps(self, &len);
595
+ other_cps = cs_fetch_cps(other, &other_len);
596
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
597
+ {
598
+ if (tst_cp(other_cps, other_len, cp))
599
+ {
600
+ clr_cp(cps, len, cp);
601
+ }
602
+ }
345
603
  return self;
346
604
  }
347
605
 
348
606
  static inline int
349
- a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
- cp_byte *cps_a, *cps_b;
351
- cp_index cp, size_a, size_b;
607
+ cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
608
+ {
609
+ cs_ar *a, *b;
610
+ cs_cp cp, alen, blen, count_a, count_b;
352
611
 
353
- if (!is_character_set(set_a) || !is_character_set(set_b)) {
612
+ if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
613
+ {
354
614
  rb_raise(rb_eArgError, "pass a CharacterSet");
355
615
  }
356
616
 
357
- FETCH_CODEPOINTS(set_a, cps_a);
358
- FETCH_CODEPOINTS(set_b, cps_b);
359
-
360
- *is_proper = 0;
361
- size_a = 0;
362
- size_b = 0;
363
-
364
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
- if (TSTBIT(cps_a, cp)) {
366
- if (!TSTBIT(cps_b, cp)) return 0;
367
- size_a++;
368
- size_b++;
617
+ a = cs_fetch_cps(cs_a, &alen);
618
+ b = cs_fetch_cps(cs_b, &blen);
619
+
620
+ count_a = 0;
621
+ count_b = 0;
622
+
623
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
624
+ {
625
+ if (tst_cp(a, alen, cp))
626
+ {
627
+ if (!tst_cp(b, blen, cp))
628
+ {
629
+ return 0;
630
+ }
631
+ count_a++;
632
+ count_b++;
633
+ }
634
+ else if (tst_cp(b, blen, cp))
635
+ {
636
+ count_b++;
369
637
  }
370
- else if (TSTBIT(cps_b, cp)) size_b++;
371
638
  }
372
639
 
373
- if (size_b > size_a) *is_proper = 1;
640
+ if (is_proper_ptr)
641
+ {
642
+ *is_proper_ptr = count_b > count_a;
643
+ }
644
+
374
645
  return 1;
375
646
  }
376
647
 
377
648
  static VALUE
378
- method_subset_p(VALUE self, VALUE other) {
379
- int is_proper;
380
- return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
649
+ cs_method_subset_p(VALUE self, VALUE other)
650
+ {
651
+ return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
381
652
  }
382
653
 
383
654
  static VALUE
384
- method_proper_subset_p(VALUE self, VALUE other) {
385
- int is, is_proper;
386
- is = a_subset_of_b(self, other, &is_proper);
387
- return (is && is_proper) ? Qtrue : Qfalse;
655
+ cs_method_proper_subset_p(VALUE self, VALUE other)
656
+ {
657
+ int is_subset, is_proper;
658
+ is_subset = cs_a_subset_of_b(self, other, &is_proper);
659
+ return (is_subset && is_proper) ? Qtrue : Qfalse;
388
660
  }
389
661
 
390
662
  static VALUE
391
- method_superset_p(VALUE self, VALUE other) {
392
- int is_proper;
393
- return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
663
+ cs_method_superset_p(VALUE self, VALUE other)
664
+ {
665
+ return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
394
666
  }
395
667
 
396
668
  static VALUE
397
- method_proper_superset_p(VALUE self, VALUE other) {
398
- int is, is_proper;
399
- is = a_subset_of_b(other, self, &is_proper);
400
- return (is && is_proper) ? Qtrue : Qfalse;
669
+ cs_method_proper_superset_p(VALUE self, VALUE other)
670
+ {
671
+ int is_superset, is_proper;
672
+ is_superset = cs_a_subset_of_b(other, self, &is_proper);
673
+ return (is_superset && is_proper) ? Qtrue : Qfalse;
401
674
  }
402
675
 
403
676
  // *******************************
@@ -405,42 +678,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
405
678
  // *******************************
406
679
 
407
680
  static VALUE
408
- class_method_from_ranges(VALUE self, VALUE ranges) {
409
- VALUE new_set, range_count, i;
410
- new_set = rb_class_new_instance(0, 0, self);
681
+ cs_class_method_from_ranges(VALUE self, VALUE ranges)
682
+ {
683
+ VALUE new_cs, range_count, i;
684
+ new_cs = rb_class_new_instance(0, 0, self);
411
685
  range_count = RARRAY_LEN(ranges);
412
- for (i = 0; i < range_count; i++) {
413
- merge_rb_range(new_set, RARRAY_AREF(ranges, i));
686
+ for (i = 0; i < range_count; i++)
687
+ {
688
+ cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
414
689
  }
415
- return new_set;
690
+ return new_cs;
416
691
  }
417
692
 
418
693
  static VALUE
419
- method_ranges(VALUE self) {
420
- VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
694
+ cs_method_ranges(VALUE self)
695
+ {
696
+ VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
421
697
 
422
698
  ranges = rb_ary_new();
423
- previous_codepoint = 0;
699
+ previous_cp_num = 0;
424
700
  current_start = 0;
425
701
  current_end = 0;
426
702
 
427
703
  FOR_EACH_ACTIVE_CODEPOINT(
428
- codepoint = LONG2FIX(cp);
704
+ cp_num = LONG2FIX(cp);
429
705
 
430
- if (!previous_codepoint) {
431
- current_start = codepoint;
432
- }
433
- else if (previous_codepoint + 2 != codepoint) {
434
- // gap found, finalize previous range
435
- rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
- current_start = codepoint;
437
- }
438
- current_end = codepoint;
439
- previous_codepoint = codepoint;
440
- );
706
+ if (!previous_cp_num) {
707
+ current_start = cp_num;
708
+ } else if (previous_cp_num + 2 != cp_num) {
709
+ // gap found, finalize previous range
710
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
711
+ current_start = cp_num;
712
+ } current_end = cp_num;
713
+ previous_cp_num = cp_num;);
441
714
 
442
715
  // add final range
443
- if (current_start) {
716
+ if (current_start)
717
+ {
444
718
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
719
  }
446
720
 
@@ -448,117 +722,233 @@ method_ranges(VALUE self) {
448
722
  }
449
723
 
450
724
  static VALUE
451
- method_sample(int argc, VALUE *argv, VALUE self) {
452
- VALUE to_a_args[1], array;
725
+ cs_method_sample(int argc, VALUE *argv, VALUE self)
726
+ {
727
+ VALUE array, to_a_args[1] = {Qtrue};
453
728
  rb_check_arity(argc, 0, 1);
454
- to_a_args[0] = Qtrue;
455
- array = method_to_a(1, to_a_args, self);
729
+ array = cs_method_to_a(1, to_a_args, self);
456
730
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
731
  }
458
732
 
459
733
  static inline VALUE
460
- new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
- cp_byte *cps, *new_cps;
462
- cp_index cp;
463
- FETCH_CODEPOINTS(set, cps);
464
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
- for (cp = from; cp <= upto; cp++) {
466
- if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
734
+ cs_from_section(VALUE set, cs_cp from, cs_cp upto)
735
+ {
736
+ VALUE new_cs;
737
+ cs_ar *cps;
738
+ cs_cp cp, len;
739
+ struct cs_data *new_data;
740
+ new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
741
+ cps = cs_fetch_cps(set, &len);
742
+ for (cp = from; cp <= upto; cp++)
743
+ {
744
+ if (tst_cp(cps, len, cp))
745
+ {
746
+ set_cp(new_data, cp);
747
+ }
467
748
  }
468
- return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
749
+ return new_cs;
469
750
  }
470
751
 
471
752
  static VALUE
472
- method_bmp_part(VALUE self) {
473
- return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
753
+ cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
754
+ {
755
+ return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
756
+ }
757
+
758
+ static inline cs_cp
759
+ cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
760
+ {
761
+ cs_ar *cps;
762
+ cs_cp cp, count, len;
763
+ cps = cs_fetch_cps(set, &len);
764
+ for (count = 0, cp = from; cp <= upto; cp++)
765
+ {
766
+ if (tst_cp(cps, len, cp))
767
+ {
768
+ count++;
769
+ }
770
+ }
771
+ return count;
474
772
  }
475
773
 
476
774
  static VALUE
477
- method_astral_part(VALUE self) {
478
- return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
775
+ cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
776
+ {
777
+ cs_cp count;
778
+ count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
779
+ return LONG2FIX(count);
479
780
  }
480
781
 
481
782
  static inline VALUE
482
- set_has_member_in_plane(VALUE set, unsigned int plane) {
483
- cp_byte *cps;
484
- cp_index cp, max_cp;
485
- FETCH_CODEPOINTS(set, cps);
486
- cp = plane * UNICODE_PLANE_SIZE;
487
- max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
- for (/* */; cp <= max_cp; cp++) {
489
- if (TSTBIT(cps, cp)) return Qtrue;
783
+ cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
784
+ {
785
+ cs_cp cp;
786
+ for (cp = from; cp <= upto; cp++)
787
+ {
788
+ if (tst_cp(cps, len, cp))
789
+ {
790
+ return Qtrue;
791
+ }
490
792
  }
491
793
  return Qfalse;
492
794
  }
493
795
 
494
796
  static VALUE
495
- method_planes(VALUE self) {
797
+ cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
798
+ {
799
+ cs_ar *cps;
800
+ cs_cp len;
801
+ cps = cs_fetch_cps(self, &len);
802
+ return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
803
+ }
804
+
805
+ static inline VALUE
806
+ cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
807
+ {
808
+ double section_count, total_count;
809
+ section_count = (double)cs_active_cp_count_in_section(set, from, upto);
810
+ total_count = (double)cs_active_cp_count(set);
811
+ return DBL2NUM(section_count / total_count);
812
+ }
813
+
814
+ static VALUE
815
+ cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
816
+ {
817
+ return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
818
+ }
819
+
820
+ #define MAX_CP 0x10FFFF
821
+ #define MAX_ASCII_CP 0x7F
822
+ #define MAX_BMP_CP 0xFFFF
823
+ #define MIN_ASTRAL_CP 0x10000
824
+
825
+ static inline VALUE
826
+ cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
827
+ {
828
+ cs_cp plane_beg, plane_end;
829
+ plane_beg = plane * UNICODE_PLANE_SIZE;
830
+ plane_end = (plane + 1) * MAX_BMP_CP;
831
+ return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
832
+ }
833
+
834
+ static VALUE
835
+ cs_method_planes(VALUE self)
836
+ {
837
+ cs_ar *cps;
838
+ cs_cp len;
496
839
  unsigned int i;
497
840
  VALUE planes;
841
+ cps = cs_fetch_cps(self, &len);
498
842
  planes = rb_ary_new();
499
- for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
- if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
843
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++)
844
+ {
845
+ if (cs_has_cp_in_plane(cps, len, i))
846
+ {
847
+ rb_ary_push(planes, INT2FIX(i));
848
+ }
501
849
  }
502
850
  return planes;
503
851
  }
504
852
 
505
- static VALUE
506
- method_member_in_plane_p(VALUE self, VALUE plane_num) {
853
+ static inline int
854
+ cs_valid_plane_num(VALUE num)
855
+ {
507
856
  int plane;
508
- Check_Type(plane_num, T_FIXNUM);
509
- plane = FIX2INT(plane_num);
510
- if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
- rb_raise(rb_eArgError, "plane must be between 0 and 16");
857
+ Check_Type(num, T_FIXNUM);
858
+ plane = FIX2INT(num);
859
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
860
+ {
861
+ rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
512
862
  }
513
- return set_has_member_in_plane(self, plane);
863
+ return plane;
864
+ }
865
+
866
+ static VALUE
867
+ cs_method_plane(VALUE self, VALUE plane_num)
868
+ {
869
+ cs_cp plane, plane_beg, plane_end;
870
+ plane = cs_valid_plane_num(plane_num);
871
+ plane_beg = plane * UNICODE_PLANE_SIZE;
872
+ plane_end = (plane + 1) * MAX_BMP_CP;
873
+ return cs_from_section(self, plane_beg, plane_end);
874
+ }
875
+
876
+ static VALUE
877
+ cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
878
+ {
879
+ cs_ar *cps;
880
+ cs_cp len;
881
+ unsigned int plane;
882
+ plane = cs_valid_plane_num(plane_num);
883
+ cps = cs_fetch_cps(self, &len);
884
+ return cs_has_cp_in_plane(cps, len, plane);
514
885
  }
515
886
 
516
887
  #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
517
888
 
518
889
  static VALUE
519
- method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
- int include_surrogates;
521
- cp_index upto;
522
- VALUE other;
523
- other = 0;
890
+ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
891
+ {
892
+ int inc_surr;
893
+ cs_cp upto, cp, len;
894
+ cs_ar *cps;
895
+ VALUE new_cs;
896
+ struct cs_data *new_data;
897
+
524
898
  rb_check_arity(argc, 0, 2);
525
- include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
- if ((argc > 1) && FIXNUM_P(argv[1])) {
527
- upto = FIX2ULONG(argv[1]);
528
- RETURN_NEW_SET_BASED_ON(
529
- cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
- );
899
+
900
+ cps = cs_fetch_cps(self, &len);
901
+ inc_surr = argc && argv[0] == Qtrue;
902
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
903
+ upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
904
+
905
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
906
+ {
907
+ if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
908
+ {
909
+ set_cp(new_data, cp);
910
+ }
531
911
  }
532
- RETURN_NEW_SET_BASED_ON(
533
- !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
- );
912
+
913
+ return new_cs;
535
914
  }
536
915
 
537
- typedef int(*str_cp_handler)(unsigned int, cp_byte*);
916
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
538
917
 
539
918
  static inline int
540
- add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
- SETBIT(cp_arr, str_cp);
919
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
920
+ {
921
+ set_cp(data, str_cp);
542
922
  return 1;
543
923
  }
544
924
 
545
925
  static VALUE
546
- method_case_insensitive(VALUE self) {
547
- cp_index i;
548
- cp_byte *new_cps;
549
-
550
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
926
+ cs_method_case_insensitive(VALUE self)
927
+ {
928
+ cs_cp i, len;
929
+ cs_ar *cps;
930
+ VALUE new_cs;
931
+ struct cs_data *new_data;
551
932
 
552
- FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
933
+ cps = cs_fetch_cps(self, &len);
934
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
935
+ cs_merge_cs(new_cs, self);
553
936
 
554
- for (i = 0; i < CASEFOLD_COUNT; i++) {
937
+ for (i = 0; i < CASEFOLD_COUNT; i++)
938
+ {
555
939
  casefold_mapping m = unicode_casefold_table[i];
556
940
 
557
- if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
- else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
941
+ if (tst_cp(cps, len, m.from))
942
+ {
943
+ set_cp(new_data, m.to);
944
+ }
945
+ else if (tst_cp(cps, len, m.to))
946
+ {
947
+ set_cp(new_data, m.from);
948
+ }
559
949
  }
560
950
 
561
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
951
+ return new_cs;
562
952
 
563
953
  // OnigCaseFoldType flags;
564
954
  // rb_encoding *enc;
@@ -573,20 +963,27 @@ method_case_insensitive(VALUE self) {
573
963
  }
574
964
 
575
965
  static inline VALUE
576
- each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
- long i;
966
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
967
+ {
968
+ long i, str_len;
578
969
  unsigned int str_cp;
970
+ str_len = RSTRING_LEN(str);
579
971
 
580
- for (i = 0; i < RSTRING_LEN(str); i++) {
972
+ for (i = 0; i < str_len; i++)
973
+ {
581
974
  str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
975
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
976
+ {
977
+ return Qfalse;
978
+ }
583
979
  }
584
980
 
585
981
  return Qtrue;
586
982
  }
587
983
 
588
984
  static inline VALUE
589
- each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
985
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
986
+ {
590
987
  int n;
591
988
  unsigned int str_cp;
592
989
  const char *ptr, *end;
@@ -597,9 +994,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
597
994
  end = RSTRING_END(str);
598
995
  enc = rb_enc_get(str);
599
996
 
600
- while (ptr < end) {
997
+ while (ptr < end)
998
+ {
601
999
  str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
1000
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
1001
+ {
1002
+ return Qfalse;
1003
+ }
603
1004
  ptr += n;
604
1005
  }
605
1006
 
@@ -611,105 +1012,236 @@ static inline int
611
1012
  single_byte_optimizable(VALUE str)
612
1013
  {
613
1014
  rb_encoding *enc;
614
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
1015
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
1016
+ {
1017
+ return 1;
1018
+ }
615
1019
 
616
1020
  enc = rb_enc_get(str);
617
- if (rb_enc_mbmaxlen(enc) == 1) return 1;
1021
+ if (rb_enc_mbmaxlen(enc) == 1)
1022
+ {
1023
+ return 1;
1024
+ }
618
1025
 
619
1026
  return 0;
620
1027
  }
621
1028
 
622
1029
  static inline VALUE
623
- each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
- if (single_byte_optimizable(str)) {
625
- return each_sb_cp(str, func, cp_arr);
1030
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1031
+ {
1032
+ if (single_byte_optimizable(str))
1033
+ {
1034
+ return each_sb_cp(str, func, cp_arr, len, data, memo);
626
1035
  }
627
- return each_mb_cp(str, func, cp_arr);
1036
+ return each_mb_cp(str, func, cp_arr, len, data, memo);
628
1037
  }
629
1038
 
630
1039
  static inline void
631
- raise_arg_err_unless_string(VALUE val) {
632
- if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
1040
+ raise_arg_err_unless_string(VALUE val)
1041
+ {
1042
+ if (!RB_TYPE_P(val, T_STRING))
1043
+ {
1044
+ rb_raise(rb_eArgError, "pass a String");
1045
+ }
633
1046
  }
634
1047
 
635
1048
  static VALUE
636
- class_method_of(VALUE self, VALUE str) {
637
- cp_byte *cp_arr;
1049
+ cs_class_method_of(VALUE self, VALUE str)
1050
+ {
1051
+ VALUE new_cs;
1052
+ struct cs_data *new_data;
1053
+ new_cs = cs_alloc(self, &new_data);
638
1054
  raise_arg_err_unless_string(str);
639
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
- each_cp(str, add_str_cp_to_arr, cp_arr);
641
- return NEW_CHARACTER_SET(self, cp_arr);
1055
+ each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1056
+ return new_cs;
642
1057
  }
643
1058
 
644
1059
  static inline int
645
- str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
- return !TSTBIT(cp_arr, str_cp);
1060
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1061
+ {
1062
+ if (tst_cp(cp_arr, len, str_cp))
1063
+ {
1064
+ *memo += 1;
1065
+ }
1066
+ return 1;
647
1067
  }
648
1068
 
649
1069
  static VALUE
650
- method_used_by_p(VALUE self, VALUE str) {
651
- cp_byte *cps;
652
- VALUE only_uses_other_cps;
1070
+ cs_method_count_in(VALUE self, VALUE str)
1071
+ {
1072
+ VALUE count;
1073
+ struct cs_data *data;
653
1074
  raise_arg_err_unless_string(str);
654
- FETCH_CODEPOINTS(self, cps);
655
- only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
- return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1075
+ data = cs_fetch_data(self);
1076
+ count = 0;
1077
+ each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1078
+ return INT2NUM((int)count);
1079
+ }
1080
+
1081
+ static inline int
1082
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1083
+ {
1084
+ return tst_cp(cp_arr, len, str_cp);
1085
+ }
1086
+
1087
+ static VALUE
1088
+ cs_method_cover_p(VALUE self, VALUE str)
1089
+ {
1090
+ struct cs_data *data;
1091
+ raise_arg_err_unless_string(str);
1092
+ data = cs_fetch_data(self);
1093
+ return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
1094
+ }
1095
+
1096
+ static inline int
1097
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1098
+ {
1099
+ if (tst_cp(cp_arr, len, str_cp))
1100
+ {
1101
+ rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1102
+ }
1103
+ return 1;
1104
+ }
1105
+
1106
+ static VALUE
1107
+ cs_method_scan(VALUE self, VALUE str)
1108
+ {
1109
+ VALUE memo[2];
1110
+ struct cs_data *data;
1111
+ raise_arg_err_unless_string(str);
1112
+ data = cs_fetch_data(self);
1113
+ memo[0] = rb_ary_new();
1114
+ memo[1] = (VALUE)rb_enc_get(str);
1115
+ each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1116
+ return memo[0];
657
1117
  }
658
1118
 
659
1119
  static inline int
660
- str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
- return TSTBIT(cp_arr, str_cp);
1120
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1121
+ {
1122
+ return !tst_cp(cp_arr, len, str_cp);
662
1123
  }
663
1124
 
664
1125
  static VALUE
665
- method_cover_p(VALUE self, VALUE str) {
666
- cp_byte *cps;
1126
+ cs_method_used_by_p(VALUE self, VALUE str)
1127
+ {
1128
+ VALUE only_uses_other_cps;
1129
+ struct cs_data *data;
667
1130
  raise_arg_err_unless_string(str);
668
- FETCH_CODEPOINTS(self, cps);
669
- return each_cp(str, str_cp_in_arr, cps);
1131
+ data = cs_fetch_data(self);
1132
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
1133
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1134
+ }
1135
+
1136
+ static void
1137
+ cs_str_buf_cat(VALUE str, const char *ptr, long len)
1138
+ {
1139
+ long total, olen;
1140
+ char *sptr;
1141
+
1142
+ RSTRING_GETMEM(str, sptr, olen);
1143
+ sptr = RSTRING(str)->as.heap.ptr;
1144
+ olen = RSTRING(str)->as.heap.len;
1145
+ total = olen + len;
1146
+ memcpy(sptr + olen, ptr, len);
1147
+ RSTRING(str)->as.heap.len = total;
1148
+ }
1149
+
1150
+ #ifndef TERM_FILL
1151
+ #define TERM_FILL(ptr, termlen) \
1152
+ do \
1153
+ { \
1154
+ char *const term_fill_ptr = (ptr); \
1155
+ const int term_fill_len = (termlen); \
1156
+ *term_fill_ptr = '\0'; \
1157
+ if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1158
+ memset(term_fill_ptr, 0, term_fill_len); \
1159
+ } while (0)
1160
+ #endif
1161
+
1162
+ static void
1163
+ cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1164
+ {
1165
+ char *ptr;
1166
+ long len;
1167
+
1168
+ ptr = RSTRING(str)->as.heap.ptr;
1169
+ len = RSTRING(str)->as.heap.len;
1170
+ TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
670
1171
  }
671
1172
 
672
1173
  static inline VALUE
673
- apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
- cp_byte *cps;
1174
+ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1175
+ {
1176
+ cs_ar *cps;
1177
+ cs_cp len;
675
1178
  rb_encoding *str_enc;
676
- VALUE orig_len, blen, new_str_buf, chr;
677
- int n;
1179
+ VALUE orig_len, new_str_buf;
1180
+ int cp_len;
678
1181
  unsigned int str_cp;
679
1182
  const char *ptr, *end;
680
1183
 
681
1184
  raise_arg_err_unless_string(str);
682
1185
 
683
- FETCH_CODEPOINTS(set, cps);
1186
+ cps = cs_fetch_cps(set, &len);
684
1187
 
685
1188
  orig_len = RSTRING_LEN(str);
686
- blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
- new_str_buf = rb_str_buf_new(blen);
1189
+ if (orig_len < 1) // empty string, will never change
1190
+ {
1191
+ if (bang)
1192
+ {
1193
+ return Qnil;
1194
+ }
1195
+ return rb_str_dup(str);
1196
+ }
1197
+
1198
+ new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
688
1199
  str_enc = rb_enc_get(str);
689
1200
  rb_enc_associate(new_str_buf, str_enc);
690
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
- ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1201
+ rb_str_modify(new_str_buf);
1202
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
1203
 
693
1204
  ptr = RSTRING_PTR(str);
694
1205
  end = RSTRING_END(str);
695
1206
 
696
- while (ptr < end) {
697
- str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
- if (!TSTBIT(cps, str_cp) != !delete) {
699
- chr = rb_enc_uint_chr(str_cp, str_enc);
700
- rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
1207
+ if (single_byte_optimizable(str))
1208
+ {
1209
+ while (ptr < end)
1210
+ {
1211
+ str_cp = *ptr & 0xff;
1212
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1213
+ {
1214
+ cs_str_buf_cat(new_str_buf, ptr, 1);
1215
+ }
1216
+ ptr++;
1217
+ }
1218
+ }
1219
+ else // likely to be multibyte string
1220
+ {
1221
+ while (ptr < end)
1222
+ {
1223
+ str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1224
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1225
+ {
1226
+ cs_str_buf_cat(new_str_buf, ptr, cp_len);
1227
+ }
1228
+ ptr += cp_len;
701
1229
  }
702
- ptr += n;
703
1230
  }
704
1231
 
705
- if (bang) {
706
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
1232
+ cs_str_buf_terminate(new_str_buf, str_enc);
1233
+
1234
+ if (bang)
1235
+ {
1236
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1237
+ {
1238
+ return Qnil;
1239
+ }
707
1240
  rb_str_shared_replace(str, new_str_buf);
708
1241
  }
709
- else {
1242
+ else
1243
+ {
710
1244
  RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
- // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
713
1245
  str = new_str_buf;
714
1246
  }
715
1247
 
@@ -717,98 +1249,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
717
1249
  }
718
1250
 
719
1251
  static VALUE
720
- method_delete_in(VALUE self, VALUE str) {
721
- return apply_to_str(self, str, 1, 0);
1252
+ cs_method_delete_in(VALUE self, VALUE str)
1253
+ {
1254
+ return cs_apply_to_str(self, str, 1, 0);
1255
+ }
1256
+
1257
+ static VALUE
1258
+ cs_method_delete_in_bang(VALUE self, VALUE str)
1259
+ {
1260
+ return cs_apply_to_str(self, str, 1, 1);
722
1261
  }
723
1262
 
724
1263
  static VALUE
725
- method_delete_in_bang(VALUE self, VALUE str) {
726
- return apply_to_str(self, str, 1, 1);
1264
+ cs_method_keep_in(VALUE self, VALUE str)
1265
+ {
1266
+ return cs_apply_to_str(self, str, 0, 0);
727
1267
  }
728
1268
 
729
1269
  static VALUE
730
- method_keep_in(VALUE self, VALUE str) {
731
- return apply_to_str(self, str, 0, 0);
1270
+ cs_method_keep_in_bang(VALUE self, VALUE str)
1271
+ {
1272
+ return cs_apply_to_str(self, str, 0, 1);
732
1273
  }
733
1274
 
734
1275
  static VALUE
735
- method_keep_in_bang(VALUE self, VALUE str) {
736
- return apply_to_str(self, str, 0, 1);
1276
+ cs_method_allocated_length(VALUE self)
1277
+ {
1278
+ return LONG2FIX(cs_fetch_data(self)->len);
737
1279
  }
738
1280
 
739
1281
  // ****
740
1282
  // init
741
1283
  // ****
742
1284
 
743
- void
744
- Init_character_set()
1285
+ void Init_character_set()
745
1286
  {
746
1287
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
1288
 
748
- rb_define_alloc_func(cs, method_allocate);
1289
+ rb_define_alloc_func(cs, cs_method_allocate);
749
1290
 
750
1291
  // `Set` compatibility methods
751
1292
 
752
- rb_define_method(cs, "each", method_each, 0);
753
- rb_define_method(cs, "to_a", method_to_a, -1);
754
- rb_define_method(cs, "length", method_length, 0);
755
- rb_define_method(cs, "size", method_length, 0);
756
- rb_define_method(cs, "count", method_length, 0);
757
- rb_define_method(cs, "empty?", method_empty_p, 0);
758
- rb_define_method(cs, "hash", method_hash, 0);
759
- rb_define_method(cs, "keep_if", method_keep_if, 0);
760
- rb_define_method(cs, "delete_if", method_delete_if, 0);
761
- rb_define_method(cs, "clear", method_clear, 0);
762
- rb_define_method(cs, "intersection", method_intersection, 1);
763
- rb_define_method(cs, "&", method_intersection, 1);
764
- rb_define_method(cs, "union", method_union, 1);
765
- rb_define_method(cs, "+", method_union, 1);
766
- rb_define_method(cs, "|", method_union, 1);
767
- rb_define_method(cs, "difference", method_difference, 1);
768
- rb_define_method(cs, "-", method_difference, 1);
769
- rb_define_method(cs, "^", method_exclusion, 1);
770
- rb_define_method(cs, "include?", method_include_p, 1);
771
- rb_define_method(cs, "member?", method_include_p, 1);
772
- rb_define_method(cs, "===", method_include_p, 1);
773
- rb_define_method(cs, "add", method_add, 1);
774
- rb_define_method(cs, "<<", method_add, 1);
775
- rb_define_method(cs, "add?", method_add_p, 1);
776
- rb_define_method(cs, "delete", method_delete, 1);
777
- rb_define_method(cs, "delete?", method_delete_p, 1);
778
- rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
- rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
- rb_define_method(cs, "eql?", method_eql_p, 1);
781
- rb_define_method(cs, "==", method_eql_p, 1);
782
- rb_define_method(cs, "merge", method_merge, 1);
783
- rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
- rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
- rb_define_method(cs, "subtract", method_subtract, 1);
786
- rb_define_method(cs, "subset?", method_subset_p, 1);
787
- rb_define_method(cs, "<=", method_subset_p, 1);
788
- rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
- rb_define_method(cs, "<", method_proper_subset_p, 1);
790
- rb_define_method(cs, "superset?", method_superset_p, 1);
791
- rb_define_method(cs, ">=", method_superset_p, 1);
792
- rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
- rb_define_method(cs, ">", method_proper_superset_p, 1);
1293
+ rb_define_method(cs, "each", cs_method_each, 0);
1294
+ rb_define_method(cs, "to_a", cs_method_to_a, -1);
1295
+ rb_define_method(cs, "length", cs_method_length, 0);
1296
+ rb_define_method(cs, "size", cs_method_length, 0);
1297
+ rb_define_method(cs, "empty?", cs_method_empty_p, 0);
1298
+ rb_define_method(cs, "hash", cs_method_hash, 0);
1299
+ rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
1300
+ rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
1301
+ rb_define_method(cs, "clear", cs_method_clear, 0);
1302
+ rb_define_method(cs, "min", cs_method_min, 0);
1303
+ rb_define_method(cs, "max", cs_method_max, 0);
1304
+ rb_define_method(cs, "minmax", cs_method_minmax, 0);
1305
+ rb_define_method(cs, "intersection", cs_method_intersection, 1);
1306
+ rb_define_method(cs, "&", cs_method_intersection, 1);
1307
+ rb_define_method(cs, "union", cs_method_union, 1);
1308
+ rb_define_method(cs, "+", cs_method_union, 1);
1309
+ rb_define_method(cs, "|", cs_method_union, 1);
1310
+ rb_define_method(cs, "difference", cs_method_difference, 1);
1311
+ rb_define_method(cs, "-", cs_method_difference, 1);
1312
+ rb_define_method(cs, "^", cs_method_exclusion, 1);
1313
+ rb_define_method(cs, "include?", cs_method_include_p, 1);
1314
+ rb_define_method(cs, "member?", cs_method_include_p, 1);
1315
+ rb_define_method(cs, "===", cs_method_include_p, 1);
1316
+ rb_define_method(cs, "add", cs_method_add, 1);
1317
+ rb_define_method(cs, "<<", cs_method_add, 1);
1318
+ rb_define_method(cs, "add?", cs_method_add_p, 1);
1319
+ rb_define_method(cs, "delete", cs_method_delete, 1);
1320
+ rb_define_method(cs, "delete?", cs_method_delete_p, 1);
1321
+ rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
1322
+ rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
1323
+ rb_define_method(cs, "eql?", cs_method_eql_p, 1);
1324
+ rb_define_method(cs, "==", cs_method_eql_p, 1);
1325
+ rb_define_method(cs, "merge", cs_method_merge, 1);
1326
+ rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
1327
+ rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
1328
+ rb_define_method(cs, "subtract", cs_method_subtract, 1);
1329
+ rb_define_method(cs, "subset?", cs_method_subset_p, 1);
1330
+ rb_define_method(cs, "<=", cs_method_subset_p, 1);
1331
+ rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
1332
+ rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
1333
+ rb_define_method(cs, "superset?", cs_method_superset_p, 1);
1334
+ rb_define_method(cs, ">=", cs_method_superset_p, 1);
1335
+ rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1336
+ rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
794
1337
 
795
1338
  // `CharacterSet`-specific methods
796
1339
 
797
- rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
- rb_define_singleton_method(cs, "of", class_method_of, 1);
799
-
800
- rb_define_method(cs, "ranges", method_ranges, 0);
801
- rb_define_method(cs, "sample", method_sample, -1);
802
- rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
- rb_define_method(cs, "astral_part", method_astral_part, 0);
804
- rb_define_method(cs, "planes", method_planes, 0);
805
- rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
- rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
- rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
- rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
- rb_define_method(cs, "cover?", method_cover_p, 1);
810
- rb_define_method(cs, "delete_in", method_delete_in, 1);
811
- rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
- rb_define_method(cs, "keep_in", method_keep_in, 1);
813
- rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
1340
+ rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1341
+ rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1342
+
1343
+ rb_define_method(cs, "ranges", cs_method_ranges, 0);
1344
+ rb_define_method(cs, "sample", cs_method_sample, -1);
1345
+ rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
1346
+ rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
1347
+ rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
1348
+ rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
1349
+ rb_define_method(cs, "planes", cs_method_planes, 0);
1350
+ rb_define_method(cs, "plane", cs_method_plane, 1);
1351
+ rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
1352
+ rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
1353
+ rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
1354
+ rb_define_method(cs, "count_in", cs_method_count_in, 1);
1355
+ rb_define_method(cs, "cover?", cs_method_cover_p, 1);
1356
+ rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
1357
+ rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
1358
+ rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
1359
+ rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
1360
+ rb_define_method(cs, "scan", cs_method_scan, 1);
1361
+ rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
1362
+ rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
814
1363
  }