character_set 1.1.1 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
7
+ require 'sorted_set'
8
+ else
9
+ require 'set'
10
+ end
6
11
 
7
12
  def benchmark(caption: nil, cases: {})
8
13
  puts caption
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
@@ -10,7 +10,7 @@ Gem::Specification.new do |s|
10
10
  s.email = ['janosch84@gmail.com']
11
11
 
12
12
  s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
13
- s.homepage = 'https://github.com/janosch-x/character_set'
13
+ s.homepage = 'https://github.com/jaynetics/character_set'
14
14
  s.license = 'MIT'
15
15
 
16
16
  s.files = `git ls-files -z`.split("\x0").reject do |f|
@@ -22,12 +22,23 @@ Gem::Specification.new do |s|
22
22
 
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
+ # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
26
+ # This dependency is only used if the C extension is unavailable.
27
+ # JRuby has it in the stdlib.
28
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
+ s.add_dependency 'sorted_set', '~> 1.0'
30
+ end
31
+
25
32
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
- s.add_development_dependency 'bundler', '~> 1.16'
27
- s.add_development_dependency 'rake', '~> 12.0'
28
- s.add_development_dependency 'rake-compiler', '~> 1.0'
33
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
+ s.add_development_dependency 'rake', '~> 13.0'
35
+ s.add_development_dependency 'rake-compiler', '~> 1.1'
29
36
  s.add_development_dependency 'range_compressor', '~> 1.0'
30
- s.add_development_dependency 'regexp_parser', '~> 1.1'
31
- s.add_development_dependency 'regexp_property_values', '~> 0.3.4'
37
+ s.add_development_dependency 'regexp_parser', '~> 1.6'
38
+ s.add_development_dependency 'regexp_property_values', '~> 1.0'
32
39
  s.add_development_dependency 'rspec', '~> 3.8'
40
+ if RUBY_VERSION.to_f >= 2.7
41
+ s.add_development_dependency 'codecov', '~> 0.2.12'
42
+ s.add_development_dependency 'rubocop', '~> 1.8'
43
+ end
33
44
  end
@@ -2,81 +2,180 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "unicode_casefold_table.h"
4
4
 
5
- #define SETBIT(byte_arr, bit) (byte_arr[bit >> 3] |= (1 << (bit & 0x07)))
6
- #define CLRBIT(byte_arr, bit) (byte_arr[bit >> 3] &= ~(1 << (bit & 0x07)))
7
- #define TSTBIT(byte_arr, bit) (byte_arr[bit >> 3] & (1 << (bit & 0x07)))
5
+ #define UNICODE_PLANE_SIZE 0x10000
6
+ #define UNICODE_PLANE_COUNT 17
7
+ #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
8
8
 
9
- typedef char cp_byte;
10
- typedef unsigned long cp_index;
9
+ // start at ascii size
10
+ #define CS_DEFAULT_INITIAL_LEN 128
11
11
 
12
- #define UNICODE_CP_COUNT 0x110000
13
- #define UNICODE_BYTES UNICODE_CP_COUNT / 8
14
- #define UNICODE_PLANE_SIZE 0x10000
15
- #define UNICODE_PLANE_COUNT UNICODE_CP_COUNT / UNICODE_PLANE_SIZE
12
+ typedef char cs_ar;
13
+ typedef unsigned long cs_cp;
14
+
15
+ struct cs_data
16
+ {
17
+ cs_ar *cps;
18
+ cs_cp len;
19
+ };
20
+
21
+ #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
22
+
23
+ static inline void
24
+ add_memspace_for_another_plane(struct cs_data *data)
25
+ {
26
+ data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
27
+ memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
28
+ data->len += UNICODE_PLANE_SIZE;
29
+ }
30
+
31
+ static inline void
32
+ ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
33
+ {
34
+ while (target_cp >= data->len)
35
+ {
36
+ add_memspace_for_another_plane(data);
37
+ }
38
+ }
39
+
40
+ static inline void
41
+ set_cp(struct cs_data *data, cs_cp cp)
42
+ {
43
+ ensure_memsize_fits(data, cp);
44
+ data->cps[cp >> 3] |= (1 << (cp & 0x07));
45
+ }
46
+
47
+ static inline int
48
+ tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
49
+ {
50
+ return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
51
+ }
52
+
53
+ static inline void
54
+ clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
55
+ {
56
+ if (cp < len)
57
+ {
58
+ cps[cp >> 3] &= ~(1 << (cp & 0x07));
59
+ }
60
+ }
16
61
 
17
62
  static void
18
- free_character_set(void* codepoints) {
19
- free(codepoints);
63
+ cs_free(void *ptr)
64
+ {
65
+ struct cs_data *data = ptr;
66
+ ruby_xfree(data->cps);
67
+ ruby_xfree(data);
20
68
  }
21
69
 
22
70
  static size_t
23
- memsize_character_set(const void* codepoints) {
24
- return sizeof(cp_byte) * UNICODE_BYTES;
25
- }
26
-
27
- static const rb_data_type_t
28
- character_set_type = {
29
- .wrap_struct_name = "character_set",
30
- .function = {
31
- .dmark = NULL,
32
- .dfree = free_character_set,
33
- .dsize = memsize_character_set,
34
- },
35
- .data = NULL,
36
- .flags = RUBY_TYPED_FREE_IMMEDIATELY,
71
+ cs_memsize(const void *ptr)
72
+ {
73
+ const struct cs_data *data = ptr;
74
+ return sizeof(*data) + CS_MSIZE(data->len);
75
+ }
76
+
77
+ static const rb_data_type_t cs_type = {
78
+ .wrap_struct_name = "character_set",
79
+ .function = {
80
+ .dmark = NULL,
81
+ .dfree = cs_free,
82
+ .dsize = cs_memsize,
83
+ },
84
+ .data = NULL,
85
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
86
  };
38
87
 
39
- #define FETCH_CODEPOINTS(set, cps)\
40
- TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
88
+ static inline VALUE
89
+ cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
90
+ {
91
+ VALUE cs;
92
+ struct cs_data *data;
93
+ cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
94
+ data->cps = ruby_xmalloc(CS_MSIZE(len));
95
+ memset(data->cps, 0, CS_MSIZE(len));
96
+ data->len = len;
97
+
98
+ if (data_ptr)
99
+ {
100
+ *data_ptr = data;
101
+ }
41
102
 
42
- #define NEW_CHARACTER_SET(klass, cps)\
43
- TypedData_Wrap_Struct(klass, &character_set_type, cps)
103
+ return cs;
104
+ }
44
105
 
45
- static VALUE
46
- method_allocate(VALUE self) {
47
- cp_byte *cp_arr;
48
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
- return NEW_CHARACTER_SET(self, cp_arr);
106
+ static inline VALUE
107
+ cs_alloc(VALUE klass, struct cs_data **data_ptr)
108
+ {
109
+ return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
50
110
  }
51
111
 
52
- #define FOR_EACH_ACTIVE_CODEPOINT(action)\
53
- cp_index cp;\
54
- cp_byte *cps;\
55
- FETCH_CODEPOINTS(self, cps);\
56
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
57
- if (TSTBIT(cps, cp)) { action; }\
58
- }
112
+ static inline struct cs_data *
113
+ cs_fetch_data(VALUE cs)
114
+ {
115
+ struct cs_data *data;
116
+ TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
117
+ return data;
118
+ }
119
+
120
+ static inline cs_ar *
121
+ cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
122
+ {
123
+ struct cs_data *data;
124
+ data = cs_fetch_data(cs);
125
+ *len_ptr = data->len;
126
+ return data->cps;
127
+ }
128
+
129
+ static VALUE
130
+ cs_method_allocate(VALUE self)
131
+ {
132
+ return cs_alloc(self, 0);
133
+ }
134
+
135
+ #define FOR_EACH_ACTIVE_CODEPOINT(action) \
136
+ do \
137
+ { \
138
+ cs_cp cp, len; \
139
+ cs_ar *cps; \
140
+ cps = cs_fetch_cps(self, &len); \
141
+ for (cp = 0; cp < len; cp++) \
142
+ { \
143
+ if (tst_cp(cps, len, cp)) \
144
+ { \
145
+ action; \
146
+ } \
147
+ } \
148
+ } while (0)
59
149
 
60
150
  // ***************************
61
151
  // `Set` compatibility methods
62
152
  // ***************************
63
153
 
64
- static inline VALUE
65
- enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
- cp_index count;
154
+ static inline cs_cp
155
+ cs_active_cp_count(VALUE self)
156
+ {
157
+ cs_cp count;
67
158
  count = 0;
68
159
  FOR_EACH_ACTIVE_CODEPOINT(count++);
69
- return LONG2FIX(count);
160
+ return count;
70
161
  }
71
162
 
72
163
  static VALUE
73
- method_length(VALUE self) {
74
- return enumerator_length(self, 0, 0);
164
+ cs_method_length(VALUE self)
165
+ {
166
+ return LONG2FIX(cs_active_cp_count(self));
167
+ }
168
+
169
+ static inline VALUE
170
+ cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
171
+ {
172
+ return LONG2FIX(cs_active_cp_count(self));
75
173
  }
76
174
 
77
175
  static VALUE
78
- method_each(VALUE self) {
79
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
176
+ cs_method_each(VALUE self)
177
+ {
178
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
80
179
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
180
  return self;
82
181
  }
@@ -84,16 +183,19 @@ method_each(VALUE self) {
84
183
  // returns an Array of codepoint Integers by default.
85
184
  // returns an Array of Strings of length 1 if passed `true`.
86
185
  static VALUE
87
- method_to_a(int argc, VALUE *argv, VALUE self) {
186
+ cs_method_to_a(int argc, VALUE *argv, VALUE self)
187
+ {
88
188
  VALUE arr;
89
189
  rb_encoding *enc;
90
190
  rb_check_arity(argc, 0, 1);
91
191
 
92
192
  arr = rb_ary_new();
93
- if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
193
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
194
+ {
94
195
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
196
  }
96
- else {
197
+ else
198
+ {
97
199
  enc = rb_utf8_encoding();
98
200
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
201
  }
@@ -102,302 +204,473 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
102
204
  }
103
205
 
104
206
  static VALUE
105
- method_empty_p(VALUE self) {
207
+ cs_method_empty_p(VALUE self)
208
+ {
106
209
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
210
  return Qtrue;
108
211
  }
109
212
 
110
213
  static VALUE
111
- method_hash(VALUE self) {
112
- cp_index cp, hash, four_byte_value;
113
- cp_byte *cps;
114
- FETCH_CODEPOINTS(self, cps);
214
+ cs_method_hash(VALUE self)
215
+ {
216
+ cs_cp cp, len, hash, four_byte_value;
217
+ cs_ar *cps;
218
+ cps = cs_fetch_cps(self, &len);
219
+ four_byte_value = 0;
115
220
 
116
221
  hash = 17;
117
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
- if (cp % 32 == 0) {
119
- if (cp != 0) { hash = hash * 23 + four_byte_value; }
222
+ for (cp = 0; cp < len; cp++)
223
+ {
224
+ if (cp % 32 == 0)
225
+ {
226
+ if (cp != 0)
227
+ {
228
+ hash = hash * 23 + four_byte_value;
229
+ }
120
230
  four_byte_value = 0;
121
231
  }
122
- if (TSTBIT(cps, cp)) four_byte_value++;
232
+ if (tst_cp(cps, len, cp))
233
+ {
234
+ four_byte_value++;
235
+ }
123
236
  }
124
237
 
125
238
  return LONG2FIX(hash);
126
239
  }
127
240
 
128
241
  static inline VALUE
129
- delete_if_block_result(VALUE self, int truthy) {
242
+ cs_delete_if_block_result(VALUE self, int truthy)
243
+ {
130
244
  VALUE result;
131
245
  rb_need_block();
132
246
  rb_check_frozen(self);
133
247
  FOR_EACH_ACTIVE_CODEPOINT(
134
- result = rb_yield(LONG2FIX(cp));
135
- if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
- );
248
+ result = rb_yield(LONG2FIX(cp));
249
+ if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
137
250
  return self;
138
251
  }
139
252
 
140
253
  static VALUE
141
- method_delete_if(VALUE self) {
142
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
- return delete_if_block_result(self, 1);
254
+ cs_method_delete_if(VALUE self)
255
+ {
256
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
257
+ return cs_delete_if_block_result(self, 1);
144
258
  }
145
259
 
146
260
  static VALUE
147
- method_keep_if(VALUE self) {
148
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
- return delete_if_block_result(self, 0);
261
+ cs_method_keep_if(VALUE self)
262
+ {
263
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
264
+ return cs_delete_if_block_result(self, 0);
150
265
  }
151
266
 
152
267
  static VALUE
153
- method_clear(VALUE self) {
154
- cp_index cp;
155
- cp_byte *cps;
268
+ cs_method_clear(VALUE self)
269
+ {
270
+ struct cs_data *data;
156
271
  rb_check_frozen(self);
157
- FETCH_CODEPOINTS(self, cps);
158
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
- CLRBIT(cps, cp);
160
- }
272
+ data = cs_fetch_data(self);
273
+ memset(data->cps, 0, CS_MSIZE(data->len));
161
274
  return self;
162
275
  }
163
276
 
164
- #define RETURN_NEW_SET_BASED_ON(condition)\
165
- cp_index cp;\
166
- cp_byte *a, *b, *new_cps;\
167
- FETCH_CODEPOINTS(self, a);\
168
- if (other) FETCH_CODEPOINTS(other, b);\
169
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
170
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
171
- if (condition) SETBIT(new_cps, cp);\
172
- }\
173
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\
277
+ static VALUE
278
+ cs_method_min(VALUE self)
279
+ {
280
+ FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
281
+ return Qnil;
282
+ }
283
+
284
+ static VALUE
285
+ cs_method_max(VALUE self)
286
+ {
287
+ cs_cp len;
288
+ long reverse_idx;
289
+ cs_ar *cps;
290
+ cps = cs_fetch_cps(self, &len);
291
+ for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
292
+ {
293
+ if (tst_cp(cps, len, reverse_idx))
294
+ {
295
+ return LONG2FIX(reverse_idx);
296
+ }
297
+ }
298
+ return Qnil;
299
+ }
300
+
301
+ static VALUE
302
+ cs_method_minmax(VALUE self)
303
+ {
304
+ VALUE arr;
305
+ arr = rb_ary_new2(2);
306
+ rb_ary_push(arr, cs_method_min(self));
307
+ rb_ary_push(arr, cs_method_max(self));
308
+ return arr;
309
+ }
310
+
311
+ #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
312
+ do \
313
+ { \
314
+ VALUE new_cs; \
315
+ cs_cp cp, alen, blen; \
316
+ cs_ar *acps, *bcps; \
317
+ struct cs_data *new_data; \
318
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
+ acps = cs_fetch_cps(cs_a, &alen); \
320
+ bcps = cs_fetch_cps(cs_b, &blen); \
321
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
+ { \
323
+ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
324
+ { \
325
+ set_cp(new_data, cp); \
326
+ } \
327
+ } \
328
+ return new_cs; \
329
+ } while (0)
174
330
 
175
331
  static VALUE
176
- method_intersection(VALUE self, VALUE other) {
177
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
332
+ cs_method_intersection(VALUE self, VALUE other)
333
+ {
334
+ RETURN_COMBINED_CS(self, other, &&);
178
335
  }
179
336
 
180
337
  static VALUE
181
- method_exclusion(VALUE self, VALUE other) {
182
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
338
+ cs_method_exclusion(VALUE self, VALUE other)
339
+ {
340
+ RETURN_COMBINED_CS(self, other, ^);
183
341
  }
184
342
 
185
343
  static VALUE
186
- method_union(VALUE self, VALUE other) {
187
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
344
+ cs_method_union(VALUE self, VALUE other)
345
+ {
346
+ RETURN_COMBINED_CS(self, other, ||);
188
347
  }
189
348
 
190
349
  static VALUE
191
- method_difference(VALUE self, VALUE other) {
192
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
350
+ cs_method_difference(VALUE self, VALUE other)
351
+ {
352
+ RETURN_COMBINED_CS(self, other, >);
193
353
  }
194
354
 
195
355
  static VALUE
196
- method_include_p(VALUE self, VALUE num) {
197
- cp_byte *cps;
198
- FETCH_CODEPOINTS(self, cps);
199
- return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
356
+ cs_method_include_p(VALUE self, VALUE num)
357
+ {
358
+ cs_ar *cps;
359
+ cs_cp len;
360
+ cps = cs_fetch_cps(self, &len);
361
+ return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
362
  }
201
363
 
202
- static inline int
203
- toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
- cp_index cp;
205
- cp_byte *cps;
206
- rb_check_frozen(set);
207
- FETCH_CODEPOINTS(set, cps);
364
+ static inline VALUE
365
+ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
366
+ {
367
+ cs_cp cp, len;
368
+ cs_ar *cps;
369
+ struct cs_data *data;
370
+ rb_check_frozen(cs);
371
+ data = cs_fetch_data(cs);
372
+ cps = data->cps;
373
+ len = data->len;
208
374
  cp = FIX2ULONG(cp_num);
209
- if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
- return 0;
375
+ if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
376
+ {
377
+ return Qnil;
211
378
  }
212
- else {
213
- if (on) { SETBIT(cps, cp); }
214
- else { CLRBIT(cps, cp); }
215
- return 1;
379
+ else
380
+ {
381
+ if (on)
382
+ {
383
+ set_cp(data, cp);
384
+ }
385
+ else
386
+ {
387
+ clr_cp(cps, len, cp);
388
+ }
389
+ return cs;
216
390
  }
217
391
  }
218
392
 
219
393
  static VALUE
220
- method_add(VALUE self, VALUE cp_num) {
221
- return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
394
+ cs_method_add(VALUE self, VALUE cp_num)
395
+ {
396
+ return cs_toggle_codepoint(self, cp_num, 1, 0);
222
397
  }
223
398
 
224
399
  static VALUE
225
- method_add_p(VALUE self, VALUE cp_num) {
226
- return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
400
+ cs_method_add_p(VALUE self, VALUE cp_num)
401
+ {
402
+ return cs_toggle_codepoint(self, cp_num, 1, 1);
227
403
  }
228
404
 
229
405
  static VALUE
230
- method_delete(VALUE self, VALUE cp_num) {
231
- return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
406
+ cs_method_delete(VALUE self, VALUE cp_num)
407
+ {
408
+ return cs_toggle_codepoint(self, cp_num, 0, 0);
232
409
  }
233
410
 
234
411
  static VALUE
235
- method_delete_p(VALUE self, VALUE cp_num) {
236
- return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
412
+ cs_method_delete_p(VALUE self, VALUE cp_num)
413
+ {
414
+ return cs_toggle_codepoint(self, cp_num, 0, 1);
237
415
  }
238
416
 
239
- #define COMPARE_SETS(action)\
240
- cp_index cp;\
241
- cp_byte *cps, *other_cps;\
242
- FETCH_CODEPOINTS(self, cps);\
243
- FETCH_CODEPOINTS(other, other_cps);\
244
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
245
-
246
417
  static VALUE
247
- method_intersect_p(VALUE self, VALUE other) {
248
- COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
418
+ cs_method_intersect_p(VALUE self, VALUE other)
419
+ {
420
+ cs_cp cp, alen, blen;
421
+ cs_ar *acps, *bcps;
422
+ acps = cs_fetch_cps(self, &alen);
423
+ bcps = cs_fetch_cps(other, &blen);
424
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
425
+ {
426
+ if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
427
+ {
428
+ return Qtrue;
429
+ }
430
+ }
249
431
  return Qfalse;
250
432
  }
251
433
 
252
434
  static VALUE
253
- method_disjoint_p(VALUE self, VALUE other) {
254
- return method_intersect_p(self, other) ? Qfalse : Qtrue;
435
+ cs_method_disjoint_p(VALUE self, VALUE other)
436
+ {
437
+ return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
255
438
  }
256
439
 
257
440
  static inline int
258
- is_character_set(VALUE obj) {
259
- return rb_typeddata_is_kind_of(obj, &character_set_type);
441
+ cs_check_type(VALUE obj)
442
+ {
443
+ return rb_typeddata_is_kind_of(obj, &cs_type);
260
444
  }
261
445
 
262
446
  static VALUE
263
- method_eql_p(VALUE self, VALUE other) {
264
- if (!is_character_set(other)) return Qfalse;
265
- if (self == other) return Qtrue; // same object_id
266
-
267
- COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
-
447
+ cs_cps_eql(VALUE cs_a, VALUE cs_b)
448
+ {
449
+ cs_cp cp, alen, blen;
450
+ cs_ar *acps, *bcps;
451
+ acps = cs_fetch_cps(cs_a, &alen);
452
+ bcps = cs_fetch_cps(cs_b, &blen);
453
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
454
+ {
455
+ if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
456
+ {
457
+ return Qfalse;
458
+ }
459
+ }
269
460
  return Qtrue;
270
461
  }
271
462
 
463
+ static VALUE
464
+ cs_method_eql_p(VALUE self, VALUE other)
465
+ {
466
+ if (!cs_check_type(other))
467
+ {
468
+ return Qfalse;
469
+ }
470
+ if (self == other) // same object_id
471
+ {
472
+ return Qtrue;
473
+ }
474
+ return cs_cps_eql(self, other);
475
+ }
476
+
272
477
  static inline VALUE
273
- merge_character_set(VALUE self, VALUE other) {
274
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
- return self;
478
+ cs_merge_cs(VALUE recipient, VALUE source)
479
+ {
480
+ cs_cp cp, source_len;
481
+ struct cs_data *data;
482
+ cs_ar *source_cps;
483
+ data = cs_fetch_data(recipient);
484
+ source_cps = cs_fetch_cps(source, &source_len);
485
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
486
+ {
487
+ if (tst_cp(source_cps, source_len, cp))
488
+ {
489
+ set_cp(data, cp);
490
+ }
491
+ }
492
+ return recipient;
276
493
  }
277
494
 
278
- static inline void
279
- raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
- if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
495
+ static inline cs_cp
496
+ cs_checked_cp(VALUE object_id)
497
+ {
498
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
499
+ {
500
+ return FIX2ULONG(object_id);
501
+ }
281
502
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
503
  }
283
504
 
284
505
  static inline VALUE
285
- merge_rb_range(VALUE self, VALUE rb_range) {
506
+ cs_merge_rb_range(VALUE self, VALUE rb_range)
507
+ {
286
508
  VALUE from_id, upto_id;
509
+ cs_cp from_cp, upto_cp, cont_len, rem;
287
510
  int excl;
288
- cp_index cp;
289
- cp_byte *cps;
290
- FETCH_CODEPOINTS(self, cps);
511
+ struct cs_data *data;
512
+ data = cs_fetch_data(self);
291
513
 
292
- if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
514
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
515
+ {
293
516
  rb_raise(rb_eArgError, "pass a Range");
294
517
  }
295
- if (excl) upto_id -= 2;
518
+ if (excl)
519
+ {
520
+ upto_id -= 2;
521
+ }
522
+
523
+ from_cp = cs_checked_cp(from_id);
524
+ upto_cp = cs_checked_cp(upto_id);
296
525
 
297
- raise_arg_err_unless_valid_as_cp(from_id);
298
- raise_arg_err_unless_valid_as_cp(upto_id);
526
+ if (upto_cp > from_cp && (upto_cp - from_cp > 6))
527
+ {
528
+ // set bits in preceding partially toggled bytes individually
529
+ for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
530
+ {
531
+ set_cp(data, from_cp);
532
+ }
533
+ // memset contiguous bits directly
534
+ cont_len = upto_cp - from_cp + 1;
535
+ rem = cont_len % 8;
536
+ ensure_memsize_fits(data, upto_cp);
537
+ memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
538
+ from_cp = upto_cp - rem + 1;
539
+ }
299
540
 
300
- for (/* */; from_id <= upto_id; from_id += 2) {
301
- cp = FIX2ULONG(from_id);
302
- SETBIT(cps, cp);
541
+ // set bits in partially toggled bytes individually
542
+ for (/* */; from_cp <= upto_cp; from_cp++)
543
+ {
544
+ set_cp(data, from_cp);
303
545
  }
546
+
304
547
  return self;
305
548
  }
306
549
 
307
550
  static inline VALUE
308
- merge_rb_array(VALUE self, VALUE rb_array) {
309
- VALUE el;
310
- cp_byte *cps;
311
- VALUE array_length, i;
312
- FETCH_CODEPOINTS(self, cps);
551
+ cs_merge_rb_array(VALUE self, VALUE rb_array)
552
+ {
553
+ VALUE el, array_length, i;
554
+ struct cs_data *data;
313
555
  Check_Type(rb_array, T_ARRAY);
556
+ data = cs_fetch_data(self);
314
557
  array_length = RARRAY_LEN(rb_array);
315
- for (i = 0; i < array_length; i++) {
558
+ for (i = 0; i < array_length; i++)
559
+ {
316
560
  el = RARRAY_AREF(rb_array, i);
317
- raise_arg_err_unless_valid_as_cp(el);
318
- SETBIT(cps, FIX2ULONG(el));
561
+ set_cp(data, cs_checked_cp(el));
319
562
  }
320
563
  return self;
321
564
  }
322
565
 
323
566
  static VALUE
324
- method_merge(VALUE self, VALUE other) {
567
+ cs_method_merge(VALUE self, VALUE other)
568
+ {
325
569
  rb_check_frozen(self);
326
- if (is_character_set(other)) {
327
- return merge_character_set(self, other);
570
+ if (cs_check_type(other))
571
+ {
572
+ return cs_merge_cs(self, other);
328
573
  }
329
- else if (TYPE(other) == T_ARRAY) {
330
- return merge_rb_array(self, other);
574
+ else if (TYPE(other) == T_ARRAY)
575
+ {
576
+ return cs_merge_rb_array(self, other);
331
577
  }
332
- return merge_rb_range(self, other);
578
+ return cs_merge_rb_range(self, other);
333
579
  }
334
580
 
335
581
  static VALUE
336
- method_initialize_copy(VALUE self, VALUE other) {
337
- merge_character_set(self, other);
338
- return other;
582
+ cs_method_initialize_copy(VALUE self, VALUE orig)
583
+ {
584
+ cs_merge_cs(self, orig);
585
+ return self;
339
586
  }
340
587
 
341
588
  static VALUE
342
- method_subtract(VALUE self, VALUE other) {
589
+ cs_method_subtract(VALUE self, VALUE other)
590
+ {
591
+ cs_cp cp, len, other_len;
592
+ cs_ar *cps, *other_cps;
343
593
  rb_check_frozen(self);
344
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
594
+ cps = cs_fetch_cps(self, &len);
595
+ other_cps = cs_fetch_cps(other, &other_len);
596
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
597
+ {
598
+ if (tst_cp(other_cps, other_len, cp))
599
+ {
600
+ clr_cp(cps, len, cp);
601
+ }
602
+ }
345
603
  return self;
346
604
  }
347
605
 
348
606
  static inline int
349
- a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
- cp_byte *cps_a, *cps_b;
351
- cp_index cp, size_a, size_b;
607
+ cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
608
+ {
609
+ cs_ar *a, *b;
610
+ cs_cp cp, alen, blen, count_a, count_b;
352
611
 
353
- if (!is_character_set(set_a) || !is_character_set(set_b)) {
612
+ if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
613
+ {
354
614
  rb_raise(rb_eArgError, "pass a CharacterSet");
355
615
  }
356
616
 
357
- FETCH_CODEPOINTS(set_a, cps_a);
358
- FETCH_CODEPOINTS(set_b, cps_b);
359
-
360
- *is_proper = 0;
361
- size_a = 0;
362
- size_b = 0;
363
-
364
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
- if (TSTBIT(cps_a, cp)) {
366
- if (!TSTBIT(cps_b, cp)) return 0;
367
- size_a++;
368
- size_b++;
617
+ a = cs_fetch_cps(cs_a, &alen);
618
+ b = cs_fetch_cps(cs_b, &blen);
619
+
620
+ count_a = 0;
621
+ count_b = 0;
622
+
623
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
624
+ {
625
+ if (tst_cp(a, alen, cp))
626
+ {
627
+ if (!tst_cp(b, blen, cp))
628
+ {
629
+ return 0;
630
+ }
631
+ count_a++;
632
+ count_b++;
633
+ }
634
+ else if (tst_cp(b, blen, cp))
635
+ {
636
+ count_b++;
369
637
  }
370
- else if (TSTBIT(cps_b, cp)) size_b++;
371
638
  }
372
639
 
373
- if (size_b > size_a) *is_proper = 1;
640
+ if (is_proper_ptr)
641
+ {
642
+ *is_proper_ptr = count_b > count_a;
643
+ }
644
+
374
645
  return 1;
375
646
  }
376
647
 
377
648
  static VALUE
378
- method_subset_p(VALUE self, VALUE other) {
379
- int is_proper;
380
- return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
649
+ cs_method_subset_p(VALUE self, VALUE other)
650
+ {
651
+ return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
381
652
  }
382
653
 
383
654
  static VALUE
384
- method_proper_subset_p(VALUE self, VALUE other) {
385
- int is, is_proper;
386
- is = a_subset_of_b(self, other, &is_proper);
387
- return (is && is_proper) ? Qtrue : Qfalse;
655
+ cs_method_proper_subset_p(VALUE self, VALUE other)
656
+ {
657
+ int is_subset, is_proper;
658
+ is_subset = cs_a_subset_of_b(self, other, &is_proper);
659
+ return (is_subset && is_proper) ? Qtrue : Qfalse;
388
660
  }
389
661
 
390
662
  static VALUE
391
- method_superset_p(VALUE self, VALUE other) {
392
- int is_proper;
393
- return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
663
+ cs_method_superset_p(VALUE self, VALUE other)
664
+ {
665
+ return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
394
666
  }
395
667
 
396
668
  static VALUE
397
- method_proper_superset_p(VALUE self, VALUE other) {
398
- int is, is_proper;
399
- is = a_subset_of_b(other, self, &is_proper);
400
- return (is && is_proper) ? Qtrue : Qfalse;
669
+ cs_method_proper_superset_p(VALUE self, VALUE other)
670
+ {
671
+ int is_superset, is_proper;
672
+ is_superset = cs_a_subset_of_b(other, self, &is_proper);
673
+ return (is_superset && is_proper) ? Qtrue : Qfalse;
401
674
  }
402
675
 
403
676
  // *******************************
@@ -405,42 +678,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
405
678
  // *******************************
406
679
 
407
680
  static VALUE
408
- class_method_from_ranges(VALUE self, VALUE ranges) {
409
- VALUE new_set, range_count, i;
410
- new_set = rb_class_new_instance(0, 0, self);
681
+ cs_class_method_from_ranges(VALUE self, VALUE ranges)
682
+ {
683
+ VALUE new_cs, range_count, i;
684
+ new_cs = rb_class_new_instance(0, 0, self);
411
685
  range_count = RARRAY_LEN(ranges);
412
- for (i = 0; i < range_count; i++) {
413
- merge_rb_range(new_set, RARRAY_AREF(ranges, i));
686
+ for (i = 0; i < range_count; i++)
687
+ {
688
+ cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
414
689
  }
415
- return new_set;
690
+ return new_cs;
416
691
  }
417
692
 
418
693
  static VALUE
419
- method_ranges(VALUE self) {
420
- VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
694
+ cs_method_ranges(VALUE self)
695
+ {
696
+ VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
421
697
 
422
698
  ranges = rb_ary_new();
423
- previous_codepoint = 0;
699
+ previous_cp_num = 0;
424
700
  current_start = 0;
425
701
  current_end = 0;
426
702
 
427
703
  FOR_EACH_ACTIVE_CODEPOINT(
428
- codepoint = LONG2FIX(cp);
704
+ cp_num = LONG2FIX(cp);
429
705
 
430
- if (!previous_codepoint) {
431
- current_start = codepoint;
432
- }
433
- else if (previous_codepoint + 2 != codepoint) {
434
- // gap found, finalize previous range
435
- rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
- current_start = codepoint;
437
- }
438
- current_end = codepoint;
439
- previous_codepoint = codepoint;
440
- );
706
+ if (!previous_cp_num) {
707
+ current_start = cp_num;
708
+ } else if (previous_cp_num + 2 != cp_num) {
709
+ // gap found, finalize previous range
710
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
711
+ current_start = cp_num;
712
+ } current_end = cp_num;
713
+ previous_cp_num = cp_num;);
441
714
 
442
715
  // add final range
443
- if (current_start) {
716
+ if (current_start)
717
+ {
444
718
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
719
  }
446
720
 
@@ -448,117 +722,233 @@ method_ranges(VALUE self) {
448
722
  }
449
723
 
450
724
  static VALUE
451
- method_sample(int argc, VALUE *argv, VALUE self) {
452
- VALUE to_a_args[1], array;
725
+ cs_method_sample(int argc, VALUE *argv, VALUE self)
726
+ {
727
+ VALUE array, to_a_args[1] = {Qtrue};
453
728
  rb_check_arity(argc, 0, 1);
454
- to_a_args[0] = Qtrue;
455
- array = method_to_a(1, to_a_args, self);
729
+ array = cs_method_to_a(1, to_a_args, self);
456
730
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
731
  }
458
732
 
459
733
  static inline VALUE
460
- new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
- cp_byte *cps, *new_cps;
462
- cp_index cp;
463
- FETCH_CODEPOINTS(set, cps);
464
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
- for (cp = from; cp <= upto; cp++) {
466
- if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
734
+ cs_from_section(VALUE set, cs_cp from, cs_cp upto)
735
+ {
736
+ VALUE new_cs;
737
+ cs_ar *cps;
738
+ cs_cp cp, len;
739
+ struct cs_data *new_data;
740
+ new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
741
+ cps = cs_fetch_cps(set, &len);
742
+ for (cp = from; cp <= upto; cp++)
743
+ {
744
+ if (tst_cp(cps, len, cp))
745
+ {
746
+ set_cp(new_data, cp);
747
+ }
467
748
  }
468
- return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
749
+ return new_cs;
469
750
  }
470
751
 
471
752
  static VALUE
472
- method_bmp_part(VALUE self) {
473
- return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
753
+ cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
754
+ {
755
+ return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
756
+ }
757
+
758
+ static inline cs_cp
759
+ cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
760
+ {
761
+ cs_ar *cps;
762
+ cs_cp cp, count, len;
763
+ cps = cs_fetch_cps(set, &len);
764
+ for (count = 0, cp = from; cp <= upto; cp++)
765
+ {
766
+ if (tst_cp(cps, len, cp))
767
+ {
768
+ count++;
769
+ }
770
+ }
771
+ return count;
474
772
  }
475
773
 
476
774
  static VALUE
477
- method_astral_part(VALUE self) {
478
- return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
775
+ cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
776
+ {
777
+ cs_cp count;
778
+ count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
779
+ return LONG2FIX(count);
479
780
  }
480
781
 
481
782
  static inline VALUE
482
- set_has_member_in_plane(VALUE set, unsigned int plane) {
483
- cp_byte *cps;
484
- cp_index cp, max_cp;
485
- FETCH_CODEPOINTS(set, cps);
486
- cp = plane * UNICODE_PLANE_SIZE;
487
- max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
- for (/* */; cp <= max_cp; cp++) {
489
- if (TSTBIT(cps, cp)) return Qtrue;
783
+ cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
784
+ {
785
+ cs_cp cp;
786
+ for (cp = from; cp <= upto; cp++)
787
+ {
788
+ if (tst_cp(cps, len, cp))
789
+ {
790
+ return Qtrue;
791
+ }
490
792
  }
491
793
  return Qfalse;
492
794
  }
493
795
 
494
796
  static VALUE
495
- method_planes(VALUE self) {
797
+ cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
798
+ {
799
+ cs_ar *cps;
800
+ cs_cp len;
801
+ cps = cs_fetch_cps(self, &len);
802
+ return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
803
+ }
804
+
805
+ static inline VALUE
806
+ cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
807
+ {
808
+ double section_count, total_count;
809
+ section_count = (double)cs_active_cp_count_in_section(set, from, upto);
810
+ total_count = (double)cs_active_cp_count(set);
811
+ return DBL2NUM(section_count / total_count);
812
+ }
813
+
814
+ static VALUE
815
+ cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
816
+ {
817
+ return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
818
+ }
819
+
820
+ #define MAX_CP 0x10FFFF
821
+ #define MAX_ASCII_CP 0x7F
822
+ #define MAX_BMP_CP 0xFFFF
823
+ #define MIN_ASTRAL_CP 0x10000
824
+
825
+ static inline VALUE
826
+ cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
827
+ {
828
+ cs_cp plane_beg, plane_end;
829
+ plane_beg = plane * UNICODE_PLANE_SIZE;
830
+ plane_end = (plane + 1) * MAX_BMP_CP;
831
+ return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
832
+ }
833
+
834
+ static VALUE
835
+ cs_method_planes(VALUE self)
836
+ {
837
+ cs_ar *cps;
838
+ cs_cp len;
496
839
  unsigned int i;
497
840
  VALUE planes;
841
+ cps = cs_fetch_cps(self, &len);
498
842
  planes = rb_ary_new();
499
- for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
- if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
843
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++)
844
+ {
845
+ if (cs_has_cp_in_plane(cps, len, i))
846
+ {
847
+ rb_ary_push(planes, INT2FIX(i));
848
+ }
501
849
  }
502
850
  return planes;
503
851
  }
504
852
 
505
- static VALUE
506
- method_member_in_plane_p(VALUE self, VALUE plane_num) {
853
+ static inline int
854
+ cs_valid_plane_num(VALUE num)
855
+ {
507
856
  int plane;
508
- Check_Type(plane_num, T_FIXNUM);
509
- plane = FIX2INT(plane_num);
510
- if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
- rb_raise(rb_eArgError, "plane must be between 0 and 16");
857
+ Check_Type(num, T_FIXNUM);
858
+ plane = FIX2INT(num);
859
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
860
+ {
861
+ rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
512
862
  }
513
- return set_has_member_in_plane(self, plane);
863
+ return plane;
864
+ }
865
+
866
+ static VALUE
867
+ cs_method_plane(VALUE self, VALUE plane_num)
868
+ {
869
+ cs_cp plane, plane_beg, plane_end;
870
+ plane = cs_valid_plane_num(plane_num);
871
+ plane_beg = plane * UNICODE_PLANE_SIZE;
872
+ plane_end = (plane + 1) * MAX_BMP_CP;
873
+ return cs_from_section(self, plane_beg, plane_end);
874
+ }
875
+
876
+ static VALUE
877
+ cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
878
+ {
879
+ cs_ar *cps;
880
+ cs_cp len;
881
+ unsigned int plane;
882
+ plane = cs_valid_plane_num(plane_num);
883
+ cps = cs_fetch_cps(self, &len);
884
+ return cs_has_cp_in_plane(cps, len, plane);
514
885
  }
515
886
 
516
887
  #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
517
888
 
518
889
  static VALUE
519
- method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
- int include_surrogates;
521
- cp_index upto;
522
- VALUE other;
523
- other = 0;
890
+ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
891
+ {
892
+ int inc_surr;
893
+ cs_cp upto, cp, len;
894
+ cs_ar *cps;
895
+ VALUE new_cs;
896
+ struct cs_data *new_data;
897
+
524
898
  rb_check_arity(argc, 0, 2);
525
- include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
- if ((argc > 1) && FIXNUM_P(argv[1])) {
527
- upto = FIX2ULONG(argv[1]);
528
- RETURN_NEW_SET_BASED_ON(
529
- cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
- );
899
+
900
+ cps = cs_fetch_cps(self, &len);
901
+ inc_surr = argc && argv[0] == Qtrue;
902
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
903
+ upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
904
+
905
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
906
+ {
907
+ if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
908
+ {
909
+ set_cp(new_data, cp);
910
+ }
531
911
  }
532
- RETURN_NEW_SET_BASED_ON(
533
- !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
- );
912
+
913
+ return new_cs;
535
914
  }
536
915
 
537
- typedef int(*str_cp_handler)(unsigned int, cp_byte*);
916
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
538
917
 
539
918
  static inline int
540
- add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
- SETBIT(cp_arr, str_cp);
919
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
920
+ {
921
+ set_cp(data, str_cp);
542
922
  return 1;
543
923
  }
544
924
 
545
925
  static VALUE
546
- method_case_insensitive(VALUE self) {
547
- cp_index i;
548
- cp_byte *new_cps;
549
-
550
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
926
+ cs_method_case_insensitive(VALUE self)
927
+ {
928
+ cs_cp i, len;
929
+ cs_ar *cps;
930
+ VALUE new_cs;
931
+ struct cs_data *new_data;
551
932
 
552
- FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
933
+ cps = cs_fetch_cps(self, &len);
934
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
935
+ cs_merge_cs(new_cs, self);
553
936
 
554
- for (i = 0; i < CASEFOLD_COUNT; i++) {
937
+ for (i = 0; i < CASEFOLD_COUNT; i++)
938
+ {
555
939
  casefold_mapping m = unicode_casefold_table[i];
556
940
 
557
- if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
- else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
941
+ if (tst_cp(cps, len, m.from))
942
+ {
943
+ set_cp(new_data, m.to);
944
+ }
945
+ else if (tst_cp(cps, len, m.to))
946
+ {
947
+ set_cp(new_data, m.from);
948
+ }
559
949
  }
560
950
 
561
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
951
+ return new_cs;
562
952
 
563
953
  // OnigCaseFoldType flags;
564
954
  // rb_encoding *enc;
@@ -573,20 +963,27 @@ method_case_insensitive(VALUE self) {
573
963
  }
574
964
 
575
965
  static inline VALUE
576
- each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
- long i;
966
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
967
+ {
968
+ long i, str_len;
578
969
  unsigned int str_cp;
970
+ str_len = RSTRING_LEN(str);
579
971
 
580
- for (i = 0; i < RSTRING_LEN(str); i++) {
972
+ for (i = 0; i < str_len; i++)
973
+ {
581
974
  str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
975
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
976
+ {
977
+ return Qfalse;
978
+ }
583
979
  }
584
980
 
585
981
  return Qtrue;
586
982
  }
587
983
 
588
984
  static inline VALUE
589
- each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
985
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
986
+ {
590
987
  int n;
591
988
  unsigned int str_cp;
592
989
  const char *ptr, *end;
@@ -597,9 +994,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
597
994
  end = RSTRING_END(str);
598
995
  enc = rb_enc_get(str);
599
996
 
600
- while (ptr < end) {
997
+ while (ptr < end)
998
+ {
601
999
  str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
1000
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
1001
+ {
1002
+ return Qfalse;
1003
+ }
603
1004
  ptr += n;
604
1005
  }
605
1006
 
@@ -611,105 +1012,236 @@ static inline int
611
1012
  single_byte_optimizable(VALUE str)
612
1013
  {
613
1014
  rb_encoding *enc;
614
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
1015
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
1016
+ {
1017
+ return 1;
1018
+ }
615
1019
 
616
1020
  enc = rb_enc_get(str);
617
- if (rb_enc_mbmaxlen(enc) == 1) return 1;
1021
+ if (rb_enc_mbmaxlen(enc) == 1)
1022
+ {
1023
+ return 1;
1024
+ }
618
1025
 
619
1026
  return 0;
620
1027
  }
621
1028
 
622
1029
  static inline VALUE
623
- each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
- if (single_byte_optimizable(str)) {
625
- return each_sb_cp(str, func, cp_arr);
1030
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1031
+ {
1032
+ if (single_byte_optimizable(str))
1033
+ {
1034
+ return each_sb_cp(str, func, cp_arr, len, data, memo);
626
1035
  }
627
- return each_mb_cp(str, func, cp_arr);
1036
+ return each_mb_cp(str, func, cp_arr, len, data, memo);
628
1037
  }
629
1038
 
630
1039
  static inline void
631
- raise_arg_err_unless_string(VALUE val) {
632
- if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
1040
+ raise_arg_err_unless_string(VALUE val)
1041
+ {
1042
+ if (!RB_TYPE_P(val, T_STRING))
1043
+ {
1044
+ rb_raise(rb_eArgError, "pass a String");
1045
+ }
633
1046
  }
634
1047
 
635
1048
  static VALUE
636
- class_method_of(VALUE self, VALUE str) {
637
- cp_byte *cp_arr;
1049
+ cs_class_method_of(VALUE self, VALUE str)
1050
+ {
1051
+ VALUE new_cs;
1052
+ struct cs_data *new_data;
1053
+ new_cs = cs_alloc(self, &new_data);
638
1054
  raise_arg_err_unless_string(str);
639
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
- each_cp(str, add_str_cp_to_arr, cp_arr);
641
- return NEW_CHARACTER_SET(self, cp_arr);
1055
+ each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1056
+ return new_cs;
642
1057
  }
643
1058
 
644
1059
  static inline int
645
- str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
- return !TSTBIT(cp_arr, str_cp);
1060
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1061
+ {
1062
+ if (tst_cp(cp_arr, len, str_cp))
1063
+ {
1064
+ *memo += 1;
1065
+ }
1066
+ return 1;
647
1067
  }
648
1068
 
649
1069
  static VALUE
650
- method_used_by_p(VALUE self, VALUE str) {
651
- cp_byte *cps;
652
- VALUE only_uses_other_cps;
1070
+ cs_method_count_in(VALUE self, VALUE str)
1071
+ {
1072
+ VALUE count;
1073
+ struct cs_data *data;
653
1074
  raise_arg_err_unless_string(str);
654
- FETCH_CODEPOINTS(self, cps);
655
- only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
- return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1075
+ data = cs_fetch_data(self);
1076
+ count = 0;
1077
+ each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1078
+ return INT2NUM((int)count);
1079
+ }
1080
+
1081
+ static inline int
1082
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1083
+ {
1084
+ return tst_cp(cp_arr, len, str_cp);
1085
+ }
1086
+
1087
+ static VALUE
1088
+ cs_method_cover_p(VALUE self, VALUE str)
1089
+ {
1090
+ struct cs_data *data;
1091
+ raise_arg_err_unless_string(str);
1092
+ data = cs_fetch_data(self);
1093
+ return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
1094
+ }
1095
+
1096
+ static inline int
1097
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1098
+ {
1099
+ if (tst_cp(cp_arr, len, str_cp))
1100
+ {
1101
+ rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1102
+ }
1103
+ return 1;
1104
+ }
1105
+
1106
+ static VALUE
1107
+ cs_method_scan(VALUE self, VALUE str)
1108
+ {
1109
+ VALUE memo[2];
1110
+ struct cs_data *data;
1111
+ raise_arg_err_unless_string(str);
1112
+ data = cs_fetch_data(self);
1113
+ memo[0] = rb_ary_new();
1114
+ memo[1] = (VALUE)rb_enc_get(str);
1115
+ each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1116
+ return memo[0];
657
1117
  }
658
1118
 
659
1119
  static inline int
660
- str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
- return TSTBIT(cp_arr, str_cp);
1120
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1121
+ {
1122
+ return !tst_cp(cp_arr, len, str_cp);
662
1123
  }
663
1124
 
664
1125
  static VALUE
665
- method_cover_p(VALUE self, VALUE str) {
666
- cp_byte *cps;
1126
+ cs_method_used_by_p(VALUE self, VALUE str)
1127
+ {
1128
+ VALUE only_uses_other_cps;
1129
+ struct cs_data *data;
667
1130
  raise_arg_err_unless_string(str);
668
- FETCH_CODEPOINTS(self, cps);
669
- return each_cp(str, str_cp_in_arr, cps);
1131
+ data = cs_fetch_data(self);
1132
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
1133
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1134
+ }
1135
+
1136
+ static void
1137
+ cs_str_buf_cat(VALUE str, const char *ptr, long len)
1138
+ {
1139
+ long total, olen;
1140
+ char *sptr;
1141
+
1142
+ RSTRING_GETMEM(str, sptr, olen);
1143
+ sptr = RSTRING(str)->as.heap.ptr;
1144
+ olen = RSTRING(str)->as.heap.len;
1145
+ total = olen + len;
1146
+ memcpy(sptr + olen, ptr, len);
1147
+ RSTRING(str)->as.heap.len = total;
1148
+ }
1149
+
1150
+ #ifndef TERM_FILL
1151
+ #define TERM_FILL(ptr, termlen) \
1152
+ do \
1153
+ { \
1154
+ char *const term_fill_ptr = (ptr); \
1155
+ const int term_fill_len = (termlen); \
1156
+ *term_fill_ptr = '\0'; \
1157
+ if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1158
+ memset(term_fill_ptr, 0, term_fill_len); \
1159
+ } while (0)
1160
+ #endif
1161
+
1162
+ static void
1163
+ cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1164
+ {
1165
+ char *ptr;
1166
+ long len;
1167
+
1168
+ ptr = RSTRING(str)->as.heap.ptr;
1169
+ len = RSTRING(str)->as.heap.len;
1170
+ TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
670
1171
  }
671
1172
 
672
1173
  static inline VALUE
673
- apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
- cp_byte *cps;
1174
+ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1175
+ {
1176
+ cs_ar *cps;
1177
+ cs_cp len;
675
1178
  rb_encoding *str_enc;
676
- VALUE orig_len, blen, new_str_buf, chr;
677
- int n;
1179
+ VALUE orig_len, new_str_buf;
1180
+ int cp_len;
678
1181
  unsigned int str_cp;
679
1182
  const char *ptr, *end;
680
1183
 
681
1184
  raise_arg_err_unless_string(str);
682
1185
 
683
- FETCH_CODEPOINTS(set, cps);
1186
+ cps = cs_fetch_cps(set, &len);
684
1187
 
685
1188
  orig_len = RSTRING_LEN(str);
686
- blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
- new_str_buf = rb_str_buf_new(blen);
1189
+ if (orig_len < 1) // empty string, will never change
1190
+ {
1191
+ if (bang)
1192
+ {
1193
+ return Qnil;
1194
+ }
1195
+ return rb_str_dup(str);
1196
+ }
1197
+
1198
+ new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
688
1199
  str_enc = rb_enc_get(str);
689
1200
  rb_enc_associate(new_str_buf, str_enc);
690
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
- ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1201
+ rb_str_modify(new_str_buf);
1202
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
1203
 
693
1204
  ptr = RSTRING_PTR(str);
694
1205
  end = RSTRING_END(str);
695
1206
 
696
- while (ptr < end) {
697
- str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
- if (!TSTBIT(cps, str_cp) != !delete) {
699
- chr = rb_enc_uint_chr(str_cp, str_enc);
700
- rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
1207
+ if (single_byte_optimizable(str))
1208
+ {
1209
+ while (ptr < end)
1210
+ {
1211
+ str_cp = *ptr & 0xff;
1212
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1213
+ {
1214
+ cs_str_buf_cat(new_str_buf, ptr, 1);
1215
+ }
1216
+ ptr++;
1217
+ }
1218
+ }
1219
+ else // likely to be multibyte string
1220
+ {
1221
+ while (ptr < end)
1222
+ {
1223
+ str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1224
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1225
+ {
1226
+ cs_str_buf_cat(new_str_buf, ptr, cp_len);
1227
+ }
1228
+ ptr += cp_len;
701
1229
  }
702
- ptr += n;
703
1230
  }
704
1231
 
705
- if (bang) {
706
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
1232
+ cs_str_buf_terminate(new_str_buf, str_enc);
1233
+
1234
+ if (bang)
1235
+ {
1236
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1237
+ {
1238
+ return Qnil;
1239
+ }
707
1240
  rb_str_shared_replace(str, new_str_buf);
708
1241
  }
709
- else {
1242
+ else
1243
+ {
710
1244
  RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
- // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
713
1245
  str = new_str_buf;
714
1246
  }
715
1247
 
@@ -717,98 +1249,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
717
1249
  }
718
1250
 
719
1251
  static VALUE
720
- method_delete_in(VALUE self, VALUE str) {
721
- return apply_to_str(self, str, 1, 0);
1252
+ cs_method_delete_in(VALUE self, VALUE str)
1253
+ {
1254
+ return cs_apply_to_str(self, str, 1, 0);
1255
+ }
1256
+
1257
+ static VALUE
1258
+ cs_method_delete_in_bang(VALUE self, VALUE str)
1259
+ {
1260
+ return cs_apply_to_str(self, str, 1, 1);
722
1261
  }
723
1262
 
724
1263
  static VALUE
725
- method_delete_in_bang(VALUE self, VALUE str) {
726
- return apply_to_str(self, str, 1, 1);
1264
+ cs_method_keep_in(VALUE self, VALUE str)
1265
+ {
1266
+ return cs_apply_to_str(self, str, 0, 0);
727
1267
  }
728
1268
 
729
1269
  static VALUE
730
- method_keep_in(VALUE self, VALUE str) {
731
- return apply_to_str(self, str, 0, 0);
1270
+ cs_method_keep_in_bang(VALUE self, VALUE str)
1271
+ {
1272
+ return cs_apply_to_str(self, str, 0, 1);
732
1273
  }
733
1274
 
734
1275
  static VALUE
735
- method_keep_in_bang(VALUE self, VALUE str) {
736
- return apply_to_str(self, str, 0, 1);
1276
+ cs_method_allocated_length(VALUE self)
1277
+ {
1278
+ return LONG2FIX(cs_fetch_data(self)->len);
737
1279
  }
738
1280
 
739
1281
  // ****
740
1282
  // init
741
1283
  // ****
742
1284
 
743
- void
744
- Init_character_set()
1285
+ void Init_character_set()
745
1286
  {
746
1287
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
1288
 
748
- rb_define_alloc_func(cs, method_allocate);
1289
+ rb_define_alloc_func(cs, cs_method_allocate);
749
1290
 
750
1291
  // `Set` compatibility methods
751
1292
 
752
- rb_define_method(cs, "each", method_each, 0);
753
- rb_define_method(cs, "to_a", method_to_a, -1);
754
- rb_define_method(cs, "length", method_length, 0);
755
- rb_define_method(cs, "size", method_length, 0);
756
- rb_define_method(cs, "count", method_length, 0);
757
- rb_define_method(cs, "empty?", method_empty_p, 0);
758
- rb_define_method(cs, "hash", method_hash, 0);
759
- rb_define_method(cs, "keep_if", method_keep_if, 0);
760
- rb_define_method(cs, "delete_if", method_delete_if, 0);
761
- rb_define_method(cs, "clear", method_clear, 0);
762
- rb_define_method(cs, "intersection", method_intersection, 1);
763
- rb_define_method(cs, "&", method_intersection, 1);
764
- rb_define_method(cs, "union", method_union, 1);
765
- rb_define_method(cs, "+", method_union, 1);
766
- rb_define_method(cs, "|", method_union, 1);
767
- rb_define_method(cs, "difference", method_difference, 1);
768
- rb_define_method(cs, "-", method_difference, 1);
769
- rb_define_method(cs, "^", method_exclusion, 1);
770
- rb_define_method(cs, "include?", method_include_p, 1);
771
- rb_define_method(cs, "member?", method_include_p, 1);
772
- rb_define_method(cs, "===", method_include_p, 1);
773
- rb_define_method(cs, "add", method_add, 1);
774
- rb_define_method(cs, "<<", method_add, 1);
775
- rb_define_method(cs, "add?", method_add_p, 1);
776
- rb_define_method(cs, "delete", method_delete, 1);
777
- rb_define_method(cs, "delete?", method_delete_p, 1);
778
- rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
- rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
- rb_define_method(cs, "eql?", method_eql_p, 1);
781
- rb_define_method(cs, "==", method_eql_p, 1);
782
- rb_define_method(cs, "merge", method_merge, 1);
783
- rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
- rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
- rb_define_method(cs, "subtract", method_subtract, 1);
786
- rb_define_method(cs, "subset?", method_subset_p, 1);
787
- rb_define_method(cs, "<=", method_subset_p, 1);
788
- rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
- rb_define_method(cs, "<", method_proper_subset_p, 1);
790
- rb_define_method(cs, "superset?", method_superset_p, 1);
791
- rb_define_method(cs, ">=", method_superset_p, 1);
792
- rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
- rb_define_method(cs, ">", method_proper_superset_p, 1);
1293
+ rb_define_method(cs, "each", cs_method_each, 0);
1294
+ rb_define_method(cs, "to_a", cs_method_to_a, -1);
1295
+ rb_define_method(cs, "length", cs_method_length, 0);
1296
+ rb_define_method(cs, "size", cs_method_length, 0);
1297
+ rb_define_method(cs, "empty?", cs_method_empty_p, 0);
1298
+ rb_define_method(cs, "hash", cs_method_hash, 0);
1299
+ rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
1300
+ rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
1301
+ rb_define_method(cs, "clear", cs_method_clear, 0);
1302
+ rb_define_method(cs, "min", cs_method_min, 0);
1303
+ rb_define_method(cs, "max", cs_method_max, 0);
1304
+ rb_define_method(cs, "minmax", cs_method_minmax, 0);
1305
+ rb_define_method(cs, "intersection", cs_method_intersection, 1);
1306
+ rb_define_method(cs, "&", cs_method_intersection, 1);
1307
+ rb_define_method(cs, "union", cs_method_union, 1);
1308
+ rb_define_method(cs, "+", cs_method_union, 1);
1309
+ rb_define_method(cs, "|", cs_method_union, 1);
1310
+ rb_define_method(cs, "difference", cs_method_difference, 1);
1311
+ rb_define_method(cs, "-", cs_method_difference, 1);
1312
+ rb_define_method(cs, "^", cs_method_exclusion, 1);
1313
+ rb_define_method(cs, "include?", cs_method_include_p, 1);
1314
+ rb_define_method(cs, "member?", cs_method_include_p, 1);
1315
+ rb_define_method(cs, "===", cs_method_include_p, 1);
1316
+ rb_define_method(cs, "add", cs_method_add, 1);
1317
+ rb_define_method(cs, "<<", cs_method_add, 1);
1318
+ rb_define_method(cs, "add?", cs_method_add_p, 1);
1319
+ rb_define_method(cs, "delete", cs_method_delete, 1);
1320
+ rb_define_method(cs, "delete?", cs_method_delete_p, 1);
1321
+ rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
1322
+ rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
1323
+ rb_define_method(cs, "eql?", cs_method_eql_p, 1);
1324
+ rb_define_method(cs, "==", cs_method_eql_p, 1);
1325
+ rb_define_method(cs, "merge", cs_method_merge, 1);
1326
+ rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
1327
+ rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
1328
+ rb_define_method(cs, "subtract", cs_method_subtract, 1);
1329
+ rb_define_method(cs, "subset?", cs_method_subset_p, 1);
1330
+ rb_define_method(cs, "<=", cs_method_subset_p, 1);
1331
+ rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
1332
+ rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
1333
+ rb_define_method(cs, "superset?", cs_method_superset_p, 1);
1334
+ rb_define_method(cs, ">=", cs_method_superset_p, 1);
1335
+ rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1336
+ rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
794
1337
 
795
1338
  // `CharacterSet`-specific methods
796
1339
 
797
- rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
- rb_define_singleton_method(cs, "of", class_method_of, 1);
799
-
800
- rb_define_method(cs, "ranges", method_ranges, 0);
801
- rb_define_method(cs, "sample", method_sample, -1);
802
- rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
- rb_define_method(cs, "astral_part", method_astral_part, 0);
804
- rb_define_method(cs, "planes", method_planes, 0);
805
- rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
- rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
- rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
- rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
- rb_define_method(cs, "cover?", method_cover_p, 1);
810
- rb_define_method(cs, "delete_in", method_delete_in, 1);
811
- rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
- rb_define_method(cs, "keep_in", method_keep_in, 1);
813
- rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
1340
+ rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1341
+ rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1342
+
1343
+ rb_define_method(cs, "ranges", cs_method_ranges, 0);
1344
+ rb_define_method(cs, "sample", cs_method_sample, -1);
1345
+ rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
1346
+ rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
1347
+ rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
1348
+ rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
1349
+ rb_define_method(cs, "planes", cs_method_planes, 0);
1350
+ rb_define_method(cs, "plane", cs_method_plane, 1);
1351
+ rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
1352
+ rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
1353
+ rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
1354
+ rb_define_method(cs, "count_in", cs_method_count_in, 1);
1355
+ rb_define_method(cs, "cover?", cs_method_cover_p, 1);
1356
+ rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
1357
+ rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
1358
+ rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
1359
+ rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
1360
+ rb_define_method(cs, "scan", cs_method_scan, 1);
1361
+ rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
1362
+ rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
814
1363
  }