character_set 1.2.0-java → 1.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +22 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +17 -0
  9. data/BENCHMARK.md +53 -17
  10. data/CHANGELOG.md +54 -0
  11. data/README.md +51 -12
  12. data/Rakefile +20 -18
  13. data/benchmarks/count_in.rb +13 -0
  14. data/benchmarks/delete_in.rb +1 -1
  15. data/benchmarks/scan.rb +13 -0
  16. data/benchmarks/shared.rb +5 -0
  17. data/benchmarks/z_add.rb +12 -0
  18. data/benchmarks/z_delete.rb +12 -0
  19. data/benchmarks/z_merge.rb +15 -0
  20. data/benchmarks/z_minmax.rb +12 -0
  21. data/bin/console +2 -0
  22. data/character_set.gemspec +17 -4
  23. data/ext/character_set/character_set.c +969 -415
  24. data/ext/character_set/unicode_casefold_table.h +44 -1
  25. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  26. data/lib/character_set/character.rb +1 -1
  27. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  28. data/lib/character_set/core_ext/string_ext.rb +3 -1
  29. data/lib/character_set/expression_converter.rb +41 -43
  30. data/lib/character_set/parser.rb +1 -1
  31. data/lib/character_set/predefined_sets/any.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  33. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  34. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  35. data/lib/character_set/predefined_sets/assigned.cps +677 -0
  36. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  37. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  38. data/lib/character_set/predefined_sets/emoji.cps +152 -0
  39. data/lib/character_set/predefined_sets/newline.cps +3 -0
  40. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  41. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  42. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  43. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  44. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  45. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  46. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  47. data/lib/character_set/predefined_sets.rb +25 -260
  48. data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
  49. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  50. data/lib/character_set/ruby_fallback.rb +5 -3
  51. data/lib/character_set/set_method_adapters.rb +4 -3
  52. data/lib/character_set/shared_methods.rb +69 -50
  53. data/lib/character_set/version.rb +1 -1
  54. data/lib/character_set/writer.rb +98 -27
  55. metadata +114 -17
  56. data/.travis.yml +0 -8
  57. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -2,81 +2,180 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "unicode_casefold_table.h"
4
4
 
5
- #define SETBIT(byte_arr, bit) (byte_arr[bit >> 3] |= (1 << (bit & 0x07)))
6
- #define CLRBIT(byte_arr, bit) (byte_arr[bit >> 3] &= ~(1 << (bit & 0x07)))
7
- #define TSTBIT(byte_arr, bit) (byte_arr[bit >> 3] & (1 << (bit & 0x07)))
5
+ #define UNICODE_PLANE_SIZE 0x10000
6
+ #define UNICODE_PLANE_COUNT 17
7
+ #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
8
8
 
9
- typedef char cp_byte;
10
- typedef unsigned long cp_index;
9
+ // start at ascii size
10
+ #define CS_DEFAULT_INITIAL_LEN 128
11
11
 
12
- #define UNICODE_CP_COUNT 0x110000
13
- #define UNICODE_BYTES UNICODE_CP_COUNT / 8
14
- #define UNICODE_PLANE_SIZE 0x10000
15
- #define UNICODE_PLANE_COUNT UNICODE_CP_COUNT / UNICODE_PLANE_SIZE
12
+ typedef char cs_ar;
13
+ typedef unsigned long cs_cp;
14
+
15
+ struct cs_data
16
+ {
17
+ cs_ar *cps;
18
+ cs_cp len;
19
+ };
20
+
21
+ #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
22
+
23
+ static inline void
24
+ add_memspace_for_another_plane(struct cs_data *data)
25
+ {
26
+ data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
27
+ memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
28
+ data->len += UNICODE_PLANE_SIZE;
29
+ }
30
+
31
+ static inline void
32
+ ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
33
+ {
34
+ while (target_cp >= data->len)
35
+ {
36
+ add_memspace_for_another_plane(data);
37
+ }
38
+ }
39
+
40
+ static inline void
41
+ set_cp(struct cs_data *data, cs_cp cp)
42
+ {
43
+ ensure_memsize_fits(data, cp);
44
+ data->cps[cp >> 3] |= (1 << (cp & 0x07));
45
+ }
46
+
47
+ static inline int
48
+ tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
49
+ {
50
+ return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
51
+ }
52
+
53
+ static inline void
54
+ clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
55
+ {
56
+ if (cp < len)
57
+ {
58
+ cps[cp >> 3] &= ~(1 << (cp & 0x07));
59
+ }
60
+ }
16
61
 
17
62
  static void
18
- free_character_set(void* codepoints) {
19
- free(codepoints);
63
+ cs_free(void *ptr)
64
+ {
65
+ struct cs_data *data = ptr;
66
+ ruby_xfree(data->cps);
67
+ ruby_xfree(data);
20
68
  }
21
69
 
22
70
  static size_t
23
- memsize_character_set(const void* codepoints) {
24
- return sizeof(cp_byte) * UNICODE_BYTES;
25
- }
26
-
27
- static const rb_data_type_t
28
- character_set_type = {
29
- .wrap_struct_name = "character_set",
30
- .function = {
31
- .dmark = NULL,
32
- .dfree = free_character_set,
33
- .dsize = memsize_character_set,
34
- },
35
- .data = NULL,
36
- .flags = RUBY_TYPED_FREE_IMMEDIATELY,
71
+ cs_memsize(const void *ptr)
72
+ {
73
+ const struct cs_data *data = ptr;
74
+ return sizeof(*data) + CS_MSIZE(data->len);
75
+ }
76
+
77
+ static const rb_data_type_t cs_type = {
78
+ .wrap_struct_name = "character_set",
79
+ .function = {
80
+ .dmark = NULL,
81
+ .dfree = cs_free,
82
+ .dsize = cs_memsize,
83
+ },
84
+ .data = NULL,
85
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
86
  };
38
87
 
39
- #define FETCH_CODEPOINTS(set, cps)\
40
- TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
88
+ static inline VALUE
89
+ cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
90
+ {
91
+ VALUE cs;
92
+ struct cs_data *data;
93
+ cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
94
+ data->cps = ruby_xmalloc(CS_MSIZE(len));
95
+ memset(data->cps, 0, CS_MSIZE(len));
96
+ data->len = len;
97
+
98
+ if (data_ptr)
99
+ {
100
+ *data_ptr = data;
101
+ }
41
102
 
42
- #define NEW_CHARACTER_SET(klass, cps)\
43
- TypedData_Wrap_Struct(klass, &character_set_type, cps)
103
+ return cs;
104
+ }
44
105
 
45
- static VALUE
46
- method_allocate(VALUE self) {
47
- cp_byte *cp_arr;
48
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
- return NEW_CHARACTER_SET(self, cp_arr);
106
+ static inline VALUE
107
+ cs_alloc(VALUE klass, struct cs_data **data_ptr)
108
+ {
109
+ return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
50
110
  }
51
111
 
52
- #define FOR_EACH_ACTIVE_CODEPOINT(action)\
53
- cp_index cp;\
54
- cp_byte *cps;\
55
- FETCH_CODEPOINTS(self, cps);\
56
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
57
- if (TSTBIT(cps, cp)) { action; }\
58
- }
112
+ static inline struct cs_data *
113
+ cs_fetch_data(VALUE cs)
114
+ {
115
+ struct cs_data *data;
116
+ TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
117
+ return data;
118
+ }
119
+
120
+ static inline cs_ar *
121
+ cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
122
+ {
123
+ struct cs_data *data;
124
+ data = cs_fetch_data(cs);
125
+ *len_ptr = data->len;
126
+ return data->cps;
127
+ }
128
+
129
+ static VALUE
130
+ cs_method_allocate(VALUE self)
131
+ {
132
+ return cs_alloc(self, 0);
133
+ }
134
+
135
+ #define FOR_EACH_ACTIVE_CODEPOINT(action) \
136
+ do \
137
+ { \
138
+ cs_cp cp, len; \
139
+ cs_ar *cps; \
140
+ cps = cs_fetch_cps(self, &len); \
141
+ for (cp = 0; cp < len; cp++) \
142
+ { \
143
+ if (tst_cp(cps, len, cp)) \
144
+ { \
145
+ action; \
146
+ } \
147
+ } \
148
+ } while (0)
59
149
 
60
150
  // ***************************
61
151
  // `Set` compatibility methods
62
152
  // ***************************
63
153
 
64
- static inline VALUE
65
- enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
- cp_index count;
154
+ static inline cs_cp
155
+ cs_active_cp_count(VALUE self)
156
+ {
157
+ cs_cp count;
67
158
  count = 0;
68
159
  FOR_EACH_ACTIVE_CODEPOINT(count++);
69
- return LONG2FIX(count);
160
+ return count;
70
161
  }
71
162
 
72
163
  static VALUE
73
- method_length(VALUE self) {
74
- return enumerator_length(self, 0, 0);
164
+ cs_method_length(VALUE self)
165
+ {
166
+ return LONG2FIX(cs_active_cp_count(self));
167
+ }
168
+
169
+ static inline VALUE
170
+ cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
171
+ {
172
+ return LONG2FIX(cs_active_cp_count(self));
75
173
  }
76
174
 
77
175
  static VALUE
78
- method_each(VALUE self) {
79
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
176
+ cs_method_each(VALUE self)
177
+ {
178
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
80
179
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
180
  return self;
82
181
  }
@@ -84,16 +183,19 @@ method_each(VALUE self) {
84
183
  // returns an Array of codepoint Integers by default.
85
184
  // returns an Array of Strings of length 1 if passed `true`.
86
185
  static VALUE
87
- method_to_a(int argc, VALUE *argv, VALUE self) {
186
+ cs_method_to_a(int argc, VALUE *argv, VALUE self)
187
+ {
88
188
  VALUE arr;
89
189
  rb_encoding *enc;
90
190
  rb_check_arity(argc, 0, 1);
91
191
 
92
192
  arr = rb_ary_new();
93
- if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
193
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
194
+ {
94
195
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
196
  }
96
- else {
197
+ else
198
+ {
97
199
  enc = rb_utf8_encoding();
98
200
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
201
  }
@@ -102,302 +204,473 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
102
204
  }
103
205
 
104
206
  static VALUE
105
- method_empty_p(VALUE self) {
207
+ cs_method_empty_p(VALUE self)
208
+ {
106
209
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
210
  return Qtrue;
108
211
  }
109
212
 
110
213
  static VALUE
111
- method_hash(VALUE self) {
112
- cp_index cp, hash, four_byte_value;
113
- cp_byte *cps;
114
- FETCH_CODEPOINTS(self, cps);
214
+ cs_method_hash(VALUE self)
215
+ {
216
+ cs_cp cp, len, hash, four_byte_value;
217
+ cs_ar *cps;
218
+ cps = cs_fetch_cps(self, &len);
219
+ four_byte_value = 0;
115
220
 
116
221
  hash = 17;
117
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
- if (cp % 32 == 0) {
119
- if (cp != 0) { hash = hash * 23 + four_byte_value; }
222
+ for (cp = 0; cp < len; cp++)
223
+ {
224
+ if (cp % 32 == 0)
225
+ {
226
+ if (cp != 0)
227
+ {
228
+ hash = hash * 23 + four_byte_value;
229
+ }
120
230
  four_byte_value = 0;
121
231
  }
122
- if (TSTBIT(cps, cp)) four_byte_value++;
232
+ if (tst_cp(cps, len, cp))
233
+ {
234
+ four_byte_value++;
235
+ }
123
236
  }
124
237
 
125
238
  return LONG2FIX(hash);
126
239
  }
127
240
 
128
241
  static inline VALUE
129
- delete_if_block_result(VALUE self, int truthy) {
242
+ cs_delete_if_block_result(VALUE self, int truthy)
243
+ {
130
244
  VALUE result;
131
245
  rb_need_block();
132
246
  rb_check_frozen(self);
133
247
  FOR_EACH_ACTIVE_CODEPOINT(
134
- result = rb_yield(LONG2FIX(cp));
135
- if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
- );
248
+ result = rb_yield(LONG2FIX(cp));
249
+ if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
137
250
  return self;
138
251
  }
139
252
 
140
253
  static VALUE
141
- method_delete_if(VALUE self) {
142
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
- return delete_if_block_result(self, 1);
254
+ cs_method_delete_if(VALUE self)
255
+ {
256
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
257
+ return cs_delete_if_block_result(self, 1);
144
258
  }
145
259
 
146
260
  static VALUE
147
- method_keep_if(VALUE self) {
148
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
- return delete_if_block_result(self, 0);
261
+ cs_method_keep_if(VALUE self)
262
+ {
263
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
264
+ return cs_delete_if_block_result(self, 0);
150
265
  }
151
266
 
152
267
  static VALUE
153
- method_clear(VALUE self) {
154
- cp_index cp;
155
- cp_byte *cps;
268
+ cs_method_clear(VALUE self)
269
+ {
270
+ struct cs_data *data;
156
271
  rb_check_frozen(self);
157
- FETCH_CODEPOINTS(self, cps);
158
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
- CLRBIT(cps, cp);
160
- }
272
+ data = cs_fetch_data(self);
273
+ memset(data->cps, 0, CS_MSIZE(data->len));
161
274
  return self;
162
275
  }
163
276
 
164
- #define RETURN_NEW_SET_BASED_ON(condition)\
165
- cp_index cp;\
166
- cp_byte *a, *b, *new_cps;\
167
- FETCH_CODEPOINTS(self, a);\
168
- if (other) FETCH_CODEPOINTS(other, b);\
169
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
170
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
171
- if (condition) SETBIT(new_cps, cp);\
172
- }\
173
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\
277
+ static VALUE
278
+ cs_method_min(VALUE self)
279
+ {
280
+ FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
281
+ return Qnil;
282
+ }
283
+
284
+ static VALUE
285
+ cs_method_max(VALUE self)
286
+ {
287
+ cs_cp len;
288
+ long reverse_idx;
289
+ cs_ar *cps;
290
+ cps = cs_fetch_cps(self, &len);
291
+ for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
292
+ {
293
+ if (tst_cp(cps, len, reverse_idx))
294
+ {
295
+ return LONG2FIX(reverse_idx);
296
+ }
297
+ }
298
+ return Qnil;
299
+ }
300
+
301
+ static VALUE
302
+ cs_method_minmax(VALUE self)
303
+ {
304
+ VALUE arr;
305
+ arr = rb_ary_new2(2);
306
+ rb_ary_push(arr, cs_method_min(self));
307
+ rb_ary_push(arr, cs_method_max(self));
308
+ return arr;
309
+ }
310
+
311
+ #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
312
+ do \
313
+ { \
314
+ VALUE new_cs; \
315
+ cs_cp cp, alen, blen; \
316
+ cs_ar *acps, *bcps; \
317
+ struct cs_data *new_data; \
318
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
+ acps = cs_fetch_cps(cs_a, &alen); \
320
+ bcps = cs_fetch_cps(cs_b, &blen); \
321
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
+ { \
323
+ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
324
+ { \
325
+ set_cp(new_data, cp); \
326
+ } \
327
+ } \
328
+ return new_cs; \
329
+ } while (0)
174
330
 
175
331
  static VALUE
176
- method_intersection(VALUE self, VALUE other) {
177
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
332
+ cs_method_intersection(VALUE self, VALUE other)
333
+ {
334
+ RETURN_COMBINED_CS(self, other, &&);
178
335
  }
179
336
 
180
337
  static VALUE
181
- method_exclusion(VALUE self, VALUE other) {
182
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
338
+ cs_method_exclusion(VALUE self, VALUE other)
339
+ {
340
+ RETURN_COMBINED_CS(self, other, ^);
183
341
  }
184
342
 
185
343
  static VALUE
186
- method_union(VALUE self, VALUE other) {
187
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
344
+ cs_method_union(VALUE self, VALUE other)
345
+ {
346
+ RETURN_COMBINED_CS(self, other, ||);
188
347
  }
189
348
 
190
349
  static VALUE
191
- method_difference(VALUE self, VALUE other) {
192
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
350
+ cs_method_difference(VALUE self, VALUE other)
351
+ {
352
+ RETURN_COMBINED_CS(self, other, >);
193
353
  }
194
354
 
195
355
  static VALUE
196
- method_include_p(VALUE self, VALUE num) {
197
- cp_byte *cps;
198
- FETCH_CODEPOINTS(self, cps);
199
- return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
356
+ cs_method_include_p(VALUE self, VALUE num)
357
+ {
358
+ cs_ar *cps;
359
+ cs_cp len;
360
+ cps = cs_fetch_cps(self, &len);
361
+ return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
362
  }
201
363
 
202
- static inline int
203
- toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
- cp_index cp;
205
- cp_byte *cps;
206
- rb_check_frozen(set);
207
- FETCH_CODEPOINTS(set, cps);
364
+ static inline VALUE
365
+ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
366
+ {
367
+ cs_cp cp, len;
368
+ cs_ar *cps;
369
+ struct cs_data *data;
370
+ rb_check_frozen(cs);
371
+ data = cs_fetch_data(cs);
372
+ cps = data->cps;
373
+ len = data->len;
208
374
  cp = FIX2ULONG(cp_num);
209
- if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
- return 0;
375
+ if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
376
+ {
377
+ return Qnil;
211
378
  }
212
- else {
213
- if (on) { SETBIT(cps, cp); }
214
- else { CLRBIT(cps, cp); }
215
- return 1;
379
+ else
380
+ {
381
+ if (on)
382
+ {
383
+ set_cp(data, cp);
384
+ }
385
+ else
386
+ {
387
+ clr_cp(cps, len, cp);
388
+ }
389
+ return cs;
216
390
  }
217
391
  }
218
392
 
219
393
  static VALUE
220
- method_add(VALUE self, VALUE cp_num) {
221
- return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
394
+ cs_method_add(VALUE self, VALUE cp_num)
395
+ {
396
+ return cs_toggle_codepoint(self, cp_num, 1, 0);
222
397
  }
223
398
 
224
399
  static VALUE
225
- method_add_p(VALUE self, VALUE cp_num) {
226
- return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
400
+ cs_method_add_p(VALUE self, VALUE cp_num)
401
+ {
402
+ return cs_toggle_codepoint(self, cp_num, 1, 1);
227
403
  }
228
404
 
229
405
  static VALUE
230
- method_delete(VALUE self, VALUE cp_num) {
231
- return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
406
+ cs_method_delete(VALUE self, VALUE cp_num)
407
+ {
408
+ return cs_toggle_codepoint(self, cp_num, 0, 0);
232
409
  }
233
410
 
234
411
  static VALUE
235
- method_delete_p(VALUE self, VALUE cp_num) {
236
- return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
412
+ cs_method_delete_p(VALUE self, VALUE cp_num)
413
+ {
414
+ return cs_toggle_codepoint(self, cp_num, 0, 1);
237
415
  }
238
416
 
239
- #define COMPARE_SETS(action)\
240
- cp_index cp;\
241
- cp_byte *cps, *other_cps;\
242
- FETCH_CODEPOINTS(self, cps);\
243
- FETCH_CODEPOINTS(other, other_cps);\
244
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
245
-
246
417
  static VALUE
247
- method_intersect_p(VALUE self, VALUE other) {
248
- COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
418
+ cs_method_intersect_p(VALUE self, VALUE other)
419
+ {
420
+ cs_cp cp, alen, blen;
421
+ cs_ar *acps, *bcps;
422
+ acps = cs_fetch_cps(self, &alen);
423
+ bcps = cs_fetch_cps(other, &blen);
424
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
425
+ {
426
+ if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
427
+ {
428
+ return Qtrue;
429
+ }
430
+ }
249
431
  return Qfalse;
250
432
  }
251
433
 
252
434
  static VALUE
253
- method_disjoint_p(VALUE self, VALUE other) {
254
- return method_intersect_p(self, other) ? Qfalse : Qtrue;
435
+ cs_method_disjoint_p(VALUE self, VALUE other)
436
+ {
437
+ return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
255
438
  }
256
439
 
257
440
  static inline int
258
- is_character_set(VALUE obj) {
259
- return rb_typeddata_is_kind_of(obj, &character_set_type);
441
+ cs_check_type(VALUE obj)
442
+ {
443
+ return rb_typeddata_is_kind_of(obj, &cs_type);
260
444
  }
261
445
 
262
446
  static VALUE
263
- method_eql_p(VALUE self, VALUE other) {
264
- if (!is_character_set(other)) return Qfalse;
265
- if (self == other) return Qtrue; // same object_id
266
-
267
- COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
-
447
+ cs_cps_eql(VALUE cs_a, VALUE cs_b)
448
+ {
449
+ cs_cp cp, alen, blen;
450
+ cs_ar *acps, *bcps;
451
+ acps = cs_fetch_cps(cs_a, &alen);
452
+ bcps = cs_fetch_cps(cs_b, &blen);
453
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
454
+ {
455
+ if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
456
+ {
457
+ return Qfalse;
458
+ }
459
+ }
269
460
  return Qtrue;
270
461
  }
271
462
 
463
+ static VALUE
464
+ cs_method_eql_p(VALUE self, VALUE other)
465
+ {
466
+ if (!cs_check_type(other))
467
+ {
468
+ return Qfalse;
469
+ }
470
+ if (self == other) // same object_id
471
+ {
472
+ return Qtrue;
473
+ }
474
+ return cs_cps_eql(self, other);
475
+ }
476
+
272
477
  static inline VALUE
273
- merge_character_set(VALUE self, VALUE other) {
274
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
- return self;
478
+ cs_merge_cs(VALUE recipient, VALUE source)
479
+ {
480
+ cs_cp cp, source_len;
481
+ struct cs_data *data;
482
+ cs_ar *source_cps;
483
+ data = cs_fetch_data(recipient);
484
+ source_cps = cs_fetch_cps(source, &source_len);
485
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
486
+ {
487
+ if (tst_cp(source_cps, source_len, cp))
488
+ {
489
+ set_cp(data, cp);
490
+ }
491
+ }
492
+ return recipient;
276
493
  }
277
494
 
278
- static inline void
279
- raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
- if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
495
+ static inline cs_cp
496
+ cs_checked_cp(VALUE object_id)
497
+ {
498
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
499
+ {
500
+ return FIX2ULONG(object_id);
501
+ }
281
502
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
503
  }
283
504
 
284
505
  static inline VALUE
285
- merge_rb_range(VALUE self, VALUE rb_range) {
506
+ cs_merge_rb_range(VALUE self, VALUE rb_range)
507
+ {
286
508
  VALUE from_id, upto_id;
509
+ cs_cp from_cp, upto_cp, cont_len, rem;
287
510
  int excl;
288
- cp_index cp;
289
- cp_byte *cps;
290
- FETCH_CODEPOINTS(self, cps);
511
+ struct cs_data *data;
512
+ data = cs_fetch_data(self);
291
513
 
292
- if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
514
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
515
+ {
293
516
  rb_raise(rb_eArgError, "pass a Range");
294
517
  }
295
- if (excl) upto_id -= 2;
518
+ if (excl)
519
+ {
520
+ upto_id -= 2;
521
+ }
522
+
523
+ from_cp = cs_checked_cp(from_id);
524
+ upto_cp = cs_checked_cp(upto_id);
296
525
 
297
- raise_arg_err_unless_valid_as_cp(from_id);
298
- raise_arg_err_unless_valid_as_cp(upto_id);
526
+ if (upto_cp > from_cp && (upto_cp - from_cp > 6))
527
+ {
528
+ // set bits in preceding partially toggled bytes individually
529
+ for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
530
+ {
531
+ set_cp(data, from_cp);
532
+ }
533
+ // memset contiguous bits directly
534
+ cont_len = upto_cp - from_cp + 1;
535
+ rem = cont_len % 8;
536
+ ensure_memsize_fits(data, upto_cp);
537
+ memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
538
+ from_cp = upto_cp - rem + 1;
539
+ }
299
540
 
300
- for (/* */; from_id <= upto_id; from_id += 2) {
301
- cp = FIX2ULONG(from_id);
302
- SETBIT(cps, cp);
541
+ // set bits in partially toggled bytes individually
542
+ for (/* */; from_cp <= upto_cp; from_cp++)
543
+ {
544
+ set_cp(data, from_cp);
303
545
  }
546
+
304
547
  return self;
305
548
  }
306
549
 
307
550
  static inline VALUE
308
- merge_rb_array(VALUE self, VALUE rb_array) {
309
- VALUE el;
310
- cp_byte *cps;
311
- VALUE array_length, i;
312
- FETCH_CODEPOINTS(self, cps);
551
+ cs_merge_rb_array(VALUE self, VALUE rb_array)
552
+ {
553
+ VALUE el, array_length, i;
554
+ struct cs_data *data;
313
555
  Check_Type(rb_array, T_ARRAY);
556
+ data = cs_fetch_data(self);
314
557
  array_length = RARRAY_LEN(rb_array);
315
- for (i = 0; i < array_length; i++) {
558
+ for (i = 0; i < array_length; i++)
559
+ {
316
560
  el = RARRAY_AREF(rb_array, i);
317
- raise_arg_err_unless_valid_as_cp(el);
318
- SETBIT(cps, FIX2ULONG(el));
561
+ set_cp(data, cs_checked_cp(el));
319
562
  }
320
563
  return self;
321
564
  }
322
565
 
323
566
  static VALUE
324
- method_merge(VALUE self, VALUE other) {
567
+ cs_method_merge(VALUE self, VALUE other)
568
+ {
325
569
  rb_check_frozen(self);
326
- if (is_character_set(other)) {
327
- return merge_character_set(self, other);
570
+ if (cs_check_type(other))
571
+ {
572
+ return cs_merge_cs(self, other);
328
573
  }
329
- else if (TYPE(other) == T_ARRAY) {
330
- return merge_rb_array(self, other);
574
+ else if (TYPE(other) == T_ARRAY)
575
+ {
576
+ return cs_merge_rb_array(self, other);
331
577
  }
332
- return merge_rb_range(self, other);
578
+ return cs_merge_rb_range(self, other);
333
579
  }
334
580
 
335
581
  static VALUE
336
- method_initialize_copy(VALUE self, VALUE other) {
337
- merge_character_set(self, other);
338
- return other;
582
+ cs_method_initialize_copy(VALUE self, VALUE orig)
583
+ {
584
+ cs_merge_cs(self, orig);
585
+ return self;
339
586
  }
340
587
 
341
588
  static VALUE
342
- method_subtract(VALUE self, VALUE other) {
589
+ cs_method_subtract(VALUE self, VALUE other)
590
+ {
591
+ cs_cp cp, len, other_len;
592
+ cs_ar *cps, *other_cps;
343
593
  rb_check_frozen(self);
344
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
594
+ cps = cs_fetch_cps(self, &len);
595
+ other_cps = cs_fetch_cps(other, &other_len);
596
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
597
+ {
598
+ if (tst_cp(other_cps, other_len, cp))
599
+ {
600
+ clr_cp(cps, len, cp);
601
+ }
602
+ }
345
603
  return self;
346
604
  }
347
605
 
348
606
  static inline int
349
- a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
- cp_byte *cps_a, *cps_b;
351
- cp_index cp, size_a, size_b;
607
+ cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
608
+ {
609
+ cs_ar *a, *b;
610
+ cs_cp cp, alen, blen, count_a, count_b;
352
611
 
353
- if (!is_character_set(set_a) || !is_character_set(set_b)) {
612
+ if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
613
+ {
354
614
  rb_raise(rb_eArgError, "pass a CharacterSet");
355
615
  }
356
616
 
357
- FETCH_CODEPOINTS(set_a, cps_a);
358
- FETCH_CODEPOINTS(set_b, cps_b);
359
-
360
- *is_proper = 0;
361
- size_a = 0;
362
- size_b = 0;
363
-
364
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
- if (TSTBIT(cps_a, cp)) {
366
- if (!TSTBIT(cps_b, cp)) return 0;
367
- size_a++;
368
- size_b++;
617
+ a = cs_fetch_cps(cs_a, &alen);
618
+ b = cs_fetch_cps(cs_b, &blen);
619
+
620
+ count_a = 0;
621
+ count_b = 0;
622
+
623
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
624
+ {
625
+ if (tst_cp(a, alen, cp))
626
+ {
627
+ if (!tst_cp(b, blen, cp))
628
+ {
629
+ return 0;
630
+ }
631
+ count_a++;
632
+ count_b++;
633
+ }
634
+ else if (tst_cp(b, blen, cp))
635
+ {
636
+ count_b++;
369
637
  }
370
- else if (TSTBIT(cps_b, cp)) size_b++;
371
638
  }
372
639
 
373
- if (size_b > size_a) *is_proper = 1;
640
+ if (is_proper_ptr)
641
+ {
642
+ *is_proper_ptr = count_b > count_a;
643
+ }
644
+
374
645
  return 1;
375
646
  }
376
647
 
377
648
  static VALUE
378
- method_subset_p(VALUE self, VALUE other) {
379
- int is_proper;
380
- return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
649
+ cs_method_subset_p(VALUE self, VALUE other)
650
+ {
651
+ return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
381
652
  }
382
653
 
383
654
  static VALUE
384
- method_proper_subset_p(VALUE self, VALUE other) {
385
- int is, is_proper;
386
- is = a_subset_of_b(self, other, &is_proper);
387
- return (is && is_proper) ? Qtrue : Qfalse;
655
+ cs_method_proper_subset_p(VALUE self, VALUE other)
656
+ {
657
+ int is_subset, is_proper;
658
+ is_subset = cs_a_subset_of_b(self, other, &is_proper);
659
+ return (is_subset && is_proper) ? Qtrue : Qfalse;
388
660
  }
389
661
 
390
662
  static VALUE
391
- method_superset_p(VALUE self, VALUE other) {
392
- int is_proper;
393
- return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
663
+ cs_method_superset_p(VALUE self, VALUE other)
664
+ {
665
+ return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
394
666
  }
395
667
 
396
668
  static VALUE
397
- method_proper_superset_p(VALUE self, VALUE other) {
398
- int is, is_proper;
399
- is = a_subset_of_b(other, self, &is_proper);
400
- return (is && is_proper) ? Qtrue : Qfalse;
669
+ cs_method_proper_superset_p(VALUE self, VALUE other)
670
+ {
671
+ int is_superset, is_proper;
672
+ is_superset = cs_a_subset_of_b(other, self, &is_proper);
673
+ return (is_superset && is_proper) ? Qtrue : Qfalse;
401
674
  }
402
675
 
403
676
  // *******************************
@@ -405,42 +678,44 @@ method_proper_superset_p(VALUE self, VALUE other) {
405
678
  // *******************************
406
679
 
407
680
  static VALUE
408
- class_method_from_ranges(VALUE self, VALUE ranges) {
409
- VALUE new_set, range_count, i;
410
- new_set = rb_class_new_instance(0, 0, self);
681
+ cs_class_method_from_ranges(VALUE self, VALUE ranges)
682
+ {
683
+ VALUE new_cs, range_count, i;
684
+ new_cs = rb_class_new_instance(0, 0, self);
411
685
  range_count = RARRAY_LEN(ranges);
412
- for (i = 0; i < range_count; i++) {
413
- merge_rb_range(new_set, RARRAY_AREF(ranges, i));
686
+ for (i = 0; i < range_count; i++)
687
+ {
688
+ cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
414
689
  }
415
- return new_set;
690
+ return new_cs;
416
691
  }
417
692
 
418
693
  static VALUE
419
- method_ranges(VALUE self) {
420
- VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
694
+ cs_method_ranges(VALUE self)
695
+ {
696
+ VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
421
697
 
422
698
  ranges = rb_ary_new();
423
- previous_codepoint = 0;
699
+ previous_cp_num = 0;
424
700
  current_start = 0;
425
701
  current_end = 0;
426
702
 
427
703
  FOR_EACH_ACTIVE_CODEPOINT(
428
- codepoint = LONG2FIX(cp);
429
-
430
- if (!previous_codepoint) {
431
- current_start = codepoint;
432
- }
433
- else if (previous_codepoint + 2 != codepoint) {
434
- // gap found, finalize previous range
435
- rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
- current_start = codepoint;
437
- }
438
- current_end = codepoint;
439
- previous_codepoint = codepoint;
440
- );
704
+ cp_num = LONG2FIX(cp);
705
+
706
+ if (!previous_cp_num) {
707
+ current_start = cp_num;
708
+ } else if (previous_cp_num + 2 != cp_num)
709
+ {
710
+ // gap found, finalize previous range
711
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
712
+ current_start = cp_num;
713
+ } current_end = cp_num;
714
+ previous_cp_num = cp_num;);
441
715
 
442
716
  // add final range
443
- if (current_start) {
717
+ if (current_start)
718
+ {
444
719
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
720
  }
446
721
 
@@ -448,117 +723,233 @@ method_ranges(VALUE self) {
448
723
  }
449
724
 
450
725
  static VALUE
451
- method_sample(int argc, VALUE *argv, VALUE self) {
452
- VALUE to_a_args[1], array;
726
+ cs_method_sample(int argc, VALUE *argv, VALUE self)
727
+ {
728
+ VALUE array, to_a_args[1] = {Qtrue};
453
729
  rb_check_arity(argc, 0, 1);
454
- to_a_args[0] = Qtrue;
455
- array = method_to_a(1, to_a_args, self);
730
+ array = cs_method_to_a(1, to_a_args, self);
456
731
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
732
  }
458
733
 
459
734
  static inline VALUE
460
- new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
- cp_byte *cps, *new_cps;
462
- cp_index cp;
463
- FETCH_CODEPOINTS(set, cps);
464
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
- for (cp = from; cp <= upto; cp++) {
466
- if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
735
+ cs_from_section(VALUE set, cs_cp from, cs_cp upto)
736
+ {
737
+ VALUE new_cs;
738
+ cs_ar *cps;
739
+ cs_cp cp, len;
740
+ struct cs_data *new_data;
741
+ new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
742
+ cps = cs_fetch_cps(set, &len);
743
+ for (cp = from; cp <= upto; cp++)
744
+ {
745
+ if (tst_cp(cps, len, cp))
746
+ {
747
+ set_cp(new_data, cp);
748
+ }
467
749
  }
468
- return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
750
+ return new_cs;
469
751
  }
470
752
 
471
753
  static VALUE
472
- method_bmp_part(VALUE self) {
473
- return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
754
+ cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
755
+ {
756
+ return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
757
+ }
758
+
759
+ static inline cs_cp
760
+ cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
761
+ {
762
+ cs_ar *cps;
763
+ cs_cp cp, count, len;
764
+ cps = cs_fetch_cps(set, &len);
765
+ for (count = 0, cp = from; cp <= upto; cp++)
766
+ {
767
+ if (tst_cp(cps, len, cp))
768
+ {
769
+ count++;
770
+ }
771
+ }
772
+ return count;
474
773
  }
475
774
 
476
775
  static VALUE
477
- method_astral_part(VALUE self) {
478
- return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
776
+ cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
777
+ {
778
+ cs_cp count;
779
+ count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
780
+ return LONG2FIX(count);
479
781
  }
480
782
 
481
783
  static inline VALUE
482
- set_has_member_in_plane(VALUE set, unsigned int plane) {
483
- cp_byte *cps;
484
- cp_index cp, max_cp;
485
- FETCH_CODEPOINTS(set, cps);
486
- cp = plane * UNICODE_PLANE_SIZE;
487
- max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
- for (/* */; cp <= max_cp; cp++) {
489
- if (TSTBIT(cps, cp)) return Qtrue;
784
+ cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
785
+ {
786
+ cs_cp cp;
787
+ for (cp = from; cp <= upto; cp++)
788
+ {
789
+ if (tst_cp(cps, len, cp))
790
+ {
791
+ return Qtrue;
792
+ }
490
793
  }
491
794
  return Qfalse;
492
795
  }
493
796
 
494
797
  static VALUE
495
- method_planes(VALUE self) {
798
+ cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
799
+ {
800
+ cs_ar *cps;
801
+ cs_cp len;
802
+ cps = cs_fetch_cps(self, &len);
803
+ return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
804
+ }
805
+
806
+ static inline VALUE
807
+ cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
808
+ {
809
+ double section_count, total_count;
810
+ section_count = (double)cs_active_cp_count_in_section(set, from, upto);
811
+ total_count = (double)cs_active_cp_count(set);
812
+ return DBL2NUM(section_count / total_count);
813
+ }
814
+
815
+ static VALUE
816
+ cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
817
+ {
818
+ return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
819
+ }
820
+
821
+ #define MAX_CP 0x10FFFF
822
+ #define MAX_ASCII_CP 0x7F
823
+ #define MAX_BMP_CP 0xFFFF
824
+ #define MIN_ASTRAL_CP 0x10000
825
+
826
+ static inline VALUE
827
+ cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
828
+ {
829
+ cs_cp plane_beg, plane_end;
830
+ plane_beg = plane * UNICODE_PLANE_SIZE;
831
+ plane_end = (plane + 1) * MAX_BMP_CP;
832
+ return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
833
+ }
834
+
835
+ static VALUE
836
+ cs_method_planes(VALUE self)
837
+ {
838
+ cs_ar *cps;
839
+ cs_cp len;
496
840
  unsigned int i;
497
841
  VALUE planes;
842
+ cps = cs_fetch_cps(self, &len);
498
843
  planes = rb_ary_new();
499
- for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
- if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
844
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++)
845
+ {
846
+ if (cs_has_cp_in_plane(cps, len, i))
847
+ {
848
+ rb_ary_push(planes, INT2FIX(i));
849
+ }
501
850
  }
502
851
  return planes;
503
852
  }
504
853
 
505
- static VALUE
506
- method_member_in_plane_p(VALUE self, VALUE plane_num) {
854
+ static inline int
855
+ cs_valid_plane_num(VALUE num)
856
+ {
507
857
  int plane;
508
- Check_Type(plane_num, T_FIXNUM);
509
- plane = FIX2INT(plane_num);
510
- if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
- rb_raise(rb_eArgError, "plane must be between 0 and 16");
858
+ Check_Type(num, T_FIXNUM);
859
+ plane = FIX2INT(num);
860
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
861
+ {
862
+ rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
512
863
  }
513
- return set_has_member_in_plane(self, plane);
864
+ return plane;
865
+ }
866
+
867
+ static VALUE
868
+ cs_method_plane(VALUE self, VALUE plane_num)
869
+ {
870
+ cs_cp plane, plane_beg, plane_end;
871
+ plane = cs_valid_plane_num(plane_num);
872
+ plane_beg = plane * UNICODE_PLANE_SIZE;
873
+ plane_end = (plane + 1) * MAX_BMP_CP;
874
+ return cs_from_section(self, plane_beg, plane_end);
875
+ }
876
+
877
+ static VALUE
878
+ cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
879
+ {
880
+ cs_ar *cps;
881
+ cs_cp len;
882
+ unsigned int plane;
883
+ plane = cs_valid_plane_num(plane_num);
884
+ cps = cs_fetch_cps(self, &len);
885
+ return cs_has_cp_in_plane(cps, len, plane);
514
886
  }
515
887
 
516
888
  #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
517
889
 
518
890
  static VALUE
519
- method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
- int include_surrogates;
521
- cp_index upto;
522
- VALUE other;
523
- other = 0;
891
+ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
892
+ {
893
+ int inc_surr;
894
+ cs_cp upto, cp, len;
895
+ cs_ar *cps;
896
+ VALUE new_cs;
897
+ struct cs_data *new_data;
898
+
524
899
  rb_check_arity(argc, 0, 2);
525
- include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
- if ((argc > 1) && FIXNUM_P(argv[1])) {
527
- upto = FIX2ULONG(argv[1]);
528
- RETURN_NEW_SET_BASED_ON(
529
- cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
- );
900
+
901
+ cps = cs_fetch_cps(self, &len);
902
+ inc_surr = argc && argv[0] == Qtrue;
903
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
904
+ upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
905
+
906
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
907
+ {
908
+ if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
909
+ {
910
+ set_cp(new_data, cp);
911
+ }
531
912
  }
532
- RETURN_NEW_SET_BASED_ON(
533
- !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
- );
913
+
914
+ return new_cs;
535
915
  }
536
916
 
537
- typedef int(*str_cp_handler)(unsigned int, cp_byte*);
917
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
538
918
 
539
919
  static inline int
540
- add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
- SETBIT(cp_arr, str_cp);
920
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
921
+ {
922
+ set_cp(data, str_cp);
542
923
  return 1;
543
924
  }
544
925
 
545
926
  static VALUE
546
- method_case_insensitive(VALUE self) {
547
- cp_index i;
548
- cp_byte *new_cps;
549
-
550
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
927
+ cs_method_case_insensitive(VALUE self)
928
+ {
929
+ cs_cp i, len;
930
+ cs_ar *cps;
931
+ VALUE new_cs;
932
+ struct cs_data *new_data;
551
933
 
552
- FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
934
+ cps = cs_fetch_cps(self, &len);
935
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
936
+ cs_merge_cs(new_cs, self);
553
937
 
554
- for (i = 0; i < CASEFOLD_COUNT; i++) {
938
+ for (i = 0; i < CASEFOLD_COUNT; i++)
939
+ {
555
940
  casefold_mapping m = unicode_casefold_table[i];
556
941
 
557
- if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
- else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
942
+ if (tst_cp(cps, len, m.from))
943
+ {
944
+ set_cp(new_data, m.to);
945
+ }
946
+ else if (tst_cp(cps, len, m.to))
947
+ {
948
+ set_cp(new_data, m.from);
949
+ }
559
950
  }
560
951
 
561
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
952
+ return new_cs;
562
953
 
563
954
  // OnigCaseFoldType flags;
564
955
  // rb_encoding *enc;
@@ -573,20 +964,27 @@ method_case_insensitive(VALUE self) {
573
964
  }
574
965
 
575
966
  static inline VALUE
576
- each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
- long i;
967
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
968
+ {
969
+ long i, str_len;
578
970
  unsigned int str_cp;
971
+ str_len = RSTRING_LEN(str);
579
972
 
580
- for (i = 0; i < RSTRING_LEN(str); i++) {
973
+ for (i = 0; i < str_len; i++)
974
+ {
581
975
  str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
976
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
977
+ {
978
+ return Qfalse;
979
+ }
583
980
  }
584
981
 
585
982
  return Qtrue;
586
983
  }
587
984
 
588
985
  static inline VALUE
589
- each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
986
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
987
+ {
590
988
  int n;
591
989
  unsigned int str_cp;
592
990
  const char *ptr, *end;
@@ -597,9 +995,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
597
995
  end = RSTRING_END(str);
598
996
  enc = rb_enc_get(str);
599
997
 
600
- while (ptr < end) {
998
+ while (ptr < end)
999
+ {
601
1000
  str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
1001
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
1002
+ {
1003
+ return Qfalse;
1004
+ }
603
1005
  ptr += n;
604
1006
  }
605
1007
 
@@ -611,105 +1013,240 @@ static inline int
611
1013
  single_byte_optimizable(VALUE str)
612
1014
  {
613
1015
  rb_encoding *enc;
614
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
1016
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
1017
+ {
1018
+ return 1;
1019
+ }
615
1020
 
616
1021
  enc = rb_enc_get(str);
617
- if (rb_enc_mbmaxlen(enc) == 1) return 1;
1022
+ if (rb_enc_mbmaxlen(enc) == 1)
1023
+ {
1024
+ return 1;
1025
+ }
618
1026
 
619
1027
  return 0;
620
1028
  }
621
1029
 
622
1030
  static inline VALUE
623
- each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
- if (single_byte_optimizable(str)) {
625
- return each_sb_cp(str, func, cp_arr);
1031
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1032
+ {
1033
+ if (single_byte_optimizable(str))
1034
+ {
1035
+ return each_sb_cp(str, func, cp_arr, len, data, memo);
626
1036
  }
627
- return each_mb_cp(str, func, cp_arr);
1037
+ return each_mb_cp(str, func, cp_arr, len, data, memo);
628
1038
  }
629
1039
 
630
1040
  static inline void
631
- raise_arg_err_unless_string(VALUE val) {
632
- if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
1041
+ raise_arg_err_unless_string(VALUE val)
1042
+ {
1043
+ if (!RB_TYPE_P(val, T_STRING))
1044
+ {
1045
+ rb_raise(rb_eArgError, "pass a String");
1046
+ }
1047
+ }
1048
+
1049
+ static VALUE
1050
+ cs_class_method_of(int argc, VALUE *argv, VALUE self)
1051
+ {
1052
+ VALUE new_cs;
1053
+ struct cs_data *new_data;
1054
+ int i;
1055
+ new_cs = cs_alloc(self, &new_data);
1056
+ for (i = 0; i < argc; i++)
1057
+ {
1058
+ raise_arg_err_unless_string(argv[i]);
1059
+ each_cp(argv[i], add_str_cp_to_arr, 0, 0, new_data, 0);
1060
+ }
1061
+ return new_cs;
1062
+ }
1063
+
1064
+ static inline int
1065
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1066
+ {
1067
+ if (tst_cp(cp_arr, len, str_cp))
1068
+ {
1069
+ *memo += 1;
1070
+ }
1071
+ return 1;
633
1072
  }
634
1073
 
635
1074
  static VALUE
636
- class_method_of(VALUE self, VALUE str) {
637
- cp_byte *cp_arr;
1075
+ cs_method_count_in(VALUE self, VALUE str)
1076
+ {
1077
+ VALUE count;
1078
+ struct cs_data *data;
638
1079
  raise_arg_err_unless_string(str);
639
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
- each_cp(str, add_str_cp_to_arr, cp_arr);
641
- return NEW_CHARACTER_SET(self, cp_arr);
1080
+ data = cs_fetch_data(self);
1081
+ count = 0;
1082
+ each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1083
+ return INT2NUM((int)count);
642
1084
  }
643
1085
 
644
1086
  static inline int
645
- str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
- return !TSTBIT(cp_arr, str_cp);
1087
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1088
+ {
1089
+ return tst_cp(cp_arr, len, str_cp);
647
1090
  }
648
1091
 
649
1092
  static VALUE
650
- method_used_by_p(VALUE self, VALUE str) {
651
- cp_byte *cps;
652
- VALUE only_uses_other_cps;
1093
+ cs_method_cover_p(VALUE self, VALUE str)
1094
+ {
1095
+ struct cs_data *data;
653
1096
  raise_arg_err_unless_string(str);
654
- FETCH_CODEPOINTS(self, cps);
655
- only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
- return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1097
+ data = cs_fetch_data(self);
1098
+ return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
657
1099
  }
658
1100
 
659
1101
  static inline int
660
- str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
- return TSTBIT(cp_arr, str_cp);
1102
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1103
+ {
1104
+ if (tst_cp(cp_arr, len, str_cp))
1105
+ {
1106
+ rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1107
+ }
1108
+ return 1;
662
1109
  }
663
1110
 
664
1111
  static VALUE
665
- method_cover_p(VALUE self, VALUE str) {
666
- cp_byte *cps;
1112
+ cs_method_scan(VALUE self, VALUE str)
1113
+ {
1114
+ VALUE memo[2];
1115
+ struct cs_data *data;
667
1116
  raise_arg_err_unless_string(str);
668
- FETCH_CODEPOINTS(self, cps);
669
- return each_cp(str, str_cp_in_arr, cps);
1117
+ data = cs_fetch_data(self);
1118
+ memo[0] = rb_ary_new();
1119
+ memo[1] = (VALUE)rb_enc_get(str);
1120
+ each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1121
+ return memo[0];
1122
+ }
1123
+
1124
+ static inline int
1125
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1126
+ {
1127
+ return !tst_cp(cp_arr, len, str_cp);
1128
+ }
1129
+
1130
+ static VALUE
1131
+ cs_method_used_by_p(VALUE self, VALUE str)
1132
+ {
1133
+ VALUE only_uses_other_cps;
1134
+ struct cs_data *data;
1135
+ raise_arg_err_unless_string(str);
1136
+ data = cs_fetch_data(self);
1137
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
1138
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1139
+ }
1140
+
1141
+ static void
1142
+ cs_str_buf_cat(VALUE str, const char *ptr, long len)
1143
+ {
1144
+ long total, olen;
1145
+ char *sptr;
1146
+
1147
+ RSTRING_GETMEM(str, sptr, olen);
1148
+ sptr = RSTRING(str)->as.heap.ptr;
1149
+ olen = RSTRING(str)->as.heap.len;
1150
+ total = olen + len;
1151
+ memcpy(sptr + olen, ptr, len);
1152
+ RSTRING(str)->as.heap.len = total;
1153
+ }
1154
+
1155
+ #ifndef TERM_FILL
1156
+ #define TERM_FILL(ptr, termlen) \
1157
+ do \
1158
+ { \
1159
+ char *const term_fill_ptr = (ptr); \
1160
+ const int term_fill_len = (termlen); \
1161
+ *term_fill_ptr = '\0'; \
1162
+ if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1163
+ memset(term_fill_ptr, 0, term_fill_len); \
1164
+ } while (0)
1165
+ #endif
1166
+
1167
+ static void
1168
+ cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1169
+ {
1170
+ char *ptr;
1171
+ long len;
1172
+
1173
+ ptr = RSTRING(str)->as.heap.ptr;
1174
+ len = RSTRING(str)->as.heap.len;
1175
+ TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
670
1176
  }
671
1177
 
672
1178
  static inline VALUE
673
- apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
- cp_byte *cps;
1179
+ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1180
+ {
1181
+ cs_ar *cps;
1182
+ cs_cp len;
675
1183
  rb_encoding *str_enc;
676
- VALUE orig_len, blen, new_str_buf, chr;
677
- int n;
1184
+ VALUE orig_len, new_str_buf;
1185
+ int cp_len;
678
1186
  unsigned int str_cp;
679
1187
  const char *ptr, *end;
680
1188
 
681
1189
  raise_arg_err_unless_string(str);
682
1190
 
683
- FETCH_CODEPOINTS(set, cps);
1191
+ cps = cs_fetch_cps(set, &len);
684
1192
 
685
1193
  orig_len = RSTRING_LEN(str);
686
- blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
- new_str_buf = rb_str_buf_new(blen);
1194
+ if (orig_len < 1) // empty string, will never change
1195
+ {
1196
+ if (bang)
1197
+ {
1198
+ return Qnil;
1199
+ }
1200
+ return rb_str_dup(str);
1201
+ }
1202
+
1203
+ new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
688
1204
  str_enc = rb_enc_get(str);
689
1205
  rb_enc_associate(new_str_buf, str_enc);
690
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
- ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1206
+ rb_str_modify(new_str_buf);
1207
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
1208
 
693
1209
  ptr = RSTRING_PTR(str);
694
1210
  end = RSTRING_END(str);
695
1211
 
696
- while (ptr < end) {
697
- str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
- if (!TSTBIT(cps, str_cp) != !delete) {
699
- chr = rb_enc_uint_chr(str_cp, str_enc);
700
- rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
1212
+ if (single_byte_optimizable(str))
1213
+ {
1214
+ while (ptr < end)
1215
+ {
1216
+ str_cp = *ptr & 0xff;
1217
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1218
+ {
1219
+ cs_str_buf_cat(new_str_buf, ptr, 1);
1220
+ }
1221
+ ptr++;
1222
+ }
1223
+ }
1224
+ else // likely to be multibyte string
1225
+ {
1226
+ while (ptr < end)
1227
+ {
1228
+ str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1229
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1230
+ {
1231
+ cs_str_buf_cat(new_str_buf, ptr, cp_len);
1232
+ }
1233
+ ptr += cp_len;
701
1234
  }
702
- ptr += n;
703
1235
  }
704
1236
 
705
- if (bang) {
706
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
1237
+ cs_str_buf_terminate(new_str_buf, str_enc);
1238
+
1239
+ if (bang)
1240
+ {
1241
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1242
+ {
1243
+ return Qnil;
1244
+ }
707
1245
  rb_str_shared_replace(str, new_str_buf);
708
1246
  }
709
- else {
1247
+ else
1248
+ {
710
1249
  RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
- // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
713
1250
  str = new_str_buf;
714
1251
  }
715
1252
 
@@ -717,98 +1254,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
717
1254
  }
718
1255
 
719
1256
  static VALUE
720
- method_delete_in(VALUE self, VALUE str) {
721
- return apply_to_str(self, str, 1, 0);
1257
+ cs_method_delete_in(VALUE self, VALUE str)
1258
+ {
1259
+ return cs_apply_to_str(self, str, 1, 0);
1260
+ }
1261
+
1262
+ static VALUE
1263
+ cs_method_delete_in_bang(VALUE self, VALUE str)
1264
+ {
1265
+ return cs_apply_to_str(self, str, 1, 1);
722
1266
  }
723
1267
 
724
1268
  static VALUE
725
- method_delete_in_bang(VALUE self, VALUE str) {
726
- return apply_to_str(self, str, 1, 1);
1269
+ cs_method_keep_in(VALUE self, VALUE str)
1270
+ {
1271
+ return cs_apply_to_str(self, str, 0, 0);
727
1272
  }
728
1273
 
729
1274
  static VALUE
730
- method_keep_in(VALUE self, VALUE str) {
731
- return apply_to_str(self, str, 0, 0);
1275
+ cs_method_keep_in_bang(VALUE self, VALUE str)
1276
+ {
1277
+ return cs_apply_to_str(self, str, 0, 1);
732
1278
  }
733
1279
 
734
1280
  static VALUE
735
- method_keep_in_bang(VALUE self, VALUE str) {
736
- return apply_to_str(self, str, 0, 1);
1281
+ cs_method_allocated_length(VALUE self)
1282
+ {
1283
+ return LONG2FIX(cs_fetch_data(self)->len);
737
1284
  }
738
1285
 
739
1286
  // ****
740
1287
  // init
741
1288
  // ****
742
1289
 
743
- void
744
- Init_character_set()
1290
+ void Init_character_set()
745
1291
  {
746
1292
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
1293
 
748
- rb_define_alloc_func(cs, method_allocate);
1294
+ rb_define_alloc_func(cs, cs_method_allocate);
749
1295
 
750
1296
  // `Set` compatibility methods
751
1297
 
752
- rb_define_method(cs, "each", method_each, 0);
753
- rb_define_method(cs, "to_a", method_to_a, -1);
754
- rb_define_method(cs, "length", method_length, 0);
755
- rb_define_method(cs, "size", method_length, 0);
756
- rb_define_method(cs, "count", method_length, 0);
757
- rb_define_method(cs, "empty?", method_empty_p, 0);
758
- rb_define_method(cs, "hash", method_hash, 0);
759
- rb_define_method(cs, "keep_if", method_keep_if, 0);
760
- rb_define_method(cs, "delete_if", method_delete_if, 0);
761
- rb_define_method(cs, "clear", method_clear, 0);
762
- rb_define_method(cs, "intersection", method_intersection, 1);
763
- rb_define_method(cs, "&", method_intersection, 1);
764
- rb_define_method(cs, "union", method_union, 1);
765
- rb_define_method(cs, "+", method_union, 1);
766
- rb_define_method(cs, "|", method_union, 1);
767
- rb_define_method(cs, "difference", method_difference, 1);
768
- rb_define_method(cs, "-", method_difference, 1);
769
- rb_define_method(cs, "^", method_exclusion, 1);
770
- rb_define_method(cs, "include?", method_include_p, 1);
771
- rb_define_method(cs, "member?", method_include_p, 1);
772
- rb_define_method(cs, "===", method_include_p, 1);
773
- rb_define_method(cs, "add", method_add, 1);
774
- rb_define_method(cs, "<<", method_add, 1);
775
- rb_define_method(cs, "add?", method_add_p, 1);
776
- rb_define_method(cs, "delete", method_delete, 1);
777
- rb_define_method(cs, "delete?", method_delete_p, 1);
778
- rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
- rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
- rb_define_method(cs, "eql?", method_eql_p, 1);
781
- rb_define_method(cs, "==", method_eql_p, 1);
782
- rb_define_method(cs, "merge", method_merge, 1);
783
- rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
- rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
- rb_define_method(cs, "subtract", method_subtract, 1);
786
- rb_define_method(cs, "subset?", method_subset_p, 1);
787
- rb_define_method(cs, "<=", method_subset_p, 1);
788
- rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
- rb_define_method(cs, "<", method_proper_subset_p, 1);
790
- rb_define_method(cs, "superset?", method_superset_p, 1);
791
- rb_define_method(cs, ">=", method_superset_p, 1);
792
- rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
- rb_define_method(cs, ">", method_proper_superset_p, 1);
1298
+ rb_define_method(cs, "each", cs_method_each, 0);
1299
+ rb_define_method(cs, "to_a", cs_method_to_a, -1);
1300
+ rb_define_method(cs, "length", cs_method_length, 0);
1301
+ rb_define_method(cs, "size", cs_method_length, 0);
1302
+ rb_define_method(cs, "empty?", cs_method_empty_p, 0);
1303
+ rb_define_method(cs, "hash", cs_method_hash, 0);
1304
+ rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
1305
+ rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
1306
+ rb_define_method(cs, "clear", cs_method_clear, 0);
1307
+ rb_define_method(cs, "min", cs_method_min, 0);
1308
+ rb_define_method(cs, "max", cs_method_max, 0);
1309
+ rb_define_method(cs, "minmax", cs_method_minmax, 0);
1310
+ rb_define_method(cs, "intersection", cs_method_intersection, 1);
1311
+ rb_define_method(cs, "&", cs_method_intersection, 1);
1312
+ rb_define_method(cs, "union", cs_method_union, 1);
1313
+ rb_define_method(cs, "+", cs_method_union, 1);
1314
+ rb_define_method(cs, "|", cs_method_union, 1);
1315
+ rb_define_method(cs, "difference", cs_method_difference, 1);
1316
+ rb_define_method(cs, "-", cs_method_difference, 1);
1317
+ rb_define_method(cs, "^", cs_method_exclusion, 1);
1318
+ rb_define_method(cs, "include?", cs_method_include_p, 1);
1319
+ rb_define_method(cs, "member?", cs_method_include_p, 1);
1320
+ rb_define_method(cs, "===", cs_method_include_p, 1);
1321
+ rb_define_method(cs, "add", cs_method_add, 1);
1322
+ rb_define_method(cs, "<<", cs_method_add, 1);
1323
+ rb_define_method(cs, "add?", cs_method_add_p, 1);
1324
+ rb_define_method(cs, "delete", cs_method_delete, 1);
1325
+ rb_define_method(cs, "delete?", cs_method_delete_p, 1);
1326
+ rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
1327
+ rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
1328
+ rb_define_method(cs, "eql?", cs_method_eql_p, 1);
1329
+ rb_define_method(cs, "==", cs_method_eql_p, 1);
1330
+ rb_define_method(cs, "merge", cs_method_merge, 1);
1331
+ rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
1332
+ rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
1333
+ rb_define_method(cs, "subtract", cs_method_subtract, 1);
1334
+ rb_define_method(cs, "subset?", cs_method_subset_p, 1);
1335
+ rb_define_method(cs, "<=", cs_method_subset_p, 1);
1336
+ rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
1337
+ rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
1338
+ rb_define_method(cs, "superset?", cs_method_superset_p, 1);
1339
+ rb_define_method(cs, ">=", cs_method_superset_p, 1);
1340
+ rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1341
+ rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
794
1342
 
795
1343
  // `CharacterSet`-specific methods
796
1344
 
797
- rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
- rb_define_singleton_method(cs, "of", class_method_of, 1);
799
-
800
- rb_define_method(cs, "ranges", method_ranges, 0);
801
- rb_define_method(cs, "sample", method_sample, -1);
802
- rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
- rb_define_method(cs, "astral_part", method_astral_part, 0);
804
- rb_define_method(cs, "planes", method_planes, 0);
805
- rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
- rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
- rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
- rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
- rb_define_method(cs, "cover?", method_cover_p, 1);
810
- rb_define_method(cs, "delete_in", method_delete_in, 1);
811
- rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
- rb_define_method(cs, "keep_in", method_keep_in, 1);
813
- rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
1345
+ rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1346
+ rb_define_singleton_method(cs, "of", cs_class_method_of, -1);
1347
+
1348
+ rb_define_method(cs, "ranges", cs_method_ranges, 0);
1349
+ rb_define_method(cs, "sample", cs_method_sample, -1);
1350
+ rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
1351
+ rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
1352
+ rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
1353
+ rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
1354
+ rb_define_method(cs, "planes", cs_method_planes, 0);
1355
+ rb_define_method(cs, "plane", cs_method_plane, 1);
1356
+ rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
1357
+ rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
1358
+ rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
1359
+ rb_define_method(cs, "count_in", cs_method_count_in, 1);
1360
+ rb_define_method(cs, "cover?", cs_method_cover_p, 1);
1361
+ rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
1362
+ rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
1363
+ rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
1364
+ rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
1365
+ rb_define_method(cs, "scan", cs_method_scan, 1);
1366
+ rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
1367
+ rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
814
1368
  }