character_set 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,814 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include "unicode_casefold_table.h"
4
+
5
// Bit helpers for a byte array used as a bitmap: bit N lives in byte N / 8
// at bit position N % 8.
// Every argument is parenthesized so that compound expressions (e.g. `a | b`)
// index the intended bit. `bit` is evaluated twice, so it must not have
// side effects.
#define SETBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] |= (1 << ((bit) & 0x07)))
#define CLRBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] &= ~(1 << ((bit) & 0x07)))
#define TSTBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] & (1 << ((bit) & 0x07)))
8
+
9
// One byte of the bitmap; each byte holds the membership bits of 8 codepoints.
typedef char cp_byte;
// A codepoint number, which doubles as a bit index into the bitmap.
typedef unsigned long cp_index;

// Derived constants are parenthesized so they stay correct when used inside
// larger expressions (e.g. as a divisor).
#define UNICODE_CP_COUNT 0x110000
#define UNICODE_BYTES (UNICODE_CP_COUNT / 8)
#define UNICODE_PLANE_SIZE 0x10000
#define UNICODE_PLANE_COUNT (UNICODE_CP_COUNT / UNICODE_PLANE_SIZE)
16
+
17
// dfree callback of character_set_type: releases the codepoint bitmap.
static void
free_character_set(void *codepoints)
{
  free(codepoints);
}
21
+
22
+ static size_t
23
+ memsize_character_set(const void* codepoints) {
24
+ return sizeof(cp_byte) * UNICODE_BYTES;
25
+ }
26
+
27
+ static const rb_data_type_t
28
+ character_set_type = {
29
+ .wrap_struct_name = "character_set",
30
+ .function = {
31
+ .dmark = NULL,
32
+ .dfree = free_character_set,
33
+ .dsize = memsize_character_set,
34
+ },
35
+ .data = NULL,
36
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
+ };
38
+
39
// Stores the bitmap wrapped by `set` in `cps` (type-checked against
// character_set_type by TypedData_Get_Struct).
#define FETCH_CODEPOINTS(set, cps) \
  TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)

// Wraps the bitmap `cps` in a new object of class `klass`.
#define NEW_CHARACTER_SET(klass, cps) \
  TypedData_Wrap_Struct(klass, &character_set_type, cps)
44
+
45
+ static VALUE
46
+ method_allocate(VALUE self) {
47
+ cp_byte *cp_arr;
48
+ cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
+ return NEW_CHARACTER_SET(self, cp_arr);
50
+ }
51
+
52
// Runs `action` once for every codepoint whose bit is set in `self`, in
// ascending order. Declares the locals `cp` (current codepoint) and `cps`
// (the bitmap of `self`) in the enclosing scope; `action` may use both.
#define FOR_EACH_ACTIVE_CODEPOINT(action) \
  cp_index cp; \
  cp_byte *cps; \
  FETCH_CODEPOINTS(self, cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (TSTBIT(cps, cp)) { \
      action; \
    } \
  }
59
+
60
+ // ***************************
61
+ // `Set` compatibility methods
62
+ // ***************************
63
+
64
+ static inline VALUE
65
+ enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
+ cp_index count;
67
+ count = 0;
68
+ FOR_EACH_ACTIVE_CODEPOINT(count++);
69
+ return LONG2FIX(count);
70
+ }
71
+
72
+ static VALUE
73
+ method_length(VALUE self) {
74
+ return enumerator_length(self, 0, 0);
75
+ }
76
+
77
+ static VALUE
78
+ method_each(VALUE self) {
79
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
80
+ FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
+ return self;
82
+ }
83
+
84
+ // returns an Array of codepoint Integers by default.
85
+ // returns an Array of Strings of length 1 if passed `true`.
86
+ static VALUE
87
+ method_to_a(int argc, VALUE *argv, VALUE self) {
88
+ VALUE arr;
89
+ rb_encoding *enc;
90
+ rb_check_arity(argc, 0, 1);
91
+
92
+ arr = rb_ary_new();
93
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
94
+ FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
+ }
96
+ else {
97
+ enc = rb_utf8_encoding();
98
+ FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
+ }
100
+
101
+ return arr;
102
+ }
103
+
104
+ static VALUE
105
+ method_empty_p(VALUE self) {
106
+ FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
+ return Qtrue;
108
+ }
109
+
110
+ static VALUE
111
+ method_hash(VALUE self) {
112
+ cp_index cp, hash, four_byte_value;
113
+ cp_byte *cps;
114
+ FETCH_CODEPOINTS(self, cps);
115
+
116
+ hash = 17;
117
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
+ if (cp % 32 == 0) {
119
+ if (cp != 0) { hash = hash * 23 + four_byte_value; }
120
+ four_byte_value = 0;
121
+ }
122
+ if (TSTBIT(cps, cp)) four_byte_value++;
123
+ }
124
+
125
+ return LONG2FIX(hash);
126
+ }
127
+
128
+ static inline VALUE
129
+ delete_if_block_result(VALUE self, int truthy) {
130
+ VALUE result;
131
+ rb_need_block();
132
+ rb_check_frozen(self);
133
+ FOR_EACH_ACTIVE_CODEPOINT(
134
+ result = rb_yield(LONG2FIX(cp));
135
+ if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
+ );
137
+ return self;
138
+ }
139
+
140
+ static VALUE
141
+ method_delete_if(VALUE self) {
142
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
+ return delete_if_block_result(self, 1);
144
+ }
145
+
146
+ static VALUE
147
+ method_keep_if(VALUE self) {
148
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
+ return delete_if_block_result(self, 0);
150
+ }
151
+
152
+ static VALUE
153
+ method_clear(VALUE self) {
154
+ cp_index cp;
155
+ cp_byte *cps;
156
+ rb_check_frozen(self);
157
+ FETCH_CODEPOINTS(self, cps);
158
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
+ CLRBIT(cps, cp);
160
+ }
161
+ return self;
162
+ }
163
+
164
// Builds and returns a new set of the same class as `self` that contains
// every codepoint `cp` for which `condition` holds. Expects `self` and
// `other` in scope; `condition` may refer to `cp`, `a` (bitmap of self) and
// `b` (bitmap of other).
// An `other` of 0 means "no second operand": `b` then stays NULL and must
// not be used by `condition`.
// Raises NoMemoryError if the new bitmap cannot be allocated.
#define RETURN_NEW_SET_BASED_ON(condition) \
  cp_index cp; \
  cp_byte *a, *b = NULL, *new_cps; \
  FETCH_CODEPOINTS(self, a); \
  if (other) FETCH_CODEPOINTS(other, b); \
  new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte)); \
  if (new_cps == NULL) rb_memerror(); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (condition) SETBIT(new_cps, cp); \
  } \
  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
174
+
175
+ static VALUE
176
+ method_intersection(VALUE self, VALUE other) {
177
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
178
+ }
179
+
180
+ static VALUE
181
+ method_exclusion(VALUE self, VALUE other) {
182
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
183
+ }
184
+
185
+ static VALUE
186
+ method_union(VALUE self, VALUE other) {
187
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
188
+ }
189
+
190
+ static VALUE
191
+ method_difference(VALUE self, VALUE other) {
192
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
193
+ }
194
+
195
+ static VALUE
196
+ method_include_p(VALUE self, VALUE num) {
197
+ cp_byte *cps;
198
+ FETCH_CODEPOINTS(self, cps);
199
+ return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
+ }
201
+
202
+ static inline int
203
+ toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
+ cp_index cp;
205
+ cp_byte *cps;
206
+ rb_check_frozen(set);
207
+ FETCH_CODEPOINTS(set, cps);
208
+ cp = FIX2ULONG(cp_num);
209
+ if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
+ return 0;
211
+ }
212
+ else {
213
+ if (on) { SETBIT(cps, cp); }
214
+ else { CLRBIT(cps, cp); }
215
+ return 1;
216
+ }
217
+ }
218
+
219
+ static VALUE
220
+ method_add(VALUE self, VALUE cp_num) {
221
+ return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
222
+ }
223
+
224
+ static VALUE
225
+ method_add_p(VALUE self, VALUE cp_num) {
226
+ return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
227
+ }
228
+
229
+ static VALUE
230
+ method_delete(VALUE self, VALUE cp_num) {
231
+ return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
232
+ }
233
+
234
+ static VALUE
235
+ method_delete_p(VALUE self, VALUE cp_num) {
236
+ return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
237
+ }
238
+
239
// Runs `action` for every codepoint `cp` from 0 up to (not including)
// UNICODE_CP_COUNT. Declares `cp`, `cps` (bitmap of `self`) and `other_cps`
// (bitmap of `other`) in the enclosing scope. Fetching the bitmaps
// type-checks both objects.
#define COMPARE_SETS(action) \
  cp_index cp; \
  cp_byte *cps, *other_cps; \
  FETCH_CODEPOINTS(self, cps); \
  FETCH_CODEPOINTS(other, other_cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    action; \
  }
245
+
246
+ static VALUE
247
+ method_intersect_p(VALUE self, VALUE other) {
248
+ COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
249
+ return Qfalse;
250
+ }
251
+
252
+ static VALUE
253
+ method_disjoint_p(VALUE self, VALUE other) {
254
+ return method_intersect_p(self, other) ? Qfalse : Qtrue;
255
+ }
256
+
257
+ static inline int
258
+ is_character_set(VALUE obj) {
259
+ return rb_typeddata_is_kind_of(obj, &character_set_type);
260
+ }
261
+
262
+ static VALUE
263
+ method_eql_p(VALUE self, VALUE other) {
264
+ if (!is_character_set(other)) return Qfalse;
265
+ if (self == other) return Qtrue; // same object_id
266
+
267
+ COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
+
269
+ return Qtrue;
270
+ }
271
+
272
+ static inline VALUE
273
+ merge_character_set(VALUE self, VALUE other) {
274
+ COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
+ return self;
276
+ }
277
+
278
+ static inline void
279
+ raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
281
+ rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
+ }
283
+
284
+ static inline VALUE
285
+ merge_rb_range(VALUE self, VALUE rb_range) {
286
+ VALUE from_id, upto_id;
287
+ int excl;
288
+ cp_index cp;
289
+ cp_byte *cps;
290
+ FETCH_CODEPOINTS(self, cps);
291
+
292
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
293
+ rb_raise(rb_eArgError, "pass a Range");
294
+ }
295
+ if (excl) upto_id -= 2;
296
+
297
+ raise_arg_err_unless_valid_as_cp(from_id);
298
+ raise_arg_err_unless_valid_as_cp(upto_id);
299
+
300
+ for (/* */; from_id <= upto_id; from_id += 2) {
301
+ cp = FIX2ULONG(from_id);
302
+ SETBIT(cps, cp);
303
+ }
304
+ return self;
305
+ }
306
+
307
+ static inline VALUE
308
+ merge_rb_array(VALUE self, VALUE rb_array) {
309
+ VALUE el;
310
+ cp_byte *cps;
311
+ VALUE array_length, i;
312
+ FETCH_CODEPOINTS(self, cps);
313
+ Check_Type(rb_array, T_ARRAY);
314
+ array_length = RARRAY_LEN(rb_array);
315
+ for (i = 0; i < array_length; i++) {
316
+ el = RARRAY_AREF(rb_array, i);
317
+ raise_arg_err_unless_valid_as_cp(el);
318
+ SETBIT(cps, FIX2ULONG(el));
319
+ }
320
+ return self;
321
+ }
322
+
323
+ static VALUE
324
+ method_merge(VALUE self, VALUE other) {
325
+ rb_check_frozen(self);
326
+ if (is_character_set(other)) {
327
+ return merge_character_set(self, other);
328
+ }
329
+ else if (TYPE(other) == T_ARRAY) {
330
+ return merge_rb_array(self, other);
331
+ }
332
+ return merge_rb_range(self, other);
333
+ }
334
+
335
+ static VALUE
336
+ method_initialize_copy(VALUE self, VALUE other) {
337
+ merge_character_set(self, other);
338
+ return other;
339
+ }
340
+
341
+ static VALUE
342
+ method_subtract(VALUE self, VALUE other) {
343
+ rb_check_frozen(self);
344
+ COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
345
+ return self;
346
+ }
347
+
348
+ static inline int
349
+ a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
+ cp_byte *cps_a, *cps_b;
351
+ cp_index cp, size_a, size_b;
352
+
353
+ if (!is_character_set(set_a) || !is_character_set(set_b)) {
354
+ rb_raise(rb_eArgError, "pass a CharacterSet");
355
+ }
356
+
357
+ FETCH_CODEPOINTS(set_a, cps_a);
358
+ FETCH_CODEPOINTS(set_b, cps_b);
359
+
360
+ *is_proper = 0;
361
+ size_a = 0;
362
+ size_b = 0;
363
+
364
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
+ if (TSTBIT(cps_a, cp)) {
366
+ if (!TSTBIT(cps_b, cp)) return 0;
367
+ size_a++;
368
+ size_b++;
369
+ }
370
+ else if (TSTBIT(cps_b, cp)) size_b++;
371
+ }
372
+
373
+ if (size_b > size_a) *is_proper = 1;
374
+ return 1;
375
+ }
376
+
377
+ static VALUE
378
+ method_subset_p(VALUE self, VALUE other) {
379
+ int is_proper;
380
+ return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
381
+ }
382
+
383
+ static VALUE
384
+ method_proper_subset_p(VALUE self, VALUE other) {
385
+ int is, is_proper;
386
+ is = a_subset_of_b(self, other, &is_proper);
387
+ return (is && is_proper) ? Qtrue : Qfalse;
388
+ }
389
+
390
+ static VALUE
391
+ method_superset_p(VALUE self, VALUE other) {
392
+ int is_proper;
393
+ return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
394
+ }
395
+
396
+ static VALUE
397
+ method_proper_superset_p(VALUE self, VALUE other) {
398
+ int is, is_proper;
399
+ is = a_subset_of_b(other, self, &is_proper);
400
+ return (is && is_proper) ? Qtrue : Qfalse;
401
+ }
402
+
403
+ // *******************************
404
+ // `CharacterSet`-specific methods
405
+ // *******************************
406
+
407
+ static VALUE
408
+ class_method_from_ranges(VALUE self, VALUE ranges) {
409
+ VALUE new_set, range_count, i;
410
+ new_set = rb_class_new_instance(0, 0, self);
411
+ range_count = RARRAY_LEN(ranges);
412
+ for (i = 0; i < range_count; i++) {
413
+ merge_rb_range(new_set, RARRAY_AREF(ranges, i));
414
+ }
415
+ return new_set;
416
+ }
417
+
418
+ static VALUE
419
+ method_ranges(VALUE self) {
420
+ VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
421
+
422
+ ranges = rb_ary_new();
423
+ previous_codepoint = 0;
424
+ current_start = 0;
425
+ current_end = 0;
426
+
427
+ FOR_EACH_ACTIVE_CODEPOINT(
428
+ codepoint = LONG2FIX(cp);
429
+
430
+ if (!previous_codepoint) {
431
+ current_start = codepoint;
432
+ }
433
+ else if (previous_codepoint + 2 != codepoint) {
434
+ // gap found, finalize previous range
435
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
+ current_start = codepoint;
437
+ }
438
+ current_end = codepoint;
439
+ previous_codepoint = codepoint;
440
+ );
441
+
442
+ // add final range
443
+ if (current_start) {
444
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
+ }
446
+
447
+ return ranges;
448
+ }
449
+
450
+ static VALUE
451
+ method_sample(int argc, VALUE *argv, VALUE self) {
452
+ VALUE to_a_args[1], array;
453
+ rb_check_arity(argc, 0, 1);
454
+ to_a_args[0] = Qtrue;
455
+ array = method_to_a(1, to_a_args, self);
456
+ return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
+ }
458
+
459
+ static inline VALUE
460
+ new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
+ cp_byte *cps, *new_cps;
462
+ cp_index cp;
463
+ FETCH_CODEPOINTS(set, cps);
464
+ new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
+ for (cp = from; cp <= upto; cp++) {
466
+ if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
467
+ }
468
+ return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
469
+ }
470
+
471
+ static VALUE
472
+ method_bmp_part(VALUE self) {
473
+ return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
474
+ }
475
+
476
+ static VALUE
477
+ method_astral_part(VALUE self) {
478
+ return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
479
+ }
480
+
481
+ static inline VALUE
482
+ set_has_member_in_plane(VALUE set, unsigned int plane) {
483
+ cp_byte *cps;
484
+ cp_index cp, max_cp;
485
+ FETCH_CODEPOINTS(set, cps);
486
+ cp = plane * UNICODE_PLANE_SIZE;
487
+ max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
+ for (/* */; cp <= max_cp; cp++) {
489
+ if (TSTBIT(cps, cp)) return Qtrue;
490
+ }
491
+ return Qfalse;
492
+ }
493
+
494
+ static VALUE
495
+ method_planes(VALUE self) {
496
+ unsigned int i;
497
+ VALUE planes;
498
+ planes = rb_ary_new();
499
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
+ if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
501
+ }
502
+ return planes;
503
+ }
504
+
505
+ static VALUE
506
+ method_member_in_plane_p(VALUE self, VALUE plane_num) {
507
+ int plane;
508
+ Check_Type(plane_num, T_FIXNUM);
509
+ plane = FIX2INT(plane_num);
510
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
+ rb_raise(rb_eArgError, "plane must be between 0 and 16");
512
+ }
513
+ return set_has_member_in_plane(self, plane);
514
+ }
515
+
516
+ #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
517
+
518
+ static VALUE
519
+ method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
+ int include_surrogates;
521
+ cp_index upto;
522
+ VALUE other;
523
+ other = 0;
524
+ rb_check_arity(argc, 0, 2);
525
+ include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
+ if ((argc > 1) && FIXNUM_P(argv[1])) {
527
+ upto = FIX2ULONG(argv[1]);
528
+ RETURN_NEW_SET_BASED_ON(
529
+ cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
+ );
531
+ }
532
+ RETURN_NEW_SET_BASED_ON(
533
+ !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
+ );
535
+ }
536
+
537
+ typedef int(*str_cp_handler)(unsigned int, cp_byte*);
538
+
539
+ static inline int
540
+ add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
+ SETBIT(cp_arr, str_cp);
542
+ return 1;
543
+ }
544
+
545
+ static VALUE
546
+ method_case_insensitive(VALUE self) {
547
+ cp_index i;
548
+ cp_byte *new_cps;
549
+
550
+ new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
551
+
552
+ FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
553
+
554
+ for (i = 0; i < CASEFOLD_COUNT; i++) {
555
+ casefold_mapping m = unicode_casefold_table[i];
556
+
557
+ if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
+ else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
559
+ }
560
+
561
+ return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
562
+
563
+ // OnigCaseFoldType flags;
564
+ // rb_encoding *enc;
565
+ //
566
+ // enc = rb_utf8_encoding();
567
+ //
568
+ // ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE (not public on ruby < 2.4)
569
+ // flags = (1<<13) | (1<<14);
570
+ //
571
+ // // case_map args: flags, pp, end, to, to_end, enc
572
+ // enc->case_map(flags, (const OnigUChar**)&cp, ?, ?, ?, enc);
573
+ }
574
+
575
+ static inline VALUE
576
+ each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
+ long i;
578
+ unsigned int str_cp;
579
+
580
+ for (i = 0; i < RSTRING_LEN(str); i++) {
581
+ str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
+ if (!(*func)(str_cp, cp_arr)) return Qfalse;
583
+ }
584
+
585
+ return Qtrue;
586
+ }
587
+
588
+ static inline VALUE
589
+ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
590
+ int n;
591
+ unsigned int str_cp;
592
+ const char *ptr, *end;
593
+ rb_encoding *enc;
594
+
595
+ str = rb_str_new_frozen(str);
596
+ ptr = RSTRING_PTR(str);
597
+ end = RSTRING_END(str);
598
+ enc = rb_enc_get(str);
599
+
600
+ while (ptr < end) {
601
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
+ if (!(*func)(str_cp, cp_arr)) return Qfalse;
603
+ ptr += n;
604
+ }
605
+
606
+ return Qtrue;
607
+ }
608
+
609
+ // single_byte_optimizable - copied from string.c
610
+ static inline int
611
+ single_byte_optimizable(VALUE str)
612
+ {
613
+ rb_encoding *enc;
614
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
615
+
616
+ enc = rb_enc_get(str);
617
+ if (rb_enc_mbmaxlen(enc) == 1) return 1;
618
+
619
+ return 0;
620
+ }
621
+
622
+ static inline VALUE
623
+ each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
+ if (single_byte_optimizable(str)) {
625
+ return each_sb_cp(str, func, cp_arr);
626
+ }
627
+ return each_mb_cp(str, func, cp_arr);
628
+ }
629
+
630
+ static inline void
631
+ raise_arg_err_unless_string(VALUE val) {
632
+ if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
633
+ }
634
+
635
+ static VALUE
636
+ class_method_of(VALUE self, VALUE str) {
637
+ cp_byte *cp_arr;
638
+ raise_arg_err_unless_string(str);
639
+ cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
+ each_cp(str, add_str_cp_to_arr, cp_arr);
641
+ return NEW_CHARACTER_SET(self, cp_arr);
642
+ }
643
+
644
+ static inline int
645
+ str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
+ return !TSTBIT(cp_arr, str_cp);
647
+ }
648
+
649
+ static VALUE
650
+ method_used_by_p(VALUE self, VALUE str) {
651
+ cp_byte *cps;
652
+ VALUE only_uses_other_cps;
653
+ raise_arg_err_unless_string(str);
654
+ FETCH_CODEPOINTS(self, cps);
655
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
657
+ }
658
+
659
+ static inline int
660
+ str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
+ return TSTBIT(cp_arr, str_cp);
662
+ }
663
+
664
+ static VALUE
665
+ method_cover_p(VALUE self, VALUE str) {
666
+ cp_byte *cps;
667
+ raise_arg_err_unless_string(str);
668
+ FETCH_CODEPOINTS(self, cps);
669
+ return each_cp(str, str_cp_in_arr, cps);
670
+ }
671
+
672
+ static inline VALUE
673
+ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
+ cp_byte *cps;
675
+ rb_encoding *str_enc;
676
+ VALUE orig_len, blen, new_str_buf, chr;
677
+ int n;
678
+ unsigned int str_cp;
679
+ const char *ptr, *end;
680
+
681
+ raise_arg_err_unless_string(str);
682
+
683
+ FETCH_CODEPOINTS(set, cps);
684
+
685
+ orig_len = RSTRING_LEN(str);
686
+ blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
+ new_str_buf = rb_str_buf_new(blen);
688
+ str_enc = rb_enc_get(str);
689
+ rb_enc_associate(new_str_buf, str_enc);
690
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
+ ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
+
693
+ ptr = RSTRING_PTR(str);
694
+ end = RSTRING_END(str);
695
+
696
+ while (ptr < end) {
697
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
+ if (!TSTBIT(cps, str_cp) != !delete) {
699
+ chr = rb_enc_uint_chr(str_cp, str_enc);
700
+ rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
701
+ }
702
+ ptr += n;
703
+ }
704
+
705
+ if (bang) {
706
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
707
+ rb_str_shared_replace(str, new_str_buf);
708
+ }
709
+ else {
710
+ RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
+ // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
+ RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
713
+ str = new_str_buf;
714
+ }
715
+
716
+ return str;
717
+ }
718
+
719
+ static VALUE
720
+ method_delete_in(VALUE self, VALUE str) {
721
+ return apply_to_str(self, str, 1, 0);
722
+ }
723
+
724
+ static VALUE
725
+ method_delete_in_bang(VALUE self, VALUE str) {
726
+ return apply_to_str(self, str, 1, 1);
727
+ }
728
+
729
+ static VALUE
730
+ method_keep_in(VALUE self, VALUE str) {
731
+ return apply_to_str(self, str, 0, 0);
732
+ }
733
+
734
+ static VALUE
735
+ method_keep_in_bang(VALUE self, VALUE str) {
736
+ return apply_to_str(self, str, 0, 1);
737
+ }
738
+
739
+ // ****
740
+ // init
741
+ // ****
742
+
743
+ void
744
+ Init_character_set()
745
+ {
746
+ VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
+
748
+ rb_define_alloc_func(cs, method_allocate);
749
+
750
+ // `Set` compatibility methods
751
+
752
+ rb_define_method(cs, "each", method_each, 0);
753
+ rb_define_method(cs, "to_a", method_to_a, -1);
754
+ rb_define_method(cs, "length", method_length, 0);
755
+ rb_define_method(cs, "size", method_length, 0);
756
+ rb_define_method(cs, "count", method_length, 0);
757
+ rb_define_method(cs, "empty?", method_empty_p, 0);
758
+ rb_define_method(cs, "hash", method_hash, 0);
759
+ rb_define_method(cs, "keep_if", method_keep_if, 0);
760
+ rb_define_method(cs, "delete_if", method_delete_if, 0);
761
+ rb_define_method(cs, "clear", method_clear, 0);
762
+ rb_define_method(cs, "intersection", method_intersection, 1);
763
+ rb_define_method(cs, "&", method_intersection, 1);
764
+ rb_define_method(cs, "union", method_union, 1);
765
+ rb_define_method(cs, "+", method_union, 1);
766
+ rb_define_method(cs, "|", method_union, 1);
767
+ rb_define_method(cs, "difference", method_difference, 1);
768
+ rb_define_method(cs, "-", method_difference, 1);
769
+ rb_define_method(cs, "^", method_exclusion, 1);
770
+ rb_define_method(cs, "include?", method_include_p, 1);
771
+ rb_define_method(cs, "member?", method_include_p, 1);
772
+ rb_define_method(cs, "===", method_include_p, 1);
773
+ rb_define_method(cs, "add", method_add, 1);
774
+ rb_define_method(cs, "<<", method_add, 1);
775
+ rb_define_method(cs, "add?", method_add_p, 1);
776
+ rb_define_method(cs, "delete", method_delete, 1);
777
+ rb_define_method(cs, "delete?", method_delete_p, 1);
778
+ rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
+ rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
+ rb_define_method(cs, "eql?", method_eql_p, 1);
781
+ rb_define_method(cs, "==", method_eql_p, 1);
782
+ rb_define_method(cs, "merge", method_merge, 1);
783
+ rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
+ rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
+ rb_define_method(cs, "subtract", method_subtract, 1);
786
+ rb_define_method(cs, "subset?", method_subset_p, 1);
787
+ rb_define_method(cs, "<=", method_subset_p, 1);
788
+ rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
+ rb_define_method(cs, "<", method_proper_subset_p, 1);
790
+ rb_define_method(cs, "superset?", method_superset_p, 1);
791
+ rb_define_method(cs, ">=", method_superset_p, 1);
792
+ rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
+ rb_define_method(cs, ">", method_proper_superset_p, 1);
794
+
795
+ // `CharacterSet`-specific methods
796
+
797
+ rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
+ rb_define_singleton_method(cs, "of", class_method_of, 1);
799
+
800
+ rb_define_method(cs, "ranges", method_ranges, 0);
801
+ rb_define_method(cs, "sample", method_sample, -1);
802
+ rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
+ rb_define_method(cs, "astral_part", method_astral_part, 0);
804
+ rb_define_method(cs, "planes", method_planes, 0);
805
+ rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
+ rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
+ rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
+ rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
+ rb_define_method(cs, "cover?", method_cover_p, 1);
810
+ rb_define_method(cs, "delete_in", method_delete_in, 1);
811
+ rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
+ rb_define_method(cs, "keep_in", method_keep_in, 1);
813
+ rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
814
+ }