character_set 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,814 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include "unicode_casefold_table.h"
4
+
5
/*
 * Bit helpers for a bitmap stored as an array of bytes: bit N lives in
 * byte N / 8 at position N % 8.
 *
 * Every argument is parenthesized so compound expressions such as
 * `base | offset` can be passed safely. `bit` is evaluated twice, so it
 * must not have side effects.
 */
#define SETBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] |= (1 << ((bit) & 0x07)))
#define CLRBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] &= ~(1 << ((bit) & 0x07)))
#define TSTBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] & (1 << ((bit) & 0x07)))
8
+
9
/*
 * One byte of the codepoint bitmap. Declared unsigned so that setting and
 * testing bit 7 does not depend on the implementation-defined signedness
 * of plain `char`.
 */
typedef unsigned char cp_byte;
/* A codepoint number; also used as the bit index into the bitmap. */
typedef unsigned long cp_index;
11
+
12
/* Number of Unicode codepoints (U+0000..U+10FFFF). */
#define UNICODE_CP_COUNT 0x110000
/* Size in bytes of a bitmap with one bit per codepoint. */
#define UNICODE_BYTES (UNICODE_CP_COUNT / 8)
/* Number of codepoints in one Unicode plane. */
#define UNICODE_PLANE_SIZE 0x10000
/* Number of Unicode planes (0..16). */
#define UNICODE_PLANE_COUNT (UNICODE_CP_COUNT / UNICODE_PLANE_SIZE)
16
+
17
/* dfree callback of character_set_type: releases the codepoint bitmap. */
static void
free_character_set(void* codepoints) {
  void *bitmap = codepoints;
  free(bitmap);
}
21
+
22
+ static size_t
23
+ memsize_character_set(const void* codepoints) {
24
+ return sizeof(cp_byte) * UNICODE_BYTES;
25
+ }
26
+
27
+ static const rb_data_type_t
28
+ character_set_type = {
29
+ .wrap_struct_name = "character_set",
30
+ .function = {
31
+ .dmark = NULL,
32
+ .dfree = free_character_set,
33
+ .dsize = memsize_character_set,
34
+ },
35
+ .data = NULL,
36
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
+ };
38
+
39
/* Stores the bitmap wrapped by `set` into `cps`, type-checking `set` against character_set_type. */
#define FETCH_CODEPOINTS(set, cps) TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)

/* Wraps the bitmap `cps` in a new object of class `klass`. */
#define NEW_CHARACTER_SET(klass, cps) TypedData_Wrap_Struct(klass, &character_set_type, cps)
44
+
45
+ static VALUE
46
+ method_allocate(VALUE self) {
47
+ cp_byte *cp_arr;
48
+ cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
+ return NEW_CHARACTER_SET(self, cp_arr);
50
+ }
51
+
52
/*
 * Runs `action` once for every codepoint contained in `self`, in ascending
 * order. Declares `cp` (the current codepoint) and `cps` (the bitmap of
 * `self`) in the enclosing scope; both stay usable after the loop.
 */
#define FOR_EACH_ACTIVE_CODEPOINT(action)            \
  cp_index cp;                                       \
  cp_byte *cps;                                      \
  FETCH_CODEPOINTS(self, cps);                       \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {        \
    if (!TSTBIT(cps, cp)) continue;                  \
    action;                                          \
  }
59
+
60
+ // ***************************
61
+ // `Set` compatibility methods
62
+ // ***************************
63
+
64
+ static inline VALUE
65
+ enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
+ cp_index count;
67
+ count = 0;
68
+ FOR_EACH_ACTIVE_CODEPOINT(count++);
69
+ return LONG2FIX(count);
70
+ }
71
+
72
+ static VALUE
73
+ method_length(VALUE self) {
74
+ return enumerator_length(self, 0, 0);
75
+ }
76
+
77
+ static VALUE
78
+ method_each(VALUE self) {
79
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
80
+ FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
+ return self;
82
+ }
83
+
84
+ // returns an Array of codepoint Integers by default.
85
+ // returns an Array of Strings of length 1 if passed `true`.
86
+ static VALUE
87
+ method_to_a(int argc, VALUE *argv, VALUE self) {
88
+ VALUE arr;
89
+ rb_encoding *enc;
90
+ rb_check_arity(argc, 0, 1);
91
+
92
+ arr = rb_ary_new();
93
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
94
+ FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
+ }
96
+ else {
97
+ enc = rb_utf8_encoding();
98
+ FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
+ }
100
+
101
+ return arr;
102
+ }
103
+
104
+ static VALUE
105
+ method_empty_p(VALUE self) {
106
+ FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
+ return Qtrue;
108
+ }
109
+
110
+ static VALUE
111
+ method_hash(VALUE self) {
112
+ cp_index cp, hash, four_byte_value;
113
+ cp_byte *cps;
114
+ FETCH_CODEPOINTS(self, cps);
115
+
116
+ hash = 17;
117
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
+ if (cp % 32 == 0) {
119
+ if (cp != 0) { hash = hash * 23 + four_byte_value; }
120
+ four_byte_value = 0;
121
+ }
122
+ if (TSTBIT(cps, cp)) four_byte_value++;
123
+ }
124
+
125
+ return LONG2FIX(hash);
126
+ }
127
+
128
+ static inline VALUE
129
+ delete_if_block_result(VALUE self, int truthy) {
130
+ VALUE result;
131
+ rb_need_block();
132
+ rb_check_frozen(self);
133
+ FOR_EACH_ACTIVE_CODEPOINT(
134
+ result = rb_yield(LONG2FIX(cp));
135
+ if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
+ );
137
+ return self;
138
+ }
139
+
140
+ static VALUE
141
+ method_delete_if(VALUE self) {
142
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
+ return delete_if_block_result(self, 1);
144
+ }
145
+
146
+ static VALUE
147
+ method_keep_if(VALUE self) {
148
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
+ return delete_if_block_result(self, 0);
150
+ }
151
+
152
+ static VALUE
153
+ method_clear(VALUE self) {
154
+ cp_index cp;
155
+ cp_byte *cps;
156
+ rb_check_frozen(self);
157
+ FETCH_CODEPOINTS(self, cps);
158
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
+ CLRBIT(cps, cp);
160
+ }
161
+ return self;
162
+ }
163
+
164
/*
 * Builds and returns a new set of self's class containing every codepoint
 * `cp` for which `condition` holds. Inside `condition`, `a` is the bitmap
 * of `self` and `b` the bitmap of `other`; `b` is only assigned when
 * `other` is non-zero. Expects `self` and `other` in the enclosing scope.
 * Raises NoMemoryError (via rb_memerror) when the new bitmap cannot be
 * allocated.
 */
#define RETURN_NEW_SET_BASED_ON(condition)\
  cp_index cp;\
  cp_byte *a, *b, *new_cps;\
  FETCH_CODEPOINTS(self, a);\
  if (other) FETCH_CODEPOINTS(other, b);\
  new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
  if (!new_cps) rb_memerror();\
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
    if (condition) SETBIT(new_cps, cp);\
  }\
  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
174
+
175
+ static VALUE
176
+ method_intersection(VALUE self, VALUE other) {
177
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
178
+ }
179
+
180
+ static VALUE
181
+ method_exclusion(VALUE self, VALUE other) {
182
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
183
+ }
184
+
185
+ static VALUE
186
+ method_union(VALUE self, VALUE other) {
187
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
188
+ }
189
+
190
+ static VALUE
191
+ method_difference(VALUE self, VALUE other) {
192
+ RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
193
+ }
194
+
195
+ static VALUE
196
+ method_include_p(VALUE self, VALUE num) {
197
+ cp_byte *cps;
198
+ FETCH_CODEPOINTS(self, cps);
199
+ return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
+ }
201
+
202
+ static inline int
203
+ toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
+ cp_index cp;
205
+ cp_byte *cps;
206
+ rb_check_frozen(set);
207
+ FETCH_CODEPOINTS(set, cps);
208
+ cp = FIX2ULONG(cp_num);
209
+ if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
+ return 0;
211
+ }
212
+ else {
213
+ if (on) { SETBIT(cps, cp); }
214
+ else { CLRBIT(cps, cp); }
215
+ return 1;
216
+ }
217
+ }
218
+
219
+ static VALUE
220
+ method_add(VALUE self, VALUE cp_num) {
221
+ return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
222
+ }
223
+
224
+ static VALUE
225
+ method_add_p(VALUE self, VALUE cp_num) {
226
+ return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
227
+ }
228
+
229
+ static VALUE
230
+ method_delete(VALUE self, VALUE cp_num) {
231
+ return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
232
+ }
233
+
234
+ static VALUE
235
+ method_delete_p(VALUE self, VALUE cp_num) {
236
+ return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
237
+ }
238
+
239
/*
 * Runs `action` for every codepoint number `cp`, with `cps` bound to the
 * bitmap of `self` and `other_cps` to the bitmap of `other`. Declares
 * `cp`, `cps` and `other_cps` in the enclosing scope.
 */
#define COMPARE_SETS(action)                         \
  cp_index cp;                                       \
  cp_byte *cps, *other_cps;                          \
  FETCH_CODEPOINTS(self, cps);                       \
  FETCH_CODEPOINTS(other, other_cps);                \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {        \
    action;                                          \
  }
245
+
246
+ static VALUE
247
+ method_intersect_p(VALUE self, VALUE other) {
248
+ COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
249
+ return Qfalse;
250
+ }
251
+
252
+ static VALUE
253
+ method_disjoint_p(VALUE self, VALUE other) {
254
+ return method_intersect_p(self, other) ? Qfalse : Qtrue;
255
+ }
256
+
257
+ static inline int
258
+ is_character_set(VALUE obj) {
259
+ return rb_typeddata_is_kind_of(obj, &character_set_type);
260
+ }
261
+
262
+ static VALUE
263
+ method_eql_p(VALUE self, VALUE other) {
264
+ if (!is_character_set(other)) return Qfalse;
265
+ if (self == other) return Qtrue; // same object_id
266
+
267
+ COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
+
269
+ return Qtrue;
270
+ }
271
+
272
+ static inline VALUE
273
+ merge_character_set(VALUE self, VALUE other) {
274
+ COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
+ return self;
276
+ }
277
+
278
+ static inline void
279
+ raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
281
+ rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
+ }
283
+
284
+ static inline VALUE
285
+ merge_rb_range(VALUE self, VALUE rb_range) {
286
+ VALUE from_id, upto_id;
287
+ int excl;
288
+ cp_index cp;
289
+ cp_byte *cps;
290
+ FETCH_CODEPOINTS(self, cps);
291
+
292
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
293
+ rb_raise(rb_eArgError, "pass a Range");
294
+ }
295
+ if (excl) upto_id -= 2;
296
+
297
+ raise_arg_err_unless_valid_as_cp(from_id);
298
+ raise_arg_err_unless_valid_as_cp(upto_id);
299
+
300
+ for (/* */; from_id <= upto_id; from_id += 2) {
301
+ cp = FIX2ULONG(from_id);
302
+ SETBIT(cps, cp);
303
+ }
304
+ return self;
305
+ }
306
+
307
+ static inline VALUE
308
+ merge_rb_array(VALUE self, VALUE rb_array) {
309
+ VALUE el;
310
+ cp_byte *cps;
311
+ VALUE array_length, i;
312
+ FETCH_CODEPOINTS(self, cps);
313
+ Check_Type(rb_array, T_ARRAY);
314
+ array_length = RARRAY_LEN(rb_array);
315
+ for (i = 0; i < array_length; i++) {
316
+ el = RARRAY_AREF(rb_array, i);
317
+ raise_arg_err_unless_valid_as_cp(el);
318
+ SETBIT(cps, FIX2ULONG(el));
319
+ }
320
+ return self;
321
+ }
322
+
323
+ static VALUE
324
+ method_merge(VALUE self, VALUE other) {
325
+ rb_check_frozen(self);
326
+ if (is_character_set(other)) {
327
+ return merge_character_set(self, other);
328
+ }
329
+ else if (TYPE(other) == T_ARRAY) {
330
+ return merge_rb_array(self, other);
331
+ }
332
+ return merge_rb_range(self, other);
333
+ }
334
+
335
+ static VALUE
336
+ method_initialize_copy(VALUE self, VALUE other) {
337
+ merge_character_set(self, other);
338
+ return other;
339
+ }
340
+
341
+ static VALUE
342
+ method_subtract(VALUE self, VALUE other) {
343
+ rb_check_frozen(self);
344
+ COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
345
+ return self;
346
+ }
347
+
348
+ static inline int
349
+ a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
+ cp_byte *cps_a, *cps_b;
351
+ cp_index cp, size_a, size_b;
352
+
353
+ if (!is_character_set(set_a) || !is_character_set(set_b)) {
354
+ rb_raise(rb_eArgError, "pass a CharacterSet");
355
+ }
356
+
357
+ FETCH_CODEPOINTS(set_a, cps_a);
358
+ FETCH_CODEPOINTS(set_b, cps_b);
359
+
360
+ *is_proper = 0;
361
+ size_a = 0;
362
+ size_b = 0;
363
+
364
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
+ if (TSTBIT(cps_a, cp)) {
366
+ if (!TSTBIT(cps_b, cp)) return 0;
367
+ size_a++;
368
+ size_b++;
369
+ }
370
+ else if (TSTBIT(cps_b, cp)) size_b++;
371
+ }
372
+
373
+ if (size_b > size_a) *is_proper = 1;
374
+ return 1;
375
+ }
376
+
377
+ static VALUE
378
+ method_subset_p(VALUE self, VALUE other) {
379
+ int is_proper;
380
+ return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
381
+ }
382
+
383
+ static VALUE
384
+ method_proper_subset_p(VALUE self, VALUE other) {
385
+ int is, is_proper;
386
+ is = a_subset_of_b(self, other, &is_proper);
387
+ return (is && is_proper) ? Qtrue : Qfalse;
388
+ }
389
+
390
+ static VALUE
391
+ method_superset_p(VALUE self, VALUE other) {
392
+ int is_proper;
393
+ return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
394
+ }
395
+
396
+ static VALUE
397
+ method_proper_superset_p(VALUE self, VALUE other) {
398
+ int is, is_proper;
399
+ is = a_subset_of_b(other, self, &is_proper);
400
+ return (is && is_proper) ? Qtrue : Qfalse;
401
+ }
402
+
403
+ // *******************************
404
+ // `CharacterSet`-specific methods
405
+ // *******************************
406
+
407
+ static VALUE
408
+ class_method_from_ranges(VALUE self, VALUE ranges) {
409
+ VALUE new_set, range_count, i;
410
+ new_set = rb_class_new_instance(0, 0, self);
411
+ range_count = RARRAY_LEN(ranges);
412
+ for (i = 0; i < range_count; i++) {
413
+ merge_rb_range(new_set, RARRAY_AREF(ranges, i));
414
+ }
415
+ return new_set;
416
+ }
417
+
418
+ static VALUE
419
+ method_ranges(VALUE self) {
420
+ VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
421
+
422
+ ranges = rb_ary_new();
423
+ previous_codepoint = 0;
424
+ current_start = 0;
425
+ current_end = 0;
426
+
427
+ FOR_EACH_ACTIVE_CODEPOINT(
428
+ codepoint = LONG2FIX(cp);
429
+
430
+ if (!previous_codepoint) {
431
+ current_start = codepoint;
432
+ }
433
+ else if (previous_codepoint + 2 != codepoint) {
434
+ // gap found, finalize previous range
435
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
+ current_start = codepoint;
437
+ }
438
+ current_end = codepoint;
439
+ previous_codepoint = codepoint;
440
+ );
441
+
442
+ // add final range
443
+ if (current_start) {
444
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
+ }
446
+
447
+ return ranges;
448
+ }
449
+
450
+ static VALUE
451
+ method_sample(int argc, VALUE *argv, VALUE self) {
452
+ VALUE to_a_args[1], array;
453
+ rb_check_arity(argc, 0, 1);
454
+ to_a_args[0] = Qtrue;
455
+ array = method_to_a(1, to_a_args, self);
456
+ return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
+ }
458
+
459
+ static inline VALUE
460
+ new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
+ cp_byte *cps, *new_cps;
462
+ cp_index cp;
463
+ FETCH_CODEPOINTS(set, cps);
464
+ new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
+ for (cp = from; cp <= upto; cp++) {
466
+ if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
467
+ }
468
+ return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
469
+ }
470
+
471
+ static VALUE
472
+ method_bmp_part(VALUE self) {
473
+ return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
474
+ }
475
+
476
+ static VALUE
477
+ method_astral_part(VALUE self) {
478
+ return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
479
+ }
480
+
481
+ static inline VALUE
482
+ set_has_member_in_plane(VALUE set, unsigned int plane) {
483
+ cp_byte *cps;
484
+ cp_index cp, max_cp;
485
+ FETCH_CODEPOINTS(set, cps);
486
+ cp = plane * UNICODE_PLANE_SIZE;
487
+ max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
+ for (/* */; cp <= max_cp; cp++) {
489
+ if (TSTBIT(cps, cp)) return Qtrue;
490
+ }
491
+ return Qfalse;
492
+ }
493
+
494
+ static VALUE
495
+ method_planes(VALUE self) {
496
+ unsigned int i;
497
+ VALUE planes;
498
+ planes = rb_ary_new();
499
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
+ if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
501
+ }
502
+ return planes;
503
+ }
504
+
505
+ static VALUE
506
+ method_member_in_plane_p(VALUE self, VALUE plane_num) {
507
+ int plane;
508
+ Check_Type(plane_num, T_FIXNUM);
509
+ plane = FIX2INT(plane_num);
510
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
+ rb_raise(rb_eArgError, "plane must be between 0 and 16");
512
+ }
513
+ return set_has_member_in_plane(self, plane);
514
+ }
515
+
516
/*
 * True when `cp` lies outside the surrogate range U+D800..U+DFFF.
 * The argument is parenthesized so compound expressions can be passed; it
 * is evaluated twice, so it must not have side effects.
 */
#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)
517
+
518
+ static VALUE
519
+ method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
+ int include_surrogates;
521
+ cp_index upto;
522
+ VALUE other;
523
+ other = 0;
524
+ rb_check_arity(argc, 0, 2);
525
+ include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
+ if ((argc > 1) && FIXNUM_P(argv[1])) {
527
+ upto = FIX2ULONG(argv[1]);
528
+ RETURN_NEW_SET_BASED_ON(
529
+ cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
+ );
531
+ }
532
+ RETURN_NEW_SET_BASED_ON(
533
+ !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
+ );
535
+ }
536
+
537
+ typedef int(*str_cp_handler)(unsigned int, cp_byte*);
538
+
539
+ static inline int
540
+ add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
+ SETBIT(cp_arr, str_cp);
542
+ return 1;
543
+ }
544
+
545
+ static VALUE
546
+ method_case_insensitive(VALUE self) {
547
+ cp_index i;
548
+ cp_byte *new_cps;
549
+
550
+ new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
551
+
552
+ FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
553
+
554
+ for (i = 0; i < CASEFOLD_COUNT; i++) {
555
+ casefold_mapping m = unicode_casefold_table[i];
556
+
557
+ if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
+ else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
559
+ }
560
+
561
+ return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
562
+
563
+ // OnigCaseFoldType flags;
564
+ // rb_encoding *enc;
565
+ //
566
+ // enc = rb_utf8_encoding();
567
+ //
568
+ // ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE (not public on ruby < 2.4)
569
+ // flags = (1<<13) | (1<<14);
570
+ //
571
+ // // case_map args: flags, pp, end, to, to_end, enc
572
+ // enc->case_map(flags, (const OnigUChar**)&cp, ?, ?, ?, enc);
573
+ }
574
+
575
+ static inline VALUE
576
+ each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
+ long i;
578
+ unsigned int str_cp;
579
+
580
+ for (i = 0; i < RSTRING_LEN(str); i++) {
581
+ str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
+ if (!(*func)(str_cp, cp_arr)) return Qfalse;
583
+ }
584
+
585
+ return Qtrue;
586
+ }
587
+
588
+ static inline VALUE
589
+ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
590
+ int n;
591
+ unsigned int str_cp;
592
+ const char *ptr, *end;
593
+ rb_encoding *enc;
594
+
595
+ str = rb_str_new_frozen(str);
596
+ ptr = RSTRING_PTR(str);
597
+ end = RSTRING_END(str);
598
+ enc = rb_enc_get(str);
599
+
600
+ while (ptr < end) {
601
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
+ if (!(*func)(str_cp, cp_arr)) return Qfalse;
603
+ ptr += n;
604
+ }
605
+
606
+ return Qtrue;
607
+ }
608
+
609
+ // single_byte_optimizable - copied from string.c
610
+ static inline int
611
+ single_byte_optimizable(VALUE str)
612
+ {
613
+ rb_encoding *enc;
614
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
615
+
616
+ enc = rb_enc_get(str);
617
+ if (rb_enc_mbmaxlen(enc) == 1) return 1;
618
+
619
+ return 0;
620
+ }
621
+
622
+ static inline VALUE
623
+ each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
+ if (single_byte_optimizable(str)) {
625
+ return each_sb_cp(str, func, cp_arr);
626
+ }
627
+ return each_mb_cp(str, func, cp_arr);
628
+ }
629
+
630
+ static inline void
631
+ raise_arg_err_unless_string(VALUE val) {
632
+ if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
633
+ }
634
+
635
+ static VALUE
636
+ class_method_of(VALUE self, VALUE str) {
637
+ cp_byte *cp_arr;
638
+ raise_arg_err_unless_string(str);
639
+ cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
+ each_cp(str, add_str_cp_to_arr, cp_arr);
641
+ return NEW_CHARACTER_SET(self, cp_arr);
642
+ }
643
+
644
+ static inline int
645
+ str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
+ return !TSTBIT(cp_arr, str_cp);
647
+ }
648
+
649
+ static VALUE
650
+ method_used_by_p(VALUE self, VALUE str) {
651
+ cp_byte *cps;
652
+ VALUE only_uses_other_cps;
653
+ raise_arg_err_unless_string(str);
654
+ FETCH_CODEPOINTS(self, cps);
655
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
657
+ }
658
+
659
+ static inline int
660
+ str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
+ return TSTBIT(cp_arr, str_cp);
662
+ }
663
+
664
+ static VALUE
665
+ method_cover_p(VALUE self, VALUE str) {
666
+ cp_byte *cps;
667
+ raise_arg_err_unless_string(str);
668
+ FETCH_CODEPOINTS(self, cps);
669
+ return each_cp(str, str_cp_in_arr, cps);
670
+ }
671
+
672
+ static inline VALUE
673
+ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
+ cp_byte *cps;
675
+ rb_encoding *str_enc;
676
+ VALUE orig_len, blen, new_str_buf, chr;
677
+ int n;
678
+ unsigned int str_cp;
679
+ const char *ptr, *end;
680
+
681
+ raise_arg_err_unless_string(str);
682
+
683
+ FETCH_CODEPOINTS(set, cps);
684
+
685
+ orig_len = RSTRING_LEN(str);
686
+ blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
+ new_str_buf = rb_str_buf_new(blen);
688
+ str_enc = rb_enc_get(str);
689
+ rb_enc_associate(new_str_buf, str_enc);
690
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
+ ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
+
693
+ ptr = RSTRING_PTR(str);
694
+ end = RSTRING_END(str);
695
+
696
+ while (ptr < end) {
697
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
+ if (!TSTBIT(cps, str_cp) != !delete) {
699
+ chr = rb_enc_uint_chr(str_cp, str_enc);
700
+ rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
701
+ }
702
+ ptr += n;
703
+ }
704
+
705
+ if (bang) {
706
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
707
+ rb_str_shared_replace(str, new_str_buf);
708
+ }
709
+ else {
710
+ RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
+ // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
+ RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
713
+ str = new_str_buf;
714
+ }
715
+
716
+ return str;
717
+ }
718
+
719
+ static VALUE
720
+ method_delete_in(VALUE self, VALUE str) {
721
+ return apply_to_str(self, str, 1, 0);
722
+ }
723
+
724
+ static VALUE
725
+ method_delete_in_bang(VALUE self, VALUE str) {
726
+ return apply_to_str(self, str, 1, 1);
727
+ }
728
+
729
+ static VALUE
730
+ method_keep_in(VALUE self, VALUE str) {
731
+ return apply_to_str(self, str, 0, 0);
732
+ }
733
+
734
+ static VALUE
735
+ method_keep_in_bang(VALUE self, VALUE str) {
736
+ return apply_to_str(self, str, 0, 1);
737
+ }
738
+
739
+ // ****
740
+ // init
741
+ // ****
742
+
743
+ void
744
+ Init_character_set()
745
+ {
746
+ VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
+
748
+ rb_define_alloc_func(cs, method_allocate);
749
+
750
+ // `Set` compatibility methods
751
+
752
+ rb_define_method(cs, "each", method_each, 0);
753
+ rb_define_method(cs, "to_a", method_to_a, -1);
754
+ rb_define_method(cs, "length", method_length, 0);
755
+ rb_define_method(cs, "size", method_length, 0);
756
+ rb_define_method(cs, "count", method_length, 0);
757
+ rb_define_method(cs, "empty?", method_empty_p, 0);
758
+ rb_define_method(cs, "hash", method_hash, 0);
759
+ rb_define_method(cs, "keep_if", method_keep_if, 0);
760
+ rb_define_method(cs, "delete_if", method_delete_if, 0);
761
+ rb_define_method(cs, "clear", method_clear, 0);
762
+ rb_define_method(cs, "intersection", method_intersection, 1);
763
+ rb_define_method(cs, "&", method_intersection, 1);
764
+ rb_define_method(cs, "union", method_union, 1);
765
+ rb_define_method(cs, "+", method_union, 1);
766
+ rb_define_method(cs, "|", method_union, 1);
767
+ rb_define_method(cs, "difference", method_difference, 1);
768
+ rb_define_method(cs, "-", method_difference, 1);
769
+ rb_define_method(cs, "^", method_exclusion, 1);
770
+ rb_define_method(cs, "include?", method_include_p, 1);
771
+ rb_define_method(cs, "member?", method_include_p, 1);
772
+ rb_define_method(cs, "===", method_include_p, 1);
773
+ rb_define_method(cs, "add", method_add, 1);
774
+ rb_define_method(cs, "<<", method_add, 1);
775
+ rb_define_method(cs, "add?", method_add_p, 1);
776
+ rb_define_method(cs, "delete", method_delete, 1);
777
+ rb_define_method(cs, "delete?", method_delete_p, 1);
778
+ rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
+ rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
+ rb_define_method(cs, "eql?", method_eql_p, 1);
781
+ rb_define_method(cs, "==", method_eql_p, 1);
782
+ rb_define_method(cs, "merge", method_merge, 1);
783
+ rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
+ rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
+ rb_define_method(cs, "subtract", method_subtract, 1);
786
+ rb_define_method(cs, "subset?", method_subset_p, 1);
787
+ rb_define_method(cs, "<=", method_subset_p, 1);
788
+ rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
+ rb_define_method(cs, "<", method_proper_subset_p, 1);
790
+ rb_define_method(cs, "superset?", method_superset_p, 1);
791
+ rb_define_method(cs, ">=", method_superset_p, 1);
792
+ rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
+ rb_define_method(cs, ">", method_proper_superset_p, 1);
794
+
795
+ // `CharacterSet`-specific methods
796
+
797
+ rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
+ rb_define_singleton_method(cs, "of", class_method_of, 1);
799
+
800
+ rb_define_method(cs, "ranges", method_ranges, 0);
801
+ rb_define_method(cs, "sample", method_sample, -1);
802
+ rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
+ rb_define_method(cs, "astral_part", method_astral_part, 0);
804
+ rb_define_method(cs, "planes", method_planes, 0);
805
+ rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
+ rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
+ rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
+ rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
+ rb_define_method(cs, "cover?", method_cover_p, 1);
810
+ rb_define_method(cs, "delete_in", method_delete_in, 1);
811
+ rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
+ rb_define_method(cs, "keep_in", method_keep_in, 1);
813
+ rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
814
+ }