character_set 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
@@ -0,0 +1,814 @@
|
|
1
|
+
#include <string.h>

#include "ruby.h"
#include "ruby/encoding.h"
#include "unicode_casefold_table.h"
|
4
|
+
|
5
|
+
// Bit-array helpers: bit number `bit` lives in byte `bit / 8`, at position
// `bit % 8` counted from the least significant bit.
// Arguments are fully parenthesized so compound expressions may be passed.
// `bit` is still evaluated twice, so it must not have side effects.
#define SETBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] |= (1 << ((bit) & 0x07)))
#define CLRBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] &= ~(1 << ((bit) & 0x07)))
#define TSTBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] & (1 << ((bit) & 0x07)))
|
8
|
+
|
9
|
+
// One byte of a codepoint bitmap. Unsigned, so a byte with bit 7 set never
// becomes negative regardless of the platform's `char` signedness.
typedef unsigned char cp_byte;
// A codepoint number; also used as a bit index into the bitmap.
typedef unsigned long cp_index;
|
11
|
+
|
12
|
+
// Sizing constants for the codepoint bitmap. Derived values are parenthesized
// so they keep their meaning inside larger expressions (e.g. `x % UNICODE_BYTES`).
#define UNICODE_CP_COUNT 0x110000
#define UNICODE_BYTES (UNICODE_CP_COUNT / 8)
#define UNICODE_PLANE_SIZE 0x10000
#define UNICODE_PLANE_COUNT (UNICODE_CP_COUNT / UNICODE_PLANE_SIZE)
|
16
|
+
|
17
|
+
// dfree callback of character_set_type: releases the bitmap owned by a set.
static void
free_character_set(void *codepoints)
{
  free(codepoints);
}
|
21
|
+
|
22
|
+
static size_t
|
23
|
+
memsize_character_set(const void* codepoints) {
|
24
|
+
return sizeof(cp_byte) * UNICODE_BYTES;
|
25
|
+
}
|
26
|
+
|
27
|
+
static const rb_data_type_t
|
28
|
+
character_set_type = {
|
29
|
+
.wrap_struct_name = "character_set",
|
30
|
+
.function = {
|
31
|
+
.dmark = NULL,
|
32
|
+
.dfree = free_character_set,
|
33
|
+
.dsize = memsize_character_set,
|
34
|
+
},
|
35
|
+
.data = NULL,
|
36
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
37
|
+
};
|
38
|
+
|
39
|
+
// Stores the bitmap wrapped by `set` into the `cp_byte *` variable `cps`.
#define FETCH_CODEPOINTS(set, cps) \
  TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)

// Wraps the bitmap `cps` in a new object of class `klass`.
#define NEW_CHARACTER_SET(klass, cps) \
  TypedData_Wrap_Struct(klass, &character_set_type, cps)
|
44
|
+
|
45
|
+
static VALUE
|
46
|
+
method_allocate(VALUE self) {
|
47
|
+
cp_byte *cp_arr;
|
48
|
+
cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
49
|
+
return NEW_CHARACTER_SET(self, cp_arr);
|
50
|
+
}
|
51
|
+
|
52
|
+
// Declares `cp` and `cps` in the current scope, loads the bitmap of `self`
// into `cps`, then runs `action` for every codepoint whose bit is set, in
// ascending order. `action` may consist of several statements and may `return`.
#define FOR_EACH_ACTIVE_CODEPOINT(action) \
  cp_index cp; \
  cp_byte *cps; \
  FETCH_CODEPOINTS(self, cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (TSTBIT(cps, cp)) { action; } \
  }
|
59
|
+
|
60
|
+
// ***************************
|
61
|
+
// `Set` compatibility methods
|
62
|
+
// ***************************
|
63
|
+
|
64
|
+
static inline VALUE
|
65
|
+
enumerator_length(VALUE self, VALUE args, VALUE eobj) {
|
66
|
+
cp_index count;
|
67
|
+
count = 0;
|
68
|
+
FOR_EACH_ACTIVE_CODEPOINT(count++);
|
69
|
+
return LONG2FIX(count);
|
70
|
+
}
|
71
|
+
|
72
|
+
static VALUE
|
73
|
+
method_length(VALUE self) {
|
74
|
+
return enumerator_length(self, 0, 0);
|
75
|
+
}
|
76
|
+
|
77
|
+
static VALUE
|
78
|
+
method_each(VALUE self) {
|
79
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
80
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
|
81
|
+
return self;
|
82
|
+
}
|
83
|
+
|
84
|
+
// returns an Array of codepoint Integers by default.
|
85
|
+
// returns an Array of Strings of length 1 if passed `true`.
|
86
|
+
static VALUE
|
87
|
+
method_to_a(int argc, VALUE *argv, VALUE self) {
|
88
|
+
VALUE arr;
|
89
|
+
rb_encoding *enc;
|
90
|
+
rb_check_arity(argc, 0, 1);
|
91
|
+
|
92
|
+
arr = rb_ary_new();
|
93
|
+
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
|
94
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
|
95
|
+
}
|
96
|
+
else {
|
97
|
+
enc = rb_utf8_encoding();
|
98
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
|
99
|
+
}
|
100
|
+
|
101
|
+
return arr;
|
102
|
+
}
|
103
|
+
|
104
|
+
static VALUE
|
105
|
+
method_empty_p(VALUE self) {
|
106
|
+
FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
|
107
|
+
return Qtrue;
|
108
|
+
}
|
109
|
+
|
110
|
+
static VALUE
|
111
|
+
method_hash(VALUE self) {
|
112
|
+
cp_index cp, hash, four_byte_value;
|
113
|
+
cp_byte *cps;
|
114
|
+
FETCH_CODEPOINTS(self, cps);
|
115
|
+
|
116
|
+
hash = 17;
|
117
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
118
|
+
if (cp % 32 == 0) {
|
119
|
+
if (cp != 0) { hash = hash * 23 + four_byte_value; }
|
120
|
+
four_byte_value = 0;
|
121
|
+
}
|
122
|
+
if (TSTBIT(cps, cp)) four_byte_value++;
|
123
|
+
}
|
124
|
+
|
125
|
+
return LONG2FIX(hash);
|
126
|
+
}
|
127
|
+
|
128
|
+
static inline VALUE
|
129
|
+
delete_if_block_result(VALUE self, int truthy) {
|
130
|
+
VALUE result;
|
131
|
+
rb_need_block();
|
132
|
+
rb_check_frozen(self);
|
133
|
+
FOR_EACH_ACTIVE_CODEPOINT(
|
134
|
+
result = rb_yield(LONG2FIX(cp));
|
135
|
+
if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
|
136
|
+
);
|
137
|
+
return self;
|
138
|
+
}
|
139
|
+
|
140
|
+
static VALUE
|
141
|
+
method_delete_if(VALUE self) {
|
142
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
143
|
+
return delete_if_block_result(self, 1);
|
144
|
+
}
|
145
|
+
|
146
|
+
static VALUE
|
147
|
+
method_keep_if(VALUE self) {
|
148
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
149
|
+
return delete_if_block_result(self, 0);
|
150
|
+
}
|
151
|
+
|
152
|
+
static VALUE
|
153
|
+
method_clear(VALUE self) {
|
154
|
+
cp_index cp;
|
155
|
+
cp_byte *cps;
|
156
|
+
rb_check_frozen(self);
|
157
|
+
FETCH_CODEPOINTS(self, cps);
|
158
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
159
|
+
CLRBIT(cps, cp);
|
160
|
+
}
|
161
|
+
return self;
|
162
|
+
}
|
163
|
+
|
164
|
+
// Builds and returns a new set of self's class containing every codepoint
// `cp` for which `condition` holds. Expects `self` and `other` in scope;
// `a` is self's bitmap and `b` is other's bitmap (NULL when `other` is 0).
// Raises NoMemoryError if the new bitmap cannot be allocated.
#define RETURN_NEW_SET_BASED_ON(condition) \
  cp_index cp; \
  cp_byte *a, *b = NULL, *new_cps; \
  FETCH_CODEPOINTS(self, a); \
  if (other) FETCH_CODEPOINTS(other, b); \
  new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte)); \
  if (new_cps == NULL) rb_memerror(); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (condition) SETBIT(new_cps, cp); \
  } \
  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
|
174
|
+
|
175
|
+
static VALUE
|
176
|
+
method_intersection(VALUE self, VALUE other) {
|
177
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
|
178
|
+
}
|
179
|
+
|
180
|
+
static VALUE
|
181
|
+
method_exclusion(VALUE self, VALUE other) {
|
182
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
|
183
|
+
}
|
184
|
+
|
185
|
+
static VALUE
|
186
|
+
method_union(VALUE self, VALUE other) {
|
187
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
|
188
|
+
}
|
189
|
+
|
190
|
+
static VALUE
|
191
|
+
method_difference(VALUE self, VALUE other) {
|
192
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
|
193
|
+
}
|
194
|
+
|
195
|
+
static VALUE
|
196
|
+
method_include_p(VALUE self, VALUE num) {
|
197
|
+
cp_byte *cps;
|
198
|
+
FETCH_CODEPOINTS(self, cps);
|
199
|
+
return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
|
200
|
+
}
|
201
|
+
|
202
|
+
static inline int
|
203
|
+
toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
|
204
|
+
cp_index cp;
|
205
|
+
cp_byte *cps;
|
206
|
+
rb_check_frozen(set);
|
207
|
+
FETCH_CODEPOINTS(set, cps);
|
208
|
+
cp = FIX2ULONG(cp_num);
|
209
|
+
if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
|
210
|
+
return 0;
|
211
|
+
}
|
212
|
+
else {
|
213
|
+
if (on) { SETBIT(cps, cp); }
|
214
|
+
else { CLRBIT(cps, cp); }
|
215
|
+
return 1;
|
216
|
+
}
|
217
|
+
}
|
218
|
+
|
219
|
+
static VALUE
|
220
|
+
method_add(VALUE self, VALUE cp_num) {
|
221
|
+
return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
|
222
|
+
}
|
223
|
+
|
224
|
+
static VALUE
|
225
|
+
method_add_p(VALUE self, VALUE cp_num) {
|
226
|
+
return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
|
227
|
+
}
|
228
|
+
|
229
|
+
static VALUE
|
230
|
+
method_delete(VALUE self, VALUE cp_num) {
|
231
|
+
return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
|
232
|
+
}
|
233
|
+
|
234
|
+
static VALUE
|
235
|
+
method_delete_p(VALUE self, VALUE cp_num) {
|
236
|
+
return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
|
237
|
+
}
|
238
|
+
|
239
|
+
// Declares `cp`, `cps` and `other_cps`, loads the bitmaps of `self` and
// `other`, then runs `action` once for every codepoint number.
#define COMPARE_SETS(action) \
  cp_index cp; \
  cp_byte *cps, *other_cps; \
  FETCH_CODEPOINTS(self, cps); \
  FETCH_CODEPOINTS(other, other_cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }
|
245
|
+
|
246
|
+
static VALUE
|
247
|
+
method_intersect_p(VALUE self, VALUE other) {
|
248
|
+
COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
|
249
|
+
return Qfalse;
|
250
|
+
}
|
251
|
+
|
252
|
+
static VALUE
|
253
|
+
method_disjoint_p(VALUE self, VALUE other) {
|
254
|
+
return method_intersect_p(self, other) ? Qfalse : Qtrue;
|
255
|
+
}
|
256
|
+
|
257
|
+
static inline int
|
258
|
+
is_character_set(VALUE obj) {
|
259
|
+
return rb_typeddata_is_kind_of(obj, &character_set_type);
|
260
|
+
}
|
261
|
+
|
262
|
+
static VALUE
|
263
|
+
method_eql_p(VALUE self, VALUE other) {
|
264
|
+
if (!is_character_set(other)) return Qfalse;
|
265
|
+
if (self == other) return Qtrue; // same object_id
|
266
|
+
|
267
|
+
COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
|
268
|
+
|
269
|
+
return Qtrue;
|
270
|
+
}
|
271
|
+
|
272
|
+
static inline VALUE
|
273
|
+
merge_character_set(VALUE self, VALUE other) {
|
274
|
+
COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
|
275
|
+
return self;
|
276
|
+
}
|
277
|
+
|
278
|
+
static inline void
|
279
|
+
raise_arg_err_unless_valid_as_cp(VALUE object_id) {
|
280
|
+
if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
|
281
|
+
rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
|
282
|
+
}
|
283
|
+
|
284
|
+
static inline VALUE
|
285
|
+
merge_rb_range(VALUE self, VALUE rb_range) {
|
286
|
+
VALUE from_id, upto_id;
|
287
|
+
int excl;
|
288
|
+
cp_index cp;
|
289
|
+
cp_byte *cps;
|
290
|
+
FETCH_CODEPOINTS(self, cps);
|
291
|
+
|
292
|
+
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
|
293
|
+
rb_raise(rb_eArgError, "pass a Range");
|
294
|
+
}
|
295
|
+
if (excl) upto_id -= 2;
|
296
|
+
|
297
|
+
raise_arg_err_unless_valid_as_cp(from_id);
|
298
|
+
raise_arg_err_unless_valid_as_cp(upto_id);
|
299
|
+
|
300
|
+
for (/* */; from_id <= upto_id; from_id += 2) {
|
301
|
+
cp = FIX2ULONG(from_id);
|
302
|
+
SETBIT(cps, cp);
|
303
|
+
}
|
304
|
+
return self;
|
305
|
+
}
|
306
|
+
|
307
|
+
static inline VALUE
|
308
|
+
merge_rb_array(VALUE self, VALUE rb_array) {
|
309
|
+
VALUE el;
|
310
|
+
cp_byte *cps;
|
311
|
+
VALUE array_length, i;
|
312
|
+
FETCH_CODEPOINTS(self, cps);
|
313
|
+
Check_Type(rb_array, T_ARRAY);
|
314
|
+
array_length = RARRAY_LEN(rb_array);
|
315
|
+
for (i = 0; i < array_length; i++) {
|
316
|
+
el = RARRAY_AREF(rb_array, i);
|
317
|
+
raise_arg_err_unless_valid_as_cp(el);
|
318
|
+
SETBIT(cps, FIX2ULONG(el));
|
319
|
+
}
|
320
|
+
return self;
|
321
|
+
}
|
322
|
+
|
323
|
+
static VALUE
|
324
|
+
method_merge(VALUE self, VALUE other) {
|
325
|
+
rb_check_frozen(self);
|
326
|
+
if (is_character_set(other)) {
|
327
|
+
return merge_character_set(self, other);
|
328
|
+
}
|
329
|
+
else if (TYPE(other) == T_ARRAY) {
|
330
|
+
return merge_rb_array(self, other);
|
331
|
+
}
|
332
|
+
return merge_rb_range(self, other);
|
333
|
+
}
|
334
|
+
|
335
|
+
static VALUE
|
336
|
+
method_initialize_copy(VALUE self, VALUE other) {
|
337
|
+
merge_character_set(self, other);
|
338
|
+
return other;
|
339
|
+
}
|
340
|
+
|
341
|
+
static VALUE
|
342
|
+
method_subtract(VALUE self, VALUE other) {
|
343
|
+
rb_check_frozen(self);
|
344
|
+
COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
|
345
|
+
return self;
|
346
|
+
}
|
347
|
+
|
348
|
+
static inline int
|
349
|
+
a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
|
350
|
+
cp_byte *cps_a, *cps_b;
|
351
|
+
cp_index cp, size_a, size_b;
|
352
|
+
|
353
|
+
if (!is_character_set(set_a) || !is_character_set(set_b)) {
|
354
|
+
rb_raise(rb_eArgError, "pass a CharacterSet");
|
355
|
+
}
|
356
|
+
|
357
|
+
FETCH_CODEPOINTS(set_a, cps_a);
|
358
|
+
FETCH_CODEPOINTS(set_b, cps_b);
|
359
|
+
|
360
|
+
*is_proper = 0;
|
361
|
+
size_a = 0;
|
362
|
+
size_b = 0;
|
363
|
+
|
364
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
365
|
+
if (TSTBIT(cps_a, cp)) {
|
366
|
+
if (!TSTBIT(cps_b, cp)) return 0;
|
367
|
+
size_a++;
|
368
|
+
size_b++;
|
369
|
+
}
|
370
|
+
else if (TSTBIT(cps_b, cp)) size_b++;
|
371
|
+
}
|
372
|
+
|
373
|
+
if (size_b > size_a) *is_proper = 1;
|
374
|
+
return 1;
|
375
|
+
}
|
376
|
+
|
377
|
+
static VALUE
|
378
|
+
method_subset_p(VALUE self, VALUE other) {
|
379
|
+
int is_proper;
|
380
|
+
return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
|
381
|
+
}
|
382
|
+
|
383
|
+
static VALUE
|
384
|
+
method_proper_subset_p(VALUE self, VALUE other) {
|
385
|
+
int is, is_proper;
|
386
|
+
is = a_subset_of_b(self, other, &is_proper);
|
387
|
+
return (is && is_proper) ? Qtrue : Qfalse;
|
388
|
+
}
|
389
|
+
|
390
|
+
static VALUE
|
391
|
+
method_superset_p(VALUE self, VALUE other) {
|
392
|
+
int is_proper;
|
393
|
+
return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
|
394
|
+
}
|
395
|
+
|
396
|
+
static VALUE
|
397
|
+
method_proper_superset_p(VALUE self, VALUE other) {
|
398
|
+
int is, is_proper;
|
399
|
+
is = a_subset_of_b(other, self, &is_proper);
|
400
|
+
return (is && is_proper) ? Qtrue : Qfalse;
|
401
|
+
}
|
402
|
+
|
403
|
+
// *******************************
|
404
|
+
// `CharacterSet`-specific methods
|
405
|
+
// *******************************
|
406
|
+
|
407
|
+
static VALUE
|
408
|
+
class_method_from_ranges(VALUE self, VALUE ranges) {
|
409
|
+
VALUE new_set, range_count, i;
|
410
|
+
new_set = rb_class_new_instance(0, 0, self);
|
411
|
+
range_count = RARRAY_LEN(ranges);
|
412
|
+
for (i = 0; i < range_count; i++) {
|
413
|
+
merge_rb_range(new_set, RARRAY_AREF(ranges, i));
|
414
|
+
}
|
415
|
+
return new_set;
|
416
|
+
}
|
417
|
+
|
418
|
+
static VALUE
|
419
|
+
method_ranges(VALUE self) {
|
420
|
+
VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
|
421
|
+
|
422
|
+
ranges = rb_ary_new();
|
423
|
+
previous_codepoint = 0;
|
424
|
+
current_start = 0;
|
425
|
+
current_end = 0;
|
426
|
+
|
427
|
+
FOR_EACH_ACTIVE_CODEPOINT(
|
428
|
+
codepoint = LONG2FIX(cp);
|
429
|
+
|
430
|
+
if (!previous_codepoint) {
|
431
|
+
current_start = codepoint;
|
432
|
+
}
|
433
|
+
else if (previous_codepoint + 2 != codepoint) {
|
434
|
+
// gap found, finalize previous range
|
435
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
436
|
+
current_start = codepoint;
|
437
|
+
}
|
438
|
+
current_end = codepoint;
|
439
|
+
previous_codepoint = codepoint;
|
440
|
+
);
|
441
|
+
|
442
|
+
// add final range
|
443
|
+
if (current_start) {
|
444
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
445
|
+
}
|
446
|
+
|
447
|
+
return ranges;
|
448
|
+
}
|
449
|
+
|
450
|
+
static VALUE
|
451
|
+
method_sample(int argc, VALUE *argv, VALUE self) {
|
452
|
+
VALUE to_a_args[1], array;
|
453
|
+
rb_check_arity(argc, 0, 1);
|
454
|
+
to_a_args[0] = Qtrue;
|
455
|
+
array = method_to_a(1, to_a_args, self);
|
456
|
+
return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
|
457
|
+
}
|
458
|
+
|
459
|
+
static inline VALUE
|
460
|
+
new_set_from_section(VALUE set, cp_index from, cp_index upto) {
|
461
|
+
cp_byte *cps, *new_cps;
|
462
|
+
cp_index cp;
|
463
|
+
FETCH_CODEPOINTS(set, cps);
|
464
|
+
new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
465
|
+
for (cp = from; cp <= upto; cp++) {
|
466
|
+
if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
|
467
|
+
}
|
468
|
+
return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
|
469
|
+
}
|
470
|
+
|
471
|
+
static VALUE
|
472
|
+
method_bmp_part(VALUE self) {
|
473
|
+
return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
|
474
|
+
}
|
475
|
+
|
476
|
+
static VALUE
|
477
|
+
method_astral_part(VALUE self) {
|
478
|
+
return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
|
479
|
+
}
|
480
|
+
|
481
|
+
static inline VALUE
|
482
|
+
set_has_member_in_plane(VALUE set, unsigned int plane) {
|
483
|
+
cp_byte *cps;
|
484
|
+
cp_index cp, max_cp;
|
485
|
+
FETCH_CODEPOINTS(set, cps);
|
486
|
+
cp = plane * UNICODE_PLANE_SIZE;
|
487
|
+
max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
|
488
|
+
for (/* */; cp <= max_cp; cp++) {
|
489
|
+
if (TSTBIT(cps, cp)) return Qtrue;
|
490
|
+
}
|
491
|
+
return Qfalse;
|
492
|
+
}
|
493
|
+
|
494
|
+
static VALUE
|
495
|
+
method_planes(VALUE self) {
|
496
|
+
unsigned int i;
|
497
|
+
VALUE planes;
|
498
|
+
planes = rb_ary_new();
|
499
|
+
for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
|
500
|
+
if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
|
501
|
+
}
|
502
|
+
return planes;
|
503
|
+
}
|
504
|
+
|
505
|
+
static VALUE
|
506
|
+
method_member_in_plane_p(VALUE self, VALUE plane_num) {
|
507
|
+
int plane;
|
508
|
+
Check_Type(plane_num, T_FIXNUM);
|
509
|
+
plane = FIX2INT(plane_num);
|
510
|
+
if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
|
511
|
+
rb_raise(rb_eArgError, "plane must be between 0 and 16");
|
512
|
+
}
|
513
|
+
return set_has_member_in_plane(self, plane);
|
514
|
+
}
|
515
|
+
|
516
|
+
// True for codepoints outside the surrogate range 0xD800..0xDFFF.
// The argument is parenthesized so compound expressions are tested as a whole;
// it is evaluated twice, so it must not have side effects.
#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)
|
517
|
+
|
518
|
+
static VALUE
|
519
|
+
method_ext_inversion(int argc, VALUE *argv, VALUE self) {
|
520
|
+
int include_surrogates;
|
521
|
+
cp_index upto;
|
522
|
+
VALUE other;
|
523
|
+
other = 0;
|
524
|
+
rb_check_arity(argc, 0, 2);
|
525
|
+
include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
|
526
|
+
if ((argc > 1) && FIXNUM_P(argv[1])) {
|
527
|
+
upto = FIX2ULONG(argv[1]);
|
528
|
+
RETURN_NEW_SET_BASED_ON(
|
529
|
+
cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
|
530
|
+
);
|
531
|
+
}
|
532
|
+
RETURN_NEW_SET_BASED_ON(
|
533
|
+
!TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
|
534
|
+
);
|
535
|
+
}
|
536
|
+
|
537
|
+
// Callback run for each codepoint of a String: receives the codepoint and a
// bitmap, and returns 0 to stop the iteration or non-zero to continue.
typedef int(*str_cp_handler)(unsigned int, cp_byte*);

// Handler that records `str_cp` in `cp_arr` and always continues.
// Values of 0x110000 and above (which strings in non-Unicode multibyte
// encodings can produce) do not fit in the bitmap and are skipped rather than
// written past its end.
static inline int
add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr)
{
  if (str_cp < UNICODE_CP_COUNT) {
    SETBIT(cp_arr, str_cp);
  }
  return 1;
}
|
544
|
+
|
545
|
+
static VALUE
|
546
|
+
method_case_insensitive(VALUE self) {
|
547
|
+
cp_index i;
|
548
|
+
cp_byte *new_cps;
|
549
|
+
|
550
|
+
new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
551
|
+
|
552
|
+
FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
|
553
|
+
|
554
|
+
for (i = 0; i < CASEFOLD_COUNT; i++) {
|
555
|
+
casefold_mapping m = unicode_casefold_table[i];
|
556
|
+
|
557
|
+
if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
|
558
|
+
else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
|
559
|
+
}
|
560
|
+
|
561
|
+
return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
|
562
|
+
|
563
|
+
// OnigCaseFoldType flags;
|
564
|
+
// rb_encoding *enc;
|
565
|
+
//
|
566
|
+
// enc = rb_utf8_encoding();
|
567
|
+
//
|
568
|
+
// ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE (not public on ruby < 2.4)
|
569
|
+
// flags = (1<<13) | (1<<14);
|
570
|
+
//
|
571
|
+
// // case_map args: flags, pp, end, to, to_end, enc
|
572
|
+
// enc->case_map(flags, (const OnigUChar**)&cp, ?, ?, ?, enc);
|
573
|
+
}
|
574
|
+
|
575
|
+
static inline VALUE
|
576
|
+
each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
577
|
+
long i;
|
578
|
+
unsigned int str_cp;
|
579
|
+
|
580
|
+
for (i = 0; i < RSTRING_LEN(str); i++) {
|
581
|
+
str_cp = (RSTRING_PTR(str)[i] & 0xff);
|
582
|
+
if (!(*func)(str_cp, cp_arr)) return Qfalse;
|
583
|
+
}
|
584
|
+
|
585
|
+
return Qtrue;
|
586
|
+
}
|
587
|
+
|
588
|
+
static inline VALUE
|
589
|
+
each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
590
|
+
int n;
|
591
|
+
unsigned int str_cp;
|
592
|
+
const char *ptr, *end;
|
593
|
+
rb_encoding *enc;
|
594
|
+
|
595
|
+
str = rb_str_new_frozen(str);
|
596
|
+
ptr = RSTRING_PTR(str);
|
597
|
+
end = RSTRING_END(str);
|
598
|
+
enc = rb_enc_get(str);
|
599
|
+
|
600
|
+
while (ptr < end) {
|
601
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
|
602
|
+
if (!(*func)(str_cp, cp_arr)) return Qfalse;
|
603
|
+
ptr += n;
|
604
|
+
}
|
605
|
+
|
606
|
+
return Qtrue;
|
607
|
+
}
|
608
|
+
|
609
|
+
// single_byte_optimizable - copied from string.c
|
610
|
+
static inline int
|
611
|
+
single_byte_optimizable(VALUE str)
|
612
|
+
{
|
613
|
+
rb_encoding *enc;
|
614
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
|
615
|
+
|
616
|
+
enc = rb_enc_get(str);
|
617
|
+
if (rb_enc_mbmaxlen(enc) == 1) return 1;
|
618
|
+
|
619
|
+
return 0;
|
620
|
+
}
|
621
|
+
|
622
|
+
static inline VALUE
|
623
|
+
each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
624
|
+
if (single_byte_optimizable(str)) {
|
625
|
+
return each_sb_cp(str, func, cp_arr);
|
626
|
+
}
|
627
|
+
return each_mb_cp(str, func, cp_arr);
|
628
|
+
}
|
629
|
+
|
630
|
+
static inline void
|
631
|
+
raise_arg_err_unless_string(VALUE val) {
|
632
|
+
if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
|
633
|
+
}
|
634
|
+
|
635
|
+
static VALUE
|
636
|
+
class_method_of(VALUE self, VALUE str) {
|
637
|
+
cp_byte *cp_arr;
|
638
|
+
raise_arg_err_unless_string(str);
|
639
|
+
cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
640
|
+
each_cp(str, add_str_cp_to_arr, cp_arr);
|
641
|
+
return NEW_CHARACTER_SET(self, cp_arr);
|
642
|
+
}
|
643
|
+
|
644
|
+
static inline int
|
645
|
+
str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
|
646
|
+
return !TSTBIT(cp_arr, str_cp);
|
647
|
+
}
|
648
|
+
|
649
|
+
static VALUE
|
650
|
+
method_used_by_p(VALUE self, VALUE str) {
|
651
|
+
cp_byte *cps;
|
652
|
+
VALUE only_uses_other_cps;
|
653
|
+
raise_arg_err_unless_string(str);
|
654
|
+
FETCH_CODEPOINTS(self, cps);
|
655
|
+
only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
|
656
|
+
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
657
|
+
}
|
658
|
+
|
659
|
+
static inline int
|
660
|
+
str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
|
661
|
+
return TSTBIT(cp_arr, str_cp);
|
662
|
+
}
|
663
|
+
|
664
|
+
static VALUE
|
665
|
+
method_cover_p(VALUE self, VALUE str) {
|
666
|
+
cp_byte *cps;
|
667
|
+
raise_arg_err_unless_string(str);
|
668
|
+
FETCH_CODEPOINTS(self, cps);
|
669
|
+
return each_cp(str, str_cp_in_arr, cps);
|
670
|
+
}
|
671
|
+
|
672
|
+
static inline VALUE
|
673
|
+
apply_to_str(VALUE set, VALUE str, int delete, int bang) {
|
674
|
+
cp_byte *cps;
|
675
|
+
rb_encoding *str_enc;
|
676
|
+
VALUE orig_len, blen, new_str_buf, chr;
|
677
|
+
int n;
|
678
|
+
unsigned int str_cp;
|
679
|
+
const char *ptr, *end;
|
680
|
+
|
681
|
+
raise_arg_err_unless_string(str);
|
682
|
+
|
683
|
+
FETCH_CODEPOINTS(set, cps);
|
684
|
+
|
685
|
+
orig_len = RSTRING_LEN(str);
|
686
|
+
blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
|
687
|
+
new_str_buf = rb_str_buf_new(blen);
|
688
|
+
str_enc = rb_enc_get(str);
|
689
|
+
rb_enc_associate(new_str_buf, str_enc);
|
690
|
+
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
|
691
|
+
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
692
|
+
|
693
|
+
ptr = RSTRING_PTR(str);
|
694
|
+
end = RSTRING_END(str);
|
695
|
+
|
696
|
+
while (ptr < end) {
|
697
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
|
698
|
+
if (!TSTBIT(cps, str_cp) != !delete) {
|
699
|
+
chr = rb_enc_uint_chr(str_cp, str_enc);
|
700
|
+
rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
|
701
|
+
}
|
702
|
+
ptr += n;
|
703
|
+
}
|
704
|
+
|
705
|
+
if (bang) {
|
706
|
+
if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
|
707
|
+
rb_str_shared_replace(str, new_str_buf);
|
708
|
+
}
|
709
|
+
else {
|
710
|
+
RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
|
711
|
+
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
712
|
+
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
|
713
|
+
str = new_str_buf;
|
714
|
+
}
|
715
|
+
|
716
|
+
return str;
|
717
|
+
}
|
718
|
+
|
719
|
+
static VALUE
|
720
|
+
method_delete_in(VALUE self, VALUE str) {
|
721
|
+
return apply_to_str(self, str, 1, 0);
|
722
|
+
}
|
723
|
+
|
724
|
+
static VALUE
|
725
|
+
method_delete_in_bang(VALUE self, VALUE str) {
|
726
|
+
return apply_to_str(self, str, 1, 1);
|
727
|
+
}
|
728
|
+
|
729
|
+
static VALUE
|
730
|
+
method_keep_in(VALUE self, VALUE str) {
|
731
|
+
return apply_to_str(self, str, 0, 0);
|
732
|
+
}
|
733
|
+
|
734
|
+
static VALUE
|
735
|
+
method_keep_in_bang(VALUE self, VALUE str) {
|
736
|
+
return apply_to_str(self, str, 0, 1);
|
737
|
+
}
|
738
|
+
|
739
|
+
// ****
|
740
|
+
// init
|
741
|
+
// ****
|
742
|
+
|
743
|
+
void
|
744
|
+
Init_character_set()
|
745
|
+
{
|
746
|
+
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
747
|
+
|
748
|
+
rb_define_alloc_func(cs, method_allocate);
|
749
|
+
|
750
|
+
// `Set` compatibility methods
|
751
|
+
|
752
|
+
rb_define_method(cs, "each", method_each, 0);
|
753
|
+
rb_define_method(cs, "to_a", method_to_a, -1);
|
754
|
+
rb_define_method(cs, "length", method_length, 0);
|
755
|
+
rb_define_method(cs, "size", method_length, 0);
|
756
|
+
rb_define_method(cs, "count", method_length, 0);
|
757
|
+
rb_define_method(cs, "empty?", method_empty_p, 0);
|
758
|
+
rb_define_method(cs, "hash", method_hash, 0);
|
759
|
+
rb_define_method(cs, "keep_if", method_keep_if, 0);
|
760
|
+
rb_define_method(cs, "delete_if", method_delete_if, 0);
|
761
|
+
rb_define_method(cs, "clear", method_clear, 0);
|
762
|
+
rb_define_method(cs, "intersection", method_intersection, 1);
|
763
|
+
rb_define_method(cs, "&", method_intersection, 1);
|
764
|
+
rb_define_method(cs, "union", method_union, 1);
|
765
|
+
rb_define_method(cs, "+", method_union, 1);
|
766
|
+
rb_define_method(cs, "|", method_union, 1);
|
767
|
+
rb_define_method(cs, "difference", method_difference, 1);
|
768
|
+
rb_define_method(cs, "-", method_difference, 1);
|
769
|
+
rb_define_method(cs, "^", method_exclusion, 1);
|
770
|
+
rb_define_method(cs, "include?", method_include_p, 1);
|
771
|
+
rb_define_method(cs, "member?", method_include_p, 1);
|
772
|
+
rb_define_method(cs, "===", method_include_p, 1);
|
773
|
+
rb_define_method(cs, "add", method_add, 1);
|
774
|
+
rb_define_method(cs, "<<", method_add, 1);
|
775
|
+
rb_define_method(cs, "add?", method_add_p, 1);
|
776
|
+
rb_define_method(cs, "delete", method_delete, 1);
|
777
|
+
rb_define_method(cs, "delete?", method_delete_p, 1);
|
778
|
+
rb_define_method(cs, "intersect?", method_intersect_p, 1);
|
779
|
+
rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
|
780
|
+
rb_define_method(cs, "eql?", method_eql_p, 1);
|
781
|
+
rb_define_method(cs, "==", method_eql_p, 1);
|
782
|
+
rb_define_method(cs, "merge", method_merge, 1);
|
783
|
+
rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
|
784
|
+
rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
|
785
|
+
rb_define_method(cs, "subtract", method_subtract, 1);
|
786
|
+
rb_define_method(cs, "subset?", method_subset_p, 1);
|
787
|
+
rb_define_method(cs, "<=", method_subset_p, 1);
|
788
|
+
rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
|
789
|
+
rb_define_method(cs, "<", method_proper_subset_p, 1);
|
790
|
+
rb_define_method(cs, "superset?", method_superset_p, 1);
|
791
|
+
rb_define_method(cs, ">=", method_superset_p, 1);
|
792
|
+
rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
|
793
|
+
rb_define_method(cs, ">", method_proper_superset_p, 1);
|
794
|
+
|
795
|
+
// `CharacterSet`-specific methods
|
796
|
+
|
797
|
+
rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
|
798
|
+
rb_define_singleton_method(cs, "of", class_method_of, 1);
|
799
|
+
|
800
|
+
rb_define_method(cs, "ranges", method_ranges, 0);
|
801
|
+
rb_define_method(cs, "sample", method_sample, -1);
|
802
|
+
rb_define_method(cs, "bmp_part", method_bmp_part, 0);
|
803
|
+
rb_define_method(cs, "astral_part", method_astral_part, 0);
|
804
|
+
rb_define_method(cs, "planes", method_planes, 0);
|
805
|
+
rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
|
806
|
+
rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
|
807
|
+
rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
|
808
|
+
rb_define_method(cs, "used_by?", method_used_by_p, 1);
|
809
|
+
rb_define_method(cs, "cover?", method_cover_p, 1);
|
810
|
+
rb_define_method(cs, "delete_in", method_delete_in, 1);
|
811
|
+
rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
|
812
|
+
rb_define_method(cs, "keep_in", method_keep_in, 1);
|
813
|
+
rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
|
814
|
+
}
|