character_set 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
@@ -0,0 +1,814 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "ruby/encoding.h"
|
3
|
+
#include "unicode_casefold_table.h"
|
4
|
+
|
5
|
+
// Bit-array helpers. `byte_arr` is an array of bytes and `bit` is a bit index:
// bit N lives in byte N / 8 at position N % 8.
// Both arguments are fully parenthesized so that expressions such as
// `base | offset` can be passed safely. Note that `bit` is evaluated twice,
// so it must not have side effects.
#define SETBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] |= (1 << ((bit) & 0x07)))
#define CLRBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] &= ~(1 << ((bit) & 0x07)))
#define TSTBIT(byte_arr, bit) ((byte_arr)[(bit) >> 3] & (1 << ((bit) & 0x07)))
|
8
|
+
|
9
|
+
// Storage unit of the codepoint bit array. Unsigned so that setting and
// testing the top bit of a byte does not depend on the platform's `char`
// signedness.
typedef unsigned char cp_byte;
// Integer type used for codepoint values and bit indices.
typedef unsigned long cp_index;
|
11
|
+
|
12
|
+
// Number of Unicode codepoints (0x0 through 0x10FFFF).
#define UNICODE_CP_COUNT 0x110000
// Size in bytes of a bit array holding one bit per codepoint.
// Parenthesized so the division cannot regroup with neighbouring operators.
#define UNICODE_BYTES (UNICODE_CP_COUNT / 8)
// Number of codepoints per Unicode plane.
#define UNICODE_PLANE_SIZE 0x10000
// Number of Unicode planes (17).
#define UNICODE_PLANE_COUNT (UNICODE_CP_COUNT / UNICODE_PLANE_SIZE)
|
16
|
+
|
17
|
+
// dfree callback of `character_set_type`: releases the codepoint buffer
// that was wrapped into the Ruby object.
static void
free_character_set(void *codepoints)
{
  free(codepoints);
}
|
21
|
+
|
22
|
+
static size_t
|
23
|
+
memsize_character_set(const void* codepoints) {
|
24
|
+
return sizeof(cp_byte) * UNICODE_BYTES;
|
25
|
+
}
|
26
|
+
|
27
|
+
static const rb_data_type_t
|
28
|
+
character_set_type = {
|
29
|
+
.wrap_struct_name = "character_set",
|
30
|
+
.function = {
|
31
|
+
.dmark = NULL,
|
32
|
+
.dfree = free_character_set,
|
33
|
+
.dsize = memsize_character_set,
|
34
|
+
},
|
35
|
+
.data = NULL,
|
36
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
37
|
+
};
|
38
|
+
|
39
|
+
// Stores the codepoint buffer wrapped by `set` into the `cp_byte *` variable
// `cps`, using the `character_set_type` descriptor.
#define FETCH_CODEPOINTS(set, cps) \
  TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
|
41
|
+
|
42
|
+
// Wraps the codepoint buffer `cps` into a new object of class `klass`,
// tagged with the `character_set_type` descriptor.
#define NEW_CHARACTER_SET(klass, cps) \
  TypedData_Wrap_Struct(klass, &character_set_type, cps)
|
44
|
+
|
45
|
+
static VALUE
|
46
|
+
method_allocate(VALUE self) {
|
47
|
+
cp_byte *cp_arr;
|
48
|
+
cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
49
|
+
return NEW_CHARACTER_SET(self, cp_arr);
|
50
|
+
}
|
51
|
+
|
52
|
+
// Declares `cps` and `cp` in the enclosing scope, loads the bit array of the
// local `self` into `cps`, then runs `action` once for every codepoint `cp`
// whose bit is set, in ascending order. Both variables stay in scope afterwards.
#define FOR_EACH_ACTIVE_CODEPOINT(action) \
  cp_index cp; \
  cp_byte *cps; \
  FETCH_CODEPOINTS(self, cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (TSTBIT(cps, cp)) { action; } \
  }
|
59
|
+
|
60
|
+
// ***************************
|
61
|
+
// `Set` compatibility methods
|
62
|
+
// ***************************
|
63
|
+
|
64
|
+
static inline VALUE
|
65
|
+
enumerator_length(VALUE self, VALUE args, VALUE eobj) {
|
66
|
+
cp_index count;
|
67
|
+
count = 0;
|
68
|
+
FOR_EACH_ACTIVE_CODEPOINT(count++);
|
69
|
+
return LONG2FIX(count);
|
70
|
+
}
|
71
|
+
|
72
|
+
static VALUE
|
73
|
+
method_length(VALUE self) {
|
74
|
+
return enumerator_length(self, 0, 0);
|
75
|
+
}
|
76
|
+
|
77
|
+
static VALUE
|
78
|
+
method_each(VALUE self) {
|
79
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
80
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
|
81
|
+
return self;
|
82
|
+
}
|
83
|
+
|
84
|
+
// returns an Array of codepoint Integers by default.
|
85
|
+
// returns an Array of Strings of length 1 if passed `true`.
|
86
|
+
static VALUE
|
87
|
+
method_to_a(int argc, VALUE *argv, VALUE self) {
|
88
|
+
VALUE arr;
|
89
|
+
rb_encoding *enc;
|
90
|
+
rb_check_arity(argc, 0, 1);
|
91
|
+
|
92
|
+
arr = rb_ary_new();
|
93
|
+
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
|
94
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
|
95
|
+
}
|
96
|
+
else {
|
97
|
+
enc = rb_utf8_encoding();
|
98
|
+
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
|
99
|
+
}
|
100
|
+
|
101
|
+
return arr;
|
102
|
+
}
|
103
|
+
|
104
|
+
static VALUE
|
105
|
+
method_empty_p(VALUE self) {
|
106
|
+
FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
|
107
|
+
return Qtrue;
|
108
|
+
}
|
109
|
+
|
110
|
+
static VALUE
|
111
|
+
method_hash(VALUE self) {
|
112
|
+
cp_index cp, hash, four_byte_value;
|
113
|
+
cp_byte *cps;
|
114
|
+
FETCH_CODEPOINTS(self, cps);
|
115
|
+
|
116
|
+
hash = 17;
|
117
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
118
|
+
if (cp % 32 == 0) {
|
119
|
+
if (cp != 0) { hash = hash * 23 + four_byte_value; }
|
120
|
+
four_byte_value = 0;
|
121
|
+
}
|
122
|
+
if (TSTBIT(cps, cp)) four_byte_value++;
|
123
|
+
}
|
124
|
+
|
125
|
+
return LONG2FIX(hash);
|
126
|
+
}
|
127
|
+
|
128
|
+
static inline VALUE
|
129
|
+
delete_if_block_result(VALUE self, int truthy) {
|
130
|
+
VALUE result;
|
131
|
+
rb_need_block();
|
132
|
+
rb_check_frozen(self);
|
133
|
+
FOR_EACH_ACTIVE_CODEPOINT(
|
134
|
+
result = rb_yield(LONG2FIX(cp));
|
135
|
+
if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
|
136
|
+
);
|
137
|
+
return self;
|
138
|
+
}
|
139
|
+
|
140
|
+
static VALUE
|
141
|
+
method_delete_if(VALUE self) {
|
142
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
143
|
+
return delete_if_block_result(self, 1);
|
144
|
+
}
|
145
|
+
|
146
|
+
static VALUE
|
147
|
+
method_keep_if(VALUE self) {
|
148
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
|
149
|
+
return delete_if_block_result(self, 0);
|
150
|
+
}
|
151
|
+
|
152
|
+
static VALUE
|
153
|
+
method_clear(VALUE self) {
|
154
|
+
cp_index cp;
|
155
|
+
cp_byte *cps;
|
156
|
+
rb_check_frozen(self);
|
157
|
+
FETCH_CODEPOINTS(self, cps);
|
158
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
159
|
+
CLRBIT(cps, cp);
|
160
|
+
}
|
161
|
+
return self;
|
162
|
+
}
|
163
|
+
|
164
|
+
// Builds and returns a new set of the same class as the local `self`.
// Expects locals `self` and `other` in scope; `other` may be 0, in which
// case `b` stays NULL and `condition` must not reference it.
// `condition` is evaluated for every codepoint `cp` and may use the bit
// arrays `a` (of self) and `b` (of other); codepoints for which it is true
// are added to the result.
// Both buffers are fetched before allocating, so a type error on `other`
// cannot leak the new buffer. Raises NoMemoryError if allocation fails.
#define RETURN_NEW_SET_BASED_ON(condition) \
  cp_index cp; \
  cp_byte *a, *b = NULL, *new_cps; \
  FETCH_CODEPOINTS(self, a); \
  if (other) FETCH_CODEPOINTS(other, b); \
  new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte)); \
  if (new_cps == NULL) rb_memerror(); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { \
    if (condition) SETBIT(new_cps, cp); \
  } \
  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
|
174
|
+
|
175
|
+
static VALUE
|
176
|
+
method_intersection(VALUE self, VALUE other) {
|
177
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
|
178
|
+
}
|
179
|
+
|
180
|
+
static VALUE
|
181
|
+
method_exclusion(VALUE self, VALUE other) {
|
182
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
|
183
|
+
}
|
184
|
+
|
185
|
+
static VALUE
|
186
|
+
method_union(VALUE self, VALUE other) {
|
187
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
|
188
|
+
}
|
189
|
+
|
190
|
+
static VALUE
|
191
|
+
method_difference(VALUE self, VALUE other) {
|
192
|
+
RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
|
193
|
+
}
|
194
|
+
|
195
|
+
static VALUE
|
196
|
+
method_include_p(VALUE self, VALUE num) {
|
197
|
+
cp_byte *cps;
|
198
|
+
FETCH_CODEPOINTS(self, cps);
|
199
|
+
return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
|
200
|
+
}
|
201
|
+
|
202
|
+
static inline int
|
203
|
+
toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
|
204
|
+
cp_index cp;
|
205
|
+
cp_byte *cps;
|
206
|
+
rb_check_frozen(set);
|
207
|
+
FETCH_CODEPOINTS(set, cps);
|
208
|
+
cp = FIX2ULONG(cp_num);
|
209
|
+
if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
|
210
|
+
return 0;
|
211
|
+
}
|
212
|
+
else {
|
213
|
+
if (on) { SETBIT(cps, cp); }
|
214
|
+
else { CLRBIT(cps, cp); }
|
215
|
+
return 1;
|
216
|
+
}
|
217
|
+
}
|
218
|
+
|
219
|
+
static VALUE
|
220
|
+
method_add(VALUE self, VALUE cp_num) {
|
221
|
+
return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
|
222
|
+
}
|
223
|
+
|
224
|
+
static VALUE
|
225
|
+
method_add_p(VALUE self, VALUE cp_num) {
|
226
|
+
return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
|
227
|
+
}
|
228
|
+
|
229
|
+
static VALUE
|
230
|
+
method_delete(VALUE self, VALUE cp_num) {
|
231
|
+
return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
|
232
|
+
}
|
233
|
+
|
234
|
+
static VALUE
|
235
|
+
method_delete_p(VALUE self, VALUE cp_num) {
|
236
|
+
return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
|
237
|
+
}
|
238
|
+
|
239
|
+
// Declares `cps`, `other_cps` and `cp` in the enclosing scope, loads the bit
// arrays of the locals `self` and `other`, then runs `action` once for every
// codepoint `cp` (set or not), in ascending order.
#define COMPARE_SETS(action) \
  cp_index cp; \
  cp_byte *cps, *other_cps; \
  FETCH_CODEPOINTS(self, cps); \
  FETCH_CODEPOINTS(other, other_cps); \
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }
|
245
|
+
|
246
|
+
static VALUE
|
247
|
+
method_intersect_p(VALUE self, VALUE other) {
|
248
|
+
COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
|
249
|
+
return Qfalse;
|
250
|
+
}
|
251
|
+
|
252
|
+
static VALUE
|
253
|
+
method_disjoint_p(VALUE self, VALUE other) {
|
254
|
+
return method_intersect_p(self, other) ? Qfalse : Qtrue;
|
255
|
+
}
|
256
|
+
|
257
|
+
static inline int
|
258
|
+
is_character_set(VALUE obj) {
|
259
|
+
return rb_typeddata_is_kind_of(obj, &character_set_type);
|
260
|
+
}
|
261
|
+
|
262
|
+
static VALUE
|
263
|
+
method_eql_p(VALUE self, VALUE other) {
|
264
|
+
if (!is_character_set(other)) return Qfalse;
|
265
|
+
if (self == other) return Qtrue; // same object_id
|
266
|
+
|
267
|
+
COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
|
268
|
+
|
269
|
+
return Qtrue;
|
270
|
+
}
|
271
|
+
|
272
|
+
static inline VALUE
|
273
|
+
merge_character_set(VALUE self, VALUE other) {
|
274
|
+
COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
|
275
|
+
return self;
|
276
|
+
}
|
277
|
+
|
278
|
+
static inline void
|
279
|
+
raise_arg_err_unless_valid_as_cp(VALUE object_id) {
|
280
|
+
if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
|
281
|
+
rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
|
282
|
+
}
|
283
|
+
|
284
|
+
static inline VALUE
|
285
|
+
merge_rb_range(VALUE self, VALUE rb_range) {
|
286
|
+
VALUE from_id, upto_id;
|
287
|
+
int excl;
|
288
|
+
cp_index cp;
|
289
|
+
cp_byte *cps;
|
290
|
+
FETCH_CODEPOINTS(self, cps);
|
291
|
+
|
292
|
+
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
|
293
|
+
rb_raise(rb_eArgError, "pass a Range");
|
294
|
+
}
|
295
|
+
if (excl) upto_id -= 2;
|
296
|
+
|
297
|
+
raise_arg_err_unless_valid_as_cp(from_id);
|
298
|
+
raise_arg_err_unless_valid_as_cp(upto_id);
|
299
|
+
|
300
|
+
for (/* */; from_id <= upto_id; from_id += 2) {
|
301
|
+
cp = FIX2ULONG(from_id);
|
302
|
+
SETBIT(cps, cp);
|
303
|
+
}
|
304
|
+
return self;
|
305
|
+
}
|
306
|
+
|
307
|
+
static inline VALUE
|
308
|
+
merge_rb_array(VALUE self, VALUE rb_array) {
|
309
|
+
VALUE el;
|
310
|
+
cp_byte *cps;
|
311
|
+
VALUE array_length, i;
|
312
|
+
FETCH_CODEPOINTS(self, cps);
|
313
|
+
Check_Type(rb_array, T_ARRAY);
|
314
|
+
array_length = RARRAY_LEN(rb_array);
|
315
|
+
for (i = 0; i < array_length; i++) {
|
316
|
+
el = RARRAY_AREF(rb_array, i);
|
317
|
+
raise_arg_err_unless_valid_as_cp(el);
|
318
|
+
SETBIT(cps, FIX2ULONG(el));
|
319
|
+
}
|
320
|
+
return self;
|
321
|
+
}
|
322
|
+
|
323
|
+
static VALUE
|
324
|
+
method_merge(VALUE self, VALUE other) {
|
325
|
+
rb_check_frozen(self);
|
326
|
+
if (is_character_set(other)) {
|
327
|
+
return merge_character_set(self, other);
|
328
|
+
}
|
329
|
+
else if (TYPE(other) == T_ARRAY) {
|
330
|
+
return merge_rb_array(self, other);
|
331
|
+
}
|
332
|
+
return merge_rb_range(self, other);
|
333
|
+
}
|
334
|
+
|
335
|
+
static VALUE
|
336
|
+
method_initialize_copy(VALUE self, VALUE other) {
|
337
|
+
merge_character_set(self, other);
|
338
|
+
return other;
|
339
|
+
}
|
340
|
+
|
341
|
+
static VALUE
|
342
|
+
method_subtract(VALUE self, VALUE other) {
|
343
|
+
rb_check_frozen(self);
|
344
|
+
COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
|
345
|
+
return self;
|
346
|
+
}
|
347
|
+
|
348
|
+
static inline int
|
349
|
+
a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
|
350
|
+
cp_byte *cps_a, *cps_b;
|
351
|
+
cp_index cp, size_a, size_b;
|
352
|
+
|
353
|
+
if (!is_character_set(set_a) || !is_character_set(set_b)) {
|
354
|
+
rb_raise(rb_eArgError, "pass a CharacterSet");
|
355
|
+
}
|
356
|
+
|
357
|
+
FETCH_CODEPOINTS(set_a, cps_a);
|
358
|
+
FETCH_CODEPOINTS(set_b, cps_b);
|
359
|
+
|
360
|
+
*is_proper = 0;
|
361
|
+
size_a = 0;
|
362
|
+
size_b = 0;
|
363
|
+
|
364
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
|
365
|
+
if (TSTBIT(cps_a, cp)) {
|
366
|
+
if (!TSTBIT(cps_b, cp)) return 0;
|
367
|
+
size_a++;
|
368
|
+
size_b++;
|
369
|
+
}
|
370
|
+
else if (TSTBIT(cps_b, cp)) size_b++;
|
371
|
+
}
|
372
|
+
|
373
|
+
if (size_b > size_a) *is_proper = 1;
|
374
|
+
return 1;
|
375
|
+
}
|
376
|
+
|
377
|
+
static VALUE
|
378
|
+
method_subset_p(VALUE self, VALUE other) {
|
379
|
+
int is_proper;
|
380
|
+
return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
|
381
|
+
}
|
382
|
+
|
383
|
+
static VALUE
|
384
|
+
method_proper_subset_p(VALUE self, VALUE other) {
|
385
|
+
int is, is_proper;
|
386
|
+
is = a_subset_of_b(self, other, &is_proper);
|
387
|
+
return (is && is_proper) ? Qtrue : Qfalse;
|
388
|
+
}
|
389
|
+
|
390
|
+
static VALUE
|
391
|
+
method_superset_p(VALUE self, VALUE other) {
|
392
|
+
int is_proper;
|
393
|
+
return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
|
394
|
+
}
|
395
|
+
|
396
|
+
static VALUE
|
397
|
+
method_proper_superset_p(VALUE self, VALUE other) {
|
398
|
+
int is, is_proper;
|
399
|
+
is = a_subset_of_b(other, self, &is_proper);
|
400
|
+
return (is && is_proper) ? Qtrue : Qfalse;
|
401
|
+
}
|
402
|
+
|
403
|
+
// *******************************
|
404
|
+
// `CharacterSet`-specific methods
|
405
|
+
// *******************************
|
406
|
+
|
407
|
+
static VALUE
|
408
|
+
class_method_from_ranges(VALUE self, VALUE ranges) {
|
409
|
+
VALUE new_set, range_count, i;
|
410
|
+
new_set = rb_class_new_instance(0, 0, self);
|
411
|
+
range_count = RARRAY_LEN(ranges);
|
412
|
+
for (i = 0; i < range_count; i++) {
|
413
|
+
merge_rb_range(new_set, RARRAY_AREF(ranges, i));
|
414
|
+
}
|
415
|
+
return new_set;
|
416
|
+
}
|
417
|
+
|
418
|
+
static VALUE
|
419
|
+
method_ranges(VALUE self) {
|
420
|
+
VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
|
421
|
+
|
422
|
+
ranges = rb_ary_new();
|
423
|
+
previous_codepoint = 0;
|
424
|
+
current_start = 0;
|
425
|
+
current_end = 0;
|
426
|
+
|
427
|
+
FOR_EACH_ACTIVE_CODEPOINT(
|
428
|
+
codepoint = LONG2FIX(cp);
|
429
|
+
|
430
|
+
if (!previous_codepoint) {
|
431
|
+
current_start = codepoint;
|
432
|
+
}
|
433
|
+
else if (previous_codepoint + 2 != codepoint) {
|
434
|
+
// gap found, finalize previous range
|
435
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
436
|
+
current_start = codepoint;
|
437
|
+
}
|
438
|
+
current_end = codepoint;
|
439
|
+
previous_codepoint = codepoint;
|
440
|
+
);
|
441
|
+
|
442
|
+
// add final range
|
443
|
+
if (current_start) {
|
444
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
445
|
+
}
|
446
|
+
|
447
|
+
return ranges;
|
448
|
+
}
|
449
|
+
|
450
|
+
static VALUE
|
451
|
+
method_sample(int argc, VALUE *argv, VALUE self) {
|
452
|
+
VALUE to_a_args[1], array;
|
453
|
+
rb_check_arity(argc, 0, 1);
|
454
|
+
to_a_args[0] = Qtrue;
|
455
|
+
array = method_to_a(1, to_a_args, self);
|
456
|
+
return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
|
457
|
+
}
|
458
|
+
|
459
|
+
static inline VALUE
|
460
|
+
new_set_from_section(VALUE set, cp_index from, cp_index upto) {
|
461
|
+
cp_byte *cps, *new_cps;
|
462
|
+
cp_index cp;
|
463
|
+
FETCH_CODEPOINTS(set, cps);
|
464
|
+
new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
465
|
+
for (cp = from; cp <= upto; cp++) {
|
466
|
+
if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
|
467
|
+
}
|
468
|
+
return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
|
469
|
+
}
|
470
|
+
|
471
|
+
static VALUE
|
472
|
+
method_bmp_part(VALUE self) {
|
473
|
+
return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
|
474
|
+
}
|
475
|
+
|
476
|
+
static VALUE
|
477
|
+
method_astral_part(VALUE self) {
|
478
|
+
return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
|
479
|
+
}
|
480
|
+
|
481
|
+
static inline VALUE
|
482
|
+
set_has_member_in_plane(VALUE set, unsigned int plane) {
|
483
|
+
cp_byte *cps;
|
484
|
+
cp_index cp, max_cp;
|
485
|
+
FETCH_CODEPOINTS(set, cps);
|
486
|
+
cp = plane * UNICODE_PLANE_SIZE;
|
487
|
+
max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
|
488
|
+
for (/* */; cp <= max_cp; cp++) {
|
489
|
+
if (TSTBIT(cps, cp)) return Qtrue;
|
490
|
+
}
|
491
|
+
return Qfalse;
|
492
|
+
}
|
493
|
+
|
494
|
+
static VALUE
|
495
|
+
method_planes(VALUE self) {
|
496
|
+
unsigned int i;
|
497
|
+
VALUE planes;
|
498
|
+
planes = rb_ary_new();
|
499
|
+
for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
|
500
|
+
if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
|
501
|
+
}
|
502
|
+
return planes;
|
503
|
+
}
|
504
|
+
|
505
|
+
static VALUE
|
506
|
+
method_member_in_plane_p(VALUE self, VALUE plane_num) {
|
507
|
+
int plane;
|
508
|
+
Check_Type(plane_num, T_FIXNUM);
|
509
|
+
plane = FIX2INT(plane_num);
|
510
|
+
if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
|
511
|
+
rb_raise(rb_eArgError, "plane must be between 0 and 16");
|
512
|
+
}
|
513
|
+
return set_has_member_in_plane(self, plane);
|
514
|
+
}
|
515
|
+
|
516
|
+
// True if `cp` lies outside the surrogate range 0xD800..0xDFFF.
// The argument is parenthesized so compound expressions compare correctly;
// it is evaluated twice and must not have side effects.
#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)
|
517
|
+
|
518
|
+
static VALUE
|
519
|
+
method_ext_inversion(int argc, VALUE *argv, VALUE self) {
|
520
|
+
int include_surrogates;
|
521
|
+
cp_index upto;
|
522
|
+
VALUE other;
|
523
|
+
other = 0;
|
524
|
+
rb_check_arity(argc, 0, 2);
|
525
|
+
include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
|
526
|
+
if ((argc > 1) && FIXNUM_P(argv[1])) {
|
527
|
+
upto = FIX2ULONG(argv[1]);
|
528
|
+
RETURN_NEW_SET_BASED_ON(
|
529
|
+
cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
|
530
|
+
);
|
531
|
+
}
|
532
|
+
RETURN_NEW_SET_BASED_ON(
|
533
|
+
!TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
|
534
|
+
);
|
535
|
+
}
|
536
|
+
|
537
|
+
// Callback invoked for each codepoint of a string, together with a bit
// array. Returning 0 stops the iteration (see each_sb_cp / each_mb_cp).
typedef int (*str_cp_handler)(unsigned int, cp_byte *);
|
538
|
+
|
539
|
+
static inline int
|
540
|
+
add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
|
541
|
+
SETBIT(cp_arr, str_cp);
|
542
|
+
return 1;
|
543
|
+
}
|
544
|
+
|
545
|
+
static VALUE
|
546
|
+
method_case_insensitive(VALUE self) {
|
547
|
+
cp_index i;
|
548
|
+
cp_byte *new_cps;
|
549
|
+
|
550
|
+
new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
551
|
+
|
552
|
+
FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
|
553
|
+
|
554
|
+
for (i = 0; i < CASEFOLD_COUNT; i++) {
|
555
|
+
casefold_mapping m = unicode_casefold_table[i];
|
556
|
+
|
557
|
+
if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
|
558
|
+
else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
|
559
|
+
}
|
560
|
+
|
561
|
+
return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
|
562
|
+
|
563
|
+
// OnigCaseFoldType flags;
|
564
|
+
// rb_encoding *enc;
|
565
|
+
//
|
566
|
+
// enc = rb_utf8_encoding();
|
567
|
+
//
|
568
|
+
// ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE (not public on ruby < 2.4)
|
569
|
+
// flags = (1<<13) | (1<<14);
|
570
|
+
//
|
571
|
+
// // case_map args: flags, pp, end, to, to_end, enc
|
572
|
+
// enc->case_map(flags, (const OnigUChar**)&cp, ?, ?, ?, enc);
|
573
|
+
}
|
574
|
+
|
575
|
+
static inline VALUE
|
576
|
+
each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
577
|
+
long i;
|
578
|
+
unsigned int str_cp;
|
579
|
+
|
580
|
+
for (i = 0; i < RSTRING_LEN(str); i++) {
|
581
|
+
str_cp = (RSTRING_PTR(str)[i] & 0xff);
|
582
|
+
if (!(*func)(str_cp, cp_arr)) return Qfalse;
|
583
|
+
}
|
584
|
+
|
585
|
+
return Qtrue;
|
586
|
+
}
|
587
|
+
|
588
|
+
static inline VALUE
|
589
|
+
each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
590
|
+
int n;
|
591
|
+
unsigned int str_cp;
|
592
|
+
const char *ptr, *end;
|
593
|
+
rb_encoding *enc;
|
594
|
+
|
595
|
+
str = rb_str_new_frozen(str);
|
596
|
+
ptr = RSTRING_PTR(str);
|
597
|
+
end = RSTRING_END(str);
|
598
|
+
enc = rb_enc_get(str);
|
599
|
+
|
600
|
+
while (ptr < end) {
|
601
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
|
602
|
+
if (!(*func)(str_cp, cp_arr)) return Qfalse;
|
603
|
+
ptr += n;
|
604
|
+
}
|
605
|
+
|
606
|
+
return Qtrue;
|
607
|
+
}
|
608
|
+
|
609
|
+
// single_byte_optimizable - copied from string.c
|
610
|
+
static inline int
|
611
|
+
single_byte_optimizable(VALUE str)
|
612
|
+
{
|
613
|
+
rb_encoding *enc;
|
614
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
|
615
|
+
|
616
|
+
enc = rb_enc_get(str);
|
617
|
+
if (rb_enc_mbmaxlen(enc) == 1) return 1;
|
618
|
+
|
619
|
+
return 0;
|
620
|
+
}
|
621
|
+
|
622
|
+
static inline VALUE
|
623
|
+
each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
624
|
+
if (single_byte_optimizable(str)) {
|
625
|
+
return each_sb_cp(str, func, cp_arr);
|
626
|
+
}
|
627
|
+
return each_mb_cp(str, func, cp_arr);
|
628
|
+
}
|
629
|
+
|
630
|
+
static inline void
|
631
|
+
raise_arg_err_unless_string(VALUE val) {
|
632
|
+
if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
|
633
|
+
}
|
634
|
+
|
635
|
+
static VALUE
|
636
|
+
class_method_of(VALUE self, VALUE str) {
|
637
|
+
cp_byte *cp_arr;
|
638
|
+
raise_arg_err_unless_string(str);
|
639
|
+
cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
|
640
|
+
each_cp(str, add_str_cp_to_arr, cp_arr);
|
641
|
+
return NEW_CHARACTER_SET(self, cp_arr);
|
642
|
+
}
|
643
|
+
|
644
|
+
static inline int
|
645
|
+
str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
|
646
|
+
return !TSTBIT(cp_arr, str_cp);
|
647
|
+
}
|
648
|
+
|
649
|
+
static VALUE
|
650
|
+
method_used_by_p(VALUE self, VALUE str) {
|
651
|
+
cp_byte *cps;
|
652
|
+
VALUE only_uses_other_cps;
|
653
|
+
raise_arg_err_unless_string(str);
|
654
|
+
FETCH_CODEPOINTS(self, cps);
|
655
|
+
only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
|
656
|
+
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
657
|
+
}
|
658
|
+
|
659
|
+
static inline int
|
660
|
+
str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
|
661
|
+
return TSTBIT(cp_arr, str_cp);
|
662
|
+
}
|
663
|
+
|
664
|
+
static VALUE
|
665
|
+
method_cover_p(VALUE self, VALUE str) {
|
666
|
+
cp_byte *cps;
|
667
|
+
raise_arg_err_unless_string(str);
|
668
|
+
FETCH_CODEPOINTS(self, cps);
|
669
|
+
return each_cp(str, str_cp_in_arr, cps);
|
670
|
+
}
|
671
|
+
|
672
|
+
static inline VALUE
|
673
|
+
apply_to_str(VALUE set, VALUE str, int delete, int bang) {
|
674
|
+
cp_byte *cps;
|
675
|
+
rb_encoding *str_enc;
|
676
|
+
VALUE orig_len, blen, new_str_buf, chr;
|
677
|
+
int n;
|
678
|
+
unsigned int str_cp;
|
679
|
+
const char *ptr, *end;
|
680
|
+
|
681
|
+
raise_arg_err_unless_string(str);
|
682
|
+
|
683
|
+
FETCH_CODEPOINTS(set, cps);
|
684
|
+
|
685
|
+
orig_len = RSTRING_LEN(str);
|
686
|
+
blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
|
687
|
+
new_str_buf = rb_str_buf_new(blen);
|
688
|
+
str_enc = rb_enc_get(str);
|
689
|
+
rb_enc_associate(new_str_buf, str_enc);
|
690
|
+
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
|
691
|
+
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
692
|
+
|
693
|
+
ptr = RSTRING_PTR(str);
|
694
|
+
end = RSTRING_END(str);
|
695
|
+
|
696
|
+
while (ptr < end) {
|
697
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
|
698
|
+
if (!TSTBIT(cps, str_cp) != !delete) {
|
699
|
+
chr = rb_enc_uint_chr(str_cp, str_enc);
|
700
|
+
rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
|
701
|
+
}
|
702
|
+
ptr += n;
|
703
|
+
}
|
704
|
+
|
705
|
+
if (bang) {
|
706
|
+
if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
|
707
|
+
rb_str_shared_replace(str, new_str_buf);
|
708
|
+
}
|
709
|
+
else {
|
710
|
+
RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
|
711
|
+
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
712
|
+
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
|
713
|
+
str = new_str_buf;
|
714
|
+
}
|
715
|
+
|
716
|
+
return str;
|
717
|
+
}
|
718
|
+
|
719
|
+
static VALUE
|
720
|
+
method_delete_in(VALUE self, VALUE str) {
|
721
|
+
return apply_to_str(self, str, 1, 0);
|
722
|
+
}
|
723
|
+
|
724
|
+
static VALUE
|
725
|
+
method_delete_in_bang(VALUE self, VALUE str) {
|
726
|
+
return apply_to_str(self, str, 1, 1);
|
727
|
+
}
|
728
|
+
|
729
|
+
static VALUE
|
730
|
+
method_keep_in(VALUE self, VALUE str) {
|
731
|
+
return apply_to_str(self, str, 0, 0);
|
732
|
+
}
|
733
|
+
|
734
|
+
static VALUE
|
735
|
+
method_keep_in_bang(VALUE self, VALUE str) {
|
736
|
+
return apply_to_str(self, str, 0, 1);
|
737
|
+
}
|
738
|
+
|
739
|
+
// ****
|
740
|
+
// init
|
741
|
+
// ****
|
742
|
+
|
743
|
+
void
|
744
|
+
Init_character_set()
|
745
|
+
{
|
746
|
+
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
747
|
+
|
748
|
+
rb_define_alloc_func(cs, method_allocate);
|
749
|
+
|
750
|
+
// `Set` compatibility methods
|
751
|
+
|
752
|
+
rb_define_method(cs, "each", method_each, 0);
|
753
|
+
rb_define_method(cs, "to_a", method_to_a, -1);
|
754
|
+
rb_define_method(cs, "length", method_length, 0);
|
755
|
+
rb_define_method(cs, "size", method_length, 0);
|
756
|
+
rb_define_method(cs, "count", method_length, 0);
|
757
|
+
rb_define_method(cs, "empty?", method_empty_p, 0);
|
758
|
+
rb_define_method(cs, "hash", method_hash, 0);
|
759
|
+
rb_define_method(cs, "keep_if", method_keep_if, 0);
|
760
|
+
rb_define_method(cs, "delete_if", method_delete_if, 0);
|
761
|
+
rb_define_method(cs, "clear", method_clear, 0);
|
762
|
+
rb_define_method(cs, "intersection", method_intersection, 1);
|
763
|
+
rb_define_method(cs, "&", method_intersection, 1);
|
764
|
+
rb_define_method(cs, "union", method_union, 1);
|
765
|
+
rb_define_method(cs, "+", method_union, 1);
|
766
|
+
rb_define_method(cs, "|", method_union, 1);
|
767
|
+
rb_define_method(cs, "difference", method_difference, 1);
|
768
|
+
rb_define_method(cs, "-", method_difference, 1);
|
769
|
+
rb_define_method(cs, "^", method_exclusion, 1);
|
770
|
+
rb_define_method(cs, "include?", method_include_p, 1);
|
771
|
+
rb_define_method(cs, "member?", method_include_p, 1);
|
772
|
+
rb_define_method(cs, "===", method_include_p, 1);
|
773
|
+
rb_define_method(cs, "add", method_add, 1);
|
774
|
+
rb_define_method(cs, "<<", method_add, 1);
|
775
|
+
rb_define_method(cs, "add?", method_add_p, 1);
|
776
|
+
rb_define_method(cs, "delete", method_delete, 1);
|
777
|
+
rb_define_method(cs, "delete?", method_delete_p, 1);
|
778
|
+
rb_define_method(cs, "intersect?", method_intersect_p, 1);
|
779
|
+
rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
|
780
|
+
rb_define_method(cs, "eql?", method_eql_p, 1);
|
781
|
+
rb_define_method(cs, "==", method_eql_p, 1);
|
782
|
+
rb_define_method(cs, "merge", method_merge, 1);
|
783
|
+
rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
|
784
|
+
rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
|
785
|
+
rb_define_method(cs, "subtract", method_subtract, 1);
|
786
|
+
rb_define_method(cs, "subset?", method_subset_p, 1);
|
787
|
+
rb_define_method(cs, "<=", method_subset_p, 1);
|
788
|
+
rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
|
789
|
+
rb_define_method(cs, "<", method_proper_subset_p, 1);
|
790
|
+
rb_define_method(cs, "superset?", method_superset_p, 1);
|
791
|
+
rb_define_method(cs, ">=", method_superset_p, 1);
|
792
|
+
rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
|
793
|
+
rb_define_method(cs, ">", method_proper_superset_p, 1);
|
794
|
+
|
795
|
+
// `CharacterSet`-specific methods
|
796
|
+
|
797
|
+
rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
|
798
|
+
rb_define_singleton_method(cs, "of", class_method_of, 1);
|
799
|
+
|
800
|
+
rb_define_method(cs, "ranges", method_ranges, 0);
|
801
|
+
rb_define_method(cs, "sample", method_sample, -1);
|
802
|
+
rb_define_method(cs, "bmp_part", method_bmp_part, 0);
|
803
|
+
rb_define_method(cs, "astral_part", method_astral_part, 0);
|
804
|
+
rb_define_method(cs, "planes", method_planes, 0);
|
805
|
+
rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
|
806
|
+
rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
|
807
|
+
rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
|
808
|
+
rb_define_method(cs, "used_by?", method_used_by_p, 1);
|
809
|
+
rb_define_method(cs, "cover?", method_cover_p, 1);
|
810
|
+
rb_define_method(cs, "delete_in", method_delete_in, 1);
|
811
|
+
rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
|
812
|
+
rb_define_method(cs, "keep_in", method_keep_in, 1);
|
813
|
+
rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
|
814
|
+
}
|