character_set 1.3.0-java → 1.6.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +26 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +17 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +50 -1
- data/Gemfile +14 -0
- data/README.md +35 -9
- data/Rakefile +6 -3
- data/benchmarks/delete_in.rb +5 -1
- data/benchmarks/keep_in.rb +5 -1
- data/benchmarks/shared.rb +5 -1
- data/character_set.gemspec +6 -9
- data/ext/character_set/character_set.c +61 -93
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +25 -24
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets/assigned.cps +51 -40
- data/lib/character_set/predefined_sets/emoji.cps +12 -11
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +5 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +23 -15
- data/lib/character_set/ruby_fallback.rb +5 -1
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +24 -10
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +14 -122
- data/.travis.yml +0 -9
|
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
|
|
|
82
82
|
.dsize = cs_memsize,
|
|
83
83
|
},
|
|
84
84
|
.data = NULL,
|
|
85
|
+
#ifdef RUBY_TYPED_FROZEN_SHAREABLE
|
|
86
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
|
|
87
|
+
#else
|
|
85
88
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
|
89
|
+
#endif
|
|
86
90
|
};
|
|
87
91
|
|
|
88
92
|
static inline VALUE
|
|
@@ -216,6 +220,7 @@ cs_method_hash(VALUE self)
|
|
|
216
220
|
cs_cp cp, len, hash, four_byte_value;
|
|
217
221
|
cs_ar *cps;
|
|
218
222
|
cps = cs_fetch_cps(self, &len);
|
|
223
|
+
four_byte_value = 0;
|
|
219
224
|
|
|
220
225
|
hash = 17;
|
|
221
226
|
for (cp = 0; cp < len; cp++)
|
|
@@ -314,9 +319,9 @@ cs_method_minmax(VALUE self)
|
|
|
314
319
|
cs_cp cp, alen, blen; \
|
|
315
320
|
cs_ar *acps, *bcps; \
|
|
316
321
|
struct cs_data *new_data; \
|
|
317
|
-
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
|
318
322
|
acps = cs_fetch_cps(cs_a, &alen); \
|
|
319
323
|
bcps = cs_fetch_cps(cs_b, &blen); \
|
|
324
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
|
320
325
|
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
|
321
326
|
{ \
|
|
322
327
|
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
|
@@ -1045,13 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
|
|
|
1045
1050
|
}
|
|
1046
1051
|
|
|
1047
1052
|
static VALUE
|
|
1048
|
-
|
|
1053
|
+
cs_class_method_of_string(VALUE self, VALUE string)
|
|
1049
1054
|
{
|
|
1050
1055
|
VALUE new_cs;
|
|
1051
1056
|
struct cs_data *new_data;
|
|
1057
|
+
|
|
1058
|
+
raise_arg_err_unless_string(string);
|
|
1052
1059
|
new_cs = cs_alloc(self, &new_data);
|
|
1053
|
-
|
|
1054
|
-
each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
|
|
1060
|
+
each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
|
|
1055
1061
|
return new_cs;
|
|
1056
1062
|
}
|
|
1057
1063
|
|
|
@@ -1074,7 +1080,7 @@ cs_method_count_in(VALUE self, VALUE str)
|
|
|
1074
1080
|
data = cs_fetch_data(self);
|
|
1075
1081
|
count = 0;
|
|
1076
1082
|
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
|
1077
|
-
return INT2NUM(count);
|
|
1083
|
+
return INT2NUM((int)count);
|
|
1078
1084
|
}
|
|
1079
1085
|
|
|
1080
1086
|
static inline int
|
|
@@ -1132,118 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
|
|
|
1132
1138
|
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
|
1133
1139
|
}
|
|
1134
1140
|
|
|
1135
|
-
|
|
1136
|
-
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
|
1137
|
-
{
|
|
1138
|
-
long total, olen;
|
|
1139
|
-
char *sptr;
|
|
1140
|
-
|
|
1141
|
-
RSTRING_GETMEM(str, sptr, olen);
|
|
1142
|
-
sptr = RSTRING(str)->as.heap.ptr;
|
|
1143
|
-
olen = RSTRING(str)->as.heap.len;
|
|
1144
|
-
total = olen + len;
|
|
1145
|
-
memcpy(sptr + olen, ptr, len);
|
|
1146
|
-
RSTRING(str)->as.heap.len = total;
|
|
1147
|
-
}
|
|
1148
|
-
|
|
1149
|
-
#ifndef TERM_FILL
|
|
1150
|
-
#define TERM_FILL(ptr, termlen) \
|
|
1151
|
-
do \
|
|
1152
|
-
{ \
|
|
1153
|
-
char *const term_fill_ptr = (ptr); \
|
|
1154
|
-
const int term_fill_len = (termlen); \
|
|
1155
|
-
*term_fill_ptr = '\0'; \
|
|
1156
|
-
if (__builtin_expect(!!(term_fill_len > 1), 0)) \
|
|
1157
|
-
memset(term_fill_ptr, 0, term_fill_len); \
|
|
1158
|
-
} while (0)
|
|
1159
|
-
#endif
|
|
1160
|
-
|
|
1161
|
-
static void
|
|
1162
|
-
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
|
1163
|
-
{
|
|
1164
|
-
char *ptr;
|
|
1165
|
-
long len;
|
|
1166
|
-
|
|
1167
|
-
ptr = RSTRING(str)->as.heap.ptr;
|
|
1168
|
-
len = RSTRING(str)->as.heap.len;
|
|
1169
|
-
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
|
1170
|
-
}
|
|
1171
|
-
|
|
1141
|
+
// partially based on rb_str_delete_bang
|
|
1172
1142
|
static inline VALUE
|
|
1173
1143
|
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1174
1144
|
{
|
|
1175
1145
|
cs_ar *cps;
|
|
1176
|
-
cs_cp
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1146
|
+
cs_cp cs_len;
|
|
1147
|
+
VALUE orig_str_len;
|
|
1148
|
+
|
|
1149
|
+
rb_encoding *enc;
|
|
1150
|
+
char *s, *send, *t;
|
|
1151
|
+
int ascompat, cr;
|
|
1182
1152
|
|
|
1183
1153
|
raise_arg_err_unless_string(str);
|
|
1184
1154
|
|
|
1185
|
-
|
|
1155
|
+
orig_str_len = RSTRING_LEN(str);
|
|
1186
1156
|
|
|
1187
|
-
|
|
1188
|
-
if (orig_len < 1) // empty string, will never change
|
|
1157
|
+
if (orig_str_len == 0)
|
|
1189
1158
|
{
|
|
1190
|
-
|
|
1191
|
-
{
|
|
1192
|
-
return Qnil;
|
|
1193
|
-
}
|
|
1194
|
-
return rb_str_dup(str);
|
|
1159
|
+
return bang ? Qnil : str;
|
|
1195
1160
|
}
|
|
1196
1161
|
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
|
1202
|
-
|
|
1203
|
-
ptr = RSTRING_PTR(str);
|
|
1204
|
-
end = RSTRING_END(str);
|
|
1162
|
+
if (!bang)
|
|
1163
|
+
{
|
|
1164
|
+
str = rb_str_dup(str);
|
|
1165
|
+
}
|
|
1205
1166
|
|
|
1206
|
-
|
|
1167
|
+
cps = cs_fetch_cps(set, &cs_len);
|
|
1168
|
+
rb_str_modify(str);
|
|
1169
|
+
enc = rb_enc_get(str);
|
|
1170
|
+
ascompat = rb_enc_asciicompat(enc);
|
|
1171
|
+
s = t = RSTRING_PTR(str);
|
|
1172
|
+
send = RSTRING_END(str);
|
|
1173
|
+
cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
|
|
1174
|
+
while (s < send)
|
|
1207
1175
|
{
|
|
1208
|
-
|
|
1176
|
+
unsigned int c;
|
|
1177
|
+
int clen;
|
|
1178
|
+
|
|
1179
|
+
if (ascompat && (c = *(unsigned char *)s) < 0x80)
|
|
1209
1180
|
{
|
|
1210
|
-
|
|
1211
|
-
if ((!tst_cp(cps, len, str_cp)) == delete)
|
|
1181
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
|
1212
1182
|
{
|
|
1213
|
-
|
|
1183
|
+
if (t != s)
|
|
1184
|
+
*t = c;
|
|
1185
|
+
t++;
|
|
1214
1186
|
}
|
|
1215
|
-
|
|
1187
|
+
s++;
|
|
1216
1188
|
}
|
|
1217
|
-
|
|
1218
|
-
else // likely to be multibyte string
|
|
1219
|
-
{
|
|
1220
|
-
while (ptr < end)
|
|
1189
|
+
else
|
|
1221
1190
|
{
|
|
1222
|
-
|
|
1223
|
-
|
|
1191
|
+
c = rb_enc_codepoint_len(s, send, &clen, enc);
|
|
1192
|
+
|
|
1193
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
|
1224
1194
|
{
|
|
1225
|
-
|
|
1195
|
+
if (t != s)
|
|
1196
|
+
rb_enc_mbcput(c, t, enc);
|
|
1197
|
+
t += clen;
|
|
1198
|
+
if (cr == ENC_CODERANGE_7BIT)
|
|
1199
|
+
cr = ENC_CODERANGE_VALID;
|
|
1226
1200
|
}
|
|
1227
|
-
|
|
1201
|
+
s += clen;
|
|
1228
1202
|
}
|
|
1229
1203
|
}
|
|
1230
1204
|
|
|
1231
|
-
|
|
1205
|
+
rb_str_set_len(str, t - RSTRING_PTR(str));
|
|
1206
|
+
ENC_CODERANGE_SET(str, cr);
|
|
1232
1207
|
|
|
1233
|
-
if (bang)
|
|
1234
|
-
{
|
|
1235
|
-
if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
|
|
1236
|
-
{
|
|
1237
|
-
return Qnil;
|
|
1238
|
-
}
|
|
1239
|
-
rb_str_shared_replace(str, new_str_buf);
|
|
1240
|
-
}
|
|
1241
|
-
else
|
|
1208
|
+
if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
|
|
1242
1209
|
{
|
|
1243
|
-
|
|
1244
|
-
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
|
1245
|
-
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
|
|
1246
|
-
str = new_str_buf;
|
|
1210
|
+
return Qnil;
|
|
1247
1211
|
}
|
|
1248
1212
|
|
|
1249
1213
|
return str;
|
|
@@ -1285,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
|
|
|
1285
1249
|
|
|
1286
1250
|
void Init_character_set()
|
|
1287
1251
|
{
|
|
1252
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
|
1253
|
+
rb_ext_ractor_safe(true);
|
|
1254
|
+
#endif
|
|
1255
|
+
|
|
1288
1256
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
|
1289
1257
|
|
|
1290
1258
|
rb_define_alloc_func(cs, cs_method_allocate);
|
|
@@ -1339,7 +1307,7 @@ void Init_character_set()
|
|
|
1339
1307
|
// `CharacterSet`-specific methods
|
|
1340
1308
|
|
|
1341
1309
|
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
|
1342
|
-
rb_define_singleton_method(cs, "
|
|
1310
|
+
rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
|
|
1343
1311
|
|
|
1344
1312
|
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
|
1345
1313
|
rb_define_method(cs, "sample", cs_method_sample, -1);
|
|
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
|
|
|
6
6
|
unsigned long to;
|
|
7
7
|
} casefold_mapping;
|
|
8
8
|
|
|
9
|
-
#define CASEFOLD_COUNT
|
|
9
|
+
#define CASEFOLD_COUNT 1426
|
|
10
10
|
|
|
11
11
|
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
12
12
|
{0x0041,0x0061},
|
|
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
|
564
564
|
{0x104D1,0x104F9},
|
|
565
565
|
{0x104D2,0x104FA},
|
|
566
566
|
{0x104D3,0x104FB},
|
|
567
|
+
{0x10570,0x10597},
|
|
568
|
+
{0x10571,0x10598},
|
|
569
|
+
{0x10572,0x10599},
|
|
570
|
+
{0x10573,0x1059A},
|
|
571
|
+
{0x10574,0x1059B},
|
|
572
|
+
{0x10575,0x1059C},
|
|
573
|
+
{0x10576,0x1059D},
|
|
574
|
+
{0x10577,0x1059E},
|
|
575
|
+
{0x10578,0x1059F},
|
|
576
|
+
{0x10579,0x105A0},
|
|
577
|
+
{0x1057A,0x105A1},
|
|
578
|
+
{0x1057C,0x105A3},
|
|
579
|
+
{0x1057D,0x105A4},
|
|
580
|
+
{0x1057E,0x105A5},
|
|
581
|
+
{0x1057F,0x105A6},
|
|
582
|
+
{0x10580,0x105A7},
|
|
583
|
+
{0x10581,0x105A8},
|
|
584
|
+
{0x10582,0x105A9},
|
|
585
|
+
{0x10583,0x105AA},
|
|
586
|
+
{0x10584,0x105AB},
|
|
587
|
+
{0x10585,0x105AC},
|
|
588
|
+
{0x10586,0x105AD},
|
|
589
|
+
{0x10587,0x105AE},
|
|
590
|
+
{0x10588,0x105AF},
|
|
591
|
+
{0x10589,0x105B0},
|
|
592
|
+
{0x1058A,0x105B1},
|
|
593
|
+
{0x1058C,0x105B3},
|
|
594
|
+
{0x1058D,0x105B4},
|
|
595
|
+
{0x1058E,0x105B5},
|
|
596
|
+
{0x1058F,0x105B6},
|
|
597
|
+
{0x10590,0x105B7},
|
|
598
|
+
{0x10591,0x105B8},
|
|
599
|
+
{0x10592,0x105B9},
|
|
600
|
+
{0x10594,0x105BB},
|
|
601
|
+
{0x10595,0x105BC},
|
|
567
602
|
{0x10A0,0x2D00},
|
|
568
603
|
{0x10A1,0x2D01},
|
|
569
604
|
{0x10A2,0x2D02},
|
|
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
|
1102
1137
|
{0x2C2C,0x2C5C},
|
|
1103
1138
|
{0x2C2D,0x2C5D},
|
|
1104
1139
|
{0x2C2E,0x2C5E},
|
|
1140
|
+
{0x2C2F,0x2C5F},
|
|
1105
1141
|
{0x2C60,0x2C61},
|
|
1106
1142
|
{0x2C62,0x026B},
|
|
1107
1143
|
{0x2C63,0x1D7D},
|
|
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
|
1282
1318
|
{0xA7BA,0xA7BB},
|
|
1283
1319
|
{0xA7BC,0xA7BD},
|
|
1284
1320
|
{0xA7BE,0xA7BF},
|
|
1321
|
+
{0xA7C0,0xA7C1},
|
|
1285
1322
|
{0xA7C2,0xA7C3},
|
|
1286
1323
|
{0xA7C4,0xA794},
|
|
1287
1324
|
{0xA7C5,0x0282},
|
|
1288
1325
|
{0xA7C6,0x1D8E},
|
|
1326
|
+
{0xA7C7,0xA7C8},
|
|
1327
|
+
{0xA7C9,0xA7CA},
|
|
1328
|
+
{0xA7D0,0xA7D1},
|
|
1329
|
+
{0xA7D6,0xA7D7},
|
|
1330
|
+
{0xA7D8,0xA7D9},
|
|
1331
|
+
{0xA7F5,0xA7F6},
|
|
1289
1332
|
{0xAB70,0x13A0},
|
|
1290
1333
|
{0xAB71,0x13A1},
|
|
1291
1334
|
{0xAB72,0x13A2},
|
|
@@ -2,7 +2,7 @@ class CharacterSet
|
|
|
2
2
|
module CoreExt
|
|
3
3
|
module StringExt
|
|
4
4
|
def character_set
|
|
5
|
-
CharacterSet.
|
|
5
|
+
CharacterSet.of_string(self)
|
|
6
6
|
end
|
|
7
7
|
|
|
8
8
|
{
|
|
@@ -29,4 +29,4 @@ class CharacterSet
|
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
::String.
|
|
32
|
+
::String.instance_eval { include CharacterSet::CoreExt::StringExt }
|
|
@@ -4,56 +4,57 @@ class CharacterSet
|
|
|
4
4
|
|
|
5
5
|
Error = Class.new(ArgumentError)
|
|
6
6
|
|
|
7
|
-
def convert(expression)
|
|
8
|
-
CharacterSet.require_optional_dependency('regexp_parser')
|
|
7
|
+
def convert(expression, to = CharacterSet)
|
|
8
|
+
CharacterSet.require_optional_dependency('regexp_parser', __method__)
|
|
9
9
|
|
|
10
10
|
case expression
|
|
11
11
|
when Regexp::Expression::Root
|
|
12
12
|
if expression.count != 1
|
|
13
13
|
raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
|
|
14
14
|
end
|
|
15
|
-
convert(expression[0])
|
|
15
|
+
convert(expression[0], to)
|
|
16
16
|
|
|
17
17
|
when Regexp::Expression::CharacterSet
|
|
18
|
-
content = expression.map { |subexp| convert(subexp) }.reduce(:+)
|
|
18
|
+
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
|
19
|
+
content ||= to[]
|
|
19
20
|
expression.negative? ? content.inversion : content
|
|
20
21
|
|
|
21
22
|
when Regexp::Expression::CharacterSet::Intersection
|
|
22
|
-
expression.map { |subexp| convert(subexp) }.reduce(:&)
|
|
23
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
|
23
24
|
|
|
24
25
|
when Regexp::Expression::CharacterSet::IntersectedSequence
|
|
25
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
|
26
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
|
26
27
|
|
|
27
28
|
when Regexp::Expression::CharacterSet::Range
|
|
28
|
-
start, finish = expression.map { |subexp| convert(subexp) }
|
|
29
|
-
|
|
29
|
+
start, finish = expression.map { |subexp| convert(subexp, to) }
|
|
30
|
+
to.new((start.min)..(finish.max))
|
|
30
31
|
|
|
31
32
|
when Regexp::Expression::CharacterType::Any
|
|
32
|
-
|
|
33
|
+
to.unicode
|
|
33
34
|
|
|
34
35
|
when Regexp::Expression::CharacterType::Base
|
|
35
36
|
/(?<negative>non)?(?<base_name>.+)/ =~ expression.token
|
|
36
37
|
content =
|
|
37
38
|
if expression.unicode_classes?
|
|
38
39
|
# in u-mode, type shortcuts match the same as \p{<long type name>}
|
|
39
|
-
|
|
40
|
+
to.of_property(base_name)
|
|
40
41
|
else
|
|
41
42
|
# in normal mode, types match only ascii chars
|
|
42
43
|
case base_name.to_sym
|
|
43
|
-
when :digit then
|
|
44
|
-
when :hex then
|
|
45
|
-
when :space then
|
|
46
|
-
when :word then
|
|
44
|
+
when :digit then to.from_ranges(48..57)
|
|
45
|
+
when :hex then to.from_ranges(48..57, 65..70, 97..102)
|
|
46
|
+
when :space then to.from_ranges(9..13, 32..32)
|
|
47
|
+
when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
|
|
47
48
|
else raise Error, "Unsupported CharacterType #{base_name}"
|
|
48
49
|
end
|
|
49
50
|
end
|
|
50
51
|
negative ? content.inversion : content
|
|
51
52
|
|
|
52
53
|
when Regexp::Expression::EscapeSequence::CodepointList
|
|
53
|
-
|
|
54
|
+
to.new(expression.codepoints)
|
|
54
55
|
|
|
55
56
|
when Regexp::Expression::EscapeSequence::Base
|
|
56
|
-
|
|
57
|
+
to[expression.codepoint]
|
|
57
58
|
|
|
58
59
|
when Regexp::Expression::Group::Capture,
|
|
59
60
|
Regexp::Expression::Group::Passive,
|
|
@@ -61,19 +62,19 @@ class CharacterSet
|
|
|
61
62
|
Regexp::Expression::Group::Atomic,
|
|
62
63
|
Regexp::Expression::Group::Options
|
|
63
64
|
case expression.count
|
|
64
|
-
when 0 then
|
|
65
|
-
when 1 then convert(expression.first)
|
|
65
|
+
when 0 then to[]
|
|
66
|
+
when 1 then convert(expression.first, to)
|
|
66
67
|
else
|
|
67
68
|
raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
|
|
68
69
|
end
|
|
69
70
|
|
|
70
|
-
when Regexp::Expression::Alternation
|
|
71
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
|
71
|
+
when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
|
|
72
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
|
72
73
|
|
|
73
74
|
when Regexp::Expression::Alternative
|
|
74
75
|
case expression.count
|
|
75
|
-
when 0 then
|
|
76
|
-
when 1 then convert(expression.first)
|
|
76
|
+
when 0 then to[]
|
|
77
|
+
when 1 then convert(expression.first, to)
|
|
77
78
|
else
|
|
78
79
|
raise Error, 'Alternatives must contain exactly one expression'
|
|
79
80
|
end
|
|
@@ -82,11 +83,11 @@ class CharacterSet
|
|
|
82
83
|
if expression.set_level == 0 && expression.text.size != 1
|
|
83
84
|
raise Error, 'Literal runs outside of sets are codepoint *sequences*'
|
|
84
85
|
end
|
|
85
|
-
|
|
86
|
+
to[expression.text.ord]
|
|
86
87
|
|
|
87
88
|
when Regexp::Expression::UnicodeProperty::Base,
|
|
88
89
|
Regexp::Expression::PosixClass
|
|
89
|
-
content =
|
|
90
|
+
content = to.of_property(expression.token)
|
|
90
91
|
if expression.type == :posixclass && expression.ascii_classes?
|
|
91
92
|
content = content.ascii_part
|
|
92
93
|
end
|
data/lib/character_set/parser.rb
CHANGED
|
@@ -5,7 +5,7 @@ class CharacterSet
|
|
|
5
5
|
def codepoints_from_enumerable(object)
|
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
|
7
7
|
# Use #each to check first element (only this works for all Enumerables)
|
|
8
|
-
object.each do |e|
|
|
8
|
+
object.each do |e| # rubocop:disable Lint/UnreachableLoop
|
|
9
9
|
return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
|
|
10
10
|
return object.map(&:ord) if e.is_a?(String) && e.length == 1
|
|
11
11
|
raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
|