character_set 1.6.0-java → 1.8.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/gouteur.yml +1 -1
- data/.github/workflows/lint.yml +1 -1
- data/.github/workflows/tests.yml +3 -1
- data/.rubocop.yml +3 -0
- data/BENCHMARK.md +32 -32
- data/CHANGELOG.md +24 -1
- data/Gemfile +7 -6
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/Rakefile +2 -123
- data/character_set.gemspec +0 -7
- data/ext/character_set/character_set.c +77 -43
- data/lib/character_set/core_ext/regexp_ext.rb +8 -0
- data/lib/character_set/expression_converter.rb +37 -54
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +73 -52
- data/lib/character_set/predefined_sets/emoji.cps +10 -9
- data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
- data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -6
- data/lib/character_set/set_method_adapters.rb +1 -1
- data/lib/character_set/shared_methods.rb +6 -2
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +19 -28
- data/benchmarks/shared.rb +0 -30
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
|
376
376
|
cps = data->cps;
|
377
377
|
len = data->len;
|
378
378
|
cp = FIX2ULONG(cp_num);
|
379
|
-
if (return_nil_if_noop &&
|
379
|
+
if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
|
380
380
|
{
|
381
381
|
return Qnil;
|
382
382
|
}
|
383
|
+
|
384
|
+
if (on)
|
385
|
+
{
|
386
|
+
set_cp(data, cp);
|
387
|
+
}
|
383
388
|
else
|
384
389
|
{
|
385
|
-
|
386
|
-
{
|
387
|
-
set_cp(data, cp);
|
388
|
-
}
|
389
|
-
else
|
390
|
-
{
|
391
|
-
clr_cp(cps, len, cp);
|
392
|
-
}
|
393
|
-
return cs;
|
390
|
+
clr_cp(cps, len, cp);
|
394
391
|
}
|
392
|
+
return cs;
|
395
393
|
}
|
396
394
|
|
397
395
|
static VALUE
|
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
|
|
575
573
|
{
|
576
574
|
return cs_merge_cs(self, other);
|
577
575
|
}
|
578
|
-
|
576
|
+
if (TYPE(other) == T_ARRAY)
|
579
577
|
{
|
580
578
|
return cs_merge_rb_array(self, other);
|
581
579
|
}
|
@@ -677,6 +675,18 @@ cs_method_proper_superset_p(VALUE self, VALUE other)
|
|
677
675
|
return (is_superset && is_proper) ? Qtrue : Qfalse;
|
678
676
|
}
|
679
677
|
|
678
|
+
static VALUE
|
679
|
+
cs_method_spaceship_operator(VALUE self, VALUE other)
|
680
|
+
{
|
681
|
+
if (cs_method_eql_p(self, other))
|
682
|
+
return INT2FIX(0);
|
683
|
+
if (cs_method_proper_subset_p(self, other))
|
684
|
+
return INT2FIX(-1);
|
685
|
+
if (cs_method_proper_superset_p(self, other))
|
686
|
+
return INT2FIX(1);
|
687
|
+
return Qnil;
|
688
|
+
}
|
689
|
+
|
680
690
|
// *******************************
|
681
691
|
// `CharacterSet`-specific methods
|
682
692
|
// *******************************
|
@@ -917,10 +927,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
|
917
927
|
return new_cs;
|
918
928
|
}
|
919
929
|
|
920
|
-
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE
|
930
|
+
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
|
921
931
|
|
922
932
|
static inline int
|
923
|
-
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
933
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
924
934
|
{
|
925
935
|
set_cp(data, str_cp);
|
926
936
|
return 1;
|
@@ -967,7 +977,7 @@ cs_method_case_insensitive(VALUE self)
|
|
967
977
|
}
|
968
978
|
|
969
979
|
static inline VALUE
|
970
|
-
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
980
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
971
981
|
{
|
972
982
|
long i, str_len;
|
973
983
|
unsigned int str_cp;
|
@@ -986,21 +996,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
986
996
|
}
|
987
997
|
|
988
998
|
static inline VALUE
|
989
|
-
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
999
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
990
1000
|
{
|
991
1001
|
int n;
|
992
1002
|
unsigned int str_cp;
|
993
1003
|
const char *ptr, *end;
|
994
|
-
rb_encoding *
|
1004
|
+
rb_encoding *utf8;
|
1005
|
+
|
1006
|
+
utf8 = rb_utf8_encoding();
|
1007
|
+
if (rb_enc_get(str) == utf8)
|
1008
|
+
{
|
1009
|
+
str = rb_str_new_frozen(str);
|
1010
|
+
}
|
1011
|
+
else
|
1012
|
+
{
|
1013
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1014
|
+
}
|
995
1015
|
|
996
|
-
str = rb_str_new_frozen(str);
|
997
1016
|
ptr = RSTRING_PTR(str);
|
998
1017
|
end = RSTRING_END(str);
|
999
|
-
enc = rb_enc_get(str);
|
1000
1018
|
|
1001
1019
|
while (ptr < end)
|
1002
1020
|
{
|
1003
|
-
str_cp = rb_enc_codepoint_len(ptr, end, &n,
|
1021
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
|
1004
1022
|
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1005
1023
|
{
|
1006
1024
|
return Qfalse;
|
@@ -1031,12 +1049,13 @@ single_byte_optimizable(VALUE str)
|
|
1031
1049
|
}
|
1032
1050
|
|
1033
1051
|
static inline VALUE
|
1034
|
-
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1052
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1035
1053
|
{
|
1036
1054
|
if (single_byte_optimizable(str))
|
1037
1055
|
{
|
1038
1056
|
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
1039
1057
|
}
|
1058
|
+
|
1040
1059
|
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
1041
1060
|
}
|
1042
1061
|
|
@@ -1062,11 +1081,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
|
|
1062
1081
|
}
|
1063
1082
|
|
1064
1083
|
static inline int
|
1065
|
-
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1084
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1066
1085
|
{
|
1067
1086
|
if (tst_cp(cp_arr, len, str_cp))
|
1068
1087
|
{
|
1069
|
-
*memo += 1;
|
1088
|
+
*((VALUE *)memo) += 1;
|
1070
1089
|
}
|
1071
1090
|
return 1;
|
1072
1091
|
}
|
@@ -1074,17 +1093,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
|
|
1074
1093
|
static VALUE
|
1075
1094
|
cs_method_count_in(VALUE self, VALUE str)
|
1076
1095
|
{
|
1077
|
-
|
1096
|
+
long count;
|
1078
1097
|
struct cs_data *data;
|
1079
1098
|
raise_arg_err_unless_string(str);
|
1080
1099
|
data = cs_fetch_data(self);
|
1081
1100
|
count = 0;
|
1082
|
-
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1083
|
-
return
|
1101
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
|
1102
|
+
return LONG2FIX(count);
|
1084
1103
|
}
|
1085
1104
|
|
1086
1105
|
static inline int
|
1087
|
-
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1106
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1088
1107
|
{
|
1089
1108
|
return tst_cp(cp_arr, len, str_cp);
|
1090
1109
|
}
|
@@ -1099,11 +1118,11 @@ cs_method_cover_p(VALUE self, VALUE str)
|
|
1099
1118
|
}
|
1100
1119
|
|
1101
1120
|
static inline int
|
1102
|
-
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1121
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1103
1122
|
{
|
1104
1123
|
if (tst_cp(cp_arr, len, str_cp))
|
1105
1124
|
{
|
1106
|
-
rb_ary_push(memo
|
1125
|
+
rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
|
1107
1126
|
}
|
1108
1127
|
return 1;
|
1109
1128
|
}
|
@@ -1111,18 +1130,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
1111
1130
|
static VALUE
|
1112
1131
|
cs_method_scan(VALUE self, VALUE str)
|
1113
1132
|
{
|
1114
|
-
VALUE memo
|
1133
|
+
VALUE memo;
|
1115
1134
|
struct cs_data *data;
|
1116
1135
|
raise_arg_err_unless_string(str);
|
1117
1136
|
data = cs_fetch_data(self);
|
1118
|
-
memo
|
1119
|
-
memo[1] = (VALUE)rb_enc_get(str);
|
1137
|
+
memo = rb_ary_new();
|
1120
1138
|
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1121
|
-
return memo
|
1139
|
+
return memo;
|
1122
1140
|
}
|
1123
1141
|
|
1124
1142
|
static inline int
|
1125
|
-
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE
|
1143
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1126
1144
|
{
|
1127
1145
|
return !tst_cp(cp_arr, len, str_cp);
|
1128
1146
|
}
|
@@ -1146,9 +1164,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1146
1164
|
cs_cp cs_len;
|
1147
1165
|
VALUE orig_str_len;
|
1148
1166
|
|
1149
|
-
rb_encoding *
|
1167
|
+
rb_encoding *orig_enc, *utf8;
|
1150
1168
|
char *s, *send, *t;
|
1151
|
-
int
|
1169
|
+
int orig_was_utf8, cr;
|
1152
1170
|
|
1153
1171
|
raise_arg_err_unless_string(str);
|
1154
1172
|
|
@@ -1159,24 +1177,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1159
1177
|
return bang ? Qnil : str;
|
1160
1178
|
}
|
1161
1179
|
|
1162
|
-
|
1180
|
+
orig_enc = rb_enc_get(str);
|
1181
|
+
utf8 = rb_utf8_encoding();
|
1182
|
+
orig_was_utf8 = orig_enc == utf8;
|
1183
|
+
|
1184
|
+
if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
|
1185
|
+
{
|
1186
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1187
|
+
}
|
1188
|
+
else
|
1163
1189
|
{
|
1164
|
-
|
1190
|
+
if (!bang)
|
1191
|
+
{
|
1192
|
+
str = rb_str_dup(str);
|
1193
|
+
}
|
1165
1194
|
}
|
1166
1195
|
|
1167
1196
|
cps = cs_fetch_cps(set, &cs_len);
|
1168
1197
|
rb_str_modify(str);
|
1169
|
-
enc = rb_enc_get(str);
|
1170
|
-
ascompat = rb_enc_asciicompat(enc);
|
1171
1198
|
s = t = RSTRING_PTR(str);
|
1172
1199
|
send = RSTRING_END(str);
|
1173
|
-
cr =
|
1200
|
+
cr = ENC_CODERANGE_7BIT;
|
1201
|
+
|
1174
1202
|
while (s < send)
|
1175
1203
|
{
|
1176
1204
|
unsigned int c;
|
1177
1205
|
int clen;
|
1178
1206
|
|
1179
|
-
if (
|
1207
|
+
if ((c = *(unsigned char *)s) < 0x80)
|
1180
1208
|
{
|
1181
1209
|
if (tst_cp(cps, cs_len, c) != delete)
|
1182
1210
|
{
|
@@ -1188,12 +1216,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1188
1216
|
}
|
1189
1217
|
else
|
1190
1218
|
{
|
1191
|
-
c = rb_enc_codepoint_len(s, send, &clen,
|
1219
|
+
c = rb_enc_codepoint_len(s, send, &clen, utf8);
|
1192
1220
|
|
1193
1221
|
if (tst_cp(cps, cs_len, c) != delete)
|
1194
1222
|
{
|
1195
1223
|
if (t != s)
|
1196
|
-
rb_enc_mbcput(c, t,
|
1224
|
+
rb_enc_mbcput(c, t, utf8);
|
1197
1225
|
t += clen;
|
1198
1226
|
if (cr == ENC_CODERANGE_7BIT)
|
1199
1227
|
cr = ENC_CODERANGE_VALID;
|
@@ -1210,6 +1238,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1210
1238
|
return Qnil;
|
1211
1239
|
}
|
1212
1240
|
|
1241
|
+
if (!orig_was_utf8)
|
1242
|
+
{
|
1243
|
+
return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
|
1244
|
+
}
|
1245
|
+
|
1213
1246
|
return str;
|
1214
1247
|
}
|
1215
1248
|
|
@@ -1303,6 +1336,7 @@ void Init_character_set()
|
|
1303
1336
|
rb_define_method(cs, ">=", cs_method_superset_p, 1);
|
1304
1337
|
rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
|
1305
1338
|
rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
|
1339
|
+
rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1);
|
1306
1340
|
|
1307
1341
|
// `CharacterSet`-specific methods
|
1308
1342
|
|
@@ -4,6 +4,14 @@ class CharacterSet
|
|
4
4
|
def character_set
|
5
5
|
CharacterSet.of_regexp(self)
|
6
6
|
end
|
7
|
+
|
8
|
+
def covered_by_character_set?(other)
|
9
|
+
other.superset?(character_set)
|
10
|
+
end
|
11
|
+
|
12
|
+
def uses_character_set?(other)
|
13
|
+
other.intersect?(character_set)
|
14
|
+
end
|
7
15
|
end
|
8
16
|
end
|
9
17
|
end
|
@@ -4,86 +4,61 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
Error = Class.new(ArgumentError)
|
6
6
|
|
7
|
-
def convert(expression, to = CharacterSet)
|
7
|
+
def convert(expression, to = CharacterSet, acc = [])
|
8
8
|
CharacterSet.require_optional_dependency('regexp_parser', __method__)
|
9
9
|
|
10
10
|
case expression
|
11
|
-
when Regexp::Expression::Root
|
12
|
-
if expression.count != 1
|
13
|
-
raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
|
14
|
-
end
|
15
|
-
convert(expression[0], to)
|
16
|
-
|
17
11
|
when Regexp::Expression::CharacterSet
|
18
|
-
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
19
|
-
content
|
20
|
-
expression.negative? ? content.inversion : content
|
12
|
+
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
13
|
+
acc << (expression.negative? ? content.inversion : content)
|
21
14
|
|
22
15
|
when Regexp::Expression::CharacterSet::Intersection
|
23
|
-
expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
24
|
-
|
25
|
-
when Regexp::Expression::CharacterSet::IntersectedSequence
|
26
|
-
expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
16
|
+
acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
27
17
|
|
28
18
|
when Regexp::Expression::CharacterSet::Range
|
29
19
|
start, finish = expression.map { |subexp| convert(subexp, to) }
|
30
|
-
to.new((start.min)..(finish.max))
|
20
|
+
acc << to.new((start.min)..(finish.max))
|
21
|
+
|
22
|
+
when Regexp::Expression::Subexpression # root, group, alternation, etc.
|
23
|
+
expression.each { |subexp| convert(subexp, to, acc) }
|
31
24
|
|
32
25
|
when Regexp::Expression::CharacterType::Any
|
33
|
-
to.unicode
|
26
|
+
acc << to.unicode
|
34
27
|
|
35
28
|
when Regexp::Expression::CharacterType::Base
|
36
29
|
/(?<negative>non)?(?<base_name>.+)/ =~ expression.token
|
37
30
|
content =
|
38
31
|
if expression.unicode_classes?
|
39
|
-
# in u-mode, type shortcuts match the same as \p{<long type name>}
|
40
|
-
|
32
|
+
# in u-mode, most type shortcuts match the same as \p{<long type name>}
|
33
|
+
if base_name == 'linebreak'
|
34
|
+
to.from_ranges(10..13, 133..133, 8232..8233)
|
35
|
+
else
|
36
|
+
to.of_property(base_name)
|
37
|
+
end
|
41
38
|
else
|
42
39
|
# in normal mode, types match only ascii chars
|
43
40
|
case base_name.to_sym
|
44
|
-
when :digit
|
45
|
-
when :hex
|
46
|
-
when :
|
47
|
-
when :
|
41
|
+
when :digit then to.from_ranges(48..57)
|
42
|
+
when :hex then to.from_ranges(48..57, 65..70, 97..102)
|
43
|
+
when :linebreak then to.from_ranges(10..13)
|
44
|
+
when :space then to.from_ranges(9..13, 32..32)
|
45
|
+
when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
|
48
46
|
else raise Error, "Unsupported CharacterType #{base_name}"
|
49
47
|
end
|
50
48
|
end
|
51
|
-
negative ? content.inversion : content
|
49
|
+
acc << (negative ? content.inversion : content)
|
52
50
|
|
53
51
|
when Regexp::Expression::EscapeSequence::CodepointList
|
54
|
-
to.new(expression.codepoints)
|
52
|
+
content = to.new(expression.codepoints)
|
53
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
55
54
|
|
56
55
|
when Regexp::Expression::EscapeSequence::Base
|
57
|
-
to[expression.codepoint]
|
58
|
-
|
59
|
-
when Regexp::Expression::Group::Capture,
|
60
|
-
Regexp::Expression::Group::Passive,
|
61
|
-
Regexp::Expression::Group::Named,
|
62
|
-
Regexp::Expression::Group::Atomic,
|
63
|
-
Regexp::Expression::Group::Options
|
64
|
-
case expression.count
|
65
|
-
when 0 then to[]
|
66
|
-
when 1 then convert(expression.first, to)
|
67
|
-
else
|
68
|
-
raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
|
69
|
-
end
|
70
|
-
|
71
|
-
when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
|
72
|
-
expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
73
|
-
|
74
|
-
when Regexp::Expression::Alternative
|
75
|
-
case expression.count
|
76
|
-
when 0 then to[]
|
77
|
-
when 1 then convert(expression.first, to)
|
78
|
-
else
|
79
|
-
raise Error, 'Alternatives must contain exactly one expression'
|
80
|
-
end
|
56
|
+
content = to[expression.codepoint]
|
57
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
81
58
|
|
82
59
|
when Regexp::Expression::Literal
|
83
|
-
|
84
|
-
|
85
|
-
end
|
86
|
-
to[expression.text.ord]
|
60
|
+
content = to[*expression.text.chars]
|
61
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
87
62
|
|
88
63
|
when Regexp::Expression::UnicodeProperty::Base,
|
89
64
|
Regexp::Expression::PosixClass
|
@@ -91,14 +66,22 @@ class CharacterSet
|
|
91
66
|
if expression.type == :posixclass && expression.ascii_classes?
|
92
67
|
content = content.ascii_part
|
93
68
|
end
|
94
|
-
expression.negative? ? content.inversion : content
|
69
|
+
acc << (expression.negative? ? content.inversion : content)
|
70
|
+
|
71
|
+
when Regexp::Expression::Anchor::Base,
|
72
|
+
Regexp::Expression::Backreference::Base,
|
73
|
+
Regexp::Expression::Keep::Mark,
|
74
|
+
Regexp::Expression::Quantifier
|
75
|
+
# ignore zero-length and repeat expressions
|
95
76
|
|
96
77
|
when Regexp::Expression::Base
|
97
78
|
raise Error, "Unsupported expression class `#{expression.class}`"
|
98
79
|
|
99
80
|
else
|
100
|
-
raise Error,
|
81
|
+
raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
|
101
82
|
end
|
83
|
+
|
84
|
+
acc.reduce(:+) || to[]
|
102
85
|
end
|
103
86
|
end
|
104
87
|
end
|
data/lib/character_set/parser.rb
CHANGED
@@ -4,11 +4,15 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
def codepoints_from_enumerable(object)
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
7
|
+
|
7
8
|
# Use #each to check first element (only this works for all Enumerables)
|
8
|
-
object.each do |
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
object.each do |el| # rubocop:disable Lint/UnreachableLoop
|
10
|
+
if el.is_a?(Integer) && el >= 0 && el < 0x110000
|
11
|
+
return object
|
12
|
+
elsif el.is_a?(String) && el.length == 1
|
13
|
+
return object.to_a.join.encode('utf-8').codepoints
|
14
|
+
end
|
15
|
+
raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
|
12
16
|
end
|
13
17
|
end
|
14
18
|
|