character_set 1.4.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +28 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +20 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +64 -1
- data/Gemfile +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -9
- data/Rakefile +2 -120
- data/character_set.gemspec +0 -10
- data/ext/character_set/character_set.c +123 -121
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/core_ext/regexp_ext.rb +9 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +40 -56
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +110 -78
- data/lib/character_set/predefined_sets/emoji.cps +16 -14
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
- data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -2
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +25 -11
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
- data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +29 -146
- data/.travis.yml +0 -9
- data/benchmarks/shared.rb +0 -26
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
|
|
82
82
|
.dsize = cs_memsize,
|
83
83
|
},
|
84
84
|
.data = NULL,
|
85
|
+
#ifdef RUBY_TYPED_FROZEN_SHAREABLE
|
86
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
|
87
|
+
#else
|
85
88
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
89
|
+
#endif
|
86
90
|
};
|
87
91
|
|
88
92
|
static inline VALUE
|
@@ -216,6 +220,7 @@ cs_method_hash(VALUE self)
|
|
216
220
|
cs_cp cp, len, hash, four_byte_value;
|
217
221
|
cs_ar *cps;
|
218
222
|
cps = cs_fetch_cps(self, &len);
|
223
|
+
four_byte_value = 0;
|
219
224
|
|
220
225
|
hash = 17;
|
221
226
|
for (cp = 0; cp < len; cp++)
|
@@ -314,9 +319,9 @@ cs_method_minmax(VALUE self)
|
|
314
319
|
cs_cp cp, alen, blen; \
|
315
320
|
cs_ar *acps, *bcps; \
|
316
321
|
struct cs_data *new_data; \
|
317
|
-
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
318
322
|
acps = cs_fetch_cps(cs_a, &alen); \
|
319
323
|
bcps = cs_fetch_cps(cs_b, &blen); \
|
324
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
320
325
|
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
321
326
|
{ \
|
322
327
|
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
@@ -371,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
|
371
376
|
cps = data->cps;
|
372
377
|
len = data->len;
|
373
378
|
cp = FIX2ULONG(cp_num);
|
374
|
-
if (return_nil_if_noop &&
|
379
|
+
if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
|
375
380
|
{
|
376
381
|
return Qnil;
|
377
382
|
}
|
383
|
+
|
384
|
+
if (on)
|
385
|
+
{
|
386
|
+
set_cp(data, cp);
|
387
|
+
}
|
378
388
|
else
|
379
389
|
{
|
380
|
-
|
381
|
-
{
|
382
|
-
set_cp(data, cp);
|
383
|
-
}
|
384
|
-
else
|
385
|
-
{
|
386
|
-
clr_cp(cps, len, cp);
|
387
|
-
}
|
388
|
-
return cs;
|
390
|
+
clr_cp(cps, len, cp);
|
389
391
|
}
|
392
|
+
return cs;
|
390
393
|
}
|
391
394
|
|
392
395
|
static VALUE
|
@@ -570,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
|
|
570
573
|
{
|
571
574
|
return cs_merge_cs(self, other);
|
572
575
|
}
|
573
|
-
|
576
|
+
if (TYPE(other) == T_ARRAY)
|
574
577
|
{
|
575
578
|
return cs_merge_rb_array(self, other);
|
576
579
|
}
|
@@ -672,6 +675,18 @@ cs_method_proper_superset_p(VALUE self, VALUE other)
|
|
672
675
|
return (is_superset && is_proper) ? Qtrue : Qfalse;
|
673
676
|
}
|
674
677
|
|
678
|
+
static VALUE
|
679
|
+
cs_method_spaceship_operator(VALUE self, VALUE other)
|
680
|
+
{
|
681
|
+
if (cs_method_eql_p(self, other))
|
682
|
+
return INT2FIX(0);
|
683
|
+
if (cs_method_proper_subset_p(self, other))
|
684
|
+
return INT2FIX(-1);
|
685
|
+
if (cs_method_proper_superset_p(self, other))
|
686
|
+
return INT2FIX(1);
|
687
|
+
return Qnil;
|
688
|
+
}
|
689
|
+
|
675
690
|
// *******************************
|
676
691
|
// `CharacterSet`-specific methods
|
677
692
|
// *******************************
|
@@ -912,10 +927,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
|
912
927
|
return new_cs;
|
913
928
|
}
|
914
929
|
|
915
|
-
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
|
930
|
+
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
|
916
931
|
|
917
932
|
static inline int
|
918
|
-
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
933
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
919
934
|
{
|
920
935
|
set_cp(data, str_cp);
|
921
936
|
return 1;
|
@@ -962,7 +977,7 @@ cs_method_case_insensitive(VALUE self)
|
|
962
977
|
}
|
963
978
|
|
964
979
|
static inline VALUE
|
965
|
-
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
980
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
966
981
|
{
|
967
982
|
long i, str_len;
|
968
983
|
unsigned int str_cp;
|
@@ -981,21 +996,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
981
996
|
}
|
982
997
|
|
983
998
|
static inline VALUE
|
984
|
-
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
999
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
985
1000
|
{
|
986
1001
|
int n;
|
987
1002
|
unsigned int str_cp;
|
988
1003
|
const char *ptr, *end;
|
989
|
-
rb_encoding *enc;
|
1004
|
+
rb_encoding *utf8;
|
1005
|
+
|
1006
|
+
utf8 = rb_utf8_encoding();
|
1007
|
+
if (rb_enc_get(str) == utf8)
|
1008
|
+
{
|
1009
|
+
str = rb_str_new_frozen(str);
|
1010
|
+
}
|
1011
|
+
else
|
1012
|
+
{
|
1013
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1014
|
+
}
|
990
1015
|
|
991
|
-
str = rb_str_new_frozen(str);
|
992
1016
|
ptr = RSTRING_PTR(str);
|
993
1017
|
end = RSTRING_END(str);
|
994
|
-
enc = rb_enc_get(str);
|
995
1018
|
|
996
1019
|
while (ptr < end)
|
997
1020
|
{
|
998
|
-
str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
|
1021
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
|
999
1022
|
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1000
1023
|
{
|
1001
1024
|
return Qfalse;
|
@@ -1026,12 +1049,13 @@ single_byte_optimizable(VALUE str)
|
|
1026
1049
|
}
|
1027
1050
|
|
1028
1051
|
static inline VALUE
|
1029
|
-
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1052
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1030
1053
|
{
|
1031
1054
|
if (single_byte_optimizable(str))
|
1032
1055
|
{
|
1033
1056
|
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
1034
1057
|
}
|
1058
|
+
|
1035
1059
|
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
1036
1060
|
}
|
1037
1061
|
|
@@ -1045,22 +1069,23 @@ raise_arg_err_unless_string(VALUE val)
|
|
1045
1069
|
}
|
1046
1070
|
|
1047
1071
|
static VALUE
|
1048
|
-
|
1072
|
+
cs_class_method_of_string(VALUE self, VALUE string)
|
1049
1073
|
{
|
1050
1074
|
VALUE new_cs;
|
1051
1075
|
struct cs_data *new_data;
|
1076
|
+
|
1077
|
+
raise_arg_err_unless_string(string);
|
1052
1078
|
new_cs = cs_alloc(self, &new_data);
|
1053
|
-
|
1054
|
-
each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1079
|
+
each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1055
1080
|
return new_cs;
|
1056
1081
|
}
|
1057
1082
|
|
1058
1083
|
static inline int
|
1059
|
-
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1084
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1060
1085
|
{
|
1061
1086
|
if (tst_cp(cp_arr, len, str_cp))
|
1062
1087
|
{
|
1063
|
-
*memo += 1;
|
1088
|
+
*((VALUE *)memo) += 1;
|
1064
1089
|
}
|
1065
1090
|
return 1;
|
1066
1091
|
}
|
@@ -1068,17 +1093,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
|
|
1068
1093
|
static VALUE
|
1069
1094
|
cs_method_count_in(VALUE self, VALUE str)
|
1070
1095
|
{
|
1071
|
-
|
1096
|
+
long count;
|
1072
1097
|
struct cs_data *data;
|
1073
1098
|
raise_arg_err_unless_string(str);
|
1074
1099
|
data = cs_fetch_data(self);
|
1075
1100
|
count = 0;
|
1076
|
-
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1077
|
-
return
|
1101
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
|
1102
|
+
return LONG2FIX(count);
|
1078
1103
|
}
|
1079
1104
|
|
1080
1105
|
static inline int
|
1081
|
-
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1106
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1082
1107
|
{
|
1083
1108
|
return tst_cp(cp_arr, len, str_cp);
|
1084
1109
|
}
|
@@ -1093,11 +1118,11 @@ cs_method_cover_p(VALUE self, VALUE str)
|
|
1093
1118
|
}
|
1094
1119
|
|
1095
1120
|
static inline int
|
1096
|
-
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1121
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1097
1122
|
{
|
1098
1123
|
if (tst_cp(cp_arr, len, str_cp))
|
1099
1124
|
{
|
1100
|
-
rb_ary_push(memo
|
1125
|
+
rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
|
1101
1126
|
}
|
1102
1127
|
return 1;
|
1103
1128
|
}
|
@@ -1105,18 +1130,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
|
|
1105
1130
|
static VALUE
|
1106
1131
|
cs_method_scan(VALUE self, VALUE str)
|
1107
1132
|
{
|
1108
|
-
VALUE memo[2];
|
1133
|
+
VALUE memo;
|
1109
1134
|
struct cs_data *data;
|
1110
1135
|
raise_arg_err_unless_string(str);
|
1111
1136
|
data = cs_fetch_data(self);
|
1112
|
-
memo[0] = rb_ary_new();
|
1113
|
-
memo[1] = (VALUE)rb_enc_get(str);
|
1137
|
+
memo = rb_ary_new();
|
1114
1138
|
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1115
|
-
return memo[0];
|
1139
|
+
return memo;
|
1116
1140
|
}
|
1117
1141
|
|
1118
1142
|
static inline int
|
1119
|
-
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1143
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
|
1120
1144
|
{
|
1121
1145
|
return !tst_cp(cp_arr, len, str_cp);
|
1122
1146
|
}
|
@@ -1132,118 +1156,91 @@ cs_method_used_by_p(VALUE self, VALUE str)
|
|
1132
1156
|
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
1133
1157
|
}
|
1134
1158
|
|
1135
|
-
|
1136
|
-
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
1137
|
-
{
|
1138
|
-
long total, olen;
|
1139
|
-
char *sptr;
|
1140
|
-
|
1141
|
-
RSTRING_GETMEM(str, sptr, olen);
|
1142
|
-
sptr = RSTRING(str)->as.heap.ptr;
|
1143
|
-
olen = RSTRING(str)->as.heap.len;
|
1144
|
-
total = olen + len;
|
1145
|
-
memcpy(sptr + olen, ptr, len);
|
1146
|
-
RSTRING(str)->as.heap.len = total;
|
1147
|
-
}
|
1148
|
-
|
1149
|
-
#ifndef TERM_FILL
|
1150
|
-
#define TERM_FILL(ptr, termlen) \
|
1151
|
-
do \
|
1152
|
-
{ \
|
1153
|
-
char *const term_fill_ptr = (ptr); \
|
1154
|
-
const int term_fill_len = (termlen); \
|
1155
|
-
*term_fill_ptr = '\0'; \
|
1156
|
-
if (__builtin_expect(!!(term_fill_len > 1), 0)) \
|
1157
|
-
memset(term_fill_ptr, 0, term_fill_len); \
|
1158
|
-
} while (0)
|
1159
|
-
#endif
|
1160
|
-
|
1161
|
-
static void
|
1162
|
-
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
1163
|
-
{
|
1164
|
-
char *ptr;
|
1165
|
-
long len;
|
1166
|
-
|
1167
|
-
ptr = RSTRING(str)->as.heap.ptr;
|
1168
|
-
len = RSTRING(str)->as.heap.len;
|
1169
|
-
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
1170
|
-
}
|
1171
|
-
|
1159
|
+
// partially based on rb_str_delete_bang
|
1172
1160
|
static inline VALUE
|
1173
1161
|
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
1174
1162
|
{
|
1175
1163
|
cs_ar *cps;
|
1176
|
-
cs_cp
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1164
|
+
cs_cp cs_len;
|
1165
|
+
VALUE orig_str_len;
|
1166
|
+
|
1167
|
+
rb_encoding *orig_enc, *utf8;
|
1168
|
+
char *s, *send, *t;
|
1169
|
+
int orig_was_utf8, cr;
|
1182
1170
|
|
1183
1171
|
raise_arg_err_unless_string(str);
|
1184
1172
|
|
1185
|
-
|
1173
|
+
orig_str_len = RSTRING_LEN(str);
|
1174
|
+
|
1175
|
+
if (orig_str_len == 0)
|
1176
|
+
{
|
1177
|
+
return bang ? Qnil : str;
|
1178
|
+
}
|
1186
1179
|
|
1187
|
-
|
1188
|
-
|
1180
|
+
orig_enc = rb_enc_get(str);
|
1181
|
+
utf8 = rb_utf8_encoding();
|
1182
|
+
orig_was_utf8 = orig_enc == utf8;
|
1183
|
+
|
1184
|
+
if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
|
1185
|
+
{
|
1186
|
+
str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
|
1187
|
+
}
|
1188
|
+
else
|
1189
1189
|
{
|
1190
|
-
if (bang)
|
1190
|
+
if (!bang)
|
1191
1191
|
{
|
1192
|
-
|
1192
|
+
str = rb_str_dup(str);
|
1193
1193
|
}
|
1194
|
-
return rb_str_dup(str);
|
1195
1194
|
}
|
1196
1195
|
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1196
|
+
cps = cs_fetch_cps(set, &cs_len);
|
1197
|
+
rb_str_modify(str);
|
1198
|
+
s = t = RSTRING_PTR(str);
|
1199
|
+
send = RSTRING_END(str);
|
1200
|
+
cr = ENC_CODERANGE_7BIT;
|
1202
1201
|
|
1203
|
-
|
1204
|
-
end = RSTRING_END(str);
|
1205
|
-
|
1206
|
-
if (single_byte_optimizable(str))
|
1202
|
+
while (s < send)
|
1207
1203
|
{
|
1208
|
-
|
1204
|
+
unsigned int c;
|
1205
|
+
int clen;
|
1206
|
+
|
1207
|
+
if ((c = *(unsigned char *)s) < 0x80)
|
1209
1208
|
{
|
1210
|
-
|
1211
|
-
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1209
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
1212
1210
|
{
|
1213
|
-
|
1211
|
+
if (t != s)
|
1212
|
+
*t = c;
|
1213
|
+
t++;
|
1214
1214
|
}
|
1215
|
-
|
1215
|
+
s++;
|
1216
1216
|
}
|
1217
|
-
|
1218
|
-
else // likely to be multibyte string
|
1219
|
-
{
|
1220
|
-
while (ptr < end)
|
1217
|
+
else
|
1221
1218
|
{
|
1222
|
-
|
1223
|
-
|
1219
|
+
c = rb_enc_codepoint_len(s, send, &clen, utf8);
|
1220
|
+
|
1221
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
1224
1222
|
{
|
1225
|
-
|
1223
|
+
if (t != s)
|
1224
|
+
rb_enc_mbcput(c, t, utf8);
|
1225
|
+
t += clen;
|
1226
|
+
if (cr == ENC_CODERANGE_7BIT)
|
1227
|
+
cr = ENC_CODERANGE_VALID;
|
1226
1228
|
}
|
1227
|
-
|
1229
|
+
s += clen;
|
1228
1230
|
}
|
1229
1231
|
}
|
1230
1232
|
|
1231
|
-
|
1233
|
+
rb_str_set_len(str, t - RSTRING_PTR(str));
|
1234
|
+
ENC_CODERANGE_SET(str, cr);
|
1232
1235
|
|
1233
|
-
if (bang)
|
1236
|
+
if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
|
1234
1237
|
{
|
1235
|
-
|
1236
|
-
{
|
1237
|
-
return Qnil;
|
1238
|
-
}
|
1239
|
-
rb_str_shared_replace(str, new_str_buf);
|
1238
|
+
return Qnil;
|
1240
1239
|
}
|
1241
|
-
|
1240
|
+
|
1241
|
+
if (!orig_was_utf8)
|
1242
1242
|
{
|
1243
|
-
|
1244
|
-
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
1245
|
-
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
|
1246
|
-
str = new_str_buf;
|
1243
|
+
return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
|
1247
1244
|
}
|
1248
1245
|
|
1249
1246
|
return str;
|
@@ -1285,6 +1282,10 @@ cs_method_allocated_length(VALUE self)
|
|
1285
1282
|
|
1286
1283
|
void Init_character_set()
|
1287
1284
|
{
|
1285
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
1286
|
+
rb_ext_ractor_safe(true);
|
1287
|
+
#endif
|
1288
|
+
|
1288
1289
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
1289
1290
|
|
1290
1291
|
rb_define_alloc_func(cs, cs_method_allocate);
|
@@ -1335,11 +1336,12 @@ void Init_character_set()
|
|
1335
1336
|
rb_define_method(cs, ">=", cs_method_superset_p, 1);
|
1336
1337
|
rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
|
1337
1338
|
rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
|
1339
|
+
rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1);
|
1338
1340
|
|
1339
1341
|
// `CharacterSet`-specific methods
|
1340
1342
|
|
1341
1343
|
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
1342
|
-
rb_define_singleton_method(cs, "
|
1344
|
+
rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
|
1343
1345
|
|
1344
1346
|
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
1345
1347
|
rb_define_method(cs, "sample", cs_method_sample, -1);
|
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
|
|
6
6
|
unsigned long to;
|
7
7
|
} casefold_mapping;
|
8
8
|
|
9
|
-
#define CASEFOLD_COUNT 1383
|
9
|
+
#define CASEFOLD_COUNT 1426
|
10
10
|
|
11
11
|
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
12
12
|
{0x0041,0x0061},
|
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
564
564
|
{0x104D1,0x104F9},
|
565
565
|
{0x104D2,0x104FA},
|
566
566
|
{0x104D3,0x104FB},
|
567
|
+
{0x10570,0x10597},
|
568
|
+
{0x10571,0x10598},
|
569
|
+
{0x10572,0x10599},
|
570
|
+
{0x10573,0x1059A},
|
571
|
+
{0x10574,0x1059B},
|
572
|
+
{0x10575,0x1059C},
|
573
|
+
{0x10576,0x1059D},
|
574
|
+
{0x10577,0x1059E},
|
575
|
+
{0x10578,0x1059F},
|
576
|
+
{0x10579,0x105A0},
|
577
|
+
{0x1057A,0x105A1},
|
578
|
+
{0x1057C,0x105A3},
|
579
|
+
{0x1057D,0x105A4},
|
580
|
+
{0x1057E,0x105A5},
|
581
|
+
{0x1057F,0x105A6},
|
582
|
+
{0x10580,0x105A7},
|
583
|
+
{0x10581,0x105A8},
|
584
|
+
{0x10582,0x105A9},
|
585
|
+
{0x10583,0x105AA},
|
586
|
+
{0x10584,0x105AB},
|
587
|
+
{0x10585,0x105AC},
|
588
|
+
{0x10586,0x105AD},
|
589
|
+
{0x10587,0x105AE},
|
590
|
+
{0x10588,0x105AF},
|
591
|
+
{0x10589,0x105B0},
|
592
|
+
{0x1058A,0x105B1},
|
593
|
+
{0x1058C,0x105B3},
|
594
|
+
{0x1058D,0x105B4},
|
595
|
+
{0x1058E,0x105B5},
|
596
|
+
{0x1058F,0x105B6},
|
597
|
+
{0x10590,0x105B7},
|
598
|
+
{0x10591,0x105B8},
|
599
|
+
{0x10592,0x105B9},
|
600
|
+
{0x10594,0x105BB},
|
601
|
+
{0x10595,0x105BC},
|
567
602
|
{0x10A0,0x2D00},
|
568
603
|
{0x10A1,0x2D01},
|
569
604
|
{0x10A2,0x2D02},
|
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1102
1137
|
{0x2C2C,0x2C5C},
|
1103
1138
|
{0x2C2D,0x2C5D},
|
1104
1139
|
{0x2C2E,0x2C5E},
|
1140
|
+
{0x2C2F,0x2C5F},
|
1105
1141
|
{0x2C60,0x2C61},
|
1106
1142
|
{0x2C62,0x026B},
|
1107
1143
|
{0x2C63,0x1D7D},
|
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1282
1318
|
{0xA7BA,0xA7BB},
|
1283
1319
|
{0xA7BC,0xA7BD},
|
1284
1320
|
{0xA7BE,0xA7BF},
|
1321
|
+
{0xA7C0,0xA7C1},
|
1285
1322
|
{0xA7C2,0xA7C3},
|
1286
1323
|
{0xA7C4,0xA794},
|
1287
1324
|
{0xA7C5,0x0282},
|
1288
1325
|
{0xA7C6,0x1D8E},
|
1326
|
+
{0xA7C7,0xA7C8},
|
1327
|
+
{0xA7C9,0xA7CA},
|
1328
|
+
{0xA7D0,0xA7D1},
|
1329
|
+
{0xA7D6,0xA7D7},
|
1330
|
+
{0xA7D8,0xA7D9},
|
1331
|
+
{0xA7F5,0xA7F6},
|
1289
1332
|
{0xAB70,0x13A0},
|
1290
1333
|
{0xAB71,0x13A1},
|
1291
1334
|
{0xAB72,0x13A2},
|
@@ -4,8 +4,16 @@ class CharacterSet
|
|
4
4
|
def character_set
|
5
5
|
CharacterSet.of_regexp(self)
|
6
6
|
end
|
7
|
+
|
8
|
+
def covered_by_character_set?(other)
|
9
|
+
other.superset?(character_set)
|
10
|
+
end
|
11
|
+
|
12
|
+
def uses_character_set?(other)
|
13
|
+
other.intersect?(character_set)
|
14
|
+
end
|
7
15
|
end
|
8
16
|
end
|
9
17
|
end
|
10
18
|
|
11
|
-
::Regexp.
|
19
|
+
::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
|
@@ -2,7 +2,7 @@ class CharacterSet
|
|
2
2
|
module CoreExt
|
3
3
|
module StringExt
|
4
4
|
def character_set
|
5
|
-
CharacterSet.
|
5
|
+
CharacterSet.of_string(self)
|
6
6
|
end
|
7
7
|
|
8
8
|
{
|
@@ -29,4 +29,4 @@ class CharacterSet
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
::String.
|
32
|
+
::String.instance_eval { include CharacterSet::CoreExt::StringExt }
|
@@ -4,100 +4,84 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
Error = Class.new(ArgumentError)
|
6
6
|
|
7
|
-
def convert(expression)
|
8
|
-
CharacterSet.require_optional_dependency('regexp_parser')
|
7
|
+
def convert(expression, to = CharacterSet, acc = [])
|
8
|
+
CharacterSet.require_optional_dependency('regexp_parser', __method__)
|
9
9
|
|
10
10
|
case expression
|
11
|
-
when Regexp::Expression::Root
|
12
|
-
if expression.count != 1
|
13
|
-
raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
|
14
|
-
end
|
15
|
-
convert(expression[0])
|
16
|
-
|
17
11
|
when Regexp::Expression::CharacterSet
|
18
|
-
content = expression.map { |subexp| convert(subexp) }.reduce(:+)
|
19
|
-
expression.negative? ? content.inversion : content
|
12
|
+
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
13
|
+
acc << (expression.negative? ? content.inversion : content)
|
20
14
|
|
21
15
|
when Regexp::Expression::CharacterSet::Intersection
|
22
|
-
expression.map { |subexp| convert(subexp) }.reduce(:&)
|
23
|
-
|
24
|
-
when Regexp::Expression::CharacterSet::IntersectedSequence
|
25
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
16
|
+
acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
26
17
|
|
27
18
|
when Regexp::Expression::CharacterSet::Range
|
28
|
-
start, finish = expression.map { |subexp| convert(subexp) }
|
29
|
-
|
19
|
+
start, finish = expression.map { |subexp| convert(subexp, to) }
|
20
|
+
acc << to.new((start.min)..(finish.max))
|
21
|
+
|
22
|
+
when Regexp::Expression::Subexpression # root, group, alternation, etc.
|
23
|
+
expression.each { |subexp| convert(subexp, to, acc) }
|
30
24
|
|
31
25
|
when Regexp::Expression::CharacterType::Any
|
32
|
-
|
26
|
+
acc << to.unicode
|
33
27
|
|
34
28
|
when Regexp::Expression::CharacterType::Base
|
35
29
|
/(?<negative>non)?(?<base_name>.+)/ =~ expression.token
|
36
30
|
content =
|
37
31
|
if expression.unicode_classes?
|
38
|
-
# in u-mode, type shortcuts match the same as \p{<long type name>}
|
39
|
-
|
32
|
+
# in u-mode, most type shortcuts match the same as \p{<long type name>}
|
33
|
+
if base_name == 'linebreak'
|
34
|
+
to.from_ranges(10..13, 133..133, 8232..8233)
|
35
|
+
else
|
36
|
+
to.of_property(base_name)
|
37
|
+
end
|
40
38
|
else
|
41
39
|
# in normal mode, types match only ascii chars
|
42
40
|
case base_name.to_sym
|
43
|
-
when :digit
|
44
|
-
when :hex
|
45
|
-
when :
|
46
|
-
when :
|
41
|
+
when :digit then to.from_ranges(48..57)
|
42
|
+
when :hex then to.from_ranges(48..57, 65..70, 97..102)
|
43
|
+
when :linebreak then to.from_ranges(10..13)
|
44
|
+
when :space then to.from_ranges(9..13, 32..32)
|
45
|
+
when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
|
47
46
|
else raise Error, "Unsupported CharacterType #{base_name}"
|
48
47
|
end
|
49
48
|
end
|
50
|
-
negative ? content.inversion : content
|
49
|
+
acc << (negative ? content.inversion : content)
|
51
50
|
|
52
51
|
when Regexp::Expression::EscapeSequence::CodepointList
|
53
|
-
|
52
|
+
content = to.new(expression.codepoints)
|
53
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
54
54
|
|
55
55
|
when Regexp::Expression::EscapeSequence::Base
|
56
|
-
|
57
|
-
|
58
|
-
when Regexp::Expression::Group::Capture,
|
59
|
-
Regexp::Expression::Group::Passive,
|
60
|
-
Regexp::Expression::Group::Named,
|
61
|
-
Regexp::Expression::Group::Atomic,
|
62
|
-
Regexp::Expression::Group::Options
|
63
|
-
case expression.count
|
64
|
-
when 0 then CharacterSet[]
|
65
|
-
when 1 then convert(expression.first)
|
66
|
-
else
|
67
|
-
raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
|
68
|
-
end
|
69
|
-
|
70
|
-
when Regexp::Expression::Alternation
|
71
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
72
|
-
|
73
|
-
when Regexp::Expression::Alternative
|
74
|
-
case expression.count
|
75
|
-
when 0 then CharacterSet[]
|
76
|
-
when 1 then convert(expression.first)
|
77
|
-
else
|
78
|
-
raise Error, 'Alternatives must contain exactly one expression'
|
79
|
-
end
|
56
|
+
content = to[expression.codepoint]
|
57
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
80
58
|
|
81
59
|
when Regexp::Expression::Literal
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
CharacterSet[expression.text.ord]
|
60
|
+
content = to[*expression.text.chars]
|
61
|
+
acc << (expression.i? ? content.case_insensitive : content)
|
86
62
|
|
87
63
|
when Regexp::Expression::UnicodeProperty::Base,
|
88
64
|
Regexp::Expression::PosixClass
|
89
|
-
content =
|
65
|
+
content = to.of_property(expression.token)
|
90
66
|
if expression.type == :posixclass && expression.ascii_classes?
|
91
67
|
content = content.ascii_part
|
92
68
|
end
|
93
|
-
expression.negative? ? content.inversion : content
|
69
|
+
acc << (expression.negative? ? content.inversion : content)
|
70
|
+
|
71
|
+
when Regexp::Expression::Anchor::Base,
|
72
|
+
Regexp::Expression::Backreference::Base,
|
73
|
+
Regexp::Expression::Keep::Mark,
|
74
|
+
Regexp::Expression::Quantifier
|
75
|
+
# ignore zero-length and repeat expressions
|
94
76
|
|
95
77
|
when Regexp::Expression::Base
|
96
78
|
raise Error, "Unsupported expression class `#{expression.class}`"
|
97
79
|
|
98
80
|
else
|
99
|
-
raise Error,
|
81
|
+
raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
|
100
82
|
end
|
83
|
+
|
84
|
+
acc.reduce(:+) || to[]
|
101
85
|
end
|
102
86
|
end
|
103
87
|
end
|