character_set 1.4.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +1 -1
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +28 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +20 -0
  9. data/BENCHMARK.md +35 -31
  10. data/CHANGELOG.md +64 -1
  11. data/Gemfile +15 -0
  12. data/LICENSE.txt +1 -1
  13. data/README.md +25 -9
  14. data/Rakefile +2 -120
  15. data/character_set.gemspec +0 -10
  16. data/ext/character_set/character_set.c +123 -121
  17. data/ext/character_set/unicode_casefold_table.h +44 -1
  18. data/lib/character_set/core_ext/regexp_ext.rb +9 -1
  19. data/lib/character_set/core_ext/string_ext.rb +2 -2
  20. data/lib/character_set/expression_converter.rb +40 -56
  21. data/lib/character_set/parser.rb +8 -4
  22. data/lib/character_set/predefined_sets/assigned.cps +110 -78
  23. data/lib/character_set/predefined_sets/emoji.cps +16 -14
  24. data/lib/character_set/predefined_sets.rb +11 -0
  25. data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
  26. data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
  27. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  28. data/lib/character_set/ruby_fallback.rb +18 -2
  29. data/lib/character_set/set_method_adapters.rb +4 -3
  30. data/lib/character_set/shared_methods.rb +25 -11
  31. data/lib/character_set/version.rb +1 -1
  32. data/tasks/benchmark.rake +20 -0
  33. data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
  34. data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
  35. data/tasks/benchmarks/shared.rb +28 -0
  36. data/tasks/sync_casefold_data.rake +20 -0
  37. data/tasks/sync_predefined_sets.rake +9 -0
  38. data/tasks/sync_ruby_spec.rake +65 -0
  39. metadata +29 -146
  40. data/.travis.yml +0 -9
  41. data/benchmarks/shared.rb +0 -26
  42. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  43. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  44. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  45. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  46. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  47. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  48. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  49. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
82
82
  .dsize = cs_memsize,
83
83
  },
84
84
  .data = NULL,
85
+ #ifdef RUBY_TYPED_FROZEN_SHAREABLE
86
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
87
+ #else
85
88
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
89
+ #endif
86
90
  };
87
91
 
88
92
  static inline VALUE
@@ -216,6 +220,7 @@ cs_method_hash(VALUE self)
216
220
  cs_cp cp, len, hash, four_byte_value;
217
221
  cs_ar *cps;
218
222
  cps = cs_fetch_cps(self, &len);
223
+ four_byte_value = 0;
219
224
 
220
225
  hash = 17;
221
226
  for (cp = 0; cp < len; cp++)
@@ -314,9 +319,9 @@ cs_method_minmax(VALUE self)
314
319
  cs_cp cp, alen, blen; \
315
320
  cs_ar *acps, *bcps; \
316
321
  struct cs_data *new_data; \
317
- new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
318
322
  acps = cs_fetch_cps(cs_a, &alen); \
319
323
  bcps = cs_fetch_cps(cs_b, &blen); \
324
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
320
325
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
321
326
  { \
322
327
  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
@@ -371,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
371
376
  cps = data->cps;
372
377
  len = data->len;
373
378
  cp = FIX2ULONG(cp_num);
374
- if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
379
+ if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
375
380
  {
376
381
  return Qnil;
377
382
  }
383
+
384
+ if (on)
385
+ {
386
+ set_cp(data, cp);
387
+ }
378
388
  else
379
389
  {
380
- if (on)
381
- {
382
- set_cp(data, cp);
383
- }
384
- else
385
- {
386
- clr_cp(cps, len, cp);
387
- }
388
- return cs;
390
+ clr_cp(cps, len, cp);
389
391
  }
392
+ return cs;
390
393
  }
391
394
 
392
395
  static VALUE
@@ -570,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
570
573
  {
571
574
  return cs_merge_cs(self, other);
572
575
  }
573
- else if (TYPE(other) == T_ARRAY)
576
+ if (TYPE(other) == T_ARRAY)
574
577
  {
575
578
  return cs_merge_rb_array(self, other);
576
579
  }
@@ -672,6 +675,18 @@ cs_method_proper_superset_p(VALUE self, VALUE other)
672
675
  return (is_superset && is_proper) ? Qtrue : Qfalse;
673
676
  }
674
677
 
678
+ static VALUE
679
+ cs_method_spaceship_operator(VALUE self, VALUE other)
680
+ {
681
+ if (cs_method_eql_p(self, other))
682
+ return INT2FIX(0);
683
+ if (cs_method_proper_subset_p(self, other))
684
+ return INT2FIX(-1);
685
+ if (cs_method_proper_superset_p(self, other))
686
+ return INT2FIX(1);
687
+ return Qnil;
688
+ }
689
+
675
690
  // *******************************
676
691
  // `CharacterSet`-specific methods
677
692
  // *******************************
@@ -912,10 +927,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
912
927
  return new_cs;
913
928
  }
914
929
 
915
- typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
930
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
916
931
 
917
932
  static inline int
918
- add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
933
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
919
934
  {
920
935
  set_cp(data, str_cp);
921
936
  return 1;
@@ -962,7 +977,7 @@ cs_method_case_insensitive(VALUE self)
962
977
  }
963
978
 
964
979
  static inline VALUE
965
- each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
980
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
966
981
  {
967
982
  long i, str_len;
968
983
  unsigned int str_cp;
@@ -981,21 +996,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
981
996
  }
982
997
 
983
998
  static inline VALUE
984
- each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
999
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
985
1000
  {
986
1001
  int n;
987
1002
  unsigned int str_cp;
988
1003
  const char *ptr, *end;
989
- rb_encoding *enc;
1004
+ rb_encoding *utf8;
1005
+
1006
+ utf8 = rb_utf8_encoding();
1007
+ if (rb_enc_get(str) == utf8)
1008
+ {
1009
+ str = rb_str_new_frozen(str);
1010
+ }
1011
+ else
1012
+ {
1013
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1014
+ }
990
1015
 
991
- str = rb_str_new_frozen(str);
992
1016
  ptr = RSTRING_PTR(str);
993
1017
  end = RSTRING_END(str);
994
- enc = rb_enc_get(str);
995
1018
 
996
1019
  while (ptr < end)
997
1020
  {
998
- str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
1021
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
999
1022
  if (!(*func)(str_cp, cp_arr, len, data, memo))
1000
1023
  {
1001
1024
  return Qfalse;
@@ -1026,12 +1049,13 @@ single_byte_optimizable(VALUE str)
1026
1049
  }
1027
1050
 
1028
1051
  static inline VALUE
1029
- each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1052
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1030
1053
  {
1031
1054
  if (single_byte_optimizable(str))
1032
1055
  {
1033
1056
  return each_sb_cp(str, func, cp_arr, len, data, memo);
1034
1057
  }
1058
+
1035
1059
  return each_mb_cp(str, func, cp_arr, len, data, memo);
1036
1060
  }
1037
1061
 
@@ -1045,22 +1069,23 @@ raise_arg_err_unless_string(VALUE val)
1045
1069
  }
1046
1070
 
1047
1071
  static VALUE
1048
- cs_class_method_of(VALUE self, VALUE str)
1072
+ cs_class_method_of_string(VALUE self, VALUE string)
1049
1073
  {
1050
1074
  VALUE new_cs;
1051
1075
  struct cs_data *new_data;
1076
+
1077
+ raise_arg_err_unless_string(string);
1052
1078
  new_cs = cs_alloc(self, &new_data);
1053
- raise_arg_err_unless_string(str);
1054
- each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1079
+ each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
1055
1080
  return new_cs;
1056
1081
  }
1057
1082
 
1058
1083
  static inline int
1059
- count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1084
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1060
1085
  {
1061
1086
  if (tst_cp(cp_arr, len, str_cp))
1062
1087
  {
1063
- *memo += 1;
1088
+ *((VALUE *)memo) += 1;
1064
1089
  }
1065
1090
  return 1;
1066
1091
  }
@@ -1068,17 +1093,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
1068
1093
  static VALUE
1069
1094
  cs_method_count_in(VALUE self, VALUE str)
1070
1095
  {
1071
- VALUE count;
1096
+ long count;
1072
1097
  struct cs_data *data;
1073
1098
  raise_arg_err_unless_string(str);
1074
1099
  data = cs_fetch_data(self);
1075
1100
  count = 0;
1076
- each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1077
- return INT2NUM(count);
1101
+ each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
1102
+ return LONG2FIX(count);
1078
1103
  }
1079
1104
 
1080
1105
  static inline int
1081
- str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1106
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1082
1107
  {
1083
1108
  return tst_cp(cp_arr, len, str_cp);
1084
1109
  }
@@ -1093,11 +1118,11 @@ cs_method_cover_p(VALUE self, VALUE str)
1093
1118
  }
1094
1119
 
1095
1120
  static inline int
1096
- add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1121
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1097
1122
  {
1098
1123
  if (tst_cp(cp_arr, len, str_cp))
1099
1124
  {
1100
- rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1125
+ rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
1101
1126
  }
1102
1127
  return 1;
1103
1128
  }
@@ -1105,18 +1130,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
1105
1130
  static VALUE
1106
1131
  cs_method_scan(VALUE self, VALUE str)
1107
1132
  {
1108
- VALUE memo[2];
1133
+ VALUE memo;
1109
1134
  struct cs_data *data;
1110
1135
  raise_arg_err_unless_string(str);
1111
1136
  data = cs_fetch_data(self);
1112
- memo[0] = rb_ary_new();
1113
- memo[1] = (VALUE)rb_enc_get(str);
1137
+ memo = rb_ary_new();
1114
1138
  each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1115
- return memo[0];
1139
+ return memo;
1116
1140
  }
1117
1141
 
1118
1142
  static inline int
1119
- str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1143
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1120
1144
  {
1121
1145
  return !tst_cp(cp_arr, len, str_cp);
1122
1146
  }
@@ -1132,118 +1156,91 @@ cs_method_used_by_p(VALUE self, VALUE str)
1132
1156
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1133
1157
  }
1134
1158
 
1135
- static void
1136
- cs_str_buf_cat(VALUE str, const char *ptr, long len)
1137
- {
1138
- long total, olen;
1139
- char *sptr;
1140
-
1141
- RSTRING_GETMEM(str, sptr, olen);
1142
- sptr = RSTRING(str)->as.heap.ptr;
1143
- olen = RSTRING(str)->as.heap.len;
1144
- total = olen + len;
1145
- memcpy(sptr + olen, ptr, len);
1146
- RSTRING(str)->as.heap.len = total;
1147
- }
1148
-
1149
- #ifndef TERM_FILL
1150
- #define TERM_FILL(ptr, termlen) \
1151
- do \
1152
- { \
1153
- char *const term_fill_ptr = (ptr); \
1154
- const int term_fill_len = (termlen); \
1155
- *term_fill_ptr = '\0'; \
1156
- if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1157
- memset(term_fill_ptr, 0, term_fill_len); \
1158
- } while (0)
1159
- #endif
1160
-
1161
- static void
1162
- cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1163
- {
1164
- char *ptr;
1165
- long len;
1166
-
1167
- ptr = RSTRING(str)->as.heap.ptr;
1168
- len = RSTRING(str)->as.heap.len;
1169
- TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
1170
- }
1171
-
1159
+ // partially based on rb_str_delete_bang
1172
1160
  static inline VALUE
1173
1161
  cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1174
1162
  {
1175
1163
  cs_ar *cps;
1176
- cs_cp len;
1177
- rb_encoding *str_enc;
1178
- VALUE orig_len, new_str_buf;
1179
- int cp_len;
1180
- unsigned int str_cp;
1181
- const char *ptr, *end;
1164
+ cs_cp cs_len;
1165
+ VALUE orig_str_len;
1166
+
1167
+ rb_encoding *orig_enc, *utf8;
1168
+ char *s, *send, *t;
1169
+ int orig_was_utf8, cr;
1182
1170
 
1183
1171
  raise_arg_err_unless_string(str);
1184
1172
 
1185
- cps = cs_fetch_cps(set, &len);
1173
+ orig_str_len = RSTRING_LEN(str);
1174
+
1175
+ if (orig_str_len == 0)
1176
+ {
1177
+ return bang ? Qnil : str;
1178
+ }
1186
1179
 
1187
- orig_len = RSTRING_LEN(str);
1188
- if (orig_len < 1) // empty string, will never change
1180
+ orig_enc = rb_enc_get(str);
1181
+ utf8 = rb_utf8_encoding();
1182
+ orig_was_utf8 = orig_enc == utf8;
1183
+
1184
+ if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
1185
+ {
1186
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1187
+ }
1188
+ else
1189
1189
  {
1190
- if (bang)
1190
+ if (!bang)
1191
1191
  {
1192
- return Qnil;
1192
+ str = rb_str_dup(str);
1193
1193
  }
1194
- return rb_str_dup(str);
1195
1194
  }
1196
1195
 
1197
- new_str_buf = rb_str_buf_new(orig_len);
1198
- str_enc = rb_enc_get(str);
1199
- rb_enc_associate(new_str_buf, str_enc);
1200
- rb_str_modify(new_str_buf);
1201
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1196
+ cps = cs_fetch_cps(set, &cs_len);
1197
+ rb_str_modify(str);
1198
+ s = t = RSTRING_PTR(str);
1199
+ send = RSTRING_END(str);
1200
+ cr = ENC_CODERANGE_7BIT;
1202
1201
 
1203
- ptr = RSTRING_PTR(str);
1204
- end = RSTRING_END(str);
1205
-
1206
- if (single_byte_optimizable(str))
1202
+ while (s < send)
1207
1203
  {
1208
- while (ptr < end)
1204
+ unsigned int c;
1205
+ int clen;
1206
+
1207
+ if ((c = *(unsigned char *)s) < 0x80)
1209
1208
  {
1210
- str_cp = *ptr & 0xff;
1211
- if ((!tst_cp(cps, len, str_cp)) == delete)
1209
+ if (tst_cp(cps, cs_len, c) != delete)
1212
1210
  {
1213
- cs_str_buf_cat(new_str_buf, ptr, 1);
1211
+ if (t != s)
1212
+ *t = c;
1213
+ t++;
1214
1214
  }
1215
- ptr++;
1215
+ s++;
1216
1216
  }
1217
- }
1218
- else // likely to be multibyte string
1219
- {
1220
- while (ptr < end)
1217
+ else
1221
1218
  {
1222
- str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1223
- if ((!tst_cp(cps, len, str_cp)) == delete)
1219
+ c = rb_enc_codepoint_len(s, send, &clen, utf8);
1220
+
1221
+ if (tst_cp(cps, cs_len, c) != delete)
1224
1222
  {
1225
- cs_str_buf_cat(new_str_buf, ptr, cp_len);
1223
+ if (t != s)
1224
+ rb_enc_mbcput(c, t, utf8);
1225
+ t += clen;
1226
+ if (cr == ENC_CODERANGE_7BIT)
1227
+ cr = ENC_CODERANGE_VALID;
1226
1228
  }
1227
- ptr += cp_len;
1229
+ s += clen;
1228
1230
  }
1229
1231
  }
1230
1232
 
1231
- cs_str_buf_terminate(new_str_buf, str_enc);
1233
+ rb_str_set_len(str, t - RSTRING_PTR(str));
1234
+ ENC_CODERANGE_SET(str, cr);
1232
1235
 
1233
- if (bang)
1236
+ if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
1234
1237
  {
1235
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1236
- {
1237
- return Qnil;
1238
- }
1239
- rb_str_shared_replace(str, new_str_buf);
1238
+ return Qnil;
1240
1239
  }
1241
- else
1240
+
1241
+ if (!orig_was_utf8)
1242
1242
  {
1243
- RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
1244
- // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
1245
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
1246
- str = new_str_buf;
1243
+ return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
1247
1244
  }
1248
1245
 
1249
1246
  return str;
@@ -1285,6 +1282,10 @@ cs_method_allocated_length(VALUE self)
1285
1282
 
1286
1283
  void Init_character_set()
1287
1284
  {
1285
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
1286
+ rb_ext_ractor_safe(true);
1287
+ #endif
1288
+
1288
1289
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
1289
1290
 
1290
1291
  rb_define_alloc_func(cs, cs_method_allocate);
@@ -1335,11 +1336,12 @@ void Init_character_set()
1335
1336
  rb_define_method(cs, ">=", cs_method_superset_p, 1);
1336
1337
  rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1337
1338
  rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
1339
+ rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1);
1338
1340
 
1339
1341
  // `CharacterSet`-specific methods
1340
1342
 
1341
1343
  rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1342
- rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1344
+ rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
1343
1345
 
1344
1346
  rb_define_method(cs, "ranges", cs_method_ranges, 0);
1345
1347
  rb_define_method(cs, "sample", cs_method_sample, -1);
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
6
6
  unsigned long to;
7
7
  } casefold_mapping;
8
8
 
9
- #define CASEFOLD_COUNT 1383
9
+ #define CASEFOLD_COUNT 1426
10
10
 
11
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
12
12
  {0x0041,0x0061},
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
564
564
  {0x104D1,0x104F9},
565
565
  {0x104D2,0x104FA},
566
566
  {0x104D3,0x104FB},
567
+ {0x10570,0x10597},
568
+ {0x10571,0x10598},
569
+ {0x10572,0x10599},
570
+ {0x10573,0x1059A},
571
+ {0x10574,0x1059B},
572
+ {0x10575,0x1059C},
573
+ {0x10576,0x1059D},
574
+ {0x10577,0x1059E},
575
+ {0x10578,0x1059F},
576
+ {0x10579,0x105A0},
577
+ {0x1057A,0x105A1},
578
+ {0x1057C,0x105A3},
579
+ {0x1057D,0x105A4},
580
+ {0x1057E,0x105A5},
581
+ {0x1057F,0x105A6},
582
+ {0x10580,0x105A7},
583
+ {0x10581,0x105A8},
584
+ {0x10582,0x105A9},
585
+ {0x10583,0x105AA},
586
+ {0x10584,0x105AB},
587
+ {0x10585,0x105AC},
588
+ {0x10586,0x105AD},
589
+ {0x10587,0x105AE},
590
+ {0x10588,0x105AF},
591
+ {0x10589,0x105B0},
592
+ {0x1058A,0x105B1},
593
+ {0x1058C,0x105B3},
594
+ {0x1058D,0x105B4},
595
+ {0x1058E,0x105B5},
596
+ {0x1058F,0x105B6},
597
+ {0x10590,0x105B7},
598
+ {0x10591,0x105B8},
599
+ {0x10592,0x105B9},
600
+ {0x10594,0x105BB},
601
+ {0x10595,0x105BC},
567
602
  {0x10A0,0x2D00},
568
603
  {0x10A1,0x2D01},
569
604
  {0x10A2,0x2D02},
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1102
1137
  {0x2C2C,0x2C5C},
1103
1138
  {0x2C2D,0x2C5D},
1104
1139
  {0x2C2E,0x2C5E},
1140
+ {0x2C2F,0x2C5F},
1105
1141
  {0x2C60,0x2C61},
1106
1142
  {0x2C62,0x026B},
1107
1143
  {0x2C63,0x1D7D},
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1282
1318
  {0xA7BA,0xA7BB},
1283
1319
  {0xA7BC,0xA7BD},
1284
1320
  {0xA7BE,0xA7BF},
1321
+ {0xA7C0,0xA7C1},
1285
1322
  {0xA7C2,0xA7C3},
1286
1323
  {0xA7C4,0xA794},
1287
1324
  {0xA7C5,0x0282},
1288
1325
  {0xA7C6,0x1D8E},
1326
+ {0xA7C7,0xA7C8},
1327
+ {0xA7C9,0xA7CA},
1328
+ {0xA7D0,0xA7D1},
1329
+ {0xA7D6,0xA7D7},
1330
+ {0xA7D8,0xA7D9},
1331
+ {0xA7F5,0xA7F6},
1289
1332
  {0xAB70,0x13A0},
1290
1333
  {0xAB71,0x13A1},
1291
1334
  {0xAB72,0x13A2},
@@ -4,8 +4,16 @@ class CharacterSet
4
4
  def character_set
5
5
  CharacterSet.of_regexp(self)
6
6
  end
7
+
8
+ def covered_by_character_set?(other)
9
+ other.superset?(character_set)
10
+ end
11
+
12
+ def uses_character_set?(other)
13
+ other.intersect?(character_set)
14
+ end
7
15
  end
8
16
  end
9
17
  end
10
18
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
19
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -2,7 +2,7 @@ class CharacterSet
2
2
  module CoreExt
3
3
  module StringExt
4
4
  def character_set
5
- CharacterSet.of(self)
5
+ CharacterSet.of_string(self)
6
6
  end
7
7
 
8
8
  {
@@ -29,4 +29,4 @@ class CharacterSet
29
29
  end
30
30
  end
31
31
 
32
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -4,100 +4,84 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
7
+ def convert(expression, to = CharacterSet, acc = [])
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
- when Regexp::Expression::Root
12
- if expression.count != 1
13
- raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
- end
15
- convert(expression[0])
16
-
17
11
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp) }.reduce(:+)
19
- expression.negative? ? content.inversion : content
12
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
13
+ acc << (expression.negative? ? content.inversion : content)
20
14
 
21
15
  when Regexp::Expression::CharacterSet::Intersection
22
- expression.map { |subexp| convert(subexp) }.reduce(:&)
23
-
24
- when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
16
+ acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)
26
17
 
27
18
  when Regexp::Expression::CharacterSet::Range
28
- start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.new((start.min)..(finish.max))
19
+ start, finish = expression.map { |subexp| convert(subexp, to) }
20
+ acc << to.new((start.min)..(finish.max))
21
+
22
+ when Regexp::Expression::Subexpression # root, group, alternation, etc.
23
+ expression.each { |subexp| convert(subexp, to, acc) }
30
24
 
31
25
  when Regexp::Expression::CharacterType::Any
32
- CharacterSet.unicode
26
+ acc << to.unicode
33
27
 
34
28
  when Regexp::Expression::CharacterType::Base
35
29
  /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
36
30
  content =
37
31
  if expression.unicode_classes?
38
- # in u-mode, type shortcuts match the same as \p{<long type name>}
39
- CharacterSet.of_property(base_name)
32
+ # in u-mode, most type shortcuts match the same as \p{<long type name>}
33
+ if base_name == 'linebreak'
34
+ to.from_ranges(10..13, 133..133, 8232..8233)
35
+ else
36
+ to.of_property(base_name)
37
+ end
40
38
  else
41
39
  # in normal mode, types match only ascii chars
42
40
  case base_name.to_sym
43
- when :digit then CharacterSet.from_ranges(48..57)
44
- when :hex then CharacterSet.from_ranges(48..57, 65..70, 97..102)
45
- when :space then CharacterSet.from_ranges(9..13, 32..32)
46
- when :word then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
41
+ when :digit then to.from_ranges(48..57)
42
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
43
+ when :linebreak then to.from_ranges(10..13)
44
+ when :space then to.from_ranges(9..13, 32..32)
45
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
47
46
  else raise Error, "Unsupported CharacterType #{base_name}"
48
47
  end
49
48
  end
50
- negative ? content.inversion : content
49
+ acc << (negative ? content.inversion : content)
51
50
 
52
51
  when Regexp::Expression::EscapeSequence::CodepointList
53
- CharacterSet.new(expression.codepoints)
52
+ content = to.new(expression.codepoints)
53
+ acc << (expression.i? ? content.case_insensitive : content)
54
54
 
55
55
  when Regexp::Expression::EscapeSequence::Base
56
- CharacterSet[expression.codepoint]
57
-
58
- when Regexp::Expression::Group::Capture,
59
- Regexp::Expression::Group::Passive,
60
- Regexp::Expression::Group::Named,
61
- Regexp::Expression::Group::Atomic,
62
- Regexp::Expression::Group::Options
63
- case expression.count
64
- when 0 then CharacterSet[]
65
- when 1 then convert(expression.first)
66
- else
67
- raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
68
- end
69
-
70
- when Regexp::Expression::Alternation
71
- expression.map { |subexp| convert(subexp) }.reduce(:+)
72
-
73
- when Regexp::Expression::Alternative
74
- case expression.count
75
- when 0 then CharacterSet[]
76
- when 1 then convert(expression.first)
77
- else
78
- raise Error, 'Alternatives must contain exactly one expression'
79
- end
56
+ content = to[expression.codepoint]
57
+ acc << (expression.i? ? content.case_insensitive : content)
80
58
 
81
59
  when Regexp::Expression::Literal
82
- if expression.set_level == 0 && expression.text.size != 1
83
- raise Error, 'Literal runs outside of sets are codepoint *sequences*'
84
- end
85
- CharacterSet[expression.text.ord]
60
+ content = to[*expression.text.chars]
61
+ acc << (expression.i? ? content.case_insensitive : content)
86
62
 
87
63
  when Regexp::Expression::UnicodeProperty::Base,
88
64
  Regexp::Expression::PosixClass
89
- content = CharacterSet.of_property(expression.token)
65
+ content = to.of_property(expression.token)
90
66
  if expression.type == :posixclass && expression.ascii_classes?
91
67
  content = content.ascii_part
92
68
  end
93
- expression.negative? ? content.inversion : content
69
+ acc << (expression.negative? ? content.inversion : content)
70
+
71
+ when Regexp::Expression::Anchor::Base,
72
+ Regexp::Expression::Backreference::Base,
73
+ Regexp::Expression::Keep::Mark,
74
+ Regexp::Expression::Quantifier
75
+ # ignore zero-length and repeat expressions
94
76
 
95
77
  when Regexp::Expression::Base
96
78
  raise Error, "Unsupported expression class `#{expression.class}`"
97
79
 
98
80
  else
99
- raise Error, "Pass an expression (result of Regexp::Parser.parse)"
81
+ raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
100
82
  end
83
+
84
+ acc.reduce(:+) || to[]
101
85
  end
102
86
  end
103
87
  end