character_set 1.5.0-java → 1.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +1 -1
  3. data/.github/workflows/tests.yml +6 -2
  4. data/BENCHMARK.md +35 -31
  5. data/CHANGELOG.md +30 -1
  6. data/Gemfile +14 -0
  7. data/README.md +9 -6
  8. data/Rakefile +2 -120
  9. data/character_set.gemspec +0 -21
  10. data/ext/character_set/character_set.c +110 -125
  11. data/lib/character_set/core_ext/string_ext.rb +1 -1
  12. data/lib/character_set/parser.rb +8 -4
  13. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  14. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  15. data/lib/character_set/predefined_sets.rb +11 -0
  16. data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -20
  17. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  18. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  19. data/lib/character_set/ruby_fallback.rb +2 -6
  20. data/lib/character_set/shared_methods.rb +8 -2
  21. data/lib/character_set/version.rb +1 -1
  22. data/tasks/benchmark.rake +20 -0
  23. data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
  24. data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
  25. data/tasks/benchmarks/shared.rb +28 -0
  26. data/tasks/sync_casefold_data.rake +20 -0
  27. data/tasks/sync_predefined_sets.rake +9 -0
  28. data/tasks/sync_ruby_spec.rake +65 -0
  29. metadata +19 -182
  30. data/benchmarks/shared.rb +0 -30
  31. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  34. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  35. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  36. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  37. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  38. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
82
82
  .dsize = cs_memsize,
83
83
  },
84
84
  .data = NULL,
85
+ #ifdef RUBY_TYPED_FROZEN_SHAREABLE
86
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
87
+ #else
85
88
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
89
+ #endif
86
90
  };
87
91
 
88
92
  static inline VALUE
@@ -315,9 +319,9 @@ cs_method_minmax(VALUE self)
315
319
  cs_cp cp, alen, blen; \
316
320
  cs_ar *acps, *bcps; \
317
321
  struct cs_data *new_data; \
318
- new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
322
  acps = cs_fetch_cps(cs_a, &alen); \
320
323
  bcps = cs_fetch_cps(cs_b, &blen); \
324
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
321
325
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
326
  { \
323
327
  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
@@ -372,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
372
376
  cps = data->cps;
373
377
  len = data->len;
374
378
  cp = FIX2ULONG(cp_num);
375
- if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
379
+ if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
376
380
  {
377
381
  return Qnil;
378
382
  }
383
+
384
+ if (on)
385
+ {
386
+ set_cp(data, cp);
387
+ }
379
388
  else
380
389
  {
381
- if (on)
382
- {
383
- set_cp(data, cp);
384
- }
385
- else
386
- {
387
- clr_cp(cps, len, cp);
388
- }
389
- return cs;
390
+ clr_cp(cps, len, cp);
390
391
  }
392
+ return cs;
391
393
  }
392
394
 
393
395
  static VALUE
@@ -571,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
571
573
  {
572
574
  return cs_merge_cs(self, other);
573
575
  }
574
- else if (TYPE(other) == T_ARRAY)
576
+ if (TYPE(other) == T_ARRAY)
575
577
  {
576
578
  return cs_merge_rb_array(self, other);
577
579
  }
@@ -705,8 +707,7 @@ cs_method_ranges(VALUE self)
705
707
 
706
708
  if (!previous_cp_num) {
707
709
  current_start = cp_num;
708
- } else if (previous_cp_num + 2 != cp_num)
709
- {
710
+ } else if (previous_cp_num + 2 != cp_num) {
710
711
  // gap found, finalize previous range
711
712
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
712
713
  current_start = cp_num;
@@ -914,10 +915,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
914
915
  return new_cs;
915
916
  }
916
917
 
917
- typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
918
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
918
919
 
919
920
  static inline int
920
- add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
921
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
921
922
  {
922
923
  set_cp(data, str_cp);
923
924
  return 1;
@@ -964,7 +965,7 @@ cs_method_case_insensitive(VALUE self)
964
965
  }
965
966
 
966
967
  static inline VALUE
967
- each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
968
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
968
969
  {
969
970
  long i, str_len;
970
971
  unsigned int str_cp;
@@ -983,21 +984,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
983
984
  }
984
985
 
985
986
  static inline VALUE
986
- each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
987
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
987
988
  {
988
989
  int n;
989
990
  unsigned int str_cp;
990
991
  const char *ptr, *end;
991
- rb_encoding *enc;
992
+ rb_encoding *utf8;
993
+
994
+ utf8 = rb_utf8_encoding();
995
+ if (rb_enc_get(str) == utf8)
996
+ {
997
+ str = rb_str_new_frozen(str);
998
+ }
999
+ else
1000
+ {
1001
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1002
+ }
992
1003
 
993
- str = rb_str_new_frozen(str);
994
1004
  ptr = RSTRING_PTR(str);
995
1005
  end = RSTRING_END(str);
996
- enc = rb_enc_get(str);
997
1006
 
998
1007
  while (ptr < end)
999
1008
  {
1000
- str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
1009
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
1001
1010
  if (!(*func)(str_cp, cp_arr, len, data, memo))
1002
1011
  {
1003
1012
  return Qfalse;
@@ -1028,12 +1037,13 @@ single_byte_optimizable(VALUE str)
1028
1037
  }
1029
1038
 
1030
1039
  static inline VALUE
1031
- each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1040
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1032
1041
  {
1033
1042
  if (single_byte_optimizable(str))
1034
1043
  {
1035
1044
  return each_sb_cp(str, func, cp_arr, len, data, memo);
1036
1045
  }
1046
+
1037
1047
  return each_mb_cp(str, func, cp_arr, len, data, memo);
1038
1048
  }
1039
1049
 
@@ -1047,26 +1057,23 @@ raise_arg_err_unless_string(VALUE val)
1047
1057
  }
1048
1058
 
1049
1059
  static VALUE
1050
- cs_class_method_of(int argc, VALUE *argv, VALUE self)
1060
+ cs_class_method_of_string(VALUE self, VALUE string)
1051
1061
  {
1052
1062
  VALUE new_cs;
1053
1063
  struct cs_data *new_data;
1054
- int i;
1064
+
1065
+ raise_arg_err_unless_string(string);
1055
1066
  new_cs = cs_alloc(self, &new_data);
1056
- for (i = 0; i < argc; i++)
1057
- {
1058
- raise_arg_err_unless_string(argv[i]);
1059
- each_cp(argv[i], add_str_cp_to_arr, 0, 0, new_data, 0);
1060
- }
1067
+ each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
1061
1068
  return new_cs;
1062
1069
  }
1063
1070
 
1064
1071
  static inline int
1065
- count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1072
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1066
1073
  {
1067
1074
  if (tst_cp(cp_arr, len, str_cp))
1068
1075
  {
1069
- *memo += 1;
1076
+ *((VALUE *)memo) += 1;
1070
1077
  }
1071
1078
  return 1;
1072
1079
  }
@@ -1074,17 +1081,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
1074
1081
  static VALUE
1075
1082
  cs_method_count_in(VALUE self, VALUE str)
1076
1083
  {
1077
- VALUE count;
1084
+ long count;
1078
1085
  struct cs_data *data;
1079
1086
  raise_arg_err_unless_string(str);
1080
1087
  data = cs_fetch_data(self);
1081
1088
  count = 0;
1082
- each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1083
- return INT2NUM((int)count);
1089
+ each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
1090
+ return LONG2FIX(count);
1084
1091
  }
1085
1092
 
1086
1093
  static inline int
1087
- str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1094
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1088
1095
  {
1089
1096
  return tst_cp(cp_arr, len, str_cp);
1090
1097
  }
@@ -1099,11 +1106,11 @@ cs_method_cover_p(VALUE self, VALUE str)
1099
1106
  }
1100
1107
 
1101
1108
  static inline int
1102
- add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1109
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1103
1110
  {
1104
1111
  if (tst_cp(cp_arr, len, str_cp))
1105
1112
  {
1106
- rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1113
+ rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
1107
1114
  }
1108
1115
  return 1;
1109
1116
  }
@@ -1111,18 +1118,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
1111
1118
  static VALUE
1112
1119
  cs_method_scan(VALUE self, VALUE str)
1113
1120
  {
1114
- VALUE memo[2];
1121
+ VALUE memo;
1115
1122
  struct cs_data *data;
1116
1123
  raise_arg_err_unless_string(str);
1117
1124
  data = cs_fetch_data(self);
1118
- memo[0] = rb_ary_new();
1119
- memo[1] = (VALUE)rb_enc_get(str);
1125
+ memo = rb_ary_new();
1120
1126
  each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1121
- return memo[0];
1127
+ return memo;
1122
1128
  }
1123
1129
 
1124
1130
  static inline int
1125
- str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1131
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1126
1132
  {
1127
1133
  return !tst_cp(cp_arr, len, str_cp);
1128
1134
  }
@@ -1138,116 +1144,91 @@ cs_method_used_by_p(VALUE self, VALUE str)
1138
1144
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1139
1145
  }
1140
1146
 
1141
- static void
1142
- cs_str_buf_cat(VALUE str, const char *ptr, long len)
1143
- {
1144
- long total, olen;
1145
- char *sptr;
1146
-
1147
- RSTRING_GETMEM(str, sptr, olen);
1148
- sptr = RSTRING(str)->as.heap.ptr;
1149
- olen = RSTRING(str)->as.heap.len;
1150
- total = olen + len;
1151
- memcpy(sptr + olen, ptr, len);
1152
- RSTRING(str)->as.heap.len = total;
1153
- }
1154
-
1155
- #ifndef TERM_FILL
1156
- #define TERM_FILL(ptr, termlen) \
1157
- do \
1158
- { \
1159
- char *const term_fill_ptr = (ptr); \
1160
- const int term_fill_len = (termlen); \
1161
- *term_fill_ptr = '\0'; \
1162
- if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1163
- memset(term_fill_ptr, 0, term_fill_len); \
1164
- } while (0)
1165
- #endif
1166
-
1167
- static void
1168
- cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1169
- {
1170
- char *ptr;
1171
- long len;
1172
-
1173
- ptr = RSTRING(str)->as.heap.ptr;
1174
- len = RSTRING(str)->as.heap.len;
1175
- TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
1176
- }
1177
-
1147
+ // partially based on rb_str_delete_bang
1178
1148
  static inline VALUE
1179
1149
  cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1180
1150
  {
1181
1151
  cs_ar *cps;
1182
- cs_cp len;
1183
- rb_encoding *str_enc;
1184
- VALUE orig_len, new_str_buf;
1185
- int cp_len;
1186
- unsigned int str_cp;
1187
- const char *ptr, *end;
1152
+ cs_cp cs_len;
1153
+ VALUE orig_str_len;
1154
+
1155
+ rb_encoding *orig_enc, *utf8;
1156
+ char *s, *send, *t;
1157
+ int orig_was_utf8, cr;
1188
1158
 
1189
1159
  raise_arg_err_unless_string(str);
1190
1160
 
1191
- cps = cs_fetch_cps(set, &len);
1161
+ orig_str_len = RSTRING_LEN(str);
1162
+
1163
+ if (orig_str_len == 0)
1164
+ {
1165
+ return bang ? Qnil : str;
1166
+ }
1192
1167
 
1193
- orig_len = RSTRING_LEN(str);
1194
- if (orig_len < 1) // empty string, will never change
1168
+ orig_enc = rb_enc_get(str);
1169
+ utf8 = rb_utf8_encoding();
1170
+ orig_was_utf8 = orig_enc == utf8;
1171
+
1172
+ if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
1173
+ {
1174
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1175
+ }
1176
+ else
1195
1177
  {
1196
- if (bang)
1178
+ if (!bang)
1197
1179
  {
1198
- return Qnil;
1180
+ str = rb_str_dup(str);
1199
1181
  }
1200
- return rb_str_dup(str);
1201
1182
  }
1202
1183
 
1203
- new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
1204
- str_enc = rb_enc_get(str);
1205
- rb_enc_associate(new_str_buf, str_enc);
1206
- rb_str_modify(new_str_buf);
1207
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1184
+ cps = cs_fetch_cps(set, &cs_len);
1185
+ rb_str_modify(str);
1186
+ s = t = RSTRING_PTR(str);
1187
+ send = RSTRING_END(str);
1188
+ cr = ENC_CODERANGE_7BIT;
1208
1189
 
1209
- ptr = RSTRING_PTR(str);
1210
- end = RSTRING_END(str);
1211
-
1212
- if (single_byte_optimizable(str))
1190
+ while (s < send)
1213
1191
  {
1214
- while (ptr < end)
1192
+ unsigned int c;
1193
+ int clen;
1194
+
1195
+ if ((c = *(unsigned char *)s) < 0x80)
1215
1196
  {
1216
- str_cp = *ptr & 0xff;
1217
- if ((!tst_cp(cps, len, str_cp)) == delete)
1197
+ if (tst_cp(cps, cs_len, c) != delete)
1218
1198
  {
1219
- cs_str_buf_cat(new_str_buf, ptr, 1);
1199
+ if (t != s)
1200
+ *t = c;
1201
+ t++;
1220
1202
  }
1221
- ptr++;
1203
+ s++;
1222
1204
  }
1223
- }
1224
- else // likely to be multibyte string
1225
- {
1226
- while (ptr < end)
1205
+ else
1227
1206
  {
1228
- str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1229
- if ((!tst_cp(cps, len, str_cp)) == delete)
1207
+ c = rb_enc_codepoint_len(s, send, &clen, utf8);
1208
+
1209
+ if (tst_cp(cps, cs_len, c) != delete)
1230
1210
  {
1231
- cs_str_buf_cat(new_str_buf, ptr, cp_len);
1211
+ if (t != s)
1212
+ rb_enc_mbcput(c, t, utf8);
1213
+ t += clen;
1214
+ if (cr == ENC_CODERANGE_7BIT)
1215
+ cr = ENC_CODERANGE_VALID;
1232
1216
  }
1233
- ptr += cp_len;
1217
+ s += clen;
1234
1218
  }
1235
1219
  }
1236
1220
 
1237
- cs_str_buf_terminate(new_str_buf, str_enc);
1221
+ rb_str_set_len(str, t - RSTRING_PTR(str));
1222
+ ENC_CODERANGE_SET(str, cr);
1238
1223
 
1239
- if (bang)
1224
+ if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
1240
1225
  {
1241
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1242
- {
1243
- return Qnil;
1244
- }
1245
- rb_str_shared_replace(str, new_str_buf);
1226
+ return Qnil;
1246
1227
  }
1247
- else
1228
+
1229
+ if (!orig_was_utf8)
1248
1230
  {
1249
- RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
1250
- str = new_str_buf;
1231
+ return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
1251
1232
  }
1252
1233
 
1253
1234
  return str;
@@ -1289,6 +1270,10 @@ cs_method_allocated_length(VALUE self)
1289
1270
 
1290
1271
  void Init_character_set()
1291
1272
  {
1273
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
1274
+ rb_ext_ractor_safe(true);
1275
+ #endif
1276
+
1292
1277
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
1293
1278
 
1294
1279
  rb_define_alloc_func(cs, cs_method_allocate);
@@ -1343,7 +1328,7 @@ void Init_character_set()
1343
1328
  // `CharacterSet`-specific methods
1344
1329
 
1345
1330
  rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1346
- rb_define_singleton_method(cs, "of", cs_class_method_of, -1);
1331
+ rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
1347
1332
 
1348
1333
  rb_define_method(cs, "ranges", cs_method_ranges, 0);
1349
1334
  rb_define_method(cs, "sample", cs_method_sample, -1);
@@ -2,7 +2,7 @@ class CharacterSet
2
2
  module CoreExt
3
3
  module StringExt
4
4
  def character_set
5
- CharacterSet.of(self)
5
+ CharacterSet.of_string(self)
6
6
  end
7
7
 
8
8
  {
@@ -4,11 +4,15 @@ class CharacterSet
4
4
 
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
+
7
8
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
- return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
- return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
- raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
9
+ object.each do |el| # rubocop:disable Lint/UnreachableLoop
10
+ if el.is_a?(Integer) && el >= 0 && el < 0x110000
11
+ return object
12
+ elsif el.is_a?(String) && el.length == 1
13
+ return object.to_a.join.encode('utf-8').codepoints
14
+ end
15
+ raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
12
16
  end
13
17
  end
14
18