character_set 1.6.0-java → 1.8.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/gouteur.yml +1 -1
  3. data/.github/workflows/lint.yml +1 -1
  4. data/.github/workflows/tests.yml +3 -1
  5. data/.rubocop.yml +3 -0
  6. data/BENCHMARK.md +32 -32
  7. data/CHANGELOG.md +24 -1
  8. data/Gemfile +7 -6
  9. data/LICENSE.txt +1 -1
  10. data/README.md +3 -3
  11. data/Rakefile +2 -123
  12. data/character_set.gemspec +0 -7
  13. data/ext/character_set/character_set.c +77 -43
  14. data/lib/character_set/core_ext/regexp_ext.rb +8 -0
  15. data/lib/character_set/expression_converter.rb +37 -54
  16. data/lib/character_set/parser.rb +8 -4
  17. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  18. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  19. data/lib/character_set/ruby_fallback/character_set_methods.rb +14 -17
  20. data/lib/character_set/ruby_fallback/set_methods.rb +6 -21
  21. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
  22. data/lib/character_set/ruby_fallback.rb +18 -6
  23. data/lib/character_set/set_method_adapters.rb +1 -1
  24. data/lib/character_set/shared_methods.rb +6 -2
  25. data/lib/character_set/version.rb +1 -1
  26. data/tasks/benchmark.rake +20 -0
  27. data/tasks/benchmarks/shared.rb +28 -0
  28. data/tasks/sync_casefold_data.rake +20 -0
  29. data/tasks/sync_predefined_sets.rake +9 -0
  30. data/tasks/sync_ruby_spec.rake +65 -0
  31. metadata +19 -28
  32. data/benchmarks/shared.rb +0 -30
  33. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  34. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  35. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  36. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  37. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  38. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  39. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  40. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  41. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  42. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -376,22 +376,20 @@ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
376
376
  cps = data->cps;
377
377
  len = data->len;
378
378
  cp = FIX2ULONG(cp_num);
379
- if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
379
+ if (return_nil_if_noop && tst_cp(cps, len, cp) == on)
380
380
  {
381
381
  return Qnil;
382
382
  }
383
+
384
+ if (on)
385
+ {
386
+ set_cp(data, cp);
387
+ }
383
388
  else
384
389
  {
385
- if (on)
386
- {
387
- set_cp(data, cp);
388
- }
389
- else
390
- {
391
- clr_cp(cps, len, cp);
392
- }
393
- return cs;
390
+ clr_cp(cps, len, cp);
394
391
  }
392
+ return cs;
395
393
  }
396
394
 
397
395
  static VALUE
@@ -575,7 +573,7 @@ cs_method_merge(VALUE self, VALUE other)
575
573
  {
576
574
  return cs_merge_cs(self, other);
577
575
  }
578
- else if (TYPE(other) == T_ARRAY)
576
+ if (TYPE(other) == T_ARRAY)
579
577
  {
580
578
  return cs_merge_rb_array(self, other);
581
579
  }
@@ -677,6 +675,18 @@ cs_method_proper_superset_p(VALUE self, VALUE other)
677
675
  return (is_superset && is_proper) ? Qtrue : Qfalse;
678
676
  }
679
677
 
678
+ static VALUE
679
+ cs_method_spaceship_operator(VALUE self, VALUE other)
680
+ {
681
+ if (cs_method_eql_p(self, other))
682
+ return INT2FIX(0);
683
+ if (cs_method_proper_subset_p(self, other))
684
+ return INT2FIX(-1);
685
+ if (cs_method_proper_superset_p(self, other))
686
+ return INT2FIX(1);
687
+ return Qnil;
688
+ }
689
+
680
690
  // *******************************
681
691
  // `CharacterSet`-specific methods
682
692
  // *******************************
@@ -917,10 +927,10 @@ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
917
927
  return new_cs;
918
928
  }
919
929
 
920
- typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
930
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE memo);
921
931
 
922
932
  static inline int
923
- add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
933
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
924
934
  {
925
935
  set_cp(data, str_cp);
926
936
  return 1;
@@ -967,7 +977,7 @@ cs_method_case_insensitive(VALUE self)
967
977
  }
968
978
 
969
979
  static inline VALUE
970
- each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
980
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
971
981
  {
972
982
  long i, str_len;
973
983
  unsigned int str_cp;
@@ -986,21 +996,29 @@ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_d
986
996
  }
987
997
 
988
998
  static inline VALUE
989
- each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
999
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
990
1000
  {
991
1001
  int n;
992
1002
  unsigned int str_cp;
993
1003
  const char *ptr, *end;
994
- rb_encoding *enc;
1004
+ rb_encoding *utf8;
1005
+
1006
+ utf8 = rb_utf8_encoding();
1007
+ if (rb_enc_get(str) == utf8)
1008
+ {
1009
+ str = rb_str_new_frozen(str);
1010
+ }
1011
+ else
1012
+ {
1013
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1014
+ }
995
1015
 
996
- str = rb_str_new_frozen(str);
997
1016
  ptr = RSTRING_PTR(str);
998
1017
  end = RSTRING_END(str);
999
- enc = rb_enc_get(str);
1000
1018
 
1001
1019
  while (ptr < end)
1002
1020
  {
1003
- str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
1021
+ str_cp = rb_enc_codepoint_len(ptr, end, &n, utf8);
1004
1022
  if (!(*func)(str_cp, cp_arr, len, data, memo))
1005
1023
  {
1006
1024
  return Qfalse;
@@ -1031,12 +1049,13 @@ single_byte_optimizable(VALUE str)
1031
1049
  }
1032
1050
 
1033
1051
  static inline VALUE
1034
- each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1052
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1035
1053
  {
1036
1054
  if (single_byte_optimizable(str))
1037
1055
  {
1038
1056
  return each_sb_cp(str, func, cp_arr, len, data, memo);
1039
1057
  }
1058
+
1040
1059
  return each_mb_cp(str, func, cp_arr, len, data, memo);
1041
1060
  }
1042
1061
 
@@ -1062,11 +1081,11 @@ cs_class_method_of_string(VALUE self, VALUE string)
1062
1081
  }
1063
1082
 
1064
1083
  static inline int
1065
- count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1084
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1066
1085
  {
1067
1086
  if (tst_cp(cp_arr, len, str_cp))
1068
1087
  {
1069
- *memo += 1;
1088
+ *((VALUE *)memo) += 1;
1070
1089
  }
1071
1090
  return 1;
1072
1091
  }
@@ -1074,17 +1093,17 @@ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data
1074
1093
  static VALUE
1075
1094
  cs_method_count_in(VALUE self, VALUE str)
1076
1095
  {
1077
- VALUE count;
1096
+ long count;
1078
1097
  struct cs_data *data;
1079
1098
  raise_arg_err_unless_string(str);
1080
1099
  data = cs_fetch_data(self);
1081
1100
  count = 0;
1082
- each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1083
- return INT2NUM((int)count);
1101
+ each_cp(str, count_str_cp, data->cps, data->len, data, (VALUE)&count);
1102
+ return LONG2FIX(count);
1084
1103
  }
1085
1104
 
1086
1105
  static inline int
1087
- str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1106
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1088
1107
  {
1089
1108
  return tst_cp(cp_arr, len, str_cp);
1090
1109
  }
@@ -1099,11 +1118,11 @@ cs_method_cover_p(VALUE self, VALUE str)
1099
1118
  }
1100
1119
 
1101
1120
  static inline int
1102
- add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1121
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1103
1122
  {
1104
1123
  if (tst_cp(cp_arr, len, str_cp))
1105
1124
  {
1106
- rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1125
+ rb_ary_push(memo, rb_enc_uint_chr((int)str_cp, rb_utf8_encoding()));
1107
1126
  }
1108
1127
  return 1;
1109
1128
  }
@@ -1111,18 +1130,17 @@ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_d
1111
1130
  static VALUE
1112
1131
  cs_method_scan(VALUE self, VALUE str)
1113
1132
  {
1114
- VALUE memo[2];
1133
+ VALUE memo;
1115
1134
  struct cs_data *data;
1116
1135
  raise_arg_err_unless_string(str);
1117
1136
  data = cs_fetch_data(self);
1118
- memo[0] = rb_ary_new();
1119
- memo[1] = (VALUE)rb_enc_get(str);
1137
+ memo = rb_ary_new();
1120
1138
  each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1121
- return memo[0];
1139
+ return memo;
1122
1140
  }
1123
1141
 
1124
1142
  static inline int
1125
- str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1143
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE memo)
1126
1144
  {
1127
1145
  return !tst_cp(cp_arr, len, str_cp);
1128
1146
  }
@@ -1146,9 +1164,9 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1146
1164
  cs_cp cs_len;
1147
1165
  VALUE orig_str_len;
1148
1166
 
1149
- rb_encoding *enc;
1167
+ rb_encoding *orig_enc, *utf8;
1150
1168
  char *s, *send, *t;
1151
- int ascompat, cr;
1169
+ int orig_was_utf8, cr;
1152
1170
 
1153
1171
  raise_arg_err_unless_string(str);
1154
1172
 
@@ -1159,24 +1177,34 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1159
1177
  return bang ? Qnil : str;
1160
1178
  }
1161
1179
 
1162
- if (!bang)
1180
+ orig_enc = rb_enc_get(str);
1181
+ utf8 = rb_utf8_encoding();
1182
+ orig_was_utf8 = orig_enc == utf8;
1183
+
1184
+ if (!orig_was_utf8 && orig_enc != rb_usascii_encoding())
1185
+ {
1186
+ str = rb_str_encode(str, rb_enc_from_encoding(utf8), 0, Qnil);
1187
+ }
1188
+ else
1163
1189
  {
1164
- str = rb_str_dup(str);
1190
+ if (!bang)
1191
+ {
1192
+ str = rb_str_dup(str);
1193
+ }
1165
1194
  }
1166
1195
 
1167
1196
  cps = cs_fetch_cps(set, &cs_len);
1168
1197
  rb_str_modify(str);
1169
- enc = rb_enc_get(str);
1170
- ascompat = rb_enc_asciicompat(enc);
1171
1198
  s = t = RSTRING_PTR(str);
1172
1199
  send = RSTRING_END(str);
1173
- cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1200
+ cr = ENC_CODERANGE_7BIT;
1201
+
1174
1202
  while (s < send)
1175
1203
  {
1176
1204
  unsigned int c;
1177
1205
  int clen;
1178
1206
 
1179
- if (ascompat && (c = *(unsigned char *)s) < 0x80)
1207
+ if ((c = *(unsigned char *)s) < 0x80)
1180
1208
  {
1181
1209
  if (tst_cp(cps, cs_len, c) != delete)
1182
1210
  {
@@ -1188,12 +1216,12 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1188
1216
  }
1189
1217
  else
1190
1218
  {
1191
- c = rb_enc_codepoint_len(s, send, &clen, enc);
1219
+ c = rb_enc_codepoint_len(s, send, &clen, utf8);
1192
1220
 
1193
1221
  if (tst_cp(cps, cs_len, c) != delete)
1194
1222
  {
1195
1223
  if (t != s)
1196
- rb_enc_mbcput(c, t, enc);
1224
+ rb_enc_mbcput(c, t, utf8);
1197
1225
  t += clen;
1198
1226
  if (cr == ENC_CODERANGE_7BIT)
1199
1227
  cr = ENC_CODERANGE_VALID;
@@ -1210,6 +1238,11 @@ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1210
1238
  return Qnil;
1211
1239
  }
1212
1240
 
1241
+ if (!orig_was_utf8)
1242
+ {
1243
+ return rb_str_encode(str, rb_enc_from_encoding(orig_enc), 0, Qnil);
1244
+ }
1245
+
1213
1246
  return str;
1214
1247
  }
1215
1248
 
@@ -1303,6 +1336,7 @@ void Init_character_set()
1303
1336
  rb_define_method(cs, ">=", cs_method_superset_p, 1);
1304
1337
  rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1305
1338
  rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
1339
+ rb_define_method(cs, "<=>", cs_method_spaceship_operator, 1);
1306
1340
 
1307
1341
  // `CharacterSet`-specific methods
1308
1342
 
@@ -4,6 +4,14 @@ class CharacterSet
4
4
  def character_set
5
5
  CharacterSet.of_regexp(self)
6
6
  end
7
+
8
+ def covered_by_character_set?(other)
9
+ other.superset?(character_set)
10
+ end
11
+
12
+ def uses_character_set?(other)
13
+ other.intersect?(character_set)
14
+ end
7
15
  end
8
16
  end
9
17
  end
@@ -4,86 +4,61 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression, to = CharacterSet)
7
+ def convert(expression, to = CharacterSet, acc = [])
8
8
  CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
- when Regexp::Expression::Root
12
- if expression.count != 1
13
- raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
- end
15
- convert(expression[0], to)
16
-
17
11
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
19
- content ||= to[]
20
- expression.negative? ? content.inversion : content
12
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
13
+ acc << (expression.negative? ? content.inversion : content)
21
14
 
22
15
  when Regexp::Expression::CharacterSet::Intersection
23
- expression.map { |subexp| convert(subexp, to) }.reduce(:&)
24
-
25
- when Regexp::Expression::CharacterSet::IntersectedSequence
26
- expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
16
+ acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)
27
17
 
28
18
  when Regexp::Expression::CharacterSet::Range
29
19
  start, finish = expression.map { |subexp| convert(subexp, to) }
30
- to.new((start.min)..(finish.max))
20
+ acc << to.new((start.min)..(finish.max))
21
+
22
+ when Regexp::Expression::Subexpression # root, group, alternation, etc.
23
+ expression.each { |subexp| convert(subexp, to, acc) }
31
24
 
32
25
  when Regexp::Expression::CharacterType::Any
33
- to.unicode
26
+ acc << to.unicode
34
27
 
35
28
  when Regexp::Expression::CharacterType::Base
36
29
  /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
30
  content =
38
31
  if expression.unicode_classes?
39
- # in u-mode, type shortcuts match the same as \p{<long type name>}
40
- to.of_property(base_name)
32
+ # in u-mode, most type shortcuts match the same as \p{<long type name>}
33
+ if base_name == 'linebreak'
34
+ to.from_ranges(10..13, 133..133, 8232..8233)
35
+ else
36
+ to.of_property(base_name)
37
+ end
41
38
  else
42
39
  # in normal mode, types match only ascii chars
43
40
  case base_name.to_sym
44
- when :digit then to.from_ranges(48..57)
45
- when :hex then to.from_ranges(48..57, 65..70, 97..102)
46
- when :space then to.from_ranges(9..13, 32..32)
47
- when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
41
+ when :digit then to.from_ranges(48..57)
42
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
43
+ when :linebreak then to.from_ranges(10..13)
44
+ when :space then to.from_ranges(9..13, 32..32)
45
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
48
46
  else raise Error, "Unsupported CharacterType #{base_name}"
49
47
  end
50
48
  end
51
- negative ? content.inversion : content
49
+ acc << (negative ? content.inversion : content)
52
50
 
53
51
  when Regexp::Expression::EscapeSequence::CodepointList
54
- to.new(expression.codepoints)
52
+ content = to.new(expression.codepoints)
53
+ acc << (expression.i? ? content.case_insensitive : content)
55
54
 
56
55
  when Regexp::Expression::EscapeSequence::Base
57
- to[expression.codepoint]
58
-
59
- when Regexp::Expression::Group::Capture,
60
- Regexp::Expression::Group::Passive,
61
- Regexp::Expression::Group::Named,
62
- Regexp::Expression::Group::Atomic,
63
- Regexp::Expression::Group::Options
64
- case expression.count
65
- when 0 then to[]
66
- when 1 then convert(expression.first, to)
67
- else
68
- raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
69
- end
70
-
71
- when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
72
- expression.map { |subexp| convert(subexp, to) }.reduce(:+)
73
-
74
- when Regexp::Expression::Alternative
75
- case expression.count
76
- when 0 then to[]
77
- when 1 then convert(expression.first, to)
78
- else
79
- raise Error, 'Alternatives must contain exactly one expression'
80
- end
56
+ content = to[expression.codepoint]
57
+ acc << (expression.i? ? content.case_insensitive : content)
81
58
 
82
59
  when Regexp::Expression::Literal
83
- if expression.set_level == 0 && expression.text.size != 1
84
- raise Error, 'Literal runs outside of sets are codepoint *sequences*'
85
- end
86
- to[expression.text.ord]
60
+ content = to[*expression.text.chars]
61
+ acc << (expression.i? ? content.case_insensitive : content)
87
62
 
88
63
  when Regexp::Expression::UnicodeProperty::Base,
89
64
  Regexp::Expression::PosixClass
@@ -91,14 +66,22 @@ class CharacterSet
91
66
  if expression.type == :posixclass && expression.ascii_classes?
92
67
  content = content.ascii_part
93
68
  end
94
- expression.negative? ? content.inversion : content
69
+ acc << (expression.negative? ? content.inversion : content)
70
+
71
+ when Regexp::Expression::Anchor::Base,
72
+ Regexp::Expression::Backreference::Base,
73
+ Regexp::Expression::Keep::Mark,
74
+ Regexp::Expression::Quantifier
75
+ # ignore zero-length and repeat expressions
95
76
 
96
77
  when Regexp::Expression::Base
97
78
  raise Error, "Unsupported expression class `#{expression.class}`"
98
79
 
99
80
  else
100
- raise Error, "Pass an expression (result of Regexp::Parser.parse)"
81
+ raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
101
82
  end
83
+
84
+ acc.reduce(:+) || to[]
102
85
  end
103
86
  end
104
87
  end
@@ -4,11 +4,15 @@ class CharacterSet
4
4
 
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
+
7
8
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
- return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
- return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
- raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
9
+ object.each do |el| # rubocop:disable Lint/UnreachableLoop
10
+ if el.is_a?(Integer) && el >= 0 && el < 0x110000
11
+ return object
12
+ elsif el.is_a?(String) && el.length == 1
13
+ return object.to_a.join.encode('utf-8').codepoints
14
+ end
15
+ raise ArgumentError, "#{el.inspect} is not valid as a codepoint"
12
16
  end
13
17
  end
14
18