character_set 1.3.0-java → 1.6.0-java

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
82
82
  .dsize = cs_memsize,
83
83
  },
84
84
  .data = NULL,
85
+ #ifdef RUBY_TYPED_FROZEN_SHAREABLE
86
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
87
+ #else
85
88
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
89
+ #endif
86
90
  };
87
91
 
88
92
  static inline VALUE
@@ -216,6 +220,7 @@ cs_method_hash(VALUE self)
216
220
  cs_cp cp, len, hash, four_byte_value;
217
221
  cs_ar *cps;
218
222
  cps = cs_fetch_cps(self, &len);
223
+ four_byte_value = 0;
219
224
 
220
225
  hash = 17;
221
226
  for (cp = 0; cp < len; cp++)
@@ -314,9 +319,9 @@ cs_method_minmax(VALUE self)
314
319
  cs_cp cp, alen, blen; \
315
320
  cs_ar *acps, *bcps; \
316
321
  struct cs_data *new_data; \
317
- new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
318
322
  acps = cs_fetch_cps(cs_a, &alen); \
319
323
  bcps = cs_fetch_cps(cs_b, &blen); \
324
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
320
325
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
321
326
  { \
322
327
  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
@@ -1045,13 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
1045
1050
  }
1046
1051
 
1047
1052
  static VALUE
1048
- cs_class_method_of(VALUE self, VALUE str)
1053
+ cs_class_method_of_string(VALUE self, VALUE string)
1049
1054
  {
1050
1055
  VALUE new_cs;
1051
1056
  struct cs_data *new_data;
1057
+
1058
+ raise_arg_err_unless_string(string);
1052
1059
  new_cs = cs_alloc(self, &new_data);
1053
- raise_arg_err_unless_string(str);
1054
- each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1060
+ each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
1055
1061
  return new_cs;
1056
1062
  }
1057
1063
 
@@ -1074,7 +1080,7 @@ cs_method_count_in(VALUE self, VALUE str)
1074
1080
  data = cs_fetch_data(self);
1075
1081
  count = 0;
1076
1082
  each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1077
- return INT2NUM(count);
1083
+ return INT2NUM((int)count);
1078
1084
  }
1079
1085
 
1080
1086
  static inline int
@@ -1132,118 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
1132
1138
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1133
1139
  }
1134
1140
 
1135
- static void
1136
- cs_str_buf_cat(VALUE str, const char *ptr, long len)
1137
- {
1138
- long total, olen;
1139
- char *sptr;
1140
-
1141
- RSTRING_GETMEM(str, sptr, olen);
1142
- sptr = RSTRING(str)->as.heap.ptr;
1143
- olen = RSTRING(str)->as.heap.len;
1144
- total = olen + len;
1145
- memcpy(sptr + olen, ptr, len);
1146
- RSTRING(str)->as.heap.len = total;
1147
- }
1148
-
1149
- #ifndef TERM_FILL
1150
- #define TERM_FILL(ptr, termlen) \
1151
- do \
1152
- { \
1153
- char *const term_fill_ptr = (ptr); \
1154
- const int term_fill_len = (termlen); \
1155
- *term_fill_ptr = '\0'; \
1156
- if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1157
- memset(term_fill_ptr, 0, term_fill_len); \
1158
- } while (0)
1159
- #endif
1160
-
1161
- static void
1162
- cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1163
- {
1164
- char *ptr;
1165
- long len;
1166
-
1167
- ptr = RSTRING(str)->as.heap.ptr;
1168
- len = RSTRING(str)->as.heap.len;
1169
- TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
1170
- }
1171
-
1141
+ // partially based on rb_str_delete_bang
1172
1142
  static inline VALUE
1173
1143
  cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1174
1144
  {
1175
1145
  cs_ar *cps;
1176
- cs_cp len;
1177
- rb_encoding *str_enc;
1178
- VALUE orig_len, new_str_buf;
1179
- int cp_len;
1180
- unsigned int str_cp;
1181
- const char *ptr, *end;
1146
+ cs_cp cs_len;
1147
+ VALUE orig_str_len;
1148
+
1149
+ rb_encoding *enc;
1150
+ char *s, *send, *t;
1151
+ int ascompat, cr;
1182
1152
 
1183
1153
  raise_arg_err_unless_string(str);
1184
1154
 
1185
- cps = cs_fetch_cps(set, &len);
1155
+ orig_str_len = RSTRING_LEN(str);
1186
1156
 
1187
- orig_len = RSTRING_LEN(str);
1188
- if (orig_len < 1) // empty string, will never change
1157
+ if (orig_str_len == 0)
1189
1158
  {
1190
- if (bang)
1191
- {
1192
- return Qnil;
1193
- }
1194
- return rb_str_dup(str);
1159
+ return bang ? Qnil : str;
1195
1160
  }
1196
1161
 
1197
- new_str_buf = rb_str_buf_new(orig_len);
1198
- str_enc = rb_enc_get(str);
1199
- rb_enc_associate(new_str_buf, str_enc);
1200
- rb_str_modify(new_str_buf);
1201
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1202
-
1203
- ptr = RSTRING_PTR(str);
1204
- end = RSTRING_END(str);
1162
+ if (!bang)
1163
+ {
1164
+ str = rb_str_dup(str);
1165
+ }
1205
1166
 
1206
- if (single_byte_optimizable(str))
1167
+ cps = cs_fetch_cps(set, &cs_len);
1168
+ rb_str_modify(str);
1169
+ enc = rb_enc_get(str);
1170
+ ascompat = rb_enc_asciicompat(enc);
1171
+ s = t = RSTRING_PTR(str);
1172
+ send = RSTRING_END(str);
1173
+ cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1174
+ while (s < send)
1207
1175
  {
1208
- while (ptr < end)
1176
+ unsigned int c;
1177
+ int clen;
1178
+
1179
+ if (ascompat && (c = *(unsigned char *)s) < 0x80)
1209
1180
  {
1210
- str_cp = *ptr & 0xff;
1211
- if ((!tst_cp(cps, len, str_cp)) == delete)
1181
+ if (tst_cp(cps, cs_len, c) != delete)
1212
1182
  {
1213
- cs_str_buf_cat(new_str_buf, ptr, 1);
1183
+ if (t != s)
1184
+ *t = c;
1185
+ t++;
1214
1186
  }
1215
- ptr++;
1187
+ s++;
1216
1188
  }
1217
- }
1218
- else // likely to be multibyte string
1219
- {
1220
- while (ptr < end)
1189
+ else
1221
1190
  {
1222
- str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1223
- if ((!tst_cp(cps, len, str_cp)) == delete)
1191
+ c = rb_enc_codepoint_len(s, send, &clen, enc);
1192
+
1193
+ if (tst_cp(cps, cs_len, c) != delete)
1224
1194
  {
1225
- cs_str_buf_cat(new_str_buf, ptr, cp_len);
1195
+ if (t != s)
1196
+ rb_enc_mbcput(c, t, enc);
1197
+ t += clen;
1198
+ if (cr == ENC_CODERANGE_7BIT)
1199
+ cr = ENC_CODERANGE_VALID;
1226
1200
  }
1227
- ptr += cp_len;
1201
+ s += clen;
1228
1202
  }
1229
1203
  }
1230
1204
 
1231
- cs_str_buf_terminate(new_str_buf, str_enc);
1205
+ rb_str_set_len(str, t - RSTRING_PTR(str));
1206
+ ENC_CODERANGE_SET(str, cr);
1232
1207
 
1233
- if (bang)
1234
- {
1235
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1236
- {
1237
- return Qnil;
1238
- }
1239
- rb_str_shared_replace(str, new_str_buf);
1240
- }
1241
- else
1208
+ if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
1242
1209
  {
1243
- RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
1244
- // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
1245
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
1246
- str = new_str_buf;
1210
+ return Qnil;
1247
1211
  }
1248
1212
 
1249
1213
  return str;
@@ -1285,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
1285
1249
 
1286
1250
  void Init_character_set()
1287
1251
  {
1252
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
1253
+ rb_ext_ractor_safe(true);
1254
+ #endif
1255
+
1288
1256
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
1289
1257
 
1290
1258
  rb_define_alloc_func(cs, cs_method_allocate);
@@ -1339,7 +1307,7 @@ void Init_character_set()
1339
1307
  // `CharacterSet`-specific methods
1340
1308
 
1341
1309
  rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1342
- rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1310
+ rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
1343
1311
 
1344
1312
  rb_define_method(cs, "ranges", cs_method_ranges, 0);
1345
1313
  rb_define_method(cs, "sample", cs_method_sample, -1);
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
6
6
  unsigned long to;
7
7
  } casefold_mapping;
8
8
 
9
- #define CASEFOLD_COUNT 1383
9
+ #define CASEFOLD_COUNT 1426
10
10
 
11
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
12
12
  {0x0041,0x0061},
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
564
564
  {0x104D1,0x104F9},
565
565
  {0x104D2,0x104FA},
566
566
  {0x104D3,0x104FB},
567
+ {0x10570,0x10597},
568
+ {0x10571,0x10598},
569
+ {0x10572,0x10599},
570
+ {0x10573,0x1059A},
571
+ {0x10574,0x1059B},
572
+ {0x10575,0x1059C},
573
+ {0x10576,0x1059D},
574
+ {0x10577,0x1059E},
575
+ {0x10578,0x1059F},
576
+ {0x10579,0x105A0},
577
+ {0x1057A,0x105A1},
578
+ {0x1057C,0x105A3},
579
+ {0x1057D,0x105A4},
580
+ {0x1057E,0x105A5},
581
+ {0x1057F,0x105A6},
582
+ {0x10580,0x105A7},
583
+ {0x10581,0x105A8},
584
+ {0x10582,0x105A9},
585
+ {0x10583,0x105AA},
586
+ {0x10584,0x105AB},
587
+ {0x10585,0x105AC},
588
+ {0x10586,0x105AD},
589
+ {0x10587,0x105AE},
590
+ {0x10588,0x105AF},
591
+ {0x10589,0x105B0},
592
+ {0x1058A,0x105B1},
593
+ {0x1058C,0x105B3},
594
+ {0x1058D,0x105B4},
595
+ {0x1058E,0x105B5},
596
+ {0x1058F,0x105B6},
597
+ {0x10590,0x105B7},
598
+ {0x10591,0x105B8},
599
+ {0x10592,0x105B9},
600
+ {0x10594,0x105BB},
601
+ {0x10595,0x105BC},
567
602
  {0x10A0,0x2D00},
568
603
  {0x10A1,0x2D01},
569
604
  {0x10A2,0x2D02},
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1102
1137
  {0x2C2C,0x2C5C},
1103
1138
  {0x2C2D,0x2C5D},
1104
1139
  {0x2C2E,0x2C5E},
1140
+ {0x2C2F,0x2C5F},
1105
1141
  {0x2C60,0x2C61},
1106
1142
  {0x2C62,0x026B},
1107
1143
  {0x2C63,0x1D7D},
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1282
1318
  {0xA7BA,0xA7BB},
1283
1319
  {0xA7BC,0xA7BD},
1284
1320
  {0xA7BE,0xA7BF},
1321
+ {0xA7C0,0xA7C1},
1285
1322
  {0xA7C2,0xA7C3},
1286
1323
  {0xA7C4,0xA794},
1287
1324
  {0xA7C5,0x0282},
1288
1325
  {0xA7C6,0x1D8E},
1326
+ {0xA7C7,0xA7C8},
1327
+ {0xA7C9,0xA7CA},
1328
+ {0xA7D0,0xA7D1},
1329
+ {0xA7D6,0xA7D7},
1330
+ {0xA7D8,0xA7D9},
1331
+ {0xA7F5,0xA7F6},
1289
1332
  {0xAB70,0x13A0},
1290
1333
  {0xAB71,0x13A1},
1291
1334
  {0xAB72,0x13A2},
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -8,4 +8,4 @@ class CharacterSet
8
8
  end
9
9
  end
10
10
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
11
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -2,7 +2,7 @@ class CharacterSet
2
2
  module CoreExt
3
3
  module StringExt
4
4
  def character_set
5
- CharacterSet.of(self)
5
+ CharacterSet.of_string(self)
6
6
  end
7
7
 
8
8
  {
@@ -29,4 +29,4 @@ class CharacterSet
29
29
  end
30
30
  end
31
31
 
32
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -4,56 +4,57 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
7
+ def convert(expression, to = CharacterSet)
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
11
  when Regexp::Expression::Root
12
12
  if expression.count != 1
13
13
  raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
14
  end
15
- convert(expression[0])
15
+ convert(expression[0], to)
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp) }.reduce(:+)
18
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
19
+ content ||= to[]
19
20
  expression.negative? ? content.inversion : content
20
21
 
21
22
  when Regexp::Expression::CharacterSet::Intersection
22
- expression.map { |subexp| convert(subexp) }.reduce(:&)
23
+ expression.map { |subexp| convert(subexp, to) }.reduce(:&)
23
24
 
24
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
26
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
26
27
 
27
28
  when Regexp::Expression::CharacterSet::Range
28
- start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.new((start.min)..(finish.max))
29
+ start, finish = expression.map { |subexp| convert(subexp, to) }
30
+ to.new((start.min)..(finish.max))
30
31
 
31
32
  when Regexp::Expression::CharacterType::Any
32
- CharacterSet.unicode
33
+ to.unicode
33
34
 
34
35
  when Regexp::Expression::CharacterType::Base
35
36
  /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
36
37
  content =
37
38
  if expression.unicode_classes?
38
39
  # in u-mode, type shortcuts match the same as \p{<long type name>}
39
- CharacterSet.of_property(base_name)
40
+ to.of_property(base_name)
40
41
  else
41
42
  # in normal mode, types match only ascii chars
42
43
  case base_name.to_sym
43
- when :digit then CharacterSet.from_ranges(48..57)
44
- when :hex then CharacterSet.from_ranges(48..57, 65..70, 97..102)
45
- when :space then CharacterSet.from_ranges(9..13, 32..32)
46
- when :word then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
44
+ when :digit then to.from_ranges(48..57)
45
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then to.from_ranges(9..13, 32..32)
47
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
47
48
  else raise Error, "Unsupported CharacterType #{base_name}"
48
49
  end
49
50
  end
50
51
  negative ? content.inversion : content
51
52
 
52
53
  when Regexp::Expression::EscapeSequence::CodepointList
53
- CharacterSet.new(expression.codepoints)
54
+ to.new(expression.codepoints)
54
55
 
55
56
  when Regexp::Expression::EscapeSequence::Base
56
- CharacterSet[expression.codepoint]
57
+ to[expression.codepoint]
57
58
 
58
59
  when Regexp::Expression::Group::Capture,
59
60
  Regexp::Expression::Group::Passive,
@@ -61,19 +62,19 @@ class CharacterSet
61
62
  Regexp::Expression::Group::Atomic,
62
63
  Regexp::Expression::Group::Options
63
64
  case expression.count
64
- when 0 then CharacterSet[]
65
- when 1 then convert(expression.first)
65
+ when 0 then to[]
66
+ when 1 then convert(expression.first, to)
66
67
  else
67
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
68
69
  end
69
70
 
70
- when Regexp::Expression::Alternation
71
- expression.map { |subexp| convert(subexp) }.reduce(:+)
71
+ when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
72
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+)
72
73
 
73
74
  when Regexp::Expression::Alternative
74
75
  case expression.count
75
- when 0 then CharacterSet[]
76
- when 1 then convert(expression.first)
76
+ when 0 then to[]
77
+ when 1 then convert(expression.first, to)
77
78
  else
78
79
  raise Error, 'Alternatives must contain exactly one expression'
79
80
  end
@@ -82,11 +83,11 @@ class CharacterSet
82
83
  if expression.set_level == 0 && expression.text.size != 1
83
84
  raise Error, 'Literal runs outside of sets are codepoint *sequences*'
84
85
  end
85
- CharacterSet[expression.text.ord]
86
+ to[expression.text.ord]
86
87
 
87
88
  when Regexp::Expression::UnicodeProperty::Base,
88
89
  Regexp::Expression::PosixClass
89
- content = CharacterSet.of_property(expression.token)
90
+ content = to.of_property(expression.token)
90
91
  if expression.type == :posixclass && expression.ascii_classes?
91
92
  content = content.ascii_part
92
93
  end
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
7
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e|
8
+ object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
9
  return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
10
  return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
11
  raise ArgumentError, "#{e.inspect} is not valid as a codepoint"