character_set 1.4.1 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
6
6
  unsigned long to;
7
7
  } casefold_mapping;
8
8
 
9
- #define CASEFOLD_COUNT 1383
9
+ #define CASEFOLD_COUNT 1426
10
10
 
11
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
12
12
  {0x0041,0x0061},
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
564
564
  {0x104D1,0x104F9},
565
565
  {0x104D2,0x104FA},
566
566
  {0x104D3,0x104FB},
567
+ {0x10570,0x10597},
568
+ {0x10571,0x10598},
569
+ {0x10572,0x10599},
570
+ {0x10573,0x1059A},
571
+ {0x10574,0x1059B},
572
+ {0x10575,0x1059C},
573
+ {0x10576,0x1059D},
574
+ {0x10577,0x1059E},
575
+ {0x10578,0x1059F},
576
+ {0x10579,0x105A0},
577
+ {0x1057A,0x105A1},
578
+ {0x1057C,0x105A3},
579
+ {0x1057D,0x105A4},
580
+ {0x1057E,0x105A5},
581
+ {0x1057F,0x105A6},
582
+ {0x10580,0x105A7},
583
+ {0x10581,0x105A8},
584
+ {0x10582,0x105A9},
585
+ {0x10583,0x105AA},
586
+ {0x10584,0x105AB},
587
+ {0x10585,0x105AC},
588
+ {0x10586,0x105AD},
589
+ {0x10587,0x105AE},
590
+ {0x10588,0x105AF},
591
+ {0x10589,0x105B0},
592
+ {0x1058A,0x105B1},
593
+ {0x1058C,0x105B3},
594
+ {0x1058D,0x105B4},
595
+ {0x1058E,0x105B5},
596
+ {0x1058F,0x105B6},
597
+ {0x10590,0x105B7},
598
+ {0x10591,0x105B8},
599
+ {0x10592,0x105B9},
600
+ {0x10594,0x105BB},
601
+ {0x10595,0x105BC},
567
602
  {0x10A0,0x2D00},
568
603
  {0x10A1,0x2D01},
569
604
  {0x10A2,0x2D02},
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1102
1137
  {0x2C2C,0x2C5C},
1103
1138
  {0x2C2D,0x2C5D},
1104
1139
  {0x2C2E,0x2C5E},
1140
+ {0x2C2F,0x2C5F},
1105
1141
  {0x2C60,0x2C61},
1106
1142
  {0x2C62,0x026B},
1107
1143
  {0x2C63,0x1D7D},
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1282
1318
  {0xA7BA,0xA7BB},
1283
1319
  {0xA7BC,0xA7BD},
1284
1320
  {0xA7BE,0xA7BF},
1321
+ {0xA7C0,0xA7C1},
1285
1322
  {0xA7C2,0xA7C3},
1286
1323
  {0xA7C4,0xA794},
1287
1324
  {0xA7C5,0x0282},
1288
1325
  {0xA7C6,0x1D8E},
1326
+ {0xA7C7,0xA7C8},
1327
+ {0xA7C9,0xA7CA},
1328
+ {0xA7D0,0xA7D1},
1329
+ {0xA7D6,0xA7D7},
1330
+ {0xA7D8,0xA7D9},
1331
+ {0xA7F5,0xA7F6},
1289
1332
  {0xAB70,0x13A0},
1290
1333
  {0xAB71,0x13A1},
1291
1334
  {0xAB72,0x13A2},
@@ -2,7 +2,7 @@ class CharacterSet
2
2
  module CoreExt
3
3
  module StringExt
4
4
  def character_set
5
- CharacterSet.of(self)
5
+ CharacterSet.of_string(self)
6
6
  end
7
7
 
8
8
  {
@@ -4,7 +4,7 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression)
7
+ def convert(expression, to = CharacterSet)
8
8
  CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
@@ -12,49 +12,49 @@ class CharacterSet
12
12
  if expression.count != 1
13
13
  raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
14
  end
15
- convert(expression[0])
15
+ convert(expression[0], to)
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp) }.reduce(:+)
19
- content ||= CharacterSet[]
18
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
19
+ content ||= to[]
20
20
  expression.negative? ? content.inversion : content
21
21
 
22
22
  when Regexp::Expression::CharacterSet::Intersection
23
- expression.map { |subexp| convert(subexp) }.reduce(:&)
23
+ expression.map { |subexp| convert(subexp, to) }.reduce(:&)
24
24
 
25
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
26
- expression.map { |subexp| convert(subexp) }.reduce(:+) || CharacterSet[]
26
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
27
27
 
28
28
  when Regexp::Expression::CharacterSet::Range
29
- start, finish = expression.map { |subexp| convert(subexp) }
30
- CharacterSet.new((start.min)..(finish.max))
29
+ start, finish = expression.map { |subexp| convert(subexp, to) }
30
+ to.new((start.min)..(finish.max))
31
31
 
32
32
  when Regexp::Expression::CharacterType::Any
33
- CharacterSet.unicode
33
+ to.unicode
34
34
 
35
35
  when Regexp::Expression::CharacterType::Base
36
36
  /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
37
  content =
38
38
  if expression.unicode_classes?
39
39
  # in u-mode, type shortcuts match the same as \p{<long type name>}
40
- CharacterSet.of_property(base_name)
40
+ to.of_property(base_name)
41
41
  else
42
42
  # in normal mode, types match only ascii chars
43
43
  case base_name.to_sym
44
- when :digit then CharacterSet.from_ranges(48..57)
45
- when :hex then CharacterSet.from_ranges(48..57, 65..70, 97..102)
46
- when :space then CharacterSet.from_ranges(9..13, 32..32)
47
- when :word then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
44
+ when :digit then to.from_ranges(48..57)
45
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then to.from_ranges(9..13, 32..32)
47
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
48
48
  else raise Error, "Unsupported CharacterType #{base_name}"
49
49
  end
50
50
  end
51
51
  negative ? content.inversion : content
52
52
 
53
53
  when Regexp::Expression::EscapeSequence::CodepointList
54
- CharacterSet.new(expression.codepoints)
54
+ to.new(expression.codepoints)
55
55
 
56
56
  when Regexp::Expression::EscapeSequence::Base
57
- CharacterSet[expression.codepoint]
57
+ to[expression.codepoint]
58
58
 
59
59
  when Regexp::Expression::Group::Capture,
60
60
  Regexp::Expression::Group::Passive,
@@ -62,19 +62,19 @@ class CharacterSet
62
62
  Regexp::Expression::Group::Atomic,
63
63
  Regexp::Expression::Group::Options
64
64
  case expression.count
65
- when 0 then CharacterSet[]
66
- when 1 then convert(expression.first)
65
+ when 0 then to[]
66
+ when 1 then convert(expression.first, to)
67
67
  else
68
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
69
69
  end
70
70
 
71
71
  when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
72
- expression.map { |subexp| convert(subexp) }.reduce(:+)
72
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+)
73
73
 
74
74
  when Regexp::Expression::Alternative
75
75
  case expression.count
76
- when 0 then CharacterSet[]
77
- when 1 then convert(expression.first)
76
+ when 0 then to[]
77
+ when 1 then convert(expression.first, to)
78
78
  else
79
79
  raise Error, 'Alternatives must contain exactly one expression'
80
80
  end
@@ -83,11 +83,11 @@ class CharacterSet
83
83
  if expression.set_level == 0 && expression.text.size != 1
84
84
  raise Error, 'Literal runs outside of sets are codepoint *sequences*'
85
85
  end
86
- CharacterSet[expression.text.ord]
86
+ to[expression.text.ord]
87
87
 
88
88
  when Regexp::Expression::UnicodeProperty::Base,
89
89
  Regexp::Expression::PosixClass
90
- content = CharacterSet.of_property(expression.token)
90
+ content = to.of_property(expression.token)
91
91
  if expression.type == :posixclass && expression.ascii_classes?
92
92
  content = content.ascii_part
93
93
  end
@@ -21,7 +21,7 @@
21
21
  85E,85E
22
22
  860,86A
23
23
  8A0,8B4
24
- 8B6,8BD
24
+ 8B6,8C7
25
25
  8D3,983
26
26
  985,98C
27
27
  98F,990
@@ -76,7 +76,7 @@ B35,B39
76
76
  B3C,B44
77
77
  B47,B48
78
78
  B4B,B4D
79
- B56,B57
79
+ B55,B57
80
80
  B5C,B5D
81
81
  B5F,B63
82
82
  B66,B77
@@ -120,15 +120,14 @@ CDE,CDE
120
120
  CE0,CE3
121
121
  CE6,CEF
122
122
  CF1,CF2
123
- D00,D03
124
- D05,D0C
123
+ D00,D0C
125
124
  D0E,D10
126
125
  D12,D44
127
126
  D46,D48
128
127
  D4A,D4F
129
128
  D54,D63
130
129
  D66,D7F
131
- D82,D83
130
+ D81,D83
132
131
  D85,D96
133
132
  D9A,DB1
134
133
  DB3,DBB
@@ -214,7 +213,7 @@ FCE,FDA
214
213
  1A7F,1A89
215
214
  1A90,1A99
216
215
  1AA0,1AAD
217
- 1AB0,1ABE
216
+ 1AB0,1AC0
218
217
  1B00,1B4B
219
218
  1B50,1B7C
220
219
  1B80,1BF3
@@ -252,7 +251,7 @@ FCE,FDA
252
251
  2440,244A
253
252
  2460,2B73
254
253
  2B76,2B95
255
- 2B98,2C2E
254
+ 2B97,2C2E
256
255
  2C30,2C5E
257
256
  2C60,2CF3
258
257
  2CF9,2D25
@@ -269,7 +268,7 @@ FCE,FDA
269
268
  2DC8,2DCE
270
269
  2DD0,2DD6
271
270
  2DD8,2DDE
272
- 2DE0,2E4F
271
+ 2DE0,2E52
273
272
  2E80,2E99
274
273
  2E9B,2EF3
275
274
  2F00,2FD5
@@ -279,18 +278,16 @@ FCE,FDA
279
278
  3099,30FF
280
279
  3105,312F
281
280
  3131,318E
282
- 3190,31BA
283
- 31C0,31E3
281
+ 3190,31E3
284
282
  31F0,321E
285
- 3220,4DB5
286
- 4DC0,9FEF
283
+ 3220,9FFC
287
284
  A000,A48C
288
285
  A490,A4C6
289
286
  A4D0,A62B
290
287
  A640,A6F7
291
288
  A700,A7BF
292
- A7C2,A7C6
293
- A7F7,A82B
289
+ A7C2,A7CA
290
+ A7F5,A82C
294
291
  A830,A839
295
292
  A840,A877
296
293
  A880,A8C5
@@ -310,7 +307,7 @@ AB09,AB0E
310
307
  AB11,AB16
311
308
  AB20,AB26
312
309
  AB28,AB2E
313
- AB30,AB67
310
+ AB30,AB6B
314
311
  AB70,ABED
315
312
  ABF0,ABF9
316
313
  AC00,D7A3
@@ -355,7 +352,7 @@ FFF9,FFFD
355
352
  10100,10102
356
353
  10107,10133
357
354
  10137,1018E
358
- 10190,1019B
355
+ 10190,1019C
359
356
  101A0,101A0
360
357
  101D0,101FD
361
358
  10280,1029C
@@ -415,8 +412,12 @@ FFF9,FFFD
415
412
  10CFA,10D27
416
413
  10D30,10D39
417
414
  10E60,10E7E
415
+ 10E80,10EA9
416
+ 10EAB,10EAD
417
+ 10EB0,10EB1
418
418
  10F00,10F27
419
419
  10F30,10F59
420
+ 10FB0,10FCB
420
421
  10FE0,10FF6
421
422
  11000,1104D
422
423
  11052,1106F
@@ -425,10 +426,9 @@ FFF9,FFFD
425
426
  110D0,110E8
426
427
  110F0,110F9
427
428
  11100,11134
428
- 11136,11146
429
+ 11136,11147
429
430
  11150,11176
430
- 11180,111CD
431
- 111D0,111DF
431
+ 11180,111DF
432
432
  111E1,111F4
433
433
  11200,11211
434
434
  11213,1123E
@@ -454,9 +454,8 @@ FFF9,FFFD
454
454
  1135D,11363
455
455
  11366,1136C
456
456
  11370,11374
457
- 11400,11459
458
- 1145B,1145B
459
- 1145D,1145F
457
+ 11400,1145B
458
+ 1145D,11461
460
459
  11480,114C7
461
460
  114D0,114D9
462
461
  11580,115B5
@@ -471,7 +470,14 @@ FFF9,FFFD
471
470
  11730,1173F
472
471
  11800,1183B
473
472
  118A0,118F2
474
- 118FF,118FF
473
+ 118FF,11906
474
+ 11909,11909
475
+ 1190C,11913
476
+ 11915,11916
477
+ 11918,11935
478
+ 11937,11938
479
+ 1193B,11946
480
+ 11950,11959
475
481
  119A0,119A7
476
482
  119AA,119D7
477
483
  119DA,119E4
@@ -499,6 +505,7 @@ FFF9,FFFD
499
505
  11D93,11D98
500
506
  11DA0,11DA9
501
507
  11EE0,11EF8
508
+ 11FB0,11FB0
502
509
  11FC0,11FF1
503
510
  11FFF,12399
504
511
  12400,1246E
@@ -522,9 +529,11 @@ FFF9,FFFD
522
529
  16F00,16F4A
523
530
  16F4F,16F87
524
531
  16F8F,16F9F
525
- 16FE0,16FE3
532
+ 16FE0,16FE4
533
+ 16FF0,16FF1
526
534
  17000,187F7
527
- 18800,18AF2
535
+ 18800,18CD5
536
+ 18D00,18D08
528
537
  1B000,1B11E
529
538
  1B150,1B152
530
539
  1B164,1B167
@@ -622,17 +631,15 @@ FFF9,FFFD
622
631
  1F0B1,1F0BF
623
632
  1F0C1,1F0CF
624
633
  1F0D1,1F0F5
625
- 1F100,1F10C
626
- 1F110,1F16C
627
- 1F170,1F1AC
634
+ 1F100,1F1AD
628
635
  1F1E6,1F202
629
636
  1F210,1F23B
630
637
  1F240,1F248
631
638
  1F250,1F251
632
639
  1F260,1F265
633
- 1F300,1F6D5
640
+ 1F300,1F6D7
634
641
  1F6E0,1F6EC
635
- 1F6F0,1F6FA
642
+ 1F6F0,1F6FC
636
643
  1F700,1F773
637
644
  1F780,1F7D8
638
645
  1F7E0,1F7EB
@@ -641,24 +648,28 @@ FFF9,FFFD
641
648
  1F850,1F859
642
649
  1F860,1F887
643
650
  1F890,1F8AD
644
- 1F900,1F90B
645
- 1F90D,1F971
646
- 1F973,1F976
647
- 1F97A,1F9A2
648
- 1F9A5,1F9AA
649
- 1F9AE,1F9CA
651
+ 1F8B0,1F8B1
652
+ 1F900,1F978
653
+ 1F97A,1F9CB
650
654
  1F9CD,1FA53
651
655
  1FA60,1FA6D
652
- 1FA70,1FA73
656
+ 1FA70,1FA74
653
657
  1FA78,1FA7A
654
- 1FA80,1FA82
655
- 1FA90,1FA95
656
- 20000,2A6D6
658
+ 1FA80,1FA86
659
+ 1FA90,1FAA8
660
+ 1FAB0,1FAB6
661
+ 1FAC0,1FAC2
662
+ 1FAD0,1FAD6
663
+ 1FB00,1FB92
664
+ 1FB94,1FBCA
665
+ 1FBF0,1FBF9
666
+ 20000,2A6DD
657
667
  2A700,2B734
658
668
  2B740,2B81D
659
669
  2B820,2CEA1
660
670
  2CEB0,2EBE0
661
671
  2F800,2FA1D
672
+ 30000,3134A
662
673
  E0001,E0001
663
674
  E0020,E007F
664
675
  E0100,E01EF
@@ -44,6 +44,7 @@ AE,AE
44
44
  2699,2699
45
45
  269B,269C
46
46
  26A0,26A1
47
+ 26A7,26A7
47
48
  26AA,26AB
48
49
  26B0,26B1
49
50
  26BD,26BE
@@ -130,22 +131,22 @@ AE,AE
130
131
  1F5FA,1F64F
131
132
  1F680,1F6C5
132
133
  1F6CB,1F6D2
133
- 1F6D5,1F6D5
134
+ 1F6D5,1F6D7
134
135
  1F6E0,1F6E5
135
136
  1F6E9,1F6E9
136
137
  1F6EB,1F6EC
137
138
  1F6F0,1F6F0
138
- 1F6F3,1F6FA
139
+ 1F6F3,1F6FC
139
140
  1F7E0,1F7EB
140
- 1F90D,1F93A
141
+ 1F90C,1F93A
141
142
  1F93C,1F945
142
- 1F947,1F971
143
- 1F973,1F976
144
- 1F97A,1F9A2
145
- 1F9A5,1F9AA
146
- 1F9AE,1F9CA
143
+ 1F947,1F978
144
+ 1F97A,1F9CB
147
145
  1F9CD,1F9FF
148
- 1FA70,1FA73
146
+ 1FA70,1FA74
149
147
  1FA78,1FA7A
150
- 1FA80,1FA82
151
- 1FA90,1FA95
148
+ 1FA80,1FA86
149
+ 1FA90,1FAA8
150
+ 1FAB0,1FAB6
151
+ 1FAC0,1FAC2
152
+ 1FAD0,1FAD6
@@ -22,6 +22,17 @@ class CharacterSet
22
22
  alias valid unicode
23
23
 
24
24
  def build_from_cps_file(path)
25
+ if defined?(Ractor) && Ractor.current != Ractor.main
26
+ raise <<-EOS.gsub(/^ */, '')
27
+ CharacterSet's predefined sets are lazy-loaded.
28
+ Pre-load them to use them in Ractors. E.g.:
29
+
30
+ CharacterSet.ascii # pre-load
31
+ Ractor.new { CharacterSet.ascii.size }.take # => 128
32
+ Ractor.new { 'abc'.keep_character_set(:ascii) }.take # => 'abc'
33
+ EOS
34
+ end
35
+
25
36
  File.readlines(path).inject(new) do |set, line|
26
37
  range_start, range_end = line.split(',')
27
38
  set.merge((range_start.to_i(16))..(range_end.to_i(16)))
@@ -6,9 +6,9 @@ class CharacterSet
6
6
  new(Array(ranges).flat_map(&:to_a))
7
7
  end
8
8
 
9
- def of(string)
10
- raise ArgumentError, 'pass a String' unless string.is_a?(String)
11
- new(string.codepoints)
9
+ def of_string(str)
10
+ raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
11
+ str.codepoints.each_with_object(new) { |cp, set| set << cp }
12
12
  end
13
13
  end
14
14
 
@@ -22,13 +22,14 @@ class CharacterSet
22
22
 
23
23
  # Allow some methods to take an Enum just as well as another CharacterSet.
24
24
  # Tested by ruby-spec.
25
- %w[& + - ^ | difference intersection subtract union].each do |method|
25
+ %w[& + - ^ | difference disjoint? intersect? intersection
26
+ subtract union].each do |method|
26
27
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
27
28
  def #{method}(arg)
28
29
  if arg.is_a?(CharacterSet)
29
- super
30
+ super(arg)
30
31
  elsif arg.respond_to?(:each)
31
- super(CharacterSet.new(arg.to_a))
32
+ super(self.class.new(arg.to_a))
32
33
  else
33
34
  raise ArgumentError, 'pass an enumerable'
34
35
  end
@@ -15,6 +15,12 @@ class CharacterSet
15
15
  new(Array(args))
16
16
  end
17
17
 
18
+ def of(*args)
19
+ args.map do |arg|
20
+ arg.is_a?(Regexp) ? of_regexp(arg) : of_string(arg)
21
+ end.reduce(:merge) || new
22
+ end
23
+
18
24
  def parse(string)
19
25
  codepoints = Parser.codepoints_from_bracket_expression(string)
20
26
  result = new(codepoints)
@@ -36,7 +42,7 @@ class CharacterSet
36
42
  end
37
43
 
38
44
  def of_expression(expression)
39
- ExpressionConverter.convert(expression)
45
+ ExpressionConverter.convert(expression, self)
40
46
  end
41
47
 
42
48
  def require_optional_dependency(name, method)
@@ -90,6 +96,14 @@ class CharacterSet
90
96
  Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
91
97
  end
92
98
 
99
+ def secure_token(length = 32)
100
+ CharacterSet.require_optional_dependency('securerandom', __method__)
101
+ cps = to_a
102
+ len = cps.count
103
+ 1.upto(length).map { cps[SecureRandom.random_number(len)] }.pack('U*')
104
+ end
105
+ alias random_token secure_token
106
+
93
107
  def inspect
94
108
  len = length
95
109
  "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.4.1'
2
+ VERSION = '1.6.0'
3
3
  end