character_set 1.3.0-java → 1.6.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +26 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +17 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +50 -1
- data/Gemfile +14 -0
- data/README.md +35 -9
- data/Rakefile +6 -3
- data/benchmarks/delete_in.rb +5 -1
- data/benchmarks/keep_in.rb +5 -1
- data/benchmarks/shared.rb +5 -1
- data/character_set.gemspec +6 -9
- data/ext/character_set/character_set.c +61 -93
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +25 -24
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets/assigned.cps +51 -40
- data/lib/character_set/predefined_sets/emoji.cps +12 -11
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +5 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +23 -15
- data/lib/character_set/ruby_fallback.rb +5 -1
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +24 -10
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +14 -122
- data/.travis.yml +0 -9
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
85E,85E
|
|
22
22
|
860,86A
|
|
23
23
|
8A0,8B4
|
|
24
|
-
8B6,
|
|
24
|
+
8B6,8C7
|
|
25
25
|
8D3,983
|
|
26
26
|
985,98C
|
|
27
27
|
98F,990
|
|
@@ -76,7 +76,7 @@ B35,B39
|
|
|
76
76
|
B3C,B44
|
|
77
77
|
B47,B48
|
|
78
78
|
B4B,B4D
|
|
79
|
-
|
|
79
|
+
B55,B57
|
|
80
80
|
B5C,B5D
|
|
81
81
|
B5F,B63
|
|
82
82
|
B66,B77
|
|
@@ -120,15 +120,14 @@ CDE,CDE
|
|
|
120
120
|
CE0,CE3
|
|
121
121
|
CE6,CEF
|
|
122
122
|
CF1,CF2
|
|
123
|
-
D00,
|
|
124
|
-
D05,D0C
|
|
123
|
+
D00,D0C
|
|
125
124
|
D0E,D10
|
|
126
125
|
D12,D44
|
|
127
126
|
D46,D48
|
|
128
127
|
D4A,D4F
|
|
129
128
|
D54,D63
|
|
130
129
|
D66,D7F
|
|
131
|
-
|
|
130
|
+
D81,D83
|
|
132
131
|
D85,D96
|
|
133
132
|
D9A,DB1
|
|
134
133
|
DB3,DBB
|
|
@@ -214,7 +213,7 @@ FCE,FDA
|
|
|
214
213
|
1A7F,1A89
|
|
215
214
|
1A90,1A99
|
|
216
215
|
1AA0,1AAD
|
|
217
|
-
1AB0,
|
|
216
|
+
1AB0,1AC0
|
|
218
217
|
1B00,1B4B
|
|
219
218
|
1B50,1B7C
|
|
220
219
|
1B80,1BF3
|
|
@@ -252,7 +251,7 @@ FCE,FDA
|
|
|
252
251
|
2440,244A
|
|
253
252
|
2460,2B73
|
|
254
253
|
2B76,2B95
|
|
255
|
-
|
|
254
|
+
2B97,2C2E
|
|
256
255
|
2C30,2C5E
|
|
257
256
|
2C60,2CF3
|
|
258
257
|
2CF9,2D25
|
|
@@ -269,7 +268,7 @@ FCE,FDA
|
|
|
269
268
|
2DC8,2DCE
|
|
270
269
|
2DD0,2DD6
|
|
271
270
|
2DD8,2DDE
|
|
272
|
-
2DE0,
|
|
271
|
+
2DE0,2E52
|
|
273
272
|
2E80,2E99
|
|
274
273
|
2E9B,2EF3
|
|
275
274
|
2F00,2FD5
|
|
@@ -279,18 +278,16 @@ FCE,FDA
|
|
|
279
278
|
3099,30FF
|
|
280
279
|
3105,312F
|
|
281
280
|
3131,318E
|
|
282
|
-
3190,
|
|
283
|
-
31C0,31E3
|
|
281
|
+
3190,31E3
|
|
284
282
|
31F0,321E
|
|
285
|
-
3220,
|
|
286
|
-
4DC0,9FEF
|
|
283
|
+
3220,9FFC
|
|
287
284
|
A000,A48C
|
|
288
285
|
A490,A4C6
|
|
289
286
|
A4D0,A62B
|
|
290
287
|
A640,A6F7
|
|
291
288
|
A700,A7BF
|
|
292
|
-
A7C2,
|
|
293
|
-
|
|
289
|
+
A7C2,A7CA
|
|
290
|
+
A7F5,A82C
|
|
294
291
|
A830,A839
|
|
295
292
|
A840,A877
|
|
296
293
|
A880,A8C5
|
|
@@ -310,7 +307,7 @@ AB09,AB0E
|
|
|
310
307
|
AB11,AB16
|
|
311
308
|
AB20,AB26
|
|
312
309
|
AB28,AB2E
|
|
313
|
-
AB30,
|
|
310
|
+
AB30,AB6B
|
|
314
311
|
AB70,ABED
|
|
315
312
|
ABF0,ABF9
|
|
316
313
|
AC00,D7A3
|
|
@@ -355,7 +352,7 @@ FFF9,FFFD
|
|
|
355
352
|
10100,10102
|
|
356
353
|
10107,10133
|
|
357
354
|
10137,1018E
|
|
358
|
-
10190,
|
|
355
|
+
10190,1019C
|
|
359
356
|
101A0,101A0
|
|
360
357
|
101D0,101FD
|
|
361
358
|
10280,1029C
|
|
@@ -415,8 +412,12 @@ FFF9,FFFD
|
|
|
415
412
|
10CFA,10D27
|
|
416
413
|
10D30,10D39
|
|
417
414
|
10E60,10E7E
|
|
415
|
+
10E80,10EA9
|
|
416
|
+
10EAB,10EAD
|
|
417
|
+
10EB0,10EB1
|
|
418
418
|
10F00,10F27
|
|
419
419
|
10F30,10F59
|
|
420
|
+
10FB0,10FCB
|
|
420
421
|
10FE0,10FF6
|
|
421
422
|
11000,1104D
|
|
422
423
|
11052,1106F
|
|
@@ -425,10 +426,9 @@ FFF9,FFFD
|
|
|
425
426
|
110D0,110E8
|
|
426
427
|
110F0,110F9
|
|
427
428
|
11100,11134
|
|
428
|
-
11136,
|
|
429
|
+
11136,11147
|
|
429
430
|
11150,11176
|
|
430
|
-
11180,
|
|
431
|
-
111D0,111DF
|
|
431
|
+
11180,111DF
|
|
432
432
|
111E1,111F4
|
|
433
433
|
11200,11211
|
|
434
434
|
11213,1123E
|
|
@@ -454,9 +454,8 @@ FFF9,FFFD
|
|
|
454
454
|
1135D,11363
|
|
455
455
|
11366,1136C
|
|
456
456
|
11370,11374
|
|
457
|
-
11400,
|
|
458
|
-
|
|
459
|
-
1145D,1145F
|
|
457
|
+
11400,1145B
|
|
458
|
+
1145D,11461
|
|
460
459
|
11480,114C7
|
|
461
460
|
114D0,114D9
|
|
462
461
|
11580,115B5
|
|
@@ -471,7 +470,14 @@ FFF9,FFFD
|
|
|
471
470
|
11730,1173F
|
|
472
471
|
11800,1183B
|
|
473
472
|
118A0,118F2
|
|
474
|
-
118FF,
|
|
473
|
+
118FF,11906
|
|
474
|
+
11909,11909
|
|
475
|
+
1190C,11913
|
|
476
|
+
11915,11916
|
|
477
|
+
11918,11935
|
|
478
|
+
11937,11938
|
|
479
|
+
1193B,11946
|
|
480
|
+
11950,11959
|
|
475
481
|
119A0,119A7
|
|
476
482
|
119AA,119D7
|
|
477
483
|
119DA,119E4
|
|
@@ -499,6 +505,7 @@ FFF9,FFFD
|
|
|
499
505
|
11D93,11D98
|
|
500
506
|
11DA0,11DA9
|
|
501
507
|
11EE0,11EF8
|
|
508
|
+
11FB0,11FB0
|
|
502
509
|
11FC0,11FF1
|
|
503
510
|
11FFF,12399
|
|
504
511
|
12400,1246E
|
|
@@ -522,9 +529,11 @@ FFF9,FFFD
|
|
|
522
529
|
16F00,16F4A
|
|
523
530
|
16F4F,16F87
|
|
524
531
|
16F8F,16F9F
|
|
525
|
-
16FE0,
|
|
532
|
+
16FE0,16FE4
|
|
533
|
+
16FF0,16FF1
|
|
526
534
|
17000,187F7
|
|
527
|
-
18800,
|
|
535
|
+
18800,18CD5
|
|
536
|
+
18D00,18D08
|
|
528
537
|
1B000,1B11E
|
|
529
538
|
1B150,1B152
|
|
530
539
|
1B164,1B167
|
|
@@ -622,17 +631,15 @@ FFF9,FFFD
|
|
|
622
631
|
1F0B1,1F0BF
|
|
623
632
|
1F0C1,1F0CF
|
|
624
633
|
1F0D1,1F0F5
|
|
625
|
-
1F100,
|
|
626
|
-
1F110,1F16C
|
|
627
|
-
1F170,1F1AC
|
|
634
|
+
1F100,1F1AD
|
|
628
635
|
1F1E6,1F202
|
|
629
636
|
1F210,1F23B
|
|
630
637
|
1F240,1F248
|
|
631
638
|
1F250,1F251
|
|
632
639
|
1F260,1F265
|
|
633
|
-
1F300,
|
|
640
|
+
1F300,1F6D7
|
|
634
641
|
1F6E0,1F6EC
|
|
635
|
-
1F6F0,
|
|
642
|
+
1F6F0,1F6FC
|
|
636
643
|
1F700,1F773
|
|
637
644
|
1F780,1F7D8
|
|
638
645
|
1F7E0,1F7EB
|
|
@@ -641,24 +648,28 @@ FFF9,FFFD
|
|
|
641
648
|
1F850,1F859
|
|
642
649
|
1F860,1F887
|
|
643
650
|
1F890,1F8AD
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
1F97A,1F9A2
|
|
648
|
-
1F9A5,1F9AA
|
|
649
|
-
1F9AE,1F9CA
|
|
651
|
+
1F8B0,1F8B1
|
|
652
|
+
1F900,1F978
|
|
653
|
+
1F97A,1F9CB
|
|
650
654
|
1F9CD,1FA53
|
|
651
655
|
1FA60,1FA6D
|
|
652
|
-
1FA70,
|
|
656
|
+
1FA70,1FA74
|
|
653
657
|
1FA78,1FA7A
|
|
654
|
-
1FA80,
|
|
655
|
-
1FA90,
|
|
656
|
-
|
|
658
|
+
1FA80,1FA86
|
|
659
|
+
1FA90,1FAA8
|
|
660
|
+
1FAB0,1FAB6
|
|
661
|
+
1FAC0,1FAC2
|
|
662
|
+
1FAD0,1FAD6
|
|
663
|
+
1FB00,1FB92
|
|
664
|
+
1FB94,1FBCA
|
|
665
|
+
1FBF0,1FBF9
|
|
666
|
+
20000,2A6DD
|
|
657
667
|
2A700,2B734
|
|
658
668
|
2B740,2B81D
|
|
659
669
|
2B820,2CEA1
|
|
660
670
|
2CEB0,2EBE0
|
|
661
671
|
2F800,2FA1D
|
|
672
|
+
30000,3134A
|
|
662
673
|
E0001,E0001
|
|
663
674
|
E0020,E007F
|
|
664
675
|
E0100,E01EF
|
|
@@ -44,6 +44,7 @@ AE,AE
|
|
|
44
44
|
2699,2699
|
|
45
45
|
269B,269C
|
|
46
46
|
26A0,26A1
|
|
47
|
+
26A7,26A7
|
|
47
48
|
26AA,26AB
|
|
48
49
|
26B0,26B1
|
|
49
50
|
26BD,26BE
|
|
@@ -130,22 +131,22 @@ AE,AE
|
|
|
130
131
|
1F5FA,1F64F
|
|
131
132
|
1F680,1F6C5
|
|
132
133
|
1F6CB,1F6D2
|
|
133
|
-
1F6D5,
|
|
134
|
+
1F6D5,1F6D7
|
|
134
135
|
1F6E0,1F6E5
|
|
135
136
|
1F6E9,1F6E9
|
|
136
137
|
1F6EB,1F6EC
|
|
137
138
|
1F6F0,1F6F0
|
|
138
|
-
1F6F3,
|
|
139
|
+
1F6F3,1F6FC
|
|
139
140
|
1F7E0,1F7EB
|
|
140
|
-
|
|
141
|
+
1F90C,1F93A
|
|
141
142
|
1F93C,1F945
|
|
142
|
-
1F947,
|
|
143
|
-
|
|
144
|
-
1F97A,1F9A2
|
|
145
|
-
1F9A5,1F9AA
|
|
146
|
-
1F9AE,1F9CA
|
|
143
|
+
1F947,1F978
|
|
144
|
+
1F97A,1F9CB
|
|
147
145
|
1F9CD,1F9FF
|
|
148
|
-
1FA70,
|
|
146
|
+
1FA70,1FA74
|
|
149
147
|
1FA78,1FA7A
|
|
150
|
-
1FA80,
|
|
151
|
-
1FA90,
|
|
148
|
+
1FA80,1FA86
|
|
149
|
+
1FA90,1FAA8
|
|
150
|
+
1FAB0,1FAB6
|
|
151
|
+
1FAC0,1FAC2
|
|
152
|
+
1FAD0,1FAD6
|
|
@@ -22,6 +22,17 @@ class CharacterSet
|
|
|
22
22
|
alias valid unicode
|
|
23
23
|
|
|
24
24
|
def build_from_cps_file(path)
|
|
25
|
+
if defined?(Ractor) && Ractor.current != Ractor.main
|
|
26
|
+
raise <<-EOS.gsub(/^ */, '')
|
|
27
|
+
CharacterSet's predefined sets are lazy-loaded.
|
|
28
|
+
Pre-load them to use them in Ractors. E.g.:
|
|
29
|
+
|
|
30
|
+
CharacterSet.ascii # pre-load
|
|
31
|
+
Ractor.new { CharacterSet.ascii.size }.take # => 128
|
|
32
|
+
Ractor.new { 'abc'.keep_character_set(:ascii) }.take # => 'abc'
|
|
33
|
+
EOS
|
|
34
|
+
end
|
|
35
|
+
|
|
25
36
|
File.readlines(path).inject(new) do |set, line|
|
|
26
37
|
range_start, range_end = line.split(',')
|
|
27
38
|
set.merge((range_start.to_i(16))..(range_end.to_i(16)))
|
|
@@ -6,9 +6,9 @@ class CharacterSet
|
|
|
6
6
|
new(Array(ranges).flat_map(&:to_a))
|
|
7
7
|
end
|
|
8
8
|
|
|
9
|
-
def
|
|
10
|
-
raise ArgumentError, 'pass a String' unless
|
|
11
|
-
|
|
9
|
+
def of_string(str)
|
|
10
|
+
raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
|
|
11
|
+
str.codepoints.each_with_object(new) { |cp, set| set << cp }
|
|
12
12
|
end
|
|
13
13
|
end
|
|
14
14
|
|
|
@@ -31,7 +31,7 @@ class CharacterSet
|
|
|
31
31
|
end
|
|
32
32
|
|
|
33
33
|
def ranges
|
|
34
|
-
CharacterSet.require_optional_dependency('range_compressor')
|
|
34
|
+
CharacterSet.require_optional_dependency('range_compressor', __method__)
|
|
35
35
|
RangeCompressor.compress(self)
|
|
36
36
|
end
|
|
37
37
|
|
|
@@ -121,10 +121,9 @@ class CharacterSet
|
|
|
121
121
|
end
|
|
122
122
|
|
|
123
123
|
def make_new_str(original, &block)
|
|
124
|
-
|
|
124
|
+
str!(original)
|
|
125
125
|
.each_codepoint
|
|
126
126
|
.each_with_object(''.encode(original.encoding), &block)
|
|
127
|
-
original.tainted? ? new_string.taint : new_string
|
|
128
127
|
end
|
|
129
128
|
end
|
|
130
129
|
end
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
class CharacterSet
|
|
2
2
|
module RubyFallback
|
|
3
3
|
module SetMethods
|
|
4
|
-
Enumerable.instance_methods
|
|
4
|
+
(Enumerable.instance_methods -
|
|
5
|
+
%i[include? member? to_a] +
|
|
6
|
+
%i[empty? length size]).each do |mthd|
|
|
5
7
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
6
8
|
def #{mthd}(*args, &block)
|
|
7
9
|
@__set.#{mthd}(*args, &block)
|
|
@@ -9,7 +11,7 @@ class CharacterSet
|
|
|
9
11
|
RUBY
|
|
10
12
|
end
|
|
11
13
|
|
|
12
|
-
%
|
|
14
|
+
%i[< <= > >= disjoint? intersect? proper_subset? proper_superset?
|
|
13
15
|
subset? superset?].each do |mthd|
|
|
14
16
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
15
17
|
def #{mthd}(enum, &block)
|
|
@@ -21,8 +23,8 @@ class CharacterSet
|
|
|
21
23
|
RUBY
|
|
22
24
|
end
|
|
23
25
|
|
|
24
|
-
%
|
|
25
|
-
each filter!
|
|
26
|
+
%i[<< add add? clear collect! delete delete? delete_if
|
|
27
|
+
each filter! map! keep_if reject!
|
|
26
28
|
select! subtract].each do |mthd|
|
|
27
29
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
28
30
|
def #{mthd}(*args, &block)
|
|
@@ -32,22 +34,22 @@ class CharacterSet
|
|
|
32
34
|
RUBY
|
|
33
35
|
end
|
|
34
36
|
|
|
35
|
-
|
|
37
|
+
# revert if https://github.com/knu/sorted_set/issues/2 is resolved
|
|
38
|
+
%i[=== include? member?].each do |mthd|
|
|
36
39
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
37
|
-
def #{mthd}(
|
|
38
|
-
|
|
39
|
-
enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
|
|
40
|
-
end
|
|
41
|
-
self.class.new(@__set.#{mthd}(enum, &block).to_a)
|
|
40
|
+
def #{mthd}(*args, &block)
|
|
41
|
+
!!@__set.#{mthd}(*args, &block)
|
|
42
42
|
end
|
|
43
43
|
RUBY
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
-
%
|
|
46
|
+
%i[& + - ^ | difference intersection union].each do |mthd|
|
|
47
47
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
48
|
-
def #{mthd}
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
def #{mthd}(enum, &block)
|
|
49
|
+
if enum.respond_to?(:map)
|
|
50
|
+
enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
|
|
51
|
+
end
|
|
52
|
+
self.class.new(@__set.#{mthd}(enum, &block).to_a)
|
|
51
53
|
end
|
|
52
54
|
RUBY
|
|
53
55
|
end
|
|
@@ -81,7 +83,13 @@ class CharacterSet
|
|
|
81
83
|
|
|
82
84
|
def eql?(other)
|
|
83
85
|
return false unless other.is_a?(self.class)
|
|
84
|
-
|
|
86
|
+
# revert if https://github.com/knu/sorted_set/issues/3 is resolved
|
|
87
|
+
hash == other.hash
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# revert if https://github.com/knu/sorted_set/issues/3 is resolved
|
|
91
|
+
def hash
|
|
92
|
+
@__set.to_a.hash
|
|
85
93
|
end
|
|
86
94
|
|
|
87
95
|
def initialize_dup(orig)
|
|
@@ -22,13 +22,14 @@ class CharacterSet
|
|
|
22
22
|
|
|
23
23
|
# Allow some methods to take an Enum just as well as another CharacterSet.
|
|
24
24
|
# Tested by ruby-spec.
|
|
25
|
-
%w[& + - ^ | difference
|
|
25
|
+
%w[& + - ^ | difference disjoint? intersect? intersection
|
|
26
|
+
subtract union].each do |method|
|
|
26
27
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
|
27
28
|
def #{method}(arg)
|
|
28
29
|
if arg.is_a?(CharacterSet)
|
|
29
|
-
super
|
|
30
|
+
super(arg)
|
|
30
31
|
elsif arg.respond_to?(:each)
|
|
31
|
-
super(
|
|
32
|
+
super(self.class.new(arg.to_a))
|
|
32
33
|
else
|
|
33
34
|
raise ArgumentError, 'pass an enumerable'
|
|
34
35
|
end
|
|
@@ -15,6 +15,12 @@ class CharacterSet
|
|
|
15
15
|
new(Array(args))
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
+
def of(*args)
|
|
19
|
+
args.map do |arg|
|
|
20
|
+
arg.is_a?(Regexp) ? of_regexp(arg) : of_string(arg)
|
|
21
|
+
end.reduce(:merge) || new
|
|
22
|
+
end
|
|
23
|
+
|
|
18
24
|
def parse(string)
|
|
19
25
|
codepoints = Parser.codepoints_from_bracket_expression(string)
|
|
20
26
|
result = new(codepoints)
|
|
@@ -22,33 +28,29 @@ class CharacterSet
|
|
|
22
28
|
end
|
|
23
29
|
|
|
24
30
|
def of_property(property_name)
|
|
25
|
-
require_optional_dependency('regexp_property_values')
|
|
31
|
+
require_optional_dependency('regexp_property_values', __method__)
|
|
26
32
|
|
|
27
33
|
property = RegexpPropertyValues[property_name.to_s]
|
|
28
34
|
from_ranges(*property.matched_ranges)
|
|
29
35
|
end
|
|
30
36
|
|
|
31
37
|
def of_regexp(regexp)
|
|
32
|
-
require_optional_dependency('regexp_parser')
|
|
38
|
+
require_optional_dependency('regexp_parser', __method__)
|
|
33
39
|
|
|
34
40
|
root = ::Regexp::Parser.parse(regexp)
|
|
35
41
|
of_expression(root)
|
|
36
42
|
end
|
|
37
43
|
|
|
38
44
|
def of_expression(expression)
|
|
39
|
-
ExpressionConverter.convert(expression)
|
|
45
|
+
ExpressionConverter.convert(expression, self)
|
|
40
46
|
end
|
|
41
47
|
|
|
42
|
-
def require_optional_dependency(name)
|
|
48
|
+
def require_optional_dependency(name, method)
|
|
43
49
|
required_optional_dependencies[name] ||= begin
|
|
44
50
|
require name
|
|
45
51
|
true
|
|
46
52
|
rescue ::LoadError
|
|
47
|
-
|
|
48
|
-
loc.absolute_path.to_s.include?('/lib/character_set')
|
|
49
|
-
end
|
|
50
|
-
method = entry_point && entry_point.label
|
|
51
|
-
raise LoadError, 'You must the install the optional dependency '\
|
|
53
|
+
raise LoadError, 'You must install the optional dependency '\
|
|
52
54
|
"'\#{name}' to use the method `\#{method}'."
|
|
53
55
|
end
|
|
54
56
|
end
|
|
@@ -86,10 +88,22 @@ class CharacterSet
|
|
|
86
88
|
Writer.write(ranges, opts, &block)
|
|
87
89
|
end
|
|
88
90
|
|
|
91
|
+
def to_s_with_surrogate_ranges
|
|
92
|
+
Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
|
|
93
|
+
end
|
|
94
|
+
|
|
89
95
|
def to_s_with_surrogate_alternation
|
|
90
96
|
Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
|
|
91
97
|
end
|
|
92
98
|
|
|
99
|
+
def secure_token(length = 32)
|
|
100
|
+
CharacterSet.require_optional_dependency('securerandom', __method__)
|
|
101
|
+
cps = to_a
|
|
102
|
+
len = cps.count
|
|
103
|
+
1.upto(length).map { cps[SecureRandom.random_number(len)] }.pack('U*')
|
|
104
|
+
end
|
|
105
|
+
alias random_token secure_token
|
|
106
|
+
|
|
93
107
|
def inspect
|
|
94
108
|
len = length
|
|
95
109
|
"#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
|
|
@@ -151,7 +165,7 @@ class CharacterSet
|
|
|
151
165
|
end
|
|
152
166
|
|
|
153
167
|
def divide(&func)
|
|
154
|
-
|
|
168
|
+
CharacterSet.require_optional_dependency('set', __method__)
|
|
155
169
|
Set.new(to_a).divide(&func)
|
|
156
170
|
end
|
|
157
171
|
RUBY
|
data/lib/character_set/writer.rb
CHANGED
|
@@ -1,37 +1,108 @@
|
|
|
1
1
|
class CharacterSet
|
|
2
2
|
module Writer
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
3
|
+
class << self
|
|
4
|
+
def write(codepoint_ranges, opts = {}, &block)
|
|
5
|
+
content = codepoint_ranges.map do |range|
|
|
6
|
+
if range.size > 2 && opts[:abbreviate] != false
|
|
7
|
+
bounds = [range.min, range.max]
|
|
8
|
+
bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
|
|
9
|
+
else
|
|
10
|
+
range.map { |cp| write_codepoint(cp, opts, &block) }.join
|
|
11
|
+
end
|
|
12
|
+
end.join
|
|
13
|
+
opts[:in_brackets] ? "[#{content}]" : content
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def write_codepoint(codepoint, opts = {}, &block)
|
|
17
|
+
Character.new(codepoint).escape(opts, &block)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def write_surrogate_ranges(bmp_ranges, astral_ranges)
|
|
21
|
+
astral_branches = surrogate_range_expressions(astral_ranges)
|
|
22
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def write_surrogate_alternation(bmp_ranges, astral_ranges)
|
|
26
|
+
astral_branches = surrogate_pairs(astral_ranges)
|
|
27
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def surrogate_range_expressions(astral_ranges)
|
|
33
|
+
compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
|
|
34
|
+
[hi_ranges, lo_ranges].map do |ranges|
|
|
35
|
+
use_brackets = ranges.size > 1 || ranges.first.size > 1
|
|
36
|
+
write(ranges, format: :js, in_brackets: use_brackets)
|
|
37
|
+
end.join
|
|
11
38
|
end
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def compressed_surrogate_range_pairs(astral_ranges)
|
|
42
|
+
halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
|
|
15
43
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
44
|
+
# compress high surrogate codepoint ranges with common low range half
|
|
45
|
+
with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
|
|
46
|
+
hi_ranges = pairs.map(&:first)
|
|
47
|
+
compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
|
|
48
|
+
prev = arr.last
|
|
49
|
+
if prev.nil? || prev.max + 1 < range.min # first or gap
|
|
50
|
+
arr << range
|
|
51
|
+
else # continuous codepoints, expand previous range
|
|
52
|
+
arr[-1] = (prev.min)..(range.max)
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
[compressed_hi_ranges, lo_range]
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# compress low surrogate codepoint ranges with common high ranges
|
|
59
|
+
with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
|
|
60
|
+
(hash[hi_ranges] ||= []) << lo_range
|
|
61
|
+
end
|
|
23
62
|
end
|
|
24
|
-
end
|
|
25
63
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
64
|
+
def surrogate_half_ranges(astral_range)
|
|
65
|
+
hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
|
|
66
|
+
hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
|
|
67
|
+
hi_count = 1 + hi_max - hi_min
|
|
68
|
+
return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
|
|
69
|
+
|
|
70
|
+
ranges = []
|
|
71
|
+
|
|
72
|
+
# first high surrogate might be partially covered (if lo_min > 0xDC00)
|
|
73
|
+
ranges << [hi_min..hi_min, lo_min..0xDFFF]
|
|
74
|
+
|
|
75
|
+
# any high surrogates in between are fully covered
|
|
76
|
+
ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
|
|
29
77
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
78
|
+
# last high surrogate might be partially covered (if lo_max < 0xDFFF)
|
|
79
|
+
ranges << [hi_max..hi_max, 0xDC00..lo_max]
|
|
80
|
+
|
|
81
|
+
ranges
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def surrogate_pair_codepoints(astral_codepoint)
|
|
85
|
+
base = astral_codepoint - 0x10000
|
|
86
|
+
high = base / 1024 + 0xD800
|
|
87
|
+
low = base % 1024 + 0xDC00
|
|
88
|
+
[high, low]
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def bmp_set_with_alternatives(bmp_ranges, alternatives)
|
|
92
|
+
bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
|
|
93
|
+
return bmp_set if alternatives.empty? && bmp_ranges.any?
|
|
94
|
+
|
|
95
|
+
"(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def surrogate_pairs(astral_ranges)
|
|
99
|
+
astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def surrogate_pair(astral_codepoint)
|
|
103
|
+
surrogate_pair_codepoints(astral_codepoint)
|
|
104
|
+
.map { |half| write_codepoint(half, format: :js) }.join
|
|
105
|
+
end
|
|
35
106
|
end
|
|
36
107
|
end
|
|
37
108
|
end
|