character_set 1.6.0-java → 1.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/BENCHMARK.md +32 -32
- data/CHANGELOG.md +15 -1
- data/README.md +1 -1
- data/Rakefile +2 -123
- data/character_set.gemspec +0 -7
- data/ext/character_set/character_set.c +64 -43
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +73 -52
- data/lib/character_set/predefined_sets/emoji.cps +10 -9
- data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
- data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
- data/lib/character_set/ruby_fallback.rb +2 -6
- data/lib/character_set/shared_methods.rb +2 -2
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +19 -28
- data/benchmarks/shared.rb +0 -30
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -10,8 +10,7 @@
|
|
10
10
|
591,5C7
|
11
11
|
5D0,5EA
|
12
12
|
5EF,5F4
|
13
|
-
600,
|
14
|
-
61E,70D
|
13
|
+
600,70D
|
15
14
|
70F,74A
|
16
15
|
74D,7B1
|
17
16
|
7C0,7FA
|
@@ -20,9 +19,9 @@
|
|
20
19
|
840,85B
|
21
20
|
85E,85E
|
22
21
|
860,86A
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
870,88E
|
23
|
+
890,891
|
24
|
+
898,983
|
26
25
|
985,98C
|
27
26
|
98F,990
|
28
27
|
993,9A8
|
@@ -100,11 +99,12 @@ C00,C0C
|
|
100
99
|
C0E,C10
|
101
100
|
C12,C28
|
102
101
|
C2A,C39
|
103
|
-
|
102
|
+
C3C,C44
|
104
103
|
C46,C48
|
105
104
|
C4A,C4D
|
106
105
|
C55,C56
|
107
106
|
C58,C5A
|
107
|
+
C5D,C5D
|
108
108
|
C60,C63
|
109
109
|
C66,C6F
|
110
110
|
C77,C8C
|
@@ -116,7 +116,7 @@ CBC,CC4
|
|
116
116
|
CC6,CC8
|
117
117
|
CCA,CCD
|
118
118
|
CD5,CD6
|
119
|
-
|
119
|
+
CDD,CDE
|
120
120
|
CE0,CE3
|
121
121
|
CE6,CEF
|
122
122
|
CF1,CF2
|
@@ -183,9 +183,8 @@ FCE,FDA
|
|
183
183
|
13F8,13FD
|
184
184
|
1400,169C
|
185
185
|
16A0,16F8
|
186
|
-
1700,
|
187
|
-
|
188
|
-
1720,1736
|
186
|
+
1700,1715
|
187
|
+
171F,1736
|
189
188
|
1740,1753
|
190
189
|
1760,176C
|
191
190
|
176E,1770
|
@@ -193,8 +192,7 @@ FCE,FDA
|
|
193
192
|
1780,17DD
|
194
193
|
17E0,17E9
|
195
194
|
17F0,17F9
|
196
|
-
1800,
|
197
|
-
1810,1819
|
195
|
+
1800,1819
|
198
196
|
1820,1878
|
199
197
|
1880,18AA
|
200
198
|
18B0,18F5
|
@@ -213,9 +211,9 @@ FCE,FDA
|
|
213
211
|
1A7F,1A89
|
214
212
|
1A90,1A99
|
215
213
|
1AA0,1AAD
|
216
|
-
1AB0,
|
217
|
-
1B00,
|
218
|
-
1B50,
|
214
|
+
1AB0,1ACE
|
215
|
+
1B00,1B4C
|
216
|
+
1B50,1B7E
|
219
217
|
1B80,1BF3
|
220
218
|
1BFC,1C37
|
221
219
|
1C3B,1C49
|
@@ -223,8 +221,7 @@ FCE,FDA
|
|
223
221
|
1C90,1CBA
|
224
222
|
1CBD,1CC7
|
225
223
|
1CD0,1CFA
|
226
|
-
1D00,
|
227
|
-
1DFB,1F15
|
224
|
+
1D00,1F15
|
228
225
|
1F18,1F1D
|
229
226
|
1F20,1F45
|
230
227
|
1F48,1F4D
|
@@ -244,16 +241,14 @@ FCE,FDA
|
|
244
241
|
2066,2071
|
245
242
|
2074,208E
|
246
243
|
2090,209C
|
247
|
-
20A0,
|
244
|
+
20A0,20C0
|
248
245
|
20D0,20F0
|
249
246
|
2100,218B
|
250
247
|
2190,2426
|
251
248
|
2440,244A
|
252
249
|
2460,2B73
|
253
250
|
2B76,2B95
|
254
|
-
2B97,
|
255
|
-
2C30,2C5E
|
256
|
-
2C60,2CF3
|
251
|
+
2B97,2CF3
|
257
252
|
2CF9,2D25
|
258
253
|
2D27,2D27
|
259
254
|
2D2D,2D2D
|
@@ -268,7 +263,7 @@ FCE,FDA
|
|
268
263
|
2DC8,2DCE
|
269
264
|
2DD0,2DD6
|
270
265
|
2DD8,2DDE
|
271
|
-
2DE0,
|
266
|
+
2DE0,2E5D
|
272
267
|
2E80,2E99
|
273
268
|
2E9B,2EF3
|
274
269
|
2F00,2FD5
|
@@ -280,14 +275,15 @@ FCE,FDA
|
|
280
275
|
3131,318E
|
281
276
|
3190,31E3
|
282
277
|
31F0,321E
|
283
|
-
3220,
|
284
|
-
A000,A48C
|
278
|
+
3220,A48C
|
285
279
|
A490,A4C6
|
286
280
|
A4D0,A62B
|
287
281
|
A640,A6F7
|
288
|
-
A700,
|
289
|
-
|
290
|
-
|
282
|
+
A700,A7CA
|
283
|
+
A7D0,A7D1
|
284
|
+
A7D3,A7D3
|
285
|
+
A7D5,A7D9
|
286
|
+
A7F2,A82C
|
291
287
|
A830,A839
|
292
288
|
A840,A877
|
293
289
|
A880,A8C5
|
@@ -322,12 +318,11 @@ FB38,FB3C
|
|
322
318
|
FB3E,FB3E
|
323
319
|
FB40,FB41
|
324
320
|
FB43,FB44
|
325
|
-
FB46,
|
326
|
-
FBD3,
|
327
|
-
FD50,FD8F
|
321
|
+
FB46,FBC2
|
322
|
+
FBD3,FD8F
|
328
323
|
FD92,FDC7
|
329
|
-
|
330
|
-
|
324
|
+
FDCF,FDCF
|
325
|
+
FDF0,FE19
|
331
326
|
FE20,FE52
|
332
327
|
FE54,FE66
|
333
328
|
FE68,FE6B
|
@@ -370,10 +365,20 @@ FFF9,FFFD
|
|
370
365
|
104D8,104FB
|
371
366
|
10500,10527
|
372
367
|
10530,10563
|
373
|
-
1056F,
|
368
|
+
1056F,1057A
|
369
|
+
1057C,1058A
|
370
|
+
1058C,10592
|
371
|
+
10594,10595
|
372
|
+
10597,105A1
|
373
|
+
105A3,105B1
|
374
|
+
105B3,105B9
|
375
|
+
105BB,105BC
|
374
376
|
10600,10736
|
375
377
|
10740,10755
|
376
378
|
10760,10767
|
379
|
+
10780,10785
|
380
|
+
10787,107B0
|
381
|
+
107B2,107BA
|
377
382
|
10800,10805
|
378
383
|
10808,10808
|
379
384
|
1080A,10835
|
@@ -417,11 +422,12 @@ FFF9,FFFD
|
|
417
422
|
10EB0,10EB1
|
418
423
|
10F00,10F27
|
419
424
|
10F30,10F59
|
425
|
+
10F70,10F89
|
420
426
|
10FB0,10FCB
|
421
427
|
10FE0,10FF6
|
422
428
|
11000,1104D
|
423
|
-
11052,
|
424
|
-
1107F,
|
429
|
+
11052,11075
|
430
|
+
1107F,110C2
|
425
431
|
110CD,110CD
|
426
432
|
110D0,110E8
|
427
433
|
110F0,110F9
|
@@ -463,11 +469,11 @@ FFF9,FFFD
|
|
463
469
|
11600,11644
|
464
470
|
11650,11659
|
465
471
|
11660,1166C
|
466
|
-
11680,
|
472
|
+
11680,116B9
|
467
473
|
116C0,116C9
|
468
474
|
11700,1171A
|
469
475
|
1171D,1172B
|
470
|
-
11730,
|
476
|
+
11730,11746
|
471
477
|
11800,1183B
|
472
478
|
118A0,118F2
|
473
479
|
118FF,11906
|
@@ -483,7 +489,7 @@ FFF9,FFFD
|
|
483
489
|
119DA,119E4
|
484
490
|
11A00,11A47
|
485
491
|
11A50,11AA2
|
486
|
-
|
492
|
+
11AB0,11AF8
|
487
493
|
11C00,11C08
|
488
494
|
11C0A,11C36
|
489
495
|
11C38,11C45
|
@@ -511,13 +517,15 @@ FFF9,FFFD
|
|
511
517
|
12400,1246E
|
512
518
|
12470,12474
|
513
519
|
12480,12543
|
520
|
+
12F90,12FF2
|
514
521
|
13000,1342E
|
515
522
|
13430,13438
|
516
523
|
14400,14646
|
517
524
|
16800,16A38
|
518
525
|
16A40,16A5E
|
519
526
|
16A60,16A69
|
520
|
-
16A6E,
|
527
|
+
16A6E,16ABE
|
528
|
+
16AC0,16AC9
|
521
529
|
16AD0,16AED
|
522
530
|
16AF0,16AF5
|
523
531
|
16B00,16B45
|
@@ -534,7 +542,10 @@ FFF9,FFFD
|
|
534
542
|
17000,187F7
|
535
543
|
18800,18CD5
|
536
544
|
18D00,18D08
|
537
|
-
|
545
|
+
1AFF0,1AFF3
|
546
|
+
1AFF5,1AFFB
|
547
|
+
1AFFD,1AFFE
|
548
|
+
1B000,1B122
|
538
549
|
1B150,1B152
|
539
550
|
1B164,1B167
|
540
551
|
1B170,1B2FB
|
@@ -543,9 +554,12 @@ FFF9,FFFD
|
|
543
554
|
1BC80,1BC88
|
544
555
|
1BC90,1BC99
|
545
556
|
1BC9C,1BCA3
|
557
|
+
1CF00,1CF2D
|
558
|
+
1CF30,1CF46
|
559
|
+
1CF50,1CFC3
|
546
560
|
1D000,1D0F5
|
547
561
|
1D100,1D126
|
548
|
-
1D129,
|
562
|
+
1D129,1D1EA
|
549
563
|
1D200,1D245
|
550
564
|
1D2E0,1D2F3
|
551
565
|
1D300,1D356
|
@@ -573,6 +587,7 @@ FFF9,FFFD
|
|
573
587
|
1D7CE,1DA8B
|
574
588
|
1DA9B,1DA9F
|
575
589
|
1DAA1,1DAAF
|
590
|
+
1DF00,1DF1E
|
576
591
|
1E000,1E006
|
577
592
|
1E008,1E018
|
578
593
|
1E01B,1E021
|
@@ -582,8 +597,13 @@ FFF9,FFFD
|
|
582
597
|
1E130,1E13D
|
583
598
|
1E140,1E149
|
584
599
|
1E14E,1E14F
|
600
|
+
1E290,1E2AE
|
585
601
|
1E2C0,1E2F9
|
586
602
|
1E2FF,1E2FF
|
603
|
+
1E7E0,1E7E6
|
604
|
+
1E7E8,1E7EB
|
605
|
+
1E7ED,1E7EE
|
606
|
+
1E7F0,1E7FE
|
587
607
|
1E800,1E8C4
|
588
608
|
1E8C7,1E8D6
|
589
609
|
1E900,1E94B
|
@@ -638,33 +658,34 @@ FFF9,FFFD
|
|
638
658
|
1F250,1F251
|
639
659
|
1F260,1F265
|
640
660
|
1F300,1F6D7
|
641
|
-
|
661
|
+
1F6DD,1F6EC
|
642
662
|
1F6F0,1F6FC
|
643
663
|
1F700,1F773
|
644
664
|
1F780,1F7D8
|
645
665
|
1F7E0,1F7EB
|
666
|
+
1F7F0,1F7F0
|
646
667
|
1F800,1F80B
|
647
668
|
1F810,1F847
|
648
669
|
1F850,1F859
|
649
670
|
1F860,1F887
|
650
671
|
1F890,1F8AD
|
651
672
|
1F8B0,1F8B1
|
652
|
-
1F900,
|
653
|
-
1F97A,1F9CB
|
654
|
-
1F9CD,1FA53
|
673
|
+
1F900,1FA53
|
655
674
|
1FA60,1FA6D
|
656
675
|
1FA70,1FA74
|
657
|
-
1FA78,
|
676
|
+
1FA78,1FA7C
|
658
677
|
1FA80,1FA86
|
659
|
-
1FA90,
|
660
|
-
1FAB0,
|
661
|
-
1FAC0,
|
662
|
-
1FAD0,
|
678
|
+
1FA90,1FAAC
|
679
|
+
1FAB0,1FABA
|
680
|
+
1FAC0,1FAC5
|
681
|
+
1FAD0,1FAD9
|
682
|
+
1FAE0,1FAE7
|
683
|
+
1FAF0,1FAF6
|
663
684
|
1FB00,1FB92
|
664
685
|
1FB94,1FBCA
|
665
686
|
1FBF0,1FBF9
|
666
|
-
20000,
|
667
|
-
2A700,
|
687
|
+
20000,2A6DF
|
688
|
+
2A700,2B738
|
668
689
|
2B740,2B81D
|
669
690
|
2B820,2CEA1
|
670
691
|
2CEB0,2EBE0
|
@@ -132,21 +132,22 @@ AE,AE
|
|
132
132
|
1F680,1F6C5
|
133
133
|
1F6CB,1F6D2
|
134
134
|
1F6D5,1F6D7
|
135
|
-
|
135
|
+
1F6DD,1F6E5
|
136
136
|
1F6E9,1F6E9
|
137
137
|
1F6EB,1F6EC
|
138
138
|
1F6F0,1F6F0
|
139
139
|
1F6F3,1F6FC
|
140
140
|
1F7E0,1F7EB
|
141
|
+
1F7F0,1F7F0
|
141
142
|
1F90C,1F93A
|
142
143
|
1F93C,1F945
|
143
|
-
1F947,
|
144
|
-
1F97A,1F9CB
|
145
|
-
1F9CD,1F9FF
|
144
|
+
1F947,1F9FF
|
146
145
|
1FA70,1FA74
|
147
|
-
1FA78,
|
146
|
+
1FA78,1FA7C
|
148
147
|
1FA80,1FA86
|
149
|
-
1FA90,
|
150
|
-
1FAB0,
|
151
|
-
1FAC0,
|
152
|
-
1FAD0,
|
148
|
+
1FA90,1FAAC
|
149
|
+
1FAB0,1FABA
|
150
|
+
1FAC0,1FAC5
|
151
|
+
1FAD0,1FAD9
|
152
|
+
1FAE0,1FAE7
|
153
|
+
1FAF0,1FAF6
|
@@ -8,7 +8,7 @@ class CharacterSet
|
|
8
8
|
|
9
9
|
def of_string(str)
|
10
10
|
raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
|
11
|
-
str.
|
11
|
+
str.encode('utf-8').each_codepoint.with_object(new) { |cp, set| set << cp }
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
@@ -40,16 +40,18 @@ class CharacterSet
|
|
40
40
|
end
|
41
41
|
|
42
42
|
def count_in(string)
|
43
|
-
|
43
|
+
utf8_str!(string).each_codepoint.count { |cp| include?(cp) }
|
44
44
|
end
|
45
45
|
|
46
46
|
def cover?(string)
|
47
|
-
|
47
|
+
utf8_str!(string).each_codepoint { |cp| return false unless include?(cp) }
|
48
48
|
true
|
49
49
|
end
|
50
50
|
|
51
51
|
def delete_in(string)
|
52
|
-
|
52
|
+
utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
|
53
|
+
include?(cp) || (new_str << cp)
|
54
|
+
end.encode(string.encoding)
|
53
55
|
end
|
54
56
|
|
55
57
|
def delete_in!(string)
|
@@ -58,7 +60,9 @@ class CharacterSet
|
|
58
60
|
end
|
59
61
|
|
60
62
|
def keep_in(string)
|
61
|
-
|
63
|
+
utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
|
64
|
+
include?(cp) && (new_str << cp)
|
65
|
+
end.encode(string.encoding)
|
62
66
|
end
|
63
67
|
|
64
68
|
def keep_in!(string)
|
@@ -67,14 +71,13 @@ class CharacterSet
|
|
67
71
|
end
|
68
72
|
|
69
73
|
def scan(string)
|
70
|
-
|
71
|
-
|
72
|
-
include?(cp) ? arr.push(cp.chr(encoding)) : arr
|
74
|
+
utf8_str!(string).each_codepoint.with_object([]) do |cp, arr|
|
75
|
+
arr.push(cp.chr('utf-8')) if include?(cp)
|
73
76
|
end
|
74
77
|
end
|
75
78
|
|
76
79
|
def used_by?(string)
|
77
|
-
|
80
|
+
utf8_str!(string).each_codepoint { |cp| return true if include?(cp) }
|
78
81
|
false
|
79
82
|
end
|
80
83
|
|
@@ -115,15 +118,13 @@ class CharacterSet
|
|
115
118
|
num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
|
116
119
|
end
|
117
120
|
|
118
|
-
def
|
121
|
+
def utf8_str!(obj)
|
119
122
|
raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
|
120
|
-
obj
|
123
|
+
obj.encode('utf-8')
|
121
124
|
end
|
122
125
|
|
123
126
|
def make_new_str(original, &block)
|
124
|
-
|
125
|
-
.each_codepoint
|
126
|
-
.each_with_object(''.encode(original.encoding), &block)
|
127
|
+
utf8_str!(original).each_codepoint.with_object('', &block)
|
127
128
|
end
|
128
129
|
end
|
129
130
|
end
|
@@ -3,7 +3,7 @@ class CharacterSet
|
|
3
3
|
module SetMethods
|
4
4
|
(Enumerable.instance_methods -
|
5
5
|
%i[include? member? to_a] +
|
6
|
-
%i[empty? length size]).each do |mthd|
|
6
|
+
%i[empty? hash length size]).each do |mthd|
|
7
7
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
8
8
|
def #{mthd}(*args, &block)
|
9
9
|
@__set.#{mthd}(*args, &block)
|
@@ -11,8 +11,8 @@ class CharacterSet
|
|
11
11
|
RUBY
|
12
12
|
end
|
13
13
|
|
14
|
-
%i[< <= > >= disjoint?
|
15
|
-
subset? superset?].each do |mthd|
|
14
|
+
%i[< <= > >= === disjoint? include? intersect? member?
|
15
|
+
proper_subset? proper_superset? subset? superset?].each do |mthd|
|
16
16
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
17
17
|
def #{mthd}(enum, &block)
|
18
18
|
if enum.is_a?(CharacterSet) || enum.is_a?(CharacterSet::Pure)
|
@@ -34,15 +34,6 @@ class CharacterSet
|
|
34
34
|
RUBY
|
35
35
|
end
|
36
36
|
|
37
|
-
# revert if https://github.com/knu/sorted_set/issues/2 is resolved
|
38
|
-
%i[=== include? member?].each do |mthd|
|
39
|
-
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
40
|
-
def #{mthd}(*args, &block)
|
41
|
-
!!@__set.#{mthd}(*args, &block)
|
42
|
-
end
|
43
|
-
RUBY
|
44
|
-
end
|
45
|
-
|
46
37
|
%i[& + - ^ | difference intersection union].each do |mthd|
|
47
38
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
48
39
|
def #{mthd}(enum, &block)
|
@@ -83,13 +74,8 @@ class CharacterSet
|
|
83
74
|
|
84
75
|
def eql?(other)
|
85
76
|
return false unless other.is_a?(self.class)
|
86
|
-
# revert if https://github.com/knu/sorted_set/issues/3 is resolved
|
87
|
-
hash == other.hash
|
88
|
-
end
|
89
77
|
|
90
|
-
|
91
|
-
def hash
|
92
|
-
@__set.to_a.hash
|
78
|
+
@__set.eql?(other.instance_variable_get(:@__set))
|
93
79
|
end
|
94
80
|
|
95
81
|
def initialize_dup(orig)
|