character_set 1.6.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +20 -29
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -10,8 +10,7 @@
10
10
  591,5C7
11
11
  5D0,5EA
12
12
  5EF,5F4
13
- 600,61C
14
- 61E,70D
13
+ 600,70D
15
14
  70F,74A
16
15
  74D,7B1
17
16
  7C0,7FA
@@ -20,9 +19,9 @@
20
19
  840,85B
21
20
  85E,85E
22
21
  860,86A
23
- 8A0,8B4
24
- 8B6,8C7
25
- 8D3,983
22
+ 870,88E
23
+ 890,891
24
+ 898,983
26
25
  985,98C
27
26
  98F,990
28
27
  993,9A8
@@ -100,11 +99,12 @@ C00,C0C
100
99
  C0E,C10
101
100
  C12,C28
102
101
  C2A,C39
103
- C3D,C44
102
+ C3C,C44
104
103
  C46,C48
105
104
  C4A,C4D
106
105
  C55,C56
107
106
  C58,C5A
107
+ C5D,C5D
108
108
  C60,C63
109
109
  C66,C6F
110
110
  C77,C8C
@@ -116,7 +116,7 @@ CBC,CC4
116
116
  CC6,CC8
117
117
  CCA,CCD
118
118
  CD5,CD6
119
- CDE,CDE
119
+ CDD,CDE
120
120
  CE0,CE3
121
121
  CE6,CEF
122
122
  CF1,CF2
@@ -183,9 +183,8 @@ FCE,FDA
183
183
  13F8,13FD
184
184
  1400,169C
185
185
  16A0,16F8
186
- 1700,170C
187
- 170E,1714
188
- 1720,1736
186
+ 1700,1715
187
+ 171F,1736
189
188
  1740,1753
190
189
  1760,176C
191
190
  176E,1770
@@ -193,8 +192,7 @@ FCE,FDA
193
192
  1780,17DD
194
193
  17E0,17E9
195
194
  17F0,17F9
196
- 1800,180E
197
- 1810,1819
195
+ 1800,1819
198
196
  1820,1878
199
197
  1880,18AA
200
198
  18B0,18F5
@@ -213,9 +211,9 @@ FCE,FDA
213
211
  1A7F,1A89
214
212
  1A90,1A99
215
213
  1AA0,1AAD
216
- 1AB0,1AC0
217
- 1B00,1B4B
218
- 1B50,1B7C
214
+ 1AB0,1ACE
215
+ 1B00,1B4C
216
+ 1B50,1B7E
219
217
  1B80,1BF3
220
218
  1BFC,1C37
221
219
  1C3B,1C49
@@ -223,8 +221,7 @@ FCE,FDA
223
221
  1C90,1CBA
224
222
  1CBD,1CC7
225
223
  1CD0,1CFA
226
- 1D00,1DF9
227
- 1DFB,1F15
224
+ 1D00,1F15
228
225
  1F18,1F1D
229
226
  1F20,1F45
230
227
  1F48,1F4D
@@ -244,16 +241,14 @@ FCE,FDA
244
241
  2066,2071
245
242
  2074,208E
246
243
  2090,209C
247
- 20A0,20BF
244
+ 20A0,20C0
248
245
  20D0,20F0
249
246
  2100,218B
250
247
  2190,2426
251
248
  2440,244A
252
249
  2460,2B73
253
250
  2B76,2B95
254
- 2B97,2C2E
255
- 2C30,2C5E
256
- 2C60,2CF3
251
+ 2B97,2CF3
257
252
  2CF9,2D25
258
253
  2D27,2D27
259
254
  2D2D,2D2D
@@ -268,7 +263,7 @@ FCE,FDA
268
263
  2DC8,2DCE
269
264
  2DD0,2DD6
270
265
  2DD8,2DDE
271
- 2DE0,2E52
266
+ 2DE0,2E5D
272
267
  2E80,2E99
273
268
  2E9B,2EF3
274
269
  2F00,2FD5
@@ -280,14 +275,15 @@ FCE,FDA
280
275
  3131,318E
281
276
  3190,31E3
282
277
  31F0,321E
283
- 3220,9FFC
284
- A000,A48C
278
+ 3220,A48C
285
279
  A490,A4C6
286
280
  A4D0,A62B
287
281
  A640,A6F7
288
- A700,A7BF
289
- A7C2,A7CA
290
- A7F5,A82C
282
+ A700,A7CA
283
+ A7D0,A7D1
284
+ A7D3,A7D3
285
+ A7D5,A7D9
286
+ A7F2,A82C
291
287
  A830,A839
292
288
  A840,A877
293
289
  A880,A8C5
@@ -322,12 +318,11 @@ FB38,FB3C
322
318
  FB3E,FB3E
323
319
  FB40,FB41
324
320
  FB43,FB44
325
- FB46,FBC1
326
- FBD3,FD3F
327
- FD50,FD8F
321
+ FB46,FBC2
322
+ FBD3,FD8F
328
323
  FD92,FDC7
329
- FDF0,FDFD
330
- FE00,FE19
324
+ FDCF,FDCF
325
+ FDF0,FE19
331
326
  FE20,FE52
332
327
  FE54,FE66
333
328
  FE68,FE6B
@@ -370,10 +365,20 @@ FFF9,FFFD
370
365
  104D8,104FB
371
366
  10500,10527
372
367
  10530,10563
373
- 1056F,1056F
368
+ 1056F,1057A
369
+ 1057C,1058A
370
+ 1058C,10592
371
+ 10594,10595
372
+ 10597,105A1
373
+ 105A3,105B1
374
+ 105B3,105B9
375
+ 105BB,105BC
374
376
  10600,10736
375
377
  10740,10755
376
378
  10760,10767
379
+ 10780,10785
380
+ 10787,107B0
381
+ 107B2,107BA
377
382
  10800,10805
378
383
  10808,10808
379
384
  1080A,10835
@@ -417,11 +422,12 @@ FFF9,FFFD
417
422
  10EB0,10EB1
418
423
  10F00,10F27
419
424
  10F30,10F59
425
+ 10F70,10F89
420
426
  10FB0,10FCB
421
427
  10FE0,10FF6
422
428
  11000,1104D
423
- 11052,1106F
424
- 1107F,110C1
429
+ 11052,11075
430
+ 1107F,110C2
425
431
  110CD,110CD
426
432
  110D0,110E8
427
433
  110F0,110F9
@@ -463,11 +469,11 @@ FFF9,FFFD
463
469
  11600,11644
464
470
  11650,11659
465
471
  11660,1166C
466
- 11680,116B8
472
+ 11680,116B9
467
473
  116C0,116C9
468
474
  11700,1171A
469
475
  1171D,1172B
470
- 11730,1173F
476
+ 11730,11746
471
477
  11800,1183B
472
478
  118A0,118F2
473
479
  118FF,11906
@@ -483,7 +489,7 @@ FFF9,FFFD
483
489
  119DA,119E4
484
490
  11A00,11A47
485
491
  11A50,11AA2
486
- 11AC0,11AF8
492
+ 11AB0,11AF8
487
493
  11C00,11C08
488
494
  11C0A,11C36
489
495
  11C38,11C45
@@ -511,13 +517,15 @@ FFF9,FFFD
511
517
  12400,1246E
512
518
  12470,12474
513
519
  12480,12543
520
+ 12F90,12FF2
514
521
  13000,1342E
515
522
  13430,13438
516
523
  14400,14646
517
524
  16800,16A38
518
525
  16A40,16A5E
519
526
  16A60,16A69
520
- 16A6E,16A6F
527
+ 16A6E,16ABE
528
+ 16AC0,16AC9
521
529
  16AD0,16AED
522
530
  16AF0,16AF5
523
531
  16B00,16B45
@@ -534,7 +542,10 @@ FFF9,FFFD
534
542
  17000,187F7
535
543
  18800,18CD5
536
544
  18D00,18D08
537
- 1B000,1B11E
545
+ 1AFF0,1AFF3
546
+ 1AFF5,1AFFB
547
+ 1AFFD,1AFFE
548
+ 1B000,1B122
538
549
  1B150,1B152
539
550
  1B164,1B167
540
551
  1B170,1B2FB
@@ -543,9 +554,12 @@ FFF9,FFFD
543
554
  1BC80,1BC88
544
555
  1BC90,1BC99
545
556
  1BC9C,1BCA3
557
+ 1CF00,1CF2D
558
+ 1CF30,1CF46
559
+ 1CF50,1CFC3
546
560
  1D000,1D0F5
547
561
  1D100,1D126
548
- 1D129,1D1E8
562
+ 1D129,1D1EA
549
563
  1D200,1D245
550
564
  1D2E0,1D2F3
551
565
  1D300,1D356
@@ -573,6 +587,7 @@ FFF9,FFFD
573
587
  1D7CE,1DA8B
574
588
  1DA9B,1DA9F
575
589
  1DAA1,1DAAF
590
+ 1DF00,1DF1E
576
591
  1E000,1E006
577
592
  1E008,1E018
578
593
  1E01B,1E021
@@ -582,8 +597,13 @@ FFF9,FFFD
582
597
  1E130,1E13D
583
598
  1E140,1E149
584
599
  1E14E,1E14F
600
+ 1E290,1E2AE
585
601
  1E2C0,1E2F9
586
602
  1E2FF,1E2FF
603
+ 1E7E0,1E7E6
604
+ 1E7E8,1E7EB
605
+ 1E7ED,1E7EE
606
+ 1E7F0,1E7FE
587
607
  1E800,1E8C4
588
608
  1E8C7,1E8D6
589
609
  1E900,1E94B
@@ -638,33 +658,34 @@ FFF9,FFFD
638
658
  1F250,1F251
639
659
  1F260,1F265
640
660
  1F300,1F6D7
641
- 1F6E0,1F6EC
661
+ 1F6DD,1F6EC
642
662
  1F6F0,1F6FC
643
663
  1F700,1F773
644
664
  1F780,1F7D8
645
665
  1F7E0,1F7EB
666
+ 1F7F0,1F7F0
646
667
  1F800,1F80B
647
668
  1F810,1F847
648
669
  1F850,1F859
649
670
  1F860,1F887
650
671
  1F890,1F8AD
651
672
  1F8B0,1F8B1
652
- 1F900,1F978
653
- 1F97A,1F9CB
654
- 1F9CD,1FA53
673
+ 1F900,1FA53
655
674
  1FA60,1FA6D
656
675
  1FA70,1FA74
657
- 1FA78,1FA7A
676
+ 1FA78,1FA7C
658
677
  1FA80,1FA86
659
- 1FA90,1FAA8
660
- 1FAB0,1FAB6
661
- 1FAC0,1FAC2
662
- 1FAD0,1FAD6
678
+ 1FA90,1FAAC
679
+ 1FAB0,1FABA
680
+ 1FAC0,1FAC5
681
+ 1FAD0,1FAD9
682
+ 1FAE0,1FAE7
683
+ 1FAF0,1FAF6
663
684
  1FB00,1FB92
664
685
  1FB94,1FBCA
665
686
  1FBF0,1FBF9
666
- 20000,2A6DD
667
- 2A700,2B734
687
+ 20000,2A6DF
688
+ 2A700,2B738
668
689
  2B740,2B81D
669
690
  2B820,2CEA1
670
691
  2CEB0,2EBE0
@@ -132,21 +132,22 @@ AE,AE
132
132
  1F680,1F6C5
133
133
  1F6CB,1F6D2
134
134
  1F6D5,1F6D7
135
- 1F6E0,1F6E5
135
+ 1F6DD,1F6E5
136
136
  1F6E9,1F6E9
137
137
  1F6EB,1F6EC
138
138
  1F6F0,1F6F0
139
139
  1F6F3,1F6FC
140
140
  1F7E0,1F7EB
141
+ 1F7F0,1F7F0
141
142
  1F90C,1F93A
142
143
  1F93C,1F945
143
- 1F947,1F978
144
- 1F97A,1F9CB
145
- 1F9CD,1F9FF
144
+ 1F947,1F9FF
146
145
  1FA70,1FA74
147
- 1FA78,1FA7A
146
+ 1FA78,1FA7C
148
147
  1FA80,1FA86
149
- 1FA90,1FAA8
150
- 1FAB0,1FAB6
151
- 1FAC0,1FAC2
152
- 1FAD0,1FAD6
148
+ 1FA90,1FAAC
149
+ 1FAB0,1FABA
150
+ 1FAC0,1FAC5
151
+ 1FAD0,1FAD9
152
+ 1FAE0,1FAE7
153
+ 1FAF0,1FAF6
@@ -8,7 +8,7 @@ class CharacterSet
8
8
 
9
9
  def of_string(str)
10
10
  raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
11
- str.codepoints.each_with_object(new) { |cp, set| set << cp }
11
+ str.encode('utf-8').each_codepoint.with_object(new) { |cp, set| set << cp }
12
12
  end
13
13
  end
14
14
 
@@ -40,16 +40,18 @@ class CharacterSet
40
40
  end
41
41
 
42
42
  def count_in(string)
43
- str!(string).each_codepoint.count { |cp| include?(cp) }
43
+ utf8_str!(string).each_codepoint.count { |cp| include?(cp) }
44
44
  end
45
45
 
46
46
  def cover?(string)
47
- str!(string).each_codepoint { |cp| return false unless include?(cp) }
47
+ utf8_str!(string).each_codepoint { |cp| return false unless include?(cp) }
48
48
  true
49
49
  end
50
50
 
51
51
  def delete_in(string)
52
- make_new_str(string) { |cp, new_str| include?(cp) || (new_str << cp) }
52
+ utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
53
+ include?(cp) || (new_str << cp)
54
+ end.encode(string.encoding)
53
55
  end
54
56
 
55
57
  def delete_in!(string)
@@ -58,7 +60,9 @@ class CharacterSet
58
60
  end
59
61
 
60
62
  def keep_in(string)
61
- make_new_str(string) { |cp, new_str| include?(cp) && (new_str << cp) }
63
+ utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
64
+ include?(cp) && (new_str << cp)
65
+ end.encode(string.encoding)
62
66
  end
63
67
 
64
68
  def keep_in!(string)
@@ -67,14 +71,13 @@ class CharacterSet
67
71
  end
68
72
 
69
73
  def scan(string)
70
- encoding = str!(string).encoding
71
- string.each_codepoint.inject([]) do |arr, cp|
72
- include?(cp) ? arr.push(cp.chr(encoding)) : arr
74
+ utf8_str!(string).each_codepoint.with_object([]) do |cp, arr|
75
+ arr.push(cp.chr('utf-8')) if include?(cp)
73
76
  end
74
77
  end
75
78
 
76
79
  def used_by?(string)
77
- str!(string).each_codepoint { |cp| return true if include?(cp) }
80
+ utf8_str!(string).each_codepoint { |cp| return true if include?(cp) }
78
81
  false
79
82
  end
80
83
 
@@ -115,15 +118,13 @@ class CharacterSet
115
118
  num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
116
119
  end
117
120
 
118
- def str!(obj)
121
+ def utf8_str!(obj)
119
122
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
120
- obj
123
+ obj.encode('utf-8')
121
124
  end
122
125
 
123
126
  def make_new_str(original, &block)
124
- str!(original)
125
- .each_codepoint
126
- .each_with_object(''.encode(original.encoding), &block)
127
+ utf8_str!(original).each_codepoint.with_object('', &block)
127
128
  end
128
129
  end
129
130
  end
@@ -3,7 +3,7 @@ class CharacterSet
3
3
  module SetMethods
4
4
  (Enumerable.instance_methods -
5
5
  %i[include? member? to_a] +
6
- %i[empty? length size]).each do |mthd|
6
+ %i[empty? hash length size]).each do |mthd|
7
7
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
8
8
  def #{mthd}(*args, &block)
9
9
  @__set.#{mthd}(*args, &block)
@@ -11,8 +11,8 @@ class CharacterSet
11
11
  RUBY
12
12
  end
13
13
 
14
- %i[< <= > >= disjoint? intersect? proper_subset? proper_superset?
15
- subset? superset?].each do |mthd|
14
+ %i[< <= > >= === disjoint? include? intersect? member?
15
+ proper_subset? proper_superset? subset? superset?].each do |mthd|
16
16
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
17
17
  def #{mthd}(enum, &block)
18
18
  if enum.is_a?(CharacterSet) || enum.is_a?(CharacterSet::Pure)
@@ -34,15 +34,6 @@ class CharacterSet
34
34
  RUBY
35
35
  end
36
36
 
37
- # revert if https://github.com/knu/sorted_set/issues/2 is resolved
38
- %i[=== include? member?].each do |mthd|
39
- class_eval <<-RUBY, __FILE__, __LINE__ + 1
40
- def #{mthd}(*args, &block)
41
- !!@__set.#{mthd}(*args, &block)
42
- end
43
- RUBY
44
- end
45
-
46
37
  %i[& + - ^ | difference intersection union].each do |mthd|
47
38
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
48
39
  def #{mthd}(enum, &block)
@@ -83,13 +74,8 @@ class CharacterSet
83
74
 
84
75
  def eql?(other)
85
76
  return false unless other.is_a?(self.class)
86
- # revert if https://github.com/knu/sorted_set/issues/3 is resolved
87
- hash == other.hash
88
- end
89
77
 
90
- # revert if https://github.com/knu/sorted_set/issues/3 is resolved
91
- def hash
92
- @__set.to_a.hash
78
+ @__set.eql?(other.instance_variable_get(:@__set))
93
79
  end
94
80
 
95
81
  def initialize_dup(orig)