character_set 1.6.0-java → 1.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +19 -28
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -10,8 +10,7 @@
10
10
  591,5C7
11
11
  5D0,5EA
12
12
  5EF,5F4
13
- 600,61C
14
- 61E,70D
13
+ 600,70D
15
14
  70F,74A
16
15
  74D,7B1
17
16
  7C0,7FA
@@ -20,9 +19,9 @@
20
19
  840,85B
21
20
  85E,85E
22
21
  860,86A
23
- 8A0,8B4
24
- 8B6,8C7
25
- 8D3,983
22
+ 870,88E
23
+ 890,891
24
+ 898,983
26
25
  985,98C
27
26
  98F,990
28
27
  993,9A8
@@ -100,11 +99,12 @@ C00,C0C
100
99
  C0E,C10
101
100
  C12,C28
102
101
  C2A,C39
103
- C3D,C44
102
+ C3C,C44
104
103
  C46,C48
105
104
  C4A,C4D
106
105
  C55,C56
107
106
  C58,C5A
107
+ C5D,C5D
108
108
  C60,C63
109
109
  C66,C6F
110
110
  C77,C8C
@@ -116,7 +116,7 @@ CBC,CC4
116
116
  CC6,CC8
117
117
  CCA,CCD
118
118
  CD5,CD6
119
- CDE,CDE
119
+ CDD,CDE
120
120
  CE0,CE3
121
121
  CE6,CEF
122
122
  CF1,CF2
@@ -183,9 +183,8 @@ FCE,FDA
183
183
  13F8,13FD
184
184
  1400,169C
185
185
  16A0,16F8
186
- 1700,170C
187
- 170E,1714
188
- 1720,1736
186
+ 1700,1715
187
+ 171F,1736
189
188
  1740,1753
190
189
  1760,176C
191
190
  176E,1770
@@ -193,8 +192,7 @@ FCE,FDA
193
192
  1780,17DD
194
193
  17E0,17E9
195
194
  17F0,17F9
196
- 1800,180E
197
- 1810,1819
195
+ 1800,1819
198
196
  1820,1878
199
197
  1880,18AA
200
198
  18B0,18F5
@@ -213,9 +211,9 @@ FCE,FDA
213
211
  1A7F,1A89
214
212
  1A90,1A99
215
213
  1AA0,1AAD
216
- 1AB0,1AC0
217
- 1B00,1B4B
218
- 1B50,1B7C
214
+ 1AB0,1ACE
215
+ 1B00,1B4C
216
+ 1B50,1B7E
219
217
  1B80,1BF3
220
218
  1BFC,1C37
221
219
  1C3B,1C49
@@ -223,8 +221,7 @@ FCE,FDA
223
221
  1C90,1CBA
224
222
  1CBD,1CC7
225
223
  1CD0,1CFA
226
- 1D00,1DF9
227
- 1DFB,1F15
224
+ 1D00,1F15
228
225
  1F18,1F1D
229
226
  1F20,1F45
230
227
  1F48,1F4D
@@ -244,16 +241,14 @@ FCE,FDA
244
241
  2066,2071
245
242
  2074,208E
246
243
  2090,209C
247
- 20A0,20BF
244
+ 20A0,20C0
248
245
  20D0,20F0
249
246
  2100,218B
250
247
  2190,2426
251
248
  2440,244A
252
249
  2460,2B73
253
250
  2B76,2B95
254
- 2B97,2C2E
255
- 2C30,2C5E
256
- 2C60,2CF3
251
+ 2B97,2CF3
257
252
  2CF9,2D25
258
253
  2D27,2D27
259
254
  2D2D,2D2D
@@ -268,7 +263,7 @@ FCE,FDA
268
263
  2DC8,2DCE
269
264
  2DD0,2DD6
270
265
  2DD8,2DDE
271
- 2DE0,2E52
266
+ 2DE0,2E5D
272
267
  2E80,2E99
273
268
  2E9B,2EF3
274
269
  2F00,2FD5
@@ -280,14 +275,15 @@ FCE,FDA
280
275
  3131,318E
281
276
  3190,31E3
282
277
  31F0,321E
283
- 3220,9FFC
284
- A000,A48C
278
+ 3220,A48C
285
279
  A490,A4C6
286
280
  A4D0,A62B
287
281
  A640,A6F7
288
- A700,A7BF
289
- A7C2,A7CA
290
- A7F5,A82C
282
+ A700,A7CA
283
+ A7D0,A7D1
284
+ A7D3,A7D3
285
+ A7D5,A7D9
286
+ A7F2,A82C
291
287
  A830,A839
292
288
  A840,A877
293
289
  A880,A8C5
@@ -322,12 +318,11 @@ FB38,FB3C
322
318
  FB3E,FB3E
323
319
  FB40,FB41
324
320
  FB43,FB44
325
- FB46,FBC1
326
- FBD3,FD3F
327
- FD50,FD8F
321
+ FB46,FBC2
322
+ FBD3,FD8F
328
323
  FD92,FDC7
329
- FDF0,FDFD
330
- FE00,FE19
324
+ FDCF,FDCF
325
+ FDF0,FE19
331
326
  FE20,FE52
332
327
  FE54,FE66
333
328
  FE68,FE6B
@@ -370,10 +365,20 @@ FFF9,FFFD
370
365
  104D8,104FB
371
366
  10500,10527
372
367
  10530,10563
373
- 1056F,1056F
368
+ 1056F,1057A
369
+ 1057C,1058A
370
+ 1058C,10592
371
+ 10594,10595
372
+ 10597,105A1
373
+ 105A3,105B1
374
+ 105B3,105B9
375
+ 105BB,105BC
374
376
  10600,10736
375
377
  10740,10755
376
378
  10760,10767
379
+ 10780,10785
380
+ 10787,107B0
381
+ 107B2,107BA
377
382
  10800,10805
378
383
  10808,10808
379
384
  1080A,10835
@@ -417,11 +422,12 @@ FFF9,FFFD
417
422
  10EB0,10EB1
418
423
  10F00,10F27
419
424
  10F30,10F59
425
+ 10F70,10F89
420
426
  10FB0,10FCB
421
427
  10FE0,10FF6
422
428
  11000,1104D
423
- 11052,1106F
424
- 1107F,110C1
429
+ 11052,11075
430
+ 1107F,110C2
425
431
  110CD,110CD
426
432
  110D0,110E8
427
433
  110F0,110F9
@@ -463,11 +469,11 @@ FFF9,FFFD
463
469
  11600,11644
464
470
  11650,11659
465
471
  11660,1166C
466
- 11680,116B8
472
+ 11680,116B9
467
473
  116C0,116C9
468
474
  11700,1171A
469
475
  1171D,1172B
470
- 11730,1173F
476
+ 11730,11746
471
477
  11800,1183B
472
478
  118A0,118F2
473
479
  118FF,11906
@@ -483,7 +489,7 @@ FFF9,FFFD
483
489
  119DA,119E4
484
490
  11A00,11A47
485
491
  11A50,11AA2
486
- 11AC0,11AF8
492
+ 11AB0,11AF8
487
493
  11C00,11C08
488
494
  11C0A,11C36
489
495
  11C38,11C45
@@ -511,13 +517,15 @@ FFF9,FFFD
511
517
  12400,1246E
512
518
  12470,12474
513
519
  12480,12543
520
+ 12F90,12FF2
514
521
  13000,1342E
515
522
  13430,13438
516
523
  14400,14646
517
524
  16800,16A38
518
525
  16A40,16A5E
519
526
  16A60,16A69
520
- 16A6E,16A6F
527
+ 16A6E,16ABE
528
+ 16AC0,16AC9
521
529
  16AD0,16AED
522
530
  16AF0,16AF5
523
531
  16B00,16B45
@@ -534,7 +542,10 @@ FFF9,FFFD
534
542
  17000,187F7
535
543
  18800,18CD5
536
544
  18D00,18D08
537
- 1B000,1B11E
545
+ 1AFF0,1AFF3
546
+ 1AFF5,1AFFB
547
+ 1AFFD,1AFFE
548
+ 1B000,1B122
538
549
  1B150,1B152
539
550
  1B164,1B167
540
551
  1B170,1B2FB
@@ -543,9 +554,12 @@ FFF9,FFFD
543
554
  1BC80,1BC88
544
555
  1BC90,1BC99
545
556
  1BC9C,1BCA3
557
+ 1CF00,1CF2D
558
+ 1CF30,1CF46
559
+ 1CF50,1CFC3
546
560
  1D000,1D0F5
547
561
  1D100,1D126
548
- 1D129,1D1E8
562
+ 1D129,1D1EA
549
563
  1D200,1D245
550
564
  1D2E0,1D2F3
551
565
  1D300,1D356
@@ -573,6 +587,7 @@ FFF9,FFFD
573
587
  1D7CE,1DA8B
574
588
  1DA9B,1DA9F
575
589
  1DAA1,1DAAF
590
+ 1DF00,1DF1E
576
591
  1E000,1E006
577
592
  1E008,1E018
578
593
  1E01B,1E021
@@ -582,8 +597,13 @@ FFF9,FFFD
582
597
  1E130,1E13D
583
598
  1E140,1E149
584
599
  1E14E,1E14F
600
+ 1E290,1E2AE
585
601
  1E2C0,1E2F9
586
602
  1E2FF,1E2FF
603
+ 1E7E0,1E7E6
604
+ 1E7E8,1E7EB
605
+ 1E7ED,1E7EE
606
+ 1E7F0,1E7FE
587
607
  1E800,1E8C4
588
608
  1E8C7,1E8D6
589
609
  1E900,1E94B
@@ -638,33 +658,34 @@ FFF9,FFFD
638
658
  1F250,1F251
639
659
  1F260,1F265
640
660
  1F300,1F6D7
641
- 1F6E0,1F6EC
661
+ 1F6DD,1F6EC
642
662
  1F6F0,1F6FC
643
663
  1F700,1F773
644
664
  1F780,1F7D8
645
665
  1F7E0,1F7EB
666
+ 1F7F0,1F7F0
646
667
  1F800,1F80B
647
668
  1F810,1F847
648
669
  1F850,1F859
649
670
  1F860,1F887
650
671
  1F890,1F8AD
651
672
  1F8B0,1F8B1
652
- 1F900,1F978
653
- 1F97A,1F9CB
654
- 1F9CD,1FA53
673
+ 1F900,1FA53
655
674
  1FA60,1FA6D
656
675
  1FA70,1FA74
657
- 1FA78,1FA7A
676
+ 1FA78,1FA7C
658
677
  1FA80,1FA86
659
- 1FA90,1FAA8
660
- 1FAB0,1FAB6
661
- 1FAC0,1FAC2
662
- 1FAD0,1FAD6
678
+ 1FA90,1FAAC
679
+ 1FAB0,1FABA
680
+ 1FAC0,1FAC5
681
+ 1FAD0,1FAD9
682
+ 1FAE0,1FAE7
683
+ 1FAF0,1FAF6
663
684
  1FB00,1FB92
664
685
  1FB94,1FBCA
665
686
  1FBF0,1FBF9
666
- 20000,2A6DD
667
- 2A700,2B734
687
+ 20000,2A6DF
688
+ 2A700,2B738
668
689
  2B740,2B81D
669
690
  2B820,2CEA1
670
691
  2CEB0,2EBE0
@@ -132,21 +132,22 @@ AE,AE
132
132
  1F680,1F6C5
133
133
  1F6CB,1F6D2
134
134
  1F6D5,1F6D7
135
- 1F6E0,1F6E5
135
+ 1F6DD,1F6E5
136
136
  1F6E9,1F6E9
137
137
  1F6EB,1F6EC
138
138
  1F6F0,1F6F0
139
139
  1F6F3,1F6FC
140
140
  1F7E0,1F7EB
141
+ 1F7F0,1F7F0
141
142
  1F90C,1F93A
142
143
  1F93C,1F945
143
- 1F947,1F978
144
- 1F97A,1F9CB
145
- 1F9CD,1F9FF
144
+ 1F947,1F9FF
146
145
  1FA70,1FA74
147
- 1FA78,1FA7A
146
+ 1FA78,1FA7C
148
147
  1FA80,1FA86
149
- 1FA90,1FAA8
150
- 1FAB0,1FAB6
151
- 1FAC0,1FAC2
152
- 1FAD0,1FAD6
148
+ 1FA90,1FAAC
149
+ 1FAB0,1FABA
150
+ 1FAC0,1FAC5
151
+ 1FAD0,1FAD9
152
+ 1FAE0,1FAE7
153
+ 1FAF0,1FAF6
@@ -8,7 +8,7 @@ class CharacterSet
8
8
 
9
9
  def of_string(str)
10
10
  raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
11
- str.codepoints.each_with_object(new) { |cp, set| set << cp }
11
+ str.encode('utf-8').each_codepoint.with_object(new) { |cp, set| set << cp }
12
12
  end
13
13
  end
14
14
 
@@ -40,16 +40,18 @@ class CharacterSet
40
40
  end
41
41
 
42
42
  def count_in(string)
43
- str!(string).each_codepoint.count { |cp| include?(cp) }
43
+ utf8_str!(string).each_codepoint.count { |cp| include?(cp) }
44
44
  end
45
45
 
46
46
  def cover?(string)
47
- str!(string).each_codepoint { |cp| return false unless include?(cp) }
47
+ utf8_str!(string).each_codepoint { |cp| return false unless include?(cp) }
48
48
  true
49
49
  end
50
50
 
51
51
  def delete_in(string)
52
- make_new_str(string) { |cp, new_str| include?(cp) || (new_str << cp) }
52
+ utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
53
+ include?(cp) || (new_str << cp)
54
+ end.encode(string.encoding)
53
55
  end
54
56
 
55
57
  def delete_in!(string)
@@ -58,7 +60,9 @@ class CharacterSet
58
60
  end
59
61
 
60
62
  def keep_in(string)
61
- make_new_str(string) { |cp, new_str| include?(cp) && (new_str << cp) }
63
+ utf8_str!(string).each_codepoint.with_object('') do |cp, new_str|
64
+ include?(cp) && (new_str << cp)
65
+ end.encode(string.encoding)
62
66
  end
63
67
 
64
68
  def keep_in!(string)
@@ -67,14 +71,13 @@ class CharacterSet
67
71
  end
68
72
 
69
73
  def scan(string)
70
- encoding = str!(string).encoding
71
- string.each_codepoint.inject([]) do |arr, cp|
72
- include?(cp) ? arr.push(cp.chr(encoding)) : arr
74
+ utf8_str!(string).each_codepoint.with_object([]) do |cp, arr|
75
+ arr.push(cp.chr('utf-8')) if include?(cp)
73
76
  end
74
77
  end
75
78
 
76
79
  def used_by?(string)
77
- str!(string).each_codepoint { |cp| return true if include?(cp) }
80
+ utf8_str!(string).each_codepoint { |cp| return true if include?(cp) }
78
81
  false
79
82
  end
80
83
 
@@ -115,15 +118,13 @@ class CharacterSet
115
118
  num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
116
119
  end
117
120
 
118
- def str!(obj)
121
+ def utf8_str!(obj)
119
122
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
120
- obj
123
+ obj.encode('utf-8')
121
124
  end
122
125
 
123
126
  def make_new_str(original, &block)
124
- str!(original)
125
- .each_codepoint
126
- .each_with_object(''.encode(original.encoding), &block)
127
+ utf8_str!(original).each_codepoint.with_object('', &block)
127
128
  end
128
129
  end
129
130
  end
@@ -3,7 +3,7 @@ class CharacterSet
3
3
  module SetMethods
4
4
  (Enumerable.instance_methods -
5
5
  %i[include? member? to_a] +
6
- %i[empty? length size]).each do |mthd|
6
+ %i[empty? hash length size]).each do |mthd|
7
7
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
8
8
  def #{mthd}(*args, &block)
9
9
  @__set.#{mthd}(*args, &block)
@@ -11,8 +11,8 @@ class CharacterSet
11
11
  RUBY
12
12
  end
13
13
 
14
- %i[< <= > >= disjoint? intersect? proper_subset? proper_superset?
15
- subset? superset?].each do |mthd|
14
+ %i[< <= > >= === disjoint? include? intersect? member?
15
+ proper_subset? proper_superset? subset? superset?].each do |mthd|
16
16
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
17
17
  def #{mthd}(enum, &block)
18
18
  if enum.is_a?(CharacterSet) || enum.is_a?(CharacterSet::Pure)
@@ -34,15 +34,6 @@ class CharacterSet
34
34
  RUBY
35
35
  end
36
36
 
37
- # revert if https://github.com/knu/sorted_set/issues/2 is resolved
38
- %i[=== include? member?].each do |mthd|
39
- class_eval <<-RUBY, __FILE__, __LINE__ + 1
40
- def #{mthd}(*args, &block)
41
- !!@__set.#{mthd}(*args, &block)
42
- end
43
- RUBY
44
- end
45
-
46
37
  %i[& + - ^ | difference intersection union].each do |mthd|
47
38
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
48
39
  def #{mthd}(enum, &block)
@@ -83,13 +74,8 @@ class CharacterSet
83
74
 
84
75
  def eql?(other)
85
76
  return false unless other.is_a?(self.class)
86
- # revert if https://github.com/knu/sorted_set/issues/3 is resolved
87
- hash == other.hash
88
- end
89
77
 
90
- # revert if https://github.com/knu/sorted_set/issues/3 is resolved
91
- def hash
92
- @__set.to_a.hash
78
+ @__set.eql?(other.instance_variable_get(:@__set))
93
79
  end
94
80
 
95
81
  def initialize_dup(orig)