character_set 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.travis.yml +1 -0
  4. data/BENCHMARK.md +51 -15
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +24 -8
  7. data/Rakefile +20 -18
  8. data/benchmarks/count_in.rb +13 -0
  9. data/benchmarks/delete_in.rb +1 -1
  10. data/benchmarks/scan.rb +13 -0
  11. data/benchmarks/shared.rb +1 -0
  12. data/benchmarks/z_add.rb +12 -0
  13. data/benchmarks/z_delete.rb +12 -0
  14. data/benchmarks/z_merge.rb +15 -0
  15. data/benchmarks/z_minmax.rb +12 -0
  16. data/bin/console +2 -0
  17. data/character_set.gemspec +2 -0
  18. data/ext/character_set/character_set.c +963 -413
  19. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  20. data/lib/character_set/core_ext/string_ext.rb +2 -0
  21. data/lib/character_set/expression_converter.rb +21 -24
  22. data/lib/character_set/predefined_sets.rb +25 -260
  23. data/lib/character_set/predefined_sets/any.cps +1 -0
  24. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  25. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  26. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  27. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  28. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  29. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  30. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  31. data/lib/character_set/predefined_sets/newline.cps +3 -0
  32. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  33. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  34. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  35. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  36. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  37. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  38. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  39. data/lib/character_set/ruby_fallback.rb +0 -2
  40. data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
  41. data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
  42. data/lib/character_set/shared_methods.rb +51 -40
  43. data/lib/character_set/version.rb +1 -1
  44. metadata +54 -3
  45. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,FFFF
@@ -0,0 +1,2 @@
1
+ 2E,5A
2
+ 61,7A
@@ -0,0 +1,151 @@
1
+ 23,23
2
+ 2A,2A
3
+ 30,39
4
+ A9,A9
5
+ AE,AE
6
+ 203C,203C
7
+ 2049,2049
8
+ 2122,2122
9
+ 2139,2139
10
+ 2194,2199
11
+ 21A9,21AA
12
+ 231A,231B
13
+ 2328,2328
14
+ 23CF,23CF
15
+ 23E9,23F3
16
+ 23F8,23FA
17
+ 24C2,24C2
18
+ 25AA,25AB
19
+ 25B6,25B6
20
+ 25C0,25C0
21
+ 25FB,25FE
22
+ 2600,2604
23
+ 260E,260E
24
+ 2611,2611
25
+ 2614,2615
26
+ 2618,2618
27
+ 261D,261D
28
+ 2620,2620
29
+ 2622,2623
30
+ 2626,2626
31
+ 262A,262A
32
+ 262E,262F
33
+ 2638,263A
34
+ 2640,2640
35
+ 2642,2642
36
+ 2648,2653
37
+ 265F,2660
38
+ 2663,2663
39
+ 2665,2666
40
+ 2668,2668
41
+ 267B,267B
42
+ 267E,267F
43
+ 2692,2697
44
+ 2699,2699
45
+ 269B,269C
46
+ 26A0,26A1
47
+ 26AA,26AB
48
+ 26B0,26B1
49
+ 26BD,26BE
50
+ 26C4,26C5
51
+ 26C8,26C8
52
+ 26CE,26CF
53
+ 26D1,26D1
54
+ 26D3,26D4
55
+ 26E9,26EA
56
+ 26F0,26F5
57
+ 26F7,26FA
58
+ 26FD,26FD
59
+ 2702,2702
60
+ 2705,2705
61
+ 2708,270D
62
+ 270F,270F
63
+ 2712,2712
64
+ 2714,2714
65
+ 2716,2716
66
+ 271D,271D
67
+ 2721,2721
68
+ 2728,2728
69
+ 2733,2734
70
+ 2744,2744
71
+ 2747,2747
72
+ 274C,274C
73
+ 274E,274E
74
+ 2753,2755
75
+ 2757,2757
76
+ 2763,2764
77
+ 2795,2797
78
+ 27A1,27A1
79
+ 27B0,27B0
80
+ 27BF,27BF
81
+ 2934,2935
82
+ 2B05,2B07
83
+ 2B1B,2B1C
84
+ 2B50,2B50
85
+ 2B55,2B55
86
+ 3030,3030
87
+ 303D,303D
88
+ 3297,3297
89
+ 3299,3299
90
+ 1F004,1F004
91
+ 1F0CF,1F0CF
92
+ 1F170,1F171
93
+ 1F17E,1F17F
94
+ 1F18E,1F18E
95
+ 1F191,1F19A
96
+ 1F1E6,1F1FF
97
+ 1F201,1F202
98
+ 1F21A,1F21A
99
+ 1F22F,1F22F
100
+ 1F232,1F23A
101
+ 1F250,1F251
102
+ 1F300,1F321
103
+ 1F324,1F393
104
+ 1F396,1F397
105
+ 1F399,1F39B
106
+ 1F39E,1F3F0
107
+ 1F3F3,1F3F5
108
+ 1F3F7,1F4FD
109
+ 1F4FF,1F53D
110
+ 1F549,1F54E
111
+ 1F550,1F567
112
+ 1F56F,1F570
113
+ 1F573,1F57A
114
+ 1F587,1F587
115
+ 1F58A,1F58D
116
+ 1F590,1F590
117
+ 1F595,1F596
118
+ 1F5A4,1F5A5
119
+ 1F5A8,1F5A8
120
+ 1F5B1,1F5B2
121
+ 1F5BC,1F5BC
122
+ 1F5C2,1F5C4
123
+ 1F5D1,1F5D3
124
+ 1F5DC,1F5DE
125
+ 1F5E1,1F5E1
126
+ 1F5E3,1F5E3
127
+ 1F5E8,1F5E8
128
+ 1F5EF,1F5EF
129
+ 1F5F3,1F5F3
130
+ 1F5FA,1F64F
131
+ 1F680,1F6C5
132
+ 1F6CB,1F6D2
133
+ 1F6D5,1F6D5
134
+ 1F6E0,1F6E5
135
+ 1F6E9,1F6E9
136
+ 1F6EB,1F6EC
137
+ 1F6F0,1F6F0
138
+ 1F6F3,1F6FA
139
+ 1F7E0,1F7EB
140
+ 1F90D,1F93A
141
+ 1F93C,1F945
142
+ 1F947,1F971
143
+ 1F973,1F976
144
+ 1F97A,1F9A2
145
+ 1F9A5,1F9AA
146
+ 1F9AE,1F9CA
147
+ 1F9CD,1F9FF
148
+ 1FA70,1FA73
149
+ 1FA78,1FA7A
150
+ 1FA80,1FA82
151
+ 1FA90,1FA95
@@ -0,0 +1,3 @@
1
+ A,D
2
+ 85,85
3
+ 2028,2029
@@ -0,0 +1 @@
1
+ D800,DFFF
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,10FFFF
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 21,21
2
+ 24,24
3
+ 26,2E
4
+ 30,3B
5
+ 3D,3D
6
+ 41,5B
7
+ 5D,5D
8
+ 5F,5F
9
+ 61,7A
10
+ 7E,7E
@@ -0,0 +1,7 @@
1
+ 21,21
2
+ 24,3A
3
+ 3D,3D
4
+ 40,5A
5
+ 5F,5F
6
+ 61,7A
7
+ 7E,7E
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 9,D
2
+ 20,20
3
+ 85,85
4
+ A0,A0
5
+ 1680,1680
6
+ 2000,200A
7
+ 2028,2029
8
+ 202F,202F
9
+ 205F,205F
10
+ 3000,3000
@@ -1,12 +1,10 @@
1
1
  require 'set'
2
2
  require 'character_set/ruby_fallback/set_methods'
3
- require 'character_set/ruby_fallback/plane_methods'
4
3
  require 'character_set/ruby_fallback/character_set_methods'
5
4
 
6
5
  class CharacterSet
7
6
  module RubyFallback
8
7
  include CharacterSet::RubyFallback::SetMethods
9
- include CharacterSet::RubyFallback::PlaneMethods
10
8
  include CharacterSet::RubyFallback::CharacterSetMethods
11
9
 
12
10
  def self.prepended(klass)
@@ -39,9 +39,8 @@ class CharacterSet
39
39
  count.nil? ? to_a(true).sample : to_a(true).sample(count)
40
40
  end
41
41
 
42
- def used_by?(string)
43
- str!(string).each_codepoint { |cp| return true if include?(cp) }
44
- false
42
+ def count_in(string)
43
+ str!(string).each_codepoint.count { |cp| include?(cp) }
45
44
  end
46
45
 
47
46
  def cover?(string)
@@ -67,15 +66,64 @@ class CharacterSet
67
66
  result.size == string.size ? nil : string.replace(result)
68
67
  end
69
68
 
69
+ def scan(string)
70
+ encoding = str!(string).encoding
71
+ string.each_codepoint.inject([]) do |arr, cp|
72
+ include?(cp) ? arr.push(cp.chr(encoding)) : arr
73
+ end
74
+ end
75
+
76
+ def used_by?(string)
77
+ str!(string).each_codepoint { |cp| return true if include?(cp) }
78
+ false
79
+ end
80
+
81
+ def section(from:, upto: 0x10FFFF)
82
+ dup.keep_if { |cp| cp >= from && cp <= upto }
83
+ end
84
+
85
+ def count_in_section(from:, upto: 0x10FFFF)
86
+ count { |cp| cp >= from && cp <= upto }
87
+ end
88
+
89
+ def section?(from:, upto: 0x10FFFF)
90
+ any? { |cp| cp >= from && cp <= upto }
91
+ end
92
+
93
+ def section_ratio(from:, upto: 0x10FFFF)
94
+ section(from: from, upto: upto).count / count.to_f
95
+ end
96
+
97
+ def planes
98
+ plane_size = 0x10000.to_f
99
+ inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys
100
+ end
101
+
102
+ def plane(num)
103
+ validate_plane_number(num)
104
+ section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
105
+ end
106
+
107
+ def member_in_plane?(num)
108
+ validate_plane_number(num)
109
+ ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
110
+ end
111
+
70
112
  private
71
113
 
114
+ def validate_plane_number(num)
115
+ num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
116
+ end
117
+
72
118
  def str!(obj)
73
119
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
74
120
  obj
75
121
  end
76
122
 
77
123
  def make_new_str(original, &block)
78
- new_string = str!(original).each_codepoint.each_with_object('', &block)
124
+ new_string = str!(original)
125
+ .each_codepoint
126
+ .each_with_object(''.encode(original.encoding), &block)
79
127
  original.tainted? ? new_string.taint : new_string
80
128
  end
81
129
  end
@@ -72,8 +72,8 @@ class CharacterSet
72
72
  true
73
73
  elsif other.instance_of?(self.class)
74
74
  @__set == other.instance_variable_get(:@__set)
75
- elsif other.is_a?(self.class) && size == other.size
76
- other.all? { |cp| @__set.include?(cp) }
75
+ elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
76
+ size == other.size && other.all? { |cp| @__set.include?(cp) }
77
77
  else
78
78
  false
79
79
  end
@@ -70,7 +70,17 @@ class CharacterSet
70
70
  merge(enum)
71
71
  end
72
72
 
73
- # stringification methods
73
+ # CharacterSet-specific conversion methods
74
+
75
+ def assigned_part
76
+ self & self.class.assigned
77
+ end
78
+
79
+ def valid_part
80
+ self - self.class.surrogate
81
+ end
82
+
83
+ # CharacterSet-specific stringification methods
74
84
 
75
85
  def to_s(opts = {}, &block)
76
86
  Writer.write(ranges, opts, &block)
@@ -82,25 +92,30 @@ class CharacterSet
82
92
 
83
93
  def inspect
84
94
  len = length
85
- "#<CharacterSet: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
95
+ "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
86
96
  end
87
97
 
88
- # unicode-plane-related methods
98
+ # C-extension adapter methods. Need overriding in pure fallback.
99
+ # Parsing kwargs in C is slower, verbose, and kinda deprecated.
100
+
101
+ def inversion(include_surrogates: false, upto: 0x10FFFF)
102
+ ext_inversion(include_surrogates, upto)
103
+ end
89
104
 
90
- def bmp_part?
91
- !bmp_part.empty?
105
+ def section(from:, upto: 0x10FFFF)
106
+ ext_section(from, upto)
92
107
  end
93
108
 
94
- def astral_part?
95
- !astral_part.empty?
109
+ def count_in_section(from:, upto: 0x10FFFF)
110
+ ext_count_in_section(from, upto)
96
111
  end
97
112
 
98
- def bmp_ratio
99
- bmp_part.count / count.to_f
113
+ def section?(from:, upto: 0x10FFFF)
114
+ ext_section?(from, upto)
100
115
  end
101
116
 
102
- def astral_ratio
103
- astral_part.count / count.to_f
117
+ def section_ratio(from:, upto: 0x10FFFF)
118
+ ext_section_ratio(from, upto)
104
119
  end
105
120
 
106
121
  #
@@ -136,42 +151,38 @@ class CharacterSet
136
151
  end
137
152
 
138
153
  def divide(&func)
139
- block_given? or return enum_for(__method__) { size }
140
154
  require 'set'
155
+ Set.new(to_a).divide(&func)
156
+ end
157
+ RUBY
141
158
 
142
- if func.arity == 2
143
- require 'tsort'
144
-
145
- class << dig = {}
146
- include TSort
159
+ # CharacterSet-specific section methods
147
160
 
148
- alias tsort_each_node each_key
149
- def tsort_each_child(node, &block)
150
- fetch(node).each(&block)
151
- end
152
- end
161
+ {
162
+ ascii: 0..0x7F,
163
+ bmp: 0..0xFFFF,
164
+ astral: 0x10000..0x10FFFF,
165
+ }.each do |section_name, range|
166
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
167
+ def #{section_name}_part
168
+ section(from: #{range.begin}, upto: #{range.end})
169
+ end
153
170
 
154
- each do |u|
155
- dig[u] = a = []
156
- each{ |v| a << v if yield(u, v) }
157
- end
171
+ def #{section_name}_part?
172
+ section?(from: #{range.begin}, upto: #{range.end})
173
+ end
158
174
 
159
- set = Set.new
160
- dig.each_strongly_connected_component do |css|
161
- set.add(self.class.new(css))
162
- end
163
- set
164
- else
165
- Set.new(classify(&func).values)
175
+ def #{section_name}_only?
176
+ #{range.begin == 0 ?
177
+ "!section?(from: #{range.end}, upto: 0x10FFFF)" :
178
+ "!section?(from: 0, upto: #{range.begin})"}
166
179
  end
167
- end
168
180
 
169
- # C-extension adapter method. Needs overriding in pure fallback.
170
- # Parsing kwargs in C is slower, verbose, and kinda deprecated.
171
- def inversion(include_surrogates: false, upto: 0x10FFFF)
172
- ext_inversion(include_surrogates, upto)
173
- end
174
- RUBY
181
+ def #{section_name}_ratio
182
+ section_ratio(from: #{range.begin}, upto: #{range.end})
183
+ end
184
+ RUBY
185
+ end
175
186
  end # self.included
176
187
  end # SharedMethods
177
188
  end