character_set 1.2.0-java → 1.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.travis.yml +1 -0
  4. data/BENCHMARK.md +51 -15
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +24 -8
  7. data/Rakefile +20 -18
  8. data/benchmarks/count_in.rb +13 -0
  9. data/benchmarks/delete_in.rb +1 -1
  10. data/benchmarks/scan.rb +13 -0
  11. data/benchmarks/shared.rb +1 -0
  12. data/benchmarks/z_add.rb +12 -0
  13. data/benchmarks/z_delete.rb +12 -0
  14. data/benchmarks/z_merge.rb +15 -0
  15. data/benchmarks/z_minmax.rb +12 -0
  16. data/bin/console +2 -0
  17. data/character_set.gemspec +2 -0
  18. data/ext/character_set/character_set.c +963 -413
  19. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  20. data/lib/character_set/core_ext/string_ext.rb +2 -0
  21. data/lib/character_set/expression_converter.rb +21 -24
  22. data/lib/character_set/predefined_sets.rb +25 -260
  23. data/lib/character_set/predefined_sets/any.cps +1 -0
  24. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  25. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  26. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  27. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  28. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  29. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  30. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  31. data/lib/character_set/predefined_sets/newline.cps +3 -0
  32. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  33. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  34. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  35. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  36. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  37. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  38. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  39. data/lib/character_set/ruby_fallback.rb +0 -2
  40. data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
  41. data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
  42. data/lib/character_set/shared_methods.rb +51 -40
  43. data/lib/character_set/version.rb +1 -1
  44. metadata +54 -3
  45. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,FFFF
@@ -0,0 +1,2 @@
1
+ 2E,5A
2
+ 61,7A
@@ -0,0 +1,151 @@
1
+ 23,23
2
+ 2A,2A
3
+ 30,39
4
+ A9,A9
5
+ AE,AE
6
+ 203C,203C
7
+ 2049,2049
8
+ 2122,2122
9
+ 2139,2139
10
+ 2194,2199
11
+ 21A9,21AA
12
+ 231A,231B
13
+ 2328,2328
14
+ 23CF,23CF
15
+ 23E9,23F3
16
+ 23F8,23FA
17
+ 24C2,24C2
18
+ 25AA,25AB
19
+ 25B6,25B6
20
+ 25C0,25C0
21
+ 25FB,25FE
22
+ 2600,2604
23
+ 260E,260E
24
+ 2611,2611
25
+ 2614,2615
26
+ 2618,2618
27
+ 261D,261D
28
+ 2620,2620
29
+ 2622,2623
30
+ 2626,2626
31
+ 262A,262A
32
+ 262E,262F
33
+ 2638,263A
34
+ 2640,2640
35
+ 2642,2642
36
+ 2648,2653
37
+ 265F,2660
38
+ 2663,2663
39
+ 2665,2666
40
+ 2668,2668
41
+ 267B,267B
42
+ 267E,267F
43
+ 2692,2697
44
+ 2699,2699
45
+ 269B,269C
46
+ 26A0,26A1
47
+ 26AA,26AB
48
+ 26B0,26B1
49
+ 26BD,26BE
50
+ 26C4,26C5
51
+ 26C8,26C8
52
+ 26CE,26CF
53
+ 26D1,26D1
54
+ 26D3,26D4
55
+ 26E9,26EA
56
+ 26F0,26F5
57
+ 26F7,26FA
58
+ 26FD,26FD
59
+ 2702,2702
60
+ 2705,2705
61
+ 2708,270D
62
+ 270F,270F
63
+ 2712,2712
64
+ 2714,2714
65
+ 2716,2716
66
+ 271D,271D
67
+ 2721,2721
68
+ 2728,2728
69
+ 2733,2734
70
+ 2744,2744
71
+ 2747,2747
72
+ 274C,274C
73
+ 274E,274E
74
+ 2753,2755
75
+ 2757,2757
76
+ 2763,2764
77
+ 2795,2797
78
+ 27A1,27A1
79
+ 27B0,27B0
80
+ 27BF,27BF
81
+ 2934,2935
82
+ 2B05,2B07
83
+ 2B1B,2B1C
84
+ 2B50,2B50
85
+ 2B55,2B55
86
+ 3030,3030
87
+ 303D,303D
88
+ 3297,3297
89
+ 3299,3299
90
+ 1F004,1F004
91
+ 1F0CF,1F0CF
92
+ 1F170,1F171
93
+ 1F17E,1F17F
94
+ 1F18E,1F18E
95
+ 1F191,1F19A
96
+ 1F1E6,1F1FF
97
+ 1F201,1F202
98
+ 1F21A,1F21A
99
+ 1F22F,1F22F
100
+ 1F232,1F23A
101
+ 1F250,1F251
102
+ 1F300,1F321
103
+ 1F324,1F393
104
+ 1F396,1F397
105
+ 1F399,1F39B
106
+ 1F39E,1F3F0
107
+ 1F3F3,1F3F5
108
+ 1F3F7,1F4FD
109
+ 1F4FF,1F53D
110
+ 1F549,1F54E
111
+ 1F550,1F567
112
+ 1F56F,1F570
113
+ 1F573,1F57A
114
+ 1F587,1F587
115
+ 1F58A,1F58D
116
+ 1F590,1F590
117
+ 1F595,1F596
118
+ 1F5A4,1F5A5
119
+ 1F5A8,1F5A8
120
+ 1F5B1,1F5B2
121
+ 1F5BC,1F5BC
122
+ 1F5C2,1F5C4
123
+ 1F5D1,1F5D3
124
+ 1F5DC,1F5DE
125
+ 1F5E1,1F5E1
126
+ 1F5E3,1F5E3
127
+ 1F5E8,1F5E8
128
+ 1F5EF,1F5EF
129
+ 1F5F3,1F5F3
130
+ 1F5FA,1F64F
131
+ 1F680,1F6C5
132
+ 1F6CB,1F6D2
133
+ 1F6D5,1F6D5
134
+ 1F6E0,1F6E5
135
+ 1F6E9,1F6E9
136
+ 1F6EB,1F6EC
137
+ 1F6F0,1F6F0
138
+ 1F6F3,1F6FA
139
+ 1F7E0,1F7EB
140
+ 1F90D,1F93A
141
+ 1F93C,1F945
142
+ 1F947,1F971
143
+ 1F973,1F976
144
+ 1F97A,1F9A2
145
+ 1F9A5,1F9AA
146
+ 1F9AE,1F9CA
147
+ 1F9CD,1F9FF
148
+ 1FA70,1FA73
149
+ 1FA78,1FA7A
150
+ 1FA80,1FA82
151
+ 1FA90,1FA95
@@ -0,0 +1,3 @@
1
+ A,D
2
+ 85,85
3
+ 2028,2029
@@ -0,0 +1 @@
1
+ D800,DFFF
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,10FFFF
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 21,21
2
+ 24,24
3
+ 26,2E
4
+ 30,3B
5
+ 3D,3D
6
+ 41,5B
7
+ 5D,5D
8
+ 5F,5F
9
+ 61,7A
10
+ 7E,7E
@@ -0,0 +1,7 @@
1
+ 21,21
2
+ 24,3A
3
+ 3D,3D
4
+ 40,5A
5
+ 5F,5F
6
+ 61,7A
7
+ 7E,7E
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 9,D
2
+ 20,20
3
+ 85,85
4
+ A0,A0
5
+ 1680,1680
6
+ 2000,200A
7
+ 2028,2029
8
+ 202F,202F
9
+ 205F,205F
10
+ 3000,3000
@@ -1,12 +1,10 @@
1
1
  require 'set'
2
2
  require 'character_set/ruby_fallback/set_methods'
3
- require 'character_set/ruby_fallback/plane_methods'
4
3
  require 'character_set/ruby_fallback/character_set_methods'
5
4
 
6
5
  class CharacterSet
7
6
  module RubyFallback
8
7
  include CharacterSet::RubyFallback::SetMethods
9
- include CharacterSet::RubyFallback::PlaneMethods
10
8
  include CharacterSet::RubyFallback::CharacterSetMethods
11
9
 
12
10
  def self.prepended(klass)
@@ -39,9 +39,8 @@ class CharacterSet
39
39
  count.nil? ? to_a(true).sample : to_a(true).sample(count)
40
40
  end
41
41
 
42
- def used_by?(string)
43
- str!(string).each_codepoint { |cp| return true if include?(cp) }
44
- false
42
+ def count_in(string)
43
+ str!(string).each_codepoint.count { |cp| include?(cp) }
45
44
  end
46
45
 
47
46
  def cover?(string)
@@ -67,15 +66,64 @@ class CharacterSet
67
66
  result.size == string.size ? nil : string.replace(result)
68
67
  end
69
68
 
69
+ def scan(string)
70
+ encoding = str!(string).encoding
71
+ string.each_codepoint.inject([]) do |arr, cp|
72
+ include?(cp) ? arr.push(cp.chr(encoding)) : arr
73
+ end
74
+ end
75
+
76
+ def used_by?(string)
77
+ str!(string).each_codepoint { |cp| return true if include?(cp) }
78
+ false
79
+ end
80
+
81
+ def section(from:, upto: 0x10FFFF)
82
+ dup.keep_if { |cp| cp >= from && cp <= upto }
83
+ end
84
+
85
+ def count_in_section(from:, upto: 0x10FFFF)
86
+ count { |cp| cp >= from && cp <= upto }
87
+ end
88
+
89
+ def section?(from:, upto: 0x10FFFF)
90
+ any? { |cp| cp >= from && cp <= upto }
91
+ end
92
+
93
# Share of this set's members that lie within the given codepoint range.
#
# @param from [Integer] lowest codepoint of the section (inclusive)
# @param upto [Integer] highest codepoint of the section (inclusive)
# @return [Float] matching members divided by all members
def section_ratio(from:, upto: 0x10FFFF)
  # Count matches directly rather than building a filtered copy of the set
  # via #section only to count and discard it.
  count_in_section(from: from, upto: upto) / count.to_f
end
96
+
97
# Lists the numbers of all Unicode planes that contain at least one member
# of this set, in order of first appearance during iteration.
#
# @return [Array<Integer>] plane numbers (codepoint / 0x10000), no duplicates
def planes
  # Integer division replaces the former float division + floor, and a
  # single uniq pass replaces allocating a merged Hash per codepoint.
  to_a.map { |cp| cp / 0x10000 }.uniq
end
101
+
102
+ def plane(num)
103
+ validate_plane_number(num)
104
+ section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
105
+ end
106
+
107
+ def member_in_plane?(num)
108
+ validate_plane_number(num)
109
+ ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
110
+ end
111
+
70
112
  private
71
113
 
114
# Guards the plane-based methods against out-of-range plane numbers.
#
# @param num [Numeric] plane number to check
# @return [true] when +num+ lies within 0..16
# @raise [ArgumentError] when +num+ is outside 0..16 or not comparable to it
def validate_plane_number(num)
  # Range#cover? returns false for nil and other non-comparable values, so
  # these raise the ArgumentError below instead of a NoMethodError.
  return true if (0..16).cover?(num)

  raise ArgumentError, 'plane must be between 0 and 16'
end
117
+
72
118
  def str!(obj)
73
119
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
74
120
  obj
75
121
  end
76
122
 
77
123
  def make_new_str(original, &block)
78
- new_string = str!(original).each_codepoint.each_with_object('', &block)
124
+ new_string = str!(original)
125
+ .each_codepoint
126
+ .each_with_object(''.encode(original.encoding), &block)
79
127
  original.tainted? ? new_string.taint : new_string
80
128
  end
81
129
  end
@@ -72,8 +72,8 @@ class CharacterSet
72
72
  true
73
73
  elsif other.instance_of?(self.class)
74
74
  @__set == other.instance_variable_get(:@__set)
75
- elsif other.is_a?(self.class) && size == other.size
76
- other.all? { |cp| @__set.include?(cp) }
75
+ elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
76
+ size == other.size && other.all? { |cp| @__set.include?(cp) }
77
77
  else
78
78
  false
79
79
  end
@@ -70,7 +70,17 @@ class CharacterSet
70
70
  merge(enum)
71
71
  end
72
72
 
73
- # stringification methods
73
+ # CharacterSet-specific conversion methods
74
+
75
+ def assigned_part
76
+ self & self.class.assigned
77
+ end
78
+
79
+ def valid_part
80
+ self - self.class.surrogate
81
+ end
82
+
83
+ # CharacterSet-specific stringification methods
74
84
 
75
85
  def to_s(opts = {}, &block)
76
86
  Writer.write(ranges, opts, &block)
@@ -82,25 +92,30 @@ class CharacterSet
82
92
 
83
93
  def inspect
84
94
  len = length
85
- "#<CharacterSet: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
95
+ "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
86
96
  end
87
97
 
88
- # unicode-plane-related methods
98
+ # C-extension adapter methods. Need overriding in pure fallback.
99
+ # Parsing kwargs in C is slower, verbose, and kinda deprecated.
100
+
101
+ def inversion(include_surrogates: false, upto: 0x10FFFF)
102
+ ext_inversion(include_surrogates, upto)
103
+ end
89
104
 
90
- def bmp_part?
91
- !bmp_part.empty?
105
+ def section(from:, upto: 0x10FFFF)
106
+ ext_section(from, upto)
92
107
  end
93
108
 
94
- def astral_part?
95
- !astral_part.empty?
109
+ def count_in_section(from:, upto: 0x10FFFF)
110
+ ext_count_in_section(from, upto)
96
111
  end
97
112
 
98
- def bmp_ratio
99
- bmp_part.count / count.to_f
113
+ def section?(from:, upto: 0x10FFFF)
114
+ ext_section?(from, upto)
100
115
  end
101
116
 
102
- def astral_ratio
103
- astral_part.count / count.to_f
117
+ def section_ratio(from:, upto: 0x10FFFF)
118
+ ext_section_ratio(from, upto)
104
119
  end
105
120
 
106
121
  #
@@ -136,42 +151,38 @@ class CharacterSet
136
151
  end
137
152
 
138
153
  def divide(&func)
139
- block_given? or return enum_for(__method__) { size }
140
154
  require 'set'
155
+ Set.new(to_a).divide(&func)
156
+ end
157
+ RUBY
141
158
 
142
- if func.arity == 2
143
- require 'tsort'
144
-
145
- class << dig = {}
146
- include TSort
159
+ # CharacterSet-specific section methods
147
160
 
148
- alias tsort_each_node each_key
149
- def tsort_each_child(node, &block)
150
- fetch(node).each(&block)
151
- end
152
- end
161
+ {
162
+ ascii: 0..0x7F,
163
+ bmp: 0..0xFFFF,
164
+ astral: 0x10000..0x10FFFF,
165
+ }.each do |section_name, range|
166
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
167
+ def #{section_name}_part
168
+ section(from: #{range.begin}, upto: #{range.end})
169
+ end
153
170
 
154
- each do |u|
155
- dig[u] = a = []
156
- each{ |v| a << v if yield(u, v) }
157
- end
171
+ def #{section_name}_part?
172
+ section?(from: #{range.begin}, upto: #{range.end})
173
+ end
158
174
 
159
- set = Set.new
160
- dig.each_strongly_connected_component do |css|
161
- set.add(self.class.new(css))
162
- end
163
- set
164
- else
165
- Set.new(classify(&func).values)
175
+ def #{section_name}_only?
176
+ #{range.begin == 0 ?
177
+ "!section?(from: #{range.end}, upto: 0x10FFFF)" :
178
+ "!section?(from: 0, upto: #{range.begin})"}
166
179
  end
167
- end
168
180
 
169
- # C-extension adapter method. Needs overriding in pure fallback.
170
- # Parsing kwargs in C is slower, verbose, and kinda deprecated.
171
- def inversion(include_surrogates: false, upto: 0x10FFFF)
172
- ext_inversion(include_surrogates, upto)
173
- end
174
- RUBY
181
+ def #{section_name}_ratio
182
+ section_ratio(from: #{range.begin}, upto: #{range.end})
183
+ end
184
+ RUBY
185
+ end
175
186
  end # self.included
176
187
  end # SharedMethods
177
188
  end