character_set 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.travis.yml +1 -0
- data/BENCHMARK.md +51 -15
- data/CHANGELOG.md +20 -0
- data/README.md +24 -8
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +1 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +2 -0
- data/ext/character_set/character_set.c +963 -413
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/core_ext/string_ext.rb +2 -0
- data/lib/character_set/expression_converter.rb +21 -24
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +0 -2
- data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
- data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
- data/lib/character_set/shared_methods.rb +51 -40
- data/lib/character_set/version.rb +1 -1
- metadata +54 -3
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,151 @@
|
|
1
|
+
23,23
|
2
|
+
2A,2A
|
3
|
+
30,39
|
4
|
+
A9,A9
|
5
|
+
AE,AE
|
6
|
+
203C,203C
|
7
|
+
2049,2049
|
8
|
+
2122,2122
|
9
|
+
2139,2139
|
10
|
+
2194,2199
|
11
|
+
21A9,21AA
|
12
|
+
231A,231B
|
13
|
+
2328,2328
|
14
|
+
23CF,23CF
|
15
|
+
23E9,23F3
|
16
|
+
23F8,23FA
|
17
|
+
24C2,24C2
|
18
|
+
25AA,25AB
|
19
|
+
25B6,25B6
|
20
|
+
25C0,25C0
|
21
|
+
25FB,25FE
|
22
|
+
2600,2604
|
23
|
+
260E,260E
|
24
|
+
2611,2611
|
25
|
+
2614,2615
|
26
|
+
2618,2618
|
27
|
+
261D,261D
|
28
|
+
2620,2620
|
29
|
+
2622,2623
|
30
|
+
2626,2626
|
31
|
+
262A,262A
|
32
|
+
262E,262F
|
33
|
+
2638,263A
|
34
|
+
2640,2640
|
35
|
+
2642,2642
|
36
|
+
2648,2653
|
37
|
+
265F,2660
|
38
|
+
2663,2663
|
39
|
+
2665,2666
|
40
|
+
2668,2668
|
41
|
+
267B,267B
|
42
|
+
267E,267F
|
43
|
+
2692,2697
|
44
|
+
2699,2699
|
45
|
+
269B,269C
|
46
|
+
26A0,26A1
|
47
|
+
26AA,26AB
|
48
|
+
26B0,26B1
|
49
|
+
26BD,26BE
|
50
|
+
26C4,26C5
|
51
|
+
26C8,26C8
|
52
|
+
26CE,26CF
|
53
|
+
26D1,26D1
|
54
|
+
26D3,26D4
|
55
|
+
26E9,26EA
|
56
|
+
26F0,26F5
|
57
|
+
26F7,26FA
|
58
|
+
26FD,26FD
|
59
|
+
2702,2702
|
60
|
+
2705,2705
|
61
|
+
2708,270D
|
62
|
+
270F,270F
|
63
|
+
2712,2712
|
64
|
+
2714,2714
|
65
|
+
2716,2716
|
66
|
+
271D,271D
|
67
|
+
2721,2721
|
68
|
+
2728,2728
|
69
|
+
2733,2734
|
70
|
+
2744,2744
|
71
|
+
2747,2747
|
72
|
+
274C,274C
|
73
|
+
274E,274E
|
74
|
+
2753,2755
|
75
|
+
2757,2757
|
76
|
+
2763,2764
|
77
|
+
2795,2797
|
78
|
+
27A1,27A1
|
79
|
+
27B0,27B0
|
80
|
+
27BF,27BF
|
81
|
+
2934,2935
|
82
|
+
2B05,2B07
|
83
|
+
2B1B,2B1C
|
84
|
+
2B50,2B50
|
85
|
+
2B55,2B55
|
86
|
+
3030,3030
|
87
|
+
303D,303D
|
88
|
+
3297,3297
|
89
|
+
3299,3299
|
90
|
+
1F004,1F004
|
91
|
+
1F0CF,1F0CF
|
92
|
+
1F170,1F171
|
93
|
+
1F17E,1F17F
|
94
|
+
1F18E,1F18E
|
95
|
+
1F191,1F19A
|
96
|
+
1F1E6,1F1FF
|
97
|
+
1F201,1F202
|
98
|
+
1F21A,1F21A
|
99
|
+
1F22F,1F22F
|
100
|
+
1F232,1F23A
|
101
|
+
1F250,1F251
|
102
|
+
1F300,1F321
|
103
|
+
1F324,1F393
|
104
|
+
1F396,1F397
|
105
|
+
1F399,1F39B
|
106
|
+
1F39E,1F3F0
|
107
|
+
1F3F3,1F3F5
|
108
|
+
1F3F7,1F4FD
|
109
|
+
1F4FF,1F53D
|
110
|
+
1F549,1F54E
|
111
|
+
1F550,1F567
|
112
|
+
1F56F,1F570
|
113
|
+
1F573,1F57A
|
114
|
+
1F587,1F587
|
115
|
+
1F58A,1F58D
|
116
|
+
1F590,1F590
|
117
|
+
1F595,1F596
|
118
|
+
1F5A4,1F5A5
|
119
|
+
1F5A8,1F5A8
|
120
|
+
1F5B1,1F5B2
|
121
|
+
1F5BC,1F5BC
|
122
|
+
1F5C2,1F5C4
|
123
|
+
1F5D1,1F5D3
|
124
|
+
1F5DC,1F5DE
|
125
|
+
1F5E1,1F5E1
|
126
|
+
1F5E3,1F5E3
|
127
|
+
1F5E8,1F5E8
|
128
|
+
1F5EF,1F5EF
|
129
|
+
1F5F3,1F5F3
|
130
|
+
1F5FA,1F64F
|
131
|
+
1F680,1F6C5
|
132
|
+
1F6CB,1F6D2
|
133
|
+
1F6D5,1F6D5
|
134
|
+
1F6E0,1F6E5
|
135
|
+
1F6E9,1F6E9
|
136
|
+
1F6EB,1F6EC
|
137
|
+
1F6F0,1F6F0
|
138
|
+
1F6F3,1F6FA
|
139
|
+
1F7E0,1F7EB
|
140
|
+
1F90D,1F93A
|
141
|
+
1F93C,1F945
|
142
|
+
1F947,1F971
|
143
|
+
1F973,1F976
|
144
|
+
1F97A,1F9A2
|
145
|
+
1F9A5,1F9AA
|
146
|
+
1F9AE,1F9CA
|
147
|
+
1F9CD,1F9FF
|
148
|
+
1FA70,1FA73
|
149
|
+
1FA78,1FA7A
|
150
|
+
1FA80,1FA82
|
151
|
+
1FA90,1FA95
|
@@ -0,0 +1 @@
|
|
1
|
+
D800,DFFF
|
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'character_set/ruby_fallback/set_methods'
|
3
|
-
require 'character_set/ruby_fallback/plane_methods'
|
4
3
|
require 'character_set/ruby_fallback/character_set_methods'
|
5
4
|
|
6
5
|
class CharacterSet
|
7
6
|
module RubyFallback
|
8
7
|
include CharacterSet::RubyFallback::SetMethods
|
9
|
-
include CharacterSet::RubyFallback::PlaneMethods
|
10
8
|
include CharacterSet::RubyFallback::CharacterSetMethods
|
11
9
|
|
12
10
|
def self.prepended(klass)
|
@@ -39,9 +39,8 @@ class CharacterSet
|
|
39
39
|
count.nil? ? to_a(true).sample : to_a(true).sample(count)
|
40
40
|
end
|
41
41
|
|
42
|
-
def used_by?(string)
|
43
|
-
str!(string).each_codepoint { |cp| return true if include?(cp) }
|
44
|
-
false
|
42
|
+
def count_in(string)
|
43
|
+
str!(string).each_codepoint.count { |cp| include?(cp) }
|
45
44
|
end
|
46
45
|
|
47
46
|
def cover?(string)
|
@@ -67,15 +66,64 @@ class CharacterSet
|
|
67
66
|
result.size == string.size ? nil : string.replace(result)
|
68
67
|
end
|
69
68
|
|
69
|
+
def scan(string)
|
70
|
+
encoding = str!(string).encoding
|
71
|
+
string.each_codepoint.inject([]) do |arr, cp|
|
72
|
+
include?(cp) ? arr.push(cp.chr(encoding)) : arr
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def used_by?(string)
|
77
|
+
str!(string).each_codepoint { |cp| return true if include?(cp) }
|
78
|
+
false
|
79
|
+
end
|
80
|
+
|
81
|
+
def section(from:, upto: 0x10FFFF)
|
82
|
+
dup.keep_if { |cp| cp >= from && cp <= upto }
|
83
|
+
end
|
84
|
+
|
85
|
+
def count_in_section(from:, upto: 0x10FFFF)
|
86
|
+
count { |cp| cp >= from && cp <= upto }
|
87
|
+
end
|
88
|
+
|
89
|
+
def section?(from:, upto: 0x10FFFF)
|
90
|
+
any? { |cp| cp >= from && cp <= upto }
|
91
|
+
end
|
92
|
+
|
93
|
+
def section_ratio(from:, upto: 0x10FFFF)
|
94
|
+
section(from: from, upto: upto).count / count.to_f
|
95
|
+
end
|
96
|
+
|
97
|
+
def planes
|
98
|
+
plane_size = 0x10000.to_f
|
99
|
+
inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys
|
100
|
+
end
|
101
|
+
|
102
|
+
def plane(num)
|
103
|
+
validate_plane_number(num)
|
104
|
+
section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
|
105
|
+
end
|
106
|
+
|
107
|
+
def member_in_plane?(num)
|
108
|
+
validate_plane_number(num)
|
109
|
+
((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
|
110
|
+
end
|
111
|
+
|
70
112
|
private
|
71
113
|
|
114
|
+
def validate_plane_number(num)
|
115
|
+
num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
|
116
|
+
end
|
117
|
+
|
72
118
|
def str!(obj)
|
73
119
|
raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
|
74
120
|
obj
|
75
121
|
end
|
76
122
|
|
77
123
|
def make_new_str(original, &block)
|
78
|
-
new_string = str!(original)
|
124
|
+
new_string = str!(original)
|
125
|
+
.each_codepoint
|
126
|
+
.each_with_object(''.encode(original.encoding), &block)
|
79
127
|
original.tainted? ? new_string.taint : new_string
|
80
128
|
end
|
81
129
|
end
|
@@ -72,8 +72,8 @@ class CharacterSet
|
|
72
72
|
true
|
73
73
|
elsif other.instance_of?(self.class)
|
74
74
|
@__set == other.instance_variable_get(:@__set)
|
75
|
-
elsif other.is_a?(
|
76
|
-
other.all? { |cp| @__set.include?(cp) }
|
75
|
+
elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
|
76
|
+
size == other.size && other.all? { |cp| @__set.include?(cp) }
|
77
77
|
else
|
78
78
|
false
|
79
79
|
end
|
@@ -70,7 +70,17 @@ class CharacterSet
|
|
70
70
|
merge(enum)
|
71
71
|
end
|
72
72
|
|
73
|
-
#
|
73
|
+
# CharacterSet-specific conversion methods
|
74
|
+
|
75
|
+
def assigned_part
|
76
|
+
self & self.class.assigned
|
77
|
+
end
|
78
|
+
|
79
|
+
def valid_part
|
80
|
+
self - self.class.surrogate
|
81
|
+
end
|
82
|
+
|
83
|
+
# CharacterSet-specific stringification methods
|
74
84
|
|
75
85
|
def to_s(opts = {}, &block)
|
76
86
|
Writer.write(ranges, opts, &block)
|
@@ -82,25 +92,30 @@ class CharacterSet
|
|
82
92
|
|
83
93
|
def inspect
|
84
94
|
len = length
|
85
|
-
"
|
95
|
+
"#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
|
86
96
|
end
|
87
97
|
|
88
|
-
#
|
98
|
+
# C-extension adapter methods. Need overriding in pure fallback.
|
99
|
+
# Parsing kwargs in C is slower, verbose, and kinda deprecated.
|
100
|
+
|
101
|
+
def inversion(include_surrogates: false, upto: 0x10FFFF)
|
102
|
+
ext_inversion(include_surrogates, upto)
|
103
|
+
end
|
89
104
|
|
90
|
-
def
|
91
|
-
|
105
|
+
def section(from:, upto: 0x10FFFF)
|
106
|
+
ext_section(from, upto)
|
92
107
|
end
|
93
108
|
|
94
|
-
def
|
95
|
-
|
109
|
+
def count_in_section(from:, upto: 0x10FFFF)
|
110
|
+
ext_count_in_section(from, upto)
|
96
111
|
end
|
97
112
|
|
98
|
-
def
|
99
|
-
|
113
|
+
def section?(from:, upto: 0x10FFFF)
|
114
|
+
ext_section?(from, upto)
|
100
115
|
end
|
101
116
|
|
102
|
-
def
|
103
|
-
|
117
|
+
def section_ratio(from:, upto: 0x10FFFF)
|
118
|
+
ext_section_ratio(from, upto)
|
104
119
|
end
|
105
120
|
|
106
121
|
#
|
@@ -136,42 +151,38 @@ class CharacterSet
|
|
136
151
|
end
|
137
152
|
|
138
153
|
def divide(&func)
|
139
|
-
block_given? or return enum_for(__method__) { size }
|
140
154
|
require 'set'
|
155
|
+
Set.new(to_a).divide(&func)
|
156
|
+
end
|
157
|
+
RUBY
|
141
158
|
|
142
|
-
|
143
|
-
require 'tsort'
|
144
|
-
|
145
|
-
class << dig = {}
|
146
|
-
include TSort
|
159
|
+
# CharacterSet-specific section methods
|
147
160
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
161
|
+
{
|
162
|
+
ascii: 0..0x7F,
|
163
|
+
bmp: 0..0xFFFF,
|
164
|
+
astral: 0x10000..0x10FFFF,
|
165
|
+
}.each do |section_name, range|
|
166
|
+
klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
167
|
+
def #{section_name}_part
|
168
|
+
section(from: #{range.begin}, upto: #{range.end})
|
169
|
+
end
|
153
170
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
end
|
171
|
+
def #{section_name}_part?
|
172
|
+
section?(from: #{range.begin}, upto: #{range.end})
|
173
|
+
end
|
158
174
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
set
|
164
|
-
else
|
165
|
-
Set.new(classify(&func).values)
|
175
|
+
def #{section_name}_only?
|
176
|
+
#{range.begin == 0 ?
|
177
|
+
"!section?(from: #{range.end}, upto: 0x10FFFF)" :
|
178
|
+
"!section?(from: 0, upto: #{range.begin})"}
|
166
179
|
end
|
167
|
-
end
|
168
180
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
RUBY
|
181
|
+
def #{section_name}_ratio
|
182
|
+
section_ratio(from: #{range.begin}, upto: #{range.end})
|
183
|
+
end
|
184
|
+
RUBY
|
185
|
+
end
|
175
186
|
end # self.included
|
176
187
|
end # SharedMethods
|
177
188
|
end
|