character_set 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.travis.yml +1 -0
- data/BENCHMARK.md +51 -15
- data/CHANGELOG.md +20 -0
- data/README.md +24 -8
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +1 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +2 -0
- data/ext/character_set/character_set.c +963 -413
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/core_ext/string_ext.rb +2 -0
- data/lib/character_set/expression_converter.rb +21 -24
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +0 -2
- data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
- data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
- data/lib/character_set/shared_methods.rb +51 -40
- data/lib/character_set/version.rb +1 -1
- metadata +54 -3
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,151 @@
|
|
1
|
+
23,23
|
2
|
+
2A,2A
|
3
|
+
30,39
|
4
|
+
A9,A9
|
5
|
+
AE,AE
|
6
|
+
203C,203C
|
7
|
+
2049,2049
|
8
|
+
2122,2122
|
9
|
+
2139,2139
|
10
|
+
2194,2199
|
11
|
+
21A9,21AA
|
12
|
+
231A,231B
|
13
|
+
2328,2328
|
14
|
+
23CF,23CF
|
15
|
+
23E9,23F3
|
16
|
+
23F8,23FA
|
17
|
+
24C2,24C2
|
18
|
+
25AA,25AB
|
19
|
+
25B6,25B6
|
20
|
+
25C0,25C0
|
21
|
+
25FB,25FE
|
22
|
+
2600,2604
|
23
|
+
260E,260E
|
24
|
+
2611,2611
|
25
|
+
2614,2615
|
26
|
+
2618,2618
|
27
|
+
261D,261D
|
28
|
+
2620,2620
|
29
|
+
2622,2623
|
30
|
+
2626,2626
|
31
|
+
262A,262A
|
32
|
+
262E,262F
|
33
|
+
2638,263A
|
34
|
+
2640,2640
|
35
|
+
2642,2642
|
36
|
+
2648,2653
|
37
|
+
265F,2660
|
38
|
+
2663,2663
|
39
|
+
2665,2666
|
40
|
+
2668,2668
|
41
|
+
267B,267B
|
42
|
+
267E,267F
|
43
|
+
2692,2697
|
44
|
+
2699,2699
|
45
|
+
269B,269C
|
46
|
+
26A0,26A1
|
47
|
+
26AA,26AB
|
48
|
+
26B0,26B1
|
49
|
+
26BD,26BE
|
50
|
+
26C4,26C5
|
51
|
+
26C8,26C8
|
52
|
+
26CE,26CF
|
53
|
+
26D1,26D1
|
54
|
+
26D3,26D4
|
55
|
+
26E9,26EA
|
56
|
+
26F0,26F5
|
57
|
+
26F7,26FA
|
58
|
+
26FD,26FD
|
59
|
+
2702,2702
|
60
|
+
2705,2705
|
61
|
+
2708,270D
|
62
|
+
270F,270F
|
63
|
+
2712,2712
|
64
|
+
2714,2714
|
65
|
+
2716,2716
|
66
|
+
271D,271D
|
67
|
+
2721,2721
|
68
|
+
2728,2728
|
69
|
+
2733,2734
|
70
|
+
2744,2744
|
71
|
+
2747,2747
|
72
|
+
274C,274C
|
73
|
+
274E,274E
|
74
|
+
2753,2755
|
75
|
+
2757,2757
|
76
|
+
2763,2764
|
77
|
+
2795,2797
|
78
|
+
27A1,27A1
|
79
|
+
27B0,27B0
|
80
|
+
27BF,27BF
|
81
|
+
2934,2935
|
82
|
+
2B05,2B07
|
83
|
+
2B1B,2B1C
|
84
|
+
2B50,2B50
|
85
|
+
2B55,2B55
|
86
|
+
3030,3030
|
87
|
+
303D,303D
|
88
|
+
3297,3297
|
89
|
+
3299,3299
|
90
|
+
1F004,1F004
|
91
|
+
1F0CF,1F0CF
|
92
|
+
1F170,1F171
|
93
|
+
1F17E,1F17F
|
94
|
+
1F18E,1F18E
|
95
|
+
1F191,1F19A
|
96
|
+
1F1E6,1F1FF
|
97
|
+
1F201,1F202
|
98
|
+
1F21A,1F21A
|
99
|
+
1F22F,1F22F
|
100
|
+
1F232,1F23A
|
101
|
+
1F250,1F251
|
102
|
+
1F300,1F321
|
103
|
+
1F324,1F393
|
104
|
+
1F396,1F397
|
105
|
+
1F399,1F39B
|
106
|
+
1F39E,1F3F0
|
107
|
+
1F3F3,1F3F5
|
108
|
+
1F3F7,1F4FD
|
109
|
+
1F4FF,1F53D
|
110
|
+
1F549,1F54E
|
111
|
+
1F550,1F567
|
112
|
+
1F56F,1F570
|
113
|
+
1F573,1F57A
|
114
|
+
1F587,1F587
|
115
|
+
1F58A,1F58D
|
116
|
+
1F590,1F590
|
117
|
+
1F595,1F596
|
118
|
+
1F5A4,1F5A5
|
119
|
+
1F5A8,1F5A8
|
120
|
+
1F5B1,1F5B2
|
121
|
+
1F5BC,1F5BC
|
122
|
+
1F5C2,1F5C4
|
123
|
+
1F5D1,1F5D3
|
124
|
+
1F5DC,1F5DE
|
125
|
+
1F5E1,1F5E1
|
126
|
+
1F5E3,1F5E3
|
127
|
+
1F5E8,1F5E8
|
128
|
+
1F5EF,1F5EF
|
129
|
+
1F5F3,1F5F3
|
130
|
+
1F5FA,1F64F
|
131
|
+
1F680,1F6C5
|
132
|
+
1F6CB,1F6D2
|
133
|
+
1F6D5,1F6D5
|
134
|
+
1F6E0,1F6E5
|
135
|
+
1F6E9,1F6E9
|
136
|
+
1F6EB,1F6EC
|
137
|
+
1F6F0,1F6F0
|
138
|
+
1F6F3,1F6FA
|
139
|
+
1F7E0,1F7EB
|
140
|
+
1F90D,1F93A
|
141
|
+
1F93C,1F945
|
142
|
+
1F947,1F971
|
143
|
+
1F973,1F976
|
144
|
+
1F97A,1F9A2
|
145
|
+
1F9A5,1F9AA
|
146
|
+
1F9AE,1F9CA
|
147
|
+
1F9CD,1F9FF
|
148
|
+
1FA70,1FA73
|
149
|
+
1FA78,1FA7A
|
150
|
+
1FA80,1FA82
|
151
|
+
1FA90,1FA95
|
@@ -0,0 +1 @@
|
|
1
|
+
D800,DFFF
|
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'set'
|
2
2
|
require 'character_set/ruby_fallback/set_methods'
|
3
|
-
require 'character_set/ruby_fallback/plane_methods'
|
4
3
|
require 'character_set/ruby_fallback/character_set_methods'
|
5
4
|
|
6
5
|
class CharacterSet
|
7
6
|
module RubyFallback
|
8
7
|
include CharacterSet::RubyFallback::SetMethods
|
9
|
-
include CharacterSet::RubyFallback::PlaneMethods
|
10
8
|
include CharacterSet::RubyFallback::CharacterSetMethods
|
11
9
|
|
12
10
|
def self.prepended(klass)
|
@@ -39,9 +39,8 @@ class CharacterSet
|
|
39
39
|
count.nil? ? to_a(true).sample : to_a(true).sample(count)
|
40
40
|
end
|
41
41
|
|
42
|
-
def
|
43
|
-
str!(string).each_codepoint { |cp|
|
44
|
-
false
|
42
|
+
def count_in(string)
|
43
|
+
str!(string).each_codepoint.count { |cp| include?(cp) }
|
45
44
|
end
|
46
45
|
|
47
46
|
def cover?(string)
|
@@ -67,15 +66,64 @@ class CharacterSet
|
|
67
66
|
result.size == string.size ? nil : string.replace(result)
|
68
67
|
end
|
69
68
|
|
69
|
+
def scan(string)
|
70
|
+
encoding = str!(string).encoding
|
71
|
+
string.each_codepoint.inject([]) do |arr, cp|
|
72
|
+
include?(cp) ? arr.push(cp.chr(encoding)) : arr
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def used_by?(string)
|
77
|
+
str!(string).each_codepoint { |cp| return true if include?(cp) }
|
78
|
+
false
|
79
|
+
end
|
80
|
+
|
81
|
+
def section(from:, upto: 0x10FFFF)
|
82
|
+
dup.keep_if { |cp| cp >= from && cp <= upto }
|
83
|
+
end
|
84
|
+
|
85
|
+
def count_in_section(from:, upto: 0x10FFFF)
|
86
|
+
count { |cp| cp >= from && cp <= upto }
|
87
|
+
end
|
88
|
+
|
89
|
+
def section?(from:, upto: 0x10FFFF)
|
90
|
+
any? { |cp| cp >= from && cp <= upto }
|
91
|
+
end
|
92
|
+
|
93
|
+
def section_ratio(from:, upto: 0x10FFFF)
|
94
|
+
section(from: from, upto: upto).count / count.to_f
|
95
|
+
end
|
96
|
+
|
97
|
+
def planes
|
98
|
+
plane_size = 0x10000.to_f
|
99
|
+
inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys
|
100
|
+
end
|
101
|
+
|
102
|
+
def plane(num)
|
103
|
+
validate_plane_number(num)
|
104
|
+
section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
|
105
|
+
end
|
106
|
+
|
107
|
+
def member_in_plane?(num)
|
108
|
+
validate_plane_number(num)
|
109
|
+
((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
|
110
|
+
end
|
111
|
+
|
70
112
|
private
|
71
113
|
|
114
|
+
def validate_plane_number(num)
|
115
|
+
num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
|
116
|
+
end
|
117
|
+
|
72
118
|
def str!(obj)
|
73
119
|
raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
|
74
120
|
obj
|
75
121
|
end
|
76
122
|
|
77
123
|
def make_new_str(original, &block)
|
78
|
-
new_string = str!(original)
|
124
|
+
new_string = str!(original)
|
125
|
+
.each_codepoint
|
126
|
+
.each_with_object(''.encode(original.encoding), &block)
|
79
127
|
original.tainted? ? new_string.taint : new_string
|
80
128
|
end
|
81
129
|
end
|
@@ -72,8 +72,8 @@ class CharacterSet
|
|
72
72
|
true
|
73
73
|
elsif other.instance_of?(self.class)
|
74
74
|
@__set == other.instance_variable_get(:@__set)
|
75
|
-
elsif other.is_a?(
|
76
|
-
other.all? { |cp| @__set.include?(cp) }
|
75
|
+
elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
|
76
|
+
size == other.size && other.all? { |cp| @__set.include?(cp) }
|
77
77
|
else
|
78
78
|
false
|
79
79
|
end
|
@@ -70,7 +70,17 @@ class CharacterSet
|
|
70
70
|
merge(enum)
|
71
71
|
end
|
72
72
|
|
73
|
-
#
|
73
|
+
# CharacterSet-specific conversion methods
|
74
|
+
|
75
|
+
def assigned_part
|
76
|
+
self & self.class.assigned
|
77
|
+
end
|
78
|
+
|
79
|
+
def valid_part
|
80
|
+
self - self.class.surrogate
|
81
|
+
end
|
82
|
+
|
83
|
+
# CharacterSet-specific stringification methods
|
74
84
|
|
75
85
|
def to_s(opts = {}, &block)
|
76
86
|
Writer.write(ranges, opts, &block)
|
@@ -82,25 +92,30 @@ class CharacterSet
|
|
82
92
|
|
83
93
|
def inspect
|
84
94
|
len = length
|
85
|
-
"
|
95
|
+
"#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
|
86
96
|
end
|
87
97
|
|
88
|
-
#
|
98
|
+
# C-extension adapter methods. Need overriding in pure fallback.
|
99
|
+
# Parsing kwargs in C is slower, verbose, and kinda deprecated.
|
100
|
+
|
101
|
+
def inversion(include_surrogates: false, upto: 0x10FFFF)
|
102
|
+
ext_inversion(include_surrogates, upto)
|
103
|
+
end
|
89
104
|
|
90
|
-
def
|
91
|
-
|
105
|
+
def section(from:, upto: 0x10FFFF)
|
106
|
+
ext_section(from, upto)
|
92
107
|
end
|
93
108
|
|
94
|
-
def
|
95
|
-
|
109
|
+
def count_in_section(from:, upto: 0x10FFFF)
|
110
|
+
ext_count_in_section(from, upto)
|
96
111
|
end
|
97
112
|
|
98
|
-
def
|
99
|
-
|
113
|
+
def section?(from:, upto: 0x10FFFF)
|
114
|
+
ext_section?(from, upto)
|
100
115
|
end
|
101
116
|
|
102
|
-
def
|
103
|
-
|
117
|
+
def section_ratio(from:, upto: 0x10FFFF)
|
118
|
+
ext_section_ratio(from, upto)
|
104
119
|
end
|
105
120
|
|
106
121
|
#
|
@@ -136,42 +151,38 @@ class CharacterSet
|
|
136
151
|
end
|
137
152
|
|
138
153
|
def divide(&func)
|
139
|
-
block_given? or return enum_for(__method__) { size }
|
140
154
|
require 'set'
|
155
|
+
Set.new(to_a).divide(&func)
|
156
|
+
end
|
157
|
+
RUBY
|
141
158
|
|
142
|
-
|
143
|
-
require 'tsort'
|
144
|
-
|
145
|
-
class << dig = {}
|
146
|
-
include TSort
|
159
|
+
# CharacterSet-specific section methods
|
147
160
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
161
|
+
{
|
162
|
+
ascii: 0..0x7F,
|
163
|
+
bmp: 0..0xFFFF,
|
164
|
+
astral: 0x10000..0x10FFFF,
|
165
|
+
}.each do |section_name, range|
|
166
|
+
klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
167
|
+
def #{section_name}_part
|
168
|
+
section(from: #{range.begin}, upto: #{range.end})
|
169
|
+
end
|
153
170
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
end
|
171
|
+
def #{section_name}_part?
|
172
|
+
section?(from: #{range.begin}, upto: #{range.end})
|
173
|
+
end
|
158
174
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
set
|
164
|
-
else
|
165
|
-
Set.new(classify(&func).values)
|
175
|
+
def #{section_name}_only?
|
176
|
+
#{range.begin == 0 ?
|
177
|
+
"!section?(from: #{range.end}, upto: 0x10FFFF)" :
|
178
|
+
"!section?(from: 0, upto: #{range.begin})"}
|
166
179
|
end
|
167
|
-
end
|
168
180
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
RUBY
|
181
|
+
def #{section_name}_ratio
|
182
|
+
section_ratio(from: #{range.begin}, upto: #{range.end})
|
183
|
+
end
|
184
|
+
RUBY
|
185
|
+
end
|
175
186
|
end # self.included
|
176
187
|
end # SharedMethods
|
177
188
|
end
|