character_set 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
class CharacterSet
  # Wraps a single codepoint and renders it either literally or as an
  # escape sequence in one of several notations.
  class Character
    ENCODING = 'utf-8'.freeze
    # Printable ASCII codepoints, minus `-`, `[`, `\`, `]` and `^`.
    SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)

    attr_accessor :codepoint

    # @param codepoint [Integer, String] a codepoint number, or a String
    #   whose first character supplies the codepoint
    # @raise [ArgumentError] for any other argument type
    def initialize(codepoint)
      self.codepoint =
        case codepoint
        when Integer then codepoint
        when String  then codepoint.ord
        else raise ArgumentError, 'pass an Integer or String'
        end
    end

    # @return [String] the character itself, UTF-8 encoded
    def to_s
      codepoint.chr(ENCODING)
    end

    # @return [String] the codepoint as upper-case hexadecimal digits
    def hex
      codepoint.to_s(16).upcase
    end

    # Renders the character for use in source code or documentation.
    #
    # Safely printable characters are returned as-is unless
    # `opts[:escape_all]` is truthy. Otherwise a given block decides the
    # output; without a block, `opts[:format]` selects the notation
    # (case-insensitive; `-`, `_` and spaces are ignored).
    #
    # @param opts [Hash] supports the keys :escape_all and :format
    # @yieldparam character [Character] self, when escaping is needed
    # @return [String]
    # @raise [ArgumentError] if the format is not supported
    # @raise [RuntimeError] if the format cannot express an astral codepoint
    def escape(opts = {})
      printable = SAFELY_PRINTABLE.include?(codepoint)
      return to_s if printable && !opts[:escape_all]

      return yield(self) if block_given?

      # https://billposer.org/Software/ListOfRepresentations.html
      notation = opts[:format].to_s.downcase.delete('-_ ')
      if ['', 'default', 'es6', 'esnext', 'rb', 'ruby'].include?(notation)
        default_escape(opts)
      elsif ['java', 'javascript', 'js'].include?(notation)
        default_escape(opts, false)
      elsif ['capitalizableu', 'c#', 'csharp', 'd', 'python'].include?(notation)
        capitalizable_u_escape
      elsif ['u+', 'uplus'].include?(notation)
        u_plus_escape
      elsif ['literal', 'raw'].include?(notation)
        to_s
      else
        raise ArgumentError, "unsupported format: #{opts[:format].inspect}"
      end
    end

    # @return [Integer] the codepoint divided by 0x10000 (its plane index)
    def plane
      codepoint / 0x10000
    end

    private

    # \xHH for up to two hex digits, \uHHHH for up to four, \u{H...} beyond
    # that, unless wide escapes are not supported by the target format.
    def default_escape(opts, support_wide_hex = true)
      digits = hex
      return "\\x#{digits.rjust(2, '0')}" if digits.length <= 2
      return "\\u#{digits.rjust(4, '0')}" if digits.length <= 4
      return "\\u{#{digits}}" if support_wide_hex

      raise "#{opts[:format]} does not support escaping astral value #{digits}"
    end

    # \uHHHH for up to four hex digits, \UHHHHHHHH otherwise.
    def capitalizable_u_escape
      digits = hex
      digits.length <= 4 ? "\\u#{digits.rjust(4, '0')}" : "\\U#{digits.rjust(8, '0')}"
    end

    # U+HHHH, zero-padded to at least four hex digits.
    def u_plus_escape
      "U+#{hex.rjust(4, '0')}"
    end
  end
end
|
@@ -0,0 +1,258 @@
|
|
1
|
+
class CharacterSet
  # Predefined, memoized and frozen sets.
  #
  # The module is written to be mixed into an object that provides
  # `from_ranges(*ranges)`; each set is cached in an instance variable of
  # that object. For every public set method `foo`, a `non_foo` variant that
  # returns `foo.inversion` is available through `method_missing`.
  module CommonSets
    def ascii
      @ascii ||= from_ranges(0..0x7F).freeze
    end

    # basic multilingual plane (without the surrogate range D800..DFFF)
    def bmp
      @bmp ||= from_ranges(0..0xD7FF, 0xE000..0xFFFF).freeze
    end

    # ./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
    #
    # The digits and the upper-case letters are separate ranges on purpose:
    # 0x3A..0x40 (`:;<=>?@`) lies between them and is not part of the
    # alphabet documented above.
    def crypt
      @crypt ||= from_ranges(0x2E..0x39, 0x41..0x5A, 0x61..0x7A).freeze
    end

    def newline
      @newline ||= from_ranges(0xA..0xD, 0x85..0x85, 0x2028..0x2029).freeze
    end

    # all codepoints up to 10FFFF (without the surrogate range D800..DFFF)
    def unicode
      @unicode ||= from_ranges(0..0xD7FF, 0xE000..0x10FFFF).freeze
    end

    def url_fragment
      @url_fragment ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x3B,
        0x3D..0x3D,
        0x3F..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_host
      @url_host ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x2E,
        0x30..0x3B,
        0x3D..0x3D,
        0x41..0x5B,
        0x5D..0x5D,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_path
      @url_path ||= from_ranges(
        0x21..0x21,
        0x24..0x3A,
        0x3D..0x3D,
        0x40..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_query
      @url_query ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x3B,
        0x3D..0x3D,
        0x3F..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def whitespace
      @whitespace ||= from_ranges(
        0x9..0x9,
        0xA..0xD,
        0x20..0x20,
        0x85..0x85,
        0xA0..0xA0,
        0x1680..0x1680,
        0x180E..0x180E,
        0x2000..0x200A,
        0x2028..0x2029,
        0x202F..0x202F,
        0x205F..0x205F,
        0x3000..0x3000
      ).freeze
    end

    def emoji
      @emoji ||= from_ranges(
        0x23..0x23,
        0x2A..0x2A,
        0x30..0x39,
        0xA9..0xA9,
        0xAE..0xAE,
        0x203C..0x203C,
        0x2049..0x2049,
        0x2122..0x2122,
        0x2139..0x2139,
        0x2194..0x2199,
        0x21A9..0x21AA,
        0x231A..0x231B,
        0x2328..0x2328,
        0x23CF..0x23CF,
        0x23E9..0x23F3,
        0x23F8..0x23FA,
        0x24C2..0x24C2,
        0x25AA..0x25AB,
        0x25B6..0x25B6,
        0x25C0..0x25C0,
        0x25FB..0x25FE,
        0x2600..0x2604,
        0x260E..0x260E,
        0x2611..0x2611,
        0x2614..0x2615,
        0x2618..0x2618,
        0x261D..0x261D,
        0x2620..0x2620,
        0x2622..0x2623,
        0x2626..0x2626,
        0x262A..0x262A,
        0x262E..0x262F,
        0x2638..0x263A,
        0x2640..0x2640,
        0x2642..0x2642,
        0x2648..0x2653,
        0x2660..0x2660,
        0x2663..0x2663,
        0x2665..0x2666,
        0x2668..0x2668,
        0x267B..0x267B,
        0x267F..0x267F,
        0x2692..0x2697,
        0x2699..0x2699,
        0x269B..0x269C,
        0x26A0..0x26A1,
        0x26AA..0x26AB,
        0x26B0..0x26B1,
        0x26BD..0x26BE,
        0x26C4..0x26C5,
        0x26C8..0x26C8,
        0x26CE..0x26CF,
        0x26D1..0x26D1,
        0x26D3..0x26D4,
        0x26E9..0x26EA,
        0x26F0..0x26F5,
        0x26F7..0x26FA,
        0x26FD..0x26FD,
        0x2702..0x2702,
        0x2705..0x2705,
        0x2708..0x270D,
        0x270F..0x270F,
        0x2712..0x2712,
        0x2714..0x2714,
        0x2716..0x2716,
        0x271D..0x271D,
        0x2721..0x2721,
        0x2728..0x2728,
        0x2733..0x2734,
        0x2744..0x2744,
        0x2747..0x2747,
        0x274C..0x274C,
        0x274E..0x274E,
        0x2753..0x2755,
        0x2757..0x2757,
        0x2763..0x2764,
        0x2795..0x2797,
        0x27A1..0x27A1,
        0x27B0..0x27B0,
        0x27BF..0x27BF,
        0x2934..0x2935,
        0x2B05..0x2B07,
        0x2B1B..0x2B1C,
        0x2B50..0x2B50,
        0x2B55..0x2B55,
        0x3030..0x3030,
        0x303D..0x303D,
        0x3297..0x3297,
        0x3299..0x3299,
        0x1F004..0x1F004,
        0x1F0CF..0x1F0CF,
        0x1F170..0x1F171,
        0x1F17E..0x1F17F,
        0x1F18E..0x1F18E,
        0x1F191..0x1F19A,
        0x1F1E6..0x1F1FF,
        0x1F201..0x1F202,
        0x1F21A..0x1F21A,
        0x1F22F..0x1F22F,
        0x1F232..0x1F23A,
        0x1F250..0x1F251,
        0x1F300..0x1F321,
        0x1F324..0x1F393,
        0x1F396..0x1F397,
        0x1F399..0x1F39B,
        0x1F39E..0x1F3F0,
        0x1F3F3..0x1F3F5,
        0x1F3F7..0x1F4FD,
        0x1F4FF..0x1F53D,
        0x1F549..0x1F54E,
        0x1F550..0x1F567,
        0x1F56F..0x1F570,
        0x1F573..0x1F57A,
        0x1F587..0x1F587,
        0x1F58A..0x1F58D,
        0x1F590..0x1F590,
        0x1F595..0x1F596,
        0x1F5A4..0x1F5A5,
        0x1F5A8..0x1F5A8,
        0x1F5B1..0x1F5B2,
        0x1F5BC..0x1F5BC,
        0x1F5C2..0x1F5C4,
        0x1F5D1..0x1F5D3,
        0x1F5DC..0x1F5DE,
        0x1F5E1..0x1F5E1,
        0x1F5E3..0x1F5E3,
        0x1F5E8..0x1F5E8,
        0x1F5EF..0x1F5EF,
        0x1F5F3..0x1F5F3,
        0x1F5FA..0x1F64F,
        0x1F680..0x1F6C5,
        0x1F6CB..0x1F6D2,
        0x1F6E0..0x1F6E5,
        0x1F6E9..0x1F6E9,
        0x1F6EB..0x1F6EC,
        0x1F6F0..0x1F6F0,
        0x1F6F3..0x1F6F8,
        0x1F910..0x1F93A,
        0x1F93C..0x1F93E,
        0x1F940..0x1F945,
        0x1F947..0x1F94C,
        0x1F950..0x1F96B,
        0x1F980..0x1F997,
        0x1F9C0..0x1F9C0,
        0x1F9D0..0x1F9E6
      ).freeze
    end

    # `non_foo` is reported as available whenever `foo` is a public method.
    def respond_to_missing?(method_name, include_private = false)
      base = method_name[/^non_(.*)/, 1]
      (base && respond_to?(base)) || super
    end

    # Serves `non_foo` as the memoized, frozen inversion of `foo`.
    #
    # Only names for which `foo` is a public method are handled, in line
    # with `respond_to_missing?`. Anything else falls through to `super`, so
    # the NoMethodError names the method that was actually called and
    # private methods (e.g. `exit` via `non_exit`) are never invoked.
    def method_missing(method_name, *args, &block)
      base = method_name[/^non_(.*)/, 1]
      return super unless base && respond_to?(base)

      ivar_name = "@#{method_name}"
      instance_variable_get(ivar_name) ||
        instance_variable_set(ivar_name, public_send(base).inversion.freeze)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class CharacterSet
  module CoreExt
    # Convenience methods for String. Each one hands the receiving string
    # to CharacterSet or to the set passed as argument and returns whatever
    # that call returns.
    module StringExt
      # @return the result of CharacterSet.of for this string
      def character_set
        CharacterSet.of(self)
      end

      # @param set [#cover?]
      # @return the result of `set.cover?` for this string
      def covered_by_character_set?(set)
        set.cover?(self)
      end

      # @param set [#used_by?]
      # @return the result of `set.used_by?` for this string
      def uses_character_set?(set)
        set.used_by?(self)
      end

      # @param set [#delete_in]
      # @return the result of `set.delete_in` for this string
      def delete_character_set(set)
        set.delete_in(self)
      end

      # @param set [#delete_in!]
      # @return the result of `set.delete_in!` for this string
      def delete_character_set!(set)
        set.delete_in!(self)
      end

      # @param set [#keep_in]
      # @return the result of `set.keep_in` for this string
      def keep_character_set(set)
        set.keep_in(self)
      end

      # @param set [#keep_in!]
      # @return the result of `set.keep_in!` for this string
      def keep_character_set!(set)
        set.keep_in!(self)
      end
    end
  end
end

# Make the extension methods available on every String.
::String.send(:include, CharacterSet::CoreExt::StringExt)
|
@@ -0,0 +1,106 @@
|
|
1
|
+
class CharacterSet
  # Builds a CharacterSet from an expression tree of the regexp_parser gem.
  module ExpressionConverter
    module_function

    Error = Class.new(ArgumentError)

    # Recursively converts `expression` into a CharacterSet.
    #
    # The regexp_parser library is required lazily on first use. The order
    # of the `when` branches matters because `case` picks the first class
    # that matches.
    #
    # @param expression [Regexp::Expression::Base]
    # @return [CharacterSet]
    # @raise [Error] for unsupported expressions, for roots, groups or
    #   alternatives holding more than one expression, for multi-character
    #   literals outside of sets, and for arguments that are no expression
    def convert(expression)
      @regexp_parser_required ||= require 'regexp_parser'

      # Converts all children and folds the results with a set operator.
      fold_children = lambda do |operator|
        expression.map { |child| convert(child) }.reduce(operator)
      end
      # Inverts the given set if the expression is negated.
      negate_if_needed = ->(set) { expression.negative? ? set.inversion : set }
      # Empty container => empty set, one child => its set, else an Error.
      only_child = lambda do |error_message|
        case expression.count
        when 0 then CharacterSet[]
        when 1 then convert(expression.first)
        else raise Error, error_message
        end
      end
      digits     = -> { CharacterSet.from_ranges(0x30..0x39) }
      hex_digits = -> { CharacterSet.from_ranges(0x30..0x39, 0x41..0x46, 0x61..0x66) }
      spaces     = -> { CharacterSet["\t", "\n", "\v", "\f", "\r", ' '] }
      word_chars = -> { CharacterSet.from_ranges(0x30..0x39, 0x41..0x5A, 0x5F..0x5F, 0x61..0x7A) }

      case expression
      when Regexp::Expression::Root
        unless expression.count == 1
          raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
        end

        convert(expression[0])

      when Regexp::Expression::CharacterSet
        negate_if_needed.call(fold_children.call(:+))

      when Regexp::Expression::CharacterSet::Intersection
        fold_children.call(:&)

      when Regexp::Expression::CharacterSet::IntersectedSequence
        fold_children.call(:+)

      when Regexp::Expression::CharacterSet::Range
        lower, upper = expression.map { |child| convert(child) }
        CharacterSet.from_ranges((lower.min)..(upper.max))

      when Regexp::Expression::CharacterType::Any      then CharacterSet.unicode
      when Regexp::Expression::CharacterType::Digit    then digits.call
      when Regexp::Expression::CharacterType::NonDigit then digits.call.inversion
      when Regexp::Expression::CharacterType::Hex      then hex_digits.call
      when Regexp::Expression::CharacterType::NonHex   then hex_digits.call.inversion
      when Regexp::Expression::CharacterType::Space    then spaces.call
      when Regexp::Expression::CharacterType::NonSpace then spaces.call.inversion
      when Regexp::Expression::CharacterType::Word     then word_chars.call
      when Regexp::Expression::CharacterType::NonWord  then word_chars.call.inversion

      when Regexp::Expression::EscapeSequence::CodepointList
        CharacterSet.new(expression.codepoints)

      when Regexp::Expression::EscapeSequence::Base
        CharacterSet[expression.codepoint]

      when Regexp::Expression::Group::Capture,
           Regexp::Expression::Group::Passive,
           Regexp::Expression::Group::Named,
           Regexp::Expression::Group::Atomic,
           Regexp::Expression::Group::Options
        only_child.call('Groups must contain exactly one expression, e.g. ([a-z])')

      when Regexp::Expression::Alternation
        fold_children.call(:+)

      when Regexp::Expression::Alternative
        only_child.call('Alternatives must contain exactly one expression')

      when Regexp::Expression::Literal
        if expression.set_level == 0 && expression.text.size != 1
          raise Error, 'Literal runs outside of sets are codepoint *sequences*'
        end

        CharacterSet[expression.text.ord]

      when Regexp::Expression::UnicodeProperty::Base,
           Regexp::Expression::PosixClass
        negate_if_needed.call(CharacterSet.of_property(expression.token))

      when Regexp::Expression::Base
        raise Error, "Unsupported expression class `#{expression.class}`"

      else
        raise Error, "Pass an expression (result of Regexp::Parser.parse)"
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
class CharacterSet
  # Helpers that turn user input into codepoints.
  module Parser
    module_function

    # Validates the first element of `object` and returns codepoints.
    #
    # Only the first element is inspected. If it is a valid codepoint
    # Integer, `object` is returned as-is; if it is a one-character String,
    # every element is mapped with #ord. When `object` yields nothing, the
    # return value of its #each is passed through.
    #
    # @param object [#each]
    # @raise [ArgumentError] if `object` does not respond to #each or its
    #   first element is neither a codepoint nor a one-character String
    def codepoints_from_enumerable(object)
      raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
      # Use #each to check first element (only this works for all Enumerables)
      object.each do |e|
        return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
        return object.map(&:ord) if e.is_a?(String) && e.length == 1
        raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
      end
    end

    # Expands a simple bracket expression such as `[a-cx]` into codepoints.
    #
    # Surrounding brackets and a leading `^` are stripped (negation is not
    # applied here). Supported escapes are evaluated, and `x-y` is expanded
    # to the inclusive range of codepoints. A hyphen that does not sit
    # between two characters is a literal hyphen.
    #
    # @param string [String]
    # @return [Array<Integer>]
    # @raise [ArgumentError] if `string` is not a String or uses syntax
    #   beyond literals, ranges and \u, \U, \x escapes
    def codepoints_from_bracket_expression(string)
      raise ArgumentError, 'pass a String' unless string.is_a?(String)
      raise ArgumentError, 'advanced syntax' if string =~ /\\[^uUx]|[^\\]\[|&&/

      literal_content = eval_escapes(strip_brackets(string))

      codepoints = []
      prev_chr = nil
      in_range = false

      literal_content.each_char do |chr|
        if chr == '-' && prev_chr && prev_chr != '\\' && prev_chr != '-'
          in_range = true
          next
        end

        if in_range
          # prev_chr itself has already been emitted, so start one after it
          codepoints.concat(((prev_chr.ord + 1)..(chr.ord)).to_a)
        else
          codepoints << chr.ord
        end
        in_range = false
        prev_chr = chr
      end

      # A range that was opened but never closed is a trailing hyphen, as in
      # `[a-]`; it stands for the hyphen itself and must not be dropped.
      codepoints << '-'.ord if in_range
      codepoints
    end

    # Removes surrounding brackets and a leading `^`, if present. Without
    # brackets a copy of the string is returned. The `m` flag lets content
    # that contains newlines be stripped as well.
    def strip_brackets(string)
      string[/\A\[\^?(.*)\]\z/m, 1] || string.dup
    end

    # Replaces \UHHHHHHHH, \uHHHH, U+H..., \xHH and \u{H...} escapes with
    # the character they denote.
    def eval_escapes(string)
      string.gsub(/\\U(\h{8})|\\u(\h{4})|U\+(\h+)|\\x(\h{2})|\\u\{(\h+)\}/) do
        # exactly one of the capture groups has matched
        Regexp.last_match.captures.compact.first.to_i(16).chr('utf-8')
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'character_set'
|
2
|
+
require 'character_set/ruby_fallback'
|
3
|
+
|
4
|
+
# CharacterSet::Pure uses only Ruby implementations.
|
5
|
+
# It is equal to CharacterSet if the C ext can't be loaded.
|
6
|
+
class CharacterSet
  # Variant of CharacterSet that is assembled from the Ruby fallback
  # modules and the shared mixins only.
  class Pure
    # Module#prepend handles its arguments in reverse order, so
    # RubyFallback is prepended first and SetMethodAdapters ends up in
    # front of it in the ancestor chain.
    prepend CharacterSet::SetMethodAdapters, CharacterSet::RubyFallback
    include CharacterSet::SharedMethods
    extend CharacterSet::CommonSets
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
class CharacterSet
  module RubyFallback
    # Pure-Ruby implementations of the CharacterSet-specific methods.
    #
    # The instance methods rely on the host providing #include?, #each,
    # #<<, #dup and #to_a; the class methods rely on the host's .new
    # accepting a collection of codepoints.
    module CharacterSetMethods
      module ClassMethods
        # @param ranges [Array<Range>] ranges of codepoints
        # @return a new set holding every member of the given ranges
        def from_ranges(*ranges)
          new(Array(ranges).flat_map(&:to_a))
        end

        # @param string [String]
        # @return a new set holding the codepoints used in `string`
        # @raise [ArgumentError] if `string` is not a String
        def of(string)
          raise ArgumentError, 'pass a String' unless string.is_a?(String)
          new(string.codepoints)
        end
      end

      # Builds the complement of this set within 0..upto.
      #
      # @param include_surrogates [Boolean] whether D800..DFFF may be part
      #   of the result
      # @param upto [Integer] highest codepoint to consider
      # @return a new set of the same class
      def inversion(include_surrogates: false, upto: 0x10FFFF)
        new_set = self.class.new
        0.upto(upto) do |cp|
          next unless include_surrogates || cp > 0xDFFF || cp < 0xD800
          new_set << cp unless include?(cp)
        end
        new_set
      end

      # @return a copy that additionally holds the case-swapped counterpart
      #   of each member, where that counterpart is a single codepoint
      def case_insensitive
        new_set = dup
        each do |cp|
          swapped_cps = cp.chr('utf-8').swapcase.codepoints
          swapped_cps.size == 1 && new_set << swapped_cps[0]
        end
        new_set
      end

      # @return the result of RangeCompressor.compress for this set
      #   (the range_compressor library is required lazily)
      def ranges
        @range_compressor_required ||= require 'range_compressor'
        RangeCompressor.compress(self)
      end

      # Picks one random member, or `count` random members.
      # NOTE(review): the meaning of the `true` passed to #to_a is defined
      # by the host class and not visible here - confirm.
      def sample(count = nil)
        count.nil? ? to_a(true).sample : to_a(true).sample(count)
      end

      # @return [Boolean] whether any codepoint of `string` is in this set
      # @raise [ArgumentError] if `string` does not respond to #codepoints
      def used_by?(string)
        str!(string).each_codepoint { |cp| return true if include?(cp) }
        false
      end

      # @return [Boolean] whether all codepoints of `string` are in this set
      # @raise [ArgumentError] if `string` does not respond to #codepoints
      def cover?(string)
        str!(string).each_codepoint { |cp| return false unless include?(cp) }
        true
      end

      # @return [String] a new string without the codepoints in this set
      def delete_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) || (new_str << cp) }
      end

      # In-place variant of #delete_in.
      # @return [String, nil] `string`, or nil if its size did not change
      def delete_in!(string)
        result = delete_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      # @return [String] a new string with only the codepoints in this set
      def keep_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) && (new_str << cp) }
      end

      # In-place variant of #keep_in.
      # @return [String, nil] `string`, or nil if its size did not change
      def keep_in!(string)
        result = keep_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      private

      # Returns `obj` unchanged if it looks like a String.
      def str!(obj)
        raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
        obj
      end

      # Yields each codepoint of `original` together with a fresh String
      # and returns that String.
      def make_new_str(original, &block)
        # ''.dup guarantees a mutable buffer whatever the literal settings
        new_string = str!(original).each_codepoint.each_with_object(''.dup, &block)
        # Object#tainted? was deprecated in Ruby 2.7 and removed in 3.2, so
        # taint is only carried over on Rubies that still have the method.
        return new_string unless original.respond_to?(:tainted?) && original.tainted?

        new_string.taint
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
class CharacterSet
  module RubyFallback
    # Pure-Ruby implementations of the plane-related methods. A plane is a
    # block of 0x10000 consecutive codepoints; plane 0 is the BMP.
    # The host must provide #dup, #keep_if, #each and #include?.
    module PlaneMethods
      # @return a copy holding only the members below 0x10000
      def bmp_part
        dup.keep_if { |cp| cp <= 0xFFFF }
      end

      # @return a copy holding only the members from 0x10000 upwards
      def astral_part
        dup.keep_if { |cp| cp > 0xFFFF }
      end

      # @return [Array<Integer>] indices of all planes that hold at least
      #   one member, in order of first occurrence
      def planes
        seen = []
        each do |cp|
          index = cp / 0x10000
          seen << index unless seen.include?(index)
        end
        seen
      end

      # @param num [Integer] plane index
      # @return [Boolean] whether any codepoint of that plane is a member
      def member_in_plane?(num)
        first = num * 0x10000
        beyond = (num + 1) * 0x10000
        (first...beyond).any? { |cp| include?(cp) }
      end
    end
  end
end
|