character_set 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.travis.yml +11 -0
- data/BENCHMARK.md +50 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +180 -0
- data/Rakefile +137 -0
- data/benchmarks/cover.rb +25 -0
- data/benchmarks/delete_in.rb +25 -0
- data/benchmarks/keep_in.rb +25 -0
- data/benchmarks/shared.rb +25 -0
- data/benchmarks/used_by.rb +25 -0
- data/bin/console +19 -0
- data/bin/setup +8 -0
- data/character_set.gemspec +34 -0
- data/ext/character_set/character_set.c +814 -0
- data/ext/character_set/extconf.rb +5 -0
- data/ext/character_set/unicode_casefold_table.h +1387 -0
- data/lib/character_set/character.rb +76 -0
- data/lib/character_set/common_sets.rb +258 -0
- data/lib/character_set/core_ext/regexp_ext.rb +11 -0
- data/lib/character_set/core_ext/string_ext.rb +35 -0
- data/lib/character_set/core_ext.rb +3 -0
- data/lib/character_set/expression_converter.rb +106 -0
- data/lib/character_set/parser.rb +48 -0
- data/lib/character_set/pure.rb +13 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +83 -0
- data/lib/character_set/ruby_fallback/plane_methods.rb +27 -0
- data/lib/character_set/ruby_fallback/set_methods.rb +103 -0
- data/lib/character_set/ruby_fallback.rb +21 -0
- data/lib/character_set/set_method_adapters.rb +39 -0
- data/lib/character_set/shared_methods.rb +155 -0
- data/lib/character_set/version.rb +3 -0
- data/lib/character_set/writer.rb +37 -0
- data/lib/character_set.rb +21 -0
- metadata +193 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
class CharacterSet
  # A single codepoint plus helpers to render it either literally or in one
  # of several escape notations.
  class Character
    # Encoding used when converting the codepoint to a String.
    ENCODING = 'utf-8'.freeze

    # Codepoints 0x21..0x7E without `-`, `[`, `\`, `]` and `^`. #escape
    # returns these characters verbatim unless :escape_all is given.
    # NOTE(review): the excluded characters are presumably left out because
    # they are syntax inside bracket expressions - confirm.
    # Frozen so that the shared constant cannot be mutated by accident.
    SAFELY_PRINTABLE =
      ((0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)).freeze

    attr_accessor :codepoint

    # @param codepoint [Integer, String] an Integer is stored as is; for a
    #   String the result of String#ord is stored
    # @raise [ArgumentError] if neither an Integer nor a String is passed
    def initialize(codepoint)
      case codepoint
      when Integer then self.codepoint = codepoint
      when String  then self.codepoint = codepoint.ord
      else raise ArgumentError, 'pass an Integer or String'
      end
    end

    # @return [String] the character for this codepoint, encoded as ENCODING
    def to_s
      codepoint.chr(ENCODING)
    end

    # @return [String] the codepoint as uppercase hexadecimal, unpadded
    def hex
      codepoint.to_s(16).upcase
    end

    # Renders the character, escaping it unless it is safely printable.
    #
    # @param opts [Hash] recognised keys:
    #   :escape_all - when truthy, safely printable characters are escaped too
    #   :format     - name of the escape notation; case, `-`, `_` and spaces
    #                 are ignored when matching it against the names below
    # @yield [Character] if a block is given, its return value is used as the
    #   escaped form and :format is ignored
    # @return [String]
    # @raise [ArgumentError] if :format names no supported notation
    def escape(opts = {})
      return to_s if SAFELY_PRINTABLE.include?(codepoint) && !opts[:escape_all]

      return yield(self) if block_given?

      # https://billposer.org/Software/ListOfRepresentations.html
      case opts[:format].to_s.downcase.delete('-_ ')
      when '', 'default', 'es6', 'esnext', 'rb', 'ruby'
        default_escape(opts)
      when 'java', 'javascript', 'js'
        # second argument disables the \u{...} form for astral codepoints
        default_escape(opts, false)
      when 'capitalizableu', 'c#', 'csharp', 'd', 'python'
        capitalizable_u_escape
      when 'u+', 'uplus'
        u_plus_escape
      when 'literal', 'raw'
        to_s
      else
        raise ArgumentError, "unsupported format: #{opts[:format].inspect}"
      end
    end

    # @return [Integer] the codepoint divided by 0x10000 (integer division)
    def plane
      codepoint / 0x10000
    end

    private

    # \xXX for up to two hex digits, \uXXXX for up to four, \u{X...} beyond
    # that. Raises (RuntimeError) for longer values when support_wide_hex is
    # false.
    def default_escape(opts, support_wide_hex = true)
      if hex.length <= 2
        '\\x' + hex.rjust(2, '0')
      elsif hex.length <= 4
        '\\u' + hex.rjust(4, '0')
      elsif support_wide_hex
        '\\u{' + hex + '}'
      else
        raise "#{opts[:format]} does not support escaping astral value #{hex}"
      end
    end

    # \uXXXX for up to four hex digits, otherwise \UXXXXXXXX (padded to eight).
    def capitalizable_u_escape
      if hex.length <= 4
        '\\u' + hex.rjust(4, '0')
      else
        '\\U' + hex.rjust(8, '0')
      end
    end

    # U+XXXX, zero-padded to at least four hex digits.
    def u_plus_escape
      'U+' + hex.rjust(4, '0')
    end
  end
end
|
@@ -0,0 +1,258 @@
|
|
1
|
+
class CharacterSet
  # Predefined, memoized and frozen sets. The module is written to be used
  # with `extend`; the extending object must provide `from_ranges`, and the
  # objects returned by it must respond to `inversion` for the `non_*`
  # variants to work.
  module CommonSets
    def ascii
      @ascii ||= from_ranges(0..0x7F).freeze
    end

    # basic multilingual plane
    def bmp
      @bmp ||= from_ranges(0..0xD7FF, 0xE000..0xFFFF).freeze
    end

    # ./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
    def crypt
      # 0x2E..0x39 is "./0-9". The range must stop there: running on to 0x5A
      # would also pull in ":;<=>?@" (0x3A..0x40), which are not part of the
      # alphabet listed above.
      @crypt ||= from_ranges(0x2E..0x39, 0x41..0x5A, 0x61..0x7A).freeze
    end

    def newline
      @newline ||= from_ranges(0xA..0xD, 0x85..0x85, 0x2028..0x2029).freeze
    end

    # all codepoints up to 0x10FFFF except the surrogate range 0xD800..0xDFFF
    def unicode
      @unicode ||= from_ranges(0..0xD7FF, 0xE000..0x10FFFF).freeze
    end

    def url_fragment
      @url_fragment ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x3B,
        0x3D..0x3D,
        0x3F..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_host
      @url_host ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x2E,
        0x30..0x3B,
        0x3D..0x3D,
        0x41..0x5B,
        0x5D..0x5D,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_path
      @url_path ||= from_ranges(
        0x21..0x21,
        0x24..0x3A,
        0x3D..0x3D,
        0x40..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def url_query
      @url_query ||= from_ranges(
        0x21..0x21,
        0x24..0x24,
        0x26..0x3B,
        0x3D..0x3D,
        0x3F..0x5A,
        0x5F..0x5F,
        0x61..0x7A,
        0x7E..0x7E
      ).freeze
    end

    def whitespace
      @whitespace ||= from_ranges(
        0x9..0x9,
        0xA..0xD,
        0x20..0x20,
        0x85..0x85,
        0xA0..0xA0,
        0x1680..0x1680,
        0x180E..0x180E,
        0x2000..0x200A,
        0x2028..0x2029,
        0x202F..0x202F,
        0x205F..0x205F,
        0x3000..0x3000
      ).freeze
    end

    def emoji
      @emoji ||= from_ranges(
        0x23..0x23,
        0x2A..0x2A,
        0x30..0x39,
        0xA9..0xA9,
        0xAE..0xAE,
        0x203C..0x203C,
        0x2049..0x2049,
        0x2122..0x2122,
        0x2139..0x2139,
        0x2194..0x2199,
        0x21A9..0x21AA,
        0x231A..0x231B,
        0x2328..0x2328,
        0x23CF..0x23CF,
        0x23E9..0x23F3,
        0x23F8..0x23FA,
        0x24C2..0x24C2,
        0x25AA..0x25AB,
        0x25B6..0x25B6,
        0x25C0..0x25C0,
        0x25FB..0x25FE,
        0x2600..0x2604,
        0x260E..0x260E,
        0x2611..0x2611,
        0x2614..0x2615,
        0x2618..0x2618,
        0x261D..0x261D,
        0x2620..0x2620,
        0x2622..0x2623,
        0x2626..0x2626,
        0x262A..0x262A,
        0x262E..0x262F,
        0x2638..0x263A,
        0x2640..0x2640,
        0x2642..0x2642,
        0x2648..0x2653,
        0x2660..0x2660,
        0x2663..0x2663,
        0x2665..0x2666,
        0x2668..0x2668,
        0x267B..0x267B,
        0x267F..0x267F,
        0x2692..0x2697,
        0x2699..0x2699,
        0x269B..0x269C,
        0x26A0..0x26A1,
        0x26AA..0x26AB,
        0x26B0..0x26B1,
        0x26BD..0x26BE,
        0x26C4..0x26C5,
        0x26C8..0x26C8,
        0x26CE..0x26CF,
        0x26D1..0x26D1,
        0x26D3..0x26D4,
        0x26E9..0x26EA,
        0x26F0..0x26F5,
        0x26F7..0x26FA,
        0x26FD..0x26FD,
        0x2702..0x2702,
        0x2705..0x2705,
        0x2708..0x270D,
        0x270F..0x270F,
        0x2712..0x2712,
        0x2714..0x2714,
        0x2716..0x2716,
        0x271D..0x271D,
        0x2721..0x2721,
        0x2728..0x2728,
        0x2733..0x2734,
        0x2744..0x2744,
        0x2747..0x2747,
        0x274C..0x274C,
        0x274E..0x274E,
        0x2753..0x2755,
        0x2757..0x2757,
        0x2763..0x2764,
        0x2795..0x2797,
        0x27A1..0x27A1,
        0x27B0..0x27B0,
        0x27BF..0x27BF,
        0x2934..0x2935,
        0x2B05..0x2B07,
        0x2B1B..0x2B1C,
        0x2B50..0x2B50,
        0x2B55..0x2B55,
        0x3030..0x3030,
        0x303D..0x303D,
        0x3297..0x3297,
        0x3299..0x3299,
        0x1F004..0x1F004,
        0x1F0CF..0x1F0CF,
        0x1F170..0x1F171,
        0x1F17E..0x1F17F,
        0x1F18E..0x1F18E,
        0x1F191..0x1F19A,
        0x1F1E6..0x1F1FF,
        0x1F201..0x1F202,
        0x1F21A..0x1F21A,
        0x1F22F..0x1F22F,
        0x1F232..0x1F23A,
        0x1F250..0x1F251,
        0x1F300..0x1F321,
        0x1F324..0x1F393,
        0x1F396..0x1F397,
        0x1F399..0x1F39B,
        0x1F39E..0x1F3F0,
        0x1F3F3..0x1F3F5,
        0x1F3F7..0x1F4FD,
        0x1F4FF..0x1F53D,
        0x1F549..0x1F54E,
        0x1F550..0x1F567,
        0x1F56F..0x1F570,
        0x1F573..0x1F57A,
        0x1F587..0x1F587,
        0x1F58A..0x1F58D,
        0x1F590..0x1F590,
        0x1F595..0x1F596,
        0x1F5A4..0x1F5A5,
        0x1F5A8..0x1F5A8,
        0x1F5B1..0x1F5B2,
        0x1F5BC..0x1F5BC,
        0x1F5C2..0x1F5C4,
        0x1F5D1..0x1F5D3,
        0x1F5DC..0x1F5DE,
        0x1F5E1..0x1F5E1,
        0x1F5E3..0x1F5E3,
        0x1F5E8..0x1F5E8,
        0x1F5EF..0x1F5EF,
        0x1F5F3..0x1F5F3,
        0x1F5FA..0x1F64F,
        0x1F680..0x1F6C5,
        0x1F6CB..0x1F6D2,
        0x1F6E0..0x1F6E5,
        0x1F6E9..0x1F6E9,
        0x1F6EB..0x1F6EC,
        0x1F6F0..0x1F6F0,
        0x1F6F3..0x1F6F8,
        0x1F910..0x1F93A,
        0x1F93C..0x1F93E,
        0x1F940..0x1F945,
        0x1F947..0x1F94C,
        0x1F950..0x1F96B,
        0x1F980..0x1F997,
        0x1F9C0..0x1F9C0,
        0x1F9D0..0x1F9E6
      ).freeze
    end

    # A `non_<name>` method is reported as available whenever `<name>` is a
    # publicly callable method of the receiver.
    def respond_to_missing?(method_name, include_private = false)
      (base = method_name[/^non_(.*)/, 1]) && respond_to?(base) || super
    end

    # Implements `non_<name>`: the frozen, memoized inversion of the set
    # returned by `<name>` (e.g. `non_ascii`).
    #
    # The base method is only dispatched when it is publicly callable, in
    # line with #respond_to_missing?. Without that guard, names such as
    # `non_exit` would reach private Kernel methods.
    def method_missing(method_name, *args, &block)
      base = method_name[/^non_(.*)/, 1]
      return super unless base && respond_to?(base)

      ivar_name = "@#{method_name}"
      instance_variable_get(ivar_name) ||
        instance_variable_set(ivar_name, public_send(base).inversion.freeze)
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class CharacterSet
  module CoreExt
    # String-side shortcuts. Every method simply hands the receiving string
    # to the matching method of CharacterSet or of the passed set.
    module StringExt
      # @return the result of CharacterSet.of for this string
      def character_set
        CharacterSet.of(self)
      end

      # @return the result of set.cover? for this string
      def covered_by_character_set?(set)
        set.cover?(self)
      end

      # @return the result of set.used_by? for this string
      def uses_character_set?(set)
        set.used_by?(self)
      end

      # @return the result of set.delete_in for this string
      def delete_character_set(set)
        set.delete_in(self)
      end

      # @return the result of set.delete_in! for this string
      def delete_character_set!(set)
        set.delete_in!(self)
      end

      # @return the result of set.keep_in for this string
      def keep_character_set(set)
        set.keep_in(self)
      end

      # @return the result of set.keep_in! for this string
      def keep_character_set!(set)
        set.keep_in!(self)
      end
    end
  end
end

# Mix the shortcuts into the core String class.
::String.class_eval { include CharacterSet::CoreExt::StringExt }
|
@@ -0,0 +1,106 @@
|
|
1
|
+
class CharacterSet
  # Turns an expression tree (Regexp::Expression::* objects, loaded via
  # `require 'regexp_parser'`) into a CharacterSet by recursing over the tree.
  module ExpressionConverter
    module_function

    # Raised for anything that cannot be expressed as a single set.
    class Error < ArgumentError; end

    # @param expression an expression node; see the branches below for the
    #   node classes that are handled
    # @return the CharacterSet equivalent of the expression
    # @raise [Error] for unsupported nodes, non-expression arguments, and
    #   roots/groups/alternatives/literal runs that hold more than one element
    def convert(expression)
      # lazy-load the parser on first use
      @regexp_parser_required ||= require 'regexp_parser'

      case expression
      when Regexp::Expression::Root
        unless expression.count == 1
          raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
        end
        convert(expression[0])

      when Regexp::Expression::CharacterSet
        # union of all members, inverted for negated sets
        union = expression.map { |child| convert(child) }.reduce(:+)
        return union.inversion if expression.negative?
        union

      when Regexp::Expression::CharacterSet::Intersection
        expression.map { |child| convert(child) }.reduce(:&)

      when Regexp::Expression::CharacterSet::IntersectedSequence
        expression.map { |child| convert(child) }.reduce(:+)

      when Regexp::Expression::CharacterSet::Range
        bounds = expression.map { |child| convert(child) }
        lower = bounds[0].min
        upper = bounds[1].max
        CharacterSet.from_ranges(lower..upper)

      when Regexp::Expression::CharacterType::Any
        CharacterSet.unicode

      when Regexp::Expression::CharacterType::Digit
        CharacterSet.from_ranges(48..57)

      when Regexp::Expression::CharacterType::NonDigit
        digits = CharacterSet.from_ranges(48..57)
        digits.inversion

      when Regexp::Expression::CharacterType::Hex
        CharacterSet.from_ranges(48..57, 65..70, 97..102)

      when Regexp::Expression::CharacterType::NonHex
        hex_digits = CharacterSet.from_ranges(48..57, 65..70, 97..102)
        hex_digits.inversion

      when Regexp::Expression::CharacterType::Space
        CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]

      when Regexp::Expression::CharacterType::NonSpace
        spaces = CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]
        spaces.inversion

      when Regexp::Expression::CharacterType::Word
        CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)

      when Regexp::Expression::CharacterType::NonWord
        word_chars = CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
        word_chars.inversion

      when Regexp::Expression::EscapeSequence::CodepointList
        CharacterSet.new(expression.codepoints)

      when Regexp::Expression::EscapeSequence::Base
        CharacterSet[expression.codepoint]

      when Regexp::Expression::Group::Capture,
           Regexp::Expression::Group::Passive,
           Regexp::Expression::Group::Named,
           Regexp::Expression::Group::Atomic,
           Regexp::Expression::Group::Options
        # an empty group is the empty set, a single child is converted as is
        child_count = expression.count
        if child_count == 0
          CharacterSet[]
        elsif child_count == 1
          convert(expression.first)
        else
          raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
        end

      when Regexp::Expression::Alternation
        expression.map { |child| convert(child) }.reduce(:+)

      when Regexp::Expression::Alternative
        child_count = expression.count
        if child_count == 0
          CharacterSet[]
        elsif child_count == 1
          convert(expression.first)
        else
          raise Error, 'Alternatives must contain exactly one expression'
        end

      when Regexp::Expression::Literal
        # at set level 0 only a single-character literal is accepted
        if expression.set_level == 0 && expression.text.size != 1
          raise Error, 'Literal runs outside of sets are codepoint *sequences*'
        end
        CharacterSet[expression.text.ord]

      when Regexp::Expression::UnicodeProperty::Base,
           Regexp::Expression::PosixClass
        property_set = CharacterSet.of_property(expression.token)
        return property_set.inversion if expression.negative?
        property_set

      when Regexp::Expression::Base
        # any other expression node
        raise Error, "Unsupported expression class `#{expression.class}`"

      else
        raise Error, "Pass an expression (result of Regexp::Parser.parse)"
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
class CharacterSet
  # Helpers that turn user input (enumerables or bracket-expression-like
  # strings) into codepoints.
  module Parser
    module_function

    # Validates only the FIRST element yielded by `object.each` and derives
    # the result from it.
    #
    # @param object [#each] Integers in 0...0x110000 or one-character Strings
    # @return `object` itself if the first element is a valid Integer, or
    #   `object.map(&:ord)` if it is a one-character String; if nothing is
    #   yielded, whatever `object.each` returns
    # @raise [ArgumentError] if `object` lacks #each or the first element is
    #   neither of the above
    def codepoints_from_enumerable(object)
      raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
      # Use #each to check first element (only this works for all Enumerables)
      object.each do |e|
        return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
        return object.map(&:ord) if e.is_a?(String) && e.length == 1
        raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
      end
    end

    # Expands a simple bracket expression such as `[a-c\u0041]` (brackets
    # optional) into an Array of codepoints. A leading `^` inside brackets is
    # stripped but NOT applied here.
    #
    # @param string [String]
    # @return [Array<Integer>]
    # @raise [ArgumentError] for non-Strings, and ('advanced syntax') for
    #   backslash escapes other than \u, \U, \x, nested `[`, or `&&`
    def codepoints_from_bracket_expression(string)
      raise ArgumentError, 'pass a String' unless string.is_a?(String)
      raise ArgumentError, 'advanced syntax' if string =~ /\\[^uUx]|[^\\]\[|&&/

      content = strip_brackets(string)
      literal_content = eval_escapes(content)

      prev_chr = nil
      in_range = false

      codepoints = literal_content.each_char.map do |chr|
        if chr == '-' && prev_chr && prev_chr != '\\' && prev_chr != '-'
          # range operator: remember it, emit nothing until the end is seen
          in_range = true
          nil
        else
          # the range start itself was already emitted as a plain character
          result = in_range ? ((prev_chr.ord + 1)..(chr.ord)).to_a : chr.ord
          in_range = false
          prev_chr = chr
          result
        end
      end.compact.flatten

      # A `-` that was never followed by a range end (e.g. `[a-]`) is a
      # literal hyphen, so it must not be dropped.
      codepoints << '-'.ord if in_range
      codepoints
    end

    # Returns the content between an enclosing `[`/`[^` and `]`, or a copy of
    # the string if it is not bracketed. The /m flag lets the content contain
    # newlines.
    def strip_brackets(string)
      string[/\A\[\^?(.*)\]\z/m, 1] || string.dup
    end

    # Replaces \UXXXXXXXX, \uXXXX, U+X..., \xXX and \u{X...} with the
    # character they denote.
    def eval_escapes(string)
      string.gsub(/\\U(\h{8})|\\u(\h{4})|U\+(\h+)|\\x(\h{2})|\\u\{(\h+)\}/) do
        ($1 || $2 || $3 || $4 || $5).to_i(16).chr('utf-8')
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'character_set'
|
2
|
+
require 'character_set/ruby_fallback'
|
3
|
+
|
4
|
+
# CharacterSet::Pure relies on the Ruby implementations only.
# When the C extension cannot be loaded, CharacterSet is equal to it.
class CharacterSet
  class Pure
    # Statement order matters for method lookup: the module prepended last
    # comes first in the ancestor chain.
    prepend RubyFallback
    prepend SetMethodAdapters
    include SharedMethods
    extend CommonSets
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
class CharacterSet
  module RubyFallback
    # Pure-Ruby implementations of the CharacterSet-specific methods. The
    # including class is expected to behave like a set of Integer codepoints
    # (`include?`, `each`, `<<`, `dup`, `new(enumerable)`).
    module CharacterSetMethods
      module ClassMethods
        # @param ranges [Array<#to_a>] e.g. Ranges of Integer codepoints
        # @return a new set holding every member of every range
        def from_ranges(*ranges)
          new(Array(ranges).flat_map(&:to_a))
        end

        # @param string [String]
        # @return a new set of all codepoints occurring in the string
        # @raise [ArgumentError] unless a String is passed
        def of(string)
          raise ArgumentError, 'pass a String' unless string.is_a?(String)
          new(string.codepoints)
        end
      end

      # @param include_surrogates [Boolean] whether 0xD800..0xDFFF may end up
      #   in the result
      # @param upto [Integer] highest codepoint to consider
      # @return a new set of all codepoints in 0..upto that are not in self
      def inversion(include_surrogates: false, upto: 0x10FFFF)
        new_set = self.class.new
        0.upto(upto) do |cp|
          next unless include_surrogates || cp > 0xDFFF || cp < 0xD800
          new_set << cp unless include?(cp)
        end
        new_set
      end

      # @return a copy that additionally holds the swapcased counterpart of
      #   each member, where swapcasing yields exactly one codepoint
      def case_insensitive
        new_set = dup
        each do |cp|
          swapped_cps = cp.chr('utf-8').swapcase.codepoints
          swapped_cps.size == 1 && new_set << swapped_cps[0]
        end
        new_set
      end

      # @return the result of RangeCompressor.compress for this set
      def ranges
        # Deliberately not memoized in an instance variable: assigning one
        # raises on frozen sets, and Kernel#require is already a no-op once
        # the feature is loaded.
        require 'range_compressor'
        RangeCompressor.compress(self)
      end

      # @param count [Integer, nil] nil samples a single element
      # NOTE(review): relies on a #to_a that accepts an argument, presumably
      # defined elsewhere in the gem - confirm.
      def sample(count = nil)
        count.nil? ? to_a(true).sample : to_a(true).sample(count)
      end

      # @return [Boolean] whether at least one codepoint of string is in self
      def used_by?(string)
        str!(string).each_codepoint { |cp| return true if include?(cp) }
        false
      end

      # @return [Boolean] whether every codepoint of string is in self
      def cover?(string)
        str!(string).each_codepoint { |cp| return false unless include?(cp) }
        true
      end

      # @return [String] new string without the codepoints that are in self
      def delete_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) || (new_str << cp) }
      end

      # In-place variant of #delete_in.
      # @return [String, nil] the modified string, or nil if nothing changed
      def delete_in!(string)
        result = delete_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      # @return [String] new string with only the codepoints that are in self
      def keep_in(string)
        make_new_str(string) { |cp, new_str| include?(cp) && (new_str << cp) }
      end

      # In-place variant of #keep_in.
      # @return [String, nil] the modified string, or nil if nothing changed
      def keep_in!(string)
        result = keep_in(string)
        result.size == string.size ? nil : string.replace(result)
      end

      private

      # Returns obj, raising ArgumentError unless it responds to #codepoints.
      def str!(obj)
        raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
        obj
      end

      # Builds a new String by yielding each codepoint of `original` together
      # with the string under construction.
      def make_new_str(original, &block)
        new_string = str!(original).each_codepoint.each_with_object(''.dup, &block)
        # String#tainted? no longer exists on Ruby >= 3.2; only propagate
        # taint where the concept is still available.
        if original.respond_to?(:tainted?) && original.tainted?
          new_string.taint
        else
          new_string
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
class CharacterSet
  module RubyFallback
    # Pure-Ruby helpers dealing with planes, i.e. blocks of 0x10000
    # consecutive codepoints.
    module PlaneMethods
      # @return a copy containing only the members below 0x10000
      def bmp_part
        copy = dup
        copy.keep_if { |codepoint| codepoint < 0x10000 }
      end

      # @return a copy containing only the members from 0x10000 upwards
      def astral_part
        copy = dup
        copy.keep_if { |codepoint| codepoint >= 0x10000 }
      end

      # @return [Array<Integer>] numbers of all planes that have a member in
      #   this set, in order of first occurrence while iterating
      def planes
        seen = {}
        each do |codepoint|
          number = codepoint.fdiv(0x10000).floor
          seen[number] = true
        end
        seen.keys
      end

      # @param num [Integer] plane number
      # @return [Boolean] whether any codepoint of that plane is in this set
      def member_in_plane?(num)
        first = num * 0x10000
        past_last = (num + 1) * 0x10000
        (first...past_last).any? { |codepoint| include?(codepoint) }
      end
    end
  end
end
|