character_set 1.4.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +28 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +20 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +64 -1
- data/Gemfile +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -9
- data/Rakefile +2 -120
- data/character_set.gemspec +0 -10
- data/ext/character_set/character_set.c +123 -121
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/core_ext/regexp_ext.rb +9 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +40 -56
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +110 -78
- data/lib/character_set/predefined_sets/emoji.cps +16 -14
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
- data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -2
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +25 -11
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
- data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +29 -146
- data/.travis.yml +0 -9
- data/benchmarks/shared.rb +0 -26
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -0,0 +1,385 @@
|
|
1
|
+
# set, vendored from https://github.com/ruby/set/blob/master/lib/set.rb,
# with comments removed and linted.
#
# Hash-backed set: members are the keys of @hash (default value `false`),
# and every member is stored with the value `true`.
class CharacterSet::RubyFallback::Set
  # Makes a bare `Set` inside this class body resolve to this class.
  Set = self
  include Enumerable

  # Builds a set containing the given arguments.
  def self.[](*ary)
    new(ary)
  end

  # @param enum [#each_entry, #each, nil] initial members, if any
  # @yield [o] optional mapping applied to each member before it is added
  def initialize(enum = nil, &block)
    @hash = Hash.new(false)
    return if enum.nil?
    return merge(enum) unless block

    do_with_enum(enum) { |item| add(block.call(item)) }
  end

  # Iterates `enum` via #each_entry if available, else via #each.
  # Raises ArgumentError when `enum` supports neither.
  def do_with_enum(enum, &block)
    iterator = %i[each_entry each].find { |name| enum.respond_to?(name) }
    raise ArgumentError, "value must be enumerable" unless iterator

    enum.public_send(iterator, &block) if block
  end
  private :do_with_enum

  # Gives the copy its own backing hash.
  def initialize_dup(orig)
    super
    @hash = orig.instance_variable_get(:@hash).dup
  end

  # Kernel#initialize_clone takes only one argument on some rubies;
  # define the matching signature for whichever is running.
  if Kernel.instance_method(:initialize_clone).arity == 1
    def initialize_clone(orig)
      super
      @hash = orig.instance_variable_get(:@hash).clone
    end
  else
    def initialize_clone(orig, **options)
      super
      @hash = orig.instance_variable_get(:@hash).clone(**options)
    end
  end

  # Freezes the backing hash along with the set itself.
  def freeze
    @hash.freeze
    super
  end

  # Number of members.
  def size
    @hash.size
  end
  alias length size

  def empty?
    @hash.empty?
  end

  # Removes all members; returns self.
  def clear
    @hash.clear
    self
  end

  def to_a
    @hash.keys
  end

  # Membership test (relies on the hash's `false` default for misses).
  def include?(o)
    @hash[o]
  end
  alias member? include?

  # True if every member of `set` is also a member of self.
  # Raises ArgumentError unless `set` is a Set.
  def superset?(set)
    if set.instance_of?(self.class) && @hash.respond_to?(:>=)
      @hash >= set.instance_variable_get(:@hash)
    elsif set.is_a?(Set)
      size >= set.size && set.all? { |o| include?(o) }
    else
      raise ArgumentError, "value must be a set"
    end
  end
  alias >= superset?

  # Like #superset?, but self must also be strictly larger.
  def proper_superset?(set)
    if set.instance_of?(self.class) && @hash.respond_to?(:>)
      @hash > set.instance_variable_get(:@hash)
    elsif set.is_a?(Set)
      size > set.size && set.all? { |o| include?(o) }
    else
      raise ArgumentError, "value must be a set"
    end
  end
  alias > proper_superset?

  # True if every member of self is also a member of `set`.
  # Raises ArgumentError unless `set` is a Set.
  def subset?(set)
    if set.instance_of?(self.class) && @hash.respond_to?(:<=)
      @hash <= set.instance_variable_get(:@hash)
    elsif set.is_a?(Set)
      size <= set.size && all? { |o| set.include?(o) }
    else
      raise ArgumentError, "value must be a set"
    end
  end
  alias <= subset?

  # Like #subset?, but self must also be strictly smaller.
  def proper_subset?(set)
    if set.instance_of?(self.class) && @hash.respond_to?(:<)
      @hash < set.instance_variable_get(:@hash)
    elsif set.is_a?(Set)
      size < set.size && all? { |o| set.include?(o) }
    else
      raise ArgumentError, "value must be a set"
    end
  end
  alias < proper_subset?

  # Returns -1 / +1 / 0 for proper subset / proper superset / equal sets,
  # and nil when the sets are not comparable or `set` is not a Set.
  def <=>(set)
    return nil unless set.is_a?(Set)

    order = size <=> set.size
    if order == -1
      proper_subset?(set) ? -1 : nil
    elsif order == 1
      proper_superset?(set) ? 1 : nil
    else
      self == set ? 0 : nil
    end
  end

  # True if self and `set` share at least one member.
  # Raises ArgumentError unless `set` is a Set or an Enumerable.
  def intersect?(set)
    if set.is_a?(Set)
      # Iterate the smaller side, probe the larger one.
      smaller, larger = size < set.size ? [self, set] : [set, self]
      smaller.any? { |o| larger.include?(o) }
    elsif set.is_a?(Enumerable)
      set.any? { |o| include?(o) }
    else
      raise ArgumentError, "value must be enumerable"
    end
  end

  def disjoint?(set)
    !intersect?(set)
  end

  # Yields each member; returns self, or a sized Enumerator without a block.
  def each(&block)
    return enum_for(__method__) { size } unless block_given?

    @hash.each_key(&block)
    self
  end

  # Adds `o`; returns self.
  def add(o)
    @hash[o] = true
    self
  end
  alias << add

  # Adds `o` and returns self, or returns nil if `o` was already a member.
  def add?(o)
    include?(o) ? nil : add(o)
  end

  # Removes `o`; returns self.
  def delete(o)
    @hash.delete(o)
    self
  end

  # Removes `o` and returns self, or returns nil if `o` was not a member.
  def delete?(o)
    include?(o) ? delete(o) : nil
  end

  # Removes every member for which the block is truthy; returns self.
  def delete_if
    return enum_for(__method__) { size } unless block_given?

    # Collect first, delete afterwards, so the hash is not mutated mid-iteration.
    doomed = select { |o| yield o }
    doomed.each { |o| @hash.delete(o) }
    self
  end

  # Removes every member for which the block is falsy; returns self.
  def keep_if
    return enum_for(__method__) { size } unless block_given?

    doomed = reject { |o| yield o }
    doomed.each { |o| @hash.delete(o) }
    self
  end

  # Like #delete_if, but returns nil when nothing was removed.
  def reject!(&block)
    return enum_for(__method__) { size } unless block_given?

    size_before = size
    delete_if(&block)
    size == size_before ? nil : self
  end

  # Like #keep_if, but returns nil when nothing was removed.
  def select!(&block)
    return enum_for(__method__) { size } unless block_given?

    size_before = size
    keep_if(&block)
    size == size_before ? nil : self
  end

  alias filter! select!

  # Adds the members of every given enumerable; returns self.
  def merge(*enums, **_rest)
    enums.each do |enum|
      unless enum.instance_of?(self.class)
        do_with_enum(enum) { |o| add(o) }
        next
      end

      # Same class: merge the backing hashes directly.
      @hash.update(enum.instance_variable_get(:@hash))
    end

    self
  end

  # Removes the members of `enum`; returns self.
  def subtract(enum)
    do_with_enum(enum) { |o| delete(o) }
    self
  end

  # Union, as a new set.
  def |(enum)
    dup.merge(enum)
  end
  alias + |
  alias union |

  # Difference, as a new set.
  def -(enum)
    dup.subtract(enum)
  end
  alias difference -

  # Intersection, as a new set of self's class.
  def &(enum)
    common = self.class.new
    if !enum.is_a?(Set)
      do_with_enum(enum) { |o| common.add(o) if include?(o) }
    elsif enum.size > size
      each { |o| common.add(o) if enum.include?(o) }
    else
      enum.each { |o| common.add(o) if include?(o) }
    end
    common
  end
  alias intersection &

  # Symmetric difference, as a new Set.
  def ^(enum)
    result = Set.new(enum)
    each do |o|
      result.add(o) unless result.delete?(o)
    end
    result
  end

  # Sets are equal when they hold the same members.
  def ==(other)
    return true if equal?(other)
    return @hash == other.instance_variable_get(:@hash) if other.instance_of?(self.class)
    return false unless other.is_a?(Set) && size == other.size

    other.all? { |o| @hash.include?(o) }
  end

  def hash
    @hash.hash
  end

  def eql?(o)
    return false unless o.is_a?(Set)

    @hash.eql?(o.instance_variable_get(:@hash))
  end

  alias === include?

  # Groups members by the block's return value.
  # @return [Hash] block value => set (of self's class) of members
  def classify
    return enum_for(__method__) { size } unless block_given?

    each_with_object({}) do |member, groups|
      (groups[yield(member)] ||= self.class.new).add(member)
    end
  end

  # Partitions the set into a Set of subsets. With a one-argument block,
  # members are grouped by block value (see #classify); with a two-argument
  # block, members u and v are connected when the block is truthy for (u, v)
  # and each strongly connected component becomes one subset.
  def divide(&func)
    return enum_for(__method__) { size } unless func
    return Set.new(classify(&func).values) unless func.arity == 2

    require 'tsort'

    # Adjacency hash (node => connected nodes), made traversable by TSort.
    graph = {}
    class << graph
      include TSort

      alias tsort_each_node each_key
      def tsort_each_child(node, &block)
        fetch(node).each(&block)
      end
    end

    each do |u|
      graph[u] = neighbours = []
      each { |v| neighbours << v if func.call(u, v) }
    end

    components = Set.new
    graph.each_strongly_connected_component do |css|
      components.add(self.class.new(css))
    end
    components
  end
end
|
323
|
+
|
324
|
+
# sorted_set without rbtree dependency, vendored from
# https://github.com/ruby/set/blob/72f08c4/lib/set.rb#L731-L800
#
# Members live in the inherited backing hash; their sorted order is cached
# in @keys. Mutators reset the cache to nil so #to_a re-sorts on next use.
class CharacterSet::RubyFallback::SortedSet < CharacterSet::RubyFallback::Set
  def initialize(*args)
    @keys = nil
    super
  end

  def clear
    @keys = nil
    super
  end

  def add(o)
    @keys = nil
    super
  end
  # Re-alias so that << points at this class's #add, not the parent's.
  alias << add

  def delete(o)
    @keys = nil
    @hash.delete(o)
    self
  end

  # Removes members for which the block is truthy; the sorted cache is only
  # dropped if something was actually removed.
  def delete_if
    block_given? or return enum_for(__method__) { size }
    n = @hash.size
    super
    @keys = nil if @hash.size != n
    self
  end

  # Removes members for which the block is falsy; the sorted cache is only
  # dropped if something was actually removed.
  def keep_if
    block_given? or return enum_for(__method__) { size }
    n = @hash.size
    super
    @keys = nil if @hash.size != n
    self
  end

  # Adds the members of every given enumerable; returns self.
  # Takes the same arguments as the parent's #merge, so calls with several
  # enumerables work on sorted sets too.
  def merge(*enums, **_rest)
    @keys = nil
    super
  end

  # Yields members in sorted order; returns self, or a sized Enumerator
  # when no block is given.
  def each(&block)
    block or return enum_for(__method__) { size }
    to_a.each(&block)
    self
  end

  # Sorted members. The sorted array is cached and a copy is handed out,
  # so callers cannot disturb the cache.
  def to_a
    (@keys = @hash.keys).sort! unless @keys
    @keys.dup
  end

  # Fills the sorted cache before freezing, since it cannot be assigned later.
  def freeze
    to_a
    super
  end
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'set'
|
2
1
|
require 'character_set/ruby_fallback/set_methods'
|
3
2
|
require 'character_set/ruby_fallback/character_set_methods'
|
4
3
|
|
@@ -12,8 +11,25 @@ class CharacterSet
|
|
12
11
|
end
|
13
12
|
|
14
13
|
def initialize(enum = [])
|
15
|
-
@__set = SortedSet.new
|
14
|
+
@__set = CharacterSet::RubyFallback::SortedSet.new
|
16
15
|
super
|
17
16
|
end
|
18
17
|
end
|
19
18
|
end
|
19
|
+
|
20
|
+
# Choose the Set / SortedSet classes that back the RubyFallback.
case RUBY_PLATFORM
when /java/i
  # JRuby has sorted_set in the stdlib.
  require 'set'
  CharacterSet::RubyFallback::Set = ::Set
  CharacterSet::RubyFallback::SortedSet = ::SortedSet
else
  # For other rubies, set/sorted_set are vendored due to dependency issues:
  #
  # - issues with default vs. installed gems such as [#2]
  # - issues with the sorted_set dependency rb_tree
  # - long-standing issues in recent versions of sorted_set
  #
  # The RubyFallback, and thus these set classes, are only used for testing,
  # and for exotic rubies which use neither C nor Java.
  require 'character_set/ruby_fallback/vendored_set_classes'
end
|
@@ -22,13 +22,14 @@ class CharacterSet
|
|
22
22
|
|
23
23
|
# Allow some methods to take an Enum just as well as another CharacterSet.
|
24
24
|
# Tested by ruby-spec.
|
25
|
-
%w[& + - ^ | difference
|
25
|
+
%w[& + - ^ | <=> difference disjoint? intersect? intersection
|
26
|
+
subtract union].each do |method|
|
26
27
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
27
28
|
def #{method}(arg)
|
28
29
|
if arg.is_a?(CharacterSet)
|
29
|
-
super
|
30
|
+
super(arg)
|
30
31
|
elsif arg.respond_to?(:each)
|
31
|
-
super(
|
32
|
+
super(self.class.new(arg.to_a))
|
32
33
|
else
|
33
34
|
raise ArgumentError, 'pass an enumerable'
|
34
35
|
end
|
@@ -15,6 +15,12 @@ class CharacterSet
|
|
15
15
|
new(Array(args))
|
16
16
|
end
|
17
17
|
|
18
|
+
# Builds one set from any mix of arguments: Regexp arguments go through
# of_regexp, everything else through of_string, and the per-argument
# results are merged. Without arguments, a new empty instance is returned.
def of(*args)
  sets = args.map do |arg|
    if arg.is_a?(Regexp)
      of_regexp(arg)
    else
      of_string(arg)
    end
  end
  sets.inject { |merged, set| merged.merge(set) } || new
end
|
23
|
+
|
18
24
|
def parse(string)
|
19
25
|
codepoints = Parser.codepoints_from_bracket_expression(string)
|
20
26
|
result = new(codepoints)
|
@@ -22,33 +28,29 @@ class CharacterSet
|
|
22
28
|
end
|
23
29
|
|
24
30
|
def of_property(property_name)
|
25
|
-
require_optional_dependency('regexp_property_values')
|
31
|
+
require_optional_dependency('regexp_property_values', __method__)
|
26
32
|
|
27
33
|
property = RegexpPropertyValues[property_name.to_s]
|
28
34
|
from_ranges(*property.matched_ranges)
|
29
35
|
end
|
30
36
|
|
31
37
|
def of_regexp(regexp)
|
32
|
-
require_optional_dependency('regexp_parser')
|
38
|
+
require_optional_dependency('regexp_parser', __method__)
|
33
39
|
|
34
40
|
root = ::Regexp::Parser.parse(regexp)
|
35
41
|
of_expression(root)
|
36
42
|
end
|
37
43
|
|
38
44
|
def of_expression(expression)
|
39
|
-
ExpressionConverter.convert(expression)
|
45
|
+
ExpressionConverter.convert(expression, self)
|
40
46
|
end
|
41
47
|
|
42
|
-
def require_optional_dependency(name)
|
48
|
+
def require_optional_dependency(name, method)
|
43
49
|
required_optional_dependencies[name] ||= begin
|
44
50
|
require name
|
45
51
|
true
|
46
52
|
rescue ::LoadError
|
47
|
-
|
48
|
-
loc.absolute_path.to_s.include?('/lib/character_set')
|
49
|
-
end
|
50
|
-
method = entry_point && entry_point.label
|
51
|
-
raise LoadError, 'You must the install the optional dependency '\
|
53
|
+
raise LoadError, 'You must install the optional dependency '\
|
52
54
|
"'\#{name}' to use the method `\#{method}'."
|
53
55
|
end
|
54
56
|
end
|
@@ -94,6 +96,14 @@ class CharacterSet
|
|
94
96
|
Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
|
95
97
|
end
|
96
98
|
|
99
|
+
# Builds a random string of `length` characters, each drawn independently
# (so repeats are possible) from this set's members via SecureRandom.
#
# to_a is expected to return Integer codepoints, as required by pack('U*').
#
# @param length [Integer] number of characters to generate (default 32);
#   values below 1 yield an empty string
# @return [String] the generated token
# @raise [ArgumentError] if the set is empty but a non-empty token is requested
def secure_token(length = 32)
  CharacterSet.require_optional_dependency('securerandom', __method__)
  cps = to_a
  len = cps.count
  # SecureRandom.random_number(0) returns a Float, so an empty set would
  # otherwise end in an obscure TypeError from pack. Fail clearly instead.
  if len.zero? && length > 0
    raise ArgumentError, 'cannot build a token from an empty set'
  end
  1.upto(length).map { cps[SecureRandom.random_number(len)] }.pack('U*')
end
alias random_token secure_token
|
106
|
+
|
97
107
|
def inspect
|
98
108
|
len = length
|
99
109
|
"#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
|
@@ -155,8 +165,12 @@ class CharacterSet
|
|
155
165
|
end
|
156
166
|
|
157
167
|
def divide(&func)
|
158
|
-
require '
|
159
|
-
Set.new(to_a).divide(&func)
|
168
|
+
require 'character_set/ruby_fallback'
|
169
|
+
CharacterSet::RubyFallback::Set.new(to_a).divide(&func)
|
170
|
+
end
|
171
|
+
|
172
|
+
def join(separator = '')
|
173
|
+
to_a(true).join(separator)
|
160
174
|
end
|
161
175
|
RUBY
|
162
176
|
|
@@ -0,0 +1,20 @@
|
|
1
|
+
desc 'Run all IPS benchmarks'
task :benchmark do
  # Load (and thereby run) every benchmark script, in sorted filename order.
  benchmark_files = Dir["#{__dir__}/benchmarks/*.rb"].sort
  benchmark_files.each { |file| load(file) }
end

namespace :benchmark do
  desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
  task :write_to_file do
    Rake.application[:benchmark].invoke

    # extract comparison results from reports
    blocks = $benchmark_results.map do |caption, report|
      comparison = report[/(?<=Comparison:).+/m].strip
      "```\n#{caption}\n\n#{comparison}\n```"
    end

    # remove some noise
    results = blocks.join("\n").gsub(/ \(±[^)]+\) |(?<=same-ish).*/, '')

    File.write "#{__dir__}/../BENCHMARK.md",
               "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}\n\n#{results}\n"
  end
end
|
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
2
2
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
4
4
|
rx = /\s/
|
5
|
+
trt = "\t\n\v\f\r\s"
|
5
6
|
cs = CharacterSet.whitespace
|
6
7
|
|
7
8
|
benchmark(
|
8
|
-
caption: 'Removing whitespace',
|
9
|
+
caption: 'Removing ASCII whitespace',
|
9
10
|
cases: {
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
11
13
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
12
14
|
}
|
13
15
|
)
|
14
16
|
|
15
17
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
16
18
|
rx = /[\s\p{emoji}äüö]/
|
19
|
+
trt = "\t\n\v\f\r\s😀-🙏äüö"
|
17
20
|
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
18
21
|
|
19
22
|
benchmark(
|
20
23
|
caption: 'Removing whitespace, emoji and umlauts',
|
21
24
|
cases: {
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
23
27
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
24
28
|
}
|
25
29
|
)
|
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
2
2
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
4
4
|
rx = /\S/
|
5
|
+
trt = "\u{0080}-\u{10FFFF}" # approximation
|
5
6
|
cs = CharacterSet.whitespace
|
6
7
|
|
7
8
|
benchmark(
|
8
9
|
caption: 'Removing non-whitespace',
|
9
10
|
cases: {
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
11
13
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
12
14
|
}
|
13
15
|
)
|
14
16
|
|
15
17
|
str = 'Lorem ipsum ⛷ et dolorem'
|
16
18
|
rx = /\p{^emoji}/
|
19
|
+
trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}"
|
17
20
|
cs = CharacterSet.emoji
|
18
21
|
|
19
22
|
benchmark(
|
20
|
-
caption: '
|
23
|
+
caption: 'Keeping only emoji',
|
21
24
|
cases: {
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
23
27
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
24
28
|
}
|
25
29
|
)
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'benchmark/ips'
|
2
|
+
require_relative '../../lib/character_set'
|
3
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
4
|
+
require 'sorted_set'
|
5
|
+
else
|
6
|
+
require 'set'
|
7
|
+
end
|
8
|
+
|
9
|
+
# Runs a benchmark-ips comparison of the given cases. The output goes to the
# current $stdout and is also captured and stored in the global
# $benchmark_results hash under `caption`.
#
# @param caption [String, nil] heading printed before the report; also the
#   key under which the captured output is stored
# @param cases [Hash] label => callable; each callable is benchmarked
# @return [String] the captured output
def benchmark(caption: nil, cases: {})
  # None of the requires at the top of this file load StringIO explicitly.
  require 'stringio'

  with_stdouts($stdout, string_io = StringIO.new) do
    puts caption
    Benchmark.ips do |x|
      cases.each { |label, callable| x.report(label, &callable) }
      x.compare!
    end
  end
  ($benchmark_results ||= {})[caption] = string_io.string
end
|
19
|
+
|
20
|
+
# Runs the block with $stdout replaced by a "tee": the array of given
# IO-like objects, extended so that any message the Array itself does not
# understand is forwarded to every target. The previous $stdout is restored
# afterwards, even if the block raises.
#
# @param ios [Array] IO-like targets (e.g. $stdout, a StringIO)
# @return the block's return value
def with_stdouts(*ios)
  old_stdout = $stdout
  ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } }
  # Answer capability checks on behalf of the targets instead of asking the
  # IO class object. Assigning to $stdout requires the value to respond to
  # #write, and respond_to?(:puts) etc. should match what is forwarded.
  ios.define_singleton_method(:respond_to_missing?) do |name, include_private = false|
    all? { |io| io.respond_to?(name, include_private) }
  end
  $stdout = ios
  yield
ensure
  $stdout = old_stdout
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
desc 'Download unicode casefold data and write new C header file'
task :sync_casefold_data do
  src_url  = 'https://www.unicode.org/Public/UNIDATA/CaseFolding.txt'
  src_path = './CaseFolding.txt'
  dst_path = "#{__dir__}/../ext/character_set/unicode_casefold_table.h"

  begin
    # -O pins the output file name. Without it, wget saves to
    # "CaseFolding.txt.1" when the file already exists, and stale data would
    # be parsed below. Abort if the download fails.
    system('wget', '-O', src_path, src_url) ||
      raise("Could not download #{src_url}")

    mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
      from, type, to = line.split(/\s*;\s*/).first(3)
      # type 'C' stands for 'common', excludes mappings to multiple chars
      hash[from] = to if type == 'C'
    end.sort
    # NOTE(review): keys are hex strings, so this sort is lexicographic,
    # not numeric. Confirm that the consumer of the table does not rely on
    # numeric order.

    # Fill the template: set the entry count and replace the empty `{}`
    # initializer with one `{0xFROM,0xTO},` line per mapping.
    content = File.read(dst_path + '.tmpl')
      .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
      .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))

    File.write(dst_path, content)
  ensure
    # Remove the downloaded file even when a step above failed.
    File.unlink(src_path) if File.exist?(src_path)
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
desc 'Update codepoint data for predefined sets, based on Onigmo'
task :sync_predefined_sets do
  require 'regexp_property_values'

  target_dir = "#{__dir__}/../lib/character_set/predefined_sets"

  # One .cps file per property: one "MIN,MAX" line (uppercase hex) per
  # matched codepoint range.
  %w[assigned emoji whitespace].each do |prop|
    lines = RegexpPropertyValues[prop].matched_ranges.map do |range|
      "#{range.min.to_s(16)},#{range.max.to_s(16)}\n"
    end
    File.write("#{target_dir}/#{prop}.cps", lines.join.upcase, mode: 'w')
  end
end
|