character_set 1.6.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +20 -29
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -0,0 +1,492 @@
1
+ # set and sorted_set are vendored due to various dependency issues:
2
+ #
3
+ # - issues with default vs. installed gems such as [#2]
4
+ # - issues with the sorted_set dependency rb_tree
5
+ # - long-standing issues in recent versions of sorted_set
6
+ #
7
+ # The RubyFallback (and thus these set classes) is only used for testing,
8
+ # and for exotic rubies which use neither C nor Java.
9
+
10
+ class CharacterSet
11
+ module RubyFallback
12
+ if RUBY_PLATFORM[/java/i]
13
+ # Vendoring is not needed for JRuby, which has sorted_set in the stdlib.
14
+ require 'set'
15
+
16
+ Set = ::Set
17
+ SortedSet = ::SortedSet
18
+ else
19
+ # set, vendored from https://github.com/ruby/set/blob/master/lib/set.rb,
20
+ # with comments removed and linted.
21
+ class Set
22
+ include Enumerable
23
+
24
+ def self.[](*ary)
25
+ new(ary)
26
+ end
27
+
28
+ def initialize(enum = nil, &block)
29
+ @hash = Hash.new(false)
30
+
31
+ enum.nil? and return
32
+
33
+ if block
34
+ do_with_enum(enum) { |o| add(block[o]) }
35
+ else
36
+ merge(enum)
37
+ end
38
+ end
39
+
40
+ def compare_by_identity
41
+ if @hash.respond_to?(:compare_by_identity)
42
+ @hash.compare_by_identity
43
+ self
44
+ else
45
+ raise NotImplementedError, "#{self.class.name}\##{__method__} is not implemented"
46
+ end
47
+ end
48
+
49
+ def compare_by_identity?
50
+ @hash.respond_to?(:compare_by_identity?) && @hash.compare_by_identity?
51
+ end
52
+
53
+ def do_with_enum(enum, &block)
54
+ if enum.respond_to?(:each_entry)
55
+ enum.each_entry(&block) if block
56
+ elsif enum.respond_to?(:each)
57
+ enum.each(&block) if block
58
+ else
59
+ raise ArgumentError, "value must be enumerable"
60
+ end
61
+ end
62
+ private :do_with_enum
63
+
64
+ def initialize_dup(orig)
65
+ super
66
+ @hash = orig.instance_variable_get(:@hash).dup
67
+ end
68
+
69
+ if Kernel.instance_method(:initialize_clone).arity != 1
70
+ def initialize_clone(orig, **options)
71
+ super
72
+ @hash = orig.instance_variable_get(:@hash).clone(**options)
73
+ end
74
+ else
75
+ def initialize_clone(orig)
76
+ super
77
+ @hash = orig.instance_variable_get(:@hash).clone
78
+ end
79
+ end
80
+
81
+ def freeze
82
+ @hash.freeze
83
+ super
84
+ end
85
+
86
+ def size
87
+ @hash.size
88
+ end
89
+ alias length size
90
+
91
+ def empty?
92
+ @hash.empty?
93
+ end
94
+
95
+ def clear
96
+ @hash.clear
97
+ self
98
+ end
99
+
100
+ def replace(enum)
101
+ if enum.instance_of?(self.class)
102
+ @hash.replace(enum.instance_variable_get(:@hash))
103
+ self
104
+ else
105
+ do_with_enum(enum)
106
+ clear
107
+ merge(enum)
108
+ end
109
+ end
110
+
111
+ def to_a
112
+ @hash.keys
113
+ end
114
+
115
+ def to_set(klass = Set, *args, &block)
116
+ return self if instance_of?(Set) && klass == Set && block.nil? && args.empty?
117
+ klass.new(self, *args, &block)
118
+ end
119
+
120
+ def flatten_merge(set, seen = Set.new)
121
+ set.each { |e|
122
+ if e.is_a?(Set)
123
+ if seen.include?(e_id = e.object_id)
124
+ raise ArgumentError, "tried to flatten recursive Set"
125
+ end
126
+
127
+ seen.add(e_id)
128
+ flatten_merge(e, seen)
129
+ seen.delete(e_id)
130
+ else
131
+ add(e)
132
+ end
133
+ }
134
+
135
+ self
136
+ end
137
+ protected :flatten_merge
138
+
139
+ def flatten
140
+ self.class.new.flatten_merge(self)
141
+ end
142
+
143
+ def flatten!
144
+ replace(flatten()) if any? { |e| e.is_a?(Set) }
145
+ end
146
+
147
+ def include?(o)
148
+ @hash[o]
149
+ end
150
+ alias member? include?
151
+
152
+ def superset?(set)
153
+ case
154
+ when set.instance_of?(self.class) && @hash.respond_to?(:>=)
155
+ @hash >= set.instance_variable_get(:@hash)
156
+ when set.is_a?(Set)
157
+ size >= set.size && set.all? { |o| include?(o) }
158
+ else
159
+ raise ArgumentError, "value must be a set"
160
+ end
161
+ end
162
+ alias >= superset?
163
+
164
+ def proper_superset?(set)
165
+ case
166
+ when set.instance_of?(self.class) && @hash.respond_to?(:>)
167
+ @hash > set.instance_variable_get(:@hash)
168
+ when set.is_a?(Set)
169
+ size > set.size && set.all? { |o| include?(o) }
170
+ else
171
+ raise ArgumentError, "value must be a set"
172
+ end
173
+ end
174
+ alias > proper_superset?
175
+
176
+ def subset?(set)
177
+ case
178
+ when set.instance_of?(self.class) && @hash.respond_to?(:<=)
179
+ @hash <= set.instance_variable_get(:@hash)
180
+ when set.is_a?(Set)
181
+ size <= set.size && all? { |o| set.include?(o) }
182
+ else
183
+ raise ArgumentError, "value must be a set"
184
+ end
185
+ end
186
+ alias <= subset?
187
+
188
+ def proper_subset?(set)
189
+ case
190
+ when set.instance_of?(self.class) && @hash.respond_to?(:<)
191
+ @hash < set.instance_variable_get(:@hash)
192
+ when set.is_a?(Set)
193
+ size < set.size && all? { |o| set.include?(o) }
194
+ else
195
+ raise ArgumentError, "value must be a set"
196
+ end
197
+ end
198
+ alias < proper_subset?
199
+
200
+ def <=>(set)
201
+ return unless set.is_a?(Set)
202
+
203
+ case size <=> set.size
204
+ when -1 then -1 if proper_subset?(set)
205
+ when +1 then +1 if proper_superset?(set)
206
+ else 0 if self.==(set)
207
+ end
208
+ end
209
+
210
+ def intersect?(set)
211
+ case set
212
+ when Set
213
+ if size < set.size
214
+ any? { |o| set.include?(o) }
215
+ else
216
+ set.any? { |o| include?(o) }
217
+ end
218
+ when Enumerable
219
+ set.any? { |o| include?(o) }
220
+ else
221
+ raise ArgumentError, "value must be enumerable"
222
+ end
223
+ end
224
+
225
+ def disjoint?(set)
226
+ !intersect?(set)
227
+ end
228
+
229
+ def each(&block)
230
+ block_given? or return enum_for(__method__) { size }
231
+ @hash.each_key(&block)
232
+ self
233
+ end
234
+
235
+ def add(o)
236
+ @hash[o] = true
237
+ self
238
+ end
239
+ alias << add
240
+
241
+ def add?(o)
242
+ add(o) unless include?(o)
243
+ end
244
+
245
+ def delete(o)
246
+ @hash.delete(o)
247
+ self
248
+ end
249
+
250
+ def delete?(o)
251
+ delete(o) if include?(o)
252
+ end
253
+
254
+ def delete_if
255
+ block_given? or return enum_for(__method__) { size }
256
+ select { |o| yield o }.each { |o| @hash.delete(o) }
257
+ self
258
+ end
259
+
260
+ def keep_if
261
+ block_given? or return enum_for(__method__) { size }
262
+ reject { |o| yield o }.each { |o| @hash.delete(o) }
263
+ self
264
+ end
265
+
266
+ def collect!
267
+ block_given? or return enum_for(__method__) { size }
268
+ set = self.class.new
269
+ each { |o| set << yield(o) }
270
+ replace(set)
271
+ end
272
+ alias map! collect!
273
+
274
+ def reject!(&block)
275
+ block_given? or return enum_for(__method__) { size }
276
+ n = size
277
+ delete_if(&block)
278
+ self if size != n
279
+ end
280
+
281
+ def select!(&block)
282
+ block_given? or return enum_for(__method__) { size }
283
+ n = size
284
+ keep_if(&block)
285
+ self if size != n
286
+ end
287
+
288
+ alias filter! select!
289
+
290
+ def merge(*enums, **_rest)
291
+ enums.each do |enum|
292
+ if enum.instance_of?(self.class)
293
+ @hash.update(enum.instance_variable_get(:@hash))
294
+ else
295
+ do_with_enum(enum) { |o| add(o) }
296
+ end
297
+ end
298
+
299
+ self
300
+ end
301
+
302
+ def subtract(enum)
303
+ do_with_enum(enum) { |o| delete(o) }
304
+ self
305
+ end
306
+
307
+ def |(enum)
308
+ dup.merge(enum)
309
+ end
310
+ alias + |
311
+ alias union |
312
+
313
+ def -(enum)
314
+ dup.subtract(enum)
315
+ end
316
+ alias difference -
317
+
318
+ def &(enum)
319
+ n = self.class.new
320
+ if enum.is_a?(Set)
321
+ if enum.size > size
322
+ each { |o| n.add(o) if enum.include?(o) }
323
+ else
324
+ enum.each { |o| n.add(o) if include?(o) }
325
+ end
326
+ else
327
+ do_with_enum(enum) { |o| n.add(o) if include?(o) }
328
+ end
329
+ n
330
+ end
331
+ alias intersection &
332
+
333
+ def ^(enum)
334
+ n = Set.new(enum)
335
+ each { |o| n.add(o) unless n.delete?(o) }
336
+ n
337
+ end
338
+
339
+ def ==(other)
340
+ if self.equal?(other)
341
+ true
342
+ elsif other.instance_of?(self.class)
343
+ @hash == other.instance_variable_get(:@hash)
344
+ elsif other.is_a?(Set) && self.size == other.size
345
+ other.all? { |o| @hash.include?(o) }
346
+ else
347
+ false
348
+ end
349
+ end
350
+
351
+ def hash
352
+ @hash.hash
353
+ end
354
+
355
+ def eql?(o)
356
+ return false unless o.is_a?(Set)
357
+ @hash.eql?(o.instance_variable_get(:@hash))
358
+ end
359
+
360
+ def reset
361
+ if @hash.respond_to?(:rehash)
362
+ @hash.rehash
363
+ else
364
+ raise FrozenError, "can't modify frozen #{self.class.name}" if frozen?
365
+ end
366
+ self
367
+ end
368
+ alias === include?
369
+
370
+ def classify
371
+ block_given? or return enum_for(__method__) { size }
372
+
373
+ h = {}
374
+
375
+ each { |i|
376
+ (h[yield(i)] ||= self.class.new).add(i)
377
+ }
378
+
379
+ h
380
+ end
381
+
382
+ def divide(&func)
383
+ func or return enum_for(__method__) { size }
384
+
385
+ if func.arity == 2
386
+ require 'tsort'
387
+
388
+ class << dig = {}
389
+ include TSort
390
+
391
+ alias tsort_each_node each_key
392
+ def tsort_each_child(node, &block)
393
+ fetch(node).each(&block)
394
+ end
395
+ end
396
+
397
+ each { |u|
398
+ dig[u] = a = []
399
+ each{ |v| func.call(u, v) and a << v }
400
+ }
401
+
402
+ set = Set.new()
403
+ dig.each_strongly_connected_component { |css|
404
+ set.add(self.class.new(css))
405
+ }
406
+ set
407
+ else
408
+ Set.new(classify(&func).values)
409
+ end
410
+ end
411
+
412
+ def join(separator=nil)
413
+ to_a.join(separator)
414
+ end
415
+ end
416
+
417
+ # sorted_set without rbtree dependency, vendored from
418
+ # https://github.com/ruby/set/blob/72f08c4/lib/set.rb#L731-L800
419
+ class SortedSet < Set
420
+ def initialize(*args)
421
+ @keys = nil
422
+ super
423
+ end
424
+
425
+ def clear
426
+ @keys = nil
427
+ super
428
+ end
429
+
430
+ def replace(enum)
431
+ @keys = nil
432
+ super
433
+ end
434
+
435
+ def add(o)
436
+ o.respond_to?(:<=>) or raise ArgumentError, "value must respond to <=>"
437
+ @keys = nil
438
+ super
439
+ end
440
+ alias << add
441
+
442
+ def delete(o)
443
+ @keys = nil
444
+ @hash.delete(o)
445
+ self
446
+ end
447
+
448
+ def delete_if
449
+ block_given? or return enum_for(__method__) { size }
450
+ n = @hash.size
451
+ super
452
+ @keys = nil if @hash.size != n
453
+ self
454
+ end
455
+
456
+ def keep_if
457
+ block_given? or return enum_for(__method__) { size }
458
+ n = @hash.size
459
+ super
460
+ @keys = nil if @hash.size != n
461
+ self
462
+ end
463
+
464
+ def merge(enum)
465
+ @keys = nil
466
+ super
467
+ end
468
+
469
+ def each(&block)
470
+ block or return enum_for(__method__) { size }
471
+ to_a.each(&block)
472
+ self
473
+ end
474
+
475
+ def to_a
476
+ (@keys = @hash.keys).sort! unless @keys
477
+ @keys.dup
478
+ end
479
+
480
+ def freeze
481
+ to_a
482
+ super
483
+ end
484
+
485
+ def rehash
486
+ @keys = nil
487
+ super
488
+ end
489
+ end
490
+ end
491
+ end
492
+ end
@@ -1,10 +1,6 @@
1
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
2
- require 'sorted_set'
3
- else
4
- require 'set'
5
- end
6
1
  require 'character_set/ruby_fallback/set_methods'
7
2
  require 'character_set/ruby_fallback/character_set_methods'
3
+ require 'character_set/ruby_fallback/vendored_set_classes'
8
4
 
9
5
  class CharacterSet
10
6
  module RubyFallback
@@ -16,7 +12,7 @@ class CharacterSet
16
12
  end
17
13
 
18
14
  def initialize(enum = [])
19
- @__set = SortedSet.new
15
+ @__set = CharacterSet::RubyFallback::SortedSet.new
20
16
  super
21
17
  end
22
18
  end
@@ -165,8 +165,8 @@ class CharacterSet
165
165
  end
166
166
 
167
167
  def divide(&func)
168
- CharacterSet.require_optional_dependency('set', __method__)
169
- Set.new(to_a).divide(&func)
168
+ require 'character_set/ruby_fallback/vendored_set_classes'
169
+ CharacterSet::RubyFallback::Set.new(to_a).divide(&func)
170
170
  end
171
171
  RUBY
172
172
 
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.6.0'
2
+ VERSION = '1.7.0'
3
3
  end
@@ -0,0 +1,20 @@
1
+ desc 'Run all IPS benchmarks'
2
+ task :benchmark do
3
+ Dir["#{__dir__}/benchmarks/*.rb"].sort.each { |file| load(file) }
4
+ end
5
+
6
+ namespace :benchmark do
7
+ desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
8
+ task :write_to_file do
9
+ Rake.application[:benchmark].invoke
10
+
11
+ # extract comparison results from reports
12
+ results = $benchmark_results
13
+ .map { |caption, report| "```\n#{caption}\n\n#{report[/(?<=Comparison:).+/m].strip}\n```" }
14
+ .join("\n")
15
+ .gsub(/ \(±[^)]+\) |(?<=same-ish).*/, '') # remove some noise
16
+
17
+ File.write "#{__dir__}/../BENCHMARK.md",
18
+ "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}\n\n#{results}\n"
19
+ end
20
+ end
@@ -0,0 +1,28 @@
1
+ require 'benchmark/ips'
2
+ require_relative '../../lib/character_set'
3
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
4
+ require 'sorted_set'
5
+ else
6
+ require 'set'
7
+ end
8
+
9
+ def benchmark(caption: nil, cases: {})
10
+ with_stdouts($stdout, string_io = StringIO.new) do
11
+ puts caption
12
+ Benchmark.ips do |x|
13
+ cases.each { |label, callable| x.report(label, &callable) }
14
+ x.compare!
15
+ end
16
+ end
17
+ ($benchmark_results ||= {})[caption] = string_io.string
18
+ end
19
+
20
+ def with_stdouts(*ios)
21
+ old_stdout = $stdout
22
+ ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } }
23
+ ios.define_singleton_method(:respond_to?) { |*args| IO.respond_to?(*args) }
24
+ $stdout = ios
25
+ yield
26
+ ensure
27
+ $stdout = old_stdout
28
+ end
@@ -0,0 +1,20 @@
1
+ desc 'Download unicode casefold data and write new C header file'
2
+ task :sync_casefold_data do
3
+ src_path = './CaseFolding.txt'
4
+ dst_path = "#{__dir__}/../ext/character_set/unicode_casefold_table.h"
5
+
6
+ `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
7
+
8
+ mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
9
+ from, type, to = line.split(/\s*;\s*/).first(3)
10
+ # type 'C' stands for 'common', excludes mappings to multiple chars
11
+ hash[from] = to if type == 'C'
12
+ end.sort
13
+
14
+ content = File.read(dst_path + '.tmpl')
15
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
16
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
17
+
18
+ File.write(dst_path, content)
19
+ File.unlink(src_path)
20
+ end
@@ -0,0 +1,9 @@
1
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
2
+ task :sync_predefined_sets do
3
+ %w[assigned emoji whitespace].each do |prop|
4
+ require 'regexp_property_values'
5
+ ranges = RegexpPropertyValues[prop].matched_ranges
6
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
7
+ File.write("#{__dir__}/../lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
8
+ end
9
+ end
@@ -0,0 +1,65 @@
1
+ desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
2
+ task :sync_ruby_spec do
3
+ require 'fileutils'
4
+
5
+ variants = {
6
+ 'CharacterSet' => "#{__dir__}/../spec/ruby-spec/library/character_set",
7
+ 'CharacterSet::Pure' => "#{__dir__}/../spec/ruby-spec/library/character_set_pure",
8
+ }
9
+
10
+ # download fresh specs from ruby/spec repository
11
+ variants.each do |_, dir|
12
+ FileUtils.rm_rf(dir)
13
+ `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
14
+ end
15
+
16
+ # make copies for each CharacterSet variant
17
+ base = variants.first[1]
18
+ variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
19
+
20
+ # adapt specs to work with CharacterSet
21
+ variants.each do |class_name, dir|
22
+ Dir["#{dir}/**/*.rb"].each do |spec|
23
+ # ignore some tests that do not apply or are covered otherwise
24
+ if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
25
+ File.delete(spec)
26
+ next
27
+ end
28
+
29
+ adapted_content =
30
+ File.read(spec).
31
+ # adapt class name
32
+ gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
33
+ gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
34
+ # get shared specs from a single shared dir at the parent level
35
+ gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
36
+ # make 'mspec' syntax rspec-compatible
37
+ gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
38
+ gsub(/be_(false|true)/, 'be \1').
39
+ gsub('stub!', 'stub').
40
+ gsub('mock', 'double').
41
+ gsub('@method', 'method').
42
+ # remove unneeded requires
43
+ gsub(/require 'set'\n/, '').
44
+ gsub(/require.*spec_helper.*\n/, '').
45
+ gsub(/\A\n+/, '').
46
+ # make examples use Integers/codepoints
47
+ gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
48
+ gsub('"one"', '1').
49
+ gsub('"two"', '2').
50
+ gsub('"three"', '3').
51
+ gsub('"four"', '4').
52
+ gsub('"five"', '5').
53
+ gsub(/x.(size|length) == 3/, 'x != 3').
54
+ gsub(/x.(size|length) != 3/, 'x == 3').
55
+ gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
56
+
57
+ File.open(spec, 'w') { |f| f.puts adapted_content }
58
+ end
59
+ end
60
+
61
+ # keep only one copy of the shared specs, at the parent level
62
+ FileUtils.rm_rf(base + '/../shared')
63
+ FileUtils.mv(base + '/shared', base + '/../')
64
+ variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
65
+ end