character_set 1.6.0-java → 1.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/BENCHMARK.md +32 -32
  3. data/CHANGELOG.md +15 -1
  4. data/README.md +1 -1
  5. data/Rakefile +2 -123
  6. data/character_set.gemspec +0 -7
  7. data/ext/character_set/character_set.c +64 -43
  8. data/lib/character_set/parser.rb +8 -4
  9. data/lib/character_set/predefined_sets/assigned.cps +73 -52
  10. data/lib/character_set/predefined_sets/emoji.cps +10 -9
  11. data/lib/character_set/ruby_fallback/character_set_methods.rb +15 -14
  12. data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
  13. data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
  14. data/lib/character_set/ruby_fallback.rb +2 -6
  15. data/lib/character_set/shared_methods.rb +2 -2
  16. data/lib/character_set/version.rb +1 -1
  17. data/tasks/benchmark.rake +20 -0
  18. data/tasks/benchmarks/shared.rb +28 -0
  19. data/tasks/sync_casefold_data.rake +20 -0
  20. data/tasks/sync_predefined_sets.rake +9 -0
  21. data/tasks/sync_ruby_spec.rake +65 -0
  22. metadata +19 -28
  23. data/benchmarks/shared.rb +0 -30
  24. /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
  25. /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
  26. /data/{benchmarks → tasks/benchmarks}/delete_in.rb +0 -0
  27. /data/{benchmarks → tasks/benchmarks}/keep_in.rb +0 -0
  28. /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
  29. /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
  30. /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
  31. /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
  32. /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
  33. /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
@@ -0,0 +1,492 @@
1
+ # set and sorted_set are vendored due to various dependency issues:
2
+ #
3
+ # - issues with default vs. installed gems such as [#2]
4
+ # - issues with the sorted_set dependency rbtree
5
+ # - long-standing issues in recent versions of sorted_set
6
+ #
7
+ # The RubyFallback (and thus these set classes) is only used for testing,
8
+ # and for exotic rubies which use neither C nor Java.
9
+
10
+ class CharacterSet
11
+ module RubyFallback
12
+ if RUBY_PLATFORM[/java/i]
13
+ # Vendoring is not needed for JRuby which has sorted_set in the stdlib.
14
+ require 'set'
15
+
16
+ Set = ::Set
17
+ SortedSet = ::SortedSet
18
+ else
19
+ # set, vendored from https://github.com/ruby/set/blob/master/lib/set.rb,
20
+ # with comments removed and linted.
21
+ class Set
22
+ include Enumerable
23
+
24
+ def self.[](*ary)
25
+ new(ary)
26
+ end
27
+
28
+ def initialize(enum = nil, &block)
29
+ @hash = Hash.new(false)
30
+
31
+ enum.nil? and return
32
+
33
+ if block
34
+ do_with_enum(enum) { |o| add(block[o]) }
35
+ else
36
+ merge(enum)
37
+ end
38
+ end
39
+
40
+ def compare_by_identity
41
+ if @hash.respond_to?(:compare_by_identity)
42
+ @hash.compare_by_identity
43
+ self
44
+ else
45
+ raise NotImplementedError, "#{self.class.name}\##{__method__} is not implemented"
46
+ end
47
+ end
48
+
49
+ def compare_by_identity?
50
+ @hash.respond_to?(:compare_by_identity?) && @hash.compare_by_identity?
51
+ end
52
+
53
+ def do_with_enum(enum, &block)
54
+ if enum.respond_to?(:each_entry)
55
+ enum.each_entry(&block) if block
56
+ elsif enum.respond_to?(:each)
57
+ enum.each(&block) if block
58
+ else
59
+ raise ArgumentError, "value must be enumerable"
60
+ end
61
+ end
62
+ private :do_with_enum
63
+
64
+ def initialize_dup(orig)
65
+ super
66
+ @hash = orig.instance_variable_get(:@hash).dup
67
+ end
68
+
69
+ if Kernel.instance_method(:initialize_clone).arity != 1
70
+ def initialize_clone(orig, **options)
71
+ super
72
+ @hash = orig.instance_variable_get(:@hash).clone(**options)
73
+ end
74
+ else
75
+ def initialize_clone(orig)
76
+ super
77
+ @hash = orig.instance_variable_get(:@hash).clone
78
+ end
79
+ end
80
+
81
+ def freeze
82
+ @hash.freeze
83
+ super
84
+ end
85
+
86
+ def size
87
+ @hash.size
88
+ end
89
+ alias length size
90
+
91
+ def empty?
92
+ @hash.empty?
93
+ end
94
+
95
+ def clear
96
+ @hash.clear
97
+ self
98
+ end
99
+
100
+ def replace(enum)
101
+ if enum.instance_of?(self.class)
102
+ @hash.replace(enum.instance_variable_get(:@hash))
103
+ self
104
+ else
105
+ do_with_enum(enum)
106
+ clear
107
+ merge(enum)
108
+ end
109
+ end
110
+
111
+ def to_a
112
+ @hash.keys
113
+ end
114
+
115
+ def to_set(klass = Set, *args, &block)
116
+ return self if instance_of?(Set) && klass == Set && block.nil? && args.empty?
117
+ klass.new(self, *args, &block)
118
+ end
119
+
120
+ def flatten_merge(set, seen = Set.new)
121
+ set.each { |e|
122
+ if e.is_a?(Set)
123
+ if seen.include?(e_id = e.object_id)
124
+ raise ArgumentError, "tried to flatten recursive Set"
125
+ end
126
+
127
+ seen.add(e_id)
128
+ flatten_merge(e, seen)
129
+ seen.delete(e_id)
130
+ else
131
+ add(e)
132
+ end
133
+ }
134
+
135
+ self
136
+ end
137
+ protected :flatten_merge
138
+
139
+ def flatten
140
+ self.class.new.flatten_merge(self)
141
+ end
142
+
143
+ def flatten!
144
+ replace(flatten()) if any? { |e| e.is_a?(Set) }
145
+ end
146
+
147
+ def include?(o)
148
+ @hash[o]
149
+ end
150
+ alias member? include?
151
+
152
+ def superset?(set)
153
+ case
154
+ when set.instance_of?(self.class) && @hash.respond_to?(:>=)
155
+ @hash >= set.instance_variable_get(:@hash)
156
+ when set.is_a?(Set)
157
+ size >= set.size && set.all? { |o| include?(o) }
158
+ else
159
+ raise ArgumentError, "value must be a set"
160
+ end
161
+ end
162
+ alias >= superset?
163
+
164
+ def proper_superset?(set)
165
+ case
166
+ when set.instance_of?(self.class) && @hash.respond_to?(:>)
167
+ @hash > set.instance_variable_get(:@hash)
168
+ when set.is_a?(Set)
169
+ size > set.size && set.all? { |o| include?(o) }
170
+ else
171
+ raise ArgumentError, "value must be a set"
172
+ end
173
+ end
174
+ alias > proper_superset?
175
+
176
+ def subset?(set)
177
+ case
178
+ when set.instance_of?(self.class) && @hash.respond_to?(:<=)
179
+ @hash <= set.instance_variable_get(:@hash)
180
+ when set.is_a?(Set)
181
+ size <= set.size && all? { |o| set.include?(o) }
182
+ else
183
+ raise ArgumentError, "value must be a set"
184
+ end
185
+ end
186
+ alias <= subset?
187
+
188
+ def proper_subset?(set)
189
+ case
190
+ when set.instance_of?(self.class) && @hash.respond_to?(:<)
191
+ @hash < set.instance_variable_get(:@hash)
192
+ when set.is_a?(Set)
193
+ size < set.size && all? { |o| set.include?(o) }
194
+ else
195
+ raise ArgumentError, "value must be a set"
196
+ end
197
+ end
198
+ alias < proper_subset?
199
+
200
+ def <=>(set)
201
+ return unless set.is_a?(Set)
202
+
203
+ case size <=> set.size
204
+ when -1 then -1 if proper_subset?(set)
205
+ when +1 then +1 if proper_superset?(set)
206
+ else 0 if self.==(set)
207
+ end
208
+ end
209
+
210
+ def intersect?(set)
211
+ case set
212
+ when Set
213
+ if size < set.size
214
+ any? { |o| set.include?(o) }
215
+ else
216
+ set.any? { |o| include?(o) }
217
+ end
218
+ when Enumerable
219
+ set.any? { |o| include?(o) }
220
+ else
221
+ raise ArgumentError, "value must be enumerable"
222
+ end
223
+ end
224
+
225
+ def disjoint?(set)
226
+ !intersect?(set)
227
+ end
228
+
229
+ def each(&block)
230
+ block_given? or return enum_for(__method__) { size }
231
+ @hash.each_key(&block)
232
+ self
233
+ end
234
+
235
+ def add(o)
236
+ @hash[o] = true
237
+ self
238
+ end
239
+ alias << add
240
+
241
+ def add?(o)
242
+ add(o) unless include?(o)
243
+ end
244
+
245
+ def delete(o)
246
+ @hash.delete(o)
247
+ self
248
+ end
249
+
250
+ def delete?(o)
251
+ delete(o) if include?(o)
252
+ end
253
+
254
+ def delete_if
255
+ block_given? or return enum_for(__method__) { size }
256
+ select { |o| yield o }.each { |o| @hash.delete(o) }
257
+ self
258
+ end
259
+
260
+ def keep_if
261
+ block_given? or return enum_for(__method__) { size }
262
+ reject { |o| yield o }.each { |o| @hash.delete(o) }
263
+ self
264
+ end
265
+
266
+ def collect!
267
+ block_given? or return enum_for(__method__) { size }
268
+ set = self.class.new
269
+ each { |o| set << yield(o) }
270
+ replace(set)
271
+ end
272
+ alias map! collect!
273
+
274
+ def reject!(&block)
275
+ block_given? or return enum_for(__method__) { size }
276
+ n = size
277
+ delete_if(&block)
278
+ self if size != n
279
+ end
280
+
281
+ def select!(&block)
282
+ block_given? or return enum_for(__method__) { size }
283
+ n = size
284
+ keep_if(&block)
285
+ self if size != n
286
+ end
287
+
288
+ alias filter! select!
289
+
290
+ def merge(*enums, **_rest)
291
+ enums.each do |enum|
292
+ if enum.instance_of?(self.class)
293
+ @hash.update(enum.instance_variable_get(:@hash))
294
+ else
295
+ do_with_enum(enum) { |o| add(o) }
296
+ end
297
+ end
298
+
299
+ self
300
+ end
301
+
302
+ def subtract(enum)
303
+ do_with_enum(enum) { |o| delete(o) }
304
+ self
305
+ end
306
+
307
+ def |(enum)
308
+ dup.merge(enum)
309
+ end
310
+ alias + |
311
+ alias union |
312
+
313
+ def -(enum)
314
+ dup.subtract(enum)
315
+ end
316
+ alias difference -
317
+
318
+ def &(enum)
319
+ n = self.class.new
320
+ if enum.is_a?(Set)
321
+ if enum.size > size
322
+ each { |o| n.add(o) if enum.include?(o) }
323
+ else
324
+ enum.each { |o| n.add(o) if include?(o) }
325
+ end
326
+ else
327
+ do_with_enum(enum) { |o| n.add(o) if include?(o) }
328
+ end
329
+ n
330
+ end
331
+ alias intersection &
332
+
333
+ def ^(enum)
334
+ n = Set.new(enum)
335
+ each { |o| n.add(o) unless n.delete?(o) }
336
+ n
337
+ end
338
+
339
+ def ==(other)
340
+ if self.equal?(other)
341
+ true
342
+ elsif other.instance_of?(self.class)
343
+ @hash == other.instance_variable_get(:@hash)
344
+ elsif other.is_a?(Set) && self.size == other.size
345
+ other.all? { |o| @hash.include?(o) }
346
+ else
347
+ false
348
+ end
349
+ end
350
+
351
+ def hash
352
+ @hash.hash
353
+ end
354
+
355
+ def eql?(o)
356
+ return false unless o.is_a?(Set)
357
+ @hash.eql?(o.instance_variable_get(:@hash))
358
+ end
359
+
360
+ def reset
361
+ if @hash.respond_to?(:rehash)
362
+ @hash.rehash
363
+ else
364
+ raise FrozenError, "can't modify frozen #{self.class.name}" if frozen?
365
+ end
366
+ self
367
+ end
368
+ alias === include?
369
+
370
+ def classify
371
+ block_given? or return enum_for(__method__) { size }
372
+
373
+ h = {}
374
+
375
+ each { |i|
376
+ (h[yield(i)] ||= self.class.new).add(i)
377
+ }
378
+
379
+ h
380
+ end
381
+
382
+ def divide(&func)
383
+ func or return enum_for(__method__) { size }
384
+
385
+ if func.arity == 2
386
+ require 'tsort'
387
+
388
+ class << dig = {}
389
+ include TSort
390
+
391
+ alias tsort_each_node each_key
392
+ def tsort_each_child(node, &block)
393
+ fetch(node).each(&block)
394
+ end
395
+ end
396
+
397
+ each { |u|
398
+ dig[u] = a = []
399
+ each{ |v| func.call(u, v) and a << v }
400
+ }
401
+
402
+ set = Set.new()
403
+ dig.each_strongly_connected_component { |css|
404
+ set.add(self.class.new(css))
405
+ }
406
+ set
407
+ else
408
+ Set.new(classify(&func).values)
409
+ end
410
+ end
411
+
412
+ def join(separator=nil)
413
+ to_a.join(separator)
414
+ end
415
+ end
416
+
417
+ # sorted_set without rbtree dependency, vendored from
418
+ # https://github.com/ruby/set/blob/72f08c4/lib/set.rb#L731-L800
419
+ class SortedSet < Set
420
+ def initialize(*args)
421
+ @keys = nil
422
+ super
423
+ end
424
+
425
+ def clear
426
+ @keys = nil
427
+ super
428
+ end
429
+
430
+ def replace(enum)
431
+ @keys = nil
432
+ super
433
+ end
434
+
435
+ def add(o)
436
+ o.respond_to?(:<=>) or raise ArgumentError, "value must respond to <=>"
437
+ @keys = nil
438
+ super
439
+ end
440
+ alias << add
441
+
442
+ def delete(o)
443
+ @keys = nil
444
+ @hash.delete(o)
445
+ self
446
+ end
447
+
448
+ def delete_if
449
+ block_given? or return enum_for(__method__) { size }
450
+ n = @hash.size
451
+ super
452
+ @keys = nil if @hash.size != n
453
+ self
454
+ end
455
+
456
+ def keep_if
457
+ block_given? or return enum_for(__method__) { size }
458
+ n = @hash.size
459
+ super
460
+ @keys = nil if @hash.size != n
461
+ self
462
+ end
463
+
464
+ def merge(enum)
465
+ @keys = nil
466
+ super
467
+ end
468
+
469
+ def each(&block)
470
+ block or return enum_for(__method__) { size }
471
+ to_a.each(&block)
472
+ self
473
+ end
474
+
475
+ def to_a
476
+ (@keys = @hash.keys).sort! unless @keys
477
+ @keys.dup
478
+ end
479
+
480
+ def freeze
481
+ to_a
482
+ super
483
+ end
484
+
485
+ def rehash
486
+ @keys = nil
487
+ super
488
+ end
489
+ end
490
+ end
491
+ end
492
+ end
@@ -1,10 +1,6 @@
1
- if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
2
- require 'sorted_set'
3
- else
4
- require 'set'
5
- end
6
1
  require 'character_set/ruby_fallback/set_methods'
7
2
  require 'character_set/ruby_fallback/character_set_methods'
3
+ require 'character_set/ruby_fallback/vendored_set_classes'
8
4
 
9
5
  class CharacterSet
10
6
  module RubyFallback
@@ -16,7 +12,7 @@ class CharacterSet
16
12
  end
17
13
 
18
14
  def initialize(enum = [])
19
- @__set = SortedSet.new
15
+ @__set = CharacterSet::RubyFallback::SortedSet.new
20
16
  super
21
17
  end
22
18
  end
@@ -165,8 +165,8 @@ class CharacterSet
165
165
  end
166
166
 
167
167
  def divide(&func)
168
- CharacterSet.require_optional_dependency('set', __method__)
169
- Set.new(to_a).divide(&func)
168
+ require 'character_set/ruby_fallback/vendored_set_classes'
169
+ CharacterSet::RubyFallback::Set.new(to_a).divide(&func)
170
170
  end
171
171
  RUBY
172
172
 
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.6.0'
2
+ VERSION = '1.7.0'
3
3
  end
@@ -0,0 +1,20 @@
1
+ desc 'Run all IPS benchmarks'
2
+ task :benchmark do
3
+ Dir["#{__dir__}/benchmarks/*.rb"].sort.each { |file| load(file) }
4
+ end
5
+
6
+ namespace :benchmark do
7
+ desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
8
+ task :write_to_file do
9
+ Rake.application[:benchmark].invoke
10
+
11
+ # extract comparison results from reports
12
+ results = $benchmark_results
13
+ .map { |caption, report| "```\n#{caption}\n\n#{report[/(?<=Comparison:).+/m].strip}\n```" }
14
+ .join("\n")
15
+ .gsub(/ \(±[^)]+\) |(?<=same-ish).*/, '') # remove some noise
16
+
17
+ File.write "#{__dir__}/../BENCHMARK.md",
18
+ "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}\n\n#{results}\n"
19
+ end
20
+ end
@@ -0,0 +1,28 @@
1
+ require 'benchmark/ips'
2
+ require_relative '../../lib/character_set'
3
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
4
+ require 'sorted_set'
5
+ else
6
+ require 'set'
7
+ end
8
+
9
+ def benchmark(caption: nil, cases: {})
10
+ with_stdouts($stdout, string_io = StringIO.new) do
11
+ puts caption
12
+ Benchmark.ips do |x|
13
+ cases.each { |label, callable| x.report(label, &callable) }
14
+ x.compare!
15
+ end
16
+ end
17
+ ($benchmark_results ||= {})[caption] = string_io.string
18
+ end
19
+
20
+ def with_stdouts(*ios)
21
+ old_stdout = $stdout
22
+ ios.define_singleton_method(:method_missing) { |*args| each { |io| io.send(*args) } }
23
+ ios.define_singleton_method(:respond_to?) { |*args| IO.respond_to?(*args) }
24
+ $stdout = ios
25
+ yield
26
+ ensure
27
+ $stdout = old_stdout
28
+ end
@@ -0,0 +1,20 @@
1
+ desc 'Download unicode casefold data and write new C header file'
2
+ task :sync_casefold_data do
3
+ src_path = './CaseFolding.txt'
4
+ dst_path = "#{__dir__}/../ext/character_set/unicode_casefold_table.h"
5
+
6
+ `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
7
+
8
+ mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
9
+ from, type, to = line.split(/\s*;\s*/).first(3)
10
+ # type 'C' stands for 'common', excludes mappings to multiple chars
11
+ hash[from] = to if type == 'C'
12
+ end.sort
13
+
14
+ content = File.read(dst_path + '.tmpl')
15
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
16
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
17
+
18
+ File.write(dst_path, content)
19
+ File.unlink(src_path)
20
+ end
@@ -0,0 +1,9 @@
1
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
2
+ task :sync_predefined_sets do
3
+ %w[assigned emoji whitespace].each do |prop|
4
+ require 'regexp_property_values'
5
+ ranges = RegexpPropertyValues[prop].matched_ranges
6
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
7
+ File.write("#{__dir__}/../lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
8
+ end
9
+ end
@@ -0,0 +1,65 @@
1
+ desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
2
+ task :sync_ruby_spec do
3
+ require 'fileutils'
4
+
5
+ variants = {
6
+ 'CharacterSet' => "#{__dir__}/../spec/ruby-spec/library/character_set",
7
+ 'CharacterSet::Pure' => "#{__dir__}/../spec/ruby-spec/library/character_set_pure",
8
+ }
9
+
10
+ # download fresh specs from ruby/spec repository
11
+ variants.each do |_, dir|
12
+ FileUtils.rm_rf(dir)
13
+ `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
14
+ end
15
+
16
+ # make copies for each CharacterSet variant
17
+ base = variants.first[1]
18
+ variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
19
+
20
+ # adapt specs to work with CharacterSet
21
+ variants.each do |class_name, dir|
22
+ Dir["#{dir}/**/*.rb"].each do |spec|
23
+ # ignore some tests that do not apply or are covered otherwise
24
+ if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
25
+ File.delete(spec)
26
+ next
27
+ end
28
+
29
+ adapted_content =
30
+ File.read(spec).
31
+ # adapt class name
32
+ gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
33
+ gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
34
+ # get shared specs from a single shared dir at the parent level
35
+ gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
36
+ # make 'mspec' syntax rspec-compatible
37
+ gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
38
+ gsub(/be_(false|true)/, 'be \1').
39
+ gsub('stub!', 'stub').
40
+ gsub('mock', 'double').
41
+ gsub('@method', 'method').
42
+ # remove unneeded requires
43
+ gsub(/require 'set'\n/, '').
44
+ gsub(/require.*spec_helper.*\n/, '').
45
+ gsub(/\A\n+/, '').
46
+ # make examples use Integers/codepoints
47
+ gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
48
+ gsub('"one"', '1').
49
+ gsub('"two"', '2').
50
+ gsub('"three"', '3').
51
+ gsub('"four"', '4').
52
+ gsub('"five"', '5').
53
+ gsub(/x.(size|length) == 3/, 'x != 3').
54
+ gsub(/x.(size|length) != 3/, 'x == 3').
55
+ gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
56
+
57
+ File.open(spec, 'w') { |f| f.puts adapted_content }
58
+ end
59
+ end
60
+
61
+ # keep only one copy of the shared specs, at the parent level
62
+ FileUtils.rm_rf(base + '/../shared')
63
+ FileUtils.mv(base + '/shared', base + '/../')
64
+ variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
65
+ end