character_set 1.1.1-java → 1.4.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +102 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,FFFF
@@ -0,0 +1,2 @@
1
+ 2E,5A
2
+ 61,7A
@@ -0,0 +1,151 @@
1
+ 23,23
2
+ 2A,2A
3
+ 30,39
4
+ A9,A9
5
+ AE,AE
6
+ 203C,203C
7
+ 2049,2049
8
+ 2122,2122
9
+ 2139,2139
10
+ 2194,2199
11
+ 21A9,21AA
12
+ 231A,231B
13
+ 2328,2328
14
+ 23CF,23CF
15
+ 23E9,23F3
16
+ 23F8,23FA
17
+ 24C2,24C2
18
+ 25AA,25AB
19
+ 25B6,25B6
20
+ 25C0,25C0
21
+ 25FB,25FE
22
+ 2600,2604
23
+ 260E,260E
24
+ 2611,2611
25
+ 2614,2615
26
+ 2618,2618
27
+ 261D,261D
28
+ 2620,2620
29
+ 2622,2623
30
+ 2626,2626
31
+ 262A,262A
32
+ 262E,262F
33
+ 2638,263A
34
+ 2640,2640
35
+ 2642,2642
36
+ 2648,2653
37
+ 265F,2660
38
+ 2663,2663
39
+ 2665,2666
40
+ 2668,2668
41
+ 267B,267B
42
+ 267E,267F
43
+ 2692,2697
44
+ 2699,2699
45
+ 269B,269C
46
+ 26A0,26A1
47
+ 26AA,26AB
48
+ 26B0,26B1
49
+ 26BD,26BE
50
+ 26C4,26C5
51
+ 26C8,26C8
52
+ 26CE,26CF
53
+ 26D1,26D1
54
+ 26D3,26D4
55
+ 26E9,26EA
56
+ 26F0,26F5
57
+ 26F7,26FA
58
+ 26FD,26FD
59
+ 2702,2702
60
+ 2705,2705
61
+ 2708,270D
62
+ 270F,270F
63
+ 2712,2712
64
+ 2714,2714
65
+ 2716,2716
66
+ 271D,271D
67
+ 2721,2721
68
+ 2728,2728
69
+ 2733,2734
70
+ 2744,2744
71
+ 2747,2747
72
+ 274C,274C
73
+ 274E,274E
74
+ 2753,2755
75
+ 2757,2757
76
+ 2763,2764
77
+ 2795,2797
78
+ 27A1,27A1
79
+ 27B0,27B0
80
+ 27BF,27BF
81
+ 2934,2935
82
+ 2B05,2B07
83
+ 2B1B,2B1C
84
+ 2B50,2B50
85
+ 2B55,2B55
86
+ 3030,3030
87
+ 303D,303D
88
+ 3297,3297
89
+ 3299,3299
90
+ 1F004,1F004
91
+ 1F0CF,1F0CF
92
+ 1F170,1F171
93
+ 1F17E,1F17F
94
+ 1F18E,1F18E
95
+ 1F191,1F19A
96
+ 1F1E6,1F1FF
97
+ 1F201,1F202
98
+ 1F21A,1F21A
99
+ 1F22F,1F22F
100
+ 1F232,1F23A
101
+ 1F250,1F251
102
+ 1F300,1F321
103
+ 1F324,1F393
104
+ 1F396,1F397
105
+ 1F399,1F39B
106
+ 1F39E,1F3F0
107
+ 1F3F3,1F3F5
108
+ 1F3F7,1F4FD
109
+ 1F4FF,1F53D
110
+ 1F549,1F54E
111
+ 1F550,1F567
112
+ 1F56F,1F570
113
+ 1F573,1F57A
114
+ 1F587,1F587
115
+ 1F58A,1F58D
116
+ 1F590,1F590
117
+ 1F595,1F596
118
+ 1F5A4,1F5A5
119
+ 1F5A8,1F5A8
120
+ 1F5B1,1F5B2
121
+ 1F5BC,1F5BC
122
+ 1F5C2,1F5C4
123
+ 1F5D1,1F5D3
124
+ 1F5DC,1F5DE
125
+ 1F5E1,1F5E1
126
+ 1F5E3,1F5E3
127
+ 1F5E8,1F5E8
128
+ 1F5EF,1F5EF
129
+ 1F5F3,1F5F3
130
+ 1F5FA,1F64F
131
+ 1F680,1F6C5
132
+ 1F6CB,1F6D2
133
+ 1F6D5,1F6D5
134
+ 1F6E0,1F6E5
135
+ 1F6E9,1F6E9
136
+ 1F6EB,1F6EC
137
+ 1F6F0,1F6F0
138
+ 1F6F3,1F6FA
139
+ 1F7E0,1F7EB
140
+ 1F90D,1F93A
141
+ 1F93C,1F945
142
+ 1F947,1F971
143
+ 1F973,1F976
144
+ 1F97A,1F9A2
145
+ 1F9A5,1F9AA
146
+ 1F9AE,1F9CA
147
+ 1F9CD,1F9FF
148
+ 1FA70,1FA73
149
+ 1FA78,1FA7A
150
+ 1FA80,1FA82
151
+ 1FA90,1FA95
@@ -0,0 +1,3 @@
1
+ A,D
2
+ 85,85
3
+ 2028,2029
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,10FFFF
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 21,21
2
+ 24,24
3
+ 26,2E
4
+ 30,3B
5
+ 3D,3D
6
+ 41,5B
7
+ 5D,5D
8
+ 5F,5F
9
+ 61,7A
10
+ 7E,7E
@@ -0,0 +1,7 @@
1
+ 21,21
2
+ 24,3A
3
+ 3D,3D
4
+ 40,5A
5
+ 5F,5F
6
+ 61,7A
7
+ 7E,7E
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 9,D
2
+ 20,20
3
+ 85,85
4
+ A0,A0
5
+ 1680,1680
6
+ 2000,200A
7
+ 2028,2029
8
+ 202F,202F
9
+ 205F,205F
10
+ 3000,3000
@@ -1,12 +1,14 @@
1
- require 'set'
1
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
2
+ require 'sorted_set'
3
+ else
4
+ require 'set'
5
+ end
2
6
  require 'character_set/ruby_fallback/set_methods'
3
- require 'character_set/ruby_fallback/plane_methods'
4
7
  require 'character_set/ruby_fallback/character_set_methods'
5
8
 
6
9
  class CharacterSet
7
10
  module RubyFallback
8
11
  include CharacterSet::RubyFallback::SetMethods
9
- include CharacterSet::RubyFallback::PlaneMethods
10
12
  include CharacterSet::RubyFallback::CharacterSetMethods
11
13
 
12
14
  def self.prepended(klass)
@@ -31,7 +31,7 @@ class CharacterSet
31
31
  end
32
32
 
33
33
  def ranges
34
- CharacterSet.require_optional_dependency('range_compressor')
34
+ CharacterSet.require_optional_dependency('range_compressor', __method__)
35
35
  RangeCompressor.compress(self)
36
36
  end
37
37
 
@@ -39,9 +39,8 @@ class CharacterSet
39
39
  count.nil? ? to_a(true).sample : to_a(true).sample(count)
40
40
  end
41
41
 
42
- def used_by?(string)
43
- str!(string).each_codepoint { |cp| return true if include?(cp) }
44
- false
42
+ def count_in(string)
43
+ str!(string).each_codepoint.count { |cp| include?(cp) }
45
44
  end
46
45
 
47
46
  def cover?(string)
@@ -67,16 +66,64 @@ class CharacterSet
67
66
  result.size == string.size ? nil : string.replace(result)
68
67
  end
69
68
 
69
+ def scan(string)
70
+ encoding = str!(string).encoding
71
+ string.each_codepoint.inject([]) do |arr, cp|
72
+ include?(cp) ? arr.push(cp.chr(encoding)) : arr
73
+ end
74
+ end
75
+
76
+ def used_by?(string)
77
+ str!(string).each_codepoint { |cp| return true if include?(cp) }
78
+ false
79
+ end
80
+
81
+ def section(from:, upto: 0x10FFFF)
82
+ dup.keep_if { |cp| cp >= from && cp <= upto }
83
+ end
84
+
85
+ def count_in_section(from:, upto: 0x10FFFF)
86
+ count { |cp| cp >= from && cp <= upto }
87
+ end
88
+
89
+ def section?(from:, upto: 0x10FFFF)
90
+ any? { |cp| cp >= from && cp <= upto }
91
+ end
92
+
93
+ def section_ratio(from:, upto: 0x10FFFF)
94
+ section(from: from, upto: upto).count / count.to_f
95
+ end
96
+
97
+ def planes
98
+ plane_size = 0x10000.to_f
99
+ inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys
100
+ end
101
+
102
+ def plane(num)
103
+ validate_plane_number(num)
104
+ section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
105
+ end
106
+
107
+ def member_in_plane?(num)
108
+ validate_plane_number(num)
109
+ ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
110
+ end
111
+
70
112
  private
71
113
 
114
+ def validate_plane_number(num)
115
+ num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
116
+ end
117
+
72
118
  def str!(obj)
73
119
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
74
120
  obj
75
121
  end
76
122
 
77
123
  def make_new_str(original, &block)
78
- new_string = str!(original).each_codepoint.each_with_object('', &block)
79
- original.tainted? ? new_string.taint : new_string
124
+ str!(original)
125
+ .each_codepoint
126
+ .each_with_object(''.encode(original.encoding), &block)
80
127
  end
81
128
  end
82
129
  end
@@ -1,7 +1,9 @@
1
1
  class CharacterSet
2
2
  module RubyFallback
3
3
  module SetMethods
4
- Enumerable.instance_methods.concat(%w[empty? length size]).each do |mthd|
4
+ (Enumerable.instance_methods -
5
+ %i[include? member? to_a] +
6
+ %i[empty? length size]).each do |mthd|
5
7
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
6
8
  def #{mthd}(*args, &block)
7
9
  @__set.#{mthd}(*args, &block)
@@ -9,7 +11,7 @@ class CharacterSet
9
11
  RUBY
10
12
  end
11
13
 
12
- %w[< <= > >= disjoint? intersect? proper_subset? proper_superset?
14
+ %i[< <= > >= disjoint? intersect? proper_subset? proper_superset?
13
15
  subset? superset?].each do |mthd|
14
16
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
15
17
  def #{mthd}(enum, &block)
@@ -21,8 +23,8 @@ class CharacterSet
21
23
  RUBY
22
24
  end
23
25
 
24
- %w[<< === add add? clear collect! delete delete? delete_if
25
- each filter! hash include? map! member? keep_if reject!
26
+ %i[<< add add? clear collect! delete delete? delete_if
27
+ each filter! map! keep_if reject!
26
28
  select! subtract].each do |mthd|
27
29
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
28
30
  def #{mthd}(*args, &block)
@@ -32,22 +34,22 @@ class CharacterSet
32
34
  RUBY
33
35
  end
34
36
 
35
- %w[& + - ^ | difference intersection union].each do |mthd|
37
+ # revert if https://github.com/knu/sorted_set/issues/2 is resolved
38
+ %i[=== include? member?].each do |mthd|
36
39
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
37
- def #{mthd}(enum, &block)
38
- if enum.respond_to?(:map)
39
- enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
40
- end
41
- self.class.new(@__set.#{mthd}(enum, &block).to_a)
40
+ def #{mthd}(*args, &block)
41
+ !!@__set.#{mthd}(*args, &block)
42
42
  end
43
43
  RUBY
44
44
  end
45
45
 
46
- %w[taint untaint].each do |mthd|
46
+ %i[& + - ^ | difference intersection union].each do |mthd|
47
47
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
48
- def #{mthd}
49
- @__set.#{mthd}
50
- super
48
+ def #{mthd}(enum, &block)
49
+ if enum.respond_to?(:map)
50
+ enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
51
+ end
52
+ self.class.new(@__set.#{mthd}(enum, &block).to_a)
51
53
  end
52
54
  RUBY
53
55
  end
@@ -72,8 +74,8 @@ class CharacterSet
72
74
  true
73
75
  elsif other.instance_of?(self.class)
74
76
  @__set == other.instance_variable_get(:@__set)
75
- elsif other.is_a?(self.class) && size == other.size
76
- other.all? { |cp| @__set.include?(cp) }
77
+ elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
78
+ size == other.size && other.all? { |cp| @__set.include?(cp) }
77
79
  else
78
80
  false
79
81
  end
@@ -81,7 +83,13 @@ class CharacterSet
81
83
 
82
84
  def eql?(other)
83
85
  return false unless other.is_a?(self.class)
84
- @__set.eql?(other.instance_variable_get(:@__set))
86
+ # revert if https://github.com/knu/sorted_set/issues/3 is resolved
87
+ hash == other.hash
88
+ end
89
+
90
+ # revert if https://github.com/knu/sorted_set/issues/3 is resolved
91
+ def hash
92
+ @__set.to_a.hash
85
93
  end
86
94
 
87
95
  def initialize_dup(orig)
@@ -22,14 +22,14 @@ class CharacterSet
22
22
  end
23
23
 
24
24
  def of_property(property_name)
25
- require_optional_dependency('regexp_property_values')
25
+ require_optional_dependency('regexp_property_values', __method__)
26
26
 
27
27
  property = RegexpPropertyValues[property_name.to_s]
28
28
  from_ranges(*property.matched_ranges)
29
29
  end
30
30
 
31
31
  def of_regexp(regexp)
32
- require_optional_dependency('regexp_parser')
32
+ require_optional_dependency('regexp_parser', __method__)
33
33
 
34
34
  root = ::Regexp::Parser.parse(regexp)
35
35
  of_expression(root)
@@ -39,16 +39,12 @@ class CharacterSet
39
39
  ExpressionConverter.convert(expression)
40
40
  end
41
41
 
42
- def require_optional_dependency(name)
42
+ def require_optional_dependency(name, method)
43
43
  required_optional_dependencies[name] ||= begin
44
44
  require name
45
45
  true
46
46
  rescue ::LoadError
47
- entry_point = caller_locations.reverse.find do |loc|
48
- loc.absolute_path.to_s.include?('/lib/character_set')
49
- end
50
- method = entry_point && entry_point.label
51
- raise LoadError, 'You must the install the optional dependency '\
47
+ raise LoadError, 'You must install the optional dependency '\
52
48
  "'\#{name}' to use the method `\#{method}'."
53
49
  end
54
50
  end
@@ -70,37 +66,56 @@ class CharacterSet
70
66
  merge(enum)
71
67
  end
72
68
 
73
- # stringification methods
69
+ # CharacterSet-specific conversion methods
70
+
71
+ def assigned_part
72
+ self & self.class.assigned
73
+ end
74
+
75
+ def valid_part
76
+ self - self.class.surrogate
77
+ end
78
+
79
+ # CharacterSet-specific stringification methods
74
80
 
75
81
  def to_s(opts = {}, &block)
76
82
  Writer.write(ranges, opts, &block)
77
83
  end
78
84
 
85
+ def to_s_with_surrogate_ranges
86
+ Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
87
+ end
88
+
79
89
  def to_s_with_surrogate_alternation
80
90
  Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
81
91
  end
82
92
 
83
93
  def inspect
84
94
  len = length
85
- "#<CharacterSet: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
95
+ "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
86
96
  end
87
97
 
88
- # unicode-plane-related methods
98
+ # C-extension adapter methods. Need overriding in pure fallback.
99
+ # Parsing kwargs in C is slower, verbose, and kinda deprecated.
89
100
 
90
- def bmp_part?
91
- !bmp_part.empty?
101
+ def inversion(include_surrogates: false, upto: 0x10FFFF)
102
+ ext_inversion(include_surrogates, upto)
92
103
  end
93
104
 
94
- def astral_part?
95
- !astral_part.empty?
105
+ def section(from:, upto: 0x10FFFF)
106
+ ext_section(from, upto)
96
107
  end
97
108
 
98
- def bmp_ratio
99
- bmp_part.count / count.to_f
109
+ def count_in_section(from:, upto: 0x10FFFF)
110
+ ext_count_in_section(from, upto)
100
111
  end
101
112
 
102
- def astral_ratio
103
- astral_part.count / count.to_f
113
+ def section?(from:, upto: 0x10FFFF)
114
+ ext_section?(from, upto)
115
+ end
116
+
117
+ def section_ratio(from:, upto: 0x10FFFF)
118
+ ext_section_ratio(from, upto)
104
119
  end
105
120
 
106
121
  #
@@ -136,42 +151,38 @@ class CharacterSet
136
151
  end
137
152
 
138
153
  def divide(&func)
139
- block_given? or return enum_for(__method__) { size }
140
- require 'set'
141
-
142
- if func.arity == 2
143
- require 'tsort'
154
+ CharacterSet.require_optional_dependency('set', __method__)
155
+ Set.new(to_a).divide(&func)
156
+ end
157
+ RUBY
144
158
 
145
- class << dig = {}
146
- include TSort
159
+ # CharacterSet-specific section methods
147
160
 
148
- alias tsort_each_node each_key
149
- def tsort_each_child(node, &block)
150
- fetch(node).each(&block)
151
- end
152
- end
161
+ {
162
+ ascii: 0..0x7F,
163
+ bmp: 0..0xFFFF,
164
+ astral: 0x10000..0x10FFFF,
165
+ }.each do |section_name, range|
166
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
167
+ def #{section_name}_part
168
+ section(from: #{range.begin}, upto: #{range.end})
169
+ end
153
170
 
154
- each do |u|
155
- dig[u] = a = []
156
- each{ |v| a << v if yield(u, v) }
157
- end
171
+ def #{section_name}_part?
172
+ section?(from: #{range.begin}, upto: #{range.end})
173
+ end
158
174
 
159
- set = Set.new
160
- dig.each_strongly_connected_component do |css|
161
- set.add(self.class.new(css))
162
- end
163
- set
164
- else
165
- Set.new(classify(&func).values)
175
+ def #{section_name}_only?
176
+ #{range.begin == 0 ?
177
+ "!section?(from: #{range.end}, upto: 0x10FFFF)" :
178
+ "!section?(from: 0, upto: #{range.begin})"}
166
179
  end
167
- end
168
180
 
169
- # C-extension adapter method. Needs overriding in pure fallback.
170
- # Parsing kwargs in C is slower, verbose, and kinda deprecated.
171
- def inversion(include_surrogates: false, upto: 0x10FFFF)
172
- ext_inversion(include_surrogates, upto)
173
- end
174
- RUBY
181
+ def #{section_name}_ratio
182
+ section_ratio(from: #{range.begin}, upto: #{range.end})
183
+ end
184
+ RUBY
185
+ end
175
186
  end # self.included
176
187
  end # SharedMethods
177
188
  end