character_set 1.1.1 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,FFFF
@@ -0,0 +1,2 @@
1
+ 2E,5A
2
+ 61,7A
@@ -0,0 +1,151 @@
1
+ 23,23
2
+ 2A,2A
3
+ 30,39
4
+ A9,A9
5
+ AE,AE
6
+ 203C,203C
7
+ 2049,2049
8
+ 2122,2122
9
+ 2139,2139
10
+ 2194,2199
11
+ 21A9,21AA
12
+ 231A,231B
13
+ 2328,2328
14
+ 23CF,23CF
15
+ 23E9,23F3
16
+ 23F8,23FA
17
+ 24C2,24C2
18
+ 25AA,25AB
19
+ 25B6,25B6
20
+ 25C0,25C0
21
+ 25FB,25FE
22
+ 2600,2604
23
+ 260E,260E
24
+ 2611,2611
25
+ 2614,2615
26
+ 2618,2618
27
+ 261D,261D
28
+ 2620,2620
29
+ 2622,2623
30
+ 2626,2626
31
+ 262A,262A
32
+ 262E,262F
33
+ 2638,263A
34
+ 2640,2640
35
+ 2642,2642
36
+ 2648,2653
37
+ 265F,2660
38
+ 2663,2663
39
+ 2665,2666
40
+ 2668,2668
41
+ 267B,267B
42
+ 267E,267F
43
+ 2692,2697
44
+ 2699,2699
45
+ 269B,269C
46
+ 26A0,26A1
47
+ 26AA,26AB
48
+ 26B0,26B1
49
+ 26BD,26BE
50
+ 26C4,26C5
51
+ 26C8,26C8
52
+ 26CE,26CF
53
+ 26D1,26D1
54
+ 26D3,26D4
55
+ 26E9,26EA
56
+ 26F0,26F5
57
+ 26F7,26FA
58
+ 26FD,26FD
59
+ 2702,2702
60
+ 2705,2705
61
+ 2708,270D
62
+ 270F,270F
63
+ 2712,2712
64
+ 2714,2714
65
+ 2716,2716
66
+ 271D,271D
67
+ 2721,2721
68
+ 2728,2728
69
+ 2733,2734
70
+ 2744,2744
71
+ 2747,2747
72
+ 274C,274C
73
+ 274E,274E
74
+ 2753,2755
75
+ 2757,2757
76
+ 2763,2764
77
+ 2795,2797
78
+ 27A1,27A1
79
+ 27B0,27B0
80
+ 27BF,27BF
81
+ 2934,2935
82
+ 2B05,2B07
83
+ 2B1B,2B1C
84
+ 2B50,2B50
85
+ 2B55,2B55
86
+ 3030,3030
87
+ 303D,303D
88
+ 3297,3297
89
+ 3299,3299
90
+ 1F004,1F004
91
+ 1F0CF,1F0CF
92
+ 1F170,1F171
93
+ 1F17E,1F17F
94
+ 1F18E,1F18E
95
+ 1F191,1F19A
96
+ 1F1E6,1F1FF
97
+ 1F201,1F202
98
+ 1F21A,1F21A
99
+ 1F22F,1F22F
100
+ 1F232,1F23A
101
+ 1F250,1F251
102
+ 1F300,1F321
103
+ 1F324,1F393
104
+ 1F396,1F397
105
+ 1F399,1F39B
106
+ 1F39E,1F3F0
107
+ 1F3F3,1F3F5
108
+ 1F3F7,1F4FD
109
+ 1F4FF,1F53D
110
+ 1F549,1F54E
111
+ 1F550,1F567
112
+ 1F56F,1F570
113
+ 1F573,1F57A
114
+ 1F587,1F587
115
+ 1F58A,1F58D
116
+ 1F590,1F590
117
+ 1F595,1F596
118
+ 1F5A4,1F5A5
119
+ 1F5A8,1F5A8
120
+ 1F5B1,1F5B2
121
+ 1F5BC,1F5BC
122
+ 1F5C2,1F5C4
123
+ 1F5D1,1F5D3
124
+ 1F5DC,1F5DE
125
+ 1F5E1,1F5E1
126
+ 1F5E3,1F5E3
127
+ 1F5E8,1F5E8
128
+ 1F5EF,1F5EF
129
+ 1F5F3,1F5F3
130
+ 1F5FA,1F64F
131
+ 1F680,1F6C5
132
+ 1F6CB,1F6D2
133
+ 1F6D5,1F6D5
134
+ 1F6E0,1F6E5
135
+ 1F6E9,1F6E9
136
+ 1F6EB,1F6EC
137
+ 1F6F0,1F6F0
138
+ 1F6F3,1F6FA
139
+ 1F7E0,1F7EB
140
+ 1F90D,1F93A
141
+ 1F93C,1F945
142
+ 1F947,1F971
143
+ 1F973,1F976
144
+ 1F97A,1F9A2
145
+ 1F9A5,1F9AA
146
+ 1F9AE,1F9CA
147
+ 1F9CD,1F9FF
148
+ 1FA70,1FA73
149
+ 1FA78,1FA7A
150
+ 1FA80,1FA82
151
+ 1FA90,1FA95
@@ -0,0 +1,3 @@
1
+ A,D
2
+ 85,85
3
+ 2028,2029
@@ -0,0 +1,2 @@
1
+ 0,D7FF
2
+ E000,10FFFF
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 21,21
2
+ 24,24
3
+ 26,2E
4
+ 30,3B
5
+ 3D,3D
6
+ 41,5B
7
+ 5D,5D
8
+ 5F,5F
9
+ 61,7A
10
+ 7E,7E
@@ -0,0 +1,7 @@
1
+ 21,21
2
+ 24,3A
3
+ 3D,3D
4
+ 40,5A
5
+ 5F,5F
6
+ 61,7A
7
+ 7E,7E
@@ -0,0 +1,8 @@
1
+ 21,21
2
+ 24,24
3
+ 26,3B
4
+ 3D,3D
5
+ 3F,5A
6
+ 5F,5F
7
+ 61,7A
8
+ 7E,7E
@@ -0,0 +1,10 @@
1
+ 9,D
2
+ 20,20
3
+ 85,85
4
+ A0,A0
5
+ 1680,1680
6
+ 2000,200A
7
+ 2028,2029
8
+ 202F,202F
9
+ 205F,205F
10
+ 3000,3000
@@ -1,12 +1,14 @@
1
- require 'set'
1
+ if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
2
+ require 'sorted_set'
3
+ else
4
+ require 'set'
5
+ end
2
6
  require 'character_set/ruby_fallback/set_methods'
3
- require 'character_set/ruby_fallback/plane_methods'
4
7
  require 'character_set/ruby_fallback/character_set_methods'
5
8
 
6
9
  class CharacterSet
7
10
  module RubyFallback
8
11
  include CharacterSet::RubyFallback::SetMethods
9
- include CharacterSet::RubyFallback::PlaneMethods
10
12
  include CharacterSet::RubyFallback::CharacterSetMethods
11
13
 
12
14
  def self.prepended(klass)
@@ -31,7 +31,7 @@ class CharacterSet
31
31
  end
32
32
 
33
33
  def ranges
34
- CharacterSet.require_optional_dependency('range_compressor')
34
+ CharacterSet.require_optional_dependency('range_compressor', __method__)
35
35
  RangeCompressor.compress(self)
36
36
  end
37
37
 
@@ -39,9 +39,8 @@ class CharacterSet
39
39
  count.nil? ? to_a(true).sample : to_a(true).sample(count)
40
40
  end
41
41
 
42
- def used_by?(string)
43
- str!(string).each_codepoint { |cp| return true if include?(cp) }
44
- false
42
+ def count_in(string)
43
+ str!(string).each_codepoint.count { |cp| include?(cp) }
45
44
  end
46
45
 
47
46
  def cover?(string)
@@ -67,16 +66,64 @@ class CharacterSet
67
66
  result.size == string.size ? nil : string.replace(result)
68
67
  end
69
68
 
69
+ def scan(string)
70
+ encoding = str!(string).encoding
71
+ string.each_codepoint.inject([]) do |arr, cp|
72
+ include?(cp) ? arr.push(cp.chr(encoding)) : arr
73
+ end
74
+ end
75
+
76
+ def used_by?(string)
77
+ str!(string).each_codepoint { |cp| return true if include?(cp) }
78
+ false
79
+ end
80
+
81
+ def section(from:, upto: 0x10FFFF)
82
+ dup.keep_if { |cp| cp >= from && cp <= upto }
83
+ end
84
+
85
+ def count_in_section(from:, upto: 0x10FFFF)
86
+ count { |cp| cp >= from && cp <= upto }
87
+ end
88
+
89
+ def section?(from:, upto: 0x10FFFF)
90
+ any? { |cp| cp >= from && cp <= upto }
91
+ end
92
+
93
+ def section_ratio(from:, upto: 0x10FFFF)
94
+ section(from: from, upto: upto).count / count.to_f
95
+ end
96
+
97
+ def planes
98
+ plane_size = 0x10000.to_f
99
+ inject({}) { |hash, cp| hash.merge((cp / plane_size).floor => 1) }.keys
100
+ end
101
+
102
+ def plane(num)
103
+ validate_plane_number(num)
104
+ section(from: (num * 0x10000), upto: ((num + 1) * 0x10000) - 1)
105
+ end
106
+
107
+ def member_in_plane?(num)
108
+ validate_plane_number(num)
109
+ ((num * 0x10000)...((num + 1) * 0x10000)).any? { |cp| include?(cp) }
110
+ end
111
+
70
112
  private
71
113
 
114
+ def validate_plane_number(num)
115
+ num >= 0 && num <= 16 or raise ArgumentError, 'plane must be between 0 and 16'
116
+ end
117
+
72
118
  def str!(obj)
73
119
  raise ArgumentError, 'pass a String' unless obj.respond_to?(:codepoints)
74
120
  obj
75
121
  end
76
122
 
77
123
  def make_new_str(original, &block)
78
- new_string = str!(original).each_codepoint.each_with_object('', &block)
79
- original.tainted? ? new_string.taint : new_string
124
+ str!(original)
125
+ .each_codepoint
126
+ .each_with_object(''.encode(original.encoding), &block)
80
127
  end
81
128
  end
82
129
  end
@@ -1,7 +1,9 @@
1
1
  class CharacterSet
2
2
  module RubyFallback
3
3
  module SetMethods
4
- Enumerable.instance_methods.concat(%w[empty? length size]).each do |mthd|
4
+ (Enumerable.instance_methods -
5
+ %i[include? member? to_a] +
6
+ %i[empty? length size]).each do |mthd|
5
7
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
6
8
  def #{mthd}(*args, &block)
7
9
  @__set.#{mthd}(*args, &block)
@@ -9,7 +11,7 @@ class CharacterSet
9
11
  RUBY
10
12
  end
11
13
 
12
- %w[< <= > >= disjoint? intersect? proper_subset? proper_superset?
14
+ %i[< <= > >= disjoint? intersect? proper_subset? proper_superset?
13
15
  subset? superset?].each do |mthd|
14
16
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
15
17
  def #{mthd}(enum, &block)
@@ -21,8 +23,8 @@ class CharacterSet
21
23
  RUBY
22
24
  end
23
25
 
24
- %w[<< === add add? clear collect! delete delete? delete_if
25
- each filter! hash include? map! member? keep_if reject!
26
+ %i[<< add add? clear collect! delete delete? delete_if
27
+ each filter! map! keep_if reject!
26
28
  select! subtract].each do |mthd|
27
29
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
28
30
  def #{mthd}(*args, &block)
@@ -32,22 +34,22 @@ class CharacterSet
32
34
  RUBY
33
35
  end
34
36
 
35
- %w[& + - ^ | difference intersection union].each do |mthd|
37
+ # revert if https://github.com/knu/sorted_set/issues/2 is resolved
38
+ %i[=== include? member?].each do |mthd|
36
39
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
37
- def #{mthd}(enum, &block)
38
- if enum.respond_to?(:map)
39
- enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
40
- end
41
- self.class.new(@__set.#{mthd}(enum, &block).to_a)
40
+ def #{mthd}(*args, &block)
41
+ !!@__set.#{mthd}(*args, &block)
42
42
  end
43
43
  RUBY
44
44
  end
45
45
 
46
- %w[taint untaint].each do |mthd|
46
+ %i[& + - ^ | difference intersection union].each do |mthd|
47
47
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
48
- def #{mthd}
49
- @__set.#{mthd}
50
- super
48
+ def #{mthd}(enum, &block)
49
+ if enum.respond_to?(:map)
50
+ enum = enum.map { |el| el.is_a?(String) ? el.ord : el }
51
+ end
52
+ self.class.new(@__set.#{mthd}(enum, &block).to_a)
51
53
  end
52
54
  RUBY
53
55
  end
@@ -72,8 +74,8 @@ class CharacterSet
72
74
  true
73
75
  elsif other.instance_of?(self.class)
74
76
  @__set == other.instance_variable_get(:@__set)
75
- elsif other.is_a?(self.class) && size == other.size
76
- other.all? { |cp| @__set.include?(cp) }
77
+ elsif other.is_a?(CharacterSet) || other.is_a?(CharacterSet::Pure)
78
+ size == other.size && other.all? { |cp| @__set.include?(cp) }
77
79
  else
78
80
  false
79
81
  end
@@ -81,7 +83,13 @@ class CharacterSet
81
83
 
82
84
  def eql?(other)
83
85
  return false unless other.is_a?(self.class)
84
- @__set.eql?(other.instance_variable_get(:@__set))
86
+ # revert if https://github.com/knu/sorted_set/issues/3 is resolved
87
+ hash == other.hash
88
+ end
89
+
90
+ # revert if https://github.com/knu/sorted_set/issues/3 is resolved
91
+ def hash
92
+ @__set.to_a.hash
85
93
  end
86
94
 
87
95
  def initialize_dup(orig)
@@ -22,14 +22,14 @@ class CharacterSet
22
22
  end
23
23
 
24
24
  def of_property(property_name)
25
- require_optional_dependency('regexp_property_values')
25
+ require_optional_dependency('regexp_property_values', __method__)
26
26
 
27
27
  property = RegexpPropertyValues[property_name.to_s]
28
28
  from_ranges(*property.matched_ranges)
29
29
  end
30
30
 
31
31
  def of_regexp(regexp)
32
- require_optional_dependency('regexp_parser')
32
+ require_optional_dependency('regexp_parser', __method__)
33
33
 
34
34
  root = ::Regexp::Parser.parse(regexp)
35
35
  of_expression(root)
@@ -39,16 +39,12 @@ class CharacterSet
39
39
  ExpressionConverter.convert(expression)
40
40
  end
41
41
 
42
- def require_optional_dependency(name)
42
+ def require_optional_dependency(name, method)
43
43
  required_optional_dependencies[name] ||= begin
44
44
  require name
45
45
  true
46
46
  rescue ::LoadError
47
- entry_point = caller_locations.reverse.find do |loc|
48
- loc.absolute_path.to_s.include?('/lib/character_set')
49
- end
50
- method = entry_point && entry_point.label
51
- raise LoadError, 'You must the install the optional dependency '\
47
+ raise LoadError, 'You must install the optional dependency '\
52
48
  "'\#{name}' to use the method `\#{method}'."
53
49
  end
54
50
  end
@@ -70,37 +66,56 @@ class CharacterSet
70
66
  merge(enum)
71
67
  end
72
68
 
73
- # stringification methods
69
+ # CharacterSet-specific conversion methods
70
+
71
+ def assigned_part
72
+ self & self.class.assigned
73
+ end
74
+
75
+ def valid_part
76
+ self - self.class.surrogate
77
+ end
78
+
79
+ # CharacterSet-specific stringification methods
74
80
 
75
81
  def to_s(opts = {}, &block)
76
82
  Writer.write(ranges, opts, &block)
77
83
  end
78
84
 
85
+ def to_s_with_surrogate_ranges
86
+ Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
87
+ end
88
+
79
89
  def to_s_with_surrogate_alternation
80
90
  Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
81
91
  end
82
92
 
83
93
  def inspect
84
94
  len = length
85
- "#<CharacterSet: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
95
+ "#<#{klass.name}: {\#{first(5) * ', '}\#{'...' if len > 5}} (size: \#{len})>"
86
96
  end
87
97
 
88
- # unicode-plane-related methods
98
+ # C-extension adapter methods. Need overriding in pure fallback.
99
+ # Parsing kwargs in C is slower, verbose, and kinda deprecated.
89
100
 
90
- def bmp_part?
91
- !bmp_part.empty?
101
+ def inversion(include_surrogates: false, upto: 0x10FFFF)
102
+ ext_inversion(include_surrogates, upto)
92
103
  end
93
104
 
94
- def astral_part?
95
- !astral_part.empty?
105
+ def section(from:, upto: 0x10FFFF)
106
+ ext_section(from, upto)
96
107
  end
97
108
 
98
- def bmp_ratio
99
- bmp_part.count / count.to_f
109
+ def count_in_section(from:, upto: 0x10FFFF)
110
+ ext_count_in_section(from, upto)
100
111
  end
101
112
 
102
- def astral_ratio
103
- astral_part.count / count.to_f
113
+ def section?(from:, upto: 0x10FFFF)
114
+ ext_section?(from, upto)
115
+ end
116
+
117
+ def section_ratio(from:, upto: 0x10FFFF)
118
+ ext_section_ratio(from, upto)
104
119
  end
105
120
 
106
121
  #
@@ -136,42 +151,38 @@ class CharacterSet
136
151
  end
137
152
 
138
153
  def divide(&func)
139
- block_given? or return enum_for(__method__) { size }
140
- require 'set'
141
-
142
- if func.arity == 2
143
- require 'tsort'
154
+ CharacterSet.require_optional_dependency('set', __method__)
155
+ Set.new(to_a).divide(&func)
156
+ end
157
+ RUBY
144
158
 
145
- class << dig = {}
146
- include TSort
159
+ # CharacterSet-specific section methods
147
160
 
148
- alias tsort_each_node each_key
149
- def tsort_each_child(node, &block)
150
- fetch(node).each(&block)
151
- end
152
- end
161
+ {
162
+ ascii: 0..0x7F,
163
+ bmp: 0..0xFFFF,
164
+ astral: 0x10000..0x10FFFF,
165
+ }.each do |section_name, range|
166
+ klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
167
+ def #{section_name}_part
168
+ section(from: #{range.begin}, upto: #{range.end})
169
+ end
153
170
 
154
- each do |u|
155
- dig[u] = a = []
156
- each{ |v| a << v if yield(u, v) }
157
- end
171
+ def #{section_name}_part?
172
+ section?(from: #{range.begin}, upto: #{range.end})
173
+ end
158
174
 
159
- set = Set.new
160
- dig.each_strongly_connected_component do |css|
161
- set.add(self.class.new(css))
162
- end
163
- set
164
- else
165
- Set.new(classify(&func).values)
175
+ def #{section_name}_only?
176
+ #{range.begin == 0 ?
177
+ "!section?(from: #{range.end}, upto: 0x10FFFF)" :
178
+ "!section?(from: 0, upto: #{range.begin})"}
166
179
  end
167
- end
168
180
 
169
- # C-extension adapter method. Needs overriding in pure fallback.
170
- # Parsing kwargs in C is slower, verbose, and kinda deprecated.
171
- def inversion(include_surrogates: false, upto: 0x10FFFF)
172
- ext_inversion(include_surrogates, upto)
173
- end
174
- RUBY
181
+ def #{section_name}_ratio
182
+ section_ratio(from: #{range.begin}, upto: #{range.end})
183
+ end
184
+ RUBY
185
+ end
175
186
  end # self.included
176
187
  end # SharedMethods
177
188
  end