character_set 1.1.1 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +88 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -1,11 +1,12 @@
1
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
2
3
 
3
4
  typedef struct casefold_mapping {
4
5
  unsigned long from;
5
6
  unsigned long to;
6
7
  } casefold_mapping;
7
8
 
8
- #define CASEFOLD_COUNT 1376
9
+ #define CASEFOLD_COUNT 1383
9
10
 
10
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
11
12
  {0x0041,0x0061},
@@ -1278,6 +1279,13 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1278
1279
  {0xA7B4,0xA7B5},
1279
1280
  {0xA7B6,0xA7B7},
1280
1281
  {0xA7B8,0xA7B9},
1282
+ {0xA7BA,0xA7BB},
1283
+ {0xA7BC,0xA7BD},
1284
+ {0xA7BE,0xA7BF},
1285
+ {0xA7C2,0xA7C3},
1286
+ {0xA7C4,0xA794},
1287
+ {0xA7C5,0x0282},
1288
+ {0xA7C6,0x1D8E},
1281
1289
  {0xAB70,0x13A0},
1282
1290
  {0xAB71,0x13A1},
1283
1291
  {0xAB72,0x13A2},
@@ -0,0 +1,11 @@
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
3
+
4
+ typedef struct casefold_mapping {
5
+ unsigned long from;
6
+ unsigned long to;
7
+ } casefold_mapping;
8
+
9
+ #define CASEFOLD_COUNT 0
10
+
11
+ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -8,4 +8,4 @@ class CharacterSet
8
8
  end
9
9
  end
10
10
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
11
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -6,11 +6,13 @@ class CharacterSet
6
6
  end
7
7
 
8
8
  {
9
+ count_by_character_set: :count_in,
9
10
  covered_by_character_set?: :cover?,
10
11
  delete_character_set: :delete_in,
11
12
  delete_character_set!: :delete_in!,
12
13
  keep_character_set: :keep_in,
13
14
  keep_character_set!: :keep_in!,
15
+ scan_by_character_set: :scan,
14
16
  uses_character_set?: :used_by?,
15
17
  }.each do |string_method, set_method|
16
18
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
@@ -27,4 +29,4 @@ class CharacterSet
27
29
  end
28
30
  end
29
31
 
30
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
7
  def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
11
  when Regexp::Expression::Root
@@ -16,44 +16,39 @@ class CharacterSet
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
18
  content = expression.map { |subexp| convert(subexp) }.reduce(:+)
19
+ content ||= CharacterSet[]
19
20
  expression.negative? ? content.inversion : content
20
21
 
21
22
  when Regexp::Expression::CharacterSet::Intersection
22
23
  expression.map { |subexp| convert(subexp) }.reduce(:&)
23
24
 
24
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
26
+ expression.map { |subexp| convert(subexp) }.reduce(:+) || CharacterSet[]
26
27
 
27
28
  when Regexp::Expression::CharacterSet::Range
28
29
  start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.from_ranges((start.min)..(finish.max))
30
+ CharacterSet.new((start.min)..(finish.max))
30
31
 
31
32
  when Regexp::Expression::CharacterType::Any
32
33
  CharacterSet.unicode
33
34
 
34
- when Regexp::Expression::CharacterType::Digit
35
- CharacterSet.from_ranges(48..57)
36
-
37
- when Regexp::Expression::CharacterType::NonDigit
38
- CharacterSet.from_ranges(48..57).inversion
39
-
40
- when Regexp::Expression::CharacterType::Hex
41
- CharacterSet.from_ranges(48..57, 65..70, 97..102)
42
-
43
- when Regexp::Expression::CharacterType::NonHex
44
- CharacterSet.from_ranges(48..57, 65..70, 97..102).inversion
45
-
46
- when Regexp::Expression::CharacterType::Space
47
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]
48
-
49
- when Regexp::Expression::CharacterType::NonSpace
50
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"].inversion
51
-
52
- when Regexp::Expression::CharacterType::Word
53
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
54
-
55
- when Regexp::Expression::CharacterType::NonWord
56
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
35
+ when Regexp::Expression::CharacterType::Base
36
+ /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
+ content =
38
+ if expression.unicode_classes?
39
+ # in u-mode, type shortcuts match the same as \p{<long type name>}
40
+ CharacterSet.of_property(base_name)
41
+ else
42
+ # in normal mode, types match only ascii chars
43
+ case base_name.to_sym
44
+ when :digit then CharacterSet.from_ranges(48..57)
45
+ when :hex then CharacterSet.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then CharacterSet.from_ranges(9..13, 32..32)
47
+ when :word then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
48
+ else raise Error, "Unsupported CharacterType #{base_name}"
49
+ end
50
+ end
51
+ negative ? content.inversion : content
57
52
 
58
53
  when Regexp::Expression::EscapeSequence::CodepointList
59
54
  CharacterSet.new(expression.codepoints)
@@ -73,7 +68,7 @@ class CharacterSet
73
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
74
69
  end
75
70
 
76
- when Regexp::Expression::Alternation
71
+ when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
77
72
  expression.map { |subexp| convert(subexp) }.reduce(:+)
78
73
 
79
74
  when Regexp::Expression::Alternative
@@ -93,6 +88,9 @@ class CharacterSet
93
88
  when Regexp::Expression::UnicodeProperty::Base,
94
89
  Regexp::Expression::PosixClass
95
90
  content = CharacterSet.of_property(expression.token)
91
+ if expression.type == :posixclass && expression.ascii_classes?
92
+ content = content.ascii_part
93
+ end
96
94
  expression.negative? ? content.inversion : content
97
95
 
98
96
  when Regexp::Expression::Base
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
7
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e|
8
+ object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
9
  return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
10
  return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
11
  raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
@@ -1,266 +1,31 @@
1
1
  class CharacterSet
2
2
  module PredefinedSets
3
- def ascii
4
- @ascii ||= from_ranges(0..0x7F).freeze
5
- end
6
-
7
- def ascii_alnum
8
- @ascii_alnum ||= from_ranges(0x30..0x39, 0x41..0x5A, 0x61..0x7A).freeze
9
- end
10
-
11
- def ascii_letters
12
- @ascii_letters ||= from_ranges(0x41..0x5A, 0x61..0x7A).freeze
13
- end
14
-
15
- # basic multilingual plane
16
- def bmp
17
- @bmp ||= from_ranges(0..0xD7FF, 0xE000..0xFFFF).freeze
18
- end
19
-
20
- # ./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
21
- def crypt
22
- @crypt ||= from_ranges(0x2E..0x5A, 0x61..0x7A).freeze
23
- end
24
-
25
- def newline
26
- @newline ||= from_ranges(0xA..0xD, 0x85..0x85, 0x2028..0x2029).freeze
27
- end
28
-
29
- def unicode
30
- @unicode ||= from_ranges(0..0xD7FF, 0xE000..0x10FFFF).freeze
31
- end
32
-
33
- def url_fragment
34
- @url_fragment ||= from_ranges(
35
- 0x21..0x21,
36
- 0x24..0x24,
37
- 0x26..0x3B,
38
- 0x3D..0x3D,
39
- 0x3F..0x5A,
40
- 0x5F..0x5F,
41
- 0x61..0x7A,
42
- 0x7E..0x7E
43
- ).freeze
44
- end
45
-
46
- def url_host
47
- @url_host ||= from_ranges(
48
- 0x21..0x21,
49
- 0x24..0x24,
50
- 0x26..0x2E,
51
- 0x30..0x3B,
52
- 0x3D..0x3D,
53
- 0x41..0x5B,
54
- 0x5D..0x5D,
55
- 0x5F..0x5F,
56
- 0x61..0x7A,
57
- 0x7E..0x7E
58
- ).freeze
59
- end
60
-
61
- def url_path
62
- @url_path ||= from_ranges(
63
- 0x21..0x21,
64
- 0x24..0x3A,
65
- 0x3D..0x3D,
66
- 0x40..0x5A,
67
- 0x5F..0x5F,
68
- 0x61..0x7A,
69
- 0x7E..0x7E
70
- ).freeze
71
- end
72
-
73
- def url_query
74
- @url_query ||= from_ranges(
75
- 0x21..0x21,
76
- 0x24..0x24,
77
- 0x26..0x3B,
78
- 0x3D..0x3D,
79
- 0x3F..0x5A,
80
- 0x5F..0x5F,
81
- 0x61..0x7A,
82
- 0x7E..0x7E
83
- ).freeze
84
- end
85
-
86
- def whitespace
87
- @whitespace ||= from_ranges(
88
- 0x9..0x9,
89
- 0xA..0xD,
90
- 0x20..0x20,
91
- 0x85..0x85,
92
- 0xA0..0xA0,
93
- 0x1680..0x1680,
94
- 0x180E..0x180E,
95
- 0x2000..0x200A,
96
- 0x2028..0x2029,
97
- 0x202F..0x202F,
98
- 0x205F..0x205F,
99
- 0x3000..0x3000
100
- ).freeze
101
- end
102
-
103
- def emoji
104
- @emoji ||= from_ranges(
105
- 0x23..0x23,
106
- 0x2A..0x2A,
107
- 0x30..0x39,
108
- 0xA9..0xA9,
109
- 0xAE..0xAE,
110
- 0x203C..0x203C,
111
- 0x2049..0x2049,
112
- 0x2122..0x2122,
113
- 0x2139..0x2139,
114
- 0x2194..0x2199,
115
- 0x21A9..0x21AA,
116
- 0x231A..0x231B,
117
- 0x2328..0x2328,
118
- 0x23CF..0x23CF,
119
- 0x23E9..0x23F3,
120
- 0x23F8..0x23FA,
121
- 0x24C2..0x24C2,
122
- 0x25AA..0x25AB,
123
- 0x25B6..0x25B6,
124
- 0x25C0..0x25C0,
125
- 0x25FB..0x25FE,
126
- 0x2600..0x2604,
127
- 0x260E..0x260E,
128
- 0x2611..0x2611,
129
- 0x2614..0x2615,
130
- 0x2618..0x2618,
131
- 0x261D..0x261D,
132
- 0x2620..0x2620,
133
- 0x2622..0x2623,
134
- 0x2626..0x2626,
135
- 0x262A..0x262A,
136
- 0x262E..0x262F,
137
- 0x2638..0x263A,
138
- 0x2640..0x2640,
139
- 0x2642..0x2642,
140
- 0x2648..0x2653,
141
- 0x2660..0x2660,
142
- 0x2663..0x2663,
143
- 0x2665..0x2666,
144
- 0x2668..0x2668,
145
- 0x267B..0x267B,
146
- 0x267F..0x267F,
147
- 0x2692..0x2697,
148
- 0x2699..0x2699,
149
- 0x269B..0x269C,
150
- 0x26A0..0x26A1,
151
- 0x26AA..0x26AB,
152
- 0x26B0..0x26B1,
153
- 0x26BD..0x26BE,
154
- 0x26C4..0x26C5,
155
- 0x26C8..0x26C8,
156
- 0x26CE..0x26CF,
157
- 0x26D1..0x26D1,
158
- 0x26D3..0x26D4,
159
- 0x26E9..0x26EA,
160
- 0x26F0..0x26F5,
161
- 0x26F7..0x26FA,
162
- 0x26FD..0x26FD,
163
- 0x2702..0x2702,
164
- 0x2705..0x2705,
165
- 0x2708..0x270D,
166
- 0x270F..0x270F,
167
- 0x2712..0x2712,
168
- 0x2714..0x2714,
169
- 0x2716..0x2716,
170
- 0x271D..0x271D,
171
- 0x2721..0x2721,
172
- 0x2728..0x2728,
173
- 0x2733..0x2734,
174
- 0x2744..0x2744,
175
- 0x2747..0x2747,
176
- 0x274C..0x274C,
177
- 0x274E..0x274E,
178
- 0x2753..0x2755,
179
- 0x2757..0x2757,
180
- 0x2763..0x2764,
181
- 0x2795..0x2797,
182
- 0x27A1..0x27A1,
183
- 0x27B0..0x27B0,
184
- 0x27BF..0x27BF,
185
- 0x2934..0x2935,
186
- 0x2B05..0x2B07,
187
- 0x2B1B..0x2B1C,
188
- 0x2B50..0x2B50,
189
- 0x2B55..0x2B55,
190
- 0x3030..0x3030,
191
- 0x303D..0x303D,
192
- 0x3297..0x3297,
193
- 0x3299..0x3299,
194
- 0x1F004..0x1F004,
195
- 0x1F0CF..0x1F0CF,
196
- 0x1F170..0x1F171,
197
- 0x1F17E..0x1F17F,
198
- 0x1F18E..0x1F18E,
199
- 0x1F191..0x1F19A,
200
- 0x1F1E6..0x1F1FF,
201
- 0x1F201..0x1F202,
202
- 0x1F21A..0x1F21A,
203
- 0x1F22F..0x1F22F,
204
- 0x1F232..0x1F23A,
205
- 0x1F250..0x1F251,
206
- 0x1F300..0x1F321,
207
- 0x1F324..0x1F393,
208
- 0x1F396..0x1F397,
209
- 0x1F399..0x1F39B,
210
- 0x1F39E..0x1F3F0,
211
- 0x1F3F3..0x1F3F5,
212
- 0x1F3F7..0x1F4FD,
213
- 0x1F4FF..0x1F53D,
214
- 0x1F549..0x1F54E,
215
- 0x1F550..0x1F567,
216
- 0x1F56F..0x1F570,
217
- 0x1F573..0x1F57A,
218
- 0x1F587..0x1F587,
219
- 0x1F58A..0x1F58D,
220
- 0x1F590..0x1F590,
221
- 0x1F595..0x1F596,
222
- 0x1F5A4..0x1F5A5,
223
- 0x1F5A8..0x1F5A8,
224
- 0x1F5B1..0x1F5B2,
225
- 0x1F5BC..0x1F5BC,
226
- 0x1F5C2..0x1F5C4,
227
- 0x1F5D1..0x1F5D3,
228
- 0x1F5DC..0x1F5DE,
229
- 0x1F5E1..0x1F5E1,
230
- 0x1F5E3..0x1F5E3,
231
- 0x1F5E8..0x1F5E8,
232
- 0x1F5EF..0x1F5EF,
233
- 0x1F5F3..0x1F5F3,
234
- 0x1F5FA..0x1F64F,
235
- 0x1F680..0x1F6C5,
236
- 0x1F6CB..0x1F6D2,
237
- 0x1F6E0..0x1F6E5,
238
- 0x1F6E9..0x1F6E9,
239
- 0x1F6EB..0x1F6EC,
240
- 0x1F6F0..0x1F6F0,
241
- 0x1F6F3..0x1F6F8,
242
- 0x1F910..0x1F93A,
243
- 0x1F93C..0x1F93E,
244
- 0x1F940..0x1F945,
245
- 0x1F947..0x1F94C,
246
- 0x1F950..0x1F96B,
247
- 0x1F980..0x1F997,
248
- 0x1F9C0..0x1F9C0,
249
- 0x1F9D0..0x1F9E6
250
- ).freeze
251
- end
252
-
253
- def respond_to_missing?(method_name, include_private = false)
254
- (base = method_name[/^non_(.*)/, 1]) && respond_to?(base) || super
255
- end
256
-
257
- def method_missing(method_name, *args, &block)
258
- if (base = method_name[/^non_(.*)/, 1])
259
- ivar_name = "@#{method_name}"
260
- return instance_variable_get(ivar_name) ||
261
- instance_variable_set(ivar_name, send(base).inversion.freeze)
3
+ Dir[File.join(__dir__, 'predefined_sets', '*.cps')].each do |path|
4
+ set_name = File.basename(path, '.cps')
5
+
6
+ class_eval <<-RUBY, __FILE__, __LINE__ + 1
7
+ def #{set_name}
8
+ @#{set_name} ||= build_from_cps_file('#{path}').freeze
9
+ end
10
+
11
+ def non_#{set_name}
12
+ @non_#{set_name} ||= build_from_cps_file('#{path}').inversion.freeze
13
+ end
14
+ RUBY
15
+ end
16
+
17
+ alias all any
18
+ alias ascii_letters ascii_letter
19
+ alias basic_multilingual_plane bmp
20
+ alias blank whitespace
21
+ alias invalid surrogate
22
+ alias valid unicode
23
+
24
+ def build_from_cps_file(path)
25
+ File.readlines(path).inject(new) do |set, line|
26
+ range_start, range_end = line.split(',')
27
+ set.merge((range_start.to_i(16))..(range_end.to_i(16)))
262
28
  end
263
- super
264
29
  end
265
30
  end
266
31
  end