character_set 1.1.1-java → 1.4.1-java

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/lint.yml +29 -0
  4. data/.github/workflows/tests.yml +22 -0
  5. data/.gitignore +1 -0
  6. data/.rubocop.yml +11 -0
  7. data/BENCHMARK.md +53 -17
  8. data/CHANGELOG.md +47 -0
  9. data/README.md +38 -14
  10. data/Rakefile +60 -36
  11. data/benchmarks/count_in.rb +13 -0
  12. data/benchmarks/delete_in.rb +1 -1
  13. data/benchmarks/scan.rb +13 -0
  14. data/benchmarks/shared.rb +5 -0
  15. data/benchmarks/z_add.rb +12 -0
  16. data/benchmarks/z_delete.rb +12 -0
  17. data/benchmarks/z_merge.rb +15 -0
  18. data/benchmarks/z_minmax.rb +12 -0
  19. data/bin/console +2 -0
  20. data/character_set.gemspec +17 -6
  21. data/ext/character_set/character_set.c +963 -414
  22. data/ext/character_set/unicode_casefold_table.h +10 -2
  23. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  24. data/lib/character_set/character.rb +1 -1
  25. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  26. data/lib/character_set/core_ext/string_ext.rb +3 -1
  27. data/lib/character_set/expression_converter.rb +25 -27
  28. data/lib/character_set/parser.rb +1 -1
  29. data/lib/character_set/predefined_sets.rb +25 -260
  30. data/lib/character_set/predefined_sets/any.cps +1 -0
  31. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  33. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  34. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  35. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  36. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  37. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  38. data/lib/character_set/predefined_sets/newline.cps +3 -0
  39. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  40. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  41. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  42. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  43. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  44. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  45. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  46. data/lib/character_set/ruby_fallback.rb +5 -3
  47. data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
  48. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  49. data/lib/character_set/shared_methods.rb +60 -49
  50. data/lib/character_set/version.rb +1 -1
  51. data/lib/character_set/writer.rb +98 -27
  52. metadata +102 -22
  53. data/.travis.yml +0 -11
  54. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -1,11 +1,12 @@
1
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
2
3
 
3
4
  typedef struct casefold_mapping {
4
5
  unsigned long from;
5
6
  unsigned long to;
6
7
  } casefold_mapping;
7
8
 
8
- #define CASEFOLD_COUNT 1376
9
+ #define CASEFOLD_COUNT 1383
9
10
 
10
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
11
12
  {0x0041,0x0061},
@@ -1278,6 +1279,13 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1278
1279
  {0xA7B4,0xA7B5},
1279
1280
  {0xA7B6,0xA7B7},
1280
1281
  {0xA7B8,0xA7B9},
1282
+ {0xA7BA,0xA7BB},
1283
+ {0xA7BC,0xA7BD},
1284
+ {0xA7BE,0xA7BF},
1285
+ {0xA7C2,0xA7C3},
1286
+ {0xA7C4,0xA794},
1287
+ {0xA7C5,0x0282},
1288
+ {0xA7C6,0x1D8E},
1281
1289
  {0xAB70,0x13A0},
1282
1290
  {0xAB71,0x13A1},
1283
1291
  {0xAB72,0x13A2},
@@ -0,0 +1,11 @@
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
3
+
4
+ typedef struct casefold_mapping {
5
+ unsigned long from;
6
+ unsigned long to;
7
+ } casefold_mapping;
8
+
9
+ #define CASEFOLD_COUNT 0
10
+
11
+ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -8,4 +8,4 @@ class CharacterSet
8
8
  end
9
9
  end
10
10
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
11
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -6,11 +6,13 @@ class CharacterSet
6
6
  end
7
7
 
8
8
  {
9
+ count_by_character_set: :count_in,
9
10
  covered_by_character_set?: :cover?,
10
11
  delete_character_set: :delete_in,
11
12
  delete_character_set!: :delete_in!,
12
13
  keep_character_set: :keep_in,
13
14
  keep_character_set!: :keep_in!,
15
+ scan_by_character_set: :scan,
14
16
  uses_character_set?: :used_by?,
15
17
  }.each do |string_method, set_method|
16
18
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
@@ -27,4 +29,4 @@ class CharacterSet
27
29
  end
28
30
  end
29
31
 
30
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
7
  def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
11
  when Regexp::Expression::Root
@@ -16,44 +16,39 @@ class CharacterSet
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
18
  content = expression.map { |subexp| convert(subexp) }.reduce(:+)
19
+ content ||= CharacterSet[]
19
20
  expression.negative? ? content.inversion : content
20
21
 
21
22
  when Regexp::Expression::CharacterSet::Intersection
22
23
  expression.map { |subexp| convert(subexp) }.reduce(:&)
23
24
 
24
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
26
+ expression.map { |subexp| convert(subexp) }.reduce(:+) || CharacterSet[]
26
27
 
27
28
  when Regexp::Expression::CharacterSet::Range
28
29
  start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.from_ranges((start.min)..(finish.max))
30
+ CharacterSet.new((start.min)..(finish.max))
30
31
 
31
32
  when Regexp::Expression::CharacterType::Any
32
33
  CharacterSet.unicode
33
34
 
34
- when Regexp::Expression::CharacterType::Digit
35
- CharacterSet.from_ranges(48..57)
36
-
37
- when Regexp::Expression::CharacterType::NonDigit
38
- CharacterSet.from_ranges(48..57).inversion
39
-
40
- when Regexp::Expression::CharacterType::Hex
41
- CharacterSet.from_ranges(48..57, 65..70, 97..102)
42
-
43
- when Regexp::Expression::CharacterType::NonHex
44
- CharacterSet.from_ranges(48..57, 65..70, 97..102).inversion
45
-
46
- when Regexp::Expression::CharacterType::Space
47
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]
48
-
49
- when Regexp::Expression::CharacterType::NonSpace
50
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"].inversion
51
-
52
- when Regexp::Expression::CharacterType::Word
53
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
54
-
55
- when Regexp::Expression::CharacterType::NonWord
56
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
35
+ when Regexp::Expression::CharacterType::Base
36
+ /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
+ content =
38
+ if expression.unicode_classes?
39
+ # in u-mode, type shortcuts match the same as \p{<long type name>}
40
+ CharacterSet.of_property(base_name)
41
+ else
42
+ # in normal mode, types match only ascii chars
43
+ case base_name.to_sym
44
+ when :digit then CharacterSet.from_ranges(48..57)
45
+ when :hex then CharacterSet.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then CharacterSet.from_ranges(9..13, 32..32)
47
+ when :word then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
48
+ else raise Error, "Unsupported CharacterType #{base_name}"
49
+ end
50
+ end
51
+ negative ? content.inversion : content
57
52
 
58
53
  when Regexp::Expression::EscapeSequence::CodepointList
59
54
  CharacterSet.new(expression.codepoints)
@@ -73,7 +68,7 @@ class CharacterSet
73
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
74
69
  end
75
70
 
76
- when Regexp::Expression::Alternation
71
+ when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
77
72
  expression.map { |subexp| convert(subexp) }.reduce(:+)
78
73
 
79
74
  when Regexp::Expression::Alternative
@@ -93,6 +88,9 @@ class CharacterSet
93
88
  when Regexp::Expression::UnicodeProperty::Base,
94
89
  Regexp::Expression::PosixClass
95
90
  content = CharacterSet.of_property(expression.token)
91
+ if expression.type == :posixclass && expression.ascii_classes?
92
+ content = content.ascii_part
93
+ end
96
94
  expression.negative? ? content.inversion : content
97
95
 
98
96
  when Regexp::Expression::Base
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
7
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e|
8
+ object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
9
  return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
10
  return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
11
  raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
@@ -1,266 +1,31 @@
1
1
  class CharacterSet
2
2
  module PredefinedSets
3
- def ascii
4
- @ascii ||= from_ranges(0..0x7F).freeze
5
- end
6
-
7
- def ascii_alnum
8
- @ascii_alnum ||= from_ranges(0x30..0x39, 0x41..0x5A, 0x61..0x7A).freeze
9
- end
10
-
11
- def ascii_letters
12
- @ascii_letters ||= from_ranges(0x41..0x5A, 0x61..0x7A).freeze
13
- end
14
-
15
- # basic multilingual plane
16
- def bmp
17
- @bmp ||= from_ranges(0..0xD7FF, 0xE000..0xFFFF).freeze
18
- end
19
-
20
- # ./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
21
- def crypt
22
- @crypt ||= from_ranges(0x2E..0x5A, 0x61..0x7A).freeze
23
- end
24
-
25
- def newline
26
- @newline ||= from_ranges(0xA..0xD, 0x85..0x85, 0x2028..0x2029).freeze
27
- end
28
-
29
- def unicode
30
- @unicode ||= from_ranges(0..0xD7FF, 0xE000..0x10FFFF).freeze
31
- end
32
-
33
- def url_fragment
34
- @url_fragment ||= from_ranges(
35
- 0x21..0x21,
36
- 0x24..0x24,
37
- 0x26..0x3B,
38
- 0x3D..0x3D,
39
- 0x3F..0x5A,
40
- 0x5F..0x5F,
41
- 0x61..0x7A,
42
- 0x7E..0x7E
43
- ).freeze
44
- end
45
-
46
- def url_host
47
- @url_host ||= from_ranges(
48
- 0x21..0x21,
49
- 0x24..0x24,
50
- 0x26..0x2E,
51
- 0x30..0x3B,
52
- 0x3D..0x3D,
53
- 0x41..0x5B,
54
- 0x5D..0x5D,
55
- 0x5F..0x5F,
56
- 0x61..0x7A,
57
- 0x7E..0x7E
58
- ).freeze
59
- end
60
-
61
- def url_path
62
- @url_path ||= from_ranges(
63
- 0x21..0x21,
64
- 0x24..0x3A,
65
- 0x3D..0x3D,
66
- 0x40..0x5A,
67
- 0x5F..0x5F,
68
- 0x61..0x7A,
69
- 0x7E..0x7E
70
- ).freeze
71
- end
72
-
73
- def url_query
74
- @url_query ||= from_ranges(
75
- 0x21..0x21,
76
- 0x24..0x24,
77
- 0x26..0x3B,
78
- 0x3D..0x3D,
79
- 0x3F..0x5A,
80
- 0x5F..0x5F,
81
- 0x61..0x7A,
82
- 0x7E..0x7E
83
- ).freeze
84
- end
85
-
86
- def whitespace
87
- @whitespace ||= from_ranges(
88
- 0x9..0x9,
89
- 0xA..0xD,
90
- 0x20..0x20,
91
- 0x85..0x85,
92
- 0xA0..0xA0,
93
- 0x1680..0x1680,
94
- 0x180E..0x180E,
95
- 0x2000..0x200A,
96
- 0x2028..0x2029,
97
- 0x202F..0x202F,
98
- 0x205F..0x205F,
99
- 0x3000..0x3000
100
- ).freeze
101
- end
102
-
103
- def emoji
104
- @emoji ||= from_ranges(
105
- 0x23..0x23,
106
- 0x2A..0x2A,
107
- 0x30..0x39,
108
- 0xA9..0xA9,
109
- 0xAE..0xAE,
110
- 0x203C..0x203C,
111
- 0x2049..0x2049,
112
- 0x2122..0x2122,
113
- 0x2139..0x2139,
114
- 0x2194..0x2199,
115
- 0x21A9..0x21AA,
116
- 0x231A..0x231B,
117
- 0x2328..0x2328,
118
- 0x23CF..0x23CF,
119
- 0x23E9..0x23F3,
120
- 0x23F8..0x23FA,
121
- 0x24C2..0x24C2,
122
- 0x25AA..0x25AB,
123
- 0x25B6..0x25B6,
124
- 0x25C0..0x25C0,
125
- 0x25FB..0x25FE,
126
- 0x2600..0x2604,
127
- 0x260E..0x260E,
128
- 0x2611..0x2611,
129
- 0x2614..0x2615,
130
- 0x2618..0x2618,
131
- 0x261D..0x261D,
132
- 0x2620..0x2620,
133
- 0x2622..0x2623,
134
- 0x2626..0x2626,
135
- 0x262A..0x262A,
136
- 0x262E..0x262F,
137
- 0x2638..0x263A,
138
- 0x2640..0x2640,
139
- 0x2642..0x2642,
140
- 0x2648..0x2653,
141
- 0x2660..0x2660,
142
- 0x2663..0x2663,
143
- 0x2665..0x2666,
144
- 0x2668..0x2668,
145
- 0x267B..0x267B,
146
- 0x267F..0x267F,
147
- 0x2692..0x2697,
148
- 0x2699..0x2699,
149
- 0x269B..0x269C,
150
- 0x26A0..0x26A1,
151
- 0x26AA..0x26AB,
152
- 0x26B0..0x26B1,
153
- 0x26BD..0x26BE,
154
- 0x26C4..0x26C5,
155
- 0x26C8..0x26C8,
156
- 0x26CE..0x26CF,
157
- 0x26D1..0x26D1,
158
- 0x26D3..0x26D4,
159
- 0x26E9..0x26EA,
160
- 0x26F0..0x26F5,
161
- 0x26F7..0x26FA,
162
- 0x26FD..0x26FD,
163
- 0x2702..0x2702,
164
- 0x2705..0x2705,
165
- 0x2708..0x270D,
166
- 0x270F..0x270F,
167
- 0x2712..0x2712,
168
- 0x2714..0x2714,
169
- 0x2716..0x2716,
170
- 0x271D..0x271D,
171
- 0x2721..0x2721,
172
- 0x2728..0x2728,
173
- 0x2733..0x2734,
174
- 0x2744..0x2744,
175
- 0x2747..0x2747,
176
- 0x274C..0x274C,
177
- 0x274E..0x274E,
178
- 0x2753..0x2755,
179
- 0x2757..0x2757,
180
- 0x2763..0x2764,
181
- 0x2795..0x2797,
182
- 0x27A1..0x27A1,
183
- 0x27B0..0x27B0,
184
- 0x27BF..0x27BF,
185
- 0x2934..0x2935,
186
- 0x2B05..0x2B07,
187
- 0x2B1B..0x2B1C,
188
- 0x2B50..0x2B50,
189
- 0x2B55..0x2B55,
190
- 0x3030..0x3030,
191
- 0x303D..0x303D,
192
- 0x3297..0x3297,
193
- 0x3299..0x3299,
194
- 0x1F004..0x1F004,
195
- 0x1F0CF..0x1F0CF,
196
- 0x1F170..0x1F171,
197
- 0x1F17E..0x1F17F,
198
- 0x1F18E..0x1F18E,
199
- 0x1F191..0x1F19A,
200
- 0x1F1E6..0x1F1FF,
201
- 0x1F201..0x1F202,
202
- 0x1F21A..0x1F21A,
203
- 0x1F22F..0x1F22F,
204
- 0x1F232..0x1F23A,
205
- 0x1F250..0x1F251,
206
- 0x1F300..0x1F321,
207
- 0x1F324..0x1F393,
208
- 0x1F396..0x1F397,
209
- 0x1F399..0x1F39B,
210
- 0x1F39E..0x1F3F0,
211
- 0x1F3F3..0x1F3F5,
212
- 0x1F3F7..0x1F4FD,
213
- 0x1F4FF..0x1F53D,
214
- 0x1F549..0x1F54E,
215
- 0x1F550..0x1F567,
216
- 0x1F56F..0x1F570,
217
- 0x1F573..0x1F57A,
218
- 0x1F587..0x1F587,
219
- 0x1F58A..0x1F58D,
220
- 0x1F590..0x1F590,
221
- 0x1F595..0x1F596,
222
- 0x1F5A4..0x1F5A5,
223
- 0x1F5A8..0x1F5A8,
224
- 0x1F5B1..0x1F5B2,
225
- 0x1F5BC..0x1F5BC,
226
- 0x1F5C2..0x1F5C4,
227
- 0x1F5D1..0x1F5D3,
228
- 0x1F5DC..0x1F5DE,
229
- 0x1F5E1..0x1F5E1,
230
- 0x1F5E3..0x1F5E3,
231
- 0x1F5E8..0x1F5E8,
232
- 0x1F5EF..0x1F5EF,
233
- 0x1F5F3..0x1F5F3,
234
- 0x1F5FA..0x1F64F,
235
- 0x1F680..0x1F6C5,
236
- 0x1F6CB..0x1F6D2,
237
- 0x1F6E0..0x1F6E5,
238
- 0x1F6E9..0x1F6E9,
239
- 0x1F6EB..0x1F6EC,
240
- 0x1F6F0..0x1F6F0,
241
- 0x1F6F3..0x1F6F8,
242
- 0x1F910..0x1F93A,
243
- 0x1F93C..0x1F93E,
244
- 0x1F940..0x1F945,
245
- 0x1F947..0x1F94C,
246
- 0x1F950..0x1F96B,
247
- 0x1F980..0x1F997,
248
- 0x1F9C0..0x1F9C0,
249
- 0x1F9D0..0x1F9E6
250
- ).freeze
251
- end
252
-
253
- def respond_to_missing?(method_name, include_private = false)
254
- (base = method_name[/^non_(.*)/, 1]) && respond_to?(base) || super
255
- end
256
-
257
- def method_missing(method_name, *args, &block)
258
- if (base = method_name[/^non_(.*)/, 1])
259
- ivar_name = "@#{method_name}"
260
- return instance_variable_get(ivar_name) ||
261
- instance_variable_set(ivar_name, send(base).inversion.freeze)
3
+ Dir[File.join(__dir__, 'predefined_sets', '*.cps')].each do |path|
4
+ set_name = File.basename(path, '.cps')
5
+
6
+ class_eval <<-RUBY, __FILE__, __LINE__ + 1
7
+ def #{set_name}
8
+ @#{set_name} ||= build_from_cps_file('#{path}').freeze
9
+ end
10
+
11
+ def non_#{set_name}
12
+ @non_#{set_name} ||= build_from_cps_file('#{path}').inversion.freeze
13
+ end
14
+ RUBY
15
+ end
16
+
17
+ alias all any
18
+ alias ascii_letters ascii_letter
19
+ alias basic_multilingual_plane bmp
20
+ alias blank whitespace
21
+ alias invalid surrogate
22
+ alias valid unicode
23
+
24
+ def build_from_cps_file(path)
25
+ File.readlines(path).inject(new) do |set, line|
26
+ range_start, range_end = line.split(',')
27
+ set.merge((range_start.to_i(16))..(range_end.to_i(16)))
262
28
  end
263
- super
264
29
  end
265
30
  end
266
31
  end