character_set 1.2.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +22 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +17 -0
  9. data/BENCHMARK.md +53 -17
  10. data/CHANGELOG.md +54 -0
  11. data/README.md +51 -12
  12. data/Rakefile +20 -18
  13. data/benchmarks/count_in.rb +13 -0
  14. data/benchmarks/delete_in.rb +1 -1
  15. data/benchmarks/scan.rb +13 -0
  16. data/benchmarks/shared.rb +5 -0
  17. data/benchmarks/z_add.rb +12 -0
  18. data/benchmarks/z_delete.rb +12 -0
  19. data/benchmarks/z_merge.rb +15 -0
  20. data/benchmarks/z_minmax.rb +12 -0
  21. data/bin/console +2 -0
  22. data/character_set.gemspec +17 -4
  23. data/ext/character_set/character_set.c +969 -415
  24. data/ext/character_set/unicode_casefold_table.h +44 -1
  25. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  26. data/lib/character_set/character.rb +1 -1
  27. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  28. data/lib/character_set/core_ext/string_ext.rb +3 -1
  29. data/lib/character_set/expression_converter.rb +41 -43
  30. data/lib/character_set/parser.rb +1 -1
  31. data/lib/character_set/predefined_sets/any.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  33. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  34. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  35. data/lib/character_set/predefined_sets/assigned.cps +677 -0
  36. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  37. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  38. data/lib/character_set/predefined_sets/emoji.cps +152 -0
  39. data/lib/character_set/predefined_sets/newline.cps +3 -0
  40. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  41. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  42. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  43. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  44. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  45. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  46. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  47. data/lib/character_set/predefined_sets.rb +25 -260
  48. data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
  49. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  50. data/lib/character_set/ruby_fallback.rb +5 -3
  51. data/lib/character_set/set_method_adapters.rb +4 -3
  52. data/lib/character_set/shared_methods.rb +69 -50
  53. data/lib/character_set/version.rb +1 -1
  54. data/lib/character_set/writer.rb +98 -27
  55. metadata +114 -17
  56. data/.travis.yml +0 -8
  57. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
6
6
  unsigned long to;
7
7
  } casefold_mapping;
8
8
 
9
- #define CASEFOLD_COUNT 1383
9
+ #define CASEFOLD_COUNT 1426
10
10
 
11
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
12
12
  {0x0041,0x0061},
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
564
564
  {0x104D1,0x104F9},
565
565
  {0x104D2,0x104FA},
566
566
  {0x104D3,0x104FB},
567
+ {0x10570,0x10597},
568
+ {0x10571,0x10598},
569
+ {0x10572,0x10599},
570
+ {0x10573,0x1059A},
571
+ {0x10574,0x1059B},
572
+ {0x10575,0x1059C},
573
+ {0x10576,0x1059D},
574
+ {0x10577,0x1059E},
575
+ {0x10578,0x1059F},
576
+ {0x10579,0x105A0},
577
+ {0x1057A,0x105A1},
578
+ {0x1057C,0x105A3},
579
+ {0x1057D,0x105A4},
580
+ {0x1057E,0x105A5},
581
+ {0x1057F,0x105A6},
582
+ {0x10580,0x105A7},
583
+ {0x10581,0x105A8},
584
+ {0x10582,0x105A9},
585
+ {0x10583,0x105AA},
586
+ {0x10584,0x105AB},
587
+ {0x10585,0x105AC},
588
+ {0x10586,0x105AD},
589
+ {0x10587,0x105AE},
590
+ {0x10588,0x105AF},
591
+ {0x10589,0x105B0},
592
+ {0x1058A,0x105B1},
593
+ {0x1058C,0x105B3},
594
+ {0x1058D,0x105B4},
595
+ {0x1058E,0x105B5},
596
+ {0x1058F,0x105B6},
597
+ {0x10590,0x105B7},
598
+ {0x10591,0x105B8},
599
+ {0x10592,0x105B9},
600
+ {0x10594,0x105BB},
601
+ {0x10595,0x105BC},
567
602
  {0x10A0,0x2D00},
568
603
  {0x10A1,0x2D01},
569
604
  {0x10A2,0x2D02},
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1102
1137
  {0x2C2C,0x2C5C},
1103
1138
  {0x2C2D,0x2C5D},
1104
1139
  {0x2C2E,0x2C5E},
1140
+ {0x2C2F,0x2C5F},
1105
1141
  {0x2C60,0x2C61},
1106
1142
  {0x2C62,0x026B},
1107
1143
  {0x2C63,0x1D7D},
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1282
1318
  {0xA7BA,0xA7BB},
1283
1319
  {0xA7BC,0xA7BD},
1284
1320
  {0xA7BE,0xA7BF},
1321
+ {0xA7C0,0xA7C1},
1285
1322
  {0xA7C2,0xA7C3},
1286
1323
  {0xA7C4,0xA794},
1287
1324
  {0xA7C5,0x0282},
1288
1325
  {0xA7C6,0x1D8E},
1326
+ {0xA7C7,0xA7C8},
1327
+ {0xA7C9,0xA7CA},
1328
+ {0xA7D0,0xA7D1},
1329
+ {0xA7D6,0xA7D7},
1330
+ {0xA7D8,0xA7D9},
1331
+ {0xA7F5,0xA7F6},
1289
1332
  {0xAB70,0x13A0},
1290
1333
  {0xAB71,0x13A1},
1291
1334
  {0xAB72,0x13A2},
@@ -0,0 +1,11 @@
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
3
+
4
+ typedef struct casefold_mapping {
5
+ unsigned long from;
6
+ unsigned long to;
7
+ } casefold_mapping;
8
+
9
+ #define CASEFOLD_COUNT 0
10
+
11
+ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -8,4 +8,4 @@ class CharacterSet
8
8
  end
9
9
  end
10
10
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
11
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -6,11 +6,13 @@ class CharacterSet
6
6
  end
7
7
 
8
8
  {
9
+ count_by_character_set: :count_in,
9
10
  covered_by_character_set?: :cover?,
10
11
  delete_character_set: :delete_in,
11
12
  delete_character_set!: :delete_in!,
12
13
  keep_character_set: :keep_in,
13
14
  keep_character_set!: :keep_in!,
15
+ scan_by_character_set: :scan,
14
16
  uses_character_set?: :used_by?,
15
17
  }.each do |string_method, set_method|
16
18
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
@@ -27,4 +29,4 @@ class CharacterSet
27
29
  end
28
30
  end
29
31
 
30
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -4,62 +4,57 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
7
+ def convert(expression, to = CharacterSet)
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
11
  when Regexp::Expression::Root
12
12
  if expression.count != 1
13
13
  raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
14
  end
15
- convert(expression[0])
15
+ convert(expression[0], to)
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp) }.reduce(:+)
18
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
19
+ content ||= to[]
19
20
  expression.negative? ? content.inversion : content
20
21
 
21
22
  when Regexp::Expression::CharacterSet::Intersection
22
- expression.map { |subexp| convert(subexp) }.reduce(:&)
23
+ expression.map { |subexp| convert(subexp, to) }.reduce(:&)
23
24
 
24
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
26
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
26
27
 
27
28
  when Regexp::Expression::CharacterSet::Range
28
- start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.from_ranges((start.min)..(finish.max))
29
+ start, finish = expression.map { |subexp| convert(subexp, to) }
30
+ to.new((start.min)..(finish.max))
30
31
 
31
32
  when Regexp::Expression::CharacterType::Any
32
- CharacterSet.unicode
33
-
34
- when Regexp::Expression::CharacterType::Digit
35
- CharacterSet.from_ranges(48..57)
36
-
37
- when Regexp::Expression::CharacterType::NonDigit
38
- CharacterSet.from_ranges(48..57).inversion
39
-
40
- when Regexp::Expression::CharacterType::Hex
41
- CharacterSet.from_ranges(48..57, 65..70, 97..102)
42
-
43
- when Regexp::Expression::CharacterType::NonHex
44
- CharacterSet.from_ranges(48..57, 65..70, 97..102).inversion
45
-
46
- when Regexp::Expression::CharacterType::Space
47
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]
48
-
49
- when Regexp::Expression::CharacterType::NonSpace
50
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"].inversion
51
-
52
- when Regexp::Expression::CharacterType::Word
53
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
54
-
55
- when Regexp::Expression::CharacterType::NonWord
56
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
33
+ to.unicode
34
+
35
+ when Regexp::Expression::CharacterType::Base
36
+ /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
+ content =
38
+ if expression.unicode_classes?
39
+ # in u-mode, type shortcuts match the same as \p{<long type name>}
40
+ to.of_property(base_name)
41
+ else
42
+ # in normal mode, types match only ascii chars
43
+ case base_name.to_sym
44
+ when :digit then to.from_ranges(48..57)
45
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then to.from_ranges(9..13, 32..32)
47
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
48
+ else raise Error, "Unsupported CharacterType #{base_name}"
49
+ end
50
+ end
51
+ negative ? content.inversion : content
57
52
 
58
53
  when Regexp::Expression::EscapeSequence::CodepointList
59
- CharacterSet.new(expression.codepoints)
54
+ to.new(expression.codepoints)
60
55
 
61
56
  when Regexp::Expression::EscapeSequence::Base
62
- CharacterSet[expression.codepoint]
57
+ to[expression.codepoint]
63
58
 
64
59
  when Regexp::Expression::Group::Capture,
65
60
  Regexp::Expression::Group::Passive,
@@ -67,19 +62,19 @@ class CharacterSet
67
62
  Regexp::Expression::Group::Atomic,
68
63
  Regexp::Expression::Group::Options
69
64
  case expression.count
70
- when 0 then CharacterSet[]
71
- when 1 then convert(expression.first)
65
+ when 0 then to[]
66
+ when 1 then convert(expression.first, to)
72
67
  else
73
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
74
69
  end
75
70
 
76
- when Regexp::Expression::Alternation
77
- expression.map { |subexp| convert(subexp) }.reduce(:+)
71
+ when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
72
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+)
78
73
 
79
74
  when Regexp::Expression::Alternative
80
75
  case expression.count
81
- when 0 then CharacterSet[]
82
- when 1 then convert(expression.first)
76
+ when 0 then to[]
77
+ when 1 then convert(expression.first, to)
83
78
  else
84
79
  raise Error, 'Alternatives must contain exactly one expression'
85
80
  end
@@ -88,11 +83,14 @@ class CharacterSet
88
83
  if expression.set_level == 0 && expression.text.size != 1
89
84
  raise Error, 'Literal runs outside of sets are codepoint *sequences*'
90
85
  end
91
- CharacterSet[expression.text.ord]
86
+ to[expression.text.ord]
92
87
 
93
88
  when Regexp::Expression::UnicodeProperty::Base,
94
89
  Regexp::Expression::PosixClass
95
- content = CharacterSet.of_property(expression.token)
90
+ content = to.of_property(expression.token)
91
+ if expression.type == :posixclass && expression.ascii_classes?
92
+ content = content.ascii_part
93
+ end
96
94
  expression.negative? ? content.inversion : content
97
95
 
98
96
  when Regexp::Expression::Base
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
7
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e|
8
+ object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
9
  return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
10
  return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
11
  raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
@@ -0,0 +1 @@
1
+ 0,10FFFF
@@ -0,0 +1 @@
1
+ 0,7F
@@ -0,0 +1,3 @@
1
+ 30,39
2
+ 41,5A
3
+ 61,7A
@@ -0,0 +1,2 @@
1
+ 41,5A
2
+ 61,7A