character_set 1.2.0-java → 1.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.github/workflows/gouteur.yml +20 -0
  4. data/.github/workflows/lint.yml +29 -0
  5. data/.github/workflows/tests.yml +22 -0
  6. data/.gitignore +1 -0
  7. data/.gouteur.yml +2 -0
  8. data/.rubocop.yml +17 -0
  9. data/BENCHMARK.md +53 -17
  10. data/CHANGELOG.md +54 -0
  11. data/README.md +51 -12
  12. data/Rakefile +20 -18
  13. data/benchmarks/count_in.rb +13 -0
  14. data/benchmarks/delete_in.rb +1 -1
  15. data/benchmarks/scan.rb +13 -0
  16. data/benchmarks/shared.rb +5 -0
  17. data/benchmarks/z_add.rb +12 -0
  18. data/benchmarks/z_delete.rb +12 -0
  19. data/benchmarks/z_merge.rb +15 -0
  20. data/benchmarks/z_minmax.rb +12 -0
  21. data/bin/console +2 -0
  22. data/character_set.gemspec +17 -4
  23. data/ext/character_set/character_set.c +969 -415
  24. data/ext/character_set/unicode_casefold_table.h +44 -1
  25. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  26. data/lib/character_set/character.rb +1 -1
  27. data/lib/character_set/core_ext/regexp_ext.rb +1 -1
  28. data/lib/character_set/core_ext/string_ext.rb +3 -1
  29. data/lib/character_set/expression_converter.rb +41 -43
  30. data/lib/character_set/parser.rb +1 -1
  31. data/lib/character_set/predefined_sets/any.cps +1 -0
  32. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  33. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  34. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  35. data/lib/character_set/predefined_sets/assigned.cps +677 -0
  36. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  37. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  38. data/lib/character_set/predefined_sets/emoji.cps +152 -0
  39. data/lib/character_set/predefined_sets/newline.cps +3 -0
  40. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  41. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  42. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  43. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  44. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  45. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  46. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  47. data/lib/character_set/predefined_sets.rb +25 -260
  48. data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
  49. data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
  50. data/lib/character_set/ruby_fallback.rb +5 -3
  51. data/lib/character_set/set_method_adapters.rb +4 -3
  52. data/lib/character_set/shared_methods.rb +69 -50
  53. data/lib/character_set/version.rb +1 -1
  54. data/lib/character_set/writer.rb +98 -27
  55. metadata +114 -17
  56. data/.travis.yml +0 -8
  57. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
6
6
  unsigned long to;
7
7
  } casefold_mapping;
8
8
 
9
- #define CASEFOLD_COUNT 1383
9
+ #define CASEFOLD_COUNT 1426
10
10
 
11
11
  static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
12
12
  {0x0041,0x0061},
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
564
564
  {0x104D1,0x104F9},
565
565
  {0x104D2,0x104FA},
566
566
  {0x104D3,0x104FB},
567
+ {0x10570,0x10597},
568
+ {0x10571,0x10598},
569
+ {0x10572,0x10599},
570
+ {0x10573,0x1059A},
571
+ {0x10574,0x1059B},
572
+ {0x10575,0x1059C},
573
+ {0x10576,0x1059D},
574
+ {0x10577,0x1059E},
575
+ {0x10578,0x1059F},
576
+ {0x10579,0x105A0},
577
+ {0x1057A,0x105A1},
578
+ {0x1057C,0x105A3},
579
+ {0x1057D,0x105A4},
580
+ {0x1057E,0x105A5},
581
+ {0x1057F,0x105A6},
582
+ {0x10580,0x105A7},
583
+ {0x10581,0x105A8},
584
+ {0x10582,0x105A9},
585
+ {0x10583,0x105AA},
586
+ {0x10584,0x105AB},
587
+ {0x10585,0x105AC},
588
+ {0x10586,0x105AD},
589
+ {0x10587,0x105AE},
590
+ {0x10588,0x105AF},
591
+ {0x10589,0x105B0},
592
+ {0x1058A,0x105B1},
593
+ {0x1058C,0x105B3},
594
+ {0x1058D,0x105B4},
595
+ {0x1058E,0x105B5},
596
+ {0x1058F,0x105B6},
597
+ {0x10590,0x105B7},
598
+ {0x10591,0x105B8},
599
+ {0x10592,0x105B9},
600
+ {0x10594,0x105BB},
601
+ {0x10595,0x105BC},
567
602
  {0x10A0,0x2D00},
568
603
  {0x10A1,0x2D01},
569
604
  {0x10A2,0x2D02},
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1102
1137
  {0x2C2C,0x2C5C},
1103
1138
  {0x2C2D,0x2C5D},
1104
1139
  {0x2C2E,0x2C5E},
1140
+ {0x2C2F,0x2C5F},
1105
1141
  {0x2C60,0x2C61},
1106
1142
  {0x2C62,0x026B},
1107
1143
  {0x2C63,0x1D7D},
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
1282
1318
  {0xA7BA,0xA7BB},
1283
1319
  {0xA7BC,0xA7BD},
1284
1320
  {0xA7BE,0xA7BF},
1321
+ {0xA7C0,0xA7C1},
1285
1322
  {0xA7C2,0xA7C3},
1286
1323
  {0xA7C4,0xA794},
1287
1324
  {0xA7C5,0x0282},
1288
1325
  {0xA7C6,0x1D8E},
1326
+ {0xA7C7,0xA7C8},
1327
+ {0xA7C9,0xA7CA},
1328
+ {0xA7D0,0xA7D1},
1329
+ {0xA7D6,0xA7D7},
1330
+ {0xA7D8,0xA7D9},
1331
+ {0xA7F5,0xA7F6},
1289
1332
  {0xAB70,0x13A0},
1290
1333
  {0xAB71,0x13A1},
1291
1334
  {0xAB72,0x13A2},
@@ -0,0 +1,11 @@
1
+ // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
2
+ // -*-C-*-
3
+
4
+ typedef struct casefold_mapping {
5
+ unsigned long from;
6
+ unsigned long to;
7
+ } casefold_mapping;
8
+
9
+ #define CASEFOLD_COUNT 0
10
+
11
+ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -8,4 +8,4 @@ class CharacterSet
8
8
  end
9
9
  end
10
10
 
11
- ::Regexp.send(:include, CharacterSet::CoreExt::RegexpExt)
11
+ ::Regexp.instance_eval { include CharacterSet::CoreExt::RegexpExt }
@@ -6,11 +6,13 @@ class CharacterSet
6
6
  end
7
7
 
8
8
  {
9
+ count_by_character_set: :count_in,
9
10
  covered_by_character_set?: :cover?,
10
11
  delete_character_set: :delete_in,
11
12
  delete_character_set!: :delete_in!,
12
13
  keep_character_set: :keep_in,
13
14
  keep_character_set!: :keep_in!,
15
+ scan_by_character_set: :scan,
14
16
  uses_character_set?: :used_by?,
15
17
  }.each do |string_method, set_method|
16
18
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
@@ -27,4 +29,4 @@ class CharacterSet
27
29
  end
28
30
  end
29
31
 
30
- ::String.send(:include, CharacterSet::CoreExt::StringExt)
32
+ ::String.instance_eval { include CharacterSet::CoreExt::StringExt }
@@ -4,62 +4,57 @@ class CharacterSet
4
4
 
5
5
  Error = Class.new(ArgumentError)
6
6
 
7
- def convert(expression)
8
- CharacterSet.require_optional_dependency('regexp_parser')
7
+ def convert(expression, to = CharacterSet)
8
+ CharacterSet.require_optional_dependency('regexp_parser', __method__)
9
9
 
10
10
  case expression
11
11
  when Regexp::Expression::Root
12
12
  if expression.count != 1
13
13
  raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
14
14
  end
15
- convert(expression[0])
15
+ convert(expression[0], to)
16
16
 
17
17
  when Regexp::Expression::CharacterSet
18
- content = expression.map { |subexp| convert(subexp) }.reduce(:+)
18
+ content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
19
+ content ||= to[]
19
20
  expression.negative? ? content.inversion : content
20
21
 
21
22
  when Regexp::Expression::CharacterSet::Intersection
22
- expression.map { |subexp| convert(subexp) }.reduce(:&)
23
+ expression.map { |subexp| convert(subexp, to) }.reduce(:&)
23
24
 
24
25
  when Regexp::Expression::CharacterSet::IntersectedSequence
25
- expression.map { |subexp| convert(subexp) }.reduce(:+)
26
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
26
27
 
27
28
  when Regexp::Expression::CharacterSet::Range
28
- start, finish = expression.map { |subexp| convert(subexp) }
29
- CharacterSet.from_ranges((start.min)..(finish.max))
29
+ start, finish = expression.map { |subexp| convert(subexp, to) }
30
+ to.new((start.min)..(finish.max))
30
31
 
31
32
  when Regexp::Expression::CharacterType::Any
32
- CharacterSet.unicode
33
-
34
- when Regexp::Expression::CharacterType::Digit
35
- CharacterSet.from_ranges(48..57)
36
-
37
- when Regexp::Expression::CharacterType::NonDigit
38
- CharacterSet.from_ranges(48..57).inversion
39
-
40
- when Regexp::Expression::CharacterType::Hex
41
- CharacterSet.from_ranges(48..57, 65..70, 97..102)
42
-
43
- when Regexp::Expression::CharacterType::NonHex
44
- CharacterSet.from_ranges(48..57, 65..70, 97..102).inversion
45
-
46
- when Regexp::Expression::CharacterType::Space
47
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"]
48
-
49
- when Regexp::Expression::CharacterType::NonSpace
50
- CharacterSet["\t", "\n", "\v", "\f", "\r", "\x20"].inversion
51
-
52
- when Regexp::Expression::CharacterType::Word
53
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
54
-
55
- when Regexp::Expression::CharacterType::NonWord
56
- CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
33
+ to.unicode
34
+
35
+ when Regexp::Expression::CharacterType::Base
36
+ /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
37
+ content =
38
+ if expression.unicode_classes?
39
+ # in u-mode, type shortcuts match the same as \p{<long type name>}
40
+ to.of_property(base_name)
41
+ else
42
+ # in normal mode, types match only ascii chars
43
+ case base_name.to_sym
44
+ when :digit then to.from_ranges(48..57)
45
+ when :hex then to.from_ranges(48..57, 65..70, 97..102)
46
+ when :space then to.from_ranges(9..13, 32..32)
47
+ when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
48
+ else raise Error, "Unsupported CharacterType #{base_name}"
49
+ end
50
+ end
51
+ negative ? content.inversion : content
57
52
 
58
53
  when Regexp::Expression::EscapeSequence::CodepointList
59
- CharacterSet.new(expression.codepoints)
54
+ to.new(expression.codepoints)
60
55
 
61
56
  when Regexp::Expression::EscapeSequence::Base
62
- CharacterSet[expression.codepoint]
57
+ to[expression.codepoint]
63
58
 
64
59
  when Regexp::Expression::Group::Capture,
65
60
  Regexp::Expression::Group::Passive,
@@ -67,19 +62,19 @@ class CharacterSet
67
62
  Regexp::Expression::Group::Atomic,
68
63
  Regexp::Expression::Group::Options
69
64
  case expression.count
70
- when 0 then CharacterSet[]
71
- when 1 then convert(expression.first)
65
+ when 0 then to[]
66
+ when 1 then convert(expression.first, to)
72
67
  else
73
68
  raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
74
69
  end
75
70
 
76
- when Regexp::Expression::Alternation
77
- expression.map { |subexp| convert(subexp) }.reduce(:+)
71
+ when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
72
+ expression.map { |subexp| convert(subexp, to) }.reduce(:+)
78
73
 
79
74
  when Regexp::Expression::Alternative
80
75
  case expression.count
81
- when 0 then CharacterSet[]
82
- when 1 then convert(expression.first)
76
+ when 0 then to[]
77
+ when 1 then convert(expression.first, to)
83
78
  else
84
79
  raise Error, 'Alternatives must contain exactly one expression'
85
80
  end
@@ -88,11 +83,14 @@ class CharacterSet
88
83
  if expression.set_level == 0 && expression.text.size != 1
89
84
  raise Error, 'Literal runs outside of sets are codepoint *sequences*'
90
85
  end
91
- CharacterSet[expression.text.ord]
86
+ to[expression.text.ord]
92
87
 
93
88
  when Regexp::Expression::UnicodeProperty::Base,
94
89
  Regexp::Expression::PosixClass
95
- content = CharacterSet.of_property(expression.token)
90
+ content = to.of_property(expression.token)
91
+ if expression.type == :posixclass && expression.ascii_classes?
92
+ content = content.ascii_part
93
+ end
96
94
  expression.negative? ? content.inversion : content
97
95
 
98
96
  when Regexp::Expression::Base
@@ -5,7 +5,7 @@ class CharacterSet
5
5
  def codepoints_from_enumerable(object)
6
6
  raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
7
7
  # Use #each to check first element (only this works for all Enumerables)
8
- object.each do |e|
8
+ object.each do |e| # rubocop:disable Lint/UnreachableLoop
9
9
  return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
10
10
  return object.map(&:ord) if e.is_a?(String) && e.length == 1
11
11
  raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
@@ -0,0 +1 @@
1
+ 0,10FFFF
@@ -0,0 +1 @@
1
+ 0,7F
@@ -0,0 +1,3 @@
1
+ 30,39
2
+ 41,5A
3
+ 61,7A
@@ -0,0 +1,2 @@
1
+ 41,5A
2
+ 61,7A