character_set 1.2.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +17 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +54 -0
- data/README.md +51 -12
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -4
- data/ext/character_set/character_set.c +969 -415
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +41 -43
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +677 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +152 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +69 -50
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +114 -17
- data/.travis.yml +0 -8
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
|
|
6
6
|
unsigned long to;
|
7
7
|
} casefold_mapping;
|
8
8
|
|
9
|
-
#define CASEFOLD_COUNT
|
9
|
+
#define CASEFOLD_COUNT 1426
|
10
10
|
|
11
11
|
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
12
12
|
{0x0041,0x0061},
|
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
564
564
|
{0x104D1,0x104F9},
|
565
565
|
{0x104D2,0x104FA},
|
566
566
|
{0x104D3,0x104FB},
|
567
|
+
{0x10570,0x10597},
|
568
|
+
{0x10571,0x10598},
|
569
|
+
{0x10572,0x10599},
|
570
|
+
{0x10573,0x1059A},
|
571
|
+
{0x10574,0x1059B},
|
572
|
+
{0x10575,0x1059C},
|
573
|
+
{0x10576,0x1059D},
|
574
|
+
{0x10577,0x1059E},
|
575
|
+
{0x10578,0x1059F},
|
576
|
+
{0x10579,0x105A0},
|
577
|
+
{0x1057A,0x105A1},
|
578
|
+
{0x1057C,0x105A3},
|
579
|
+
{0x1057D,0x105A4},
|
580
|
+
{0x1057E,0x105A5},
|
581
|
+
{0x1057F,0x105A6},
|
582
|
+
{0x10580,0x105A7},
|
583
|
+
{0x10581,0x105A8},
|
584
|
+
{0x10582,0x105A9},
|
585
|
+
{0x10583,0x105AA},
|
586
|
+
{0x10584,0x105AB},
|
587
|
+
{0x10585,0x105AC},
|
588
|
+
{0x10586,0x105AD},
|
589
|
+
{0x10587,0x105AE},
|
590
|
+
{0x10588,0x105AF},
|
591
|
+
{0x10589,0x105B0},
|
592
|
+
{0x1058A,0x105B1},
|
593
|
+
{0x1058C,0x105B3},
|
594
|
+
{0x1058D,0x105B4},
|
595
|
+
{0x1058E,0x105B5},
|
596
|
+
{0x1058F,0x105B6},
|
597
|
+
{0x10590,0x105B7},
|
598
|
+
{0x10591,0x105B8},
|
599
|
+
{0x10592,0x105B9},
|
600
|
+
{0x10594,0x105BB},
|
601
|
+
{0x10595,0x105BC},
|
567
602
|
{0x10A0,0x2D00},
|
568
603
|
{0x10A1,0x2D01},
|
569
604
|
{0x10A2,0x2D02},
|
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1102
1137
|
{0x2C2C,0x2C5C},
|
1103
1138
|
{0x2C2D,0x2C5D},
|
1104
1139
|
{0x2C2E,0x2C5E},
|
1140
|
+
{0x2C2F,0x2C5F},
|
1105
1141
|
{0x2C60,0x2C61},
|
1106
1142
|
{0x2C62,0x026B},
|
1107
1143
|
{0x2C63,0x1D7D},
|
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1282
1318
|
{0xA7BA,0xA7BB},
|
1283
1319
|
{0xA7BC,0xA7BD},
|
1284
1320
|
{0xA7BE,0xA7BF},
|
1321
|
+
{0xA7C0,0xA7C1},
|
1285
1322
|
{0xA7C2,0xA7C3},
|
1286
1323
|
{0xA7C4,0xA794},
|
1287
1324
|
{0xA7C5,0x0282},
|
1288
1325
|
{0xA7C6,0x1D8E},
|
1326
|
+
{0xA7C7,0xA7C8},
|
1327
|
+
{0xA7C9,0xA7CA},
|
1328
|
+
{0xA7D0,0xA7D1},
|
1329
|
+
{0xA7D6,0xA7D7},
|
1330
|
+
{0xA7D8,0xA7D9},
|
1331
|
+
{0xA7F5,0xA7F6},
|
1289
1332
|
{0xAB70,0x13A0},
|
1290
1333
|
{0xAB71,0x13A1},
|
1291
1334
|
{0xAB72,0x13A2},
|
@@ -0,0 +1,11 @@
|
|
1
|
+
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
|
2
|
+
// -*-C-*-
|
3
|
+
|
4
|
+
typedef struct casefold_mapping {
|
5
|
+
unsigned long from;
|
6
|
+
unsigned long to;
|
7
|
+
} casefold_mapping;
|
8
|
+
|
9
|
+
#define CASEFOLD_COUNT 0
|
10
|
+
|
11
|
+
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
|
@@ -6,11 +6,13 @@ class CharacterSet
|
|
6
6
|
end
|
7
7
|
|
8
8
|
{
|
9
|
+
count_by_character_set: :count_in,
|
9
10
|
covered_by_character_set?: :cover?,
|
10
11
|
delete_character_set: :delete_in,
|
11
12
|
delete_character_set!: :delete_in!,
|
12
13
|
keep_character_set: :keep_in,
|
13
14
|
keep_character_set!: :keep_in!,
|
15
|
+
scan_by_character_set: :scan,
|
14
16
|
uses_character_set?: :used_by?,
|
15
17
|
}.each do |string_method, set_method|
|
16
18
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
@@ -27,4 +29,4 @@ class CharacterSet
|
|
27
29
|
end
|
28
30
|
end
|
29
31
|
|
30
|
-
::String.
|
32
|
+
::String.instance_eval { include CharacterSet::CoreExt::StringExt }
|
@@ -4,62 +4,57 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
Error = Class.new(ArgumentError)
|
6
6
|
|
7
|
-
def convert(expression)
|
8
|
-
CharacterSet.require_optional_dependency('regexp_parser')
|
7
|
+
def convert(expression, to = CharacterSet)
|
8
|
+
CharacterSet.require_optional_dependency('regexp_parser', __method__)
|
9
9
|
|
10
10
|
case expression
|
11
11
|
when Regexp::Expression::Root
|
12
12
|
if expression.count != 1
|
13
13
|
raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
|
14
14
|
end
|
15
|
-
convert(expression[0])
|
15
|
+
convert(expression[0], to)
|
16
16
|
|
17
17
|
when Regexp::Expression::CharacterSet
|
18
|
-
content = expression.map { |subexp| convert(subexp) }.reduce(:+)
|
18
|
+
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
19
|
+
content ||= to[]
|
19
20
|
expression.negative? ? content.inversion : content
|
20
21
|
|
21
22
|
when Regexp::Expression::CharacterSet::Intersection
|
22
|
-
expression.map { |subexp| convert(subexp) }.reduce(:&)
|
23
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
23
24
|
|
24
25
|
when Regexp::Expression::CharacterSet::IntersectedSequence
|
25
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
26
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
26
27
|
|
27
28
|
when Regexp::Expression::CharacterSet::Range
|
28
|
-
start, finish = expression.map { |subexp| convert(subexp) }
|
29
|
-
|
29
|
+
start, finish = expression.map { |subexp| convert(subexp, to) }
|
30
|
+
to.new((start.min)..(finish.max))
|
30
31
|
|
31
32
|
when Regexp::Expression::CharacterType::Any
|
32
|
-
|
33
|
-
|
34
|
-
when Regexp::Expression::CharacterType::
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
when Regexp::Expression::CharacterType::Word
|
53
|
-
CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
|
54
|
-
|
55
|
-
when Regexp::Expression::CharacterType::NonWord
|
56
|
-
CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
|
33
|
+
to.unicode
|
34
|
+
|
35
|
+
when Regexp::Expression::CharacterType::Base
|
36
|
+
/(?<negative>non)?(?<base_name>.+)/ =~ expression.token
|
37
|
+
content =
|
38
|
+
if expression.unicode_classes?
|
39
|
+
# in u-mode, type shortcuts match the same as \p{<long type name>}
|
40
|
+
to.of_property(base_name)
|
41
|
+
else
|
42
|
+
# in normal mode, types match only ascii chars
|
43
|
+
case base_name.to_sym
|
44
|
+
when :digit then to.from_ranges(48..57)
|
45
|
+
when :hex then to.from_ranges(48..57, 65..70, 97..102)
|
46
|
+
when :space then to.from_ranges(9..13, 32..32)
|
47
|
+
when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
|
48
|
+
else raise Error, "Unsupported CharacterType #{base_name}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
negative ? content.inversion : content
|
57
52
|
|
58
53
|
when Regexp::Expression::EscapeSequence::CodepointList
|
59
|
-
|
54
|
+
to.new(expression.codepoints)
|
60
55
|
|
61
56
|
when Regexp::Expression::EscapeSequence::Base
|
62
|
-
|
57
|
+
to[expression.codepoint]
|
63
58
|
|
64
59
|
when Regexp::Expression::Group::Capture,
|
65
60
|
Regexp::Expression::Group::Passive,
|
@@ -67,19 +62,19 @@ class CharacterSet
|
|
67
62
|
Regexp::Expression::Group::Atomic,
|
68
63
|
Regexp::Expression::Group::Options
|
69
64
|
case expression.count
|
70
|
-
when 0 then
|
71
|
-
when 1 then convert(expression.first)
|
65
|
+
when 0 then to[]
|
66
|
+
when 1 then convert(expression.first, to)
|
72
67
|
else
|
73
68
|
raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
|
74
69
|
end
|
75
70
|
|
76
|
-
when Regexp::Expression::Alternation
|
77
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
71
|
+
when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
|
72
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
78
73
|
|
79
74
|
when Regexp::Expression::Alternative
|
80
75
|
case expression.count
|
81
|
-
when 0 then
|
82
|
-
when 1 then convert(expression.first)
|
76
|
+
when 0 then to[]
|
77
|
+
when 1 then convert(expression.first, to)
|
83
78
|
else
|
84
79
|
raise Error, 'Alternatives must contain exactly one expression'
|
85
80
|
end
|
@@ -88,11 +83,14 @@ class CharacterSet
|
|
88
83
|
if expression.set_level == 0 && expression.text.size != 1
|
89
84
|
raise Error, 'Literal runs outside of sets are codepoint *sequences*'
|
90
85
|
end
|
91
|
-
|
86
|
+
to[expression.text.ord]
|
92
87
|
|
93
88
|
when Regexp::Expression::UnicodeProperty::Base,
|
94
89
|
Regexp::Expression::PosixClass
|
95
|
-
content =
|
90
|
+
content = to.of_property(expression.token)
|
91
|
+
if expression.type == :posixclass && expression.ascii_classes?
|
92
|
+
content = content.ascii_part
|
93
|
+
end
|
96
94
|
expression.negative? ? content.inversion : content
|
97
95
|
|
98
96
|
when Regexp::Expression::Base
|
data/lib/character_set/parser.rb
CHANGED
@@ -5,7 +5,7 @@ class CharacterSet
|
|
5
5
|
def codepoints_from_enumerable(object)
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
7
7
|
# Use #each to check first element (only this works for all Enumerables)
|
8
|
-
object.each do |e|
|
8
|
+
object.each do |e| # rubocop:disable Lint/UnreachableLoop
|
9
9
|
return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
|
10
10
|
return object.map(&:ord) if e.is_a?(String) && e.length == 1
|
11
11
|
raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
|
@@ -0,0 +1 @@
|
|
1
|
+
0,10FFFF
|
@@ -0,0 +1 @@
|
|
1
|
+
0,7F
|