regexp_parser 2.1.1 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +15 -21
- data/Rakefile +5 -11
- data/lib/regexp_parser/expression/base.rb +123 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +0 -0
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +1 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +0 -2
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/property.rb +0 -2
- data/lib/regexp_parser/expression/classes/root.rb +0 -1
- data/lib/regexp_parser/expression/classes/type.rb +0 -2
- data/lib/regexp_parser/expression/quantifier.rb +1 -1
- data/lib/regexp_parser/expression/sequence.rb +0 -1
- data/lib/regexp_parser/expression/subexpression.rb +0 -1
- data/lib/regexp_parser/expression.rb +6 -130
- data/lib/regexp_parser/lexer.rb +7 -5
- data/lib/regexp_parser/scanner/properties/long.yml +13 -0
- data/lib/regexp_parser/scanner/properties/short.yml +9 -1
- data/lib/regexp_parser/syntax/any.rb +1 -3
- data/lib/regexp_parser/syntax/base.rb +9 -9
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens/backref.rb → token/backreference.rb} +6 -5
- data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +31 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +1 -1
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/spec/lexer/nesting_spec.rb +2 -2
- data/spec/parser/escapes_spec.rb +43 -31
- data/spec/parser/properties_spec.rb +6 -4
- data/spec/parser/set/ranges_spec.rb +26 -16
- data/spec/scanner/escapes_spec.rb +28 -19
- data/spec/scanner/sets_spec.rb +9 -9
- data/spec/spec_helper.rb +13 -1
- data/spec/support/capturing_stderr.rb +9 -0
- data/spec/syntax/versions/1.8.6_spec.rb +2 -2
- data/spec/syntax/versions/2.0.0_spec.rb +2 -2
- data/spec/syntax/versions/aliases_spec.rb +1 -0
- metadata +26 -26
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/support/runner.rb +0 -42
- data/spec/support/warning_extractor.rb +0 -60
@@ -28,6 +28,7 @@ cari: carian
|
|
28
28
|
cc: control
|
29
29
|
cf: format
|
30
30
|
cher: cherokee
|
31
|
+
chrs: chorasmian
|
31
32
|
ci: case_ignorable
|
32
33
|
cn: unassigned
|
33
34
|
co: private_use
|
@@ -45,12 +46,17 @@ dep: deprecated
|
|
45
46
|
deva: devanagari
|
46
47
|
di: default_ignorable_code_point
|
47
48
|
dia: diacritic
|
49
|
+
diak: dives_akuru
|
48
50
|
dogr: dogra
|
49
51
|
dsrt: deseret
|
50
52
|
dupl: duployan
|
53
|
+
ebase: emoji_modifier_base
|
54
|
+
ecomp: emoji_component
|
51
55
|
egyp: egyptian_hieroglyphs
|
52
56
|
elba: elbasan
|
53
57
|
elym: elymaic
|
58
|
+
emod: emoji_modifier
|
59
|
+
epres: emoji_presentation
|
54
60
|
ethi: ethiopic
|
55
61
|
ext: extender
|
56
62
|
geor: georgian
|
@@ -89,6 +95,7 @@ kana: katakana
|
|
89
95
|
khar: kharoshthi
|
90
96
|
khmr: khmer
|
91
97
|
khoj: khojki
|
98
|
+
kits: khitan_small_script
|
92
99
|
knda: kannada
|
93
100
|
kthi: kaithi
|
94
101
|
l: letter
|
@@ -127,7 +134,7 @@ mroo: mro
|
|
127
134
|
mtei: meetei_mayek
|
128
135
|
mult: multani
|
129
136
|
mymr: myanmar
|
130
|
-
n: number
|
137
|
+
"n": number
|
131
138
|
nand: nandinagari
|
132
139
|
narb: old_north_arabian
|
133
140
|
nbat: nabataean
|
@@ -226,6 +233,7 @@ xidc: xid_continue
|
|
226
233
|
xids: xid_start
|
227
234
|
xpeo: old_persian
|
228
235
|
xsux: cuneiform
|
236
|
+
yezi: yezidi
|
229
237
|
yiii: yi
|
230
238
|
z: separator
|
231
239
|
zanb: zanabazar_square
|
@@ -1,15 +1,13 @@
|
|
1
1
|
module Regexp::Syntax
|
2
|
-
|
3
2
|
# A syntax that always returns true, passing all tokens as implemented. This
|
4
3
|
# is useful during development, testing, and should be useful for some types
|
5
4
|
# of transformations as well.
|
6
5
|
class Any < Base
|
7
6
|
def initialize # rubocop:disable Lint/MissingSuper
|
8
|
-
@implements = { :* => [
|
7
|
+
@implements = { :* => %i[*] }
|
9
8
|
end
|
10
9
|
|
11
10
|
def implements?(_type, _token) true end
|
12
11
|
def implements!(_type, _token) true end
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
@@ -59,7 +59,7 @@ module Regexp::Syntax
|
|
59
59
|
def normalize_group(type, token)
|
60
60
|
case token
|
61
61
|
when :named_ab, :named_sq
|
62
|
-
[
|
62
|
+
%i[group named]
|
63
63
|
else
|
64
64
|
[type, token]
|
65
65
|
end
|
@@ -68,21 +68,21 @@ module Regexp::Syntax
|
|
68
68
|
def normalize_backref(type, token)
|
69
69
|
case token
|
70
70
|
when :name_ref_ab, :name_ref_sq
|
71
|
-
[
|
71
|
+
%i[backref name_ref]
|
72
72
|
when :name_call_ab, :name_call_sq
|
73
|
-
[
|
73
|
+
%i[backref name_call]
|
74
74
|
when :name_recursion_ref_ab, :name_recursion_ref_sq
|
75
|
-
[
|
75
|
+
%i[backref name_recursion_ref]
|
76
76
|
when :number_ref_ab, :number_ref_sq
|
77
|
-
[
|
77
|
+
%i[backref number_ref]
|
78
78
|
when :number_call_ab, :number_call_sq
|
79
|
-
[
|
79
|
+
%i[backref number_call]
|
80
80
|
when :number_rel_ref_ab, :number_rel_ref_sq
|
81
|
-
[
|
81
|
+
%i[backref number_rel_ref]
|
82
82
|
when :number_rel_call_ab, :number_rel_call_sq
|
83
|
-
[
|
83
|
+
%i[backref number_rel_call]
|
84
84
|
when :number_recursion_ref_ab, :number_recursion_ref_sq
|
85
|
-
[
|
85
|
+
%i[backref number_recursion_ref]
|
86
86
|
else
|
87
87
|
[type, token]
|
88
88
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Regexp::Syntax
|
2
|
+
module Token
|
3
|
+
module Anchor
|
4
|
+
Basic = %i[bol eol]
|
5
|
+
Extended = Basic + %i[word_boundary nonword_boundary]
|
6
|
+
String = %i[bos eos eos_ob_eol]
|
7
|
+
MatchStart = %i[match_start]
|
8
|
+
|
9
|
+
All = Extended + String + MatchStart
|
10
|
+
Type = :anchor
|
11
|
+
end
|
12
|
+
|
13
|
+
Map[Anchor::Type] = Anchor::All
|
14
|
+
end
|
15
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module Regexp::Syntax
|
2
2
|
module Token
|
3
3
|
module Assertion
|
4
|
-
Lookahead = [
|
5
|
-
Lookbehind = [
|
4
|
+
Lookahead = %i[lookahead nlookahead]
|
5
|
+
Lookbehind = %i[lookbehind nlookbehind]
|
6
6
|
|
7
7
|
All = Lookahead + Lookbehind
|
8
8
|
Type = :assertion
|
@@ -1,10 +1,11 @@
|
|
1
1
|
module Regexp::Syntax
|
2
2
|
module Token
|
3
3
|
module Backreference
|
4
|
-
|
5
|
-
Number = [
|
4
|
+
Plain = %i[number]
|
5
|
+
Number = Plain + %i[number_ref number_rel_ref]
|
6
|
+
Name = %i[name_ref]
|
6
7
|
|
7
|
-
RecursionLevel = [
|
8
|
+
RecursionLevel = %i[name_recursion_ref number_recursion_ref]
|
8
9
|
|
9
10
|
All = Name + Number + RecursionLevel
|
10
11
|
Type = :backref
|
@@ -12,8 +13,8 @@ module Regexp::Syntax
|
|
12
13
|
|
13
14
|
# Type is the same as Backreference so keeping it here, for now.
|
14
15
|
module SubexpressionCall
|
15
|
-
Name = [
|
16
|
-
Number = [
|
16
|
+
Name = %i[name_call]
|
17
|
+
Number = %i[number_call number_rel_call]
|
17
18
|
|
18
19
|
All = Name + Number
|
19
20
|
end
|
@@ -2,10 +2,10 @@ module Regexp::Syntax
|
|
2
2
|
module Token
|
3
3
|
module CharacterType
|
4
4
|
Basic = []
|
5
|
-
Extended = [
|
6
|
-
Hex = [
|
5
|
+
Extended = %i[digit nondigit space nonspace word nonword]
|
6
|
+
Hex = %i[hex nonhex]
|
7
7
|
|
8
|
-
Clustered = [
|
8
|
+
Clustered = %i[linebreak xgrapheme]
|
9
9
|
|
10
10
|
All = Basic + Extended + Hex + Clustered
|
11
11
|
Type = :type
|
@@ -1,10 +1,10 @@
|
|
1
1
|
module Regexp::Syntax
|
2
2
|
module Token
|
3
3
|
module Conditional
|
4
|
-
Delimiters = [
|
4
|
+
Delimiters = %i[open close]
|
5
5
|
|
6
|
-
Condition = [
|
7
|
-
Separator = [
|
6
|
+
Condition = %i[condition_open condition condition_close]
|
7
|
+
Separator = %i[separator]
|
8
8
|
|
9
9
|
All = Conditional::Delimiters + Conditional::Condition + Conditional::Separator
|
10
10
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Regexp::Syntax
|
2
|
+
module Token
|
3
|
+
# TODO: unify naming with RE::EscapeSequence, one way or the other, in v3.0.0
|
4
|
+
module Escape
|
5
|
+
Basic = %i[backslash literal]
|
6
|
+
|
7
|
+
Control = %i[control meta_sequence]
|
8
|
+
|
9
|
+
ASCII = %i[bell backspace escape form_feed newline carriage
|
10
|
+
tab vertical_tab]
|
11
|
+
|
12
|
+
Unicode = %i[codepoint codepoint_list]
|
13
|
+
|
14
|
+
Meta = %i[dot alternation
|
15
|
+
zero_or_one zero_or_more one_or_more
|
16
|
+
bol eol
|
17
|
+
group_open group_close
|
18
|
+
interval_open interval_close
|
19
|
+
set_open set_close]
|
20
|
+
|
21
|
+
Hex = %i[hex]
|
22
|
+
|
23
|
+
Octal = %i[octal]
|
24
|
+
|
25
|
+
All = Basic + Control + ASCII + Unicode + Meta + Hex + Octal
|
26
|
+
Type = :escape
|
27
|
+
end
|
28
|
+
|
29
|
+
Map[Escape::Type] = Escape::All
|
30
|
+
end
|
31
|
+
end
|
@@ -1,18 +1,18 @@
|
|
1
1
|
module Regexp::Syntax
|
2
2
|
module Token
|
3
3
|
module Group
|
4
|
-
Basic = [
|
5
|
-
Extended = Basic + [
|
4
|
+
Basic = %i[capture close]
|
5
|
+
Extended = Basic + %i[options options_switch]
|
6
6
|
|
7
|
-
Named = [
|
8
|
-
Atomic = [
|
9
|
-
Passive = [
|
10
|
-
Comment = [
|
7
|
+
Named = %i[named]
|
8
|
+
Atomic = %i[atomic]
|
9
|
+
Passive = %i[passive]
|
10
|
+
Comment = %i[comment]
|
11
11
|
|
12
12
|
V1_8_6 = Group::Extended + Group::Named + Group::Atomic +
|
13
13
|
Group::Passive + Group::Comment
|
14
14
|
|
15
|
-
V2_4_1 = [
|
15
|
+
V2_4_1 = %i[absence]
|
16
16
|
|
17
17
|
All = V1_8_6 + V2_4_1
|
18
18
|
Type = :group
|
@@ -1,10 +1,10 @@
|
|
1
1
|
module Regexp::Syntax
|
2
2
|
module Token
|
3
3
|
module PosixClass
|
4
|
-
Standard = [
|
5
|
-
|
4
|
+
Standard = %i[alnum alpha blank cntrl digit graph
|
5
|
+
lower print punct space upper xdigit]
|
6
6
|
|
7
|
-
Extensions = [
|
7
|
+
Extensions = %i[ascii word]
|
8
8
|
|
9
9
|
All = Standard + Extensions
|
10
10
|
Type = :posixclass
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Regexp::Syntax
|
2
|
+
module Token
|
3
|
+
module Quantifier
|
4
|
+
Greedy = %i[
|
5
|
+
zero_or_one
|
6
|
+
zero_or_more
|
7
|
+
one_or_more
|
8
|
+
]
|
9
|
+
|
10
|
+
Reluctant = %i[
|
11
|
+
zero_or_one_reluctant
|
12
|
+
zero_or_more_reluctant
|
13
|
+
one_or_more_reluctant
|
14
|
+
]
|
15
|
+
|
16
|
+
Possessive = %i[
|
17
|
+
zero_or_one_possessive
|
18
|
+
zero_or_more_possessive
|
19
|
+
one_or_more_possessive
|
20
|
+
]
|
21
|
+
|
22
|
+
Interval = %i[interval]
|
23
|
+
IntervalReluctant = %i[interval_reluctant]
|
24
|
+
IntervalPossessive = %i[interval_possessive]
|
25
|
+
|
26
|
+
IntervalAll = Interval + IntervalReluctant +
|
27
|
+
IntervalPossessive
|
28
|
+
|
29
|
+
All = Greedy + Reluctant + Possessive + IntervalAll
|
30
|
+
Type = :quantifier
|
31
|
+
end
|
32
|
+
|
33
|
+
Map[Quantifier::Type] = Quantifier::All
|
34
|
+
end
|
35
|
+
end
|