regexp_parser 2.2.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +70 -6
- data/Gemfile +2 -1
- data/README.md +23 -9
- data/Rakefile +1 -56
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +9 -57
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +40 -23
- data/lib/regexp_parser/expression/sequence.rb +2 -2
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +81 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -7
- data/lib/regexp_parser/expression.rb +1 -0
- data/lib/regexp_parser/lexer.rb +1 -1
- data/lib/regexp_parser/parser.rb +12 -60
- data/lib/regexp_parser/scanner/properties/long.csv +18 -0
- data/lib/regexp_parser/scanner/properties/short.csv +4 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +42 -31
- data/lib/regexp_parser/scanner.rb +729 -797
- data/lib/regexp_parser/syntax/any.rb +2 -5
- data/lib/regexp_parser/syntax/base.rb +91 -64
- data/lib/regexp_parser/syntax/token/quantifier.rb +4 -4
- data/lib/regexp_parser/syntax/token/unicode_property.rb +26 -5
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- metadata +4 -2
@@ -0,0 +1,81 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
def self.included(mod)
|
4
|
+
mod.class_eval do
|
5
|
+
attr_accessor :type, :token, :text, :ts, :te,
|
6
|
+
:level, :set_level, :conditional_level,
|
7
|
+
:options, :quantifier
|
8
|
+
|
9
|
+
attr_reader :nesting_level
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def init_from_token_and_options(token, options = {})
|
14
|
+
self.type = token.type
|
15
|
+
self.token = token.token
|
16
|
+
self.text = token.text
|
17
|
+
self.ts = token.ts
|
18
|
+
self.te = token.te
|
19
|
+
self.level = token.level
|
20
|
+
self.set_level = token.set_level
|
21
|
+
self.conditional_level = token.conditional_level
|
22
|
+
self.nesting_level = 0
|
23
|
+
self.options = options || {}
|
24
|
+
end
|
25
|
+
private :init_from_token_and_options
|
26
|
+
|
27
|
+
def initialize_copy(orig)
|
28
|
+
self.text = orig.text.dup if orig.text
|
29
|
+
self.options = orig.options.dup if orig.options
|
30
|
+
self.quantifier = orig.quantifier.clone if orig.quantifier
|
31
|
+
super
|
32
|
+
end
|
33
|
+
|
34
|
+
def starts_at
|
35
|
+
ts
|
36
|
+
end
|
37
|
+
|
38
|
+
def base_length
|
39
|
+
to_s(:base).length
|
40
|
+
end
|
41
|
+
|
42
|
+
def full_length
|
43
|
+
to_s.length
|
44
|
+
end
|
45
|
+
|
46
|
+
def to_s(format = :full)
|
47
|
+
"#{parts.join}#{quantifier_affix(format)}"
|
48
|
+
end
|
49
|
+
alias :to_str :to_s
|
50
|
+
|
51
|
+
def parts
|
52
|
+
[text.dup]
|
53
|
+
end
|
54
|
+
|
55
|
+
def quantifier_affix(expression_format)
|
56
|
+
quantifier.to_s if quantified? && expression_format != :base
|
57
|
+
end
|
58
|
+
|
59
|
+
def quantified?
|
60
|
+
!quantifier.nil?
|
61
|
+
end
|
62
|
+
|
63
|
+
def offset
|
64
|
+
[starts_at, full_length]
|
65
|
+
end
|
66
|
+
|
67
|
+
def coded_offset
|
68
|
+
'@%d+%d' % offset
|
69
|
+
end
|
70
|
+
|
71
|
+
def terminal?
|
72
|
+
!respond_to?(:expressions)
|
73
|
+
end
|
74
|
+
|
75
|
+
def nesting_level=(lvl)
|
76
|
+
@nesting_level = lvl
|
77
|
+
quantifier && quantifier.nesting_level = lvl
|
78
|
+
terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -5,9 +5,8 @@ module Regexp::Expression
|
|
5
5
|
attr_accessor :expressions
|
6
6
|
|
7
7
|
def initialize(token, options = {})
|
8
|
-
super
|
9
|
-
|
10
8
|
self.expressions = []
|
9
|
+
super
|
11
10
|
end
|
12
11
|
|
13
12
|
# Override base method to clone the expressions as well.
|
@@ -43,16 +42,21 @@ module Regexp::Expression
|
|
43
42
|
ts + to_s.length
|
44
43
|
end
|
45
44
|
|
46
|
-
def
|
47
|
-
|
48
|
-
"#{expressions.join}#{quantifier_affix(format)}"
|
45
|
+
def parts
|
46
|
+
expressions
|
49
47
|
end
|
50
48
|
|
51
49
|
def to_h
|
52
|
-
attributes.merge(
|
50
|
+
attributes.merge(
|
53
51
|
text: to_s(:base),
|
54
52
|
expressions: expressions.map(&:to_h)
|
55
|
-
|
53
|
+
)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def intersperse(expressions, separator)
|
59
|
+
expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
|
56
60
|
end
|
57
61
|
end
|
58
62
|
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
data/lib/regexp_parser/parser.rb
CHANGED
@@ -39,6 +39,9 @@ class Regexp::Parser
|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -286,17 +289,9 @@ class Regexp::Parser
|
|
286
289
|
def nest(exp)
|
287
290
|
nesting.push(exp)
|
288
291
|
node << exp
|
289
|
-
update_transplanted_subtree(exp, node)
|
290
292
|
self.node = exp
|
291
293
|
end
|
292
294
|
|
293
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
-
def update_transplanted_subtree(exp, new_parent)
|
295
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
-
exp.respond_to?(:each) &&
|
297
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
-
end
|
299
|
-
|
300
295
|
def escape(token)
|
301
296
|
case token.token
|
302
297
|
|
@@ -483,7 +478,7 @@ class Regexp::Parser
|
|
483
478
|
new_token = Regexp::Token.new(
|
484
479
|
:group,
|
485
480
|
:passive,
|
486
|
-
'', # text
|
481
|
+
'', # text (none because this group is implicit)
|
487
482
|
target_node.ts,
|
488
483
|
nil, # te (unused)
|
489
484
|
target_node.level,
|
@@ -493,66 +488,23 @@ class Regexp::Parser
|
|
493
488
|
new_group = Group::Passive.new(new_token, active_opts)
|
494
489
|
new_group.implicit = true
|
495
490
|
new_group << target_node
|
496
|
-
|
491
|
+
increase_group_level(target_node)
|
497
492
|
node.expressions[node.expressions.index(target_node)] = new_group
|
498
493
|
target_node = new_group
|
499
494
|
end
|
500
495
|
|
501
|
-
|
502
|
-
|
503
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
504
|
-
when :zero_or_one_reluctant
|
505
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
|
506
|
-
when :zero_or_one_possessive
|
507
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
508
|
-
|
509
|
-
when :zero_or_more
|
510
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
511
|
-
when :zero_or_more_reluctant
|
512
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
513
|
-
when :zero_or_more_possessive
|
514
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
515
|
-
|
516
|
-
when :one_or_more
|
517
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
518
|
-
when :one_or_more_reluctant
|
519
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
520
|
-
when :one_or_more_possessive
|
521
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
522
|
-
|
523
|
-
when :interval
|
524
|
-
interval(target_node, token)
|
525
|
-
|
526
|
-
else
|
496
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
497
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
527
498
|
raise UnknownTokenError.new('Quantifier', token)
|
528
499
|
end
|
500
|
+
|
501
|
+
target_node.quantify(token, active_opts)
|
529
502
|
end
|
530
503
|
|
531
|
-
def
|
504
|
+
def increase_group_level(exp)
|
532
505
|
exp.level += 1
|
533
|
-
exp.
|
534
|
-
|
535
|
-
|
536
|
-
def interval(target_node, token)
|
537
|
-
text = token.text
|
538
|
-
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
539
|
-
case mchr
|
540
|
-
when '?'
|
541
|
-
range_text = text[0...-1]
|
542
|
-
mode = :reluctant
|
543
|
-
when '+'
|
544
|
-
range_text = text[0...-1]
|
545
|
-
mode = :possessive
|
546
|
-
else
|
547
|
-
range_text = text
|
548
|
-
mode = :greedy
|
549
|
-
end
|
550
|
-
|
551
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
552
|
-
min = range[0].empty? ? 0 : range[0]
|
553
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
554
|
-
|
555
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
506
|
+
exp.quantifier.level += 1 if exp.quantifier
|
507
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
556
508
|
end
|
557
509
|
|
558
510
|
def set(token)
|
@@ -6,6 +6,7 @@ age=11.0,age=11.0
|
|
6
6
|
age=12.0,age=12.0
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
|
+
age=14.0,age=14.0
|
9
10
|
age=2.0,age=2.0
|
10
11
|
age=2.1,age=2.1
|
11
12
|
age=3.0,age=3.0
|
@@ -72,6 +73,7 @@ coptic,coptic
|
|
72
73
|
cuneiform,cuneiform
|
73
74
|
currencysymbol,currency_symbol
|
74
75
|
cypriot,cypriot
|
76
|
+
cyprominoan,cypro_minoan
|
75
77
|
cyrillic,cyrillic
|
76
78
|
dash,dash
|
77
79
|
dashpunctuation,dash_punctuation
|
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
|
|
136
138
|
inancientsymbols,in_ancient_symbols
|
137
139
|
inarabic,in_arabic
|
138
140
|
inarabicextendeda,in_arabic_extended_a
|
141
|
+
inarabicextendedb,in_arabic_extended_b
|
139
142
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
140
143
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
141
144
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
|
|
197
200
|
incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
|
198
201
|
incurrencysymbols,in_currency_symbols
|
199
202
|
incypriotsyllabary,in_cypriot_syllabary
|
203
|
+
incyprominoan,in_cypro_minoan
|
200
204
|
incyrillic,in_cyrillic
|
201
205
|
incyrillicextendeda,in_cyrillic_extended_a
|
202
206
|
incyrillicextendedb,in_cyrillic_extended_b
|
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
|
|
223
227
|
inethiopic,in_ethiopic
|
224
228
|
inethiopicextended,in_ethiopic_extended
|
225
229
|
inethiopicextendeda,in_ethiopic_extended_a
|
230
|
+
inethiopicextendedb,in_ethiopic_extended_b
|
226
231
|
inethiopicsupplement,in_ethiopic_supplement
|
227
232
|
ingeneralpunctuation,in_general_punctuation
|
228
233
|
ingeometricshapes,in_geometric_shapes
|
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
|
|
264
269
|
injavanese,in_javanese
|
265
270
|
inkaithi,in_kaithi
|
266
271
|
inkanaextendeda,in_kana_extended_a
|
272
|
+
inkanaextendedb,in_kana_extended_b
|
267
273
|
inkanasupplement,in_kana_supplement
|
268
274
|
inkanbun,in_kanbun
|
269
275
|
inkangxiradicals,in_kangxi_radicals
|
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
|
|
285
291
|
inlatinextendedc,in_latin_extended_c
|
286
292
|
inlatinextendedd,in_latin_extended_d
|
287
293
|
inlatinextendede,in_latin_extended_e
|
294
|
+
inlatinextendedf,in_latin_extended_f
|
295
|
+
inlatinextendedg,in_latin_extended_g
|
288
296
|
inlepcha,in_lepcha
|
289
297
|
inletterlikesymbols,in_letterlike_symbols
|
290
298
|
inlimbu,in_limbu
|
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
|
|
349
357
|
inoldsogdian,in_old_sogdian
|
350
358
|
inoldsoutharabian,in_old_south_arabian
|
351
359
|
inoldturkic,in_old_turkic
|
360
|
+
inolduyghur,in_old_uyghur
|
352
361
|
inopticalcharacterrecognition,in_optical_character_recognition
|
353
362
|
inoriya,in_oriya
|
354
363
|
inornamentaldingbats,in_ornamental_dingbats
|
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
|
|
413
422
|
intakri,in_takri
|
414
423
|
intamil,in_tamil
|
415
424
|
intamilsupplement,in_tamil_supplement
|
425
|
+
intangsa,in_tangsa
|
416
426
|
intangut,in_tangut
|
417
427
|
intangutcomponents,in_tangut_components
|
418
428
|
intangutsupplement,in_tangut_supplement
|
@@ -422,15 +432,18 @@ inthai,in_thai
|
|
422
432
|
intibetan,in_tibetan
|
423
433
|
intifinagh,in_tifinagh
|
424
434
|
intirhuta,in_tirhuta
|
435
|
+
intoto,in_toto
|
425
436
|
intransportandmapsymbols,in_transport_and_map_symbols
|
426
437
|
inugaritic,in_ugaritic
|
427
438
|
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
|
428
439
|
inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
|
440
|
+
inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
|
429
441
|
invai,in_vai
|
430
442
|
invariationselectors,in_variation_selectors
|
431
443
|
invariationselectorssupplement,in_variation_selectors_supplement
|
432
444
|
invedicextensions,in_vedic_extensions
|
433
445
|
inverticalforms,in_vertical_forms
|
446
|
+
invithkuqi,in_vithkuqi
|
434
447
|
inwancho,in_wancho
|
435
448
|
inwarangciti,in_warang_citi
|
436
449
|
inyezidi,in_yezidi
|
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
|
|
438
451
|
inyiradicals,in_yi_radicals
|
439
452
|
inyisyllables,in_yi_syllables
|
440
453
|
inzanabazarsquare,in_zanabazar_square
|
454
|
+
inznamennymusicalnotation,in_znamenny_musical_notation
|
441
455
|
javanese,javanese
|
442
456
|
joincontrol,join_control
|
443
457
|
kaithi,kaithi
|
@@ -509,6 +523,7 @@ oldpersian,old_persian
|
|
509
523
|
oldsogdian,old_sogdian
|
510
524
|
oldsoutharabian,old_south_arabian
|
511
525
|
oldturkic,old_turkic
|
526
|
+
olduyghur,old_uyghur
|
512
527
|
openpunctuation,open_punctuation
|
513
528
|
oriya,oriya
|
514
529
|
osage,osage
|
@@ -573,6 +588,7 @@ taitham,tai_tham
|
|
573
588
|
taiviet,tai_viet
|
574
589
|
takri,takri
|
575
590
|
tamil,tamil
|
591
|
+
tangsa,tangsa
|
576
592
|
tangut,tangut
|
577
593
|
telugu,telugu
|
578
594
|
terminalpunctuation,terminal_punctuation
|
@@ -582,6 +598,7 @@ tibetan,tibetan
|
|
582
598
|
tifinagh,tifinagh
|
583
599
|
tirhuta,tirhuta
|
584
600
|
titlecaseletter,titlecase_letter
|
601
|
+
toto,toto
|
585
602
|
ugaritic,ugaritic
|
586
603
|
unassigned,unassigned
|
587
604
|
unifiedideograph,unified_ideograph
|
@@ -591,6 +608,7 @@ uppercase,uppercase
|
|
591
608
|
uppercaseletter,uppercase_letter
|
592
609
|
vai,vai
|
593
610
|
variationselector,variation_selector
|
611
|
+
vithkuqi,vithkuqi
|
594
612
|
wancho,wancho
|
595
613
|
warangciti,warang_citi
|
596
614
|
whitespace,white_space
|
@@ -31,6 +31,7 @@ cn,unassigned
|
|
31
31
|
co,private_use
|
32
32
|
combiningmark,mark
|
33
33
|
copt,coptic
|
34
|
+
cpmn,cypro_minoan
|
34
35
|
cprt,cypriot
|
35
36
|
cs,surrogate
|
36
37
|
cwcf,changes_when_casefolded
|
@@ -154,6 +155,7 @@ orkh,old_turkic
|
|
154
155
|
orya,oriya
|
155
156
|
osge,osage
|
156
157
|
osma,osmanya
|
158
|
+
ougr,old_uyghur
|
157
159
|
oupper,other_uppercase
|
158
160
|
p,punctuation
|
159
161
|
palm,palmyrene
|
@@ -219,9 +221,11 @@ tglg,tagalog
|
|
219
221
|
thaa,thaana
|
220
222
|
tibt,tibetan
|
221
223
|
tirh,tirhuta
|
224
|
+
tnsa,tangsa
|
222
225
|
ugar,ugaritic
|
223
226
|
uideo,unified_ideograph
|
224
227
|
vaii,vai
|
228
|
+
vith,vithkuqi
|
225
229
|
vs,variation_selector
|
226
230
|
wara,warang_citi
|
227
231
|
wcho,wancho
|
@@ -20,7 +20,7 @@
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
validation_error(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
@@ -28,13 +28,7 @@
|
|
28
28
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
30
30
|
|
31
|
-
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
33
|
-
'lower' | 'print' | 'punct' |
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
35
|
-
'word' | 'ascii';
|
36
|
-
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
|
40
34
|
# these are not supported in ruby at the moment
|
@@ -74,8 +68,7 @@
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
78
|
-
quantifier_mode?;
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
79
72
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
81
74
|
quantifier_possessive | quantifier_interval;
|
@@ -223,24 +216,28 @@
|
|
223
216
|
fcall character_set;
|
224
217
|
};
|
225
218
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
227
220
|
text = copy(data, ts, te)
|
228
221
|
|
229
222
|
type = :posixclass
|
230
223
|
class_name = text[2..-3]
|
231
|
-
if class_name[0]
|
224
|
+
if class_name[0] == '^'
|
232
225
|
class_name = class_name[1..-1]
|
233
226
|
type = :nonposixclass
|
234
227
|
end
|
235
228
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
230
|
+
validation_error(:posix_class, text)
|
231
|
+
end
|
232
|
+
|
236
233
|
emit(type, class_name.to_sym, text)
|
237
234
|
};
|
238
235
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
242
239
|
# };
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
245
242
|
# };
|
246
243
|
|
@@ -323,7 +320,7 @@
|
|
323
320
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
325
322
|
text = copy(data, ts-1, te)
|
326
|
-
if text[2]
|
323
|
+
if text[2] == '{'
|
327
324
|
emit(:escape, :codepoint_list, text)
|
328
325
|
else
|
329
326
|
emit(:escape, :codepoint, text)
|
@@ -419,12 +416,12 @@
|
|
419
416
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
421
418
|
case text = copy(data, ts, te)
|
422
|
-
when '
|
423
|
-
when '
|
424
|
-
when '
|
425
|
-
when '
|
426
|
-
when '
|
427
|
-
when '
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
428
425
|
end
|
429
426
|
};
|
430
427
|
|
@@ -477,7 +474,7 @@
|
|
477
474
|
group_open . group_options >group_opened {
|
478
475
|
text = copy(data, ts, te)
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
480
|
-
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
481
478
|
end
|
482
479
|
emit_options(text)
|
483
480
|
};
|
@@ -605,7 +602,7 @@
|
|
605
602
|
end
|
606
603
|
};
|
607
604
|
|
608
|
-
quantifier_interval
|
605
|
+
quantifier_interval {
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
610
607
|
};
|
611
608
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
686
683
|
end
|
687
684
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
689
687
|
class InvalidGroupOption < ValidationError
|
690
688
|
def initialize(option, text)
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
706
704
|
end
|
707
705
|
end
|
708
706
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
708
|
+
class UnknownPosixClassError < ValidationError
|
709
|
+
def initialize(text)
|
710
|
+
super "Unknown POSIX class #{text}"
|
711
|
+
end
|
712
|
+
end
|
713
|
+
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
711
716
|
# given, it gets called for each emitted token.
|
@@ -771,6 +776,11 @@ class Regexp::Scanner
|
|
771
776
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
772
777
|
end
|
773
778
|
|
779
|
+
def self.posix_classes
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
781
|
+
lower print punct space upper word xdigit]
|
782
|
+
end
|
783
|
+
|
774
784
|
# Emits an array with the details of the scanned pattern
|
775
785
|
def emit(type, token, text)
|
776
786
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -873,15 +883,16 @@ class Regexp::Scanner
|
|
873
883
|
|
874
884
|
# Centralizes and unifies the handling of validation related
|
875
885
|
# errors.
|
876
|
-
def validation_error(type, what, reason)
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
886
|
+
def validation_error(type, what, reason = nil)
|
887
|
+
error =
|
888
|
+
case type
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
+
end
|
885
896
|
|
886
897
|
raise error # unless @@config.validation_ignore
|
887
898
|
end
|