regexp_parser 2.8.1 → 2.11.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -4
- data/LICENSE +1 -1
- data/Rakefile +5 -3
- data/lib/regexp_parser/error.rb +2 -0
- data/lib/regexp_parser/expression/base.rb +2 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +2 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +2 -0
- data/lib/regexp_parser/expression/classes/backreference.rb +3 -20
- data/lib/regexp_parser/expression/classes/character_set/intersection.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
- data/lib/regexp_parser/expression/classes/character_type.rb +2 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +26 -95
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -0
- data/lib/regexp_parser/expression/classes/group.rb +2 -0
- data/lib/regexp_parser/expression/classes/keep.rb +3 -1
- data/lib/regexp_parser/expression/classes/literal.rb +2 -0
- data/lib/regexp_parser/expression/classes/posix_class.rb +2 -4
- data/lib/regexp_parser/expression/classes/root.rb +2 -0
- data/lib/regexp_parser/expression/classes/unicode_property.rb +8 -9
- data/lib/regexp_parser/expression/methods/construct.rb +2 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +7 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +76 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +2 -0
- data/lib/regexp_parser/expression/methods/match.rb +2 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -0
- data/lib/regexp_parser/expression/methods/negative.rb +22 -0
- data/lib/regexp_parser/expression/methods/options.rb +2 -0
- data/lib/regexp_parser/expression/methods/parts.rb +2 -0
- data/lib/regexp_parser/expression/methods/printing.rb +2 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +30 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +2 -0
- data/lib/regexp_parser/expression/methods/tests.rb +2 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -0
- data/lib/regexp_parser/expression/quantifier.rb +3 -1
- data/lib/regexp_parser/expression/sequence.rb +2 -0
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -0
- data/lib/regexp_parser/expression/shared.rb +6 -3
- data/lib/regexp_parser/expression/subexpression.rb +2 -0
- data/lib/regexp_parser/expression.rb +39 -33
- data/lib/regexp_parser/lexer.rb +2 -0
- data/lib/regexp_parser/parser.rb +16 -9
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +2 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +3 -1
- data/lib/regexp_parser/scanner/errors/validation_error.rb +2 -0
- data/lib/regexp_parser/scanner/properties/long.csv +37 -0
- data/lib/regexp_parser/scanner/properties/short.csv +9 -0
- data/lib/regexp_parser/scanner/scanner.rl +62 -18
- data/lib/regexp_parser/scanner.rb +1041 -936
- data/lib/regexp_parser/syntax/any.rb +2 -0
- data/lib/regexp_parser/syntax/base.rb +2 -0
- data/lib/regexp_parser/syntax/token/anchor.rb +5 -3
- data/lib/regexp_parser/syntax/token/assertion.rb +4 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +8 -6
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -1
- data/lib/regexp_parser/syntax/token/character_type.rb +6 -4
- data/lib/regexp_parser/syntax/token/conditional.rb +5 -3
- data/lib/regexp_parser/syntax/token/escape.rb +9 -7
- data/lib/regexp_parser/syntax/token/group.rb +8 -6
- data/lib/regexp_parser/syntax/token/keep.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +4 -2
- data/lib/regexp_parser/syntax/token/posix_class.rb +4 -2
- data/lib/regexp_parser/syntax/token/quantifier.rb +8 -6
- data/lib/regexp_parser/syntax/token/unicode_property.rb +79 -46
- data/lib/regexp_parser/syntax/token/virtual.rb +5 -3
- data/lib/regexp_parser/syntax/token.rb +18 -16
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -2
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -0
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +2 -0
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +3 -1
- data/lib/regexp_parser/token.rb +2 -0
- data/lib/regexp_parser/version.rb +3 -1
- data/lib/regexp_parser.rb +8 -6
- data/regexp_parser.gemspec +7 -5
- metadata +12 -11
- data/CHANGELOG.md +0 -691
- data/README.md +0 -506
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module Regexp::Expression
|
|
2
4
|
module Shared
|
|
3
5
|
module ClassMethods; end # filled in ./methods/*.rb
|
|
@@ -70,11 +72,12 @@ module Regexp::Expression
|
|
|
70
72
|
# lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
|
|
71
73
|
#
|
|
72
74
|
def to_s(format = :full)
|
|
73
|
-
base =
|
|
75
|
+
base = ''.dup
|
|
76
|
+
parts.each do |part|
|
|
74
77
|
if part.instance_of?(String)
|
|
75
|
-
|
|
78
|
+
base << part
|
|
76
79
|
elsif !part.custom_to_s_handling
|
|
77
|
-
|
|
80
|
+
base << part.to_s(:original)
|
|
78
81
|
end
|
|
79
82
|
end
|
|
80
83
|
"#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
|
|
@@ -1,36 +1,42 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
require 'regexp_parser/expression/base'
|
|
5
|
-
require 'regexp_parser/expression/quantifier'
|
|
6
|
-
require 'regexp_parser/expression/subexpression'
|
|
7
|
-
require 'regexp_parser/expression/sequence'
|
|
8
|
-
require 'regexp_parser/expression/sequence_operation'
|
|
3
|
+
require_relative 'error'
|
|
9
4
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
require 'regexp_parser/expression/classes/character_type'
|
|
17
|
-
require 'regexp_parser/expression/classes/conditional'
|
|
18
|
-
require 'regexp_parser/expression/classes/escape_sequence'
|
|
19
|
-
require 'regexp_parser/expression/classes/free_space'
|
|
20
|
-
require 'regexp_parser/expression/classes/group'
|
|
21
|
-
require 'regexp_parser/expression/classes/keep'
|
|
22
|
-
require 'regexp_parser/expression/classes/literal'
|
|
23
|
-
require 'regexp_parser/expression/classes/posix_class'
|
|
24
|
-
require 'regexp_parser/expression/classes/root'
|
|
25
|
-
require 'regexp_parser/expression/classes/unicode_property'
|
|
5
|
+
require_relative 'expression/shared'
|
|
6
|
+
require_relative 'expression/base'
|
|
7
|
+
require_relative 'expression/quantifier'
|
|
8
|
+
require_relative 'expression/subexpression'
|
|
9
|
+
require_relative 'expression/sequence'
|
|
10
|
+
require_relative 'expression/sequence_operation'
|
|
26
11
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
12
|
+
require_relative 'expression/classes/alternation'
|
|
13
|
+
require_relative 'expression/classes/anchor'
|
|
14
|
+
require_relative 'expression/classes/backreference'
|
|
15
|
+
require_relative 'expression/classes/character_set'
|
|
16
|
+
require_relative 'expression/classes/character_set/intersection'
|
|
17
|
+
require_relative 'expression/classes/character_set/range'
|
|
18
|
+
require_relative 'expression/classes/character_type'
|
|
19
|
+
require_relative 'expression/classes/conditional'
|
|
20
|
+
require_relative 'expression/classes/escape_sequence'
|
|
21
|
+
require_relative 'expression/classes/free_space'
|
|
22
|
+
require_relative 'expression/classes/group'
|
|
23
|
+
require_relative 'expression/classes/keep'
|
|
24
|
+
require_relative 'expression/classes/literal'
|
|
25
|
+
require_relative 'expression/classes/posix_class'
|
|
26
|
+
require_relative 'expression/classes/root'
|
|
27
|
+
require_relative 'expression/classes/unicode_property'
|
|
28
|
+
|
|
29
|
+
require_relative 'expression/methods/construct'
|
|
30
|
+
require_relative 'expression/methods/escape_sequence_char'
|
|
31
|
+
require_relative 'expression/methods/escape_sequence_codepoint'
|
|
32
|
+
require_relative 'expression/methods/human_name'
|
|
33
|
+
require_relative 'expression/methods/match'
|
|
34
|
+
require_relative 'expression/methods/match_length'
|
|
35
|
+
require_relative 'expression/methods/negative'
|
|
36
|
+
require_relative 'expression/methods/options'
|
|
37
|
+
require_relative 'expression/methods/parts'
|
|
38
|
+
require_relative 'expression/methods/printing'
|
|
39
|
+
require_relative 'expression/methods/referenced_expressions'
|
|
40
|
+
require_relative 'expression/methods/strfregexp'
|
|
41
|
+
require_relative 'expression/methods/tests'
|
|
42
|
+
require_relative 'expression/methods/traverse'
|
data/lib/regexp_parser/lexer.rb
CHANGED
data/lib/regexp_parser/parser.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'error'
|
|
4
|
+
require_relative 'expression'
|
|
3
5
|
|
|
4
6
|
class Regexp::Parser
|
|
5
7
|
include Regexp::Expression
|
|
@@ -319,6 +321,7 @@ class Regexp::Parser
|
|
|
319
321
|
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
|
320
322
|
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
|
321
323
|
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
|
324
|
+
when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
|
|
322
325
|
|
|
323
326
|
when :control
|
|
324
327
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
|
@@ -467,6 +470,7 @@ class Regexp::Parser
|
|
|
467
470
|
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
|
|
468
471
|
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
|
|
469
472
|
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
|
|
473
|
+
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
|
|
470
474
|
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
|
|
471
475
|
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
|
|
472
476
|
|
|
@@ -574,21 +578,24 @@ class Regexp::Parser
|
|
|
574
578
|
options_stack.last
|
|
575
579
|
end
|
|
576
580
|
|
|
577
|
-
# Assigns referenced expressions to
|
|
581
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
|
578
582
|
# an instance of Backreference::Number, its #referenced_expression is set to
|
|
579
583
|
# the instance of Group::Capture that it refers to via its number.
|
|
580
584
|
def assign_referenced_expressions
|
|
581
|
-
# find all
|
|
582
|
-
targets = { 0 => root }
|
|
585
|
+
# find all referenceable and referring expressions
|
|
586
|
+
targets = { 0 => [root] }
|
|
583
587
|
referrers = []
|
|
584
588
|
root.each_expression do |exp|
|
|
585
|
-
exp.
|
|
586
|
-
|
|
589
|
+
if exp.referential?
|
|
590
|
+
referrers << exp
|
|
591
|
+
elsif exp.is_a?(Group::Capture)
|
|
592
|
+
(targets[exp.identifier] ||= []) << exp
|
|
593
|
+
end
|
|
587
594
|
end
|
|
588
|
-
# assign
|
|
595
|
+
# assign referenced expressions to referring expressions
|
|
589
596
|
# (in a second iteration because there might be forward references)
|
|
590
597
|
referrers.each do |exp|
|
|
591
|
-
exp.
|
|
598
|
+
exp.referenced_expressions = targets[exp.reference] ||
|
|
592
599
|
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
|
593
600
|
end
|
|
594
601
|
end
|
|
@@ -8,6 +8,8 @@ age=12.1,age=12.1
|
|
|
8
8
|
age=13.0,age=13.0
|
|
9
9
|
age=14.0,age=14.0
|
|
10
10
|
age=15.0,age=15.0
|
|
11
|
+
age=15.1,age=15.1
|
|
12
|
+
age=16.0,age=16.0
|
|
11
13
|
age=2.0,age=2.0
|
|
12
14
|
age=2.1,age=2.1
|
|
13
15
|
age=3.0,age=3.0
|
|
@@ -102,18 +104,33 @@ extendedpictographic,extended_pictographic
|
|
|
102
104
|
extender,extender
|
|
103
105
|
finalpunctuation,final_punctuation
|
|
104
106
|
format,format
|
|
107
|
+
garay,garay
|
|
105
108
|
georgian,georgian
|
|
106
109
|
glagolitic,glagolitic
|
|
107
110
|
gothic,gothic
|
|
108
111
|
grantha,grantha
|
|
109
112
|
graph,graph
|
|
110
113
|
graphemebase,grapheme_base
|
|
114
|
+
graphemeclusterbreak=control,grapheme_cluster_break=control
|
|
115
|
+
graphemeclusterbreak=cr,grapheme_cluster_break=cr
|
|
116
|
+
graphemeclusterbreak=extend,grapheme_cluster_break=extend
|
|
117
|
+
graphemeclusterbreak=l,grapheme_cluster_break=l
|
|
118
|
+
graphemeclusterbreak=lf,grapheme_cluster_break=lf
|
|
119
|
+
graphemeclusterbreak=lv,grapheme_cluster_break=lv
|
|
120
|
+
graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
|
|
121
|
+
graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
|
|
122
|
+
graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
|
|
123
|
+
graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
|
|
124
|
+
graphemeclusterbreak=t,grapheme_cluster_break=t
|
|
125
|
+
graphemeclusterbreak=v,grapheme_cluster_break=v
|
|
126
|
+
graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
|
|
111
127
|
graphemeextend,grapheme_extend
|
|
112
128
|
graphemelink,grapheme_link
|
|
113
129
|
greek,greek
|
|
114
130
|
gujarati,gujarati
|
|
115
131
|
gunjalagondi,gunjala_gondi
|
|
116
132
|
gurmukhi,gurmukhi
|
|
133
|
+
gurungkhema,gurung_khema
|
|
117
134
|
han,han
|
|
118
135
|
hangul,hangul
|
|
119
136
|
hanifirohingya,hanifi_rohingya
|
|
@@ -123,11 +140,14 @@ hebrew,hebrew
|
|
|
123
140
|
hexdigit,hex_digit
|
|
124
141
|
hiragana,hiragana
|
|
125
142
|
hyphen,hyphen
|
|
143
|
+
idcompatmathcontinue,id_compat_math_continue
|
|
144
|
+
idcompatmathstart,id_compat_math_start
|
|
126
145
|
idcontinue,id_continue
|
|
127
146
|
ideographic,ideographic
|
|
128
147
|
idsbinaryoperator,ids_binary_operator
|
|
129
148
|
idstart,id_start
|
|
130
149
|
idstrinaryoperator,ids_trinary_operator
|
|
150
|
+
idsunaryoperator,ids_unary_operator
|
|
131
151
|
imperialaramaic,imperial_aramaic
|
|
132
152
|
inadlam,in_adlam
|
|
133
153
|
inaegeannumbers,in_aegean_numbers
|
|
@@ -190,6 +210,7 @@ incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
|
|
|
190
210
|
incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
|
|
191
211
|
incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
|
|
192
212
|
incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
|
|
213
|
+
incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
|
|
193
214
|
incombiningdiacriticalmarks,in_combining_diacritical_marks
|
|
194
215
|
incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
|
|
195
216
|
incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
|
|
@@ -223,6 +244,7 @@ induployan,in_duployan
|
|
|
223
244
|
inearlydynasticcuneiform,in_early_dynastic_cuneiform
|
|
224
245
|
inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls
|
|
225
246
|
inegyptianhieroglyphs,in_egyptian_hieroglyphs
|
|
247
|
+
inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a
|
|
226
248
|
inelbasan,in_elbasan
|
|
227
249
|
inelymaic,in_elymaic
|
|
228
250
|
inemoticons,in_emoticons
|
|
@@ -235,6 +257,7 @@ inethiopicextended,in_ethiopic_extended
|
|
|
235
257
|
inethiopicextendeda,in_ethiopic_extended_a
|
|
236
258
|
inethiopicextendedb,in_ethiopic_extended_b
|
|
237
259
|
inethiopicsupplement,in_ethiopic_supplement
|
|
260
|
+
ingaray,in_garay
|
|
238
261
|
ingeneralpunctuation,in_general_punctuation
|
|
239
262
|
ingeometricshapes,in_geometric_shapes
|
|
240
263
|
ingeometricshapesextended,in_geometric_shapes_extended
|
|
@@ -250,6 +273,7 @@ ingreekextended,in_greek_extended
|
|
|
250
273
|
ingujarati,in_gujarati
|
|
251
274
|
ingunjalagondi,in_gunjala_gondi
|
|
252
275
|
ingurmukhi,in_gurmukhi
|
|
276
|
+
ingurungkhema,in_gurung_khema
|
|
253
277
|
inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms
|
|
254
278
|
inhangulcompatibilityjamo,in_hangul_compatibility_jamo
|
|
255
279
|
inhanguljamo,in_hangul_jamo
|
|
@@ -291,6 +315,7 @@ inkhmer,in_khmer
|
|
|
291
315
|
inkhmersymbols,in_khmer_symbols
|
|
292
316
|
inkhojki,in_khojki
|
|
293
317
|
inkhudawadi,in_khudawadi
|
|
318
|
+
inkiratrai,in_kirat_rai
|
|
294
319
|
inlao,in_lao
|
|
295
320
|
inlatin1supplement,in_latin_1_supplement
|
|
296
321
|
inlatinextendeda,in_latin_extended_a
|
|
@@ -346,6 +371,7 @@ inmusicalsymbols,in_musical_symbols
|
|
|
346
371
|
inmyanmar,in_myanmar
|
|
347
372
|
inmyanmarextendeda,in_myanmar_extended_a
|
|
348
373
|
inmyanmarextendedb,in_myanmar_extended_b
|
|
374
|
+
inmyanmarextendedc,in_myanmar_extended_c
|
|
349
375
|
innabataean,in_nabataean
|
|
350
376
|
innagmundari,in_nag_mundari
|
|
351
377
|
innandinagari,in_nandinagari
|
|
@@ -367,6 +393,7 @@ inoldsogdian,in_old_sogdian
|
|
|
367
393
|
inoldsoutharabian,in_old_south_arabian
|
|
368
394
|
inoldturkic,in_old_turkic
|
|
369
395
|
inolduyghur,in_old_uyghur
|
|
396
|
+
inolonal,in_ol_onal
|
|
370
397
|
inopticalcharacterrecognition,in_optical_character_recognition
|
|
371
398
|
inoriya,in_oriya
|
|
372
399
|
inornamentaldingbats,in_ornamental_dingbats
|
|
@@ -406,6 +433,7 @@ inspacingmodifierletters,in_spacing_modifier_letters
|
|
|
406
433
|
inspecials,in_specials
|
|
407
434
|
insundanese,in_sundanese
|
|
408
435
|
insundanesesupplement,in_sundanese_supplement
|
|
436
|
+
insunuwar,in_sunuwar
|
|
409
437
|
insuperscriptsandsubscripts,in_superscripts_and_subscripts
|
|
410
438
|
insupplementalarrowsa,in_supplemental_arrows_a
|
|
411
439
|
insupplementalarrowsb,in_supplemental_arrows_b
|
|
@@ -419,6 +447,7 @@ insuttonsignwriting,in_sutton_signwriting
|
|
|
419
447
|
insylotinagri,in_syloti_nagri
|
|
420
448
|
insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a
|
|
421
449
|
insymbolsforlegacycomputing,in_symbols_for_legacy_computing
|
|
450
|
+
insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement
|
|
422
451
|
insyriac,in_syriac
|
|
423
452
|
insyriacsupplement,in_syriac_supplement
|
|
424
453
|
intagalog,in_tagalog
|
|
@@ -441,8 +470,10 @@ inthai,in_thai
|
|
|
441
470
|
intibetan,in_tibetan
|
|
442
471
|
intifinagh,in_tifinagh
|
|
443
472
|
intirhuta,in_tirhuta
|
|
473
|
+
intodhri,in_todhri
|
|
444
474
|
intoto,in_toto
|
|
445
475
|
intransportandmapsymbols,in_transport_and_map_symbols
|
|
476
|
+
intulutigalari,in_tulu_tigalari
|
|
446
477
|
inugaritic,in_ugaritic
|
|
447
478
|
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
|
|
448
479
|
inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
|
|
@@ -473,6 +504,7 @@ khitansmallscript,khitan_small_script
|
|
|
473
504
|
khmer,khmer
|
|
474
505
|
khojki,khojki
|
|
475
506
|
khudawadi,khudawadi
|
|
507
|
+
kiratrai,kirat_rai
|
|
476
508
|
lao,lao
|
|
477
509
|
latin,latin
|
|
478
510
|
lepcha,lepcha
|
|
@@ -506,6 +538,7 @@ meroiticcursive,meroitic_cursive
|
|
|
506
538
|
meroitichieroglyphs,meroitic_hieroglyphs
|
|
507
539
|
miao,miao
|
|
508
540
|
modi,modi
|
|
541
|
+
modifiercombiningmark,modifier_combining_mark
|
|
509
542
|
modifierletter,modifier_letter
|
|
510
543
|
modifiersymbol,modifier_symbol
|
|
511
544
|
mongolian,mongolian
|
|
@@ -535,6 +568,7 @@ oldsogdian,old_sogdian
|
|
|
535
568
|
oldsoutharabian,old_south_arabian
|
|
536
569
|
oldturkic,old_turkic
|
|
537
570
|
olduyghur,old_uyghur
|
|
571
|
+
olonal,ol_onal
|
|
538
572
|
openpunctuation,open_punctuation
|
|
539
573
|
oriya,oriya
|
|
540
574
|
osage,osage
|
|
@@ -588,6 +622,7 @@ space,space
|
|
|
588
622
|
spaceseparator,space_separator
|
|
589
623
|
spacingmark,spacing_mark
|
|
590
624
|
sundanese,sundanese
|
|
625
|
+
sunuwar,sunuwar
|
|
591
626
|
surrogate,surrogate
|
|
592
627
|
sylotinagri,syloti_nagri
|
|
593
628
|
symbol,symbol
|
|
@@ -609,7 +644,9 @@ tibetan,tibetan
|
|
|
609
644
|
tifinagh,tifinagh
|
|
610
645
|
tirhuta,tirhuta
|
|
611
646
|
titlecaseletter,titlecase_letter
|
|
647
|
+
todhri,todhri
|
|
612
648
|
toto,toto
|
|
649
|
+
tulutigalari,tulu_tigalari
|
|
613
650
|
ugaritic,ugaritic
|
|
614
651
|
unassigned,unassigned
|
|
615
652
|
unifiedideograph,unified_ideograph
|
|
@@ -58,6 +58,7 @@ epres,emoji_presentation
|
|
|
58
58
|
ethi,ethiopic
|
|
59
59
|
ext,extender
|
|
60
60
|
extpict,extended_pictographic
|
|
61
|
+
gara,garay
|
|
61
62
|
geor,georgian
|
|
62
63
|
glag,glagolitic
|
|
63
64
|
gong,gunjala_gondi
|
|
@@ -69,6 +70,7 @@ grek,greek
|
|
|
69
70
|
grext,grapheme_extend
|
|
70
71
|
grlink,grapheme_link
|
|
71
72
|
gujr,gujarati
|
|
73
|
+
gukh,gurung_khema
|
|
72
74
|
guru,gurmukhi
|
|
73
75
|
hang,hangul
|
|
74
76
|
hani,han
|
|
@@ -86,6 +88,7 @@ ideo,ideographic
|
|
|
86
88
|
ids,id_start
|
|
87
89
|
idsb,ids_binary_operator
|
|
88
90
|
idst,ids_trinary_operator
|
|
91
|
+
idsu,ids_unary_operator
|
|
89
92
|
ital,old_italic
|
|
90
93
|
java,javanese
|
|
91
94
|
joinc,join_control
|
|
@@ -96,6 +99,7 @@ khmr,khmer
|
|
|
96
99
|
khoj,khojki
|
|
97
100
|
kits,khitan_small_script
|
|
98
101
|
knda,kannada
|
|
102
|
+
krai,kirat_rai
|
|
99
103
|
kthi,kaithi
|
|
100
104
|
l,letter
|
|
101
105
|
lana,tai_tham
|
|
@@ -121,6 +125,7 @@ mand,mandaic
|
|
|
121
125
|
mani,manichaean
|
|
122
126
|
marc,marchen
|
|
123
127
|
mc,spacing_mark
|
|
128
|
+
mcm,modifier_combining_mark
|
|
124
129
|
me,enclosing_mark
|
|
125
130
|
medf,medefaidrin
|
|
126
131
|
mend,mende_kikakui
|
|
@@ -153,6 +158,7 @@ oids,other_id_start
|
|
|
153
158
|
olck,ol_chiki
|
|
154
159
|
olower,other_lowercase
|
|
155
160
|
omath,other_math
|
|
161
|
+
onao,ol_onal
|
|
156
162
|
orkh,old_turkic
|
|
157
163
|
orya,oriya
|
|
158
164
|
osge,osage
|
|
@@ -207,6 +213,7 @@ sora,sora_sompeng
|
|
|
207
213
|
soyo,soyombo
|
|
208
214
|
sterm,sentence_terminal
|
|
209
215
|
sund,sundanese
|
|
216
|
+
sunu,sunuwar
|
|
210
217
|
sylo,syloti_nagri
|
|
211
218
|
syrc,syriac
|
|
212
219
|
tagb,tagbanwa
|
|
@@ -224,6 +231,8 @@ thaa,thaana
|
|
|
224
231
|
tibt,tibetan
|
|
225
232
|
tirh,tirhuta
|
|
226
233
|
tnsa,tangsa
|
|
234
|
+
todr,todhri
|
|
235
|
+
tutg,tulu_tigalari
|
|
227
236
|
ugar,ugaritic
|
|
228
237
|
uideo,unified_ideograph
|
|
229
238
|
vaii,vai
|
|
@@ -37,7 +37,8 @@
|
|
|
37
37
|
octal_sequence = [0-7]{1,3};
|
|
38
38
|
|
|
39
39
|
hex_sequence = 'x' . xdigit{1,2};
|
|
40
|
-
hex_sequence_err = 'x' . [^0-
|
|
40
|
+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
|
|
41
|
+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
|
|
41
42
|
|
|
42
43
|
codepoint_single = 'u' . xdigit{4};
|
|
43
44
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
|
@@ -78,8 +79,8 @@
|
|
|
78
79
|
# try to treat every other group head as options group, like Ruby
|
|
79
80
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
|
80
81
|
|
|
81
|
-
group_name_id_ab = ([
|
|
82
|
-
group_name_id_sq = ([^0-9\-']
|
|
82
|
+
group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
|
83
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
|
83
84
|
group_number = '-'? . [0-9]+;
|
|
84
85
|
group_level = [+\-] . [0-9]+;
|
|
85
86
|
|
|
@@ -210,7 +211,7 @@
|
|
|
210
211
|
type = :nonposixclass
|
|
211
212
|
end
|
|
212
213
|
|
|
213
|
-
unless
|
|
214
|
+
unless POSIX_CLASSES[class_name]
|
|
214
215
|
raise ValidationError.for(:posix_class, text)
|
|
215
216
|
end
|
|
216
217
|
|
|
@@ -246,7 +247,7 @@
|
|
|
246
247
|
# Treat all remaining escapes - those not supported in sets - as literal.
|
|
247
248
|
# (This currently includes \^, \-, \&, \:, although these could potentially
|
|
248
249
|
# be meta chars when not escaped, depending on their position in the set.)
|
|
249
|
-
any > (escaped_set_alpha, 1) {
|
|
250
|
+
(any | utf8_multibyte) > (escaped_set_alpha, 1) {
|
|
250
251
|
emit(:escape, :literal, copy(data, ts-1, te))
|
|
251
252
|
fret;
|
|
252
253
|
};
|
|
@@ -256,9 +257,21 @@
|
|
|
256
257
|
# escape sequence scanner
|
|
257
258
|
# --------------------------------------------------------------------------
|
|
258
259
|
escape_sequence := |*
|
|
259
|
-
[1-9] {
|
|
260
|
+
[1-9] . [0-9]* {
|
|
260
261
|
text = copy(data, ts-1, te)
|
|
261
|
-
|
|
262
|
+
|
|
263
|
+
# If not enough groups have been opened, there is a fallback to either an
|
|
264
|
+
# octal or literal interpretation for 2+ digit numerical escapes.
|
|
265
|
+
digits = text[1..-1]
|
|
266
|
+
if digits.size == 1 || digits.to_i <= capturing_group_count
|
|
267
|
+
emit(:backref, :number, text)
|
|
268
|
+
elsif digits =~ /\A[0-7]{2,}\z/
|
|
269
|
+
emit(:escape, :octal, text)
|
|
270
|
+
else
|
|
271
|
+
emit(:escape, :literal, text[0..1])
|
|
272
|
+
emit(:literal, :literal, text[2..-1])
|
|
273
|
+
end
|
|
274
|
+
|
|
262
275
|
fret;
|
|
263
276
|
};
|
|
264
277
|
|
|
@@ -267,6 +280,13 @@
|
|
|
267
280
|
fret;
|
|
268
281
|
};
|
|
269
282
|
|
|
283
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
|
284
|
+
text = copy(data, ts-1, te)
|
|
285
|
+
emit(:escape, :literal, text[0, 2])
|
|
286
|
+
emit(:literal, :literal, text[2])
|
|
287
|
+
fret;
|
|
288
|
+
};
|
|
289
|
+
|
|
270
290
|
meta_char {
|
|
271
291
|
case text = copy(data, ts-1, te)
|
|
272
292
|
when '\.'; emit(:escape, :dot, text)
|
|
@@ -314,6 +334,16 @@
|
|
|
314
334
|
fret;
|
|
315
335
|
};
|
|
316
336
|
|
|
337
|
+
high_hex_sequence > (escaped_alpha, 5) {
|
|
338
|
+
text = copy(data, ts-1, te)
|
|
339
|
+
if regexp_encoding == Encoding::BINARY
|
|
340
|
+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
|
|
341
|
+
else
|
|
342
|
+
emit(:escape, :utf8_hex, text)
|
|
343
|
+
end
|
|
344
|
+
fret;
|
|
345
|
+
};
|
|
346
|
+
|
|
317
347
|
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
|
318
348
|
emit(:escape, :hex, copy(data, ts-1, te))
|
|
319
349
|
fret;
|
|
@@ -357,6 +387,7 @@
|
|
|
357
387
|
conditional_expression := |*
|
|
358
388
|
group_lookup . ')' {
|
|
359
389
|
text = copy(data, ts, te-1)
|
|
390
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
|
360
391
|
emit(:conditional, :condition, text)
|
|
361
392
|
emit(:conditional, :condition_close, ')')
|
|
362
393
|
};
|
|
@@ -506,6 +537,7 @@
|
|
|
506
537
|
};
|
|
507
538
|
|
|
508
539
|
group_open @group_opened {
|
|
540
|
+
self.capturing_group_count = capturing_group_count + 1
|
|
509
541
|
text = copy(data, ts, te)
|
|
510
542
|
emit(:group, :capture, text)
|
|
511
543
|
};
|
|
@@ -534,13 +566,13 @@
|
|
|
534
566
|
case text = copy(data, ts, te)
|
|
535
567
|
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
|
536
568
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
|
537
|
-
when /^\\k(.)[1-9]\d*['>]$/
|
|
569
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
|
538
570
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
|
539
|
-
when /^\\k(.)-[1-9]\d*['>]$/
|
|
571
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
|
540
572
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
|
541
573
|
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
|
542
574
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
|
543
|
-
when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
|
|
575
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
|
544
576
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
|
545
577
|
else
|
|
546
578
|
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
|
@@ -553,9 +585,9 @@
|
|
|
553
585
|
case text = copy(data, ts, te)
|
|
554
586
|
when /^\\g(.)[^0-9+\-].*['>]$/
|
|
555
587
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
|
556
|
-
when /^\\g(.)\d
|
|
588
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
|
557
589
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
|
558
|
-
when /^\\g(.)[+-]\d
|
|
590
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
|
559
591
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
|
560
592
|
else
|
|
561
593
|
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
|
@@ -632,9 +664,9 @@
|
|
|
632
664
|
*|;
|
|
633
665
|
}%%
|
|
634
666
|
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
667
|
+
require_relative 'scanner/errors/scanner_error'
|
|
668
|
+
require_relative 'scanner/errors/premature_end_error'
|
|
669
|
+
require_relative 'scanner/errors/validation_error'
|
|
638
670
|
|
|
639
671
|
class Regexp::Scanner
|
|
640
672
|
# Scans the given regular expression text, or Regexp object and collects the
|
|
@@ -654,6 +686,7 @@ class Regexp::Scanner
|
|
|
654
686
|
|
|
655
687
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
|
656
688
|
self.free_spacing = free_spacing?(input_object, options)
|
|
689
|
+
self.regexp_encoding = extract_encoding(input_object, options)
|
|
657
690
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
|
658
691
|
|
|
659
692
|
data = input.unpack("c*")
|
|
@@ -664,6 +697,7 @@ class Regexp::Scanner
|
|
|
664
697
|
|
|
665
698
|
self.set_depth = 0
|
|
666
699
|
self.group_depth = 0
|
|
700
|
+
self.capturing_group_count = 0
|
|
667
701
|
self.conditional_stack = []
|
|
668
702
|
self.char_pos = 0
|
|
669
703
|
|
|
@@ -703,10 +737,11 @@ class Regexp::Scanner
|
|
|
703
737
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
|
704
738
|
end
|
|
705
739
|
|
|
706
|
-
|
|
740
|
+
# Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
|
|
741
|
+
POSIX_CLASSES =
|
|
707
742
|
%w[alnum alpha ascii blank cntrl digit graph
|
|
708
743
|
lower print punct space upper word xdigit]
|
|
709
|
-
|
|
744
|
+
.inject({}) { |o, e| o.merge(e => true) }.freeze
|
|
710
745
|
|
|
711
746
|
# Emits an array with the details of the scanned pattern
|
|
712
747
|
def emit(type, token, text)
|
|
@@ -734,16 +769,25 @@ class Regexp::Scanner
|
|
|
734
769
|
end
|
|
735
770
|
end
|
|
736
771
|
|
|
737
|
-
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
|
772
|
+
attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
|
|
738
773
|
|
|
739
774
|
private
|
|
740
775
|
|
|
741
776
|
attr_accessor :block,
|
|
742
777
|
:collect_tokens, :tokens, :prev_token,
|
|
743
778
|
:free_spacing, :spacing_stack,
|
|
779
|
+
:regexp_encoding,
|
|
744
780
|
:group_depth, :set_depth, :conditional_stack,
|
|
745
781
|
:char_pos
|
|
746
782
|
|
|
783
|
+
def extract_encoding(input_object, options)
|
|
784
|
+
if input_object.is_a?(::Regexp)
|
|
785
|
+
input_object.encoding
|
|
786
|
+
elsif options && (options & Regexp::NOENCODING)
|
|
787
|
+
Encoding::BINARY
|
|
788
|
+
end
|
|
789
|
+
end
|
|
790
|
+
|
|
747
791
|
def free_spacing?(input_object, options)
|
|
748
792
|
if options && !input_object.is_a?(String)
|
|
749
793
|
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|