regexp_parser 2.10.0 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/Rakefile +3 -3
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +6 -0
- data/lib/regexp_parser/expression/shared.rb +4 -3
- data/lib/regexp_parser/parser.rb +1 -0
- data/lib/regexp_parser/scanner/properties/long.csv +19 -0
- data/lib/regexp_parser/scanner/properties/short.csv +8 -0
- data/lib/regexp_parser/scanner/scanner.rl +35 -7
- data/lib/regexp_parser/scanner.rb +500 -470
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/syntax/token/unicode_property.rb +13 -0
- data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +4 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7598b7311a82778cbcb493188dad178ce93c8478e420cd9e2382732ee90d4e1
|
4
|
+
data.tar.gz: 60a8399981030bdef025cf9657e043a5ccac93adeee62a589a8adb41ec460664
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a7ac06fda5f76d4497b8f01d1e724917d009f7c9ea10befcf03a801af8e769b52619433a22cc997cf584b03e1ca9e6ced257f5fc07e327c966f5c25714d2d0b4
|
7
|
+
data.tar.gz: 3d3f89a383bb63208a41801ea059bfc407ff2e88d657d23b0f13740d418335ad47c9f5174bc1d5b7f06841d7a461828c57efa1f97f8bc1b9b42e255959bd18cf
|
data/LICENSE
CHANGED
data/Rakefile
CHANGED
@@ -14,10 +14,10 @@ RSpec::Core::RakeTask.new(:spec)
|
|
14
14
|
task :default => [:'test:full']
|
15
15
|
|
16
16
|
namespace :test do
|
17
|
-
task full: [:
|
17
|
+
task full: [:ragel, :spec]
|
18
18
|
end
|
19
19
|
|
20
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
21
21
|
# latest scanner code is generated and included in the build.
|
22
|
-
desc "Runs ragel
|
23
|
-
task :
|
22
|
+
desc "Runs ragel before building the gem"
|
23
|
+
task build: :ragel
|
@@ -18,6 +18,7 @@ module Regexp::Expression
|
|
18
18
|
Codepoint = Class.new(Base) # e.g. \u000A
|
19
19
|
|
20
20
|
CodepointList = Class.new(Base) # e.g. \u{A B}
|
21
|
+
UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
|
21
22
|
|
22
23
|
AbstractMetaControlSequence = Class.new(Base)
|
23
24
|
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
|
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
|
|
15
15
|
Hex.class_eval { def codepoint; text[/\h+/].hex end }
|
16
16
|
Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
|
17
17
|
|
18
|
+
UTF8Hex.class_eval do
|
19
|
+
def codepoint
|
20
|
+
text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
18
24
|
CodepointList.class_eval do
|
19
25
|
# Maybe this should be a unique top-level expression class?
|
20
26
|
def char
|
@@ -70,11 +70,12 @@ module Regexp::Expression
|
|
70
70
|
# lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
|
71
71
|
#
|
72
72
|
def to_s(format = :full)
|
73
|
-
base =
|
73
|
+
base = ''.dup
|
74
|
+
parts.each do |part|
|
74
75
|
if part.instance_of?(String)
|
75
|
-
|
76
|
+
base << part
|
76
77
|
elsif !part.custom_to_s_handling
|
77
|
-
|
78
|
+
base << part.to_s(:original)
|
78
79
|
end
|
79
80
|
end
|
80
81
|
"#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -319,6 +319,7 @@ class Regexp::Parser
|
|
319
319
|
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
320
320
|
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
321
321
|
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
322
|
+
when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
|
322
323
|
|
323
324
|
when :control
|
324
325
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
@@ -9,6 +9,7 @@ age=13.0,age=13.0
|
|
9
9
|
age=14.0,age=14.0
|
10
10
|
age=15.0,age=15.0
|
11
11
|
age=15.1,age=15.1
|
12
|
+
age=16.0,age=16.0
|
12
13
|
age=2.0,age=2.0
|
13
14
|
age=2.1,age=2.1
|
14
15
|
age=3.0,age=3.0
|
@@ -103,6 +104,7 @@ extendedpictographic,extended_pictographic
|
|
103
104
|
extender,extender
|
104
105
|
finalpunctuation,final_punctuation
|
105
106
|
format,format
|
107
|
+
garay,garay
|
106
108
|
georgian,georgian
|
107
109
|
glagolitic,glagolitic
|
108
110
|
gothic,gothic
|
@@ -128,6 +130,7 @@ greek,greek
|
|
128
130
|
gujarati,gujarati
|
129
131
|
gunjalagondi,gunjala_gondi
|
130
132
|
gurmukhi,gurmukhi
|
133
|
+
gurungkhema,gurung_khema
|
131
134
|
han,han
|
132
135
|
hangul,hangul
|
133
136
|
hanifirohingya,hanifi_rohingya
|
@@ -241,6 +244,7 @@ induployan,in_duployan
|
|
241
244
|
inearlydynasticcuneiform,in_early_dynastic_cuneiform
|
242
245
|
inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls
|
243
246
|
inegyptianhieroglyphs,in_egyptian_hieroglyphs
|
247
|
+
inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a
|
244
248
|
inelbasan,in_elbasan
|
245
249
|
inelymaic,in_elymaic
|
246
250
|
inemoticons,in_emoticons
|
@@ -253,6 +257,7 @@ inethiopicextended,in_ethiopic_extended
|
|
253
257
|
inethiopicextendeda,in_ethiopic_extended_a
|
254
258
|
inethiopicextendedb,in_ethiopic_extended_b
|
255
259
|
inethiopicsupplement,in_ethiopic_supplement
|
260
|
+
ingaray,in_garay
|
256
261
|
ingeneralpunctuation,in_general_punctuation
|
257
262
|
ingeometricshapes,in_geometric_shapes
|
258
263
|
ingeometricshapesextended,in_geometric_shapes_extended
|
@@ -268,6 +273,7 @@ ingreekextended,in_greek_extended
|
|
268
273
|
ingujarati,in_gujarati
|
269
274
|
ingunjalagondi,in_gunjala_gondi
|
270
275
|
ingurmukhi,in_gurmukhi
|
276
|
+
ingurungkhema,in_gurung_khema
|
271
277
|
inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms
|
272
278
|
inhangulcompatibilityjamo,in_hangul_compatibility_jamo
|
273
279
|
inhanguljamo,in_hangul_jamo
|
@@ -309,6 +315,7 @@ inkhmer,in_khmer
|
|
309
315
|
inkhmersymbols,in_khmer_symbols
|
310
316
|
inkhojki,in_khojki
|
311
317
|
inkhudawadi,in_khudawadi
|
318
|
+
inkiratrai,in_kirat_rai
|
312
319
|
inlao,in_lao
|
313
320
|
inlatin1supplement,in_latin_1_supplement
|
314
321
|
inlatinextendeda,in_latin_extended_a
|
@@ -364,6 +371,7 @@ inmusicalsymbols,in_musical_symbols
|
|
364
371
|
inmyanmar,in_myanmar
|
365
372
|
inmyanmarextendeda,in_myanmar_extended_a
|
366
373
|
inmyanmarextendedb,in_myanmar_extended_b
|
374
|
+
inmyanmarextendedc,in_myanmar_extended_c
|
367
375
|
innabataean,in_nabataean
|
368
376
|
innagmundari,in_nag_mundari
|
369
377
|
innandinagari,in_nandinagari
|
@@ -385,6 +393,7 @@ inoldsogdian,in_old_sogdian
|
|
385
393
|
inoldsoutharabian,in_old_south_arabian
|
386
394
|
inoldturkic,in_old_turkic
|
387
395
|
inolduyghur,in_old_uyghur
|
396
|
+
inolonal,in_ol_onal
|
388
397
|
inopticalcharacterrecognition,in_optical_character_recognition
|
389
398
|
inoriya,in_oriya
|
390
399
|
inornamentaldingbats,in_ornamental_dingbats
|
@@ -424,6 +433,7 @@ inspacingmodifierletters,in_spacing_modifier_letters
|
|
424
433
|
inspecials,in_specials
|
425
434
|
insundanese,in_sundanese
|
426
435
|
insundanesesupplement,in_sundanese_supplement
|
436
|
+
insunuwar,in_sunuwar
|
427
437
|
insuperscriptsandsubscripts,in_superscripts_and_subscripts
|
428
438
|
insupplementalarrowsa,in_supplemental_arrows_a
|
429
439
|
insupplementalarrowsb,in_supplemental_arrows_b
|
@@ -437,6 +447,7 @@ insuttonsignwriting,in_sutton_signwriting
|
|
437
447
|
insylotinagri,in_syloti_nagri
|
438
448
|
insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a
|
439
449
|
insymbolsforlegacycomputing,in_symbols_for_legacy_computing
|
450
|
+
insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement
|
440
451
|
insyriac,in_syriac
|
441
452
|
insyriacsupplement,in_syriac_supplement
|
442
453
|
intagalog,in_tagalog
|
@@ -459,8 +470,10 @@ inthai,in_thai
|
|
459
470
|
intibetan,in_tibetan
|
460
471
|
intifinagh,in_tifinagh
|
461
472
|
intirhuta,in_tirhuta
|
473
|
+
intodhri,in_todhri
|
462
474
|
intoto,in_toto
|
463
475
|
intransportandmapsymbols,in_transport_and_map_symbols
|
476
|
+
intulutigalari,in_tulu_tigalari
|
464
477
|
inugaritic,in_ugaritic
|
465
478
|
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
|
466
479
|
inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
|
@@ -491,6 +504,7 @@ khitansmallscript,khitan_small_script
|
|
491
504
|
khmer,khmer
|
492
505
|
khojki,khojki
|
493
506
|
khudawadi,khudawadi
|
507
|
+
kiratrai,kirat_rai
|
494
508
|
lao,lao
|
495
509
|
latin,latin
|
496
510
|
lepcha,lepcha
|
@@ -524,6 +538,7 @@ meroiticcursive,meroitic_cursive
|
|
524
538
|
meroitichieroglyphs,meroitic_hieroglyphs
|
525
539
|
miao,miao
|
526
540
|
modi,modi
|
541
|
+
modifiercombiningmark,modifier_combining_mark
|
527
542
|
modifierletter,modifier_letter
|
528
543
|
modifiersymbol,modifier_symbol
|
529
544
|
mongolian,mongolian
|
@@ -553,6 +568,7 @@ oldsogdian,old_sogdian
|
|
553
568
|
oldsoutharabian,old_south_arabian
|
554
569
|
oldturkic,old_turkic
|
555
570
|
olduyghur,old_uyghur
|
571
|
+
olonal,ol_onal
|
556
572
|
openpunctuation,open_punctuation
|
557
573
|
oriya,oriya
|
558
574
|
osage,osage
|
@@ -606,6 +622,7 @@ space,space
|
|
606
622
|
spaceseparator,space_separator
|
607
623
|
spacingmark,spacing_mark
|
608
624
|
sundanese,sundanese
|
625
|
+
sunuwar,sunuwar
|
609
626
|
surrogate,surrogate
|
610
627
|
sylotinagri,syloti_nagri
|
611
628
|
symbol,symbol
|
@@ -627,7 +644,9 @@ tibetan,tibetan
|
|
627
644
|
tifinagh,tifinagh
|
628
645
|
tirhuta,tirhuta
|
629
646
|
titlecaseletter,titlecase_letter
|
647
|
+
todhri,todhri
|
630
648
|
toto,toto
|
649
|
+
tulutigalari,tulu_tigalari
|
631
650
|
ugaritic,ugaritic
|
632
651
|
unassigned,unassigned
|
633
652
|
unifiedideograph,unified_ideograph
|
@@ -58,6 +58,7 @@ epres,emoji_presentation
|
|
58
58
|
ethi,ethiopic
|
59
59
|
ext,extender
|
60
60
|
extpict,extended_pictographic
|
61
|
+
gara,garay
|
61
62
|
geor,georgian
|
62
63
|
glag,glagolitic
|
63
64
|
gong,gunjala_gondi
|
@@ -69,6 +70,7 @@ grek,greek
|
|
69
70
|
grext,grapheme_extend
|
70
71
|
grlink,grapheme_link
|
71
72
|
gujr,gujarati
|
73
|
+
gukh,gurung_khema
|
72
74
|
guru,gurmukhi
|
73
75
|
hang,hangul
|
74
76
|
hani,han
|
@@ -97,6 +99,7 @@ khmr,khmer
|
|
97
99
|
khoj,khojki
|
98
100
|
kits,khitan_small_script
|
99
101
|
knda,kannada
|
102
|
+
krai,kirat_rai
|
100
103
|
kthi,kaithi
|
101
104
|
l,letter
|
102
105
|
lana,tai_tham
|
@@ -122,6 +125,7 @@ mand,mandaic
|
|
122
125
|
mani,manichaean
|
123
126
|
marc,marchen
|
124
127
|
mc,spacing_mark
|
128
|
+
mcm,modifier_combining_mark
|
125
129
|
me,enclosing_mark
|
126
130
|
medf,medefaidrin
|
127
131
|
mend,mende_kikakui
|
@@ -154,6 +158,7 @@ oids,other_id_start
|
|
154
158
|
olck,ol_chiki
|
155
159
|
olower,other_lowercase
|
156
160
|
omath,other_math
|
161
|
+
onao,ol_onal
|
157
162
|
orkh,old_turkic
|
158
163
|
orya,oriya
|
159
164
|
osge,osage
|
@@ -208,6 +213,7 @@ sora,sora_sompeng
|
|
208
213
|
soyo,soyombo
|
209
214
|
sterm,sentence_terminal
|
210
215
|
sund,sundanese
|
216
|
+
sunu,sunuwar
|
211
217
|
sylo,syloti_nagri
|
212
218
|
syrc,syriac
|
213
219
|
tagb,tagbanwa
|
@@ -225,6 +231,8 @@ thaa,thaana
|
|
225
231
|
tibt,tibetan
|
226
232
|
tirh,tirhuta
|
227
233
|
tnsa,tangsa
|
234
|
+
todr,todhri
|
235
|
+
tutg,tulu_tigalari
|
228
236
|
ugar,ugaritic
|
229
237
|
uideo,unified_ideograph
|
230
238
|
vaii,vai
|
@@ -37,7 +37,8 @@
|
|
37
37
|
octal_sequence = [0-7]{1,3};
|
38
38
|
|
39
39
|
hex_sequence = 'x' . xdigit{1,2};
|
40
|
-
hex_sequence_err = 'x' . [^0-
|
40
|
+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
|
41
|
+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
|
41
42
|
|
42
43
|
codepoint_single = 'u' . xdigit{4};
|
43
44
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
@@ -210,7 +211,7 @@
|
|
210
211
|
type = :nonposixclass
|
211
212
|
end
|
212
213
|
|
213
|
-
unless
|
214
|
+
unless POSIX_CLASSES[class_name]
|
214
215
|
raise ValidationError.for(:posix_class, text)
|
215
216
|
end
|
216
217
|
|
@@ -256,9 +257,21 @@
|
|
256
257
|
# escape sequence scanner
|
257
258
|
# --------------------------------------------------------------------------
|
258
259
|
escape_sequence := |*
|
259
|
-
[1-9] {
|
260
|
+
[1-9] . [0-9]* {
|
260
261
|
text = copy(data, ts-1, te)
|
261
|
-
|
262
|
+
|
263
|
+
# If not enough groups have been opened, there is a fallback to either an
|
264
|
+
# octal or literal interpretation for 2+ digit numerical escapes.
|
265
|
+
digits = text[1..-1]
|
266
|
+
if digits.size == 1 || digits.to_i <= capturing_group_count
|
267
|
+
emit(:backref, :number, text)
|
268
|
+
elsif digits =~ /\A[0-7]{2,}\z/
|
269
|
+
emit(:escape, :octal, text)
|
270
|
+
else
|
271
|
+
emit(:escape, :literal, text[0..1])
|
272
|
+
emit(:literal, :literal, text[2..-1])
|
273
|
+
end
|
274
|
+
|
262
275
|
fret;
|
263
276
|
};
|
264
277
|
|
@@ -321,6 +334,16 @@
|
|
321
334
|
fret;
|
322
335
|
};
|
323
336
|
|
337
|
+
high_hex_sequence > (escaped_alpha, 5) {
|
338
|
+
text = copy(data, ts-1, te)
|
339
|
+
if regexp_encoding == Encoding::BINARY
|
340
|
+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
|
341
|
+
else
|
342
|
+
emit(:escape, :utf8_hex, text)
|
343
|
+
end
|
344
|
+
fret;
|
345
|
+
};
|
346
|
+
|
324
347
|
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
348
|
emit(:escape, :hex, copy(data, ts-1, te))
|
326
349
|
fret;
|
@@ -514,6 +537,7 @@
|
|
514
537
|
};
|
515
538
|
|
516
539
|
group_open @group_opened {
|
540
|
+
self.capturing_group_count = capturing_group_count + 1
|
517
541
|
text = copy(data, ts, te)
|
518
542
|
emit(:group, :capture, text)
|
519
543
|
};
|
@@ -662,6 +686,7 @@ class Regexp::Scanner
|
|
662
686
|
|
663
687
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
688
|
self.free_spacing = free_spacing?(input_object, options)
|
689
|
+
self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
|
665
690
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
666
691
|
|
667
692
|
data = input.unpack("c*")
|
@@ -672,6 +697,7 @@ class Regexp::Scanner
|
|
672
697
|
|
673
698
|
self.set_depth = 0
|
674
699
|
self.group_depth = 0
|
700
|
+
self.capturing_group_count = 0
|
675
701
|
self.conditional_stack = []
|
676
702
|
self.char_pos = 0
|
677
703
|
|
@@ -711,10 +737,11 @@ class Regexp::Scanner
|
|
711
737
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
738
|
end
|
713
739
|
|
714
|
-
|
740
|
+
# Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
|
741
|
+
POSIX_CLASSES =
|
715
742
|
%w[alnum alpha ascii blank cntrl digit graph
|
716
743
|
lower print punct space upper word xdigit]
|
717
|
-
|
744
|
+
.inject({}) { |o, e| o.merge(e => true) }.freeze
|
718
745
|
|
719
746
|
# Emits an array with the details of the scanned pattern
|
720
747
|
def emit(type, token, text)
|
@@ -742,13 +769,14 @@ class Regexp::Scanner
|
|
742
769
|
end
|
743
770
|
end
|
744
771
|
|
745
|
-
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
772
|
+
attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
|
746
773
|
|
747
774
|
private
|
748
775
|
|
749
776
|
attr_accessor :block,
|
750
777
|
:collect_tokens, :tokens, :prev_token,
|
751
778
|
:free_spacing, :spacing_stack,
|
779
|
+
:regexp_encoding,
|
752
780
|
:group_depth, :set_depth, :conditional_stack,
|
753
781
|
:char_pos
|
754
782
|
|