regexp_parser 2.0.2 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +54 -0
  3. data/Gemfile +5 -1
  4. data/README.md +15 -21
  5. data/Rakefile +11 -17
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression/base.rb +123 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
  12. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +1 -0
  15. data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
  16. data/lib/regexp_parser/expression/classes/group.rb +6 -1
  17. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  18. data/lib/regexp_parser/expression/classes/property.rb +1 -3
  19. data/lib/regexp_parser/expression/classes/root.rb +0 -1
  20. data/lib/regexp_parser/expression/classes/type.rb +0 -2
  21. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  22. data/lib/regexp_parser/expression/sequence.rb +3 -10
  23. data/lib/regexp_parser/expression/subexpression.rb +1 -2
  24. data/lib/regexp_parser/expression.rb +7 -130
  25. data/lib/regexp_parser/lexer.rb +7 -5
  26. data/lib/regexp_parser/parser.rb +282 -334
  27. data/lib/regexp_parser/scanner/properties/long.yml +13 -0
  28. data/lib/regexp_parser/scanner/properties/short.yml +9 -1
  29. data/lib/regexp_parser/scanner/scanner.rl +64 -87
  30. data/lib/regexp_parser/scanner.rb +1024 -1073
  31. data/lib/regexp_parser/syntax/any.rb +2 -4
  32. data/lib/regexp_parser/syntax/base.rb +10 -10
  33. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  34. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  35. data/lib/regexp_parser/syntax/{tokens/backref.rb → token/backreference.rb} +6 -5
  36. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  37. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  38. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  39. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  40. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  41. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  42. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  43. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  44. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  45. data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
  46. data/lib/regexp_parser/syntax/token.rb +45 -0
  47. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  48. data/lib/regexp_parser/syntax/versions/1.8.6.rb +1 -1
  49. data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
  50. data/lib/regexp_parser/syntax.rb +8 -6
  51. data/lib/regexp_parser/token.rb +9 -20
  52. data/lib/regexp_parser/version.rb +1 -1
  53. data/lib/regexp_parser.rb +0 -2
  54. data/spec/expression/clone_spec.rb +36 -4
  55. data/spec/expression/free_space_spec.rb +2 -2
  56. data/spec/expression/methods/match_length_spec.rb +2 -2
  57. data/spec/lexer/nesting_spec.rb +2 -2
  58. data/spec/lexer/refcalls_spec.rb +5 -0
  59. data/spec/parser/all_spec.rb +2 -2
  60. data/spec/parser/escapes_spec.rb +43 -31
  61. data/spec/parser/properties_spec.rb +6 -4
  62. data/spec/parser/refcalls_spec.rb +5 -0
  63. data/spec/parser/set/ranges_spec.rb +26 -16
  64. data/spec/scanner/escapes_spec.rb +29 -20
  65. data/spec/scanner/refcalls_spec.rb +19 -0
  66. data/spec/scanner/sets_spec.rb +66 -23
  67. data/spec/spec_helper.rb +13 -1
  68. data/spec/support/capturing_stderr.rb +9 -0
  69. data/spec/syntax/versions/1.8.6_spec.rb +2 -2
  70. data/spec/syntax/versions/2.0.0_spec.rb +2 -2
  71. data/spec/syntax/versions/aliases_spec.rb +1 -0
  72. metadata +27 -26
  73. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  74. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  75. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  76. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  77. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  78. data/spec/support/runner.rb +0 -42
  79. data/spec/support/warning_extractor.rb +0 -60
@@ -8,6 +8,7 @@ age=10.0: age=10.0
8
8
  age=11.0: age=11.0
9
9
  age=12.0: age=12.0
10
10
  age=12.1: age=12.1
11
+ age=13.0: age=13.0
11
12
  age=2.0: age=2.0
12
13
  age=2.1: age=2.1
13
14
  age=3.0: age=3.0
@@ -64,6 +65,7 @@ changeswhenlowercased: changes_when_lowercased
64
65
  changeswhentitlecased: changes_when_titlecased
65
66
  changeswhenuppercased: changes_when_uppercased
66
67
  cherokee: cherokee
68
+ chorasmian: chorasmian
67
69
  closepunctuation: close_punctuation
68
70
  cntrl: cntrl
69
71
  common: common
@@ -83,6 +85,7 @@ deseret: deseret
83
85
  devanagari: devanagari
84
86
  diacritic: diacritic
85
87
  digit: digit
88
+ divesakuru: dives_akuru
86
89
  dogra: dogra
87
90
  duployan: duployan
88
91
  egyptianhieroglyphs: egyptian_hieroglyphs
@@ -167,6 +170,7 @@ incham: in_cham
167
170
  incherokee: in_cherokee
168
171
  incherokeesupplement: in_cherokee_supplement
169
172
  inchesssymbols: in_chess_symbols
173
+ inchorasmian: in_chorasmian
170
174
  incjkcompatibility: in_cjk_compatibility
171
175
  incjkcompatibilityforms: in_cjk_compatibility_forms
172
176
  incjkcompatibilityideographs: in_cjk_compatibility_ideographs
@@ -181,6 +185,7 @@ incjkunifiedideographsextensionc: in_cjk_unified_ideographs_extension_c
181
185
  incjkunifiedideographsextensiond: in_cjk_unified_ideographs_extension_d
182
186
  incjkunifiedideographsextensione: in_cjk_unified_ideographs_extension_e
183
187
  incjkunifiedideographsextensionf: in_cjk_unified_ideographs_extension_f
188
+ incjkunifiedideographsextensiong: in_cjk_unified_ideographs_extension_g
184
189
  incombiningdiacriticalmarks: in_combining_diacritical_marks
185
190
  incombiningdiacriticalmarksextended: in_combining_diacritical_marks_extended
186
191
  incombiningdiacriticalmarksforsymbols: in_combining_diacritical_marks_for_symbols
@@ -204,6 +209,7 @@ indeseret: in_deseret
204
209
  indevanagari: in_devanagari
205
210
  indevanagariextended: in_devanagari_extended
206
211
  indingbats: in_dingbats
212
+ indivesakuru: in_dives_akuru
207
213
  indogra: in_dogra
208
214
  indominotiles: in_domino_tiles
209
215
  induployan: in_duployan
@@ -269,6 +275,7 @@ inkatakana: in_katakana
269
275
  inkatakanaphoneticextensions: in_katakana_phonetic_extensions
270
276
  inkayahli: in_kayah_li
271
277
  inkharoshthi: in_kharoshthi
278
+ inkhitansmallscript: in_khitan_small_script
272
279
  inkhmer: in_khmer
273
280
  inkhmersymbols: in_khmer_symbols
274
281
  inkhojki: in_khojki
@@ -288,6 +295,7 @@ inlineara: in_linear_a
288
295
  inlinearbideograms: in_linear_b_ideograms
289
296
  inlinearbsyllabary: in_linear_b_syllabary
290
297
  inlisu: in_lisu
298
+ inlisusupplement: in_lisu_supplement
291
299
  inlowsurrogates: in_low_surrogates
292
300
  inlycian: in_lycian
293
301
  inlydian: in_lydian
@@ -395,6 +403,7 @@ insupplementaryprivateuseareab: in_supplementary_private_use_area_b
395
403
  insuttonsignwriting: in_sutton_signwriting
396
404
  insylotinagri: in_syloti_nagri
397
405
  insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
406
+ insymbolsforlegacycomputing: in_symbols_for_legacy_computing
398
407
  insyriac: in_syriac
399
408
  insyriacsupplement: in_syriac_supplement
400
409
  intagalog: in_tagalog
@@ -409,6 +418,7 @@ intamil: in_tamil
409
418
  intamilsupplement: in_tamil_supplement
410
419
  intangut: in_tangut
411
420
  intangutcomponents: in_tangut_components
421
+ intangutsupplement: in_tangut_supplement
412
422
  intelugu: in_telugu
413
423
  inthaana: in_thaana
414
424
  inthai: in_thai
@@ -426,6 +436,7 @@ invedicextensions: in_vedic_extensions
426
436
  inverticalforms: in_vertical_forms
427
437
  inwancho: in_wancho
428
438
  inwarangciti: in_warang_citi
439
+ inyezidi: in_yezidi
429
440
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
430
441
  inyiradicals: in_yi_radicals
431
442
  inyisyllables: in_yi_syllables
@@ -437,6 +448,7 @@ kannada: kannada
437
448
  katakana: katakana
438
449
  kayahli: kayah_li
439
450
  kharoshthi: kharoshthi
451
+ khitansmallscript: khitan_small_script
440
452
  khmer: khmer
441
453
  khojki: khojki
442
454
  khudawadi: khudawadi
@@ -590,5 +602,6 @@ xdigit: xdigit
590
602
  xidcontinue: xid_continue
591
603
  xidstart: xid_start
592
604
  xposixpunct: xposixpunct
605
+ yezidi: yezidi
593
606
  yi: yi
594
607
  zanabazarsquare: zanabazar_square
@@ -28,6 +28,7 @@ cari: carian
28
28
  cc: control
29
29
  cf: format
30
30
  cher: cherokee
31
+ chrs: chorasmian
31
32
  ci: case_ignorable
32
33
  cn: unassigned
33
34
  co: private_use
@@ -45,12 +46,17 @@ dep: deprecated
45
46
  deva: devanagari
46
47
  di: default_ignorable_code_point
47
48
  dia: diacritic
49
+ diak: dives_akuru
48
50
  dogr: dogra
49
51
  dsrt: deseret
50
52
  dupl: duployan
53
+ ebase: emoji_modifier_base
54
+ ecomp: emoji_component
51
55
  egyp: egyptian_hieroglyphs
52
56
  elba: elbasan
53
57
  elym: elymaic
58
+ emod: emoji_modifier
59
+ epres: emoji_presentation
54
60
  ethi: ethiopic
55
61
  ext: extender
56
62
  geor: georgian
@@ -89,6 +95,7 @@ kana: katakana
89
95
  khar: kharoshthi
90
96
  khmr: khmer
91
97
  khoj: khojki
98
+ kits: khitan_small_script
92
99
  knda: kannada
93
100
  kthi: kaithi
94
101
  l: letter
@@ -127,7 +134,7 @@ mroo: mro
127
134
  mtei: meetei_mayek
128
135
  mult: multani
129
136
  mymr: myanmar
130
- n: number
137
+ "n": number
131
138
  nand: nandinagari
132
139
  narb: old_north_arabian
133
140
  nbat: nabataean
@@ -226,6 +233,7 @@ xidc: xid_continue
226
233
  xids: xid_start
227
234
  xpeo: old_persian
228
235
  xsux: cuneiform
236
+ yezi: yezidi
229
237
  yiii: yi
230
238
  z: separator
231
239
  zanb: zanabazar_square
@@ -20,7 +20,7 @@
20
20
 
21
21
  group_open = '(';
22
22
  group_close = ')';
23
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
24
24
 
25
25
  set_open = '[';
26
26
  set_close = ']';
@@ -37,7 +37,7 @@
37
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
38
38
 
39
39
 
40
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
41
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
42
42
  character_equivalent = '[=' . alpha . '=]';
43
43
 
@@ -58,6 +58,8 @@
58
58
 
59
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
60
60
 
61
+ sequence_char = [CMcux];
62
+
61
63
  zero_or_one = '?' | '??' | '?+';
62
64
  zero_or_more = '*' | '*?' | '*+';
63
65
  one_or_more = '+' | '+?' | '++';
@@ -106,11 +108,15 @@
106
108
 
107
109
  group_named = ('?' . group_name );
108
110
 
109
- group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
- ("'" . group_name_id_sq? . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
111
115
 
112
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
113
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
114
120
 
115
121
  group_type = group_atomic | group_passive | group_absence | group_named;
116
122
 
@@ -121,7 +127,7 @@
121
127
 
122
128
  # characters that 'break' a literal
123
129
  meta_char = dot | backslash | alternation |
124
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
125
131
  line_anchor | quantifier_greedy;
126
132
 
127
133
  literal_delimiters = ']' | '}';
@@ -130,10 +136,12 @@
130
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
131
137
 
132
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
133
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
134
144
 
135
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
136
- multi_codepoint_char_type | [0-9cCM];
137
145
 
138
146
  # EOF error, used where it can be detected
139
147
  action premature_end_error {
@@ -228,13 +236,13 @@
228
236
  emit(type, class_name.to_sym, text)
229
237
  };
230
238
 
231
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
232
- emit(:set, :collation, copy(data, ts, te))
233
- };
234
-
235
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
236
- emit(:set, :equivalent, copy(data, ts, te))
237
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
238
246
 
239
247
  meta_char > (set_meta, 1) {
240
248
  emit(:literal, :literal, copy(data, ts, te))
@@ -249,16 +257,16 @@
249
257
  # set escapes scanner
250
258
  # --------------------------------------------------------------------------
251
259
  set_escape_sequence := |*
252
- non_set_escape > (escaped_set_alpha, 2) {
253
- emit(:escape, :literal, copy(data, ts-1, te))
254
- fret;
255
- };
256
-
257
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
258
261
  fhold;
259
262
  fnext character_set;
260
263
  fcall escape_sequence;
261
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
262
270
  *|;
263
271
 
264
272
 
@@ -538,67 +546,35 @@
538
546
 
539
547
  # Group backreference, named and numbered
540
548
  # ------------------------------------------------------------------------
541
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
542
550
  case text = copy(data, ts, te)
543
- when /^\\([gk])(<>|'')/ # angle brackets
544
- validation_error(:backref, 'ref/call', 'ref ID is empty')
545
-
546
- # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
- # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
- when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
549
- if $1 == 'k'
550
- emit(:backref, :name_ref_ab, text)
551
- else
552
- emit(:backref, :name_call_ab, text)
553
- end
554
-
555
- when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
556
- if $1 == 'k'
557
- emit(:backref, :name_ref_sq, text)
558
- else
559
- emit(:backref, :name_call_sq, text)
560
- end
561
-
562
- when /^\\([gk])<\d+>/ # angle-brackets
563
- if $1 == 'k'
564
- emit(:backref, :number_ref_ab, text)
565
- else
566
- emit(:backref, :number_call_ab, text)
567
- end
568
-
569
- when /^\\([gk])'\d+'/ # single quotes
570
- if $1 == 'k'
571
- emit(:backref, :number_ref_sq, text)
572
- else
573
- emit(:backref, :number_call_sq, text)
574
- end
575
-
576
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
577
- if $1 == 'k'
578
- emit(:backref, :number_rel_ref_ab, text)
579
- else
580
- emit(:backref, :number_rel_call_ab, text)
581
- end
582
-
583
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
584
- if $1 == 'k'
585
- emit(:backref, :number_rel_ref_sq, text)
586
- else
587
- emit(:backref, :number_rel_call_sq, text)
588
- end
589
-
590
- when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
591
- emit(:backref, :name_recursion_ref_ab, text)
592
-
593
- when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
594
- emit(:backref, :name_recursion_ref_sq, text)
595
-
596
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
597
- emit(:backref, :number_recursion_ref_ab, text)
598
-
599
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
600
- emit(:backref, :number_recursion_ref_sq, text)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
601
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
602
578
  end
603
579
  };
604
580
 
@@ -675,12 +651,14 @@
675
651
  # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
676
652
  # This file was generated from lib/regexp_parser/scanner/scanner.rl
677
653
 
654
+ require 'regexp_parser/error'
655
+
678
656
  class Regexp::Scanner
679
657
  # General scanner error (catch all)
680
- class ScannerError < StandardError; end
658
+ class ScannerError < Regexp::Parser::Error; end
681
659
 
682
660
  # Base for all scanner validation errors
683
- class ValidationError < StandardError
661
+ class ValidationError < Regexp::Parser::Error
684
662
  def initialize(reason)
685
663
  super reason
686
664
  end
@@ -782,14 +760,13 @@ class Regexp::Scanner
782
760
 
783
761
  # lazy-load property maps when first needed
784
762
  require 'yaml'
785
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
786
763
 
787
764
  def self.short_prop_map
788
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
765
+ @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
789
766
  end
790
767
 
791
768
  def self.long_prop_map
792
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
769
+ @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
793
770
  end
794
771
 
795
772
  # Emits an array with the details of the scanned pattern