regexp_parser 1.3.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +65 -1
  3. data/Gemfile +3 -3
  4. data/README.md +10 -14
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
  18. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  19. data/lib/regexp_parser/expression/sequence.rb +3 -6
  20. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  21. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  22. data/lib/regexp_parser/lexer.rb +30 -44
  23. data/lib/regexp_parser/parser.rb +47 -24
  24. data/lib/regexp_parser/scanner.rb +1159 -1329
  25. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  26. data/lib/regexp_parser/scanner/properties/long.yml +34 -1
  27. data/lib/regexp_parser/scanner/properties/short.yml +12 -0
  28. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  29. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  30. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
  31. data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  33. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  34. data/lib/regexp_parser/version.rb +1 -1
  35. data/regexp_parser.gemspec +3 -3
  36. data/spec/expression/base_spec.rb +94 -0
  37. data/spec/expression/clone_spec.rb +120 -0
  38. data/spec/expression/conditional_spec.rb +89 -0
  39. data/spec/expression/free_space_spec.rb +27 -0
  40. data/spec/expression/methods/match_length_spec.rb +161 -0
  41. data/spec/expression/methods/match_spec.rb +25 -0
  42. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  43. data/spec/expression/methods/tests_spec.rb +99 -0
  44. data/spec/expression/methods/traverse_spec.rb +161 -0
  45. data/spec/expression/options_spec.rb +128 -0
  46. data/spec/expression/root_spec.rb +9 -0
  47. data/spec/expression/sequence_spec.rb +9 -0
  48. data/spec/expression/subexpression_spec.rb +50 -0
  49. data/spec/expression/to_h_spec.rb +26 -0
  50. data/spec/expression/to_s_spec.rb +100 -0
  51. data/spec/lexer/all_spec.rb +22 -0
  52. data/spec/lexer/conditionals_spec.rb +53 -0
  53. data/spec/lexer/escapes_spec.rb +14 -0
  54. data/spec/lexer/keep_spec.rb +10 -0
  55. data/spec/lexer/literals_spec.rb +89 -0
  56. data/spec/lexer/nesting_spec.rb +99 -0
  57. data/spec/lexer/refcalls_spec.rb +55 -0
  58. data/spec/parser/all_spec.rb +43 -0
  59. data/spec/parser/alternation_spec.rb +88 -0
  60. data/spec/parser/anchors_spec.rb +17 -0
  61. data/spec/parser/conditionals_spec.rb +179 -0
  62. data/spec/parser/errors_spec.rb +30 -0
  63. data/spec/parser/escapes_spec.rb +121 -0
  64. data/spec/parser/free_space_spec.rb +130 -0
  65. data/spec/parser/groups_spec.rb +108 -0
  66. data/spec/parser/keep_spec.rb +6 -0
  67. data/spec/parser/posix_classes_spec.rb +8 -0
  68. data/spec/parser/properties_spec.rb +115 -0
  69. data/spec/parser/quantifiers_spec.rb +51 -0
  70. data/spec/parser/refcalls_spec.rb +112 -0
  71. data/spec/parser/set/intersections_spec.rb +127 -0
  72. data/spec/parser/set/ranges_spec.rb +111 -0
  73. data/spec/parser/sets_spec.rb +178 -0
  74. data/spec/parser/types_spec.rb +18 -0
  75. data/spec/scanner/all_spec.rb +18 -0
  76. data/spec/scanner/anchors_spec.rb +21 -0
  77. data/spec/scanner/conditionals_spec.rb +128 -0
  78. data/spec/scanner/errors_spec.rb +68 -0
  79. data/spec/scanner/escapes_spec.rb +53 -0
  80. data/spec/scanner/free_space_spec.rb +133 -0
  81. data/spec/scanner/groups_spec.rb +52 -0
  82. data/spec/scanner/keep_spec.rb +10 -0
  83. data/spec/scanner/literals_spec.rb +49 -0
  84. data/spec/scanner/meta_spec.rb +18 -0
  85. data/spec/scanner/properties_spec.rb +64 -0
  86. data/spec/scanner/quantifiers_spec.rb +20 -0
  87. data/spec/scanner/refcalls_spec.rb +36 -0
  88. data/spec/scanner/sets_spec.rb +102 -0
  89. data/spec/scanner/types_spec.rb +14 -0
  90. data/spec/spec_helper.rb +15 -0
  91. data/{test → spec}/support/runner.rb +9 -8
  92. data/spec/support/shared_examples.rb +77 -0
  93. data/{test → spec}/support/warning_extractor.rb +5 -7
  94. data/spec/syntax/syntax_spec.rb +48 -0
  95. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  96. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  97. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  98. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  99. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  100. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  101. data/spec/syntax/versions/aliases_spec.rb +37 -0
  102. data/spec/token/token_spec.rb +85 -0
  103. metadata +144 -143
  104. data/test/expression/test_all.rb +0 -12
  105. data/test/expression/test_base.rb +0 -90
  106. data/test/expression/test_clone.rb +0 -89
  107. data/test/expression/test_conditionals.rb +0 -113
  108. data/test/expression/test_free_space.rb +0 -35
  109. data/test/expression/test_set.rb +0 -84
  110. data/test/expression/test_strfregexp.rb +0 -230
  111. data/test/expression/test_subexpression.rb +0 -58
  112. data/test/expression/test_tests.rb +0 -99
  113. data/test/expression/test_to_h.rb +0 -59
  114. data/test/expression/test_to_s.rb +0 -104
  115. data/test/expression/test_traverse.rb +0 -161
  116. data/test/helpers.rb +0 -10
  117. data/test/lexer/test_all.rb +0 -41
  118. data/test/lexer/test_conditionals.rb +0 -127
  119. data/test/lexer/test_keep.rb +0 -24
  120. data/test/lexer/test_literals.rb +0 -130
  121. data/test/lexer/test_nesting.rb +0 -132
  122. data/test/lexer/test_refcalls.rb +0 -56
  123. data/test/parser/set/test_intersections.rb +0 -127
  124. data/test/parser/set/test_ranges.rb +0 -111
  125. data/test/parser/test_all.rb +0 -64
  126. data/test/parser/test_alternation.rb +0 -92
  127. data/test/parser/test_anchors.rb +0 -34
  128. data/test/parser/test_conditionals.rb +0 -187
  129. data/test/parser/test_errors.rb +0 -63
  130. data/test/parser/test_escapes.rb +0 -134
  131. data/test/parser/test_free_space.rb +0 -139
  132. data/test/parser/test_groups.rb +0 -289
  133. data/test/parser/test_keep.rb +0 -21
  134. data/test/parser/test_posix_classes.rb +0 -27
  135. data/test/parser/test_properties.rb +0 -133
  136. data/test/parser/test_quantifiers.rb +0 -301
  137. data/test/parser/test_refcalls.rb +0 -186
  138. data/test/parser/test_sets.rb +0 -179
  139. data/test/parser/test_types.rb +0 -50
  140. data/test/scanner/test_all.rb +0 -38
  141. data/test/scanner/test_anchors.rb +0 -38
  142. data/test/scanner/test_conditionals.rb +0 -184
  143. data/test/scanner/test_errors.rb +0 -91
  144. data/test/scanner/test_escapes.rb +0 -56
  145. data/test/scanner/test_free_space.rb +0 -200
  146. data/test/scanner/test_groups.rb +0 -79
  147. data/test/scanner/test_keep.rb +0 -35
  148. data/test/scanner/test_literals.rb +0 -89
  149. data/test/scanner/test_meta.rb +0 -40
  150. data/test/scanner/test_properties.rb +0 -312
  151. data/test/scanner/test_quantifiers.rb +0 -37
  152. data/test/scanner/test_refcalls.rb +0 -52
  153. data/test/scanner/test_scripts.rb +0 -53
  154. data/test/scanner/test_sets.rb +0 -119
  155. data/test/scanner/test_types.rb +0 -35
  156. data/test/scanner/test_unicode_blocks.rb +0 -30
  157. data/test/support/disable_autotest.rb +0 -8
  158. data/test/syntax/test_all.rb +0 -6
  159. data/test/syntax/test_syntax.rb +0 -61
  160. data/test/syntax/test_syntax_token_map.rb +0 -25
  161. data/test/syntax/versions/test_1.8.rb +0 -55
  162. data/test/syntax/versions/test_1.9.1.rb +0 -36
  163. data/test/syntax/versions/test_1.9.3.rb +0 -32
  164. data/test/syntax/versions/test_2.0.0.rb +0 -37
  165. data/test/syntax/versions/test_2.2.0.rb +0 -32
  166. data/test/syntax/versions/test_aliases.rb +0 -129
  167. data/test/syntax/versions/test_all.rb +0 -5
  168. data/test/test_all.rb +0 -5
  169. data/test/token/test_all.rb +0 -2
  170. data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -5,6 +5,9 @@
5
5
  adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
+ age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
8
11
  age=2.0: age=2.0
9
12
  age=2.1: age=2.1
10
13
  age=3.0: age=3.0
@@ -63,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
63
66
  cherokee: cherokee
64
67
  closepunctuation: close_punctuation
65
68
  cntrl: cntrl
66
- combiningmark: combining_mark
67
69
  common: common
68
70
  connectorpunctuation: connector_punctuation
69
71
  control: control
@@ -81,9 +83,11 @@ deseret: deseret
81
83
  devanagari: devanagari
82
84
  diacritic: diacritic
83
85
  digit: digit
86
+ dogra: dogra
84
87
  duployan: duployan
85
88
  egyptianhieroglyphs: egyptian_hieroglyphs
86
89
  elbasan: elbasan
90
+ elymaic: elymaic
87
91
  emoji: emoji
88
92
  emojicomponent: emoji_component
89
93
  emojimodifier: emoji_modifier
@@ -104,9 +108,11 @@ graphemeextend: grapheme_extend
104
108
  graphemelink: grapheme_link
105
109
  greek: greek
106
110
  gujarati: gujarati
111
+ gunjalagondi: gunjala_gondi
107
112
  gurmukhi: gurmukhi
108
113
  han: han
109
114
  hangul: hangul
115
+ hanifirohingya: hanifi_rohingya
110
116
  hanunoo: hanunoo
111
117
  hatran: hatran
112
118
  hebrew: hebrew
@@ -160,6 +166,7 @@ inchakma: in_chakma
160
166
  incham: in_cham
161
167
  incherokee: in_cherokee
162
168
  incherokeesupplement: in_cherokee_supplement
169
+ inchesssymbols: in_chess_symbols
163
170
  incjkcompatibility: in_cjk_compatibility
164
171
  incjkcompatibilityforms: in_cjk_compatibility_forms
165
172
  incjkcompatibilityideographs: in_cjk_compatibility_ideographs
@@ -197,11 +204,14 @@ indeseret: in_deseret
197
204
  indevanagari: in_devanagari
198
205
  indevanagariextended: in_devanagari_extended
199
206
  indingbats: in_dingbats
207
+ indogra: in_dogra
200
208
  indominotiles: in_domino_tiles
201
209
  induployan: in_duployan
202
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
203
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
204
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
205
215
  inemoticons: in_emoticons
206
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
207
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -215,6 +225,7 @@ ingeneralpunctuation: in_general_punctuation
215
225
  ingeometricshapes: in_geometric_shapes
216
226
  ingeometricshapesextended: in_geometric_shapes_extended
217
227
  ingeorgian: in_georgian
228
+ ingeorgianextended: in_georgian_extended
218
229
  ingeorgiansupplement: in_georgian_supplement
219
230
  inglagolitic: in_glagolitic
220
231
  inglagoliticsupplement: in_glagolitic_supplement
@@ -223,6 +234,7 @@ ingrantha: in_grantha
223
234
  ingreekandcoptic: in_greek_and_coptic
224
235
  ingreekextended: in_greek_extended
225
236
  ingujarati: in_gujarati
237
+ ingunjalagondi: in_gunjala_gondi
226
238
  ingurmukhi: in_gurmukhi
227
239
  inhalfwidthandfullwidthforms: in_halfwidth_and_fullwidth_forms
228
240
  inhangulcompatibilityjamo: in_hangul_compatibility_jamo
@@ -230,6 +242,7 @@ inhanguljamo: in_hangul_jamo
230
242
  inhanguljamoextendeda: in_hangul_jamo_extended_a
231
243
  inhanguljamoextendedb: in_hangul_jamo_extended_b
232
244
  inhangulsyllables: in_hangul_syllables
245
+ inhanifirohingya: in_hanifi_rohingya
233
246
  inhanunoo: in_hanunoo
234
247
  inhatran: in_hatran
235
248
  inhebrew: in_hebrew
@@ -240,6 +253,7 @@ inhiragana: in_hiragana
240
253
  inideographicdescriptioncharacters: in_ideographic_description_characters
241
254
  inideographicsymbolsandpunctuation: in_ideographic_symbols_and_punctuation
242
255
  inimperialaramaic: in_imperial_aramaic
256
+ inindicsiyaqnumbers: in_indic_siyaq_numbers
243
257
  ininscriptionalpahlavi: in_inscriptional_pahlavi
244
258
  ininscriptionalparthian: in_inscriptional_parthian
245
259
  inipaextensions: in_ipa_extensions
@@ -279,6 +293,7 @@ inlycian: in_lycian
279
293
  inlydian: in_lydian
280
294
  inmahajani: in_mahajani
281
295
  inmahjongtiles: in_mahjong_tiles
296
+ inmakasar: in_makasar
282
297
  inmalayalam: in_malayalam
283
298
  inmandaic: in_mandaic
284
299
  inmanichaean: in_manichaean
@@ -286,6 +301,8 @@ inmarchen: in_marchen
286
301
  inmasaramgondi: in_masaram_gondi
287
302
  inmathematicalalphanumericsymbols: in_mathematical_alphanumeric_symbols
288
303
  inmathematicaloperators: in_mathematical_operators
304
+ inmayannumerals: in_mayan_numerals
305
+ inmedefaidrin: in_medefaidrin
289
306
  inmeeteimayek: in_meetei_mayek
290
307
  inmeeteimayekextensions: in_meetei_mayek_extensions
291
308
  inmendekikakui: in_mende_kikakui
@@ -309,12 +326,14 @@ inmyanmar: in_myanmar
309
326
  inmyanmarextendeda: in_myanmar_extended_a
310
327
  inmyanmarextendedb: in_myanmar_extended_b
311
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
312
330
  innewa: in_newa
313
331
  innewtailue: in_new_tai_lue
314
332
  innko: in_nko
315
333
  innoblock: in_no_block
316
334
  innumberforms: in_number_forms
317
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
318
337
  inogham: in_ogham
319
338
  inolchiki: in_ol_chiki
320
339
  inoldhungarian: in_old_hungarian
@@ -322,6 +341,7 @@ inolditalic: in_old_italic
322
341
  inoldnortharabian: in_old_north_arabian
323
342
  inoldpermic: in_old_permic
324
343
  inoldpersian: in_old_persian
344
+ inoldsogdian: in_old_sogdian
325
345
  inoldsoutharabian: in_old_south_arabian
326
346
  inoldturkic: in_old_turkic
327
347
  inopticalcharacterrecognition: in_optical_character_recognition
@@ -329,6 +349,7 @@ inoriya: in_oriya
329
349
  inornamentaldingbats: in_ornamental_dingbats
330
350
  inosage: in_osage
331
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
332
353
  inpahawhhmong: in_pahawh_hmong
333
354
  inpalmyrene: in_palmyrene
334
355
  inpaucinhau: in_pau_cin_hau
@@ -354,6 +375,8 @@ insiddham: in_siddham
354
375
  insinhala: in_sinhala
355
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
356
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
379
+ insogdian: in_sogdian
357
380
  insorasompeng: in_sora_sompeng
358
381
  insoyombo: in_soyombo
359
382
  inspacingmodifierletters: in_spacing_modifier_letters
@@ -371,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
371
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
372
395
  insuttonsignwriting: in_sutton_signwriting
373
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
374
398
  insyriac: in_syriac
375
399
  insyriacsupplement: in_syriac_supplement
376
400
  intagalog: in_tagalog
@@ -382,6 +406,7 @@ intaiviet: in_tai_viet
382
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
383
407
  intakri: in_takri
384
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
385
410
  intangut: in_tangut
386
411
  intangutcomponents: in_tangut_components
387
412
  intelugu: in_telugu
@@ -399,6 +424,7 @@ invariationselectors: in_variation_selectors
399
424
  invariationselectorssupplement: in_variation_selectors_supplement
400
425
  invedicextensions: in_vedic_extensions
401
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
402
428
  inwarangciti: in_warang_citi
403
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
404
430
  inyiradicals: in_yi_radicals
@@ -431,6 +457,7 @@ lowercaseletter: lowercase_letter
431
457
  lycian: lycian
432
458
  lydian: lydian
433
459
  mahajani: mahajani
460
+ makasar: makasar
434
461
  malayalam: malayalam
435
462
  mandaic: mandaic
436
463
  manichaean: manichaean
@@ -439,6 +466,7 @@ mark: mark
439
466
  masaramgondi: masaram_gondi
440
467
  math: math
441
468
  mathsymbol: math_symbol
469
+ medefaidrin: medefaidrin
442
470
  meeteimayek: meetei_mayek
443
471
  mendekikakui: mende_kikakui
444
472
  meroiticcursive: meroitic_cursive
@@ -452,6 +480,7 @@ mro: mro
452
480
  multani: multani
453
481
  myanmar: myanmar
454
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
455
484
  newa: newa
456
485
  newline: newline
457
486
  newtailue: new_tai_lue
@@ -460,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
460
489
  nonspacingmark: nonspacing_mark
461
490
  number: number
462
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
463
493
  ogham: ogham
464
494
  olchiki: ol_chiki
465
495
  oldhungarian: old_hungarian
@@ -467,6 +497,7 @@ olditalic: old_italic
467
497
  oldnortharabian: old_north_arabian
468
498
  oldpermic: old_permic
469
499
  oldpersian: old_persian
500
+ oldsogdian: old_sogdian
470
501
  oldsoutharabian: old_south_arabian
471
502
  oldturkic: old_turkic
472
503
  openpunctuation: open_punctuation
@@ -515,6 +546,7 @@ siddham: siddham
515
546
  signwriting: signwriting
516
547
  sinhala: sinhala
517
548
  softdotted: soft_dotted
549
+ sogdian: sogdian
518
550
  sorasompeng: sora_sompeng
519
551
  soyombo: soyombo
520
552
  space: space
@@ -550,6 +582,7 @@ uppercase: uppercase
550
582
  uppercaseletter: uppercase_letter
551
583
  vai: vai
552
584
  variationselector: variation_selector
585
+ wancho: wancho
553
586
  warangciti: warang_citi
554
587
  whitespace: white_space
555
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -44,14 +45,17 @@ dep: deprecated
44
45
  deva: devanagari
45
46
  di: default_ignorable_code_point
46
47
  dia: diacritic
48
+ dogr: dogra
47
49
  dsrt: deseret
48
50
  dupl: duployan
49
51
  egyp: egyptian_hieroglyphs
50
52
  elba: elbasan
53
+ elym: elymaic
51
54
  ethi: ethiopic
52
55
  ext: extender
53
56
  geor: georgian
54
57
  glag: glagolitic
58
+ gong: gunjala_gondi
55
59
  gonm: masaram_gondi
56
60
  goth: gothic
57
61
  gran: grantha
@@ -70,6 +74,7 @@ hex: hex_digit
70
74
  hira: hiragana
71
75
  hluw: anatolian_hieroglyphs
72
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
73
78
  hung: old_hungarian
74
79
  idc: id_continue
75
80
  ideo: ideographic
@@ -105,11 +110,13 @@ lyci: lycian
105
110
  lydi: lydian
106
111
  m: mark
107
112
  mahj: mahajani
113
+ maka: makasar
108
114
  mand: mandaic
109
115
  mani: manichaean
110
116
  marc: marchen
111
117
  mc: spacing_mark
112
118
  me: enclosing_mark
119
+ medf: medefaidrin
113
120
  mend: mende_kikakui
114
121
  merc: meroitic_cursive
115
122
  mero: meroitic_hieroglyphs
@@ -121,6 +128,7 @@ mtei: meetei_mayek
121
128
  mult: multani
122
129
  mymr: myanmar
123
130
  n: number
131
+ nand: nandinagari
124
132
  narb: old_north_arabian
125
133
  nbat: nabataean
126
134
  nchar: noncharacter_code_point
@@ -168,6 +176,7 @@ qaai: inherited
168
176
  qmark: quotation_mark
169
177
  ri: regional_indicator
170
178
  rjng: rejang
179
+ rohg: hanifi_rohingya
171
180
  runr: runic
172
181
  s: symbol
173
182
  samr: samaritan
@@ -184,6 +193,8 @@ sinh: sinhala
184
193
  sk: modifier_symbol
185
194
  sm: math_symbol
186
195
  so: other_symbol
196
+ sogd: sogdian
197
+ sogo: old_sogdian
187
198
  sora: sora_sompeng
188
199
  soyo: soyombo
189
200
  sterm: sentence_terminal
@@ -209,6 +220,7 @@ uideo: unified_ideograph
209
220
  vaii: vai
210
221
  vs: variation_selector
211
222
  wara: warang_citi
223
+ wcho: wancho
212
224
  wspace: white_space
213
225
  xidc: xid_continue
214
226
  xids: xid_start
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -82,7 +82,8 @@
82
82
  assertion_lookbehind = '?<=';
83
83
  assertion_nlookbehind = '?<!';
84
84
 
85
- group_options = '?' . [\-mixdau];
85
+ # try to treat every other group head as options group, like Ruby
86
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
87
 
87
88
  group_ref = [gk];
88
89
  group_name_char = (alnum | '_');
@@ -135,41 +136,35 @@
135
136
  # Invalid sequence error, used from sequences, like escapes and sets
136
137
  action invalid_sequence_error {
137
138
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
139
+ validation_error(:sequence, 'sequence', text)
139
140
  }
140
141
 
141
142
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
143
+ action group_opened { self.group_depth = group_depth + 1 }
144
+ action group_closed { self.group_depth = group_depth - 1 }
145
+ action set_opened { self.set_depth = set_depth + 1 }
146
+ action set_closed { self.set_depth = set_depth - 1 }
144
147
 
145
148
  # Character set scanner, continues consuming characters until it meets the
146
149
  # closing bracket of the set.
147
150
  # --------------------------------------------------------------------------
148
151
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
152
+ set_close > (set_meta, 2) @set_closed {
153
153
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
154
+ if in_set?
158
155
  fret;
156
+ else
157
+ fgoto main;
159
158
  end
160
159
  };
161
160
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
161
+ '-]' @set_closed { # special case, emits two tokens
162
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
164
+ if in_set?
172
165
  fret;
166
+ else
167
+ fgoto main;
173
168
  end
174
169
  };
175
170
 
@@ -207,14 +202,12 @@
207
202
  fcall set_escape_sequence;
208
203
  };
209
204
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
205
+ set_open >(open_bracket, 1) >set_opened {
213
206
  emit(:set, :open, *text(data, ts, te))
214
207
  fcall character_set;
215
208
  };
216
209
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
211
  text = text(data, ts, te).first
219
212
 
220
213
  type = :posixclass
@@ -227,11 +220,11 @@
227
220
  emit(type, class_name.to_sym, text, ts, te)
228
221
  };
229
222
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
223
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
224
  emit(:set, :collation, *text(data, ts, te))
232
225
  };
233
226
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
227
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
228
  emit(:set, :equivalent, *text(data, ts, te))
236
229
  };
237
230
 
@@ -337,44 +330,24 @@
337
330
  };
338
331
 
339
332
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
333
+ emit_meta_control_sequence(data, ts, te, :control)
351
334
  fret;
352
335
  };
353
336
 
354
337
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
338
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
339
  fret;
367
340
  };
368
341
 
369
342
  char_type_char > (escaped_alpha, 2) {
370
343
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
344
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
345
  fcall char_type;
373
346
  };
374
347
 
375
348
  property_char > (escaped_alpha, 2) {
376
349
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
351
  fcall unicode_property;
379
352
  };
380
353
 
@@ -412,8 +385,7 @@
412
385
  };
413
386
 
414
387
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
388
+ if conditional_stack.last == group_depth
417
389
  emit(:conditional, :separator, *text(data, ts, te))
418
390
  else
419
391
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +414,12 @@
442
414
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
415
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
416
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
417
  end
449
418
  };
450
419
 
451
420
  # Character sets
452
421
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
422
+ set_open >set_opened {
457
423
  emit(:set, :open, *text(data, ts, te))
458
424
  fcall character_set;
459
425
  };
@@ -465,9 +431,7 @@
465
431
  conditional {
466
432
  text = text(data, ts, te).first
467
433
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
434
+ conditional_stack << group_depth
471
435
 
472
436
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
437
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +460,11 @@
496
460
  # (?imxdau-imx:subexp) option on/off for subexp
497
461
  # ------------------------------------------------------------------------
498
462
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
463
+ text = text(data, ts, te).first
464
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
+ end
467
+ emit_options(text, ts, te)
500
468
  };
501
469
 
502
470
  # Assertions
@@ -528,19 +496,15 @@
528
496
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
497
  when '(?~'; emit(:group, :absence, text, ts, te)
530
498
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
499
+ when /^\(\?(?:<>|'')/
500
+ validation_error(:group, 'named group', 'name is empty')
533
501
 
502
+ when /^\(\?<\w*>/
534
503
  emit(:group, :named_ab, text, ts, te)
535
504
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
505
+ when /^\(\?'\w*'/
539
506
  emit(:group, :named_sq, text, ts, te)
540
507
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
508
  end
545
509
  };
546
510
 
@@ -550,20 +514,13 @@
550
514
  };
551
515
 
552
516
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
517
+ if conditional_stack.last == group_depth + 1
557
518
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
519
+ emit(:conditional, :close, *text(data, ts, te))
562
520
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
521
+ if spacing_stack.length > 1 &&
522
+ spacing_stack.last[:depth] == group_depth + 1
565
523
  spacing_stack.pop
566
-
567
524
  self.free_spacing = spacing_stack.last[:free_spacing]
568
525
  end
569
526
 
@@ -576,11 +533,8 @@
576
533
  # ------------------------------------------------------------------------
577
534
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
535
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
536
+ when /^\\([gk])(<>|'')/ # angle brackets
537
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
538
 
585
539
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
540
  if $1 == 'k'
@@ -636,9 +590,6 @@
636
590
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
591
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
592
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
593
  end
643
594
  };
644
595
 
@@ -786,7 +737,7 @@ class Regexp::Scanner
786
737
  input = input_object
787
738
  self.free_spacing = false
788
739
  end
789
-
740
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
741
 
791
742
  data = input.unpack("c*") if input.is_a?(String)
792
743
  eof = data.length
@@ -794,15 +745,9 @@ class Regexp::Scanner
794
745
  self.tokens = []
795
746
  self.block = block_given? ? block : nil
796
747
 
797
- self.in_group = false
748
+ self.set_depth = 0
798
749
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
750
+ self.conditional_stack = []
806
751
 
807
752
  %% write data;
808
753
  %% write init;
@@ -817,9 +762,9 @@ class Regexp::Scanner
817
762
  end
818
763
 
819
764
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
765
+ "[#{group_depth}]") if in_group?
821
766
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
767
+ "[#{set_depth}]") if in_set?
823
768
 
824
769
  # when the entire expression is a literal run
825
770
  emit_literal if literal
@@ -854,62 +799,15 @@ class Regexp::Scanner
854
799
 
855
800
  private
856
801
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
866
-
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
802
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
+ :group_depth, :set_depth, :conditional_stack
878
804
 
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
805
+ def in_group?
806
+ group_depth > 0
807
+ end
911
808
 
912
- p # return the new value of the data pointer
809
+ def in_set?
810
+ set_depth > 0
913
811
  end
914
812
 
915
813
  # Copy from ts to te from data as text
@@ -945,32 +843,39 @@ class Regexp::Scanner
945
843
  def emit_options(text, ts, te)
946
844
  token = nil
947
845
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
846
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
847
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
848
+ positive, negative, group_local = $1, $2, $3
950
849
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
850
+ if positive.include?('x')
851
+ self.free_spacing = true
852
+ end
954
853
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
854
+ # If the x appears in both, treat it like ruby does, the second cancels
855
+ # the first.
856
+ if negative && negative.include?('x')
857
+ self.free_spacing = false
858
+ end
960
859
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
860
+ if group_local
861
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
862
+ token = :options
863
+ else
864
+ # switch for parent group level
865
+ spacing_stack.last[:free_spacing] = free_spacing
866
+ token = :options_switch
969
867
  end
970
868
 
971
869
  emit(:group, token, text, ts, te)
972
870
  end
973
871
 
872
+ def emit_meta_control_sequence(data, ts, te, token)
873
+ if data.last < 0x00 || data.last > 0x7F
874
+ validation_error(:sequence, 'escape', token.to_s)
875
+ end
876
+ emit(:escape, token, *text(data, ts, te, 1))
877
+ end
878
+
974
879
  # Centralizes and unifies the handling of validation related
975
880
  # errors.
976
881
  def validation_error(type, what, reason)
@@ -981,21 +886,8 @@ class Regexp::Scanner
981
886
  error = InvalidBackrefError.new(what, reason)
982
887
  when :sequence
983
888
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
889
  end
987
890
 
988
891
  raise error # unless @@config.validation_ignore
989
892
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
893
  end # module Regexp::Scanner