regexp_parser 1.3.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +53 -1
  3. data/Gemfile +3 -3
  4. data/README.md +10 -14
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  18. data/lib/regexp_parser/expression/sequence.rb +3 -6
  19. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  20. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  21. data/lib/regexp_parser/lexer.rb +30 -44
  22. data/lib/regexp_parser/parser.rb +47 -24
  23. data/lib/regexp_parser/scanner.rb +1159 -1329
  24. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  25. data/lib/regexp_parser/scanner/properties/long.yml +34 -1
  26. data/lib/regexp_parser/scanner/properties/short.yml +12 -0
  27. data/lib/regexp_parser/scanner/scanner.rl +82 -190
  28. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  29. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
  30. data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
  31. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  33. data/lib/regexp_parser/version.rb +1 -1
  34. data/regexp_parser.gemspec +3 -3
  35. data/spec/expression/base_spec.rb +94 -0
  36. data/spec/expression/clone_spec.rb +120 -0
  37. data/spec/expression/conditional_spec.rb +89 -0
  38. data/spec/expression/free_space_spec.rb +27 -0
  39. data/spec/expression/methods/match_length_spec.rb +154 -0
  40. data/spec/expression/methods/match_spec.rb +25 -0
  41. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  42. data/spec/expression/methods/tests_spec.rb +99 -0
  43. data/spec/expression/methods/traverse_spec.rb +140 -0
  44. data/spec/expression/options_spec.rb +128 -0
  45. data/spec/expression/root_spec.rb +9 -0
  46. data/spec/expression/sequence_spec.rb +9 -0
  47. data/spec/expression/subexpression_spec.rb +50 -0
  48. data/spec/expression/to_h_spec.rb +26 -0
  49. data/spec/expression/to_s_spec.rb +100 -0
  50. data/spec/lexer/all_spec.rb +22 -0
  51. data/spec/lexer/conditionals_spec.rb +53 -0
  52. data/spec/lexer/escapes_spec.rb +14 -0
  53. data/spec/lexer/keep_spec.rb +10 -0
  54. data/spec/lexer/literals_spec.rb +89 -0
  55. data/spec/lexer/nesting_spec.rb +99 -0
  56. data/spec/lexer/refcalls_spec.rb +55 -0
  57. data/spec/parser/all_spec.rb +43 -0
  58. data/spec/parser/alternation_spec.rb +88 -0
  59. data/spec/parser/anchors_spec.rb +17 -0
  60. data/spec/parser/conditionals_spec.rb +179 -0
  61. data/spec/parser/errors_spec.rb +30 -0
  62. data/spec/parser/escapes_spec.rb +121 -0
  63. data/spec/parser/free_space_spec.rb +130 -0
  64. data/spec/parser/groups_spec.rb +108 -0
  65. data/spec/parser/keep_spec.rb +6 -0
  66. data/spec/parser/posix_classes_spec.rb +8 -0
  67. data/spec/parser/properties_spec.rb +115 -0
  68. data/spec/parser/quantifiers_spec.rb +51 -0
  69. data/spec/parser/refcalls_spec.rb +112 -0
  70. data/spec/parser/set/intersections_spec.rb +127 -0
  71. data/spec/parser/set/ranges_spec.rb +111 -0
  72. data/spec/parser/sets_spec.rb +178 -0
  73. data/spec/parser/types_spec.rb +18 -0
  74. data/spec/scanner/all_spec.rb +18 -0
  75. data/spec/scanner/anchors_spec.rb +21 -0
  76. data/spec/scanner/conditionals_spec.rb +128 -0
  77. data/spec/scanner/errors_spec.rb +68 -0
  78. data/spec/scanner/escapes_spec.rb +53 -0
  79. data/spec/scanner/free_space_spec.rb +133 -0
  80. data/spec/scanner/groups_spec.rb +52 -0
  81. data/spec/scanner/keep_spec.rb +10 -0
  82. data/spec/scanner/literals_spec.rb +49 -0
  83. data/spec/scanner/meta_spec.rb +18 -0
  84. data/spec/scanner/properties_spec.rb +64 -0
  85. data/spec/scanner/quantifiers_spec.rb +20 -0
  86. data/spec/scanner/refcalls_spec.rb +36 -0
  87. data/spec/scanner/sets_spec.rb +102 -0
  88. data/spec/scanner/types_spec.rb +14 -0
  89. data/spec/spec_helper.rb +15 -0
  90. data/{test → spec}/support/runner.rb +9 -8
  91. data/spec/support/shared_examples.rb +77 -0
  92. data/{test → spec}/support/warning_extractor.rb +5 -7
  93. data/spec/syntax/syntax_spec.rb +48 -0
  94. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  95. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  96. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  97. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  98. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  99. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  100. data/spec/syntax/versions/aliases_spec.rb +37 -0
  101. data/spec/token/token_spec.rb +85 -0
  102. metadata +144 -143
  103. data/test/expression/test_all.rb +0 -12
  104. data/test/expression/test_base.rb +0 -90
  105. data/test/expression/test_clone.rb +0 -89
  106. data/test/expression/test_conditionals.rb +0 -113
  107. data/test/expression/test_free_space.rb +0 -35
  108. data/test/expression/test_set.rb +0 -84
  109. data/test/expression/test_strfregexp.rb +0 -230
  110. data/test/expression/test_subexpression.rb +0 -58
  111. data/test/expression/test_tests.rb +0 -99
  112. data/test/expression/test_to_h.rb +0 -59
  113. data/test/expression/test_to_s.rb +0 -104
  114. data/test/expression/test_traverse.rb +0 -161
  115. data/test/helpers.rb +0 -10
  116. data/test/lexer/test_all.rb +0 -41
  117. data/test/lexer/test_conditionals.rb +0 -127
  118. data/test/lexer/test_keep.rb +0 -24
  119. data/test/lexer/test_literals.rb +0 -130
  120. data/test/lexer/test_nesting.rb +0 -132
  121. data/test/lexer/test_refcalls.rb +0 -56
  122. data/test/parser/set/test_intersections.rb +0 -127
  123. data/test/parser/set/test_ranges.rb +0 -111
  124. data/test/parser/test_all.rb +0 -64
  125. data/test/parser/test_alternation.rb +0 -92
  126. data/test/parser/test_anchors.rb +0 -34
  127. data/test/parser/test_conditionals.rb +0 -187
  128. data/test/parser/test_errors.rb +0 -63
  129. data/test/parser/test_escapes.rb +0 -134
  130. data/test/parser/test_free_space.rb +0 -139
  131. data/test/parser/test_groups.rb +0 -289
  132. data/test/parser/test_keep.rb +0 -21
  133. data/test/parser/test_posix_classes.rb +0 -27
  134. data/test/parser/test_properties.rb +0 -133
  135. data/test/parser/test_quantifiers.rb +0 -301
  136. data/test/parser/test_refcalls.rb +0 -186
  137. data/test/parser/test_sets.rb +0 -179
  138. data/test/parser/test_types.rb +0 -50
  139. data/test/scanner/test_all.rb +0 -38
  140. data/test/scanner/test_anchors.rb +0 -38
  141. data/test/scanner/test_conditionals.rb +0 -184
  142. data/test/scanner/test_errors.rb +0 -91
  143. data/test/scanner/test_escapes.rb +0 -56
  144. data/test/scanner/test_free_space.rb +0 -200
  145. data/test/scanner/test_groups.rb +0 -79
  146. data/test/scanner/test_keep.rb +0 -35
  147. data/test/scanner/test_literals.rb +0 -89
  148. data/test/scanner/test_meta.rb +0 -40
  149. data/test/scanner/test_properties.rb +0 -312
  150. data/test/scanner/test_quantifiers.rb +0 -37
  151. data/test/scanner/test_refcalls.rb +0 -52
  152. data/test/scanner/test_scripts.rb +0 -53
  153. data/test/scanner/test_sets.rb +0 -119
  154. data/test/scanner/test_types.rb +0 -35
  155. data/test/scanner/test_unicode_blocks.rb +0 -30
  156. data/test/support/disable_autotest.rb +0 -8
  157. data/test/syntax/test_all.rb +0 -6
  158. data/test/syntax/test_syntax.rb +0 -61
  159. data/test/syntax/test_syntax_token_map.rb +0 -25
  160. data/test/syntax/versions/test_1.8.rb +0 -55
  161. data/test/syntax/versions/test_1.9.1.rb +0 -36
  162. data/test/syntax/versions/test_1.9.3.rb +0 -32
  163. data/test/syntax/versions/test_2.0.0.rb +0 -37
  164. data/test/syntax/versions/test_2.2.0.rb +0 -32
  165. data/test/syntax/versions/test_aliases.rb +0 -129
  166. data/test/syntax/versions/test_all.rb +0 -5
  167. data/test/test_all.rb +0 -5
  168. data/test/token/test_all.rb +0 -2
  169. data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -5,6 +5,9 @@
5
5
  adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
+ age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
8
11
  age=2.0: age=2.0
9
12
  age=2.1: age=2.1
10
13
  age=3.0: age=3.0
@@ -63,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
63
66
  cherokee: cherokee
64
67
  closepunctuation: close_punctuation
65
68
  cntrl: cntrl
66
- combiningmark: combining_mark
67
69
  common: common
68
70
  connectorpunctuation: connector_punctuation
69
71
  control: control
@@ -81,9 +83,11 @@ deseret: deseret
81
83
  devanagari: devanagari
82
84
  diacritic: diacritic
83
85
  digit: digit
86
+ dogra: dogra
84
87
  duployan: duployan
85
88
  egyptianhieroglyphs: egyptian_hieroglyphs
86
89
  elbasan: elbasan
90
+ elymaic: elymaic
87
91
  emoji: emoji
88
92
  emojicomponent: emoji_component
89
93
  emojimodifier: emoji_modifier
@@ -104,9 +108,11 @@ graphemeextend: grapheme_extend
104
108
  graphemelink: grapheme_link
105
109
  greek: greek
106
110
  gujarati: gujarati
111
+ gunjalagondi: gunjala_gondi
107
112
  gurmukhi: gurmukhi
108
113
  han: han
109
114
  hangul: hangul
115
+ hanifirohingya: hanifi_rohingya
110
116
  hanunoo: hanunoo
111
117
  hatran: hatran
112
118
  hebrew: hebrew
@@ -160,6 +166,7 @@ inchakma: in_chakma
160
166
  incham: in_cham
161
167
  incherokee: in_cherokee
162
168
  incherokeesupplement: in_cherokee_supplement
169
+ inchesssymbols: in_chess_symbols
163
170
  incjkcompatibility: in_cjk_compatibility
164
171
  incjkcompatibilityforms: in_cjk_compatibility_forms
165
172
  incjkcompatibilityideographs: in_cjk_compatibility_ideographs
@@ -197,11 +204,14 @@ indeseret: in_deseret
197
204
  indevanagari: in_devanagari
198
205
  indevanagariextended: in_devanagari_extended
199
206
  indingbats: in_dingbats
207
+ indogra: in_dogra
200
208
  indominotiles: in_domino_tiles
201
209
  induployan: in_duployan
202
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
203
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
204
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
205
215
  inemoticons: in_emoticons
206
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
207
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -215,6 +225,7 @@ ingeneralpunctuation: in_general_punctuation
215
225
  ingeometricshapes: in_geometric_shapes
216
226
  ingeometricshapesextended: in_geometric_shapes_extended
217
227
  ingeorgian: in_georgian
228
+ ingeorgianextended: in_georgian_extended
218
229
  ingeorgiansupplement: in_georgian_supplement
219
230
  inglagolitic: in_glagolitic
220
231
  inglagoliticsupplement: in_glagolitic_supplement
@@ -223,6 +234,7 @@ ingrantha: in_grantha
223
234
  ingreekandcoptic: in_greek_and_coptic
224
235
  ingreekextended: in_greek_extended
225
236
  ingujarati: in_gujarati
237
+ ingunjalagondi: in_gunjala_gondi
226
238
  ingurmukhi: in_gurmukhi
227
239
  inhalfwidthandfullwidthforms: in_halfwidth_and_fullwidth_forms
228
240
  inhangulcompatibilityjamo: in_hangul_compatibility_jamo
@@ -230,6 +242,7 @@ inhanguljamo: in_hangul_jamo
230
242
  inhanguljamoextendeda: in_hangul_jamo_extended_a
231
243
  inhanguljamoextendedb: in_hangul_jamo_extended_b
232
244
  inhangulsyllables: in_hangul_syllables
245
+ inhanifirohingya: in_hanifi_rohingya
233
246
  inhanunoo: in_hanunoo
234
247
  inhatran: in_hatran
235
248
  inhebrew: in_hebrew
@@ -240,6 +253,7 @@ inhiragana: in_hiragana
240
253
  inideographicdescriptioncharacters: in_ideographic_description_characters
241
254
  inideographicsymbolsandpunctuation: in_ideographic_symbols_and_punctuation
242
255
  inimperialaramaic: in_imperial_aramaic
256
+ inindicsiyaqnumbers: in_indic_siyaq_numbers
243
257
  ininscriptionalpahlavi: in_inscriptional_pahlavi
244
258
  ininscriptionalparthian: in_inscriptional_parthian
245
259
  inipaextensions: in_ipa_extensions
@@ -279,6 +293,7 @@ inlycian: in_lycian
279
293
  inlydian: in_lydian
280
294
  inmahajani: in_mahajani
281
295
  inmahjongtiles: in_mahjong_tiles
296
+ inmakasar: in_makasar
282
297
  inmalayalam: in_malayalam
283
298
  inmandaic: in_mandaic
284
299
  inmanichaean: in_manichaean
@@ -286,6 +301,8 @@ inmarchen: in_marchen
286
301
  inmasaramgondi: in_masaram_gondi
287
302
  inmathematicalalphanumericsymbols: in_mathematical_alphanumeric_symbols
288
303
  inmathematicaloperators: in_mathematical_operators
304
+ inmayannumerals: in_mayan_numerals
305
+ inmedefaidrin: in_medefaidrin
289
306
  inmeeteimayek: in_meetei_mayek
290
307
  inmeeteimayekextensions: in_meetei_mayek_extensions
291
308
  inmendekikakui: in_mende_kikakui
@@ -309,12 +326,14 @@ inmyanmar: in_myanmar
309
326
  inmyanmarextendeda: in_myanmar_extended_a
310
327
  inmyanmarextendedb: in_myanmar_extended_b
311
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
312
330
  innewa: in_newa
313
331
  innewtailue: in_new_tai_lue
314
332
  innko: in_nko
315
333
  innoblock: in_no_block
316
334
  innumberforms: in_number_forms
317
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
318
337
  inogham: in_ogham
319
338
  inolchiki: in_ol_chiki
320
339
  inoldhungarian: in_old_hungarian
@@ -322,6 +341,7 @@ inolditalic: in_old_italic
322
341
  inoldnortharabian: in_old_north_arabian
323
342
  inoldpermic: in_old_permic
324
343
  inoldpersian: in_old_persian
344
+ inoldsogdian: in_old_sogdian
325
345
  inoldsoutharabian: in_old_south_arabian
326
346
  inoldturkic: in_old_turkic
327
347
  inopticalcharacterrecognition: in_optical_character_recognition
@@ -329,6 +349,7 @@ inoriya: in_oriya
329
349
  inornamentaldingbats: in_ornamental_dingbats
330
350
  inosage: in_osage
331
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
332
353
  inpahawhhmong: in_pahawh_hmong
333
354
  inpalmyrene: in_palmyrene
334
355
  inpaucinhau: in_pau_cin_hau
@@ -354,6 +375,8 @@ insiddham: in_siddham
354
375
  insinhala: in_sinhala
355
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
356
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
379
+ insogdian: in_sogdian
357
380
  insorasompeng: in_sora_sompeng
358
381
  insoyombo: in_soyombo
359
382
  inspacingmodifierletters: in_spacing_modifier_letters
@@ -371,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
371
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
372
395
  insuttonsignwriting: in_sutton_signwriting
373
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
374
398
  insyriac: in_syriac
375
399
  insyriacsupplement: in_syriac_supplement
376
400
  intagalog: in_tagalog
@@ -382,6 +406,7 @@ intaiviet: in_tai_viet
382
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
383
407
  intakri: in_takri
384
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
385
410
  intangut: in_tangut
386
411
  intangutcomponents: in_tangut_components
387
412
  intelugu: in_telugu
@@ -399,6 +424,7 @@ invariationselectors: in_variation_selectors
399
424
  invariationselectorssupplement: in_variation_selectors_supplement
400
425
  invedicextensions: in_vedic_extensions
401
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
402
428
  inwarangciti: in_warang_citi
403
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
404
430
  inyiradicals: in_yi_radicals
@@ -431,6 +457,7 @@ lowercaseletter: lowercase_letter
431
457
  lycian: lycian
432
458
  lydian: lydian
433
459
  mahajani: mahajani
460
+ makasar: makasar
434
461
  malayalam: malayalam
435
462
  mandaic: mandaic
436
463
  manichaean: manichaean
@@ -439,6 +466,7 @@ mark: mark
439
466
  masaramgondi: masaram_gondi
440
467
  math: math
441
468
  mathsymbol: math_symbol
469
+ medefaidrin: medefaidrin
442
470
  meeteimayek: meetei_mayek
443
471
  mendekikakui: mende_kikakui
444
472
  meroiticcursive: meroitic_cursive
@@ -452,6 +480,7 @@ mro: mro
452
480
  multani: multani
453
481
  myanmar: myanmar
454
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
455
484
  newa: newa
456
485
  newline: newline
457
486
  newtailue: new_tai_lue
@@ -460,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
460
489
  nonspacingmark: nonspacing_mark
461
490
  number: number
462
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
463
493
  ogham: ogham
464
494
  olchiki: ol_chiki
465
495
  oldhungarian: old_hungarian
@@ -467,6 +497,7 @@ olditalic: old_italic
467
497
  oldnortharabian: old_north_arabian
468
498
  oldpermic: old_permic
469
499
  oldpersian: old_persian
500
+ oldsogdian: old_sogdian
470
501
  oldsoutharabian: old_south_arabian
471
502
  oldturkic: old_turkic
472
503
  openpunctuation: open_punctuation
@@ -515,6 +546,7 @@ siddham: siddham
515
546
  signwriting: signwriting
516
547
  sinhala: sinhala
517
548
  softdotted: soft_dotted
549
+ sogdian: sogdian
518
550
  sorasompeng: sora_sompeng
519
551
  soyombo: soyombo
520
552
  space: space
@@ -550,6 +582,7 @@ uppercase: uppercase
550
582
  uppercaseletter: uppercase_letter
551
583
  vai: vai
552
584
  variationselector: variation_selector
585
+ wancho: wancho
553
586
  warangciti: warang_citi
554
587
  whitespace: white_space
555
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -44,14 +45,17 @@ dep: deprecated
44
45
  deva: devanagari
45
46
  di: default_ignorable_code_point
46
47
  dia: diacritic
48
+ dogr: dogra
47
49
  dsrt: deseret
48
50
  dupl: duployan
49
51
  egyp: egyptian_hieroglyphs
50
52
  elba: elbasan
53
+ elym: elymaic
51
54
  ethi: ethiopic
52
55
  ext: extender
53
56
  geor: georgian
54
57
  glag: glagolitic
58
+ gong: gunjala_gondi
55
59
  gonm: masaram_gondi
56
60
  goth: gothic
57
61
  gran: grantha
@@ -70,6 +74,7 @@ hex: hex_digit
70
74
  hira: hiragana
71
75
  hluw: anatolian_hieroglyphs
72
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
73
78
  hung: old_hungarian
74
79
  idc: id_continue
75
80
  ideo: ideographic
@@ -105,11 +110,13 @@ lyci: lycian
105
110
  lydi: lydian
106
111
  m: mark
107
112
  mahj: mahajani
113
+ maka: makasar
108
114
  mand: mandaic
109
115
  mani: manichaean
110
116
  marc: marchen
111
117
  mc: spacing_mark
112
118
  me: enclosing_mark
119
+ medf: medefaidrin
113
120
  mend: mende_kikakui
114
121
  merc: meroitic_cursive
115
122
  mero: meroitic_hieroglyphs
@@ -121,6 +128,7 @@ mtei: meetei_mayek
121
128
  mult: multani
122
129
  mymr: myanmar
123
130
  n: number
131
+ nand: nandinagari
124
132
  narb: old_north_arabian
125
133
  nbat: nabataean
126
134
  nchar: noncharacter_code_point
@@ -168,6 +176,7 @@ qaai: inherited
168
176
  qmark: quotation_mark
169
177
  ri: regional_indicator
170
178
  rjng: rejang
179
+ rohg: hanifi_rohingya
171
180
  runr: runic
172
181
  s: symbol
173
182
  samr: samaritan
@@ -184,6 +193,8 @@ sinh: sinhala
184
193
  sk: modifier_symbol
185
194
  sm: math_symbol
186
195
  so: other_symbol
196
+ sogd: sogdian
197
+ sogo: old_sogdian
187
198
  sora: sora_sompeng
188
199
  soyo: soyombo
189
200
  sterm: sentence_terminal
@@ -209,6 +220,7 @@ uideo: unified_ideograph
209
220
  vaii: vai
210
221
  vs: variation_selector
211
222
  wara: warang_citi
223
+ wcho: wancho
212
224
  wspace: white_space
213
225
  xidc: xid_continue
214
226
  xids: xid_start
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -82,7 +82,8 @@
82
82
  assertion_lookbehind = '?<=';
83
83
  assertion_nlookbehind = '?<!';
84
84
 
85
- group_options = '?' . [\-mixdau];
85
+ # try to treat every other group head as options group, like Ruby
86
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
87
 
87
88
  group_ref = [gk];
88
89
  group_name_char = (alnum | '_');
@@ -135,41 +136,35 @@
135
136
  # Invalid sequence error, used from sequences, like escapes and sets
136
137
  action invalid_sequence_error {
137
138
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
139
+ validation_error(:sequence, 'sequence', text)
139
140
  }
140
141
 
141
142
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
143
+ action group_opened { self.group_depth = group_depth + 1 }
144
+ action group_closed { self.group_depth = group_depth - 1 }
145
+ action set_opened { self.set_depth = set_depth + 1 }
146
+ action set_closed { self.set_depth = set_depth - 1 }
144
147
 
145
148
  # Character set scanner, continues consuming characters until it meets the
146
149
  # closing bracket of the set.
147
150
  # --------------------------------------------------------------------------
148
151
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
152
+ set_close > (set_meta, 2) @set_closed {
153
153
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
154
+ if in_set?
158
155
  fret;
156
+ else
157
+ fgoto main;
159
158
  end
160
159
  };
161
160
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
161
+ '-]' @set_closed { # special case, emits two tokens
162
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
164
+ if in_set?
172
165
  fret;
166
+ else
167
+ fgoto main;
173
168
  end
174
169
  };
175
170
 
@@ -207,14 +202,12 @@
207
202
  fcall set_escape_sequence;
208
203
  };
209
204
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
205
+ set_open >(open_bracket, 1) >set_opened {
213
206
  emit(:set, :open, *text(data, ts, te))
214
207
  fcall character_set;
215
208
  };
216
209
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
210
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
211
  text = text(data, ts, te).first
219
212
 
220
213
  type = :posixclass
@@ -227,11 +220,11 @@
227
220
  emit(type, class_name.to_sym, text, ts, te)
228
221
  };
229
222
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
223
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
224
  emit(:set, :collation, *text(data, ts, te))
232
225
  };
233
226
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
227
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
228
  emit(:set, :equivalent, *text(data, ts, te))
236
229
  };
237
230
 
@@ -337,44 +330,24 @@
337
330
  };
338
331
 
339
332
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
333
+ emit_meta_control_sequence(data, ts, te, :control)
351
334
  fret;
352
335
  };
353
336
 
354
337
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
338
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
339
  fret;
367
340
  };
368
341
 
369
342
  char_type_char > (escaped_alpha, 2) {
370
343
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
344
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
345
  fcall char_type;
373
346
  };
374
347
 
375
348
  property_char > (escaped_alpha, 2) {
376
349
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
351
  fcall unicode_property;
379
352
  };
380
353
 
@@ -412,8 +385,7 @@
412
385
  };
413
386
 
414
387
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
388
+ if conditional_stack.last == group_depth
417
389
  emit(:conditional, :separator, *text(data, ts, te))
418
390
  else
419
391
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +414,12 @@
442
414
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
415
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
416
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
417
  end
449
418
  };
450
419
 
451
420
  # Character sets
452
421
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
422
+ set_open >set_opened {
457
423
  emit(:set, :open, *text(data, ts, te))
458
424
  fcall character_set;
459
425
  };
@@ -465,9 +431,7 @@
465
431
  conditional {
466
432
  text = text(data, ts, te).first
467
433
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
434
+ conditional_stack << group_depth
471
435
 
472
436
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
437
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +460,11 @@
496
460
  # (?imxdau-imx:subexp) option on/off for subexp
497
461
  # ------------------------------------------------------------------------
498
462
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
463
+ text = text(data, ts, te).first
464
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
+ end
467
+ emit_options(text, ts, te)
500
468
  };
501
469
 
502
470
  # Assertions
@@ -528,19 +496,15 @@
528
496
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
497
  when '(?~'; emit(:group, :absence, text, ts, te)
530
498
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
499
+ when /^\(\?(?:<>|'')/
500
+ validation_error(:group, 'named group', 'name is empty')
533
501
 
502
+ when /^\(\?<\w*>/
534
503
  emit(:group, :named_ab, text, ts, te)
535
504
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
505
+ when /^\(\?'\w*'/
539
506
  emit(:group, :named_sq, text, ts, te)
540
507
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
508
  end
545
509
  };
546
510
 
@@ -550,20 +514,13 @@
550
514
  };
551
515
 
552
516
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
517
+ if conditional_stack.last == group_depth + 1
557
518
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
519
+ emit(:conditional, :close, *text(data, ts, te))
562
520
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
521
+ if spacing_stack.length > 1 &&
522
+ spacing_stack.last[:depth] == group_depth + 1
565
523
  spacing_stack.pop
566
-
567
524
  self.free_spacing = spacing_stack.last[:free_spacing]
568
525
  end
569
526
 
@@ -576,11 +533,8 @@
576
533
  # ------------------------------------------------------------------------
577
534
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
535
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
536
+ when /^\\([gk])(<>|'')/ # angle brackets
537
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
538
 
585
539
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
540
  if $1 == 'k'
@@ -636,9 +590,6 @@
636
590
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
591
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
592
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
593
  end
643
594
  };
644
595
 
@@ -786,7 +737,7 @@ class Regexp::Scanner
786
737
  input = input_object
787
738
  self.free_spacing = false
788
739
  end
789
-
740
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
741
 
791
742
  data = input.unpack("c*") if input.is_a?(String)
792
743
  eof = data.length
@@ -794,15 +745,9 @@ class Regexp::Scanner
794
745
  self.tokens = []
795
746
  self.block = block_given? ? block : nil
796
747
 
797
- self.in_group = false
748
+ self.set_depth = 0
798
749
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
750
+ self.conditional_stack = []
806
751
 
807
752
  %% write data;
808
753
  %% write init;
@@ -817,9 +762,9 @@ class Regexp::Scanner
817
762
  end
818
763
 
819
764
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
765
+ "[#{group_depth}]") if in_group?
821
766
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
767
+ "[#{set_depth}]") if in_set?
823
768
 
824
769
  # when the entire expression is a literal run
825
770
  emit_literal if literal
@@ -854,62 +799,15 @@ class Regexp::Scanner
854
799
 
855
800
  private
856
801
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
866
-
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
802
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
+ :group_depth, :set_depth, :conditional_stack
878
804
 
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
805
+ def in_group?
806
+ group_depth > 0
807
+ end
911
808
 
912
- p # return the new value of the data pointer
809
+ def in_set?
810
+ set_depth > 0
913
811
  end
914
812
 
915
813
  # Copy from ts to te from data as text
@@ -945,32 +843,39 @@ class Regexp::Scanner
945
843
  def emit_options(text, ts, te)
946
844
  token = nil
947
845
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
846
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
847
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
848
+ positive, negative, group_local = $1, $2, $3
950
849
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
850
+ if positive.include?('x')
851
+ self.free_spacing = true
852
+ end
954
853
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
854
+ # If the x appears in both, treat it like ruby does, the second cancels
855
+ # the first.
856
+ if negative && negative.include?('x')
857
+ self.free_spacing = false
858
+ end
960
859
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
860
+ if group_local
861
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
862
+ token = :options
863
+ else
864
+ # switch for parent group level
865
+ spacing_stack.last[:free_spacing] = free_spacing
866
+ token = :options_switch
969
867
  end
970
868
 
971
869
  emit(:group, token, text, ts, te)
972
870
  end
973
871
 
872
+ def emit_meta_control_sequence(data, ts, te, token)
873
+ if data.last < 0x00 || data.last > 0x7F
874
+ validation_error(:sequence, 'escape', token.to_s)
875
+ end
876
+ emit(:escape, token, *text(data, ts, te, 1))
877
+ end
878
+
974
879
  # Centralizes and unifies the handling of validation related
975
880
  # errors.
976
881
  def validation_error(type, what, reason)
@@ -981,21 +886,8 @@ class Regexp::Scanner
981
886
  error = InvalidBackrefError.new(what, reason)
982
887
  when :sequence
983
888
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
889
  end
987
890
 
988
891
  raise error # unless @@config.validation_ignore
989
892
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
893
  end # module Regexp::Scanner