regexp_parser 1.3.0 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +72 -1
  3. data/Gemfile +3 -3
  4. data/README.md +12 -19
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
  18. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  19. data/lib/regexp_parser/expression/sequence.rb +3 -6
  20. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  21. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  22. data/lib/regexp_parser/lexer.rb +30 -44
  23. data/lib/regexp_parser/parser.rb +47 -24
  24. data/lib/regexp_parser/scanner.rb +1228 -1367
  25. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  26. data/lib/regexp_parser/scanner/properties/long.yml +34 -1
  27. data/lib/regexp_parser/scanner/properties/short.yml +12 -0
  28. data/lib/regexp_parser/scanner/scanner.rl +101 -194
  29. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  30. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
  31. data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  33. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  34. data/lib/regexp_parser/version.rb +1 -1
  35. data/regexp_parser.gemspec +3 -3
  36. data/spec/expression/base_spec.rb +94 -0
  37. data/spec/expression/clone_spec.rb +120 -0
  38. data/spec/expression/conditional_spec.rb +89 -0
  39. data/spec/expression/free_space_spec.rb +27 -0
  40. data/spec/expression/methods/match_length_spec.rb +161 -0
  41. data/spec/expression/methods/match_spec.rb +25 -0
  42. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  43. data/spec/expression/methods/tests_spec.rb +99 -0
  44. data/spec/expression/methods/traverse_spec.rb +161 -0
  45. data/spec/expression/options_spec.rb +128 -0
  46. data/spec/expression/root_spec.rb +9 -0
  47. data/spec/expression/sequence_spec.rb +9 -0
  48. data/spec/expression/subexpression_spec.rb +50 -0
  49. data/spec/expression/to_h_spec.rb +26 -0
  50. data/spec/expression/to_s_spec.rb +100 -0
  51. data/spec/lexer/all_spec.rb +22 -0
  52. data/spec/lexer/conditionals_spec.rb +53 -0
  53. data/spec/lexer/delimiters_spec.rb +68 -0
  54. data/spec/lexer/escapes_spec.rb +14 -0
  55. data/spec/lexer/keep_spec.rb +10 -0
  56. data/spec/lexer/literals_spec.rb +89 -0
  57. data/spec/lexer/nesting_spec.rb +99 -0
  58. data/spec/lexer/refcalls_spec.rb +55 -0
  59. data/spec/parser/all_spec.rb +43 -0
  60. data/spec/parser/alternation_spec.rb +88 -0
  61. data/spec/parser/anchors_spec.rb +17 -0
  62. data/spec/parser/conditionals_spec.rb +179 -0
  63. data/spec/parser/errors_spec.rb +30 -0
  64. data/spec/parser/escapes_spec.rb +121 -0
  65. data/spec/parser/free_space_spec.rb +130 -0
  66. data/spec/parser/groups_spec.rb +108 -0
  67. data/spec/parser/keep_spec.rb +6 -0
  68. data/spec/parser/posix_classes_spec.rb +8 -0
  69. data/spec/parser/properties_spec.rb +115 -0
  70. data/spec/parser/quantifiers_spec.rb +52 -0
  71. data/spec/parser/refcalls_spec.rb +112 -0
  72. data/spec/parser/set/intersections_spec.rb +127 -0
  73. data/spec/parser/set/ranges_spec.rb +111 -0
  74. data/spec/parser/sets_spec.rb +178 -0
  75. data/spec/parser/types_spec.rb +18 -0
  76. data/spec/scanner/all_spec.rb +18 -0
  77. data/spec/scanner/anchors_spec.rb +21 -0
  78. data/spec/scanner/conditionals_spec.rb +128 -0
  79. data/spec/scanner/delimiters_spec.rb +52 -0
  80. data/spec/scanner/errors_spec.rb +67 -0
  81. data/spec/scanner/escapes_spec.rb +53 -0
  82. data/spec/scanner/free_space_spec.rb +133 -0
  83. data/spec/scanner/groups_spec.rb +52 -0
  84. data/spec/scanner/keep_spec.rb +10 -0
  85. data/spec/scanner/literals_spec.rb +49 -0
  86. data/spec/scanner/meta_spec.rb +18 -0
  87. data/spec/scanner/properties_spec.rb +64 -0
  88. data/spec/scanner/quantifiers_spec.rb +20 -0
  89. data/spec/scanner/refcalls_spec.rb +36 -0
  90. data/spec/scanner/sets_spec.rb +102 -0
  91. data/spec/scanner/types_spec.rb +14 -0
  92. data/spec/spec_helper.rb +15 -0
  93. data/{test → spec}/support/runner.rb +9 -8
  94. data/spec/support/shared_examples.rb +77 -0
  95. data/{test → spec}/support/warning_extractor.rb +5 -7
  96. data/spec/syntax/syntax_spec.rb +48 -0
  97. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  98. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  99. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  100. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  101. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  102. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  103. data/spec/syntax/versions/aliases_spec.rb +37 -0
  104. data/spec/token/token_spec.rb +85 -0
  105. metadata +151 -146
  106. data/test/expression/test_all.rb +0 -12
  107. data/test/expression/test_base.rb +0 -90
  108. data/test/expression/test_clone.rb +0 -89
  109. data/test/expression/test_conditionals.rb +0 -113
  110. data/test/expression/test_free_space.rb +0 -35
  111. data/test/expression/test_set.rb +0 -84
  112. data/test/expression/test_strfregexp.rb +0 -230
  113. data/test/expression/test_subexpression.rb +0 -58
  114. data/test/expression/test_tests.rb +0 -99
  115. data/test/expression/test_to_h.rb +0 -59
  116. data/test/expression/test_to_s.rb +0 -104
  117. data/test/expression/test_traverse.rb +0 -161
  118. data/test/helpers.rb +0 -10
  119. data/test/lexer/test_all.rb +0 -41
  120. data/test/lexer/test_conditionals.rb +0 -127
  121. data/test/lexer/test_keep.rb +0 -24
  122. data/test/lexer/test_literals.rb +0 -130
  123. data/test/lexer/test_nesting.rb +0 -132
  124. data/test/lexer/test_refcalls.rb +0 -56
  125. data/test/parser/set/test_intersections.rb +0 -127
  126. data/test/parser/set/test_ranges.rb +0 -111
  127. data/test/parser/test_all.rb +0 -64
  128. data/test/parser/test_alternation.rb +0 -92
  129. data/test/parser/test_anchors.rb +0 -34
  130. data/test/parser/test_conditionals.rb +0 -187
  131. data/test/parser/test_errors.rb +0 -63
  132. data/test/parser/test_escapes.rb +0 -134
  133. data/test/parser/test_free_space.rb +0 -139
  134. data/test/parser/test_groups.rb +0 -289
  135. data/test/parser/test_keep.rb +0 -21
  136. data/test/parser/test_posix_classes.rb +0 -27
  137. data/test/parser/test_properties.rb +0 -133
  138. data/test/parser/test_quantifiers.rb +0 -301
  139. data/test/parser/test_refcalls.rb +0 -186
  140. data/test/parser/test_sets.rb +0 -179
  141. data/test/parser/test_types.rb +0 -50
  142. data/test/scanner/test_all.rb +0 -38
  143. data/test/scanner/test_anchors.rb +0 -38
  144. data/test/scanner/test_conditionals.rb +0 -184
  145. data/test/scanner/test_errors.rb +0 -91
  146. data/test/scanner/test_escapes.rb +0 -56
  147. data/test/scanner/test_free_space.rb +0 -200
  148. data/test/scanner/test_groups.rb +0 -79
  149. data/test/scanner/test_keep.rb +0 -35
  150. data/test/scanner/test_literals.rb +0 -89
  151. data/test/scanner/test_meta.rb +0 -40
  152. data/test/scanner/test_properties.rb +0 -312
  153. data/test/scanner/test_quantifiers.rb +0 -37
  154. data/test/scanner/test_refcalls.rb +0 -52
  155. data/test/scanner/test_scripts.rb +0 -53
  156. data/test/scanner/test_sets.rb +0 -119
  157. data/test/scanner/test_types.rb +0 -35
  158. data/test/scanner/test_unicode_blocks.rb +0 -30
  159. data/test/support/disable_autotest.rb +0 -8
  160. data/test/syntax/test_all.rb +0 -6
  161. data/test/syntax/test_syntax.rb +0 -61
  162. data/test/syntax/test_syntax_token_map.rb +0 -25
  163. data/test/syntax/versions/test_1.8.rb +0 -55
  164. data/test/syntax/versions/test_1.9.1.rb +0 -36
  165. data/test/syntax/versions/test_1.9.3.rb +0 -32
  166. data/test/syntax/versions/test_2.0.0.rb +0 -37
  167. data/test/syntax/versions/test_2.2.0.rb +0 -32
  168. data/test/syntax/versions/test_aliases.rb +0 -129
  169. data/test/syntax/versions/test_all.rb +0 -5
  170. data/test/test_all.rb +0 -5
  171. data/test/token/test_all.rb +0 -2
  172. data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -5,6 +5,9 @@
5
5
  adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
+ age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
8
11
  age=2.0: age=2.0
9
12
  age=2.1: age=2.1
10
13
  age=3.0: age=3.0
@@ -63,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
63
66
  cherokee: cherokee
64
67
  closepunctuation: close_punctuation
65
68
  cntrl: cntrl
66
- combiningmark: combining_mark
67
69
  common: common
68
70
  connectorpunctuation: connector_punctuation
69
71
  control: control
@@ -81,9 +83,11 @@ deseret: deseret
81
83
  devanagari: devanagari
82
84
  diacritic: diacritic
83
85
  digit: digit
86
+ dogra: dogra
84
87
  duployan: duployan
85
88
  egyptianhieroglyphs: egyptian_hieroglyphs
86
89
  elbasan: elbasan
90
+ elymaic: elymaic
87
91
  emoji: emoji
88
92
  emojicomponent: emoji_component
89
93
  emojimodifier: emoji_modifier
@@ -104,9 +108,11 @@ graphemeextend: grapheme_extend
104
108
  graphemelink: grapheme_link
105
109
  greek: greek
106
110
  gujarati: gujarati
111
+ gunjalagondi: gunjala_gondi
107
112
  gurmukhi: gurmukhi
108
113
  han: han
109
114
  hangul: hangul
115
+ hanifirohingya: hanifi_rohingya
110
116
  hanunoo: hanunoo
111
117
  hatran: hatran
112
118
  hebrew: hebrew
@@ -160,6 +166,7 @@ inchakma: in_chakma
160
166
  incham: in_cham
161
167
  incherokee: in_cherokee
162
168
  incherokeesupplement: in_cherokee_supplement
169
+ inchesssymbols: in_chess_symbols
163
170
  incjkcompatibility: in_cjk_compatibility
164
171
  incjkcompatibilityforms: in_cjk_compatibility_forms
165
172
  incjkcompatibilityideographs: in_cjk_compatibility_ideographs
@@ -197,11 +204,14 @@ indeseret: in_deseret
197
204
  indevanagari: in_devanagari
198
205
  indevanagariextended: in_devanagari_extended
199
206
  indingbats: in_dingbats
207
+ indogra: in_dogra
200
208
  indominotiles: in_domino_tiles
201
209
  induployan: in_duployan
202
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
203
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
204
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
205
215
  inemoticons: in_emoticons
206
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
207
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -215,6 +225,7 @@ ingeneralpunctuation: in_general_punctuation
215
225
  ingeometricshapes: in_geometric_shapes
216
226
  ingeometricshapesextended: in_geometric_shapes_extended
217
227
  ingeorgian: in_georgian
228
+ ingeorgianextended: in_georgian_extended
218
229
  ingeorgiansupplement: in_georgian_supplement
219
230
  inglagolitic: in_glagolitic
220
231
  inglagoliticsupplement: in_glagolitic_supplement
@@ -223,6 +234,7 @@ ingrantha: in_grantha
223
234
  ingreekandcoptic: in_greek_and_coptic
224
235
  ingreekextended: in_greek_extended
225
236
  ingujarati: in_gujarati
237
+ ingunjalagondi: in_gunjala_gondi
226
238
  ingurmukhi: in_gurmukhi
227
239
  inhalfwidthandfullwidthforms: in_halfwidth_and_fullwidth_forms
228
240
  inhangulcompatibilityjamo: in_hangul_compatibility_jamo
@@ -230,6 +242,7 @@ inhanguljamo: in_hangul_jamo
230
242
  inhanguljamoextendeda: in_hangul_jamo_extended_a
231
243
  inhanguljamoextendedb: in_hangul_jamo_extended_b
232
244
  inhangulsyllables: in_hangul_syllables
245
+ inhanifirohingya: in_hanifi_rohingya
233
246
  inhanunoo: in_hanunoo
234
247
  inhatran: in_hatran
235
248
  inhebrew: in_hebrew
@@ -240,6 +253,7 @@ inhiragana: in_hiragana
240
253
  inideographicdescriptioncharacters: in_ideographic_description_characters
241
254
  inideographicsymbolsandpunctuation: in_ideographic_symbols_and_punctuation
242
255
  inimperialaramaic: in_imperial_aramaic
256
+ inindicsiyaqnumbers: in_indic_siyaq_numbers
243
257
  ininscriptionalpahlavi: in_inscriptional_pahlavi
244
258
  ininscriptionalparthian: in_inscriptional_parthian
245
259
  inipaextensions: in_ipa_extensions
@@ -279,6 +293,7 @@ inlycian: in_lycian
279
293
  inlydian: in_lydian
280
294
  inmahajani: in_mahajani
281
295
  inmahjongtiles: in_mahjong_tiles
296
+ inmakasar: in_makasar
282
297
  inmalayalam: in_malayalam
283
298
  inmandaic: in_mandaic
284
299
  inmanichaean: in_manichaean
@@ -286,6 +301,8 @@ inmarchen: in_marchen
286
301
  inmasaramgondi: in_masaram_gondi
287
302
  inmathematicalalphanumericsymbols: in_mathematical_alphanumeric_symbols
288
303
  inmathematicaloperators: in_mathematical_operators
304
+ inmayannumerals: in_mayan_numerals
305
+ inmedefaidrin: in_medefaidrin
289
306
  inmeeteimayek: in_meetei_mayek
290
307
  inmeeteimayekextensions: in_meetei_mayek_extensions
291
308
  inmendekikakui: in_mende_kikakui
@@ -309,12 +326,14 @@ inmyanmar: in_myanmar
309
326
  inmyanmarextendeda: in_myanmar_extended_a
310
327
  inmyanmarextendedb: in_myanmar_extended_b
311
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
312
330
  innewa: in_newa
313
331
  innewtailue: in_new_tai_lue
314
332
  innko: in_nko
315
333
  innoblock: in_no_block
316
334
  innumberforms: in_number_forms
317
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
318
337
  inogham: in_ogham
319
338
  inolchiki: in_ol_chiki
320
339
  inoldhungarian: in_old_hungarian
@@ -322,6 +341,7 @@ inolditalic: in_old_italic
322
341
  inoldnortharabian: in_old_north_arabian
323
342
  inoldpermic: in_old_permic
324
343
  inoldpersian: in_old_persian
344
+ inoldsogdian: in_old_sogdian
325
345
  inoldsoutharabian: in_old_south_arabian
326
346
  inoldturkic: in_old_turkic
327
347
  inopticalcharacterrecognition: in_optical_character_recognition
@@ -329,6 +349,7 @@ inoriya: in_oriya
329
349
  inornamentaldingbats: in_ornamental_dingbats
330
350
  inosage: in_osage
331
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
332
353
  inpahawhhmong: in_pahawh_hmong
333
354
  inpalmyrene: in_palmyrene
334
355
  inpaucinhau: in_pau_cin_hau
@@ -354,6 +375,8 @@ insiddham: in_siddham
354
375
  insinhala: in_sinhala
355
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
356
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
379
+ insogdian: in_sogdian
357
380
  insorasompeng: in_sora_sompeng
358
381
  insoyombo: in_soyombo
359
382
  inspacingmodifierletters: in_spacing_modifier_letters
@@ -371,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
371
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
372
395
  insuttonsignwriting: in_sutton_signwriting
373
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
374
398
  insyriac: in_syriac
375
399
  insyriacsupplement: in_syriac_supplement
376
400
  intagalog: in_tagalog
@@ -382,6 +406,7 @@ intaiviet: in_tai_viet
382
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
383
407
  intakri: in_takri
384
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
385
410
  intangut: in_tangut
386
411
  intangutcomponents: in_tangut_components
387
412
  intelugu: in_telugu
@@ -399,6 +424,7 @@ invariationselectors: in_variation_selectors
399
424
  invariationselectorssupplement: in_variation_selectors_supplement
400
425
  invedicextensions: in_vedic_extensions
401
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
402
428
  inwarangciti: in_warang_citi
403
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
404
430
  inyiradicals: in_yi_radicals
@@ -431,6 +457,7 @@ lowercaseletter: lowercase_letter
431
457
  lycian: lycian
432
458
  lydian: lydian
433
459
  mahajani: mahajani
460
+ makasar: makasar
434
461
  malayalam: malayalam
435
462
  mandaic: mandaic
436
463
  manichaean: manichaean
@@ -439,6 +466,7 @@ mark: mark
439
466
  masaramgondi: masaram_gondi
440
467
  math: math
441
468
  mathsymbol: math_symbol
469
+ medefaidrin: medefaidrin
442
470
  meeteimayek: meetei_mayek
443
471
  mendekikakui: mende_kikakui
444
472
  meroiticcursive: meroitic_cursive
@@ -452,6 +480,7 @@ mro: mro
452
480
  multani: multani
453
481
  myanmar: myanmar
454
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
455
484
  newa: newa
456
485
  newline: newline
457
486
  newtailue: new_tai_lue
@@ -460,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
460
489
  nonspacingmark: nonspacing_mark
461
490
  number: number
462
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
463
493
  ogham: ogham
464
494
  olchiki: ol_chiki
465
495
  oldhungarian: old_hungarian
@@ -467,6 +497,7 @@ olditalic: old_italic
467
497
  oldnortharabian: old_north_arabian
468
498
  oldpermic: old_permic
469
499
  oldpersian: old_persian
500
+ oldsogdian: old_sogdian
470
501
  oldsoutharabian: old_south_arabian
471
502
  oldturkic: old_turkic
472
503
  openpunctuation: open_punctuation
@@ -515,6 +546,7 @@ siddham: siddham
515
546
  signwriting: signwriting
516
547
  sinhala: sinhala
517
548
  softdotted: soft_dotted
549
+ sogdian: sogdian
518
550
  sorasompeng: sora_sompeng
519
551
  soyombo: soyombo
520
552
  space: space
@@ -550,6 +582,7 @@ uppercase: uppercase
550
582
  uppercaseletter: uppercase_letter
551
583
  vai: vai
552
584
  variationselector: variation_selector
585
+ wancho: wancho
553
586
  warangciti: warang_citi
554
587
  whitespace: white_space
555
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -44,14 +45,17 @@ dep: deprecated
44
45
  deva: devanagari
45
46
  di: default_ignorable_code_point
46
47
  dia: diacritic
48
+ dogr: dogra
47
49
  dsrt: deseret
48
50
  dupl: duployan
49
51
  egyp: egyptian_hieroglyphs
50
52
  elba: elbasan
53
+ elym: elymaic
51
54
  ethi: ethiopic
52
55
  ext: extender
53
56
  geor: georgian
54
57
  glag: glagolitic
58
+ gong: gunjala_gondi
55
59
  gonm: masaram_gondi
56
60
  goth: gothic
57
61
  gran: grantha
@@ -70,6 +74,7 @@ hex: hex_digit
70
74
  hira: hiragana
71
75
  hluw: anatolian_hieroglyphs
72
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
73
78
  hung: old_hungarian
74
79
  idc: id_continue
75
80
  ideo: ideographic
@@ -105,11 +110,13 @@ lyci: lycian
105
110
  lydi: lydian
106
111
  m: mark
107
112
  mahj: mahajani
113
+ maka: makasar
108
114
  mand: mandaic
109
115
  mani: manichaean
110
116
  marc: marchen
111
117
  mc: spacing_mark
112
118
  me: enclosing_mark
119
+ medf: medefaidrin
113
120
  mend: mende_kikakui
114
121
  merc: meroitic_cursive
115
122
  mero: meroitic_hieroglyphs
@@ -121,6 +128,7 @@ mtei: meetei_mayek
121
128
  mult: multani
122
129
  mymr: myanmar
123
130
  n: number
131
+ nand: nandinagari
124
132
  narb: old_north_arabian
125
133
  nbat: nabataean
126
134
  nchar: noncharacter_code_point
@@ -168,6 +176,7 @@ qaai: inherited
168
176
  qmark: quotation_mark
169
177
  ri: regional_indicator
170
178
  rjng: rejang
179
+ rohg: hanifi_rohingya
171
180
  runr: runic
172
181
  s: symbol
173
182
  samr: samaritan
@@ -184,6 +193,8 @@ sinh: sinhala
184
193
  sk: modifier_symbol
185
194
  sm: math_symbol
186
195
  so: other_symbol
196
+ sogd: sogdian
197
+ sogo: old_sogdian
187
198
  sora: sora_sompeng
188
199
  soyo: soyombo
189
200
  sterm: sentence_terminal
@@ -209,6 +220,7 @@ uideo: unified_ideograph
209
220
  vaii: vai
210
221
  vs: variation_selector
211
222
  wara: warang_citi
223
+ wcho: wancho
212
224
  wspace: white_space
213
225
  xidc: xid_continue
214
226
  xids: xid_start
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -82,7 +86,8 @@
82
86
  assertion_lookbehind = '?<=';
83
87
  assertion_nlookbehind = '?<!';
84
88
 
85
- group_options = '?' . [\-mixdau];
89
+ # try to treat every other group head as options group, like Ruby
90
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
91
 
87
92
  group_ref = [gk];
88
93
  group_name_char = (alnum | '_');
@@ -113,6 +118,8 @@
113
118
  curlies | parantheses | brackets |
114
119
  line_anchor | quantifier_greedy;
115
120
 
121
+ literal_delimiters = ']' | '}';
122
+
116
123
  ascii_print = ((0x20..0x7e) - meta_char);
117
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
118
125
 
@@ -135,41 +142,35 @@
135
142
  # Invalid sequence error, used from sequences, like escapes and sets
136
143
  action invalid_sequence_error {
137
144
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
145
+ validation_error(:sequence, 'sequence', text)
139
146
  }
140
147
 
141
148
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
149
+ action group_opened { self.group_depth = group_depth + 1 }
150
+ action group_closed { self.group_depth = group_depth - 1 }
151
+ action set_opened { self.set_depth = set_depth + 1 }
152
+ action set_closed { self.set_depth = set_depth - 1 }
144
153
 
145
154
  # Character set scanner, continues consuming characters until it meets the
146
155
  # closing bracket of the set.
147
156
  # --------------------------------------------------------------------------
148
157
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
158
+ set_close > (set_meta, 2) @set_closed {
153
159
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
160
+ if in_set?
158
161
  fret;
162
+ else
163
+ fgoto main;
159
164
  end
160
165
  };
161
166
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
167
+ '-]' @set_closed { # special case, emits two tokens
168
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ if in_set?
172
171
  fret;
172
+ else
173
+ fgoto main;
173
174
  end
174
175
  };
175
176
 
@@ -207,14 +208,12 @@
207
208
  fcall set_escape_sequence;
208
209
  };
209
210
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
211
+ set_open >(open_bracket, 1) >set_opened {
213
212
  emit(:set, :open, *text(data, ts, te))
214
213
  fcall character_set;
215
214
  };
216
215
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
216
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
217
  text = text(data, ts, te).first
219
218
 
220
219
  type = :posixclass
@@ -227,11 +226,11 @@
227
226
  emit(type, class_name.to_sym, text, ts, te)
228
227
  };
229
228
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
229
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
230
  emit(:set, :collation, *text(data, ts, te))
232
231
  };
233
232
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
233
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
234
  emit(:set, :equivalent, *text(data, ts, te))
236
235
  };
237
236
 
@@ -337,44 +336,24 @@
337
336
  };
338
337
 
339
338
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
339
+ emit_meta_control_sequence(data, ts, te, :control)
351
340
  fret;
352
341
  };
353
342
 
354
343
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
344
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
345
  fret;
367
346
  };
368
347
 
369
348
  char_type_char > (escaped_alpha, 2) {
370
349
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
351
  fcall char_type;
373
352
  };
374
353
 
375
354
  property_char > (escaped_alpha, 2) {
376
355
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
356
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
357
  fcall unicode_property;
379
358
  };
380
359
 
@@ -412,8 +391,7 @@
412
391
  };
413
392
 
414
393
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
394
+ if conditional_stack.last == group_depth
417
395
  emit(:conditional, :separator, *text(data, ts, te))
418
396
  else
419
397
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +420,16 @@
442
420
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
421
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
422
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
423
  end
449
424
  };
450
425
 
426
+ literal_delimiters {
427
+ append_literal(data, ts, te)
428
+ };
429
+
451
430
  # Character sets
452
431
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
432
+ set_open >set_opened {
457
433
  emit(:set, :open, *text(data, ts, te))
458
434
  fcall character_set;
459
435
  };
@@ -465,9 +441,7 @@
465
441
  conditional {
466
442
  text = text(data, ts, te).first
467
443
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
444
+ conditional_stack << group_depth
471
445
 
472
446
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
447
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +470,11 @@
496
470
  # (?imxdau-imx:subexp) option on/off for subexp
497
471
  # ------------------------------------------------------------------------
498
472
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
473
+ text = text(data, ts, te).first
474
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
+ end
477
+ emit_options(text, ts, te)
500
478
  };
501
479
 
502
480
  # Assertions
@@ -528,19 +506,15 @@
528
506
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
507
  when '(?~'; emit(:group, :absence, text, ts, te)
530
508
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
509
+ when /^\(\?(?:<>|'')/
510
+ validation_error(:group, 'named group', 'name is empty')
533
511
 
512
+ when /^\(\?<\w*>/
534
513
  emit(:group, :named_ab, text, ts, te)
535
514
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
515
+ when /^\(\?'\w*'/
539
516
  emit(:group, :named_sq, text, ts, te)
540
517
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
518
  end
545
519
  };
546
520
 
@@ -550,20 +524,13 @@
550
524
  };
551
525
 
552
526
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
527
+ if conditional_stack.last == group_depth + 1
557
528
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
529
+ emit(:conditional, :close, *text(data, ts, te))
562
530
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
531
+ if spacing_stack.length > 1 &&
532
+ spacing_stack.last[:depth] == group_depth + 1
565
533
  spacing_stack.pop
566
-
567
534
  self.free_spacing = spacing_stack.last[:free_spacing]
568
535
  end
569
536
 
@@ -576,11 +543,8 @@
576
543
  # ------------------------------------------------------------------------
577
544
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
545
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
546
+ when /^\\([gk])(<>|'')/ # angle brackets
547
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
548
 
585
549
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
550
  if $1 == 'k'
@@ -636,9 +600,6 @@
636
600
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
601
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
602
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
603
  end
643
604
  };
644
605
 
@@ -669,10 +630,15 @@
669
630
  end
670
631
  };
671
632
 
672
- quantifier_interval @err(premature_end_error) {
633
+ quantifier_interval {
673
634
  emit(:quantifier, :interval, *text(data, ts, te))
674
635
  };
675
636
 
637
+ # Catch unmatched curly braces as literals
638
+ range_open {
639
+ append_literal(data, ts, te)
640
+ };
641
+
676
642
  # Escaped sequences
677
643
  # ------------------------------------------------------------------------
678
644
  backslash > (backslashed, 1) {
@@ -786,7 +752,7 @@ class Regexp::Scanner
786
752
  input = input_object
787
753
  self.free_spacing = false
788
754
  end
789
-
755
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
756
 
791
757
  data = input.unpack("c*") if input.is_a?(String)
792
758
  eof = data.length
@@ -794,15 +760,9 @@ class Regexp::Scanner
794
760
  self.tokens = []
795
761
  self.block = block_given? ? block : nil
796
762
 
797
- self.in_group = false
763
+ self.set_depth = 0
798
764
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
765
+ self.conditional_stack = []
806
766
 
807
767
  %% write data;
808
768
  %% write init;
@@ -817,9 +777,9 @@ class Regexp::Scanner
817
777
  end
818
778
 
819
779
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
780
+ "[#{group_depth}]") if in_group?
821
781
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
782
+ "[#{set_depth}]") if in_set?
823
783
 
824
784
  # when the entire expression is a literal run
825
785
  emit_literal if literal
@@ -854,62 +814,15 @@ class Regexp::Scanner
854
814
 
855
815
  private
856
816
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
817
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
818
+ :group_depth, :set_depth, :conditional_stack
866
819
 
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
878
-
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
820
+ def in_group?
821
+ group_depth > 0
822
+ end
911
823
 
912
- p # return the new value of the data pointer
824
+ def in_set?
825
+ set_depth > 0
913
826
  end
914
827
 
915
828
  # Copy from ts to te from data as text
@@ -945,32 +858,39 @@ class Regexp::Scanner
945
858
  def emit_options(text, ts, te)
946
859
  token = nil
947
860
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
861
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
862
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
863
+ positive, negative, group_local = $1, $2, $3
950
864
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
865
+ if positive.include?('x')
866
+ self.free_spacing = true
867
+ end
954
868
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
869
+ # If the x appears in both, treat it like ruby does, the second cancels
870
+ # the first.
871
+ if negative && negative.include?('x')
872
+ self.free_spacing = false
873
+ end
960
874
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
875
+ if group_local
876
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
877
+ token = :options
878
+ else
879
+ # switch for parent group level
880
+ spacing_stack.last[:free_spacing] = free_spacing
881
+ token = :options_switch
969
882
  end
970
883
 
971
884
  emit(:group, token, text, ts, te)
972
885
  end
973
886
 
887
+ def emit_meta_control_sequence(data, ts, te, token)
888
+ if data.last < 0x00 || data.last > 0x7F
889
+ validation_error(:sequence, 'escape', token.to_s)
890
+ end
891
+ emit(:escape, token, *text(data, ts, te, 1))
892
+ end
893
+
974
894
  # Centralizes and unifies the handling of validation related
975
895
  # errors.
976
896
  def validation_error(type, what, reason)
@@ -981,21 +901,8 @@ class Regexp::Scanner
981
901
  error = InvalidBackrefError.new(what, reason)
982
902
  when :sequence
983
903
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
904
  end
987
905
 
988
906
  raise error # unless @@config.validation_ignore
989
907
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
908
  end # module Regexp::Scanner