regexp_parser 2.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +94 -6
  3. data/Gemfile +2 -1
  4. data/LICENSE +1 -1
  5. data/README.md +40 -30
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +1 -1
  8. data/lib/regexp_parser/expression/base.rb +75 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +1 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -2
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +2 -2
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
  18. data/lib/regexp_parser/expression/classes/group.rb +6 -6
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/root.rb +3 -6
  22. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -2
  23. data/lib/regexp_parser/expression/methods/construct.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
  25. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  26. data/lib/regexp_parser/expression/methods/tests.rb +10 -1
  27. data/lib/regexp_parser/expression/quantifier.rb +41 -23
  28. data/lib/regexp_parser/expression/sequence.rb +9 -24
  29. data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
  30. data/lib/regexp_parser/expression/shared.rb +85 -0
  31. data/lib/regexp_parser/expression/subexpression.rb +11 -8
  32. data/lib/regexp_parser/expression.rb +10 -132
  33. data/lib/regexp_parser/lexer.rb +8 -6
  34. data/lib/regexp_parser/parser.rb +21 -72
  35. data/lib/regexp_parser/scanner/properties/long.csv +622 -0
  36. data/lib/regexp_parser/scanner/properties/short.csv +246 -0
  37. data/lib/regexp_parser/scanner/property.rl +1 -1
  38. data/lib/regexp_parser/scanner/scanner.rl +48 -35
  39. data/lib/regexp_parser/scanner.rb +735 -801
  40. data/lib/regexp_parser/syntax/any.rb +2 -7
  41. data/lib/regexp_parser/syntax/base.rb +91 -66
  42. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  43. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  44. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  45. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  46. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  47. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  48. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  49. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  50. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  51. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  52. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  53. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  54. data/lib/regexp_parser/syntax/token/unicode_property.rb +717 -0
  55. data/lib/regexp_parser/syntax/token.rb +45 -0
  56. data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
  57. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  58. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  59. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  60. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  61. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  62. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  63. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  64. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  65. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  66. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  67. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  68. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  69. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  70. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  71. data/lib/regexp_parser/syntax/versions.rb +1 -1
  72. data/lib/regexp_parser/syntax.rb +1 -1
  73. data/lib/regexp_parser/token.rb +9 -20
  74. data/lib/regexp_parser/version.rb +1 -1
  75. data/lib/regexp_parser.rb +0 -2
  76. data/regexp_parser.gemspec +20 -22
  77. metadata +37 -166
  78. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  79. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  80. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  81. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  82. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  83. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  84. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  85. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  86. data/spec/expression/base_spec.rb +0 -104
  87. data/spec/expression/clone_spec.rb +0 -152
  88. data/spec/expression/conditional_spec.rb +0 -89
  89. data/spec/expression/free_space_spec.rb +0 -27
  90. data/spec/expression/methods/match_length_spec.rb +0 -161
  91. data/spec/expression/methods/match_spec.rb +0 -25
  92. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  93. data/spec/expression/methods/tests_spec.rb +0 -99
  94. data/spec/expression/methods/traverse_spec.rb +0 -161
  95. data/spec/expression/options_spec.rb +0 -128
  96. data/spec/expression/subexpression_spec.rb +0 -50
  97. data/spec/expression/to_h_spec.rb +0 -26
  98. data/spec/expression/to_s_spec.rb +0 -108
  99. data/spec/lexer/all_spec.rb +0 -22
  100. data/spec/lexer/conditionals_spec.rb +0 -53
  101. data/spec/lexer/delimiters_spec.rb +0 -68
  102. data/spec/lexer/escapes_spec.rb +0 -14
  103. data/spec/lexer/keep_spec.rb +0 -10
  104. data/spec/lexer/literals_spec.rb +0 -64
  105. data/spec/lexer/nesting_spec.rb +0 -99
  106. data/spec/lexer/refcalls_spec.rb +0 -60
  107. data/spec/parser/all_spec.rb +0 -43
  108. data/spec/parser/alternation_spec.rb +0 -88
  109. data/spec/parser/anchors_spec.rb +0 -17
  110. data/spec/parser/conditionals_spec.rb +0 -179
  111. data/spec/parser/errors_spec.rb +0 -30
  112. data/spec/parser/escapes_spec.rb +0 -121
  113. data/spec/parser/free_space_spec.rb +0 -130
  114. data/spec/parser/groups_spec.rb +0 -108
  115. data/spec/parser/keep_spec.rb +0 -6
  116. data/spec/parser/options_spec.rb +0 -28
  117. data/spec/parser/posix_classes_spec.rb +0 -8
  118. data/spec/parser/properties_spec.rb +0 -115
  119. data/spec/parser/quantifiers_spec.rb +0 -68
  120. data/spec/parser/refcalls_spec.rb +0 -117
  121. data/spec/parser/set/intersections_spec.rb +0 -127
  122. data/spec/parser/set/ranges_spec.rb +0 -111
  123. data/spec/parser/sets_spec.rb +0 -178
  124. data/spec/parser/types_spec.rb +0 -18
  125. data/spec/scanner/all_spec.rb +0 -18
  126. data/spec/scanner/anchors_spec.rb +0 -21
  127. data/spec/scanner/conditionals_spec.rb +0 -128
  128. data/spec/scanner/delimiters_spec.rb +0 -52
  129. data/spec/scanner/errors_spec.rb +0 -67
  130. data/spec/scanner/escapes_spec.rb +0 -64
  131. data/spec/scanner/free_space_spec.rb +0 -165
  132. data/spec/scanner/groups_spec.rb +0 -61
  133. data/spec/scanner/keep_spec.rb +0 -10
  134. data/spec/scanner/literals_spec.rb +0 -39
  135. data/spec/scanner/meta_spec.rb +0 -18
  136. data/spec/scanner/options_spec.rb +0 -36
  137. data/spec/scanner/properties_spec.rb +0 -64
  138. data/spec/scanner/quantifiers_spec.rb +0 -25
  139. data/spec/scanner/refcalls_spec.rb +0 -55
  140. data/spec/scanner/sets_spec.rb +0 -151
  141. data/spec/scanner/types_spec.rb +0 -14
  142. data/spec/spec_helper.rb +0 -16
  143. data/spec/support/runner.rb +0 -42
  144. data/spec/support/shared_examples.rb +0 -77
  145. data/spec/support/warning_extractor.rb +0 -60
  146. data/spec/syntax/syntax_spec.rb +0 -48
  147. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  148. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  149. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  150. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  151. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  152. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  153. data/spec/syntax/versions/aliases_spec.rb +0 -37
  154. data/spec/token/token_spec.rb +0 -85
@@ -0,0 +1,246 @@
1
+ # THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
2
+ adlm,adlam
3
+ aghb,caucasian_albanian
4
+ ahex,ascii_hex_digit
5
+ arab,arabic
6
+ armi,imperial_aramaic
7
+ armn,armenian
8
+ avst,avestan
9
+ bali,balinese
10
+ bamu,bamum
11
+ bass,bassa_vah
12
+ batk,batak
13
+ beng,bengali
14
+ bhks,bhaiksuki
15
+ bidic,bidi_control
16
+ bopo,bopomofo
17
+ brah,brahmi
18
+ brai,braille
19
+ bugi,buginese
20
+ buhd,buhid
21
+ c,other
22
+ cakm,chakma
23
+ cans,canadian_aboriginal
24
+ cari,carian
25
+ cc,control
26
+ cf,format
27
+ cher,cherokee
28
+ chrs,chorasmian
29
+ ci,case_ignorable
30
+ cn,unassigned
31
+ co,private_use
32
+ combiningmark,mark
33
+ copt,coptic
34
+ cpmn,cypro_minoan
35
+ cprt,cypriot
36
+ cs,surrogate
37
+ cwcf,changes_when_casefolded
38
+ cwcm,changes_when_casemapped
39
+ cwl,changes_when_lowercased
40
+ cwt,changes_when_titlecased
41
+ cwu,changes_when_uppercased
42
+ cyrl,cyrillic
43
+ dep,deprecated
44
+ deva,devanagari
45
+ di,default_ignorable_code_point
46
+ dia,diacritic
47
+ diak,dives_akuru
48
+ dogr,dogra
49
+ dsrt,deseret
50
+ dupl,duployan
51
+ ebase,emoji_modifier_base
52
+ ecomp,emoji_component
53
+ egyp,egyptian_hieroglyphs
54
+ elba,elbasan
55
+ elym,elymaic
56
+ emod,emoji_modifier
57
+ epres,emoji_presentation
58
+ ethi,ethiopic
59
+ ext,extender
60
+ geor,georgian
61
+ glag,glagolitic
62
+ gong,gunjala_gondi
63
+ gonm,masaram_gondi
64
+ goth,gothic
65
+ gran,grantha
66
+ grbase,grapheme_base
67
+ grek,greek
68
+ grext,grapheme_extend
69
+ grlink,grapheme_link
70
+ gujr,gujarati
71
+ guru,gurmukhi
72
+ hang,hangul
73
+ hani,han
74
+ hano,hanunoo
75
+ hatr,hatran
76
+ hebr,hebrew
77
+ hex,hex_digit
78
+ hira,hiragana
79
+ hluw,anatolian_hieroglyphs
80
+ hmng,pahawh_hmong
81
+ hmnp,nyiakeng_puachue_hmong
82
+ hung,old_hungarian
83
+ idc,id_continue
84
+ ideo,ideographic
85
+ ids,id_start
86
+ idsb,ids_binary_operator
87
+ idst,ids_trinary_operator
88
+ ital,old_italic
89
+ java,javanese
90
+ joinc,join_control
91
+ kali,kayah_li
92
+ kana,katakana
93
+ khar,kharoshthi
94
+ khmr,khmer
95
+ khoj,khojki
96
+ kits,khitan_small_script
97
+ knda,kannada
98
+ kthi,kaithi
99
+ l,letter
100
+ lana,tai_tham
101
+ laoo,lao
102
+ latn,latin
103
+ lc,cased_letter
104
+ lepc,lepcha
105
+ limb,limbu
106
+ lina,linear_a
107
+ linb,linear_b
108
+ ll,lowercase_letter
109
+ lm,modifier_letter
110
+ lo,other_letter
111
+ loe,logical_order_exception
112
+ lt,titlecase_letter
113
+ lu,uppercase_letter
114
+ lyci,lycian
115
+ lydi,lydian
116
+ m,mark
117
+ mahj,mahajani
118
+ maka,makasar
119
+ mand,mandaic
120
+ mani,manichaean
121
+ marc,marchen
122
+ mc,spacing_mark
123
+ me,enclosing_mark
124
+ medf,medefaidrin
125
+ mend,mende_kikakui
126
+ merc,meroitic_cursive
127
+ mero,meroitic_hieroglyphs
128
+ mlym,malayalam
129
+ mn,nonspacing_mark
130
+ mong,mongolian
131
+ mroo,mro
132
+ mtei,meetei_mayek
133
+ mult,multani
134
+ mymr,myanmar
135
+ n,number
136
+ nand,nandinagari
137
+ narb,old_north_arabian
138
+ nbat,nabataean
139
+ nchar,noncharacter_code_point
140
+ nd,decimal_number
141
+ nkoo,nko
142
+ nl,letter_number
143
+ no,other_number
144
+ nshu,nushu
145
+ oalpha,other_alphabetic
146
+ odi,other_default_ignorable_code_point
147
+ ogam,ogham
148
+ ogrext,other_grapheme_extend
149
+ oidc,other_id_continue
150
+ oids,other_id_start
151
+ olck,ol_chiki
152
+ olower,other_lowercase
153
+ omath,other_math
154
+ orkh,old_turkic
155
+ orya,oriya
156
+ osge,osage
157
+ osma,osmanya
158
+ ougr,old_uyghur
159
+ oupper,other_uppercase
160
+ p,punctuation
161
+ palm,palmyrene
162
+ patsyn,pattern_syntax
163
+ patws,pattern_white_space
164
+ pauc,pau_cin_hau
165
+ pc,connector_punctuation
166
+ pcm,prepended_concatenation_mark
167
+ pd,dash_punctuation
168
+ pe,close_punctuation
169
+ perm,old_permic
170
+ pf,final_punctuation
171
+ phag,phags_pa
172
+ phli,inscriptional_pahlavi
173
+ phlp,psalter_pahlavi
174
+ phnx,phoenician
175
+ pi,initial_punctuation
176
+ plrd,miao
177
+ po,other_punctuation
178
+ prti,inscriptional_parthian
179
+ ps,open_punctuation
180
+ qaac,coptic
181
+ qaai,inherited
182
+ qmark,quotation_mark
183
+ ri,regional_indicator
184
+ rjng,rejang
185
+ rohg,hanifi_rohingya
186
+ runr,runic
187
+ s,symbol
188
+ samr,samaritan
189
+ sarb,old_south_arabian
190
+ saur,saurashtra
191
+ sc,currency_symbol
192
+ sd,soft_dotted
193
+ sgnw,signwriting
194
+ shaw,shavian
195
+ shrd,sharada
196
+ sidd,siddham
197
+ sind,khudawadi
198
+ sinh,sinhala
199
+ sk,modifier_symbol
200
+ sm,math_symbol
201
+ so,other_symbol
202
+ sogd,sogdian
203
+ sogo,old_sogdian
204
+ sora,sora_sompeng
205
+ soyo,soyombo
206
+ sterm,sentence_terminal
207
+ sund,sundanese
208
+ sylo,syloti_nagri
209
+ syrc,syriac
210
+ tagb,tagbanwa
211
+ takr,takri
212
+ tale,tai_le
213
+ talu,new_tai_lue
214
+ taml,tamil
215
+ tang,tangut
216
+ tavt,tai_viet
217
+ telu,telugu
218
+ term,terminal_punctuation
219
+ tfng,tifinagh
220
+ tglg,tagalog
221
+ thaa,thaana
222
+ tibt,tibetan
223
+ tirh,tirhuta
224
+ tnsa,tangsa
225
+ ugar,ugaritic
226
+ uideo,unified_ideograph
227
+ vaii,vai
228
+ vith,vithkuqi
229
+ vs,variation_selector
230
+ wara,warang_citi
231
+ wcho,wancho
232
+ wspace,white_space
233
+ xidc,xid_continue
234
+ xids,xid_start
235
+ xpeo,old_persian
236
+ xsux,cuneiform
237
+ yezi,yezidi
238
+ yiii,yi
239
+ z,separator
240
+ zanb,zanabazar_square
241
+ zinh,inherited
242
+ zl,line_separator
243
+ zp,paragraph_separator
244
+ zs,space_separator
245
+ zyyy,common
246
+ zzzz,unknown
@@ -20,7 +20,7 @@
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ validation_error(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -28,13 +28,7 @@
28
28
 
29
29
  comment = ('#' . [^\n]* . '\n'?);
30
30
 
31
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
32
- 'cntrl' | 'digit' | 'graph' |
33
- 'lower' | 'print' | 'punct' |
34
- 'space' | 'upper' | 'xdigit' |
35
- 'word' | 'ascii';
36
-
37
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
 
40
34
  # these are not supported in ruby at the moment
@@ -74,8 +68,7 @@
74
68
  quantity_maximum = ',' . (digit+);
75
69
  quantity_range = (digit+) . ',' . (digit+);
76
70
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
77
- quantity_maximum | quantity_range ) . range_close .
78
- quantifier_mode?;
71
+ quantity_maximum | quantity_range ) . range_close;
79
72
 
80
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
81
74
  quantifier_possessive | quantifier_interval;
@@ -223,24 +216,28 @@
223
216
  fcall character_set;
224
217
  };
225
218
 
226
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
219
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
227
220
  text = copy(data, ts, te)
228
221
 
229
222
  type = :posixclass
230
223
  class_name = text[2..-3]
231
- if class_name[0].chr == '^'
224
+ if class_name[0] == '^'
232
225
  class_name = class_name[1..-1]
233
226
  type = :nonposixclass
234
227
  end
235
228
 
229
+ unless self.class.posix_classes.include?(class_name)
230
+ validation_error(:posix_class, text)
231
+ end
232
+
236
233
  emit(type, class_name.to_sym, text)
237
234
  };
238
235
 
239
236
  # These are not supported in ruby at the moment. Enable them if they are.
240
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
237
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
238
  # emit(:set, :collation, copy(data, ts, te))
242
239
  # };
243
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
240
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
241
  # emit(:set, :equivalent, copy(data, ts, te))
245
242
  # };
246
243
 
@@ -323,7 +320,7 @@
323
320
 
324
321
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
325
322
  text = copy(data, ts-1, te)
326
- if text[2].chr == '{'
323
+ if text[2] == '{'
327
324
  emit(:escape, :codepoint_list, text)
328
325
  else
329
326
  emit(:escape, :codepoint, text)
@@ -419,12 +416,12 @@
419
416
 
420
417
  backslash . anchor_char > (backslashed, 3) {
421
418
  case text = copy(data, ts, te)
422
- when '\\A'; emit(:anchor, :bos, text)
423
- when '\\z'; emit(:anchor, :eos, text)
424
- when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
- when '\\b'; emit(:anchor, :word_boundary, text)
426
- when '\\B'; emit(:anchor, :nonword_boundary, text)
427
- when '\\G'; emit(:anchor, :match_start, text)
419
+ when '\A'; emit(:anchor, :bos, text)
420
+ when '\z'; emit(:anchor, :eos, text)
421
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\b'; emit(:anchor, :word_boundary, text)
423
+ when '\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\G'; emit(:anchor, :match_start, text)
428
425
  end
429
426
  };
430
427
 
@@ -477,7 +474,7 @@
477
474
  group_open . group_options >group_opened {
478
475
  text = copy(data, ts, te)
479
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
480
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
477
+ validation_error(:group_option, $1 || "-#{$2}", text)
481
478
  end
482
479
  emit_options(text)
483
480
  };
@@ -605,7 +602,7 @@
605
602
  end
606
603
  };
607
604
 
608
- quantifier_interval {
605
+ quantifier_interval {
609
606
  emit(:quantifier, :interval, copy(data, ts, te))
610
607
  };
611
608
 
@@ -686,6 +683,7 @@ class Regexp::Scanner
686
683
  end
687
684
 
688
685
  # Invalid groupOption. Used for inline options.
686
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
689
687
  class InvalidGroupOption < ValidationError
690
688
  def initialize(option, text)
691
689
  super "Invalid group option #{option} in #{text}"
@@ -706,6 +704,13 @@ class Regexp::Scanner
706
704
  end
707
705
  end
708
706
 
707
+ # The POSIX class name was not recognized by the scanner.
708
+ class UnknownPosixClassError < ValidationError
709
+ def initialize(text)
710
+ super "Unknown POSIX class #{text}"
711
+ end
712
+ end
713
+
709
714
  # Scans the given regular expression text, or Regexp object and collects the
710
715
  # emitted token into an array that gets returned at the end. If a block is
711
716
  # given, it gets called for each emitted token.
@@ -759,14 +764,21 @@ class Regexp::Scanner
759
764
  end
760
765
 
761
766
  # lazy-load property maps when first needed
762
- require 'yaml'
763
-
764
767
  def self.short_prop_map
765
- @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
768
+ @short_prop_map ||= parse_prop_map('short')
766
769
  end
767
770
 
768
771
  def self.long_prop_map
769
- @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
772
+ @long_prop_map ||= parse_prop_map('long')
773
+ end
774
+
775
+ def self.parse_prop_map(name)
776
+ File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
777
+ end
778
+
779
+ def self.posix_classes
780
+ %w[alnum alpha ascii blank cntrl digit graph
781
+ lower print punct space upper word xdigit]
770
782
  end
771
783
 
772
784
  # Emits an array with the details of the scanned pattern
@@ -871,15 +883,16 @@ class Regexp::Scanner
871
883
 
872
884
  # Centralizes and unifies the handling of validation related
873
885
  # errors.
874
- def validation_error(type, what, reason)
875
- case type
876
- when :group
877
- error = InvalidGroupError.new(what, reason)
878
- when :backref
879
- error = InvalidBackrefError.new(what, reason)
880
- when :sequence
881
- error = InvalidSequenceError.new(what, reason)
882
- end
886
+ def validation_error(type, what, reason = nil)
887
+ error =
888
+ case type
889
+ when :backref then InvalidBackrefError.new(what, reason)
890
+ when :group then InvalidGroupError.new(what, reason)
891
+ when :group_option then InvalidGroupOption.new(what, reason)
892
+ when :posix_class then UnknownPosixClassError.new(what)
893
+ when :property then UnknownUnicodePropertyError.new(what)
894
+ when :sequence then InvalidSequenceError.new(what, reason)
895
+ end
883
896
 
884
897
  raise error # unless @@config.validation_ignore
885
898
  end