regexp_parser 1.7.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/Gemfile +9 -3
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +11 -12
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  26. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  27. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  28. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  29. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  30. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  31. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  32. data/lib/regexp_parser/expression/sequence.rb +11 -47
  33. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  34. data/lib/regexp_parser/expression/shared.rb +111 -0
  35. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  36. data/lib/regexp_parser/expression.rb +15 -141
  37. data/lib/regexp_parser/lexer.rb +83 -41
  38. data/lib/regexp_parser/parser.rb +372 -429
  39. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  40. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  41. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  42. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  43. data/lib/regexp_parser/scanner/properties/long.csv +651 -0
  44. data/lib/regexp_parser/scanner/properties/short.csv +249 -0
  45. data/lib/regexp_parser/scanner/property.rl +4 -4
  46. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  47. data/lib/regexp_parser/scanner.rb +1423 -1674
  48. data/lib/regexp_parser/syntax/any.rb +2 -7
  49. data/lib/regexp_parser/syntax/base.rb +92 -67
  50. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  51. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  52. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  53. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  54. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  55. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  56. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  57. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  58. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  59. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  60. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  61. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  62. data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
  63. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  64. data/lib/regexp_parser/syntax/token.rb +45 -0
  65. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  66. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  67. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  68. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  69. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  70. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  74. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions.rb +3 -1
  81. data/lib/regexp_parser/syntax.rb +8 -6
  82. data/lib/regexp_parser/token.rb +9 -20
  83. data/lib/regexp_parser/version.rb +1 -1
  84. data/lib/regexp_parser.rb +0 -2
  85. data/regexp_parser.gemspec +19 -23
  86. metadata +53 -171
  87. data/CHANGELOG.md +0 -349
  88. data/README.md +0 -470
  89. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  90. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  91. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  92. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  93. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  94. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  95. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  96. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  97. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  98. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  99. data/spec/expression/base_spec.rb +0 -94
  100. data/spec/expression/clone_spec.rb +0 -120
  101. data/spec/expression/conditional_spec.rb +0 -89
  102. data/spec/expression/free_space_spec.rb +0 -27
  103. data/spec/expression/methods/match_length_spec.rb +0 -161
  104. data/spec/expression/methods/match_spec.rb +0 -25
  105. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  106. data/spec/expression/methods/tests_spec.rb +0 -99
  107. data/spec/expression/methods/traverse_spec.rb +0 -161
  108. data/spec/expression/options_spec.rb +0 -128
  109. data/spec/expression/root_spec.rb +0 -9
  110. data/spec/expression/sequence_spec.rb +0 -9
  111. data/spec/expression/subexpression_spec.rb +0 -50
  112. data/spec/expression/to_h_spec.rb +0 -26
  113. data/spec/expression/to_s_spec.rb +0 -100
  114. data/spec/lexer/all_spec.rb +0 -22
  115. data/spec/lexer/conditionals_spec.rb +0 -53
  116. data/spec/lexer/escapes_spec.rb +0 -14
  117. data/spec/lexer/keep_spec.rb +0 -10
  118. data/spec/lexer/literals_spec.rb +0 -89
  119. data/spec/lexer/nesting_spec.rb +0 -99
  120. data/spec/lexer/refcalls_spec.rb +0 -55
  121. data/spec/parser/all_spec.rb +0 -43
  122. data/spec/parser/alternation_spec.rb +0 -88
  123. data/spec/parser/anchors_spec.rb +0 -17
  124. data/spec/parser/conditionals_spec.rb +0 -179
  125. data/spec/parser/errors_spec.rb +0 -30
  126. data/spec/parser/escapes_spec.rb +0 -121
  127. data/spec/parser/free_space_spec.rb +0 -130
  128. data/spec/parser/groups_spec.rb +0 -108
  129. data/spec/parser/keep_spec.rb +0 -6
  130. data/spec/parser/posix_classes_spec.rb +0 -8
  131. data/spec/parser/properties_spec.rb +0 -115
  132. data/spec/parser/quantifiers_spec.rb +0 -51
  133. data/spec/parser/refcalls_spec.rb +0 -112
  134. data/spec/parser/set/intersections_spec.rb +0 -127
  135. data/spec/parser/set/ranges_spec.rb +0 -111
  136. data/spec/parser/sets_spec.rb +0 -178
  137. data/spec/parser/types_spec.rb +0 -18
  138. data/spec/scanner/all_spec.rb +0 -18
  139. data/spec/scanner/anchors_spec.rb +0 -21
  140. data/spec/scanner/conditionals_spec.rb +0 -128
  141. data/spec/scanner/errors_spec.rb +0 -68
  142. data/spec/scanner/escapes_spec.rb +0 -53
  143. data/spec/scanner/free_space_spec.rb +0 -133
  144. data/spec/scanner/groups_spec.rb +0 -52
  145. data/spec/scanner/keep_spec.rb +0 -10
  146. data/spec/scanner/literals_spec.rb +0 -49
  147. data/spec/scanner/meta_spec.rb +0 -18
  148. data/spec/scanner/properties_spec.rb +0 -64
  149. data/spec/scanner/quantifiers_spec.rb +0 -20
  150. data/spec/scanner/refcalls_spec.rb +0 -36
  151. data/spec/scanner/sets_spec.rb +0 -102
  152. data/spec/scanner/types_spec.rb +0 -14
  153. data/spec/spec_helper.rb +0 -15
  154. data/spec/support/runner.rb +0 -42
  155. data/spec/support/shared_examples.rb +0 -77
  156. data/spec/support/warning_extractor.rb +0 -60
  157. data/spec/syntax/syntax_spec.rb +0 -48
  158. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  159. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  160. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  161. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  162. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  163. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  164. data/spec/syntax/versions/aliases_spec.rb +0 -37
  165. data/spec/token/token_spec.rb +0 -85
  166. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -0,0 +1,249 @@
1
+ # THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
2
+ adlm,adlam
3
+ aghb,caucasian_albanian
4
+ ahex,ascii_hex_digit
5
+ arab,arabic
6
+ armi,imperial_aramaic
7
+ armn,armenian
8
+ avst,avestan
9
+ bali,balinese
10
+ bamu,bamum
11
+ bass,bassa_vah
12
+ batk,batak
13
+ beng,bengali
14
+ bhks,bhaiksuki
15
+ bidic,bidi_control
16
+ bopo,bopomofo
17
+ brah,brahmi
18
+ brai,braille
19
+ bugi,buginese
20
+ buhd,buhid
21
+ c,other
22
+ cakm,chakma
23
+ cans,canadian_aboriginal
24
+ cari,carian
25
+ cc,control
26
+ cf,format
27
+ cher,cherokee
28
+ chrs,chorasmian
29
+ ci,case_ignorable
30
+ cn,unassigned
31
+ co,private_use
32
+ combiningmark,mark
33
+ copt,coptic
34
+ cpmn,cypro_minoan
35
+ cprt,cypriot
36
+ cs,surrogate
37
+ cwcf,changes_when_casefolded
38
+ cwcm,changes_when_casemapped
39
+ cwl,changes_when_lowercased
40
+ cwt,changes_when_titlecased
41
+ cwu,changes_when_uppercased
42
+ cyrl,cyrillic
43
+ dep,deprecated
44
+ deva,devanagari
45
+ di,default_ignorable_code_point
46
+ dia,diacritic
47
+ diak,dives_akuru
48
+ dogr,dogra
49
+ dsrt,deseret
50
+ dupl,duployan
51
+ ebase,emoji_modifier_base
52
+ ecomp,emoji_component
53
+ egyp,egyptian_hieroglyphs
54
+ elba,elbasan
55
+ elym,elymaic
56
+ emod,emoji_modifier
57
+ epres,emoji_presentation
58
+ ethi,ethiopic
59
+ ext,extender
60
+ extpict,extended_pictographic
61
+ geor,georgian
62
+ glag,glagolitic
63
+ gong,gunjala_gondi
64
+ gonm,masaram_gondi
65
+ goth,gothic
66
+ gran,grantha
67
+ grbase,grapheme_base
68
+ grek,greek
69
+ grext,grapheme_extend
70
+ grlink,grapheme_link
71
+ gujr,gujarati
72
+ guru,gurmukhi
73
+ hang,hangul
74
+ hani,han
75
+ hano,hanunoo
76
+ hatr,hatran
77
+ hebr,hebrew
78
+ hex,hex_digit
79
+ hira,hiragana
80
+ hluw,anatolian_hieroglyphs
81
+ hmng,pahawh_hmong
82
+ hmnp,nyiakeng_puachue_hmong
83
+ hung,old_hungarian
84
+ idc,id_continue
85
+ ideo,ideographic
86
+ ids,id_start
87
+ idsb,ids_binary_operator
88
+ idst,ids_trinary_operator
89
+ idsu,ids_unary_operator
90
+ ital,old_italic
91
+ java,javanese
92
+ joinc,join_control
93
+ kali,kayah_li
94
+ kana,katakana
95
+ khar,kharoshthi
96
+ khmr,khmer
97
+ khoj,khojki
98
+ kits,khitan_small_script
99
+ knda,kannada
100
+ kthi,kaithi
101
+ l,letter
102
+ lana,tai_tham
103
+ laoo,lao
104
+ latn,latin
105
+ lc,cased_letter
106
+ lepc,lepcha
107
+ limb,limbu
108
+ lina,linear_a
109
+ linb,linear_b
110
+ ll,lowercase_letter
111
+ lm,modifier_letter
112
+ lo,other_letter
113
+ loe,logical_order_exception
114
+ lt,titlecase_letter
115
+ lu,uppercase_letter
116
+ lyci,lycian
117
+ lydi,lydian
118
+ m,mark
119
+ mahj,mahajani
120
+ maka,makasar
121
+ mand,mandaic
122
+ mani,manichaean
123
+ marc,marchen
124
+ mc,spacing_mark
125
+ me,enclosing_mark
126
+ medf,medefaidrin
127
+ mend,mende_kikakui
128
+ merc,meroitic_cursive
129
+ mero,meroitic_hieroglyphs
130
+ mlym,malayalam
131
+ mn,nonspacing_mark
132
+ mong,mongolian
133
+ mroo,mro
134
+ mtei,meetei_mayek
135
+ mult,multani
136
+ mymr,myanmar
137
+ n,number
138
+ nagm,nag_mundari
139
+ nand,nandinagari
140
+ narb,old_north_arabian
141
+ nbat,nabataean
142
+ nchar,noncharacter_code_point
143
+ nd,decimal_number
144
+ nkoo,nko
145
+ nl,letter_number
146
+ no,other_number
147
+ nshu,nushu
148
+ oalpha,other_alphabetic
149
+ odi,other_default_ignorable_code_point
150
+ ogam,ogham
151
+ ogrext,other_grapheme_extend
152
+ oidc,other_id_continue
153
+ oids,other_id_start
154
+ olck,ol_chiki
155
+ olower,other_lowercase
156
+ omath,other_math
157
+ orkh,old_turkic
158
+ orya,oriya
159
+ osge,osage
160
+ osma,osmanya
161
+ ougr,old_uyghur
162
+ oupper,other_uppercase
163
+ p,punctuation
164
+ palm,palmyrene
165
+ patsyn,pattern_syntax
166
+ patws,pattern_white_space
167
+ pauc,pau_cin_hau
168
+ pc,connector_punctuation
169
+ pcm,prepended_concatenation_mark
170
+ pd,dash_punctuation
171
+ pe,close_punctuation
172
+ perm,old_permic
173
+ pf,final_punctuation
174
+ phag,phags_pa
175
+ phli,inscriptional_pahlavi
176
+ phlp,psalter_pahlavi
177
+ phnx,phoenician
178
+ pi,initial_punctuation
179
+ plrd,miao
180
+ po,other_punctuation
181
+ prti,inscriptional_parthian
182
+ ps,open_punctuation
183
+ qaac,coptic
184
+ qaai,inherited
185
+ qmark,quotation_mark
186
+ ri,regional_indicator
187
+ rjng,rejang
188
+ rohg,hanifi_rohingya
189
+ runr,runic
190
+ s,symbol
191
+ samr,samaritan
192
+ sarb,old_south_arabian
193
+ saur,saurashtra
194
+ sc,currency_symbol
195
+ sd,soft_dotted
196
+ sgnw,signwriting
197
+ shaw,shavian
198
+ shrd,sharada
199
+ sidd,siddham
200
+ sind,khudawadi
201
+ sinh,sinhala
202
+ sk,modifier_symbol
203
+ sm,math_symbol
204
+ so,other_symbol
205
+ sogd,sogdian
206
+ sogo,old_sogdian
207
+ sora,sora_sompeng
208
+ soyo,soyombo
209
+ sterm,sentence_terminal
210
+ sund,sundanese
211
+ sylo,syloti_nagri
212
+ syrc,syriac
213
+ tagb,tagbanwa
214
+ takr,takri
215
+ tale,tai_le
216
+ talu,new_tai_lue
217
+ taml,tamil
218
+ tang,tangut
219
+ tavt,tai_viet
220
+ telu,telugu
221
+ term,terminal_punctuation
222
+ tfng,tifinagh
223
+ tglg,tagalog
224
+ thaa,thaana
225
+ tibt,tibetan
226
+ tirh,tirhuta
227
+ tnsa,tangsa
228
+ ugar,ugaritic
229
+ uideo,unified_ideograph
230
+ vaii,vai
231
+ vith,vithkuqi
232
+ vs,variation_selector
233
+ wara,warang_citi
234
+ wcho,wancho
235
+ wspace,white_space
236
+ xidc,xid_continue
237
+ xids,xid_start
238
+ xpeo,old_persian
239
+ xsux,cuneiform
240
+ yezi,yezidi
241
+ yiii,yi
242
+ z,separator
243
+ zanb,zanabazar_square
244
+ zinh,inherited
245
+ zl,line_separator
246
+ zp,paragraph_separator
247
+ zs,space_separator
248
+ zyyy,common
249
+ zzzz,unknown
@@ -14,15 +14,15 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
- name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
20
+ name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ raise ValidationError.for(:property, name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };