regexp_parser 1.7.0 → 2.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +9 -3
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +11 -12
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  26. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  27. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  28. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  29. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  30. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  31. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  32. data/lib/regexp_parser/expression/sequence.rb +11 -47
  33. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  34. data/lib/regexp_parser/expression/shared.rb +111 -0
  35. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  36. data/lib/regexp_parser/expression.rb +15 -141
  37. data/lib/regexp_parser/lexer.rb +83 -41
  38. data/lib/regexp_parser/parser.rb +372 -429
  39. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  40. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  41. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  42. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  43. data/lib/regexp_parser/scanner/properties/long.csv +651 -0
  44. data/lib/regexp_parser/scanner/properties/short.csv +249 -0
  45. data/lib/regexp_parser/scanner/property.rl +4 -4
  46. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  47. data/lib/regexp_parser/scanner.rb +1423 -1674
  48. data/lib/regexp_parser/syntax/any.rb +2 -7
  49. data/lib/regexp_parser/syntax/base.rb +92 -67
  50. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  51. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  52. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  53. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  54. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  55. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  56. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  57. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  58. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  59. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  60. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  61. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  62. data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
  63. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  64. data/lib/regexp_parser/syntax/token.rb +45 -0
  65. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  66. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  67. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  68. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  69. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  70. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  74. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions.rb +3 -1
  81. data/lib/regexp_parser/syntax.rb +8 -6
  82. data/lib/regexp_parser/token.rb +9 -20
  83. data/lib/regexp_parser/version.rb +1 -1
  84. data/lib/regexp_parser.rb +0 -2
  85. data/regexp_parser.gemspec +19 -23
  86. metadata +53 -171
  87. data/CHANGELOG.md +0 -349
  88. data/README.md +0 -470
  89. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  90. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  91. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  92. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  93. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  94. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  95. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  96. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  97. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  98. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  99. data/spec/expression/base_spec.rb +0 -94
  100. data/spec/expression/clone_spec.rb +0 -120
  101. data/spec/expression/conditional_spec.rb +0 -89
  102. data/spec/expression/free_space_spec.rb +0 -27
  103. data/spec/expression/methods/match_length_spec.rb +0 -161
  104. data/spec/expression/methods/match_spec.rb +0 -25
  105. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  106. data/spec/expression/methods/tests_spec.rb +0 -99
  107. data/spec/expression/methods/traverse_spec.rb +0 -161
  108. data/spec/expression/options_spec.rb +0 -128
  109. data/spec/expression/root_spec.rb +0 -9
  110. data/spec/expression/sequence_spec.rb +0 -9
  111. data/spec/expression/subexpression_spec.rb +0 -50
  112. data/spec/expression/to_h_spec.rb +0 -26
  113. data/spec/expression/to_s_spec.rb +0 -100
  114. data/spec/lexer/all_spec.rb +0 -22
  115. data/spec/lexer/conditionals_spec.rb +0 -53
  116. data/spec/lexer/escapes_spec.rb +0 -14
  117. data/spec/lexer/keep_spec.rb +0 -10
  118. data/spec/lexer/literals_spec.rb +0 -89
  119. data/spec/lexer/nesting_spec.rb +0 -99
  120. data/spec/lexer/refcalls_spec.rb +0 -55
  121. data/spec/parser/all_spec.rb +0 -43
  122. data/spec/parser/alternation_spec.rb +0 -88
  123. data/spec/parser/anchors_spec.rb +0 -17
  124. data/spec/parser/conditionals_spec.rb +0 -179
  125. data/spec/parser/errors_spec.rb +0 -30
  126. data/spec/parser/escapes_spec.rb +0 -121
  127. data/spec/parser/free_space_spec.rb +0 -130
  128. data/spec/parser/groups_spec.rb +0 -108
  129. data/spec/parser/keep_spec.rb +0 -6
  130. data/spec/parser/posix_classes_spec.rb +0 -8
  131. data/spec/parser/properties_spec.rb +0 -115
  132. data/spec/parser/quantifiers_spec.rb +0 -51
  133. data/spec/parser/refcalls_spec.rb +0 -112
  134. data/spec/parser/set/intersections_spec.rb +0 -127
  135. data/spec/parser/set/ranges_spec.rb +0 -111
  136. data/spec/parser/sets_spec.rb +0 -178
  137. data/spec/parser/types_spec.rb +0 -18
  138. data/spec/scanner/all_spec.rb +0 -18
  139. data/spec/scanner/anchors_spec.rb +0 -21
  140. data/spec/scanner/conditionals_spec.rb +0 -128
  141. data/spec/scanner/errors_spec.rb +0 -68
  142. data/spec/scanner/escapes_spec.rb +0 -53
  143. data/spec/scanner/free_space_spec.rb +0 -133
  144. data/spec/scanner/groups_spec.rb +0 -52
  145. data/spec/scanner/keep_spec.rb +0 -10
  146. data/spec/scanner/literals_spec.rb +0 -49
  147. data/spec/scanner/meta_spec.rb +0 -18
  148. data/spec/scanner/properties_spec.rb +0 -64
  149. data/spec/scanner/quantifiers_spec.rb +0 -20
  150. data/spec/scanner/refcalls_spec.rb +0 -36
  151. data/spec/scanner/sets_spec.rb +0 -102
  152. data/spec/scanner/types_spec.rb +0 -14
  153. data/spec/spec_helper.rb +0 -15
  154. data/spec/support/runner.rb +0 -42
  155. data/spec/support/shared_examples.rb +0 -77
  156. data/spec/support/warning_extractor.rb +0 -60
  157. data/spec/syntax/syntax_spec.rb +0 -48
  158. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  159. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  160. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  161. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  162. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  163. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  164. data/spec/syntax/versions/aliases_spec.rb +0 -37
  165. data/spec/token/token_spec.rb +0 -85
  166. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -0,0 +1,249 @@
1
+ # THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
2
+ adlm,adlam
3
+ aghb,caucasian_albanian
4
+ ahex,ascii_hex_digit
5
+ arab,arabic
6
+ armi,imperial_aramaic
7
+ armn,armenian
8
+ avst,avestan
9
+ bali,balinese
10
+ bamu,bamum
11
+ bass,bassa_vah
12
+ batk,batak
13
+ beng,bengali
14
+ bhks,bhaiksuki
15
+ bidic,bidi_control
16
+ bopo,bopomofo
17
+ brah,brahmi
18
+ brai,braille
19
+ bugi,buginese
20
+ buhd,buhid
21
+ c,other
22
+ cakm,chakma
23
+ cans,canadian_aboriginal
24
+ cari,carian
25
+ cc,control
26
+ cf,format
27
+ cher,cherokee
28
+ chrs,chorasmian
29
+ ci,case_ignorable
30
+ cn,unassigned
31
+ co,private_use
32
+ combiningmark,mark
33
+ copt,coptic
34
+ cpmn,cypro_minoan
35
+ cprt,cypriot
36
+ cs,surrogate
37
+ cwcf,changes_when_casefolded
38
+ cwcm,changes_when_casemapped
39
+ cwl,changes_when_lowercased
40
+ cwt,changes_when_titlecased
41
+ cwu,changes_when_uppercased
42
+ cyrl,cyrillic
43
+ dep,deprecated
44
+ deva,devanagari
45
+ di,default_ignorable_code_point
46
+ dia,diacritic
47
+ diak,dives_akuru
48
+ dogr,dogra
49
+ dsrt,deseret
50
+ dupl,duployan
51
+ ebase,emoji_modifier_base
52
+ ecomp,emoji_component
53
+ egyp,egyptian_hieroglyphs
54
+ elba,elbasan
55
+ elym,elymaic
56
+ emod,emoji_modifier
57
+ epres,emoji_presentation
58
+ ethi,ethiopic
59
+ ext,extender
60
+ extpict,extended_pictographic
61
+ geor,georgian
62
+ glag,glagolitic
63
+ gong,gunjala_gondi
64
+ gonm,masaram_gondi
65
+ goth,gothic
66
+ gran,grantha
67
+ grbase,grapheme_base
68
+ grek,greek
69
+ grext,grapheme_extend
70
+ grlink,grapheme_link
71
+ gujr,gujarati
72
+ guru,gurmukhi
73
+ hang,hangul
74
+ hani,han
75
+ hano,hanunoo
76
+ hatr,hatran
77
+ hebr,hebrew
78
+ hex,hex_digit
79
+ hira,hiragana
80
+ hluw,anatolian_hieroglyphs
81
+ hmng,pahawh_hmong
82
+ hmnp,nyiakeng_puachue_hmong
83
+ hung,old_hungarian
84
+ idc,id_continue
85
+ ideo,ideographic
86
+ ids,id_start
87
+ idsb,ids_binary_operator
88
+ idst,ids_trinary_operator
89
+ idsu,ids_unary_operator
90
+ ital,old_italic
91
+ java,javanese
92
+ joinc,join_control
93
+ kali,kayah_li
94
+ kana,katakana
95
+ khar,kharoshthi
96
+ khmr,khmer
97
+ khoj,khojki
98
+ kits,khitan_small_script
99
+ knda,kannada
100
+ kthi,kaithi
101
+ l,letter
102
+ lana,tai_tham
103
+ laoo,lao
104
+ latn,latin
105
+ lc,cased_letter
106
+ lepc,lepcha
107
+ limb,limbu
108
+ lina,linear_a
109
+ linb,linear_b
110
+ ll,lowercase_letter
111
+ lm,modifier_letter
112
+ lo,other_letter
113
+ loe,logical_order_exception
114
+ lt,titlecase_letter
115
+ lu,uppercase_letter
116
+ lyci,lycian
117
+ lydi,lydian
118
+ m,mark
119
+ mahj,mahajani
120
+ maka,makasar
121
+ mand,mandaic
122
+ mani,manichaean
123
+ marc,marchen
124
+ mc,spacing_mark
125
+ me,enclosing_mark
126
+ medf,medefaidrin
127
+ mend,mende_kikakui
128
+ merc,meroitic_cursive
129
+ mero,meroitic_hieroglyphs
130
+ mlym,malayalam
131
+ mn,nonspacing_mark
132
+ mong,mongolian
133
+ mroo,mro
134
+ mtei,meetei_mayek
135
+ mult,multani
136
+ mymr,myanmar
137
+ n,number
138
+ nagm,nag_mundari
139
+ nand,nandinagari
140
+ narb,old_north_arabian
141
+ nbat,nabataean
142
+ nchar,noncharacter_code_point
143
+ nd,decimal_number
144
+ nkoo,nko
145
+ nl,letter_number
146
+ no,other_number
147
+ nshu,nushu
148
+ oalpha,other_alphabetic
149
+ odi,other_default_ignorable_code_point
150
+ ogam,ogham
151
+ ogrext,other_grapheme_extend
152
+ oidc,other_id_continue
153
+ oids,other_id_start
154
+ olck,ol_chiki
155
+ olower,other_lowercase
156
+ omath,other_math
157
+ orkh,old_turkic
158
+ orya,oriya
159
+ osge,osage
160
+ osma,osmanya
161
+ ougr,old_uyghur
162
+ oupper,other_uppercase
163
+ p,punctuation
164
+ palm,palmyrene
165
+ patsyn,pattern_syntax
166
+ patws,pattern_white_space
167
+ pauc,pau_cin_hau
168
+ pc,connector_punctuation
169
+ pcm,prepended_concatenation_mark
170
+ pd,dash_punctuation
171
+ pe,close_punctuation
172
+ perm,old_permic
173
+ pf,final_punctuation
174
+ phag,phags_pa
175
+ phli,inscriptional_pahlavi
176
+ phlp,psalter_pahlavi
177
+ phnx,phoenician
178
+ pi,initial_punctuation
179
+ plrd,miao
180
+ po,other_punctuation
181
+ prti,inscriptional_parthian
182
+ ps,open_punctuation
183
+ qaac,coptic
184
+ qaai,inherited
185
+ qmark,quotation_mark
186
+ ri,regional_indicator
187
+ rjng,rejang
188
+ rohg,hanifi_rohingya
189
+ runr,runic
190
+ s,symbol
191
+ samr,samaritan
192
+ sarb,old_south_arabian
193
+ saur,saurashtra
194
+ sc,currency_symbol
195
+ sd,soft_dotted
196
+ sgnw,signwriting
197
+ shaw,shavian
198
+ shrd,sharada
199
+ sidd,siddham
200
+ sind,khudawadi
201
+ sinh,sinhala
202
+ sk,modifier_symbol
203
+ sm,math_symbol
204
+ so,other_symbol
205
+ sogd,sogdian
206
+ sogo,old_sogdian
207
+ sora,sora_sompeng
208
+ soyo,soyombo
209
+ sterm,sentence_terminal
210
+ sund,sundanese
211
+ sylo,syloti_nagri
212
+ syrc,syriac
213
+ tagb,tagbanwa
214
+ takr,takri
215
+ tale,tai_le
216
+ talu,new_tai_lue
217
+ taml,tamil
218
+ tang,tangut
219
+ tavt,tai_viet
220
+ telu,telugu
221
+ term,terminal_punctuation
222
+ tfng,tifinagh
223
+ tglg,tagalog
224
+ thaa,thaana
225
+ tibt,tibetan
226
+ tirh,tirhuta
227
+ tnsa,tangsa
228
+ ugar,ugaritic
229
+ uideo,unified_ideograph
230
+ vaii,vai
231
+ vith,vithkuqi
232
+ vs,variation_selector
233
+ wara,warang_citi
234
+ wcho,wancho
235
+ wspace,white_space
236
+ xidc,xid_continue
237
+ xids,xid_start
238
+ xpeo,old_persian
239
+ xsux,cuneiform
240
+ yezi,yezidi
241
+ yiii,yi
242
+ z,separator
243
+ zanb,zanabazar_square
244
+ zinh,inherited
245
+ zl,line_separator
246
+ zp,paragraph_separator
247
+ zs,space_separator
248
+ zyyy,common
249
+ zzzz,unknown
@@ -14,15 +14,15 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
- name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
20
+ name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ raise ValidationError.for(:property, name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };