regexp_parser 1.7.1 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +157 -1
  3. data/Gemfile +6 -1
  4. data/LICENSE +1 -1
  5. data/README.md +38 -32
  6. data/Rakefile +18 -27
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +123 -0
  9. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  10. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
  11. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  15. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +13 -7
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -4
  17. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/property.rb +1 -3
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -17
  21. data/lib/regexp_parser/expression/classes/type.rb +0 -2
  22. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  23. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  24. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  25. data/lib/regexp_parser/expression/quantifier.rb +11 -2
  26. data/lib/regexp_parser/expression/sequence.rb +3 -20
  27. data/lib/regexp_parser/expression/subexpression.rb +1 -2
  28. data/lib/regexp_parser/expression.rb +7 -139
  29. data/lib/regexp_parser/lexer.rb +13 -11
  30. data/lib/regexp_parser/parser.rb +325 -344
  31. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  32. data/lib/regexp_parser/scanner/properties/long.csv +604 -0
  33. data/lib/regexp_parser/scanner/properties/short.csv +242 -0
  34. data/lib/regexp_parser/scanner/property.rl +2 -2
  35. data/lib/regexp_parser/scanner/scanner.rl +235 -255
  36. data/lib/regexp_parser/scanner.rb +1324 -1387
  37. data/lib/regexp_parser/syntax/any.rb +4 -6
  38. data/lib/regexp_parser/syntax/base.rb +13 -15
  39. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  40. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  41. data/lib/regexp_parser/syntax/token/backreference.rb +30 -0
  42. data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
  43. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  44. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  45. data/lib/regexp_parser/syntax/token/escape.rb +31 -0
  46. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  47. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  48. data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
  49. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  50. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  51. data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
  52. data/lib/regexp_parser/syntax/token.rb +45 -0
  53. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  54. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -2
  55. data/lib/regexp_parser/syntax/versions/1.9.1.rb +1 -1
  56. data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
  57. data/lib/regexp_parser/syntax.rb +8 -6
  58. data/lib/regexp_parser/token.rb +9 -20
  59. data/lib/regexp_parser/version.rb +1 -1
  60. data/lib/regexp_parser.rb +0 -2
  61. data/regexp_parser.gemspec +20 -22
  62. metadata +34 -165
  63. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  64. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  65. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  66. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  67. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  68. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  69. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  70. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  71. data/spec/expression/base_spec.rb +0 -94
  72. data/spec/expression/clone_spec.rb +0 -120
  73. data/spec/expression/conditional_spec.rb +0 -89
  74. data/spec/expression/free_space_spec.rb +0 -27
  75. data/spec/expression/methods/match_length_spec.rb +0 -161
  76. data/spec/expression/methods/match_spec.rb +0 -25
  77. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  78. data/spec/expression/methods/tests_spec.rb +0 -99
  79. data/spec/expression/methods/traverse_spec.rb +0 -161
  80. data/spec/expression/options_spec.rb +0 -128
  81. data/spec/expression/root_spec.rb +0 -9
  82. data/spec/expression/sequence_spec.rb +0 -9
  83. data/spec/expression/subexpression_spec.rb +0 -50
  84. data/spec/expression/to_h_spec.rb +0 -26
  85. data/spec/expression/to_s_spec.rb +0 -100
  86. data/spec/lexer/all_spec.rb +0 -22
  87. data/spec/lexer/conditionals_spec.rb +0 -53
  88. data/spec/lexer/delimiters_spec.rb +0 -68
  89. data/spec/lexer/escapes_spec.rb +0 -14
  90. data/spec/lexer/keep_spec.rb +0 -10
  91. data/spec/lexer/literals_spec.rb +0 -89
  92. data/spec/lexer/nesting_spec.rb +0 -99
  93. data/spec/lexer/refcalls_spec.rb +0 -55
  94. data/spec/parser/all_spec.rb +0 -43
  95. data/spec/parser/alternation_spec.rb +0 -88
  96. data/spec/parser/anchors_spec.rb +0 -17
  97. data/spec/parser/conditionals_spec.rb +0 -179
  98. data/spec/parser/errors_spec.rb +0 -30
  99. data/spec/parser/escapes_spec.rb +0 -121
  100. data/spec/parser/free_space_spec.rb +0 -130
  101. data/spec/parser/groups_spec.rb +0 -108
  102. data/spec/parser/keep_spec.rb +0 -6
  103. data/spec/parser/posix_classes_spec.rb +0 -8
  104. data/spec/parser/properties_spec.rb +0 -115
  105. data/spec/parser/quantifiers_spec.rb +0 -52
  106. data/spec/parser/refcalls_spec.rb +0 -112
  107. data/spec/parser/set/intersections_spec.rb +0 -127
  108. data/spec/parser/set/ranges_spec.rb +0 -111
  109. data/spec/parser/sets_spec.rb +0 -178
  110. data/spec/parser/types_spec.rb +0 -18
  111. data/spec/scanner/all_spec.rb +0 -18
  112. data/spec/scanner/anchors_spec.rb +0 -21
  113. data/spec/scanner/conditionals_spec.rb +0 -128
  114. data/spec/scanner/delimiters_spec.rb +0 -52
  115. data/spec/scanner/errors_spec.rb +0 -67
  116. data/spec/scanner/escapes_spec.rb +0 -53
  117. data/spec/scanner/free_space_spec.rb +0 -133
  118. data/spec/scanner/groups_spec.rb +0 -52
  119. data/spec/scanner/keep_spec.rb +0 -10
  120. data/spec/scanner/literals_spec.rb +0 -49
  121. data/spec/scanner/meta_spec.rb +0 -18
  122. data/spec/scanner/properties_spec.rb +0 -64
  123. data/spec/scanner/quantifiers_spec.rb +0 -20
  124. data/spec/scanner/refcalls_spec.rb +0 -36
  125. data/spec/scanner/sets_spec.rb +0 -102
  126. data/spec/scanner/types_spec.rb +0 -14
  127. data/spec/spec_helper.rb +0 -15
  128. data/spec/support/runner.rb +0 -42
  129. data/spec/support/shared_examples.rb +0 -77
  130. data/spec/support/warning_extractor.rb +0 -60
  131. data/spec/syntax/syntax_spec.rb +0 -48
  132. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  133. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  134. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  135. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  136. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  137. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  138. data/spec/syntax/versions/aliases_spec.rb +0 -37
  139. data/spec/token/token_spec.rb +0 -85
@@ -0,0 +1,242 @@
1
+ # THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT
2
+ adlm,adlam
3
+ aghb,caucasian_albanian
4
+ ahex,ascii_hex_digit
5
+ arab,arabic
6
+ armi,imperial_aramaic
7
+ armn,armenian
8
+ avst,avestan
9
+ bali,balinese
10
+ bamu,bamum
11
+ bass,bassa_vah
12
+ batk,batak
13
+ beng,bengali
14
+ bhks,bhaiksuki
15
+ bidic,bidi_control
16
+ bopo,bopomofo
17
+ brah,brahmi
18
+ brai,braille
19
+ bugi,buginese
20
+ buhd,buhid
21
+ c,other
22
+ cakm,chakma
23
+ cans,canadian_aboriginal
24
+ cari,carian
25
+ cc,control
26
+ cf,format
27
+ cher,cherokee
28
+ chrs,chorasmian
29
+ ci,case_ignorable
30
+ cn,unassigned
31
+ co,private_use
32
+ combiningmark,mark
33
+ copt,coptic
34
+ cprt,cypriot
35
+ cs,surrogate
36
+ cwcf,changes_when_casefolded
37
+ cwcm,changes_when_casemapped
38
+ cwl,changes_when_lowercased
39
+ cwt,changes_when_titlecased
40
+ cwu,changes_when_uppercased
41
+ cyrl,cyrillic
42
+ dep,deprecated
43
+ deva,devanagari
44
+ di,default_ignorable_code_point
45
+ dia,diacritic
46
+ diak,dives_akuru
47
+ dogr,dogra
48
+ dsrt,deseret
49
+ dupl,duployan
50
+ ebase,emoji_modifier_base
51
+ ecomp,emoji_component
52
+ egyp,egyptian_hieroglyphs
53
+ elba,elbasan
54
+ elym,elymaic
55
+ emod,emoji_modifier
56
+ epres,emoji_presentation
57
+ ethi,ethiopic
58
+ ext,extender
59
+ geor,georgian
60
+ glag,glagolitic
61
+ gong,gunjala_gondi
62
+ gonm,masaram_gondi
63
+ goth,gothic
64
+ gran,grantha
65
+ grbase,grapheme_base
66
+ grek,greek
67
+ grext,grapheme_extend
68
+ grlink,grapheme_link
69
+ gujr,gujarati
70
+ guru,gurmukhi
71
+ hang,hangul
72
+ hani,han
73
+ hano,hanunoo
74
+ hatr,hatran
75
+ hebr,hebrew
76
+ hex,hex_digit
77
+ hira,hiragana
78
+ hluw,anatolian_hieroglyphs
79
+ hmng,pahawh_hmong
80
+ hmnp,nyiakeng_puachue_hmong
81
+ hung,old_hungarian
82
+ idc,id_continue
83
+ ideo,ideographic
84
+ ids,id_start
85
+ idsb,ids_binary_operator
86
+ idst,ids_trinary_operator
87
+ ital,old_italic
88
+ java,javanese
89
+ joinc,join_control
90
+ kali,kayah_li
91
+ kana,katakana
92
+ khar,kharoshthi
93
+ khmr,khmer
94
+ khoj,khojki
95
+ kits,khitan_small_script
96
+ knda,kannada
97
+ kthi,kaithi
98
+ l,letter
99
+ lana,tai_tham
100
+ laoo,lao
101
+ latn,latin
102
+ lc,cased_letter
103
+ lepc,lepcha
104
+ limb,limbu
105
+ lina,linear_a
106
+ linb,linear_b
107
+ ll,lowercase_letter
108
+ lm,modifier_letter
109
+ lo,other_letter
110
+ loe,logical_order_exception
111
+ lt,titlecase_letter
112
+ lu,uppercase_letter
113
+ lyci,lycian
114
+ lydi,lydian
115
+ m,mark
116
+ mahj,mahajani
117
+ maka,makasar
118
+ mand,mandaic
119
+ mani,manichaean
120
+ marc,marchen
121
+ mc,spacing_mark
122
+ me,enclosing_mark
123
+ medf,medefaidrin
124
+ mend,mende_kikakui
125
+ merc,meroitic_cursive
126
+ mero,meroitic_hieroglyphs
127
+ mlym,malayalam
128
+ mn,nonspacing_mark
129
+ mong,mongolian
130
+ mroo,mro
131
+ mtei,meetei_mayek
132
+ mult,multani
133
+ mymr,myanmar
134
+ n,number
135
+ nand,nandinagari
136
+ narb,old_north_arabian
137
+ nbat,nabataean
138
+ nchar,noncharacter_code_point
139
+ nd,decimal_number
140
+ nkoo,nko
141
+ nl,letter_number
142
+ no,other_number
143
+ nshu,nushu
144
+ oalpha,other_alphabetic
145
+ odi,other_default_ignorable_code_point
146
+ ogam,ogham
147
+ ogrext,other_grapheme_extend
148
+ oidc,other_id_continue
149
+ oids,other_id_start
150
+ olck,ol_chiki
151
+ olower,other_lowercase
152
+ omath,other_math
153
+ orkh,old_turkic
154
+ orya,oriya
155
+ osge,osage
156
+ osma,osmanya
157
+ oupper,other_uppercase
158
+ p,punctuation
159
+ palm,palmyrene
160
+ patsyn,pattern_syntax
161
+ patws,pattern_white_space
162
+ pauc,pau_cin_hau
163
+ pc,connector_punctuation
164
+ pcm,prepended_concatenation_mark
165
+ pd,dash_punctuation
166
+ pe,close_punctuation
167
+ perm,old_permic
168
+ pf,final_punctuation
169
+ phag,phags_pa
170
+ phli,inscriptional_pahlavi
171
+ phlp,psalter_pahlavi
172
+ phnx,phoenician
173
+ pi,initial_punctuation
174
+ plrd,miao
175
+ po,other_punctuation
176
+ prti,inscriptional_parthian
177
+ ps,open_punctuation
178
+ qaac,coptic
179
+ qaai,inherited
180
+ qmark,quotation_mark
181
+ ri,regional_indicator
182
+ rjng,rejang
183
+ rohg,hanifi_rohingya
184
+ runr,runic
185
+ s,symbol
186
+ samr,samaritan
187
+ sarb,old_south_arabian
188
+ saur,saurashtra
189
+ sc,currency_symbol
190
+ sd,soft_dotted
191
+ sgnw,signwriting
192
+ shaw,shavian
193
+ shrd,sharada
194
+ sidd,siddham
195
+ sind,khudawadi
196
+ sinh,sinhala
197
+ sk,modifier_symbol
198
+ sm,math_symbol
199
+ so,other_symbol
200
+ sogd,sogdian
201
+ sogo,old_sogdian
202
+ sora,sora_sompeng
203
+ soyo,soyombo
204
+ sterm,sentence_terminal
205
+ sund,sundanese
206
+ sylo,syloti_nagri
207
+ syrc,syriac
208
+ tagb,tagbanwa
209
+ takr,takri
210
+ tale,tai_le
211
+ talu,new_tai_lue
212
+ taml,tamil
213
+ tang,tangut
214
+ tavt,tai_viet
215
+ telu,telugu
216
+ term,terminal_punctuation
217
+ tfng,tifinagh
218
+ tglg,tagalog
219
+ thaa,thaana
220
+ tibt,tibetan
221
+ tirh,tirhuta
222
+ ugar,ugaritic
223
+ uideo,unified_ideograph
224
+ vaii,vai
225
+ vs,variation_selector
226
+ wara,warang_citi
227
+ wcho,wancho
228
+ wspace,white_space
229
+ xidc,xid_continue
230
+ xids,xid_start
231
+ xpeo,old_persian
232
+ xsux,cuneiform
233
+ yezi,yezidi
234
+ yiii,yi
235
+ z,separator
236
+ zanb,zanabazar_square
237
+ zinh,inherited
238
+ zl,line_separator
239
+ zp,paragraph_separator
240
+ zs,space_separator
241
+ zyyy,common
242
+ zzzz,unknown
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };