regexp_parser 2.2.1 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Gemfile +1 -1
- data/README.md +14 -4
- data/Rakefile +1 -56
- data/lib/regexp_parser/lexer.rb +1 -1
- data/lib/regexp_parser/scanner/properties/long.csv +18 -0
- data/lib/regexp_parser/scanner/properties/short.csv +4 -0
- data/lib/regexp_parser/scanner.rb +120 -120
- data/lib/regexp_parser/syntax/any.rb +2 -5
- data/lib/regexp_parser/syntax/base.rb +91 -64
- data/lib/regexp_parser/syntax/token/quantifier.rb +4 -4
- data/lib/regexp_parser/syntax/token/unicode_property.rb +26 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 369b108d8410e12bd6af5c659f58cb56c583e48780c1b35b6270bb21cc6a4ee7
|
4
|
+
data.tar.gz: 30cd2c0823ae154a2db04c705f898f252774ec8ab9ef304833c5e3546ba7406a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4104bec7dd02a7ea099de9aeacb766fb1a2db50cb52bd84f44e4bde93431d436b75d0f1b3f4d62242713a1eeca3f4d8c0be034270d515979aad8ad2d504880b0
|
7
|
+
data.tar.gz: 11deb2d7c8a6fad3fa9cb18b3f29cae15bab7e12e6cbbc968706dd02c16b0d1a6b1d69f05a5f665f7b46947315b0ea4ecda62dab8ddca8b5ef71f521b877da74
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## [Unreleased]
|
2
|
+
|
3
|
+
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
|
+
|
5
|
+
### Added
|
6
|
+
|
7
|
+
- improved parsing performance through `Syntax` refactoring
|
8
|
+
- instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
9
|
+
- this approximately doubles the parsing speed for simple regexps
|
10
|
+
- added methods to `Syntax` classes to show relative feature sets
|
11
|
+
- e.g. `Regexp::Syntax::V3_2_0.added_features`
|
12
|
+
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
13
|
+
|
1
14
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
2
15
|
|
3
16
|
### Fixed
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -157,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
157
157
|
flavor variations. Syntax only comes into play in the lexer.
|
158
158
|
|
159
159
|
#### Example
|
160
|
-
The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
|
160
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
161
161
|
checks a few of their implementation features.
|
162
162
|
|
163
163
|
```ruby
|
164
164
|
require 'regexp_parser'
|
165
165
|
|
166
|
-
ruby_20 = Regexp::Syntax.new 'ruby/2.0'
|
166
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
167
167
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
168
168
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
169
169
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
170
170
|
ruby_20.implements? :conditional, :condition # => true
|
171
171
|
|
172
|
-
ruby_19 = Regexp::Syntax.new 'ruby/1.9'
|
172
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
173
173
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
174
174
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
175
175
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
176
176
|
ruby_19.implements? :conditional, :condition # => false
|
177
177
|
|
178
|
-
ruby_18 = Regexp::Syntax.new 'ruby/1.8'
|
178
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
179
179
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
180
180
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
181
181
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
182
182
|
ruby_18.implements? :conditional, :condition # => false
|
183
183
|
```
|
184
184
|
|
185
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
require 'regexp_parser'
|
189
|
+
|
190
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
191
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
192
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
193
|
+
ruby_20.features # => { anchor: [...], ... }
|
194
|
+
```
|
185
195
|
|
186
196
|
#### Notes
|
187
197
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
data/Rakefile
CHANGED
@@ -5,9 +5,7 @@ require 'rake'
|
|
5
5
|
require 'rake/testtask'
|
6
6
|
require 'rspec/core/rake_task'
|
7
7
|
|
8
|
-
|
9
|
-
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
10
|
-
RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
11
9
|
|
12
10
|
Bundler::GemHelper.install_tasks
|
13
11
|
|
@@ -19,60 +17,7 @@ namespace :test do
|
|
19
17
|
task full: [:'ragel:rb', :spec]
|
20
18
|
end
|
21
19
|
|
22
|
-
namespace :ragel do
|
23
|
-
desc "Process the ragel source files and output ruby code"
|
24
|
-
task :rb do
|
25
|
-
RAGEL_SOURCE_FILES.each do |source_file|
|
26
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
27
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
28
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
29
|
-
|
30
|
-
contents = File.read(output_file)
|
31
|
-
|
32
|
-
File.open(output_file, 'r+') do |file|
|
33
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
34
|
-
|
35
|
-
file.write(contents)
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
desc "Delete the ragel generated source file(s)"
|
41
|
-
task :clean do
|
42
|
-
RAGEL_SOURCE_FILES.each do |file|
|
43
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
49
21
|
# latest scanner code is generated and included in the build.
|
50
22
|
desc "Runs ragel:rb before building the gem"
|
51
23
|
task :build => ['ragel:rb']
|
52
|
-
|
53
|
-
namespace :props do
|
54
|
-
desc 'Write new property value hashes for the properties scanner'
|
55
|
-
task :update do
|
56
|
-
require 'regexp_property_values'
|
57
|
-
RegexpPropertyValues.update
|
58
|
-
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
59
|
-
|
60
|
-
write_hash_to_file = ->(hash, path) do
|
61
|
-
File.open(path, 'w') do |f|
|
62
|
-
f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
|
63
|
-
*hash.sort.map { |pair| pair.join(',') }
|
64
|
-
end
|
65
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
66
|
-
end
|
67
|
-
|
68
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
69
|
-
[val.identifier, val.full_name.downcase]
|
70
|
-
end
|
71
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
|
72
|
-
|
73
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
74
|
-
[k.identifier, v.full_name.downcase]
|
75
|
-
end
|
76
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
|
77
|
-
end
|
78
|
-
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
data/lib/regexp_parser/scanner/properties/long.csv
CHANGED
@@ -6,6 +6,7 @@ age=11.0,age=11.0
|
|
6
6
|
age=12.0,age=12.0
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
|
+
age=14.0,age=14.0
|
9
10
|
age=2.0,age=2.0
|
10
11
|
age=2.1,age=2.1
|
11
12
|
age=3.0,age=3.0
|
@@ -72,6 +73,7 @@ coptic,coptic
|
|
72
73
|
cuneiform,cuneiform
|
73
74
|
currencysymbol,currency_symbol
|
74
75
|
cypriot,cypriot
|
76
|
+
cyprominoan,cypro_minoan
|
75
77
|
cyrillic,cyrillic
|
76
78
|
dash,dash
|
77
79
|
dashpunctuation,dash_punctuation
|
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
|
|
136
138
|
inancientsymbols,in_ancient_symbols
|
137
139
|
inarabic,in_arabic
|
138
140
|
inarabicextendeda,in_arabic_extended_a
|
141
|
+
inarabicextendedb,in_arabic_extended_b
|
139
142
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
140
143
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
141
144
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
|
|
197
200
|
incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
|
198
201
|
incurrencysymbols,in_currency_symbols
|
199
202
|
incypriotsyllabary,in_cypriot_syllabary
|
203
|
+
incyprominoan,in_cypro_minoan
|
200
204
|
incyrillic,in_cyrillic
|
201
205
|
incyrillicextendeda,in_cyrillic_extended_a
|
202
206
|
incyrillicextendedb,in_cyrillic_extended_b
|
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
|
|
223
227
|
inethiopic,in_ethiopic
|
224
228
|
inethiopicextended,in_ethiopic_extended
|
225
229
|
inethiopicextendeda,in_ethiopic_extended_a
|
230
|
+
inethiopicextendedb,in_ethiopic_extended_b
|
226
231
|
inethiopicsupplement,in_ethiopic_supplement
|
227
232
|
ingeneralpunctuation,in_general_punctuation
|
228
233
|
ingeometricshapes,in_geometric_shapes
|
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
|
|
264
269
|
injavanese,in_javanese
|
265
270
|
inkaithi,in_kaithi
|
266
271
|
inkanaextendeda,in_kana_extended_a
|
272
|
+
inkanaextendedb,in_kana_extended_b
|
267
273
|
inkanasupplement,in_kana_supplement
|
268
274
|
inkanbun,in_kanbun
|
269
275
|
inkangxiradicals,in_kangxi_radicals
|
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
|
|
285
291
|
inlatinextendedc,in_latin_extended_c
|
286
292
|
inlatinextendedd,in_latin_extended_d
|
287
293
|
inlatinextendede,in_latin_extended_e
|
294
|
+
inlatinextendedf,in_latin_extended_f
|
295
|
+
inlatinextendedg,in_latin_extended_g
|
288
296
|
inlepcha,in_lepcha
|
289
297
|
inletterlikesymbols,in_letterlike_symbols
|
290
298
|
inlimbu,in_limbu
|
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
|
|
349
357
|
inoldsogdian,in_old_sogdian
|
350
358
|
inoldsoutharabian,in_old_south_arabian
|
351
359
|
inoldturkic,in_old_turkic
|
360
|
+
inolduyghur,in_old_uyghur
|
352
361
|
inopticalcharacterrecognition,in_optical_character_recognition
|
353
362
|
inoriya,in_oriya
|
354
363
|
inornamentaldingbats,in_ornamental_dingbats
|
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
|
|
413
422
|
intakri,in_takri
|
414
423
|
intamil,in_tamil
|
415
424
|
intamilsupplement,in_tamil_supplement
|
425
|
+
intangsa,in_tangsa
|
416
426
|
intangut,in_tangut
|
417
427
|
intangutcomponents,in_tangut_components
|
418
428
|
intangutsupplement,in_tangut_supplement
|
@@ -422,15 +432,18 @@ inthai,in_thai
|
|
422
432
|
intibetan,in_tibetan
|
423
433
|
intifinagh,in_tifinagh
|
424
434
|
intirhuta,in_tirhuta
|
435
|
+
intoto,in_toto
|
425
436
|
intransportandmapsymbols,in_transport_and_map_symbols
|
426
437
|
inugaritic,in_ugaritic
|
427
438
|
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
|
428
439
|
inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
|
440
|
+
inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
|
429
441
|
invai,in_vai
|
430
442
|
invariationselectors,in_variation_selectors
|
431
443
|
invariationselectorssupplement,in_variation_selectors_supplement
|
432
444
|
invedicextensions,in_vedic_extensions
|
433
445
|
inverticalforms,in_vertical_forms
|
446
|
+
invithkuqi,in_vithkuqi
|
434
447
|
inwancho,in_wancho
|
435
448
|
inwarangciti,in_warang_citi
|
436
449
|
inyezidi,in_yezidi
|
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
|
|
438
451
|
inyiradicals,in_yi_radicals
|
439
452
|
inyisyllables,in_yi_syllables
|
440
453
|
inzanabazarsquare,in_zanabazar_square
|
454
|
+
inznamennymusicalnotation,in_znamenny_musical_notation
|
441
455
|
javanese,javanese
|
442
456
|
joincontrol,join_control
|
443
457
|
kaithi,kaithi
|
@@ -509,6 +523,7 @@ oldpersian,old_persian
|
|
509
523
|
oldsogdian,old_sogdian
|
510
524
|
oldsoutharabian,old_south_arabian
|
511
525
|
oldturkic,old_turkic
|
526
|
+
olduyghur,old_uyghur
|
512
527
|
openpunctuation,open_punctuation
|
513
528
|
oriya,oriya
|
514
529
|
osage,osage
|
@@ -573,6 +588,7 @@ taitham,tai_tham
|
|
573
588
|
taiviet,tai_viet
|
574
589
|
takri,takri
|
575
590
|
tamil,tamil
|
591
|
+
tangsa,tangsa
|
576
592
|
tangut,tangut
|
577
593
|
telugu,telugu
|
578
594
|
terminalpunctuation,terminal_punctuation
|
@@ -582,6 +598,7 @@ tibetan,tibetan
|
|
582
598
|
tifinagh,tifinagh
|
583
599
|
tirhuta,tirhuta
|
584
600
|
titlecaseletter,titlecase_letter
|
601
|
+
toto,toto
|
585
602
|
ugaritic,ugaritic
|
586
603
|
unassigned,unassigned
|
587
604
|
unifiedideograph,unified_ideograph
|
@@ -591,6 +608,7 @@ uppercase,uppercase
|
|
591
608
|
uppercaseletter,uppercase_letter
|
592
609
|
vai,vai
|
593
610
|
variationselector,variation_selector
|
611
|
+
vithkuqi,vithkuqi
|
594
612
|
wancho,wancho
|
595
613
|
warangciti,warang_citi
|
596
614
|
whitespace,white_space
|
data/lib/regexp_parser/scanner/properties/short.csv
CHANGED
@@ -31,6 +31,7 @@ cn,unassigned
|
|
31
31
|
co,private_use
|
32
32
|
combiningmark,mark
|
33
33
|
copt,coptic
|
34
|
+
cpmn,cypro_minoan
|
34
35
|
cprt,cypriot
|
35
36
|
cs,surrogate
|
36
37
|
cwcf,changes_when_casefolded
|
@@ -154,6 +155,7 @@ orkh,old_turkic
|
|
154
155
|
orya,oriya
|
155
156
|
osge,osage
|
156
157
|
osma,osmanya
|
158
|
+
ougr,old_uyghur
|
157
159
|
oupper,other_uppercase
|
158
160
|
p,punctuation
|
159
161
|
palm,palmyrene
|
@@ -219,9 +221,11 @@ tglg,tagalog
|
|
219
221
|
thaa,thaana
|
220
222
|
tibt,tibetan
|
221
223
|
tirh,tirhuta
|
224
|
+
tnsa,tangsa
|
222
225
|
ugar,ugaritic
|
223
226
|
uideo,unified_ideograph
|
224
227
|
vaii,vai
|
228
|
+
vith,vithkuqi
|
225
229
|
vs,variation_selector
|
226
230
|
wara,warang_citi
|
227
231
|
wcho,wancho
|