regexp_parser 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Gemfile +1 -1
- data/README.md +14 -4
- data/Rakefile +1 -56
- data/lib/regexp_parser/lexer.rb +1 -1
- data/lib/regexp_parser/scanner/properties/long.csv +18 -0
- data/lib/regexp_parser/scanner/properties/short.csv +4 -0
- data/lib/regexp_parser/scanner.rb +120 -120
- data/lib/regexp_parser/syntax/any.rb +2 -5
- data/lib/regexp_parser/syntax/base.rb +91 -64
- data/lib/regexp_parser/syntax/token/quantifier.rb +4 -4
- data/lib/regexp_parser/syntax/token/unicode_property.rb +26 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 369b108d8410e12bd6af5c659f58cb56c583e48780c1b35b6270bb21cc6a4ee7
|
4
|
+
data.tar.gz: 30cd2c0823ae154a2db04c705f898f252774ec8ab9ef304833c5e3546ba7406a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4104bec7dd02a7ea099de9aeacb766fb1a2db50cb52bd84f44e4bde93431d436b75d0f1b3f4d62242713a1eeca3f4d8c0be034270d515979aad8ad2d504880b0
|
7
|
+
data.tar.gz: 11deb2d7c8a6fad3fa9cb18b3f29cae15bab7e12e6cbbc968706dd02c16b0d1a6b1d69f05a5f665f7b46947315b0ea4ecda62dab8ddca8b5ef71f521b877da74
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## [Unreleased]
|
2
|
+
|
3
|
+
## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
|
+
|
5
|
+
### Added
|
6
|
+
|
7
|
+
- improved parsing performance through `Syntax` refactoring
|
8
|
+
- instead of fresh `Syntax` instances, pre-loaded constants are now re-used
|
9
|
+
- this approximately doubles the parsing speed for simple regexps
|
10
|
+
- added methods to `Syntax` classes to show relative feature sets
|
11
|
+
- e.g. `Regexp::Syntax::V3_2_0.added_features`
|
12
|
+
- support for new unicode properties of Ruby 3.2 / Unicode 14.0
|
13
|
+
|
1
14
|
## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
|
2
15
|
|
3
16
|
### Fixed
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -157,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
|
|
157
157
|
flavor variations. Syntax only comes into play in the lexer.
|
158
158
|
|
159
159
|
#### Example
|
160
|
-
The following
|
160
|
+
The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
|
161
161
|
checks a few of their implementation features.
|
162
162
|
|
163
163
|
```ruby
|
164
164
|
require 'regexp_parser'
|
165
165
|
|
166
|
-
ruby_20 = Regexp::Syntax.
|
166
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0'
|
167
167
|
ruby_20.implements? :quantifier, :zero_or_one # => true
|
168
168
|
ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
|
169
169
|
ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
|
170
170
|
ruby_20.implements? :conditional, :condition # => true
|
171
171
|
|
172
|
-
ruby_19 = Regexp::Syntax.
|
172
|
+
ruby_19 = Regexp::Syntax.for 'ruby/1.9'
|
173
173
|
ruby_19.implements? :quantifier, :zero_or_one # => true
|
174
174
|
ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
|
175
175
|
ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
|
176
176
|
ruby_19.implements? :conditional, :condition # => false
|
177
177
|
|
178
|
-
ruby_18 = Regexp::Syntax.
|
178
|
+
ruby_18 = Regexp::Syntax.for 'ruby/1.8'
|
179
179
|
ruby_18.implements? :quantifier, :zero_or_one # => true
|
180
180
|
ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
|
181
181
|
ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
|
182
182
|
ruby_18.implements? :conditional, :condition # => false
|
183
183
|
```
|
184
184
|
|
185
|
+
Syntax objects can also be queried about their complete and relative feature sets.
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
require 'regexp_parser'
|
189
|
+
|
190
|
+
ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
|
191
|
+
ruby_20.added_features # => { conditional: [...], ... }
|
192
|
+
ruby_20.removed_features # => { property: [:newline], ... }
|
193
|
+
ruby_20.features # => { anchor: [...], ... }
|
194
|
+
```
|
185
195
|
|
186
196
|
#### Notes
|
187
197
|
* Variations on a token, for example a named group with angle brackets (< and >)
|
data/Rakefile
CHANGED
@@ -5,9 +5,7 @@ require 'rake'
|
|
5
5
|
require 'rake/testtask'
|
6
6
|
require 'rspec/core/rake_task'
|
7
7
|
|
8
|
-
|
9
|
-
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
10
|
-
RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
|
8
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
11
9
|
|
12
10
|
Bundler::GemHelper.install_tasks
|
13
11
|
|
@@ -19,60 +17,7 @@ namespace :test do
|
|
19
17
|
task full: [:'ragel:rb', :spec]
|
20
18
|
end
|
21
19
|
|
22
|
-
namespace :ragel do
|
23
|
-
desc "Process the ragel source files and output ruby code"
|
24
|
-
task :rb do
|
25
|
-
RAGEL_SOURCE_FILES.each do |source_file|
|
26
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
27
|
-
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
28
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
29
|
-
|
30
|
-
contents = File.read(output_file)
|
31
|
-
|
32
|
-
File.open(output_file, 'r+') do |file|
|
33
|
-
contents = "# -*- warn-indent:false; -*-\n" + contents
|
34
|
-
|
35
|
-
file.write(contents)
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
desc "Delete the ragel generated source file(s)"
|
41
|
-
task :clean do
|
42
|
-
RAGEL_SOURCE_FILES.each do |file|
|
43
|
-
sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
20
|
# Add ragel task as a prerequisite for building the gem to ensure that the
|
49
21
|
# latest scanner code is generated and included in the build.
|
50
22
|
desc "Runs ragel:rb before building the gem"
|
51
23
|
task :build => ['ragel:rb']
|
52
|
-
|
53
|
-
namespace :props do
|
54
|
-
desc 'Write new property value hashes for the properties scanner'
|
55
|
-
task :update do
|
56
|
-
require 'regexp_property_values'
|
57
|
-
RegexpPropertyValues.update
|
58
|
-
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
59
|
-
|
60
|
-
write_hash_to_file = ->(hash, path) do
|
61
|
-
File.open(path, 'w') do |f|
|
62
|
-
f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
|
63
|
-
*hash.sort.map { |pair| pair.join(',') }
|
64
|
-
end
|
65
|
-
puts "Wrote #{hash.count} aliases to `#{path}`"
|
66
|
-
end
|
67
|
-
|
68
|
-
long_names_to_tokens = RegexpPropertyValues.all.map do |val|
|
69
|
-
[val.identifier, val.full_name.downcase]
|
70
|
-
end
|
71
|
-
write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
|
72
|
-
|
73
|
-
short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
|
74
|
-
[k.identifier, v.full_name.downcase]
|
75
|
-
end
|
76
|
-
write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
|
77
|
-
end
|
78
|
-
end
|
data/lib/regexp_parser/lexer.rb
CHANGED
data/lib/regexp_parser/scanner/properties/long.csv
CHANGED
@@ -6,6 +6,7 @@ age=11.0,age=11.0
|
|
6
6
|
age=12.0,age=12.0
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
|
+
age=14.0,age=14.0
|
9
10
|
age=2.0,age=2.0
|
10
11
|
age=2.1,age=2.1
|
11
12
|
age=3.0,age=3.0
|
@@ -72,6 +73,7 @@ coptic,coptic
|
|
72
73
|
cuneiform,cuneiform
|
73
74
|
currencysymbol,currency_symbol
|
74
75
|
cypriot,cypriot
|
76
|
+
cyprominoan,cypro_minoan
|
75
77
|
cyrillic,cyrillic
|
76
78
|
dash,dash
|
77
79
|
dashpunctuation,dash_punctuation
|
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
|
|
136
138
|
inancientsymbols,in_ancient_symbols
|
137
139
|
inarabic,in_arabic
|
138
140
|
inarabicextendeda,in_arabic_extended_a
|
141
|
+
inarabicextendedb,in_arabic_extended_b
|
139
142
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
140
143
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
141
144
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
|
|
197
200
|
incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
|
198
201
|
incurrencysymbols,in_currency_symbols
|
199
202
|
incypriotsyllabary,in_cypriot_syllabary
|
203
|
+
incyprominoan,in_cypro_minoan
|
200
204
|
incyrillic,in_cyrillic
|
201
205
|
incyrillicextendeda,in_cyrillic_extended_a
|
202
206
|
incyrillicextendedb,in_cyrillic_extended_b
|
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
|
|
223
227
|
inethiopic,in_ethiopic
|
224
228
|
inethiopicextended,in_ethiopic_extended
|
225
229
|
inethiopicextendeda,in_ethiopic_extended_a
|
230
|
+
inethiopicextendedb,in_ethiopic_extended_b
|
226
231
|
inethiopicsupplement,in_ethiopic_supplement
|
227
232
|
ingeneralpunctuation,in_general_punctuation
|
228
233
|
ingeometricshapes,in_geometric_shapes
|
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
|
|
264
269
|
injavanese,in_javanese
|
265
270
|
inkaithi,in_kaithi
|
266
271
|
inkanaextendeda,in_kana_extended_a
|
272
|
+
inkanaextendedb,in_kana_extended_b
|
267
273
|
inkanasupplement,in_kana_supplement
|
268
274
|
inkanbun,in_kanbun
|
269
275
|
inkangxiradicals,in_kangxi_radicals
|
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
|
|
285
291
|
inlatinextendedc,in_latin_extended_c
|
286
292
|
inlatinextendedd,in_latin_extended_d
|
287
293
|
inlatinextendede,in_latin_extended_e
|
294
|
+
inlatinextendedf,in_latin_extended_f
|
295
|
+
inlatinextendedg,in_latin_extended_g
|
288
296
|
inlepcha,in_lepcha
|
289
297
|
inletterlikesymbols,in_letterlike_symbols
|
290
298
|
inlimbu,in_limbu
|
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
|
|
349
357
|
inoldsogdian,in_old_sogdian
|
350
358
|
inoldsoutharabian,in_old_south_arabian
|
351
359
|
inoldturkic,in_old_turkic
|
360
|
+
inolduyghur,in_old_uyghur
|
352
361
|
inopticalcharacterrecognition,in_optical_character_recognition
|
353
362
|
inoriya,in_oriya
|
354
363
|
inornamentaldingbats,in_ornamental_dingbats
|
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
|
|
413
422
|
intakri,in_takri
|
414
423
|
intamil,in_tamil
|
415
424
|
intamilsupplement,in_tamil_supplement
|
425
|
+
intangsa,in_tangsa
|
416
426
|
intangut,in_tangut
|
417
427
|
intangutcomponents,in_tangut_components
|
418
428
|
intangutsupplement,in_tangut_supplement
|
@@ -422,15 +432,18 @@ inthai,in_thai
|
|
422
432
|
intibetan,in_tibetan
|
423
433
|
intifinagh,in_tifinagh
|
424
434
|
intirhuta,in_tirhuta
|
435
|
+
intoto,in_toto
|
425
436
|
intransportandmapsymbols,in_transport_and_map_symbols
|
426
437
|
inugaritic,in_ugaritic
|
427
438
|
inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
|
428
439
|
inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
|
440
|
+
inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
|
429
441
|
invai,in_vai
|
430
442
|
invariationselectors,in_variation_selectors
|
431
443
|
invariationselectorssupplement,in_variation_selectors_supplement
|
432
444
|
invedicextensions,in_vedic_extensions
|
433
445
|
inverticalforms,in_vertical_forms
|
446
|
+
invithkuqi,in_vithkuqi
|
434
447
|
inwancho,in_wancho
|
435
448
|
inwarangciti,in_warang_citi
|
436
449
|
inyezidi,in_yezidi
|
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
|
|
438
451
|
inyiradicals,in_yi_radicals
|
439
452
|
inyisyllables,in_yi_syllables
|
440
453
|
inzanabazarsquare,in_zanabazar_square
|
454
|
+
inznamennymusicalnotation,in_znamenny_musical_notation
|
441
455
|
javanese,javanese
|
442
456
|
joincontrol,join_control
|
443
457
|
kaithi,kaithi
|
@@ -509,6 +523,7 @@ oldpersian,old_persian
|
|
509
523
|
oldsogdian,old_sogdian
|
510
524
|
oldsoutharabian,old_south_arabian
|
511
525
|
oldturkic,old_turkic
|
526
|
+
olduyghur,old_uyghur
|
512
527
|
openpunctuation,open_punctuation
|
513
528
|
oriya,oriya
|
514
529
|
osage,osage
|
@@ -573,6 +588,7 @@ taitham,tai_tham
|
|
573
588
|
taiviet,tai_viet
|
574
589
|
takri,takri
|
575
590
|
tamil,tamil
|
591
|
+
tangsa,tangsa
|
576
592
|
tangut,tangut
|
577
593
|
telugu,telugu
|
578
594
|
terminalpunctuation,terminal_punctuation
|
@@ -582,6 +598,7 @@ tibetan,tibetan
|
|
582
598
|
tifinagh,tifinagh
|
583
599
|
tirhuta,tirhuta
|
584
600
|
titlecaseletter,titlecase_letter
|
601
|
+
toto,toto
|
585
602
|
ugaritic,ugaritic
|
586
603
|
unassigned,unassigned
|
587
604
|
unifiedideograph,unified_ideograph
|
@@ -591,6 +608,7 @@ uppercase,uppercase
|
|
591
608
|
uppercaseletter,uppercase_letter
|
592
609
|
vai,vai
|
593
610
|
variationselector,variation_selector
|
611
|
+
vithkuqi,vithkuqi
|
594
612
|
wancho,wancho
|
595
613
|
warangciti,warang_citi
|
596
614
|
whitespace,white_space
|
data/lib/regexp_parser/scanner/properties/short.csv
CHANGED
@@ -31,6 +31,7 @@ cn,unassigned
|
|
31
31
|
co,private_use
|
32
32
|
combiningmark,mark
|
33
33
|
copt,coptic
|
34
|
+
cpmn,cypro_minoan
|
34
35
|
cprt,cypriot
|
35
36
|
cs,surrogate
|
36
37
|
cwcf,changes_when_casefolded
|
@@ -154,6 +155,7 @@ orkh,old_turkic
|
|
154
155
|
orya,oriya
|
155
156
|
osge,osage
|
156
157
|
osma,osmanya
|
158
|
+
ougr,old_uyghur
|
157
159
|
oupper,other_uppercase
|
158
160
|
p,punctuation
|
159
161
|
palm,palmyrene
|
@@ -219,9 +221,11 @@ tglg,tagalog
|
|
219
221
|
thaa,thaana
|
220
222
|
tibt,tibetan
|
221
223
|
tirh,tirhuta
|
224
|
+
tnsa,tangsa
|
222
225
|
ugar,ugaritic
|
223
226
|
uideo,unified_ideograph
|
224
227
|
vaii,vai
|
228
|
+
vith,vithkuqi
|
225
229
|
vs,variation_selector
|
226
230
|
wara,warang_citi
|
227
231
|
wcho,wancho
|