regexp_parser 2.2.1 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
4
- data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
3
+ metadata.gz: 369b108d8410e12bd6af5c659f58cb56c583e48780c1b35b6270bb21cc6a4ee7
4
+ data.tar.gz: 30cd2c0823ae154a2db04c705f898f252774ec8ab9ef304833c5e3546ba7406a
5
5
  SHA512:
6
- metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
7
- data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
6
+ metadata.gz: 4104bec7dd02a7ea099de9aeacb766fb1a2db50cb52bd84f44e4bde93431d436b75d0f1b3f4d62242713a1eeca3f4d8c0be034270d515979aad8ad2d504880b0
7
+ data.tar.gz: 11deb2d7c8a6fad3fa9cb18b3f29cae15bab7e12e6cbbc968706dd02c16b0d1a6b1d69f05a5f665f7b46947315b0ea4ecda62dab8ddca8b5ef71f521b877da74
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
1
+ ## [Unreleased]
2
+
3
+ ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
4
+
5
+ ### Added
6
+
7
+ - improved parsing performance through `Syntax` refactoring
8
+   - instead of fresh `Syntax` instances, pre-loaded constants are now re-used
9
+   - this approximately doubles the parsing speed for simple regexps
10
+ - added methods to `Syntax` classes to show relative feature sets
11
+   - e.g. `Regexp::Syntax::V3_2_0.added_features`
12
+ - support for new Unicode properties of Ruby 3.2 / Unicode 14.0
13
+
1
14
  ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
2
15
 
3
16
  ### Fixed
data/Gemfile CHANGED
@@ -5,7 +5,7 @@ gemspec
5
5
  group :development, :test do
6
6
  gem 'ice_nine', '~> 0.11.2'
7
7
  gem 'rake', '~> 13.0'
8
- gem 'regexp_property_values', '~> 1.0'
8
+ gem 'regexp_property_values', '~> 1.3'
9
9
  gem 'rspec', '~> 3.10'
10
10
  if RUBY_VERSION.to_f >= 2.7
11
11
  gem 'gouteur'
data/README.md CHANGED
@@ -157,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
157
157
  flavor variations. Syntax only comes into play in the lexer.
158
158
 
159
159
  #### Example
160
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
160
+ The following fetches syntax objects for Ruby 2.0, 1.9, and 1.8, and
161
161
  checks a few of their implementation features.
162
162
 
163
163
  ```ruby
164
164
  require 'regexp_parser'
165
165
 
166
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
166
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
167
167
  ruby_20.implements? :quantifier, :zero_or_one # => true
168
168
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
169
169
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
170
170
  ruby_20.implements? :conditional, :condition # => true
171
171
 
172
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
172
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
173
173
  ruby_19.implements? :quantifier, :zero_or_one # => true
174
174
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
175
175
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
176
176
  ruby_19.implements? :conditional, :condition # => false
177
177
 
178
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
178
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
179
179
  ruby_18.implements? :quantifier, :zero_or_one # => true
180
180
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
181
181
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
182
182
  ruby_18.implements? :conditional, :condition # => false
183
183
  ```
184
184
 
185
+ Syntax objects can also be queried about their complete and relative feature sets.
186
+
187
+ ```ruby
188
+ require 'regexp_parser'
189
+
190
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
191
+ ruby_20.added_features # => { conditional: [...], ... }
192
+ ruby_20.removed_features # => { property: [:newline], ... }
193
+ ruby_20.features # => { anchor: [...], ... }
194
+ ```
185
195
 
186
196
  #### Notes
187
197
  * Variations on a token, for example a named group with angle brackets (< and >)
data/Rakefile CHANGED
@@ -5,9 +5,7 @@ require 'rake'
5
5
  require 'rake/testtask'
6
6
  require 'rspec/core/rake_task'
7
7
 
8
- RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
9
- RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
10
- RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
11
9
 
12
10
  Bundler::GemHelper.install_tasks
13
11
 
@@ -19,60 +17,7 @@ namespace :test do
19
17
  task full: [:'ragel:rb', :spec]
20
18
  end
21
19
 
22
- namespace :ragel do
23
- desc "Process the ragel source files and output ruby code"
24
- task :rb do
25
- RAGEL_SOURCE_FILES.each do |source_file|
26
- output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
27
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
28
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
29
-
30
- contents = File.read(output_file)
31
-
32
- File.open(output_file, 'r+') do |file|
33
- contents = "# -*- warn-indent:false; -*-\n" + contents
34
-
35
- file.write(contents)
36
- end
37
- end
38
- end
39
-
40
- desc "Delete the ragel generated source file(s)"
41
- task :clean do
42
- RAGEL_SOURCE_FILES.each do |file|
43
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
44
- end
45
- end
46
- end
47
-
48
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
49
21
  # latest scanner code is generated and included in the build.
50
22
  desc "Runs ragel:rb before building the gem"
51
23
  task :build => ['ragel:rb']
52
-
53
- namespace :props do
54
- desc 'Write new property value hashes for the properties scanner'
55
- task :update do
56
- require 'regexp_property_values'
57
- RegexpPropertyValues.update
58
- dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
59
-
60
- write_hash_to_file = ->(hash, path) do
61
- File.open(path, 'w') do |f|
62
- f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
63
- *hash.sort.map { |pair| pair.join(',') }
64
- end
65
- puts "Wrote #{hash.count} aliases to `#{path}`"
66
- end
67
-
68
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
69
- [val.identifier, val.full_name.downcase]
70
- end
71
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
72
-
73
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
74
- [k.identifier, v.full_name.downcase]
75
- end
76
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
77
- end
78
- end
@@ -18,7 +18,7 @@ class Regexp::Lexer
18
18
  end
19
19
 
20
20
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.new(syntax)
21
+ syntax = Regexp::Syntax.for(syntax)
22
22
 
23
23
  self.tokens = []
24
24
  self.nesting = 0
@@ -6,6 +6,7 @@ age=11.0,age=11.0
6
6
  age=12.0,age=12.0
7
7
  age=12.1,age=12.1
8
8
  age=13.0,age=13.0
9
+ age=14.0,age=14.0
9
10
  age=2.0,age=2.0
10
11
  age=2.1,age=2.1
11
12
  age=3.0,age=3.0
@@ -72,6 +73,7 @@ coptic,coptic
72
73
  cuneiform,cuneiform
73
74
  currencysymbol,currency_symbol
74
75
  cypriot,cypriot
76
+ cyprominoan,cypro_minoan
75
77
  cyrillic,cyrillic
76
78
  dash,dash
77
79
  dashpunctuation,dash_punctuation
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
136
138
  inancientsymbols,in_ancient_symbols
137
139
  inarabic,in_arabic
138
140
  inarabicextendeda,in_arabic_extended_a
141
+ inarabicextendedb,in_arabic_extended_b
139
142
  inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
140
143
  inarabicpresentationformsa,in_arabic_presentation_forms_a
141
144
  inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
197
200
  incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
198
201
  incurrencysymbols,in_currency_symbols
199
202
  incypriotsyllabary,in_cypriot_syllabary
203
+ incyprominoan,in_cypro_minoan
200
204
  incyrillic,in_cyrillic
201
205
  incyrillicextendeda,in_cyrillic_extended_a
202
206
  incyrillicextendedb,in_cyrillic_extended_b
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
223
227
  inethiopic,in_ethiopic
224
228
  inethiopicextended,in_ethiopic_extended
225
229
  inethiopicextendeda,in_ethiopic_extended_a
230
+ inethiopicextendedb,in_ethiopic_extended_b
226
231
  inethiopicsupplement,in_ethiopic_supplement
227
232
  ingeneralpunctuation,in_general_punctuation
228
233
  ingeometricshapes,in_geometric_shapes
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
264
269
  injavanese,in_javanese
265
270
  inkaithi,in_kaithi
266
271
  inkanaextendeda,in_kana_extended_a
272
+ inkanaextendedb,in_kana_extended_b
267
273
  inkanasupplement,in_kana_supplement
268
274
  inkanbun,in_kanbun
269
275
  inkangxiradicals,in_kangxi_radicals
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
285
291
  inlatinextendedc,in_latin_extended_c
286
292
  inlatinextendedd,in_latin_extended_d
287
293
  inlatinextendede,in_latin_extended_e
294
+ inlatinextendedf,in_latin_extended_f
295
+ inlatinextendedg,in_latin_extended_g
288
296
  inlepcha,in_lepcha
289
297
  inletterlikesymbols,in_letterlike_symbols
290
298
  inlimbu,in_limbu
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
349
357
  inoldsogdian,in_old_sogdian
350
358
  inoldsoutharabian,in_old_south_arabian
351
359
  inoldturkic,in_old_turkic
360
+ inolduyghur,in_old_uyghur
352
361
  inopticalcharacterrecognition,in_optical_character_recognition
353
362
  inoriya,in_oriya
354
363
  inornamentaldingbats,in_ornamental_dingbats
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
413
422
  intakri,in_takri
414
423
  intamil,in_tamil
415
424
  intamilsupplement,in_tamil_supplement
425
+ intangsa,in_tangsa
416
426
  intangut,in_tangut
417
427
  intangutcomponents,in_tangut_components
418
428
  intangutsupplement,in_tangut_supplement
@@ -422,15 +432,18 @@ inthai,in_thai
422
432
  intibetan,in_tibetan
423
433
  intifinagh,in_tifinagh
424
434
  intirhuta,in_tirhuta
435
+ intoto,in_toto
425
436
  intransportandmapsymbols,in_transport_and_map_symbols
426
437
  inugaritic,in_ugaritic
427
438
  inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
428
439
  inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
440
+ inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
429
441
  invai,in_vai
430
442
  invariationselectors,in_variation_selectors
431
443
  invariationselectorssupplement,in_variation_selectors_supplement
432
444
  invedicextensions,in_vedic_extensions
433
445
  inverticalforms,in_vertical_forms
446
+ invithkuqi,in_vithkuqi
434
447
  inwancho,in_wancho
435
448
  inwarangciti,in_warang_citi
436
449
  inyezidi,in_yezidi
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
438
451
  inyiradicals,in_yi_radicals
439
452
  inyisyllables,in_yi_syllables
440
453
  inzanabazarsquare,in_zanabazar_square
454
+ inznamennymusicalnotation,in_znamenny_musical_notation
441
455
  javanese,javanese
442
456
  joincontrol,join_control
443
457
  kaithi,kaithi
@@ -509,6 +523,7 @@ oldpersian,old_persian
509
523
  oldsogdian,old_sogdian
510
524
  oldsoutharabian,old_south_arabian
511
525
  oldturkic,old_turkic
526
+ olduyghur,old_uyghur
512
527
  openpunctuation,open_punctuation
513
528
  oriya,oriya
514
529
  osage,osage
@@ -573,6 +588,7 @@ taitham,tai_tham
573
588
  taiviet,tai_viet
574
589
  takri,takri
575
590
  tamil,tamil
591
+ tangsa,tangsa
576
592
  tangut,tangut
577
593
  telugu,telugu
578
594
  terminalpunctuation,terminal_punctuation
@@ -582,6 +598,7 @@ tibetan,tibetan
582
598
  tifinagh,tifinagh
583
599
  tirhuta,tirhuta
584
600
  titlecaseletter,titlecase_letter
601
+ toto,toto
585
602
  ugaritic,ugaritic
586
603
  unassigned,unassigned
587
604
  unifiedideograph,unified_ideograph
@@ -591,6 +608,7 @@ uppercase,uppercase
591
608
  uppercaseletter,uppercase_letter
592
609
  vai,vai
593
610
  variationselector,variation_selector
611
+ vithkuqi,vithkuqi
594
612
  wancho,wancho
595
613
  warangciti,warang_citi
596
614
  whitespace,white_space
@@ -31,6 +31,7 @@ cn,unassigned
31
31
  co,private_use
32
32
  combiningmark,mark
33
33
  copt,coptic
34
+ cpmn,cypro_minoan
34
35
  cprt,cypriot
35
36
  cs,surrogate
36
37
  cwcf,changes_when_casefolded
@@ -154,6 +155,7 @@ orkh,old_turkic
154
155
  orya,oriya
155
156
  osge,osage
156
157
  osma,osmanya
158
+ ougr,old_uyghur
157
159
  oupper,other_uppercase
158
160
  p,punctuation
159
161
  palm,palmyrene
@@ -219,9 +221,11 @@ tglg,tagalog
219
221
  thaa,thaana
220
222
  tibt,tibetan
221
223
  tirh,tirhuta
224
+ tnsa,tangsa
222
225
  ugar,ugaritic
223
226
  uideo,unified_ideograph
224
227
  vaii,vai
228
+ vith,vithkuqi
225
229
  vs,variation_selector
226
230
  wara,warang_citi
227
231
  wcho,wancho