regexp_parser 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 381a794200168f95ff6329cc8a01330d21a05e02b75e0b06dcc6bd8f763c111d
4
- data.tar.gz: bd7617cb3763e6d759c8e1364aed037ae2fff85af3cf28823476cadd14ff080e
3
+ metadata.gz: 369b108d8410e12bd6af5c659f58cb56c583e48780c1b35b6270bb21cc6a4ee7
4
+ data.tar.gz: 30cd2c0823ae154a2db04c705f898f252774ec8ab9ef304833c5e3546ba7406a
5
5
  SHA512:
6
- metadata.gz: 0a039012013e9b57329fd685aaf29386d8b848071e514f59df0acc3437a1dae5c76b6bf94158cc3deece08f3a1fec9437ac84590d97f8590d8dcee1e0dc6c726
7
- data.tar.gz: 4d67da41fbef9b9336ccfd02e3a742286bf4ef96d469c8aa2bbb9a6a55ed4aa6027a28b10ba6c9993b15937e3fe51a349632bcf5808f6237cf77a1d29ceb74f2
6
+ metadata.gz: 4104bec7dd02a7ea099de9aeacb766fb1a2db50cb52bd84f44e4bde93431d436b75d0f1b3f4d62242713a1eeca3f4d8c0be034270d515979aad8ad2d504880b0
7
+ data.tar.gz: 11deb2d7c8a6fad3fa9cb18b3f29cae15bab7e12e6cbbc968706dd02c16b0d1a6b1d69f05a5f665f7b46947315b0ea4ecda62dab8ddca8b5ef71f521b877da74
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
1
+ ## [Unreleased]
2
+
3
+ ## [2.3.0] - 2022-04-08 - [Janosch Müller](mailto:janosch84@gmail.com)
4
+
5
+ ### Added
6
+
7
+ - improved parsing performance through `Syntax` refactoring
8
+ - instead of fresh `Syntax` instances, pre-loaded constants are now re-used
9
+ - this approximately doubles the parsing speed for simple regexps
10
+ - added methods to `Syntax` classes to show relative feature sets
11
+ - e.g. `Regexp::Syntax::V3_2_0.added_features`
12
+ - support for new unicode properties of Ruby 3.2 / Unicode 14.0
13
+
1
14
  ## [2.2.1] - 2022-02-11 - [Janosch Müller](mailto:janosch84@gmail.com)
2
15
 
3
16
  ### Fixed
data/Gemfile CHANGED
@@ -5,7 +5,7 @@ gemspec
5
5
  group :development, :test do
6
6
  gem 'ice_nine', '~> 0.11.2'
7
7
  gem 'rake', '~> 13.0'
8
- gem 'regexp_property_values', '~> 1.0'
8
+ gem 'regexp_property_values', '~> 1.3'
9
9
  gem 'rspec', '~> 3.10'
10
10
  if RUBY_VERSION.to_f >= 2.7
11
11
  gem 'gouteur'
data/README.md CHANGED
@@ -157,31 +157,41 @@ flavor). Syntax classes act as lookup tables, and are layered to create
157
157
  flavor variations. Syntax only comes into play in the lexer.
158
158
 
159
159
  #### Example
160
- The following instantiates syntax objects for Ruby 2.0, 1.9, 1.8, and
160
+ The following fetches syntax objects for Ruby 2.0, 1.9, 1.8, and
161
161
  checks a few of their implementation features.
162
162
 
163
163
  ```ruby
164
164
  require 'regexp_parser'
165
165
 
166
- ruby_20 = Regexp::Syntax.new 'ruby/2.0'
166
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0'
167
167
  ruby_20.implements? :quantifier, :zero_or_one # => true
168
168
  ruby_20.implements? :quantifier, :zero_or_one_reluctant # => true
169
169
  ruby_20.implements? :quantifier, :zero_or_one_possessive # => true
170
170
  ruby_20.implements? :conditional, :condition # => true
171
171
 
172
- ruby_19 = Regexp::Syntax.new 'ruby/1.9'
172
+ ruby_19 = Regexp::Syntax.for 'ruby/1.9'
173
173
  ruby_19.implements? :quantifier, :zero_or_one # => true
174
174
  ruby_19.implements? :quantifier, :zero_or_one_reluctant # => true
175
175
  ruby_19.implements? :quantifier, :zero_or_one_possessive # => true
176
176
  ruby_19.implements? :conditional, :condition # => false
177
177
 
178
- ruby_18 = Regexp::Syntax.new 'ruby/1.8'
178
+ ruby_18 = Regexp::Syntax.for 'ruby/1.8'
179
179
  ruby_18.implements? :quantifier, :zero_or_one # => true
180
180
  ruby_18.implements? :quantifier, :zero_or_one_reluctant # => true
181
181
  ruby_18.implements? :quantifier, :zero_or_one_possessive # => false
182
182
  ruby_18.implements? :conditional, :condition # => false
183
183
  ```
184
184
 
185
+ Syntax objects can also be queried about their complete and relative feature sets.
186
+
187
+ ```ruby
188
+ require 'regexp_parser'
189
+
190
+ ruby_20 = Regexp::Syntax.for 'ruby/2.0' # => Regexp::Syntax::V2_0_0
191
+ ruby_20.added_features # => { conditional: [...], ... }
192
+ ruby_20.removed_features # => { property: [:newline], ... }
193
+ ruby_20.features # => { anchor: [...], ... }
194
+ ```
185
195
 
186
196
  #### Notes
187
197
  * Variations on a token, for example a named group with angle brackets (< and >)
data/Rakefile CHANGED
@@ -5,9 +5,7 @@ require 'rake'
5
5
  require 'rake/testtask'
6
6
  require 'rspec/core/rake_task'
7
7
 
8
- RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
9
- RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
10
- RAGEL_SOURCE_FILES = %w[scanner] # scanner.rl imports the other files
8
+ Dir['tasks/**/*.rake'].each { |file| load(file) }
11
9
 
12
10
  Bundler::GemHelper.install_tasks
13
11
 
@@ -19,60 +17,7 @@ namespace :test do
19
17
  task full: [:'ragel:rb', :spec]
20
18
  end
21
19
 
22
- namespace :ragel do
23
- desc "Process the ragel source files and output ruby code"
24
- task :rb do
25
- RAGEL_SOURCE_FILES.each do |source_file|
26
- output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
27
- # using faster flat table driven FSM, about 25% larger code, but about 30% faster
28
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
29
-
30
- contents = File.read(output_file)
31
-
32
- File.open(output_file, 'r+') do |file|
33
- contents = "# -*- warn-indent:false; -*-\n" + contents
34
-
35
- file.write(contents)
36
- end
37
- end
38
- end
39
-
40
- desc "Delete the ragel generated source file(s)"
41
- task :clean do
42
- RAGEL_SOURCE_FILES.each do |file|
43
- sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
44
- end
45
- end
46
- end
47
-
48
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
49
21
  # latest scanner code is generated and included in the build.
50
22
  desc "Runs ragel:rb before building the gem"
51
23
  task :build => ['ragel:rb']
52
-
53
- namespace :props do
54
- desc 'Write new property value hashes for the properties scanner'
55
- task :update do
56
- require 'regexp_property_values'
57
- RegexpPropertyValues.update
58
- dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
59
-
60
- write_hash_to_file = ->(hash, path) do
61
- File.open(path, 'w') do |f|
62
- f.puts "# THIS FILE IS AUTO-GENERATED BY `rake props:update` - DO NOT EDIT",
63
- *hash.sort.map { |pair| pair.join(',') }
64
- end
65
- puts "Wrote #{hash.count} aliases to `#{path}`"
66
- end
67
-
68
- long_names_to_tokens = RegexpPropertyValues.all.map do |val|
69
- [val.identifier, val.full_name.downcase]
70
- end
71
- write_hash_to_file.call(long_names_to_tokens, "#{dir}/long.csv")
72
-
73
- short_names_to_tokens = RegexpPropertyValues.alias_hash.map do |k, v|
74
- [k.identifier, v.full_name.downcase]
75
- end
76
- write_hash_to_file.call(short_names_to_tokens, "#{dir}/short.csv")
77
- end
78
- end
@@ -18,7 +18,7 @@ class Regexp::Lexer
18
18
  end
19
19
 
20
20
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.new(syntax)
21
+ syntax = Regexp::Syntax.for(syntax)
22
22
 
23
23
  self.tokens = []
24
24
  self.nesting = 0
@@ -6,6 +6,7 @@ age=11.0,age=11.0
6
6
  age=12.0,age=12.0
7
7
  age=12.1,age=12.1
8
8
  age=13.0,age=13.0
9
+ age=14.0,age=14.0
9
10
  age=2.0,age=2.0
10
11
  age=2.1,age=2.1
11
12
  age=3.0,age=3.0
@@ -72,6 +73,7 @@ coptic,coptic
72
73
  cuneiform,cuneiform
73
74
  currencysymbol,currency_symbol
74
75
  cypriot,cypriot
76
+ cyprominoan,cypro_minoan
75
77
  cyrillic,cyrillic
76
78
  dash,dash
77
79
  dashpunctuation,dash_punctuation
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
136
138
  inancientsymbols,in_ancient_symbols
137
139
  inarabic,in_arabic
138
140
  inarabicextendeda,in_arabic_extended_a
141
+ inarabicextendedb,in_arabic_extended_b
139
142
  inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
140
143
  inarabicpresentationformsa,in_arabic_presentation_forms_a
141
144
  inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
197
200
  incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
198
201
  incurrencysymbols,in_currency_symbols
199
202
  incypriotsyllabary,in_cypriot_syllabary
203
+ incyprominoan,in_cypro_minoan
200
204
  incyrillic,in_cyrillic
201
205
  incyrillicextendeda,in_cyrillic_extended_a
202
206
  incyrillicextendedb,in_cyrillic_extended_b
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
223
227
  inethiopic,in_ethiopic
224
228
  inethiopicextended,in_ethiopic_extended
225
229
  inethiopicextendeda,in_ethiopic_extended_a
230
+ inethiopicextendedb,in_ethiopic_extended_b
226
231
  inethiopicsupplement,in_ethiopic_supplement
227
232
  ingeneralpunctuation,in_general_punctuation
228
233
  ingeometricshapes,in_geometric_shapes
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
264
269
  injavanese,in_javanese
265
270
  inkaithi,in_kaithi
266
271
  inkanaextendeda,in_kana_extended_a
272
+ inkanaextendedb,in_kana_extended_b
267
273
  inkanasupplement,in_kana_supplement
268
274
  inkanbun,in_kanbun
269
275
  inkangxiradicals,in_kangxi_radicals
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
285
291
  inlatinextendedc,in_latin_extended_c
286
292
  inlatinextendedd,in_latin_extended_d
287
293
  inlatinextendede,in_latin_extended_e
294
+ inlatinextendedf,in_latin_extended_f
295
+ inlatinextendedg,in_latin_extended_g
288
296
  inlepcha,in_lepcha
289
297
  inletterlikesymbols,in_letterlike_symbols
290
298
  inlimbu,in_limbu
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
349
357
  inoldsogdian,in_old_sogdian
350
358
  inoldsoutharabian,in_old_south_arabian
351
359
  inoldturkic,in_old_turkic
360
+ inolduyghur,in_old_uyghur
352
361
  inopticalcharacterrecognition,in_optical_character_recognition
353
362
  inoriya,in_oriya
354
363
  inornamentaldingbats,in_ornamental_dingbats
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
413
422
  intakri,in_takri
414
423
  intamil,in_tamil
415
424
  intamilsupplement,in_tamil_supplement
425
+ intangsa,in_tangsa
416
426
  intangut,in_tangut
417
427
  intangutcomponents,in_tangut_components
418
428
  intangutsupplement,in_tangut_supplement
@@ -422,15 +432,18 @@ inthai,in_thai
422
432
  intibetan,in_tibetan
423
433
  intifinagh,in_tifinagh
424
434
  intirhuta,in_tirhuta
435
+ intoto,in_toto
425
436
  intransportandmapsymbols,in_transport_and_map_symbols
426
437
  inugaritic,in_ugaritic
427
438
  inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
428
439
  inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
440
+ inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
429
441
  invai,in_vai
430
442
  invariationselectors,in_variation_selectors
431
443
  invariationselectorssupplement,in_variation_selectors_supplement
432
444
  invedicextensions,in_vedic_extensions
433
445
  inverticalforms,in_vertical_forms
446
+ invithkuqi,in_vithkuqi
434
447
  inwancho,in_wancho
435
448
  inwarangciti,in_warang_citi
436
449
  inyezidi,in_yezidi
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
438
451
  inyiradicals,in_yi_radicals
439
452
  inyisyllables,in_yi_syllables
440
453
  inzanabazarsquare,in_zanabazar_square
454
+ inznamennymusicalnotation,in_znamenny_musical_notation
441
455
  javanese,javanese
442
456
  joincontrol,join_control
443
457
  kaithi,kaithi
@@ -509,6 +523,7 @@ oldpersian,old_persian
509
523
  oldsogdian,old_sogdian
510
524
  oldsoutharabian,old_south_arabian
511
525
  oldturkic,old_turkic
526
+ olduyghur,old_uyghur
512
527
  openpunctuation,open_punctuation
513
528
  oriya,oriya
514
529
  osage,osage
@@ -573,6 +588,7 @@ taitham,tai_tham
573
588
  taiviet,tai_viet
574
589
  takri,takri
575
590
  tamil,tamil
591
+ tangsa,tangsa
576
592
  tangut,tangut
577
593
  telugu,telugu
578
594
  terminalpunctuation,terminal_punctuation
@@ -582,6 +598,7 @@ tibetan,tibetan
582
598
  tifinagh,tifinagh
583
599
  tirhuta,tirhuta
584
600
  titlecaseletter,titlecase_letter
601
+ toto,toto
585
602
  ugaritic,ugaritic
586
603
  unassigned,unassigned
587
604
  unifiedideograph,unified_ideograph
@@ -591,6 +608,7 @@ uppercase,uppercase
591
608
  uppercaseletter,uppercase_letter
592
609
  vai,vai
593
610
  variationselector,variation_selector
611
+ vithkuqi,vithkuqi
594
612
  wancho,wancho
595
613
  warangciti,warang_citi
596
614
  whitespace,white_space
@@ -31,6 +31,7 @@ cn,unassigned
31
31
  co,private_use
32
32
  combiningmark,mark
33
33
  copt,coptic
34
+ cpmn,cypro_minoan
34
35
  cprt,cypriot
35
36
  cs,surrogate
36
37
  cwcf,changes_when_casefolded
@@ -154,6 +155,7 @@ orkh,old_turkic
154
155
  orya,oriya
155
156
  osge,osage
156
157
  osma,osmanya
158
+ ougr,old_uyghur
157
159
  oupper,other_uppercase
158
160
  p,punctuation
159
161
  palm,palmyrene
@@ -219,9 +221,11 @@ tglg,tagalog
219
221
  thaa,thaana
220
222
  tibt,tibetan
221
223
  tirh,tirhuta
224
+ tnsa,tangsa
222
225
  ugar,ugaritic
223
226
  uideo,unified_ideograph
224
227
  vaii,vai
228
+ vith,vithkuqi
225
229
  vs,variation_selector
226
230
  wara,warang_citi
227
231
  wcho,wancho