RubyGems - character_set - Versions diffs - 1.1.1 → 1.4.1 - Mend

character_set 1.1.1 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

checksums.yaml +4 -4
data/.gitattributes +3 -0
data/.github/workflows/lint.yml +29 -0
data/.github/workflows/tests.yml +22 -0
data/.gitignore +1 -0
data/.rubocop.yml +11 -0
data/BENCHMARK.md +53 -17
data/CHANGELOG.md +47 -0
data/README.md +38 -14
data/Rakefile +60 -36
data/benchmarks/count_in.rb +13 -0
data/benchmarks/delete_in.rb +1 -1
data/benchmarks/scan.rb +13 -0
data/benchmarks/shared.rb +5 -0
data/benchmarks/z_add.rb +12 -0
data/benchmarks/z_delete.rb +12 -0
data/benchmarks/z_merge.rb +15 -0
data/benchmarks/z_minmax.rb +12 -0
data/bin/console +2 -0
data/character_set.gemspec +17 -6
data/ext/character_set/character_set.c +963 -414
data/ext/character_set/unicode_casefold_table.h +10 -2
data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
data/lib/character_set/character.rb +1 -1
data/lib/character_set/core_ext/regexp_ext.rb +1 -1
data/lib/character_set/core_ext/string_ext.rb +3 -1
data/lib/character_set/expression_converter.rb +25 -27
data/lib/character_set/parser.rb +1 -1
data/lib/character_set/predefined_sets.rb +25 -260
data/lib/character_set/predefined_sets/any.cps +1 -0
data/lib/character_set/predefined_sets/ascii.cps +1 -0
data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
data/lib/character_set/predefined_sets/assigned.cps +666 -0
data/lib/character_set/predefined_sets/bmp.cps +2 -0
data/lib/character_set/predefined_sets/crypt.cps +2 -0
data/lib/character_set/predefined_sets/emoji.cps +151 -0
data/lib/character_set/predefined_sets/newline.cps +3 -0
data/lib/character_set/predefined_sets/surrogate.cps +1 -0
data/lib/character_set/predefined_sets/unicode.cps +2 -0
data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
data/lib/character_set/predefined_sets/url_host.cps +10 -0
data/lib/character_set/predefined_sets/url_path.cps +7 -0
data/lib/character_set/predefined_sets/url_query.cps +8 -0
data/lib/character_set/predefined_sets/whitespace.cps +10 -0
data/lib/character_set/ruby_fallback.rb +5 -3
data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
data/lib/character_set/shared_methods.rb +60 -49
data/lib/character_set/version.rb +1 -1
data/lib/character_set/writer.rb +98 -27
metadata +88 -22
data/.travis.yml +0 -11
data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: aa3f78dd78741bc520526d54554d8294cfb81283670b8b975c519c6197dfa6f8
-  data.tar.gz: 1617d3f9133688337ec7490fb0a7db3797609dc9fe9f3ea5c6bfa680310e766d
+  metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
+  data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
 SHA512:
-  metadata.gz: 816d5ac8bd2459a4c9080a3b1c3409f8de17c5e9847a196b01cbf2b5b4d753554a5d9fb78a891ee6bed97df92d217cc6ee230bb4f595e5ead569ee80a7385f3d
-  data.tar.gz: afd506628f34b4dadfd375e73ae23af69cd59f6423f77139357c9e71df1c7dd852b32c28e998ee38d9bc4ce2e7b726863ee41039ceede3c1f3f0058aad6e1f39
+  metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
+  data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3

data/.gitattributes ADDED

@@ -0,0 +1,3 @@
+*.cps linguist-detectable=false
+benchmarks/* linguist-detectable=false
+spec/ruby-spec/* linguist-vendored

data/.github/workflows/lint.yml ADDED

@@ -0,0 +1,29 @@
+# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
+name: rubocop linting
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Ruby
+      uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: 2.7
+    - name: Cache gems
+      uses: actions/cache@v1
+      with:
+        path: vendor/bundle
+        key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-rubocop-
+    - name: Install gems
+      run: |
+        bundle config path vendor/bundle
+        bundle install --jobs 4 --retry 3
+    - name: Run rubocop
+      run: bundle exec rubocop --lint

data/.github/workflows/tests.yml ADDED

@@ -0,0 +1,22 @@
+name: tests
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby ${{ matrix.ruby }}
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Install dependencies
+        run: bundle install --jobs 4
+      - name: Test with Rake
+        run: bundle exec rake

data/.gitignore CHANGED

@@ -15,6 +15,7 @@
 .ruby-version
 .tags
 .tags1
+.vscode
 bbin/
 binstubs/*
 bundler_stubs/*/.yardoc

data/.rubocop.yml ADDED

@@ -0,0 +1,11 @@
+AllCops:
+  Exclude:
+    - '**/doc/*'
+    - '**/pkg/*'
+    - '**/spec/ruby-spec/**/*'
+    - '**/vendor/**/*' # vendored dependencies
+  NewCops: enable
+  RubyInterpreters:
+    - ruby
+    - rake
+  TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop

data/BENCHMARK.md CHANGED

@@ -1,50 +1,86 @@
-Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
+Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
+```
+Counting non-letters
+CharacterSet#count_in:  9472902.2 i/s
+        String#count:  2221799.9 i/s - 4.26x slower
+```
 ```
 Detecting non-whitespace
- CharacterSet#cover?: 13244577.7 i/s
-       Regexp#match?:  8027017.5 i/s - 1.65x  slower
+ CharacterSet#cover?: 12388427.2 i/s
+       Regexp#match?:  7901676.8 i/s - 1.57x slower
 ```
 ```
 Detecting non-letters
- CharacterSet#cover?: 13082940.8 i/s
-       Regexp#match?:  5372589.2 i/s - 2.44x  slower
+ CharacterSet#cover?: 12263689.1 i/s
+       Regexp#match?:  4940889.9 i/s - 2.48x slower
 ```
 ```
 Removing whitespace
-CharacterSet#delete_in:   389315.6 i/s
-         String#gsub:   223773.5 i/s - 1.74x  slower
+CharacterSet#delete_in:  2406722.6 i/s
+         String#gsub:   235760.3 i/s - 10.21x slower
 ```
 ```
 Removing whitespace, emoji and umlauts
-CharacterSet#delete_in:   470239.3 i/s
-         String#gsub:   278679.4 i/s - 1.69x  slower
+CharacterSet#delete_in:  1653607.6 i/s
+         String#gsub:   272782.9 i/s - 6.06x slower
 ```
 ```
 Removing non-whitespace
-CharacterSet#keep_in:  1138461.0 i/s
-         String#gsub:   235287.4 i/s - 4.84x  slower
+CharacterSet#keep_in:  2671038.2 i/s
+         String#gsub:   242551.0 i/s - 11.01x slower
 ```
 ```
 Extracting emoji
-CharacterSet#keep_in:  1474472.0 i/s
-         String#gsub:   212269.6 i/s - 6.95x  slower
+CharacterSet#keep_in:  1726496.5 i/s
+         String#gsub:   215609.2 i/s - 8.01x slower
+```
+```
+Extracting emoji to an Array
+   CharacterSet#scan:  2373856.1 i/s
+         String#scan:   480000.5 i/s - 4.95x slower
 ```
 ```
 Detecting whitespace
-CharacterSet#used_by?: 13063108.7 i/s
-       Regexp#match?:  7215075.0 i/s - 1.81x  slower
+CharacterSet#used_by?: 11988328.7 i/s
+       Regexp#match?:  6758146.8 i/s - 1.77x slower
 ```
 ```
 Detecting emoji in a large string
-CharacterSet#used_by?:   246527.7 i/s
-       Regexp#match?:    92956.5 i/s - 2.65x  slower
+CharacterSet#used_by?:   288223.3 i/s
+       Regexp#match?:   102384.2 i/s - 2.82x slower
+```
+```
+Adding entries
+    CharacterSet#add:  2538251.2 i/s
+       SortedSet#add:   443925.9 i/s - 5.72x slower
+```
+```
+Removing entries
+ CharacterSet#delete:  2487620.8 i/s
+    SortedSet#delete:   628816.1 i/s - 3.96x slower
+```
+```
+Merging entries
+  CharacterSet#merge:      551.6 i/s
+     SortedSet#merge:        1.4 i/s - 393.59x slower
+```
+```
+Getting the min and max
+ CharacterSet#minmax:   636890.7 i/s
+    SortedSet#minmax:      254.1 i/s - 2506.20x slower
 ```

data/CHANGELOG.md CHANGED

@@ -4,6 +4,53 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [1.4.1] - 2021-01-10
+### Fixed
+- multiple fixes for Ruby 3
+  - fixed segfault for some `String` manipulation cases
+  - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
+- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
+## [1.4.0] - 2019-06-07
+### Added
+- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
+  - allows for much shorter astral plane representations e.g. in JavaScript
+  - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
+- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
+### Fixed
+- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
+## [1.3.0] - 2019-04-26
+### Added
+- improved `String` manipulation speed
+- improved initialization and `#merge` speed when passing a large `Range`
+- reduced memory consumption by > 90% for most use cases via dynamic resizing
+  - before, every set instance required 136 KB for codepoints
+  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
+- `#count_in` and `#scan_in` methods for `String` interaction
+- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
+- conversion methods `#assigned_part`, `#valid_part`
+- sectioning methods `#ascii_part`, `#plane(n)`
+- section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
+### Fixed
+- `#count` now supports passing an argument or block as usual
+- `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
+## [1.2.0] - 2019-04-02
+### Added
+- added latest Unicode casefold data (for `#case_insensitive`)
+## [1.1.2] - 2018-09-25
+### Fixed
+- restored `range_compressor` as a runtime dependency for JRuby only
 ## [1.1.1] - 2018-09-24
 ### Fixed

data/README.md CHANGED

@@ -1,15 +1,18 @@
 # CharacterSet
 [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
-[![Build Status](https://travis-ci.org/janosch-x/character_set.svg?branch=master)](https://travis-ci.org/janosch-x/character_set)
+[![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
+[![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
-A gem to build, read, write and compare sets of Unicode codepoints.
+This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
+It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
 Many parts can be used independently, e.g.:
 - `CharacterSet::Character`
 - `CharacterSet::Parser`
 - `CharacterSet::Writer`
-- [`RangeCompressor`](https://github.com/janosch-x/range_compressor)
+- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
 ## Usage
@@ -37,7 +40,7 @@ CharacterSet.parse('[a-c]')
 CharacterSet.parse('\U00000061-\U00000063')
 ```
-If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/janosch-x/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
+If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
 ```ruby
 CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
 ### Predefined utility sets
-`ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
+`ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
 ```ruby
 CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
 ### Interact with Strings
-CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
+`CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
 `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
 ```
 `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
 ```ruby
 string = 'Tüür'
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
 string # => ''
 ```
+`#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
+```ruby
+CharacterSet.non_ascii.count_in('Tüür') # => 2
+CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
+```
 There is also a core extension for String interaction.
 ```ruby
 require 'character_set/core_ext/string_ext'
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
 ### Manipulate
-Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
+Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
 Where appropriate, methods take both chars and codepoints, e.g.:
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
 # surrogate pair halves are not included by default
 CharacterSet['a'].inversion(include_surrogates: true)
-# => #<CharacterSet (size: 1114111)>
+# => #<CharacterSet (size: 1114112)>
 ```
 `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
 ```ruby
-CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
+CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
 ```
 ### Write
@@ -156,20 +167,33 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
 # disable abbreviation (grouping of codepoints in ranges)
 set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
-# for full js regex compatibility in case of astral members:
-set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
-```
+# astral members require some trickery if we want to target environments
+# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
+set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
-Note: If you run JRuby or another Ruby without C support, you will also need to install [`range_compressor`](https://github.com/janosch-x/range_compressor) for these writing operations.
+# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
+set.to_s_with_surrogate_ranges
+# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
+# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
+# don't work in your target environment:
+set.to_s_with_surrogate_alternation
+# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
+```
 ### Unicode plane methods
-There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
+There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
 ```Ruby
+CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
+CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
+CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
+CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
 CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
 CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
 CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
 CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
+CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
 CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
 CharacterSet::Character.new('a').plane # => 0
 ```

data/Rakefile CHANGED

@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
 task default: :spec
+namespace :spec do
+  task :quick do
+    ENV['SKIP_MEMSAFETY_SPECS'] = '1'
+    Rake::Task[:spec].invoke
+  end
+end
 Rake::ExtensionTask.new('character_set') do |ext|
   ext.lib_dir = 'lib/character_set'
 end
@@ -16,6 +23,8 @@ namespace :java do
   java_gemspec.platform = 'java'
   java_gemspec.extensions = []
+  java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
   Gem::PackageTask.new(java_gemspec) do |pkg|
     pkg.need_zip = true
     pkg.need_tar = true
@@ -33,43 +42,62 @@ task :sync_ruby_spec do
     'CharacterSet'       => './spec/ruby-spec/library/character_set',
     'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
   }
+  # download fresh specs from ruby/spec repository
   variants.each do |_, dir|
     FileUtils.rm_rf(dir) if File.exist?(dir)
     `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
   end
+  # make copies for each CharacterSet variant
   base = variants.first[1]
   variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
-  variants.each.with_index do |(class_name, dir), i|
+  # adapt specs to work with CharacterSet
+  variants.each do |class_name, dir|
     Dir["#{dir}/**/*.rb"].each do |spec|
-      # remove some tests that do not apply or are covered otherwise
-      if spec =~ %r{/(flatten|initialize|pretty_print)}
+      # ignore some tests that do not apply or are covered otherwise
+      if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
         File.delete(spec)
         next
       end
-      # some examples w. Strings must be adapted, "mspec" made rspec-compatible,
-      # and `i` added to shared example names or they'll override each other
       adapted_content =
-        File
-        .read(spec)
-        .gsub('SortedSet', class_name)
-        .gsub('sorted_set_', "sorted_set_#{i}_")
-        .gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |method|')
-        .gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0')
-        .gsub('"one"', '1')
-        .gsub('"two"', '2')
-        .gsub('"three"', '3')
-        .gsub('"four"', '4')
-        .gsub('"five"', '5')
-        .gsub('@method', 'method')
-        .gsub(/be_(false|true)/, 'be \1')
-        .gsub('mock', 'double')
+        File.read(spec).
+        # adapt class name
+        gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
+        gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
+        # get shared specs from a single shared dir at the parent level
+        gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
+        # make 'mspec' syntax rspec-compatible
+        gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
+        gsub(/be_(false|true)/, 'be \1').
+        gsub('stub!', 'stub').
+        gsub('mock', 'double').
+        gsub('@method', 'method').
+        # remove unneeded requires
+        gsub(/require 'set'\n/, '').
+        gsub(/require.*spec_helper.*\n/, '').
+        gsub(/\A\n+/, '').
+        # make examples use Integers/codepoints
+        gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
+        gsub('"one"', '1').
+        gsub('"two"', '2').
+        gsub('"three"', '3').
+        gsub('"four"', '4').
+        gsub('"five"', '5').
+        gsub(/x.(size|length) == 3/, 'x != 3').
+        gsub(/x.(size|length) != 3/, 'x == 3').
+        gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
       File.open(spec, 'w') { |f| f.puts adapted_content }
     end
   end
+  # keep only one copy of the shared specs, at the parent level
+  FileUtils.rm_rf(base + '/../shared')
+  FileUtils.mv(base + '/shared', base + '/../')
+  variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
 end
 desc 'Download unicode casefold data and write new C header file'
@@ -85,26 +113,22 @@ task :sync_casefold_data do
     hash[from] = to if type == 'C'
   end.sort
-  File.open(dst_path, 'w') do |f|
-    f.puts <<-C
-// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT'
-typedef struct casefold_mapping {
-  unsigned long from;
-  unsigned long to;
-} casefold_mapping;
+  content = File.read(dst_path + '.tmpl')
+    .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
+    .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
-#define CASEFOLD_COUNT #{mapping.size}
-static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
-    C
-    mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
+  File.write(dst_path, content)
+  File.unlink(src_path)
+end
-    f.puts '};'
+desc 'Update codepoint data for predefined sets, based on Onigmo'
+task :sync_predefined_sets do
+  %w[assigned emoji whitespace].each do |prop|
+    require 'regexp_property_values'
+    ranges = RegexpPropertyValues[prop].matched_ranges
+    str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
+    File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
   end
-  File.unlink(src_path)
 end
 desc 'Run all IPS benchmarks'