RubyGems - character_set - Versions diffs - 1.2.0 → 1.5.0 - Mend

character_set 1.2.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

checksums.yaml +4 -4
data/.gitattributes +3 -0
data/.github/workflows/gouteur.yml +20 -0
data/.github/workflows/lint.yml +29 -0
data/.github/workflows/tests.yml +22 -0
data/.gitignore +1 -0
data/.gouteur.yml +2 -0
data/.rubocop.yml +17 -0
data/BENCHMARK.md +53 -17
data/CHANGELOG.md +54 -0
data/README.md +51 -12
data/Rakefile +20 -18
data/benchmarks/count_in.rb +13 -0
data/benchmarks/delete_in.rb +1 -1
data/benchmarks/scan.rb +13 -0
data/benchmarks/shared.rb +5 -0
data/benchmarks/z_add.rb +12 -0
data/benchmarks/z_delete.rb +12 -0
data/benchmarks/z_merge.rb +15 -0
data/benchmarks/z_minmax.rb +12 -0
data/bin/console +2 -0
data/character_set.gemspec +17 -4
data/ext/character_set/character_set.c +969 -415
data/ext/character_set/unicode_casefold_table.h +44 -1
data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
data/lib/character_set/character.rb +1 -1
data/lib/character_set/core_ext/regexp_ext.rb +1 -1
data/lib/character_set/core_ext/string_ext.rb +3 -1
data/lib/character_set/expression_converter.rb +41 -43
data/lib/character_set/parser.rb +1 -1
data/lib/character_set/predefined_sets/any.cps +1 -0
data/lib/character_set/predefined_sets/ascii.cps +1 -0
data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
data/lib/character_set/predefined_sets/assigned.cps +677 -0
data/lib/character_set/predefined_sets/bmp.cps +2 -0
data/lib/character_set/predefined_sets/crypt.cps +2 -0
data/lib/character_set/predefined_sets/emoji.cps +152 -0
data/lib/character_set/predefined_sets/newline.cps +3 -0
data/lib/character_set/predefined_sets/surrogate.cps +1 -0
data/lib/character_set/predefined_sets/unicode.cps +2 -0
data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
data/lib/character_set/predefined_sets/url_host.cps +10 -0
data/lib/character_set/predefined_sets/url_path.cps +7 -0
data/lib/character_set/predefined_sets/url_query.cps +8 -0
data/lib/character_set/predefined_sets/whitespace.cps +10 -0
data/lib/character_set/predefined_sets.rb +25 -260
data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
data/lib/character_set/ruby_fallback.rb +5 -3
data/lib/character_set/set_method_adapters.rb +4 -3
data/lib/character_set/shared_methods.rb +69 -50
data/lib/character_set/version.rb +1 -1
data/lib/character_set/writer.rb +98 -27
metadata +114 -17
data/.travis.yml +0 -8
data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 434323b3b99246a17ea5e062afd87d3edc3c09927b2231b4409b295ff63c7d6c
-  data.tar.gz: 174c6dc751b03e49cf87045fad9a48100460244b7d7e25deef27066bd4aef92c
+  metadata.gz: 9622bc20bbdb48f8deff84dbed9e800e6bc500a6a08a27e7b3aea2ea651cd278
+  data.tar.gz: 5853e8d5be7e9a1963419aa4f9fbc631148fe5bef45aa185b9117d32b44aa959
 SHA512:
-  metadata.gz: d9fa059ea3171209af537f0bd7636e3a65b962f30029ca399fe2fa0bd6168dd692b7bc5fb1014590a830b2e9aede9c26ae00ae8fe4a2eae4a86cf95e208b507d
-  data.tar.gz: 692f4596b6adc9b44879b69fb82e55dc90d107156ecabb96c14ea91b4dc0c7dc706724b42093d0ef762cdac697f05ef855c5f462451015e1d06022ab06bc1c8d
+  metadata.gz: 2cc2a60b9388a2e3beef66da20aa8205cc501980a7dc66f2716c66f7e999a083927b27a761e6b932b6d5c16b8e5968f8e04370ecf3c999326f378f60bfa3cedc
+  data.tar.gz: a2a8d1f9ac6cdf6302af98662fc3efda4b8c6fe003c7cdc853a61a64f9c7a596b1bbd7a79dca19081b8ce2576f9c3d848869141b164c145e22befaaffec8b265

data/.gitattributes ADDED Viewed

@@ -0,0 +1,3 @@
+*.cps linguist-detectable=false
+benchmarks/* linguist-detectable=false
+spec/ruby-spec/* linguist-vendored

data/.github/workflows/gouteur.yml ADDED Viewed

@@ -0,0 +1,20 @@
+name: gouteur
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 2.7
+      - name: Prepare
+        run: |
+          bundle install --jobs 4
+          bundle exec rake compile
+      - name: Test
+        run: bundle exec gouteur

data/.github/workflows/lint.yml ADDED Viewed

@@ -0,0 +1,29 @@
+# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
+name: rubocop linting
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Ruby
+      uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: 2.7
+    - name: Cache gems
+      uses: actions/cache@v1
+      with:
+        path: vendor/bundle
+        key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-rubocop-
+    - name: Install gems
+      run: |
+        bundle config path vendor/bundle
+        bundle install --jobs 4 --retry 3
+    - name: Run rubocop
+      run: bundle exec rubocop --lint

data/.github/workflows/tests.yml ADDED Viewed

@@ -0,0 +1,22 @@
+name: tests
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Ruby ${{ matrix.ruby }}
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Install dependencies
+        run: bundle install --jobs 4
+      - name: Test with Rake
+        run: bundle exec rake

data/.gitignore CHANGED Viewed

@@ -15,6 +15,7 @@
 .ruby-version
 .tags
 .tags1
+.tool-versions
 .vscode
 bbin/
 binstubs/*

data/.gouteur.yml ADDED Viewed

@@ -0,0 +1,2 @@
+repos:
+- uri: https://github.com/jaynetics/js_regex

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,17 @@
+AllCops:
+  Exclude:
+    - '**/doc/*'
+    - '**/pkg/*'
+    - '**/spec/ruby-spec/**/*'
+    - '**/vendor/**/*' # vendored dependencies
+  NewCops: enable
+  RubyInterpreters:
+    - ruby
+    - rake
+  TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
+Lint/AmbiguousOperatorPrecedence:
+  Enabled: false
+Lint/AmbiguousRegexpLiteral:
+  Enabled: false

data/BENCHMARK.md CHANGED Viewed

@@ -1,50 +1,86 @@
-Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
+Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
+```
+Counting non-letters
+CharacterSet#count_in:  9472902.2 i/s
+        String#count:  2221799.9 i/s - 4.26x slower
+```
 ```
 Detecting non-whitespace
- CharacterSet#cover?: 13244577.7 i/s
-       Regexp#match?:  8027017.5 i/s - 1.65x  slower
+ CharacterSet#cover?: 12388427.2 i/s
+       Regexp#match?:  7901676.8 i/s - 1.57x slower
 ```
 ```
 Detecting non-letters
- CharacterSet#cover?: 13082940.8 i/s
-       Regexp#match?:  5372589.2 i/s - 2.44x  slower
+ CharacterSet#cover?: 12263689.1 i/s
+       Regexp#match?:  4940889.9 i/s - 2.48x slower
 ```
 ```
 Removing whitespace
-CharacterSet#delete_in:   389315.6 i/s
-         String#gsub:   223773.5 i/s - 1.74x  slower
+CharacterSet#delete_in:  2406722.6 i/s
+         String#gsub:   235760.3 i/s - 10.21x slower
 ```
 ```
 Removing whitespace, emoji and umlauts
-CharacterSet#delete_in:   470239.3 i/s
-         String#gsub:   278679.4 i/s - 1.69x  slower
+CharacterSet#delete_in:  1653607.6 i/s
+         String#gsub:   272782.9 i/s - 6.06x slower
 ```
 ```
 Removing non-whitespace
-CharacterSet#keep_in:  1138461.0 i/s
-         String#gsub:   235287.4 i/s - 4.84x  slower
+CharacterSet#keep_in:  2671038.2 i/s
+         String#gsub:   242551.0 i/s - 11.01x slower
 ```
 ```
 Extracting emoji
-CharacterSet#keep_in:  1474472.0 i/s
-         String#gsub:   212269.6 i/s - 6.95x  slower
+CharacterSet#keep_in:  1726496.5 i/s
+         String#gsub:   215609.2 i/s - 8.01x slower
+```
+```
+Extracting emoji to an Array
+   CharacterSet#scan:  2373856.1 i/s
+         String#scan:   480000.5 i/s - 4.95x slower
 ```
 ```
 Detecting whitespace
-CharacterSet#used_by?: 13063108.7 i/s
-       Regexp#match?:  7215075.0 i/s - 1.81x  slower
+CharacterSet#used_by?: 11988328.7 i/s
+       Regexp#match?:  6758146.8 i/s - 1.77x slower
 ```
 ```
 Detecting emoji in a large string
-CharacterSet#used_by?:   246527.7 i/s
-       Regexp#match?:    92956.5 i/s - 2.65x  slower
+CharacterSet#used_by?:   288223.3 i/s
+       Regexp#match?:   102384.2 i/s - 2.82x slower
+```
+```
+Adding entries
+    CharacterSet#add:  2538251.2 i/s
+       SortedSet#add:   443925.9 i/s - 5.72x slower
+```
+```
+Removing entries
+ CharacterSet#delete:  2487620.8 i/s
+    SortedSet#delete:   628816.1 i/s - 3.96x slower
+```
+```
+Merging entries
+  CharacterSet#merge:      551.6 i/s
+     SortedSet#merge:        1.4 i/s - 393.59x slower
+```
+```
+Getting the min and max
+ CharacterSet#minmax:   636890.7 i/s
+    SortedSet#minmax:      254.1 i/s - 2506.20x slower
 ```

data/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,60 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [1.5.0] - 2021-12-05
+### Added
+- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
+- latest unicode case-folding data (for `#case_insensitive`)
+- support for passing any Enumerable to `#disjoint?`, `#intersect?`
+  - this matches recent broadening of these methods in `ruby/set`
+- new instance method `#secure_token` (see README)
+- class method `::of` now accepts more than one `String`
+- `CharacterSet::ExpressionConverter` can now build output of any Set-like class
+### Fixed
+- `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
+  - it used to return a regular `CharacterSet`
+## [1.4.1] - 2020-01-10
+### Fixed
+- multiple fixes for Ruby 3
+  - fixed segfault for some `String` manipulation cases
+  - added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
+- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
+## [1.4.0] - 2019-06-07
+### Added
+- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
+  - allows for much shorter astral plane representations e.g. in JavaScript
+  - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
+- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
+### Fixed
+- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
+## [1.3.0] - 2019-04-26
+### Added
+- improved `String` manipulation speed
+- improved initialization and `#merge` speed when passing a large `Range`
+- reduced memory consumption by > 90% for most use cases via dynamic resizing
+  - before, every set instance required 136 KB for codepoints
+  - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
+- `#count_in` and `#scan_in` methods for `String` interaction
+- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
+- conversion methods `#assigned_part`, `#valid_part`
+- sectioning methods `#ascii_part`, `#plane(n)`
+- section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
+### Fixed
+- `#count` now supports passing an argument or block as usual
+- `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
 ## [1.2.0] - 2019-04-02
 ### Added

data/README.md CHANGED Viewed

@@ -1,12 +1,17 @@
 # CharacterSet
 [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
-[![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
+[![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
+[![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
+[![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
-A gem to build, read, write and compare sets of Unicode codepoints.
+This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
+It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
 Many parts can be used independently, e.g.:
 - `CharacterSet::Character`
+- `CharacterSet::ExpressionConverter`
 - `CharacterSet::Parser`
 - `CharacterSet::Writer`
 - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
@@ -37,7 +42,7 @@ CharacterSet.parse('[a-c]')
 CharacterSet.parse('\U00000061-\U00000063')
 ```
-If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
+If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
 ```ruby
 CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -49,7 +54,7 @@ require 'character_set/core_ext/regexp_ext'
 ### Predefined utility sets
-`ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
+`ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
 ```ruby
 CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +65,7 @@ CharacterSet.non_ascii
 ### Interact with Strings
-CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
+`CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
 `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
@@ -71,6 +76,7 @@ CharacterSet.ascii.cover?('Tr') # => true
 ```
 `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
 ```ruby
 string = 'Tüür'
@@ -84,6 +90,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
 string # => ''
 ```
+`#count_in` and `#scan` can replace `String#count` and `String#scan`:
+```ruby
+CharacterSet.non_ascii.count_in('Tüür') # => 2
+CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
+```
 There is also a core extension for String interaction.
 ```ruby
 require 'character_set/core_ext/string_ext'
@@ -100,7 +113,7 @@ require 'character_set/core_ext/string_ext'
 ### Manipulate
-Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
+Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
 Where appropriate, methods take both chars and codepoints, e.g.:
@@ -122,13 +135,13 @@ non_a.include?('ü') # => true
 # surrogate pair halves are not included by default
 CharacterSet['a'].inversion(include_surrogates: true)
-# => #<CharacterSet (size: 1114111)>
+# => #<CharacterSet (size: 1114112)>
 ```
 `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
 ```ruby
-CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
+CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
 ```
 ### Write
@@ -156,18 +169,44 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
 # disable abbreviation (grouping of codepoints in ranges)
 set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
-# for full js regex compatibility in case of astral members:
-set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
+# astral members require some trickery if we want to target environments
+# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
+set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
+# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
+set.to_s_with_surrogate_ranges
+# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
+# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
+# don't work in your target environment:
+set.to_s_with_surrogate_alternation
+# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
+```
+### Other features
+#### Secure tokens
+Generate secure random strings of characters from a set:
+```ruby
+CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
+CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
 ```
-### Unicode plane methods
+#### Unicode planes
-There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
+There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
 ```Ruby
+CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
+CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
+CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
+CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
 CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
 CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
 CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
 CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
+CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
 CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
 CharacterSet::Character.new('a').plane # => 0
 ```

data/Rakefile CHANGED Viewed

@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
 task default: :spec
+namespace :spec do
+  task :quick do
+    ENV['SKIP_MEMSAFETY_SPECS'] = '1'
+    Rake::Task[:spec].invoke
+  end
+end
 Rake::ExtensionTask.new('character_set') do |ext|
   ext.lib_dir = 'lib/character_set'
 end
@@ -106,27 +113,22 @@ task :sync_casefold_data do
     hash[from] = to if type == 'C'
   end.sort
-  File.open(dst_path, 'w') do |f|
-    f.puts <<-C
-// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
-// -*-C-*-
-typedef struct casefold_mapping {
-  unsigned long from;
-  unsigned long to;
-} casefold_mapping;
-#define CASEFOLD_COUNT #{mapping.size}
+  content = File.read(dst_path + '.tmpl')
+    .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
+    .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
-static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
-    C
-    mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
+  File.write(dst_path, content)
+  File.unlink(src_path)
+end
-    f.puts '};'
+desc 'Update codepoint data for predefined sets, based on Onigmo'
+task :sync_predefined_sets do
+  %w[assigned emoji whitespace].each do |prop|
+    require 'regexp_property_values'
+    ranges = RegexpPropertyValues[prop].matched_ranges
+    str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
+    File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
   end
-  File.unlink(src_path)
 end
 desc 'Run all IPS benchmarks'

data/benchmarks/count_in.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require_relative './shared'
+str = 'Lorem ipsum et dolorem'
+tr = '^A-Za-z'
+cs = CharacterSet.non_ascii_letter
+benchmark(
+  caption: 'Counting non-letters',
+  cases: {
+    'String#count'          => -> { str.count(tr) },
+    'CharacterSet#count_in' => -> { cs.count_in(str) },
+  }
+)

data/benchmarks/delete_in.rb CHANGED Viewed

@@ -14,7 +14,7 @@ benchmark(
 str = 'Lörem ipsüm ⛷ et dölörem'
 rx = /[\s\p{emoji}äüö]/
-cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
+cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
 benchmark(
   caption: 'Removing whitespace, emoji and umlauts',

data/benchmarks/scan.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require_relative './shared'
+str = 'Lorem ipsum ⛷ et dolorem'
+rx = /\p{emoji}/
+cs = CharacterSet.emoji
+benchmark(
+  caption: 'Extracting emoji to an Array',
+  cases: {
+    'String#scan'       => -> { str.scan(rx) },
+    'CharacterSet#scan' => -> { cs.scan(str) },
+  }
+)

data/benchmarks/shared.rb CHANGED Viewed

@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'benchmark/ips'
 require 'character_set'
+if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
+  require 'sorted_set'
+else
+  require 'set'
+end
 def benchmark(caption: nil, cases: {})
   puts caption

data/benchmarks/z_add.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require_relative './shared'
+cs = CharacterSet[]
+ss = SortedSet[]
+benchmark(
+  caption: 'Adding entries',
+  cases: {
+    'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
+    'SortedSet#add'    => -> { ss.add(rand(0x10FFFF)) },
+  }
+)

data/benchmarks/z_delete.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require_relative './shared'
+cs = CharacterSet.new(0..0x10FFFF)
+ss = SortedSet.new(0..0x10FFFF)
+benchmark(
+  caption: 'Removing entries',
+  cases: {
+    'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
+    'SortedSet#delete'    => -> { ss.delete(rand(0x10FFFF)) },
+  }
+)

data/benchmarks/z_merge.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require_relative './shared'
+cs1 = CharacterSet.new(0...0x88000)
+cs2 = CharacterSet.new(0x88000..0x10FFFF)
+ss1 = SortedSet.new(0...0x88000)
+ss2 = SortedSet.new(0x88000..0x10FFFF)
+benchmark(
+  caption: 'Merging entries',
+  cases: {
+    'CharacterSet#merge' => -> { cs1.merge(cs2) },
+    'SortedSet#merge'    => -> { ss1.merge(ss2) },
+  }
+)

data/benchmarks/z_minmax.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require_relative './shared'
+cs = CharacterSet.new(0..0xFFFF)
+ss = SortedSet.new(0..0xFFFF)
+benchmark(
+  caption: 'Getting the min and max',
+  cases: {
+    'CharacterSet#minmax' => -> { cs.minmax },
+    'SortedSet#minmax'    => -> { ss.minmax },
+  }
+)

data/bin/console CHANGED Viewed

@@ -2,6 +2,8 @@
 require 'bundler/setup'
+`bundle exec rake compile`
 require 'character_set'
 require 'character_set/core_ext'
 require 'character_set/pure'

data/character_set.gemspec CHANGED Viewed

@@ -22,11 +22,24 @@ Gem::Specification.new do |s|
   s.required_ruby_version = '>= 2.1.0'
+  # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
+  # This dependency is only used if the C extension is unavailable.
+  # JRuby has it in the stdlib.
+  if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
+    s.add_dependency 'sorted_set', '~> 1.0'
+  end
   s.add_development_dependency 'benchmark-ips', '~> 2.7'
-  s.add_development_dependency 'rake', '~> 12.0'
-  s.add_development_dependency 'rake-compiler', '~> 1.0'
+  s.add_development_dependency 'get_process_mem', '~> 0.2.3'
+  s.add_development_dependency 'rake', '~> 13.0'
+  s.add_development_dependency 'rake-compiler', '~> 1.1'
   s.add_development_dependency 'range_compressor', '~> 1.0'
-  s.add_development_dependency 'regexp_parser', '~> 1.3'
-  s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
+  s.add_development_dependency 'regexp_parser', '~> 2.1'
+  s.add_development_dependency 'regexp_property_values', '~> 1.0'
   s.add_development_dependency 'rspec', '~> 3.8'
+  if RUBY_VERSION.to_f >= 2.7
+    s.add_development_dependency 'codecov', '~> 0.2.12'
+    s.add_development_dependency 'gouteur', '~> 1.0.0'
+    s.add_development_dependency 'rubocop', '~> 1.8'
+  end
 end