RubyGems - character_set - Versions diffs - 1.5.0-java → 1.7.0-java - Mend

character_set 1.5.0-java → 1.7.0-java

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.

Files changed (38)

checksums.yaml +4 -4
data/.gitattributes +1 -1
data/.github/workflows/tests.yml +6 -2
data/BENCHMARK.md +35 -31
data/CHANGELOG.md +30 -1
data/Gemfile +14 -0
data/README.md +9 -6
data/Rakefile +2 -120
data/character_set.gemspec +0 -21
data/ext/character_set/character_set.c +110 -125
data/lib/character_set/core_ext/string_ext.rb +1 -1
data/lib/character_set/parser.rb +8 -4
data/lib/character_set/predefined_sets/assigned.cps +73 -52
data/lib/character_set/predefined_sets/emoji.cps +10 -9
data/lib/character_set/predefined_sets.rb +11 -0
data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -20
data/lib/character_set/ruby_fallback/set_methods.rb +4 -18
data/lib/character_set/ruby_fallback/vendored_set_classes.rb +492 -0
data/lib/character_set/ruby_fallback.rb +2 -6
data/lib/character_set/shared_methods.rb +8 -2
data/lib/character_set/version.rb +1 -1
data/tasks/benchmark.rake +20 -0
data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
data/tasks/benchmarks/shared.rb +28 -0
data/tasks/sync_casefold_data.rake +20 -0
data/tasks/sync_predefined_sets.rake +9 -0
data/tasks/sync_ruby_spec.rake +65 -0
metadata +19 -182
data/benchmarks/shared.rb +0 -30
/data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
/data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 52823f4f35cdec44378c3828b4b38eba1f9f5bce402a70962eae1fb786132d8c
-  data.tar.gz: b1d6419575a3614675c194cbfde8530be02195cc73365a8c6ca446dd6ea909e6
+  metadata.gz: 36050dd00f44b6efc26567bfd867ff21535fe1e35c9a8018d00f2145b27bfd37
+  data.tar.gz: d88d01cae2f5650271d73c654877b6cf62cf87acba9b8e699677b569c514b0e9
 SHA512:
-  metadata.gz: c021975f912100174a5274454cfc6099a0955262e7e5fac619989a0a2aa5d624e048fe8b5f68b167157aca425c771df1bf137be12924b609b8d30dff1608142e
-  data.tar.gz: 1b702ea538bc5a5209c3544c88c9b38d328080db52640bb4a5780454d296970d8c2557ebe9c3cdd014e3a20af254c77fe694a9f56f09f3f29c039aef81dc381f
+  metadata.gz: 646450cc07172ffdbceaf6cf215c03a60487ced2fdf578c4467f08374b77f6a8d4e043cbb92c89ea2ebc39c5b5adf38f8d74502632e033dbc8982928d6002f99
+  data.tar.gz: 1d77ccb0abef9c591189a77ed862657a04020f6bf9b3f31b7760ced20cdbee962411e29375b7d3883e95d8c97cac992afc89b17dbdaafbe99f0af02cfa22a0e1

data/.gitattributes CHANGED Viewed

@@ -1,3 +1,3 @@
 *.cps linguist-detectable=false
 benchmarks/* linguist-detectable=false
-spec/ruby-spec/* linguist-vendored
+spec/* linguist-detectable=false

data/.github/workflows/tests.yml CHANGED Viewed

@@ -1,6 +1,10 @@
 name: tests
-on: [push, pull_request]
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
 jobs:
   build:
@@ -8,7 +12,7 @@ jobs:
     strategy:
       matrix:
-        ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
+        ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
     steps:
       - uses: actions/checkout@v2

data/BENCHMARK.md CHANGED Viewed

@@ -1,86 +1,90 @@
-Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
+Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
 ```
 Counting non-letters
-CharacterSet#count_in:  9472902.2 i/s
-        String#count:  2221799.9 i/s - 4.26x slower
+CharacterSet#count_in: 14627506.2 i/s
+        String#count:  3859777.0 i/s - 3.79x slower
 ```
 ```
 Detecting non-whitespace
- CharacterSet#cover?: 12388427.2 i/s
-       Regexp#match?:  7901676.8 i/s - 1.57x slower
+ CharacterSet#cover?: 17241902.8 i/s
+       Regexp#match?: 12971122.6 i/s - 1.33x slower
 ```
 ```
 Detecting non-letters
- CharacterSet#cover?: 12263689.1 i/s
-       Regexp#match?:  4940889.9 i/s - 2.48x slower
+ CharacterSet#cover?: 17243472.3 i/s
+       Regexp#match?:  7957626.9 i/s - 2.17x slower
 ```
 ```
-Removing whitespace
+Removing ASCII whitespace
-CharacterSet#delete_in:  2406722.6 i/s
-         String#gsub:   235760.3 i/s - 10.21x slower
+CharacterSet#delete_in:  6190975.7 i/s
+           String#tr:  4722716.6 i/s - 1.31x slower
+         String#gsub:   214239.5 i/s - 28.90x slower
 ```
 ```
 Removing whitespace, emoji and umlauts
-CharacterSet#delete_in:  1653607.6 i/s
-         String#gsub:   272782.9 i/s - 6.06x slower
+CharacterSet#delete_in:  5890471.8 i/s
+           String#tr:   348506.8 i/s - 16.90x slower
+         String#gsub:   318268.3 i/s - 18.51x slower
 ```
 ```
 Removing non-whitespace
-CharacterSet#keep_in:  2671038.2 i/s
-         String#gsub:   242551.0 i/s - 11.01x slower
+CharacterSet#keep_in:  7396898.0 i/s
+         String#gsub:   208809.7 i/s - 35.42x slower
+           String#tr:       13.1 i/s - 564682.50x slower
 ```
 ```
-Extracting emoji
+Keeping only emoji
-CharacterSet#keep_in:  1726496.5 i/s
-         String#gsub:   215609.2 i/s - 8.01x slower
+CharacterSet#keep_in:  7022741.1 i/s
+         String#gsub:   180939.6 i/s - 38.81x slower
+           String#tr:       13.1 i/s - 536724.50x slower
 ```
 ```
 Extracting emoji to an Array
-   CharacterSet#scan:  2373856.1 i/s
-         String#scan:   480000.5 i/s - 4.95x slower
+   CharacterSet#scan:  3023176.8 i/s
+         String#scan:   893225.8 i/s - 3.38x slower
 ```
 ```
 Detecting whitespace
-CharacterSet#used_by?: 11988328.7 i/s
-       Regexp#match?:  6758146.8 i/s - 1.77x slower
+CharacterSet#used_by?: 17284025.9 i/s
+       Regexp#match?: 11847064.5 i/s - 1.46x slower
 ```
 ```
 Detecting emoji in a large string
-CharacterSet#used_by?:   288223.3 i/s
-       Regexp#match?:   102384.2 i/s - 2.82x slower
+CharacterSet#used_by?:   341386.1 i/s
+       Regexp#match?:   183121.6 i/s - 1.86x slower
 ```
 ```
 Adding entries
-    CharacterSet#add:  2538251.2 i/s
-       SortedSet#add:   443925.9 i/s - 5.72x slower
+    CharacterSet#add:  4989762.3 i/s
+       SortedSet#add:  1157911.7 i/s - 4.31x slower
 ```
 ```
 Removing entries
- CharacterSet#delete:  2487620.8 i/s
-    SortedSet#delete:   628816.1 i/s - 3.96x slower
+ CharacterSet#delete:  4996703.6 i/s
+    SortedSet#delete:  4177401.5 i/s - same-ish
 ```
 ```
 Merging entries
-  CharacterSet#merge:      551.6 i/s
-     SortedSet#merge:        1.4 i/s - 393.59x slower
+  CharacterSet#merge:      666.7 i/s
+     SortedSet#merge:        4.0 i/s - 167.84x slower
 ```
 ```
 Getting the min and max
- CharacterSet#minmax:   636890.7 i/s
-    SortedSet#minmax:      254.1 i/s - 2506.20x slower
+ CharacterSet#minmax:  1596470.9 i/s
+    SortedSet#minmax:      866.4 i/s - 1842.74x slower
 ```

data/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,35 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [1.7.0] - 2023-05-12
+### Added
+- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
+### Fixed
+- fixed processing of Strings that are not ASCII- or UTF8-encoded
+- removed dependency on `set` and `sorted_set`
+  - thanks to https://github.com/mikebaldry for reporting a related issue (#2)
+## [1.6.0] - 2022-02-16
+### Added
+- `::of` now supports both `String` and `Regexp` arguments
+### Fixed
+- fixed segfault during `String` manipulation on Ruby 3.2.0-dev
+- improved performance for `String` manipulation
+- allow usage in Ractors
+  - predefined sets must be pre-initialized for this, though
+  - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
+  - call them once in the main Ractor to trigger initialization
 ## [1.5.0] - 2021-12-05
 ### Added
@@ -48,7 +77,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - reduced memory consumption by > 90% for most use cases via dynamic resizing
   - before, every set instance required 136 KB for codepoints
   - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
-- `#count_in` and `#scan_in` methods for `String` interaction
+- `#count_in` and `#scan` methods for `String` interaction
 - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
 - conversion methods `#assigned_part`, `#valid_part`
 - sectioning methods `#ascii_part`, `#plane(n)`

data/Gemfile CHANGED Viewed

@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
 # Specify your gem's dependencies in character_set.gemspec
 gemspec
+gem 'benchmark-ips', '~> 2.7'
+gem 'get_process_mem', '~> 0.2.3'
+gem 'rake', '~> 13.0'
+gem 'rake-compiler', '~> 1.1'
+gem 'range_compressor', '~> 1.0'
+gem 'regexp_parser', '~> 2.1'
+gem 'regexp_property_values', '~> 1.0'
+gem 'rspec', '~> 3.8'
+if RUBY_VERSION.to_f >= 2.7
+  gem 'codecov', '~> 0.2.12'
+  gem 'gouteur', '~> 1.0.0'
+  gem 'rubocop', '~> 1.8'
+end

data/README.md CHANGED Viewed

@@ -5,16 +5,17 @@
 [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
 [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
-This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
+This is a C-extended Ruby gem to work with sets of Unicode codepoints.
-It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
+It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
+It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
 Many parts can be used independently, e.g.:
 - `CharacterSet::Character`
 - `CharacterSet::ExpressionConverter`
 - `CharacterSet::Parser`
 - `CharacterSet::Writer`
-- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
 ## Usage
@@ -42,9 +43,10 @@ CharacterSet.parse('[a-c]')
 CharacterSet.parse('\U00000061-\U00000063')
 ```
-If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
+If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
 ```ruby
+CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
 CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
 require 'character_set/core_ext/regexp_ext'
@@ -94,7 +96,7 @@ string # => ''
 ```ruby
 CharacterSet.non_ascii.count_in('Tüür') # => 2
-CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
+CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
 ```
 There is also a core extension for String interaction.
@@ -145,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
 ```
 ### Write
 ```ruby
 set = CharacterSet['a', 'b', 'c', 'j', '-']
@@ -211,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
 CharacterSet::Character.new('a').plane # => 0
 ```
-### Contributions
+## Contributions
 Feel free to send suggestions, point out issues, or submit pull requests.

data/Rakefile CHANGED Viewed

@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
 require 'rubygems/package_task'
 require 'rake/extensiontask'
+Dir['tasks/**/*.rake'].each { |file| load(file) }
 RSpec::Core::RakeTask.new(:spec)
 task default: :spec
@@ -34,126 +36,6 @@ end
 task package: 'java:gem'
-desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
-task :sync_ruby_spec do
-  require 'fileutils'
-  variants = {
-    'CharacterSet'       => './spec/ruby-spec/library/character_set',
-    'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
-  }
-  # download fresh specs from ruby/spec repository
-  variants.each do |_, dir|
-    FileUtils.rm_rf(dir) if File.exist?(dir)
-    `svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
-  end
-  # make copies for each CharacterSet variant
-  base = variants.first[1]
-  variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
-  # adapt specs to work with CharacterSet
-  variants.each do |class_name, dir|
-    Dir["#{dir}/**/*.rb"].each do |spec|
-      # ignore some tests that do not apply or are covered otherwise
-      if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
-        File.delete(spec)
-        next
-      end
-      adapted_content =
-        File.read(spec).
-        # adapt class name
-        gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
-        gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
-        # get shared specs from a single shared dir at the parent level
-        gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
-        # make 'mspec' syntax rspec-compatible
-        gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
-        gsub(/be_(false|true)/, 'be \1').
-        gsub('stub!', 'stub').
-        gsub('mock', 'double').
-        gsub('@method', 'method').
-        # remove unneeded requires
-        gsub(/require 'set'\n/, '').
-        gsub(/require.*spec_helper.*\n/, '').
-        gsub(/\A\n+/, '').
-        # make examples use Integers/codepoints
-        gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
-        gsub('"one"', '1').
-        gsub('"two"', '2').
-        gsub('"three"', '3').
-        gsub('"four"', '4').
-        gsub('"five"', '5').
-        gsub(/x.(size|length) == 3/, 'x != 3').
-        gsub(/x.(size|length) != 3/, 'x == 3').
-        gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
-      File.open(spec, 'w') { |f| f.puts adapted_content }
-    end
-  end
-  # keep only one copy of the shared specs, at the parent level
-  FileUtils.rm_rf(base + '/../shared')
-  FileUtils.mv(base + '/shared', base + '/../')
-  variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
-end
-desc 'Download unicode casefold data and write new C header file'
-task :sync_casefold_data do
-  src_path = './CaseFolding.txt'
-  dst_path = './ext/character_set/unicode_casefold_table.h'
-  `wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
-  mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
-    from, type, to = line.split(/\s*;\s*/).first(3)
-    # type 'C' stands for 'common', excludes mappings to multiple chars
-    hash[from] = to if type == 'C'
-  end.sort
-  content = File.read(dst_path + '.tmpl')
-    .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
-    .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
-  File.write(dst_path, content)
-  File.unlink(src_path)
-end
-desc 'Update codepoint data for predefined sets, based on Onigmo'
-task :sync_predefined_sets do
-  %w[assigned emoji whitespace].each do |prop|
-    require 'regexp_property_values'
-    ranges = RegexpPropertyValues[prop].matched_ranges
-    str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
-    File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
-  end
-end
-desc 'Run all IPS benchmarks'
-task :benchmark do
-  Dir['./benchmarks/*.rb'].sort.each { |file| require file }
-end
-namespace :benchmark do
-  desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
-  task :write_to_file do
-    $store_comparison_results = {}
-    Rake.application[:benchmark].invoke
-    File.open('BENCHMARK.md', 'w') do |f|
-      f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
-      $store_comparison_results.each do |caption, result|
-        f.puts '```', caption, '',
-               result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
-      end
-    end
-  end
-end
 unless RUBY_PLATFORM =~ /java/
   # recompile before benchmarking or running specs
   task(:benchmark).enhance([:compile])

data/character_set.gemspec CHANGED Viewed

@@ -21,25 +21,4 @@ Gem::Specification.new do |s|
   s.extensions  = %w[ext/character_set/extconf.rb]
   s.required_ruby_version = '>= 2.1.0'
-  # SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
-  # This dependency is only used if the C extension is unavailable.
-  # JRuby has it in the stdlib.
-  if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
-    s.add_dependency 'sorted_set', '~> 1.0'
-  end
-  s.add_development_dependency 'benchmark-ips', '~> 2.7'
-  s.add_development_dependency 'get_process_mem', '~> 0.2.3'
-  s.add_development_dependency 'rake', '~> 13.0'
-  s.add_development_dependency 'rake-compiler', '~> 1.1'
-  s.add_development_dependency 'range_compressor', '~> 1.0'
-  s.add_development_dependency 'regexp_parser', '~> 2.1'
-  s.add_development_dependency 'regexp_property_values', '~> 1.0'
-  s.add_development_dependency 'rspec', '~> 3.8'
-  if RUBY_VERSION.to_f >= 2.7
-    s.add_development_dependency 'codecov', '~> 0.2.12'
-    s.add_development_dependency 'gouteur', '~> 1.0.0'
-    s.add_development_dependency 'rubocop', '~> 1.8'
-  end
 end