character_set 1.4.0 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +28 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +20 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +64 -1
- data/Gemfile +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -9
- data/Rakefile +2 -120
- data/character_set.gemspec +0 -10
- data/ext/character_set/character_set.c +123 -121
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/core_ext/regexp_ext.rb +9 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +40 -56
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +110 -78
- data/lib/character_set/predefined_sets/emoji.cps +16 -14
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
- data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -2
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +25 -11
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
- data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +29 -146
- data/.travis.yml +0 -9
- data/benchmarks/shared.rb +0 -26
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
|
4
|
+
data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
|
7
|
+
data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
|
data/.gitattributes
CHANGED
data/.github/workflows/gouteur.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
name: gouteur
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
|
9
|
+
steps:
|
10
|
+
- uses: actions/checkout@v2
|
11
|
+
- name: Set up Ruby
|
12
|
+
uses: ruby/setup-ruby@v1
|
13
|
+
with:
|
14
|
+
ruby-version: 3.3
|
15
|
+
- name: Prepare
|
16
|
+
run: |
|
17
|
+
bundle install --jobs 4
|
18
|
+
bundle exec rake compile
|
19
|
+
- name: Test
|
20
|
+
run: bundle exec gouteur
|
data/.github/workflows/lint.yml
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
|
2
|
+
|
3
|
+
name: rubocop linting
|
4
|
+
|
5
|
+
on: [push, pull_request]
|
6
|
+
|
7
|
+
jobs:
|
8
|
+
build:
|
9
|
+
runs-on: ubuntu-latest
|
10
|
+
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v2
|
13
|
+
- name: Set up Ruby
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: 3.3
|
17
|
+
- name: Cache gems
|
18
|
+
uses: actions/cache@v1
|
19
|
+
with:
|
20
|
+
path: vendor/bundle
|
21
|
+
key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
|
22
|
+
restore-keys: |
|
23
|
+
${{ runner.os }}-rubocop-
|
24
|
+
- name: Install gems
|
25
|
+
run: |
|
26
|
+
bundle config path vendor/bundle
|
27
|
+
bundle install --jobs 4 --retry 3
|
28
|
+
- name: Run rubocop
|
29
|
+
run: bundle exec rubocop --lint
|
data/.github/workflows/tests.yml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
name: tests
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
pull_request:
|
6
|
+
schedule:
|
7
|
+
- cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
build:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
|
13
|
+
strategy:
|
14
|
+
matrix:
|
15
|
+
ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
|
16
|
+
|
17
|
+
steps:
|
18
|
+
- uses: actions/checkout@v2
|
19
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
20
|
+
uses: ruby/setup-ruby@v1
|
21
|
+
with:
|
22
|
+
ruby-version: ${{ matrix.ruby }}
|
23
|
+
- name: Install dependencies
|
24
|
+
run: bundle install --jobs 4
|
25
|
+
- name: Test with Rake
|
26
|
+
run: bundle exec rake
|
27
|
+
- uses: codecov/codecov-action@v3
|
28
|
+
if: matrix.ruby == '3.2'
|
data/.gitignore
CHANGED
data/.gouteur.yml
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
AllCops:
|
2
|
+
Exclude:
|
3
|
+
- '**/doc/*'
|
4
|
+
- '**/pkg/*'
|
5
|
+
- '**/spec/ruby-spec/**/*'
|
6
|
+
- '**/vendor/**/*' # vendored dependencies
|
7
|
+
NewCops: enable
|
8
|
+
RubyInterpreters:
|
9
|
+
- ruby
|
10
|
+
- rake
|
11
|
+
TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
|
12
|
+
|
13
|
+
Lint/AmbiguousOperatorPrecedence:
|
14
|
+
Enabled: false
|
15
|
+
|
16
|
+
Lint/AmbiguousRegexpLiteral:
|
17
|
+
Enabled: false
|
18
|
+
|
19
|
+
Metrics:
|
20
|
+
Enabled: false
|
data/BENCHMARK.md
CHANGED
@@ -1,86 +1,90 @@
|
|
1
|
-
Results of `rake:benchmark` on ruby 2.
|
1
|
+
Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
|
2
2
|
|
3
3
|
```
|
4
4
|
Counting non-letters
|
5
5
|
|
6
|
-
CharacterSet#count_in:
|
7
|
-
String#count:
|
6
|
+
CharacterSet#count_in: 14627506.2 i/s
|
7
|
+
String#count: 3859777.0 i/s - 3.79x slower
|
8
8
|
```
|
9
9
|
```
|
10
10
|
Detecting non-whitespace
|
11
11
|
|
12
|
-
CharacterSet#cover?:
|
13
|
-
Regexp#match?:
|
12
|
+
CharacterSet#cover?: 17241902.8 i/s
|
13
|
+
Regexp#match?: 12971122.6 i/s - 1.33x slower
|
14
14
|
```
|
15
15
|
```
|
16
16
|
Detecting non-letters
|
17
17
|
|
18
|
-
CharacterSet#cover?:
|
19
|
-
Regexp#match?:
|
18
|
+
CharacterSet#cover?: 17243472.3 i/s
|
19
|
+
Regexp#match?: 7957626.9 i/s - 2.17x slower
|
20
20
|
```
|
21
21
|
```
|
22
|
-
Removing whitespace
|
22
|
+
Removing ASCII whitespace
|
23
23
|
|
24
|
-
CharacterSet#delete_in:
|
25
|
-
|
24
|
+
CharacterSet#delete_in: 6190975.7 i/s
|
25
|
+
String#tr: 4722716.6 i/s - 1.31x slower
|
26
|
+
String#gsub: 214239.5 i/s - 28.90x slower
|
26
27
|
```
|
27
28
|
```
|
28
29
|
Removing whitespace, emoji and umlauts
|
29
30
|
|
30
|
-
CharacterSet#delete_in:
|
31
|
-
|
31
|
+
CharacterSet#delete_in: 5890471.8 i/s
|
32
|
+
String#tr: 348506.8 i/s - 16.90x slower
|
33
|
+
String#gsub: 318268.3 i/s - 18.51x slower
|
32
34
|
```
|
33
35
|
```
|
34
36
|
Removing non-whitespace
|
35
37
|
|
36
|
-
CharacterSet#keep_in:
|
37
|
-
String#gsub:
|
38
|
+
CharacterSet#keep_in: 7396898.0 i/s
|
39
|
+
String#gsub: 208809.7 i/s - 35.42x slower
|
40
|
+
String#tr: 13.1 i/s - 564682.50x slower
|
38
41
|
```
|
39
42
|
```
|
40
|
-
|
43
|
+
Keeping only emoji
|
41
44
|
|
42
|
-
CharacterSet#keep_in:
|
43
|
-
String#gsub:
|
45
|
+
CharacterSet#keep_in: 7022741.1 i/s
|
46
|
+
String#gsub: 180939.6 i/s - 38.81x slower
|
47
|
+
String#tr: 13.1 i/s - 536724.50x slower
|
44
48
|
```
|
45
49
|
```
|
46
50
|
Extracting emoji to an Array
|
47
51
|
|
48
|
-
CharacterSet#scan:
|
49
|
-
String#scan:
|
52
|
+
CharacterSet#scan: 3023176.8 i/s
|
53
|
+
String#scan: 893225.8 i/s - 3.38x slower
|
50
54
|
```
|
51
55
|
```
|
52
56
|
Detecting whitespace
|
53
57
|
|
54
|
-
CharacterSet#used_by?:
|
55
|
-
Regexp#match?:
|
58
|
+
CharacterSet#used_by?: 17284025.9 i/s
|
59
|
+
Regexp#match?: 11847064.5 i/s - 1.46x slower
|
56
60
|
```
|
57
61
|
```
|
58
62
|
Detecting emoji in a large string
|
59
63
|
|
60
|
-
CharacterSet#used_by?:
|
61
|
-
Regexp#match?:
|
64
|
+
CharacterSet#used_by?: 341386.1 i/s
|
65
|
+
Regexp#match?: 183121.6 i/s - 1.86x slower
|
62
66
|
```
|
63
67
|
```
|
64
68
|
Adding entries
|
65
69
|
|
66
|
-
CharacterSet#add:
|
67
|
-
SortedSet#add:
|
70
|
+
CharacterSet#add: 4989762.3 i/s
|
71
|
+
SortedSet#add: 1157911.7 i/s - 4.31x slower
|
68
72
|
```
|
69
73
|
```
|
70
74
|
Removing entries
|
71
75
|
|
72
|
-
CharacterSet#delete:
|
73
|
-
SortedSet#delete:
|
76
|
+
CharacterSet#delete: 4996703.6 i/s
|
77
|
+
SortedSet#delete: 4177401.5 i/s - same-ish
|
74
78
|
```
|
75
79
|
```
|
76
80
|
Merging entries
|
77
81
|
|
78
|
-
CharacterSet#merge:
|
79
|
-
SortedSet#merge:
|
82
|
+
CharacterSet#merge: 666.7 i/s
|
83
|
+
SortedSet#merge: 4.0 i/s - 167.84x slower
|
80
84
|
```
|
81
85
|
```
|
82
86
|
Getting the min and max
|
83
87
|
|
84
|
-
CharacterSet#minmax:
|
85
|
-
SortedSet#minmax:
|
88
|
+
CharacterSet#minmax: 1596470.9 i/s
|
89
|
+
SortedSet#minmax: 866.4 i/s - 1842.74x slower
|
86
90
|
```
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,69 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [1.8.0] - 2024-01-07
|
10
|
+
|
11
|
+
### Added
|
12
|
+
|
13
|
+
- support for `#<=>` and `#join`, which were added to `set` in the meantime
|
14
|
+
- support for getting the (overall) character set of a Regexp with multiple expressions
|
15
|
+
- support for global and local case-insensitivity in Regexp inputs
|
16
|
+
- `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
|
17
|
+
|
18
|
+
## [1.7.0] - 2023-05-12
|
19
|
+
|
20
|
+
### Added
|
21
|
+
|
22
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
|
23
|
+
|
24
|
+
### Fixed
|
25
|
+
|
26
|
+
- fixed processing of Strings that are not ASCII- or UTF8-encoded
|
27
|
+
- removed dependency on `set` and `sorted_set`
|
28
|
+
- thanks to https://github.com/mikebaldry for reporting a related issue (#2)
|
29
|
+
|
30
|
+
## [1.6.0] - 2022-02-16
|
31
|
+
|
32
|
+
### Added
|
33
|
+
|
34
|
+
- `::of` now supports both `String` and `Regexp` arguments
|
35
|
+
|
36
|
+
### Fixed
|
37
|
+
|
38
|
+
- fixed segfault during `String` manipulation on Ruby 3.2.0-dev
|
39
|
+
- improved performance for `String` manipulation
|
40
|
+
- allow usage in Ractors
|
41
|
+
- predefined sets must be pre-initialized for this, though
|
42
|
+
- e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
|
43
|
+
- call them once in the main Ractor to trigger initialization
|
44
|
+
|
45
|
+
## [1.5.0] - 2021-12-05
|
46
|
+
|
47
|
+
### Added
|
48
|
+
|
49
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
|
50
|
+
- latest unicode case-folding data (for `#case_insensitive`)
|
51
|
+
- support for passing any Enumerable to `#disjoint?`, `#intersect?`
|
52
|
+
- this matches recent broadening of these methods in `ruby/set`
|
53
|
+
- new instance method `#secure_token` (see README)
|
54
|
+
- class method `::of` now accepts more than one `String`
|
55
|
+
- `CharacterSet::ExpressionConverter` can now build output of any Set-like class
|
56
|
+
|
57
|
+
### Fixed
|
58
|
+
|
59
|
+
- `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
|
60
|
+
- it used to return a regular `CharacterSet`
|
61
|
+
|
62
|
+
## [1.4.1] - 2021-01-10
|
63
|
+
|
64
|
+
### Fixed
|
65
|
+
- multiple fixes for Ruby 3
|
66
|
+
- fixed segfault for some `String` manipulation cases
|
67
|
+
- added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
|
68
|
+
- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
|
69
|
+
|
7
70
|
## [1.4.0] - 2019-06-07
|
8
71
|
|
9
72
|
### Added
|
@@ -23,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
23
86
|
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
24
87
|
- before, every set instance required 136 KB for codepoints
|
25
88
|
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
26
|
-
- `#count_in` and `#
|
89
|
+
- `#count_in` and `#scan` methods for `String` interaction
|
27
90
|
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
28
91
|
- conversion methods `#assigned_part`, `#valid_part`
|
29
92
|
- sectioning methods `#ascii_part`, `#plane(n)`
|
data/Gemfile
CHANGED
@@ -4,3 +4,18 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in character_set.gemspec
|
6
6
|
gemspec
|
7
|
+
|
8
|
+
gem 'benchmark-ips', '~> 2.7'
|
9
|
+
gem 'get_process_mem', '~> 0.2.3'
|
10
|
+
gem 'rake', '~> 13.1'
|
11
|
+
gem 'rake-compiler', '~> 1.1'
|
12
|
+
gem 'range_compressor', '~> 1.0'
|
13
|
+
gem 'regexp_parser', '~> 2.9'
|
14
|
+
gem 'regexp_property_values', '~> 1.5'
|
15
|
+
gem 'rspec', '~> 3.8'
|
16
|
+
gem 'warning', '~> 1.3'
|
17
|
+
if RUBY_VERSION.to_f >= 3.0
|
18
|
+
gem 'gouteur', '~> 1.0.0'
|
19
|
+
gem 'rubocop', '~> 1.59'
|
20
|
+
gem 'simplecov-cobertura', require: false
|
21
|
+
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,18 +1,21 @@
|
|
1
1
|
# CharacterSet
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
|
4
|
-
[![Build Status](https://
|
5
|
-
[![
|
4
|
+
[![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
|
5
|
+
[![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
|
6
|
+
[![Coverage](https://codecov.io/gh/jaynetics/character_set/branch/main/graph/badge.svg?token=oY7gcWNbIN)](https://codecov.io/gh/jaynetics/character_set)
|
6
7
|
|
7
|
-
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
8
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
8
9
|
|
9
|
-
It
|
10
|
+
It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
|
11
|
+
|
12
|
+
It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
|
10
13
|
|
11
14
|
Many parts can be used independently, e.g.:
|
12
15
|
- `CharacterSet::Character`
|
16
|
+
- `CharacterSet::ExpressionConverter`
|
13
17
|
- `CharacterSet::Parser`
|
14
18
|
- `CharacterSet::Writer`
|
15
|
-
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
16
19
|
|
17
20
|
## Usage
|
18
21
|
|
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
|
|
40
43
|
CharacterSet.parse('\U00000061-\U00000063')
|
41
44
|
```
|
42
45
|
|
43
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed,
|
46
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
|
44
47
|
|
45
48
|
```ruby
|
49
|
+
CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
|
46
50
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
47
51
|
|
48
52
|
require 'character_set/core_ext/regexp_ext'
|
@@ -92,7 +96,7 @@ string # => ''
|
|
92
96
|
|
93
97
|
```ruby
|
94
98
|
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
95
|
-
CharacterSet.non_ascii.
|
99
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
96
100
|
```
|
97
101
|
|
98
102
|
There is also a core extension for String interaction.
|
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
143
147
|
```
|
144
148
|
|
145
149
|
### Write
|
150
|
+
|
146
151
|
```ruby
|
147
152
|
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
148
153
|
|
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
|
|
181
186
|
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
182
187
|
```
|
183
188
|
|
184
|
-
###
|
189
|
+
### Other features
|
190
|
+
|
191
|
+
#### Secure tokens
|
192
|
+
|
193
|
+
Generate secure random strings of characters from a set:
|
194
|
+
|
195
|
+
```ruby
|
196
|
+
CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
|
197
|
+
CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
|
198
|
+
```
|
199
|
+
|
200
|
+
#### Unicode planes
|
185
201
|
|
186
202
|
There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
187
203
|
```Ruby
|
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
198
214
|
CharacterSet::Character.new('a').plane # => 0
|
199
215
|
```
|
200
216
|
|
201
|
-
|
217
|
+
## Contributions
|
202
218
|
|
203
219
|
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
CHANGED
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
|
|
3
3
|
require 'rubygems/package_task'
|
4
4
|
require 'rake/extensiontask'
|
5
5
|
|
6
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
7
|
+
|
6
8
|
RSpec::Core::RakeTask.new(:spec)
|
7
9
|
|
8
10
|
task default: :spec
|
@@ -34,126 +36,6 @@ end
|
|
34
36
|
|
35
37
|
task package: 'java:gem'
|
36
38
|
|
37
|
-
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
38
|
-
task :sync_ruby_spec do
|
39
|
-
require 'fileutils'
|
40
|
-
|
41
|
-
variants = {
|
42
|
-
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
43
|
-
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
44
|
-
}
|
45
|
-
|
46
|
-
# download fresh specs from ruby/spec repository
|
47
|
-
variants.each do |_, dir|
|
48
|
-
FileUtils.rm_rf(dir) if File.exist?(dir)
|
49
|
-
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
50
|
-
end
|
51
|
-
|
52
|
-
# make copies for each CharacterSet variant
|
53
|
-
base = variants.first[1]
|
54
|
-
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
55
|
-
|
56
|
-
# adapt specs to work with CharacterSet
|
57
|
-
variants.each do |class_name, dir|
|
58
|
-
Dir["#{dir}/**/*.rb"].each do |spec|
|
59
|
-
# ignore some tests that do not apply or are covered otherwise
|
60
|
-
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
61
|
-
File.delete(spec)
|
62
|
-
next
|
63
|
-
end
|
64
|
-
|
65
|
-
adapted_content =
|
66
|
-
File.read(spec).
|
67
|
-
# adapt class name
|
68
|
-
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
69
|
-
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
70
|
-
# get shared specs from a single shared dir at the parent level
|
71
|
-
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
72
|
-
# make 'mspec' syntax rspec-compatible
|
73
|
-
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
74
|
-
gsub(/be_(false|true)/, 'be \1').
|
75
|
-
gsub('stub!', 'stub').
|
76
|
-
gsub('mock', 'double').
|
77
|
-
gsub('@method', 'method').
|
78
|
-
# remove unneeded requires
|
79
|
-
gsub(/require 'set'\n/, '').
|
80
|
-
gsub(/require.*spec_helper.*\n/, '').
|
81
|
-
gsub(/\A\n+/, '').
|
82
|
-
# make examples use Integers/codepoints
|
83
|
-
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
84
|
-
gsub('"one"', '1').
|
85
|
-
gsub('"two"', '2').
|
86
|
-
gsub('"three"', '3').
|
87
|
-
gsub('"four"', '4').
|
88
|
-
gsub('"five"', '5').
|
89
|
-
gsub(/x.(size|length) == 3/, 'x != 3').
|
90
|
-
gsub(/x.(size|length) != 3/, 'x == 3').
|
91
|
-
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
92
|
-
|
93
|
-
File.open(spec, 'w') { |f| f.puts adapted_content }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# keep only one copy of the shared specs, at the parent level
|
98
|
-
FileUtils.rm_rf(base + '/../shared')
|
99
|
-
FileUtils.mv(base + '/shared', base + '/../')
|
100
|
-
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
101
|
-
end
|
102
|
-
|
103
|
-
desc 'Download unicode casefold data and write new C header file'
|
104
|
-
task :sync_casefold_data do
|
105
|
-
src_path = './CaseFolding.txt'
|
106
|
-
dst_path = './ext/character_set/unicode_casefold_table.h'
|
107
|
-
|
108
|
-
`wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
|
109
|
-
|
110
|
-
mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
|
111
|
-
from, type, to = line.split(/\s*;\s*/).first(3)
|
112
|
-
# type 'C' stands for 'common', excludes mappings to multiple chars
|
113
|
-
hash[from] = to if type == 'C'
|
114
|
-
end.sort
|
115
|
-
|
116
|
-
content = File.read(dst_path + '.tmpl')
|
117
|
-
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
118
|
-
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
119
|
-
|
120
|
-
File.write(dst_path, content)
|
121
|
-
File.unlink(src_path)
|
122
|
-
end
|
123
|
-
|
124
|
-
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
125
|
-
task :sync_predefined_sets do
|
126
|
-
%w[assigned emoji whitespace].each do |prop|
|
127
|
-
require 'regexp_property_values'
|
128
|
-
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
|
-
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
desc 'Run all IPS benchmarks'
|
135
|
-
task :benchmark do
|
136
|
-
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
137
|
-
end
|
138
|
-
|
139
|
-
namespace :benchmark do
|
140
|
-
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
141
|
-
task :write_to_file do
|
142
|
-
$store_comparison_results = {}
|
143
|
-
|
144
|
-
Rake.application[:benchmark].invoke
|
145
|
-
|
146
|
-
File.open('BENCHMARK.md', 'w') do |f|
|
147
|
-
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
148
|
-
|
149
|
-
$store_comparison_results.each do |caption, result|
|
150
|
-
f.puts '```', caption, '',
|
151
|
-
result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
39
|
unless RUBY_PLATFORM =~ /java/
|
158
40
|
# recompile before benchmarking or running specs
|
159
41
|
task(:benchmark).enhance([:compile])
|
data/character_set.gemspec
CHANGED
@@ -21,14 +21,4 @@ Gem::Specification.new do |s|
|
|
21
21
|
s.extensions = %w[ext/character_set/extconf.rb]
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
|
-
|
25
|
-
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
26
|
-
s.add_development_dependency 'codecov', '~> 0.1'
|
27
|
-
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
28
|
-
s.add_development_dependency 'rake', '~> 12.0'
|
29
|
-
s.add_development_dependency 'rake-compiler', '~> 1.0'
|
30
|
-
s.add_development_dependency 'range_compressor', '~> 1.0'
|
31
|
-
s.add_development_dependency 'regexp_parser', '~> 1.3'
|
32
|
-
s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
|
33
|
-
s.add_development_dependency 'rspec', '~> 3.8'
|
34
24
|
end
|