character_set 1.4.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +28 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +20 -0
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +64 -1
- data/Gemfile +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -9
- data/Rakefile +2 -120
- data/character_set.gemspec +0 -10
- data/ext/character_set/character_set.c +123 -121
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/core_ext/regexp_ext.rb +9 -1
- data/lib/character_set/core_ext/string_ext.rb +2 -2
- data/lib/character_set/expression_converter.rb +40 -56
- data/lib/character_set/parser.rb +8 -4
- data/lib/character_set/predefined_sets/assigned.cps +110 -78
- data/lib/character_set/predefined_sets/emoji.cps +16 -14
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +17 -21
- data/lib/character_set/ruby_fallback/set_methods.rb +9 -16
- data/lib/character_set/ruby_fallback/vendored_set_classes.rb +385 -0
- data/lib/character_set/ruby_fallback.rb +18 -2
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +25 -11
- data/lib/character_set/version.rb +1 -1
- data/tasks/benchmark.rake +20 -0
- data/{benchmarks → tasks/benchmarks}/delete_in.rb +5 -1
- data/{benchmarks → tasks/benchmarks}/keep_in.rb +5 -1
- data/tasks/benchmarks/shared.rb +28 -0
- data/tasks/sync_casefold_data.rake +20 -0
- data/tasks/sync_predefined_sets.rake +9 -0
- data/tasks/sync_ruby_spec.rake +65 -0
- metadata +29 -146
- data/.travis.yml +0 -9
- data/benchmarks/shared.rb +0 -26
- /data/{benchmarks → tasks/benchmarks}/count_in.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/cover.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/scan.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/used_by.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_add.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_delete.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_merge.rb +0 -0
- /data/{benchmarks → tasks/benchmarks}/z_minmax.rb +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ebb6792f685df02534f1ef04a92d7f0c5fdcb482e5aaa4856d7a39726e17f007
|
|
4
|
+
data.tar.gz: c6630aab9b6506c46a970ba83c257cd753f8f76760b6ce8d2639f51efba83eeb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4c773a0546d05939d0b295e50355c6efe870a1ed74901d63c24097ff598d4a43bcd00ce2d03fb492a48fd9c03968a79ee78b789d92836843d6621dca3e8f313c
|
|
7
|
+
data.tar.gz: 560d3c3aa3f7e4daac3b6d2c89fb9dd6840777fa4d5896fb33564023ef745d81a7e4d0e51fe0ba42f6cd4504bc0b088657cd4ef1ab15d213aa1bb096ba404542
|
data/.gitattributes
CHANGED
data/.github/workflows/gouteur.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: gouteur
|
|
2
|
+
|
|
3
|
+
on: [push, pull_request]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
build:
|
|
7
|
+
runs-on: ubuntu-latest
|
|
8
|
+
|
|
9
|
+
steps:
|
|
10
|
+
- uses: actions/checkout@v2
|
|
11
|
+
- name: Set up Ruby
|
|
12
|
+
uses: ruby/setup-ruby@v1
|
|
13
|
+
with:
|
|
14
|
+
ruby-version: 3.3
|
|
15
|
+
- name: Prepare
|
|
16
|
+
run: |
|
|
17
|
+
bundle install --jobs 4
|
|
18
|
+
bundle exec rake compile
|
|
19
|
+
- name: Test
|
|
20
|
+
run: bundle exec gouteur
|
data/.github/workflows/lint.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
|
|
2
|
+
|
|
3
|
+
name: rubocop linting
|
|
4
|
+
|
|
5
|
+
on: [push, pull_request]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v2
|
|
13
|
+
- name: Set up Ruby
|
|
14
|
+
uses: ruby/setup-ruby@v1
|
|
15
|
+
with:
|
|
16
|
+
ruby-version: 3.3
|
|
17
|
+
- name: Cache gems
|
|
18
|
+
uses: actions/cache@v1
|
|
19
|
+
with:
|
|
20
|
+
path: vendor/bundle
|
|
21
|
+
key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
|
|
22
|
+
restore-keys: |
|
|
23
|
+
${{ runner.os }}-rubocop-
|
|
24
|
+
- name: Install gems
|
|
25
|
+
run: |
|
|
26
|
+
bundle config path vendor/bundle
|
|
27
|
+
bundle install --jobs 4 --retry 3
|
|
28
|
+
- name: Run rubocop
|
|
29
|
+
run: bundle exec rubocop --lint
|
data/.github/workflows/tests.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
schedule:
|
|
7
|
+
- cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
ruby: [ '2.4', '2.7', '3.0', '3.1', '3.2', '3.3', 'ruby-head', 'jruby-head' ]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v2
|
|
19
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
|
20
|
+
uses: ruby/setup-ruby@v1
|
|
21
|
+
with:
|
|
22
|
+
ruby-version: ${{ matrix.ruby }}
|
|
23
|
+
- name: Install dependencies
|
|
24
|
+
run: bundle install --jobs 4
|
|
25
|
+
- name: Test with Rake
|
|
26
|
+
run: bundle exec rake
|
|
27
|
+
- uses: codecov/codecov-action@v3
|
|
28
|
+
if: matrix.ruby == '3.2'
|
data/.gitignore
CHANGED
data/.gouteur.yml
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
Exclude:
|
|
3
|
+
- '**/doc/*'
|
|
4
|
+
- '**/pkg/*'
|
|
5
|
+
- '**/spec/ruby-spec/**/*'
|
|
6
|
+
- '**/vendor/**/*' # vendored dependencies
|
|
7
|
+
NewCops: enable
|
|
8
|
+
RubyInterpreters:
|
|
9
|
+
- ruby
|
|
10
|
+
- rake
|
|
11
|
+
TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
|
|
12
|
+
|
|
13
|
+
Lint/AmbiguousOperatorPrecedence:
|
|
14
|
+
Enabled: false
|
|
15
|
+
|
|
16
|
+
Lint/AmbiguousRegexpLiteral:
|
|
17
|
+
Enabled: false
|
|
18
|
+
|
|
19
|
+
Metrics:
|
|
20
|
+
Enabled: false
|
data/BENCHMARK.md
CHANGED
|
@@ -1,86 +1,90 @@
|
|
|
1
|
-
Results of `rake:benchmark` on ruby 2.
|
|
1
|
+
Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
|
|
2
2
|
|
|
3
3
|
```
|
|
4
4
|
Counting non-letters
|
|
5
5
|
|
|
6
|
-
CharacterSet#count_in:
|
|
7
|
-
String#count:
|
|
6
|
+
CharacterSet#count_in: 14627506.2 i/s
|
|
7
|
+
String#count: 3859777.0 i/s - 3.79x slower
|
|
8
8
|
```
|
|
9
9
|
```
|
|
10
10
|
Detecting non-whitespace
|
|
11
11
|
|
|
12
|
-
CharacterSet#cover?:
|
|
13
|
-
Regexp#match?:
|
|
12
|
+
CharacterSet#cover?: 17241902.8 i/s
|
|
13
|
+
Regexp#match?: 12971122.6 i/s - 1.33x slower
|
|
14
14
|
```
|
|
15
15
|
```
|
|
16
16
|
Detecting non-letters
|
|
17
17
|
|
|
18
|
-
CharacterSet#cover?:
|
|
19
|
-
Regexp#match?:
|
|
18
|
+
CharacterSet#cover?: 17243472.3 i/s
|
|
19
|
+
Regexp#match?: 7957626.9 i/s - 2.17x slower
|
|
20
20
|
```
|
|
21
21
|
```
|
|
22
|
-
Removing whitespace
|
|
22
|
+
Removing ASCII whitespace
|
|
23
23
|
|
|
24
|
-
CharacterSet#delete_in:
|
|
25
|
-
|
|
24
|
+
CharacterSet#delete_in: 6190975.7 i/s
|
|
25
|
+
String#tr: 4722716.6 i/s - 1.31x slower
|
|
26
|
+
String#gsub: 214239.5 i/s - 28.90x slower
|
|
26
27
|
```
|
|
27
28
|
```
|
|
28
29
|
Removing whitespace, emoji and umlauts
|
|
29
30
|
|
|
30
|
-
CharacterSet#delete_in:
|
|
31
|
-
|
|
31
|
+
CharacterSet#delete_in: 5890471.8 i/s
|
|
32
|
+
String#tr: 348506.8 i/s - 16.90x slower
|
|
33
|
+
String#gsub: 318268.3 i/s - 18.51x slower
|
|
32
34
|
```
|
|
33
35
|
```
|
|
34
36
|
Removing non-whitespace
|
|
35
37
|
|
|
36
|
-
CharacterSet#keep_in:
|
|
37
|
-
String#gsub:
|
|
38
|
+
CharacterSet#keep_in: 7396898.0 i/s
|
|
39
|
+
String#gsub: 208809.7 i/s - 35.42x slower
|
|
40
|
+
String#tr: 13.1 i/s - 564682.50x slower
|
|
38
41
|
```
|
|
39
42
|
```
|
|
40
|
-
|
|
43
|
+
Keeping only emoji
|
|
41
44
|
|
|
42
|
-
CharacterSet#keep_in:
|
|
43
|
-
String#gsub:
|
|
45
|
+
CharacterSet#keep_in: 7022741.1 i/s
|
|
46
|
+
String#gsub: 180939.6 i/s - 38.81x slower
|
|
47
|
+
String#tr: 13.1 i/s - 536724.50x slower
|
|
44
48
|
```
|
|
45
49
|
```
|
|
46
50
|
Extracting emoji to an Array
|
|
47
51
|
|
|
48
|
-
CharacterSet#scan:
|
|
49
|
-
String#scan:
|
|
52
|
+
CharacterSet#scan: 3023176.8 i/s
|
|
53
|
+
String#scan: 893225.8 i/s - 3.38x slower
|
|
50
54
|
```
|
|
51
55
|
```
|
|
52
56
|
Detecting whitespace
|
|
53
57
|
|
|
54
|
-
CharacterSet#used_by?:
|
|
55
|
-
Regexp#match?:
|
|
58
|
+
CharacterSet#used_by?: 17284025.9 i/s
|
|
59
|
+
Regexp#match?: 11847064.5 i/s - 1.46x slower
|
|
56
60
|
```
|
|
57
61
|
```
|
|
58
62
|
Detecting emoji in a large string
|
|
59
63
|
|
|
60
|
-
CharacterSet#used_by?:
|
|
61
|
-
Regexp#match?:
|
|
64
|
+
CharacterSet#used_by?: 341386.1 i/s
|
|
65
|
+
Regexp#match?: 183121.6 i/s - 1.86x slower
|
|
62
66
|
```
|
|
63
67
|
```
|
|
64
68
|
Adding entries
|
|
65
69
|
|
|
66
|
-
CharacterSet#add:
|
|
67
|
-
SortedSet#add:
|
|
70
|
+
CharacterSet#add: 4989762.3 i/s
|
|
71
|
+
SortedSet#add: 1157911.7 i/s - 4.31x slower
|
|
68
72
|
```
|
|
69
73
|
```
|
|
70
74
|
Removing entries
|
|
71
75
|
|
|
72
|
-
CharacterSet#delete:
|
|
73
|
-
SortedSet#delete:
|
|
76
|
+
CharacterSet#delete: 4996703.6 i/s
|
|
77
|
+
SortedSet#delete: 4177401.5 i/s - same-ish
|
|
74
78
|
```
|
|
75
79
|
```
|
|
76
80
|
Merging entries
|
|
77
81
|
|
|
78
|
-
CharacterSet#merge:
|
|
79
|
-
SortedSet#merge:
|
|
82
|
+
CharacterSet#merge: 666.7 i/s
|
|
83
|
+
SortedSet#merge: 4.0 i/s - 167.84x slower
|
|
80
84
|
```
|
|
81
85
|
```
|
|
82
86
|
Getting the min and max
|
|
83
87
|
|
|
84
|
-
CharacterSet#minmax:
|
|
85
|
-
SortedSet#minmax:
|
|
88
|
+
CharacterSet#minmax: 1596470.9 i/s
|
|
89
|
+
SortedSet#minmax: 866.4 i/s - 1842.74x slower
|
|
86
90
|
```
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,69 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [1.8.0] - 2024-01-07
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- support for `#<=>` and `#join`, which were added to `set` in the meantime
|
|
14
|
+
- support for getting the (overall) character set of a Regexp with multiple expressions
|
|
15
|
+
- support for global and local case-insensitivity in Regexp inputs
|
|
16
|
+
- `Regexp#{covered_by_character_set?,uses_character_set?}` methods (if core ext is used)
|
|
17
|
+
|
|
18
|
+
## [1.7.0] - 2023-05-12
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.2.0
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
|
|
26
|
+
- fixed processing of Strings that are not ASCII- or UTF8-encoded
|
|
27
|
+
- removed dependency on `set` and `sorted_set`
|
|
28
|
+
- thanks to https://github.com/mikebaldry for reporting a related issue (#2)
|
|
29
|
+
|
|
30
|
+
## [1.6.0] - 2022-02-16
|
|
31
|
+
|
|
32
|
+
### Added
|
|
33
|
+
|
|
34
|
+
- `::of` now supports both `String` and `Regexp` arguments
|
|
35
|
+
|
|
36
|
+
### Fixed
|
|
37
|
+
|
|
38
|
+
- fixed segfault during `String` manipulation on Ruby 3.2.0-dev
|
|
39
|
+
- improved performance for `String` manipulation
|
|
40
|
+
- allow usage in Ractors
|
|
41
|
+
- predefined sets must be pre-initialized for this, though
|
|
42
|
+
- e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
|
|
43
|
+
- call them once in the main Ractor to trigger initialization
|
|
44
|
+
|
|
45
|
+
## [1.5.0] - 2021-12-05
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
|
|
49
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
|
|
50
|
+
- latest unicode case-folding data (for `#case_insensitive`)
|
|
51
|
+
- support for passing any Enumerable to `#disjoint?`, `#intersect?`
|
|
52
|
+
- this matches recent broadening of these methods in `ruby/set`
|
|
53
|
+
- new instance method `#secure_token` (see README)
|
|
54
|
+
- class method `::of` now accepts more than one `String`
|
|
55
|
+
- `CharacterSet::ExpressionConverter` can now build output of any Set-like class
|
|
56
|
+
|
|
57
|
+
### Fixed
|
|
58
|
+
|
|
59
|
+
- `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
|
|
60
|
+
- it used to return a regular `CharacterSet`
|
|
61
|
+
|
|
62
|
+
## [1.4.1] - 2020-01-10
|
|
63
|
+
|
|
64
|
+
### Fixed
|
|
65
|
+
- multiple fixes for Ruby 3
|
|
66
|
+
- fixed segfault for some `String` manipulation cases
|
|
67
|
+
- added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
|
|
68
|
+
- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
|
|
69
|
+
|
|
7
70
|
## [1.4.0] - 2019-06-07
|
|
8
71
|
|
|
9
72
|
### Added
|
|
@@ -23,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
23
86
|
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
|
24
87
|
- before, every set instance required 136 KB for codepoints
|
|
25
88
|
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
|
26
|
-
- `#count_in` and `#
|
|
89
|
+
- `#count_in` and `#scan` methods for `String` interaction
|
|
27
90
|
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
|
28
91
|
- conversion methods `#assigned_part`, `#valid_part`
|
|
29
92
|
- sectioning methods `#ascii_part`, `#plane(n)`
|
data/Gemfile
CHANGED
|
@@ -4,3 +4,18 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
|
|
4
4
|
|
|
5
5
|
# Specify your gem's dependencies in character_set.gemspec
|
|
6
6
|
gemspec
|
|
7
|
+
|
|
8
|
+
gem 'benchmark-ips', '~> 2.7'
|
|
9
|
+
gem 'get_process_mem', '~> 0.2.3'
|
|
10
|
+
gem 'rake', '~> 13.1'
|
|
11
|
+
gem 'rake-compiler', '~> 1.1'
|
|
12
|
+
gem 'range_compressor', '~> 1.0'
|
|
13
|
+
gem 'regexp_parser', '~> 2.9'
|
|
14
|
+
gem 'regexp_property_values', '~> 1.5'
|
|
15
|
+
gem 'rspec', '~> 3.8'
|
|
16
|
+
gem 'warning', '~> 1.3'
|
|
17
|
+
if RUBY_VERSION.to_f >= 3.0
|
|
18
|
+
gem 'gouteur', '~> 1.0.0'
|
|
19
|
+
gem 'rubocop', '~> 1.59'
|
|
20
|
+
gem 'simplecov-cobertura', require: false
|
|
21
|
+
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
|
@@ -1,18 +1,21 @@
|
|
|
1
1
|
# CharacterSet
|
|
2
2
|
|
|
3
3
|
[](http://badge.fury.io/rb/character_set)
|
|
4
|
-
[](https://github.com/jaynetics/character_set/actions)
|
|
5
|
+
[](https://github.com/jaynetics/character_set/actions)
|
|
6
|
+
[](https://codecov.io/gh/jaynetics/character_set)
|
|
6
7
|
|
|
7
|
-
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
|
8
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
|
8
9
|
|
|
9
|
-
It
|
|
10
|
+
It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
|
|
11
|
+
|
|
12
|
+
It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
|
|
10
13
|
|
|
11
14
|
Many parts can be used independently, e.g.:
|
|
12
15
|
- `CharacterSet::Character`
|
|
16
|
+
- `CharacterSet::ExpressionConverter`
|
|
13
17
|
- `CharacterSet::Parser`
|
|
14
18
|
- `CharacterSet::Writer`
|
|
15
|
-
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
|
16
19
|
|
|
17
20
|
## Usage
|
|
18
21
|
|
|
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
|
|
|
40
43
|
CharacterSet.parse('\U00000061-\U00000063')
|
|
41
44
|
```
|
|
42
45
|
|
|
43
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed,
|
|
46
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` instances and unicode property names can also be read.
|
|
44
47
|
|
|
45
48
|
```ruby
|
|
49
|
+
CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
|
|
46
50
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
|
47
51
|
|
|
48
52
|
require 'character_set/core_ext/regexp_ext'
|
|
@@ -92,7 +96,7 @@ string # => ''
|
|
|
92
96
|
|
|
93
97
|
```ruby
|
|
94
98
|
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
|
95
|
-
CharacterSet.non_ascii.
|
|
99
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
|
96
100
|
```
|
|
97
101
|
|
|
98
102
|
There is also a core extension for String interaction.
|
|
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
|
143
147
|
```
|
|
144
148
|
|
|
145
149
|
### Write
|
|
150
|
+
|
|
146
151
|
```ruby
|
|
147
152
|
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
|
148
153
|
|
|
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
|
|
|
181
186
|
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
|
182
187
|
```
|
|
183
188
|
|
|
184
|
-
###
|
|
189
|
+
### Other features
|
|
190
|
+
|
|
191
|
+
#### Secure tokens
|
|
192
|
+
|
|
193
|
+
Generate secure random strings of characters from a set:
|
|
194
|
+
|
|
195
|
+
```ruby
|
|
196
|
+
CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
|
|
197
|
+
CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
#### Unicode planes
|
|
185
201
|
|
|
186
202
|
There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
|
187
203
|
```Ruby
|
|
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
|
198
214
|
CharacterSet::Character.new('a').plane # => 0
|
|
199
215
|
```
|
|
200
216
|
|
|
201
|
-
|
|
217
|
+
## Contributions
|
|
202
218
|
|
|
203
219
|
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
CHANGED
|
@@ -3,6 +3,8 @@ require 'rspec/core/rake_task'
|
|
|
3
3
|
require 'rubygems/package_task'
|
|
4
4
|
require 'rake/extensiontask'
|
|
5
5
|
|
|
6
|
+
Dir['tasks/**/*.rake'].each { |file| load(file) }
|
|
7
|
+
|
|
6
8
|
RSpec::Core::RakeTask.new(:spec)
|
|
7
9
|
|
|
8
10
|
task default: :spec
|
|
@@ -34,126 +36,6 @@ end
|
|
|
34
36
|
|
|
35
37
|
task package: 'java:gem'
|
|
36
38
|
|
|
37
|
-
desc 'Download relevant ruby/spec tests, adapt to CharacterSet and its variants'
|
|
38
|
-
task :sync_ruby_spec do
|
|
39
|
-
require 'fileutils'
|
|
40
|
-
|
|
41
|
-
variants = {
|
|
42
|
-
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
|
43
|
-
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
# download fresh specs from ruby/spec repository
|
|
47
|
-
variants.each do |_, dir|
|
|
48
|
-
FileUtils.rm_rf(dir) if File.exist?(dir)
|
|
49
|
-
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# make copies for each CharacterSet variant
|
|
53
|
-
base = variants.first[1]
|
|
54
|
-
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
|
55
|
-
|
|
56
|
-
# adapt specs to work with CharacterSet
|
|
57
|
-
variants.each do |class_name, dir|
|
|
58
|
-
Dir["#{dir}/**/*.rb"].each do |spec|
|
|
59
|
-
# ignore some tests that do not apply or are covered otherwise
|
|
60
|
-
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
|
61
|
-
File.delete(spec)
|
|
62
|
-
next
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
adapted_content =
|
|
66
|
-
File.read(spec).
|
|
67
|
-
# adapt class name
|
|
68
|
-
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
|
69
|
-
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
|
70
|
-
# get shared specs from a single shared dir at the parent level
|
|
71
|
-
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
|
72
|
-
# make 'mspec' syntax rspec-compatible
|
|
73
|
-
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
|
74
|
-
gsub(/be_(false|true)/, 'be \1').
|
|
75
|
-
gsub('stub!', 'stub').
|
|
76
|
-
gsub('mock', 'double').
|
|
77
|
-
gsub('@method', 'method').
|
|
78
|
-
# remove unneeded requires
|
|
79
|
-
gsub(/require 'set'\n/, '').
|
|
80
|
-
gsub(/require.*spec_helper.*\n/, '').
|
|
81
|
-
gsub(/\A\n+/, '').
|
|
82
|
-
# make examples use Integers/codepoints
|
|
83
|
-
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
|
84
|
-
gsub('"one"', '1').
|
|
85
|
-
gsub('"two"', '2').
|
|
86
|
-
gsub('"three"', '3').
|
|
87
|
-
gsub('"four"', '4').
|
|
88
|
-
gsub('"five"', '5').
|
|
89
|
-
gsub(/x.(size|length) == 3/, 'x != 3').
|
|
90
|
-
gsub(/x.(size|length) != 3/, 'x == 3').
|
|
91
|
-
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
|
92
|
-
|
|
93
|
-
File.open(spec, 'w') { |f| f.puts adapted_content }
|
|
94
|
-
end
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# keep only one copy of the shared specs, at the parent level
|
|
98
|
-
FileUtils.rm_rf(base + '/../shared')
|
|
99
|
-
FileUtils.mv(base + '/shared', base + '/../')
|
|
100
|
-
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
desc 'Download unicode casefold data and write new C header file'
|
|
104
|
-
task :sync_casefold_data do
|
|
105
|
-
src_path = './CaseFolding.txt'
|
|
106
|
-
dst_path = './ext/character_set/unicode_casefold_table.h'
|
|
107
|
-
|
|
108
|
-
`wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt`
|
|
109
|
-
|
|
110
|
-
mapping = File.foreach(src_path).each_with_object({}) do |line, hash|
|
|
111
|
-
from, type, to = line.split(/\s*;\s*/).first(3)
|
|
112
|
-
# type 'C' stands for 'common', excludes mappings to multiple chars
|
|
113
|
-
hash[from] = to if type == 'C'
|
|
114
|
-
end.sort
|
|
115
|
-
|
|
116
|
-
content = File.read(dst_path + '.tmpl')
|
|
117
|
-
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
|
118
|
-
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
|
119
|
-
|
|
120
|
-
File.write(dst_path, content)
|
|
121
|
-
File.unlink(src_path)
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
|
125
|
-
task :sync_predefined_sets do
|
|
126
|
-
%w[assigned emoji whitespace].each do |prop|
|
|
127
|
-
require 'regexp_property_values'
|
|
128
|
-
ranges = RegexpPropertyValues[prop].matched_ranges
|
|
129
|
-
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
|
130
|
-
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
|
131
|
-
end
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
desc 'Run all IPS benchmarks'
|
|
135
|
-
task :benchmark do
|
|
136
|
-
Dir['./benchmarks/*.rb'].sort.each { |file| require file }
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
namespace :benchmark do
|
|
140
|
-
desc 'Run all IPS benchmarks and store the comparison results in BENCHMARK.md'
|
|
141
|
-
task :write_to_file do
|
|
142
|
-
$store_comparison_results = {}
|
|
143
|
-
|
|
144
|
-
Rake.application[:benchmark].invoke
|
|
145
|
-
|
|
146
|
-
File.open('BENCHMARK.md', 'w') do |f|
|
|
147
|
-
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
|
148
|
-
|
|
149
|
-
$store_comparison_results.each do |caption, result|
|
|
150
|
-
f.puts '```', caption, '',
|
|
151
|
-
result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
|
|
152
|
-
end
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
|
|
157
39
|
unless RUBY_PLATFORM =~ /java/
|
|
158
40
|
# recompile before benchmarking or running specs
|
|
159
41
|
task(:benchmark).enhance([:compile])
|
data/character_set.gemspec
CHANGED
|
@@ -21,14 +21,4 @@ Gem::Specification.new do |s|
|
|
|
21
21
|
s.extensions = %w[ext/character_set/extconf.rb]
|
|
22
22
|
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
|
24
|
-
|
|
25
|
-
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
|
26
|
-
s.add_development_dependency 'codecov', '~> 0.1'
|
|
27
|
-
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
|
28
|
-
s.add_development_dependency 'rake', '~> 12.0'
|
|
29
|
-
s.add_development_dependency 'rake-compiler', '~> 1.0'
|
|
30
|
-
s.add_development_dependency 'range_compressor', '~> 1.0'
|
|
31
|
-
s.add_development_dependency 'regexp_parser', '~> 1.3'
|
|
32
|
-
s.add_development_dependency 'regexp_property_values', '~> 0.3.5'
|
|
33
|
-
s.add_development_dependency 'rspec', '~> 3.8'
|
|
34
24
|
end
|