character_set 1.2.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +17 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +54 -0
- data/README.md +51 -12
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -4
- data/ext/character_set/character_set.c +969 -415
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +41 -43
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +677 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +152 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +69 -50
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +114 -17
- data/.travis.yml +0 -8
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9622bc20bbdb48f8deff84dbed9e800e6bc500a6a08a27e7b3aea2ea651cd278
|
|
4
|
+
data.tar.gz: 5853e8d5be7e9a1963419aa4f9fbc631148fe5bef45aa185b9117d32b44aa959
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2cc2a60b9388a2e3beef66da20aa8205cc501980a7dc66f2716c66f7e999a083927b27a761e6b932b6d5c16b8e5968f8e04370ecf3c999326f378f60bfa3cedc
|
|
7
|
+
data.tar.gz: a2a8d1f9ac6cdf6302af98662fc3efda4b8c6fe003c7cdc853a61a64f9c7a596b1bbd7a79dca19081b8ce2576f9c3d848869141b164c145e22befaaffec8b265
|
data/.gitattributes
ADDED
data/.github/workflows/gouteur.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: gouteur
|
|
2
|
+
|
|
3
|
+
on: [push, pull_request]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
build:
|
|
7
|
+
runs-on: ubuntu-latest
|
|
8
|
+
|
|
9
|
+
steps:
|
|
10
|
+
- uses: actions/checkout@v2
|
|
11
|
+
- name: Set up Ruby
|
|
12
|
+
uses: ruby/setup-ruby@v1
|
|
13
|
+
with:
|
|
14
|
+
ruby-version: 2.7
|
|
15
|
+
- name: Prepare
|
|
16
|
+
run: |
|
|
17
|
+
bundle install --jobs 4
|
|
18
|
+
bundle exec rake compile
|
|
19
|
+
- name: Test
|
|
20
|
+
run: bundle exec gouteur
|
data/.github/workflows/lint.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
|
|
2
|
+
|
|
3
|
+
name: rubocop linting
|
|
4
|
+
|
|
5
|
+
on: [push, pull_request]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v2
|
|
13
|
+
- name: Set up Ruby
|
|
14
|
+
uses: ruby/setup-ruby@v1
|
|
15
|
+
with:
|
|
16
|
+
ruby-version: 2.7
|
|
17
|
+
- name: Cache gems
|
|
18
|
+
uses: actions/cache@v1
|
|
19
|
+
with:
|
|
20
|
+
path: vendor/bundle
|
|
21
|
+
key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
|
|
22
|
+
restore-keys: |
|
|
23
|
+
${{ runner.os }}-rubocop-
|
|
24
|
+
- name: Install gems
|
|
25
|
+
run: |
|
|
26
|
+
bundle config path vendor/bundle
|
|
27
|
+
bundle install --jobs 4 --retry 3
|
|
28
|
+
- name: Run rubocop
|
|
29
|
+
run: bundle exec rubocop --lint
|
data/.github/workflows/tests.yml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on: [push, pull_request]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
build:
|
|
7
|
+
runs-on: ubuntu-latest
|
|
8
|
+
|
|
9
|
+
strategy:
|
|
10
|
+
matrix:
|
|
11
|
+
ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v2
|
|
15
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
|
16
|
+
uses: ruby/setup-ruby@v1
|
|
17
|
+
with:
|
|
18
|
+
ruby-version: ${{ matrix.ruby }}
|
|
19
|
+
- name: Install dependencies
|
|
20
|
+
run: bundle install --jobs 4
|
|
21
|
+
- name: Test with Rake
|
|
22
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/.gouteur.yml
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
Exclude:
|
|
3
|
+
- '**/doc/*'
|
|
4
|
+
- '**/pkg/*'
|
|
5
|
+
- '**/spec/ruby-spec/**/*'
|
|
6
|
+
- '**/vendor/**/*' # vendored dependencies
|
|
7
|
+
NewCops: enable
|
|
8
|
+
RubyInterpreters:
|
|
9
|
+
- ruby
|
|
10
|
+
- rake
|
|
11
|
+
TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
|
|
12
|
+
|
|
13
|
+
Lint/AmbiguousOperatorPrecedence:
|
|
14
|
+
Enabled: false
|
|
15
|
+
|
|
16
|
+
Lint/AmbiguousRegexpLiteral:
|
|
17
|
+
Enabled: false
|
data/BENCHMARK.md
CHANGED
|
@@ -1,50 +1,86 @@
|
|
|
1
|
-
Results of `rake:benchmark` on ruby
|
|
1
|
+
Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
|
|
2
2
|
|
|
3
|
+
```
|
|
4
|
+
Counting non-letters
|
|
5
|
+
|
|
6
|
+
CharacterSet#count_in: 9472902.2 i/s
|
|
7
|
+
String#count: 2221799.9 i/s - 4.26x slower
|
|
8
|
+
```
|
|
3
9
|
```
|
|
4
10
|
Detecting non-whitespace
|
|
5
11
|
|
|
6
|
-
CharacterSet#cover?:
|
|
7
|
-
Regexp#match?:
|
|
12
|
+
CharacterSet#cover?: 12388427.2 i/s
|
|
13
|
+
Regexp#match?: 7901676.8 i/s - 1.57x slower
|
|
8
14
|
```
|
|
9
15
|
```
|
|
10
16
|
Detecting non-letters
|
|
11
17
|
|
|
12
|
-
CharacterSet#cover?:
|
|
13
|
-
Regexp#match?:
|
|
18
|
+
CharacterSet#cover?: 12263689.1 i/s
|
|
19
|
+
Regexp#match?: 4940889.9 i/s - 2.48x slower
|
|
14
20
|
```
|
|
15
21
|
```
|
|
16
22
|
Removing whitespace
|
|
17
23
|
|
|
18
|
-
CharacterSet#delete_in:
|
|
19
|
-
String#gsub:
|
|
24
|
+
CharacterSet#delete_in: 2406722.6 i/s
|
|
25
|
+
String#gsub: 235760.3 i/s - 10.21x slower
|
|
20
26
|
```
|
|
21
27
|
```
|
|
22
28
|
Removing whitespace, emoji and umlauts
|
|
23
29
|
|
|
24
|
-
CharacterSet#delete_in:
|
|
25
|
-
String#gsub:
|
|
30
|
+
CharacterSet#delete_in: 1653607.6 i/s
|
|
31
|
+
String#gsub: 272782.9 i/s - 6.06x slower
|
|
26
32
|
```
|
|
27
33
|
```
|
|
28
34
|
Removing non-whitespace
|
|
29
35
|
|
|
30
|
-
CharacterSet#keep_in:
|
|
31
|
-
String#gsub:
|
|
36
|
+
CharacterSet#keep_in: 2671038.2 i/s
|
|
37
|
+
String#gsub: 242551.0 i/s - 11.01x slower
|
|
32
38
|
```
|
|
33
39
|
```
|
|
34
40
|
Extracting emoji
|
|
35
41
|
|
|
36
|
-
CharacterSet#keep_in:
|
|
37
|
-
String#gsub:
|
|
42
|
+
CharacterSet#keep_in: 1726496.5 i/s
|
|
43
|
+
String#gsub: 215609.2 i/s - 8.01x slower
|
|
44
|
+
```
|
|
45
|
+
```
|
|
46
|
+
Extracting emoji to an Array
|
|
47
|
+
|
|
48
|
+
CharacterSet#scan: 2373856.1 i/s
|
|
49
|
+
String#scan: 480000.5 i/s - 4.95x slower
|
|
38
50
|
```
|
|
39
51
|
```
|
|
40
52
|
Detecting whitespace
|
|
41
53
|
|
|
42
|
-
CharacterSet#used_by?:
|
|
43
|
-
Regexp#match?:
|
|
54
|
+
CharacterSet#used_by?: 11988328.7 i/s
|
|
55
|
+
Regexp#match?: 6758146.8 i/s - 1.77x slower
|
|
44
56
|
```
|
|
45
57
|
```
|
|
46
58
|
Detecting emoji in a large string
|
|
47
59
|
|
|
48
|
-
CharacterSet#used_by?:
|
|
49
|
-
Regexp#match?:
|
|
60
|
+
CharacterSet#used_by?: 288223.3 i/s
|
|
61
|
+
Regexp#match?: 102384.2 i/s - 2.82x slower
|
|
62
|
+
```
|
|
63
|
+
```
|
|
64
|
+
Adding entries
|
|
65
|
+
|
|
66
|
+
CharacterSet#add: 2538251.2 i/s
|
|
67
|
+
SortedSet#add: 443925.9 i/s - 5.72x slower
|
|
68
|
+
```
|
|
69
|
+
```
|
|
70
|
+
Removing entries
|
|
71
|
+
|
|
72
|
+
CharacterSet#delete: 2487620.8 i/s
|
|
73
|
+
SortedSet#delete: 628816.1 i/s - 3.96x slower
|
|
74
|
+
```
|
|
75
|
+
```
|
|
76
|
+
Merging entries
|
|
77
|
+
|
|
78
|
+
CharacterSet#merge: 551.6 i/s
|
|
79
|
+
SortedSet#merge: 1.4 i/s - 393.59x slower
|
|
80
|
+
```
|
|
81
|
+
```
|
|
82
|
+
Getting the min and max
|
|
83
|
+
|
|
84
|
+
CharacterSet#minmax: 636890.7 i/s
|
|
85
|
+
SortedSet#minmax: 254.1 i/s - 2506.20x slower
|
|
50
86
|
```
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,60 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
+
## [1.5.0] - 2021-12-05
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
|
|
12
|
+
- latest unicode case-folding data (for `#case_insensitive`)
|
|
13
|
+
- support for passing any Enumerable to `#disjoint?`, `#intersect?`
|
|
14
|
+
- this matches recent broadening of these methods in `ruby/set`
|
|
15
|
+
- new instance method `#secure_token` (see README)
|
|
16
|
+
- class method `::of` now accepts more than one `String`
|
|
17
|
+
- `CharacterSet::ExpressionConverter` can now build output of any Set-like class
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
|
|
21
|
+
- `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
|
|
22
|
+
- it used to return a regular `CharacterSet`
|
|
23
|
+
|
|
24
|
+
## [1.4.1] - 2021-01-10
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
- multiple fixes for Ruby 3
|
|
28
|
+
- fixed segfault for some `String` manipulation cases
|
|
29
|
+
- added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
|
|
30
|
+
- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
|
|
31
|
+
|
|
32
|
+
## [1.4.0] - 2019-06-07
|
|
33
|
+
|
|
34
|
+
### Added
|
|
35
|
+
- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
|
|
36
|
+
- allows for much shorter astral plane representations e.g. in JavaScript
|
|
37
|
+
- thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
|
|
38
|
+
- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
|
|
39
|
+
|
|
40
|
+
### Fixed
|
|
41
|
+
- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
|
|
42
|
+
|
|
43
|
+
## [1.3.0] - 2019-04-26
|
|
44
|
+
|
|
45
|
+
### Added
|
|
46
|
+
- improved `String` manipulation speed
|
|
47
|
+
- improved initialization and `#merge` speed when passing a large `Range`
|
|
48
|
+
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
|
49
|
+
- before, every set instance required 136 KB for codepoints
|
|
50
|
+
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
|
51
|
+
- `#count_in` and `#scan_in` methods for `String` interaction
|
|
52
|
+
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
|
53
|
+
- conversion methods `#assigned_part`, `#valid_part`
|
|
54
|
+
- sectioning methods `#ascii_part`, `#plane(n)`
|
|
55
|
+
- section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
|
|
56
|
+
|
|
57
|
+
### Fixed
|
|
58
|
+
- `#count` now supports passing an argument or block as usual
|
|
59
|
+
- `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
|
|
60
|
+
|
|
7
61
|
## [1.2.0] - 2019-04-02
|
|
8
62
|
|
|
9
63
|
### Added
|
data/README.md
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
# CharacterSet
|
|
2
2
|
|
|
3
3
|
[](http://badge.fury.io/rb/character_set)
|
|
4
|
-
[](https://github.com/jaynetics/character_set/actions)
|
|
5
|
+
[](https://github.com/jaynetics/character_set/actions)
|
|
6
|
+
[](https://codecov.io/gh/jaynetics/character_set)
|
|
5
7
|
|
|
6
|
-
|
|
8
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
|
|
9
|
+
|
|
10
|
+
It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
|
|
7
11
|
|
|
8
12
|
Many parts can be used independently, e.g.:
|
|
9
13
|
- `CharacterSet::Character`
|
|
14
|
+
- `CharacterSet::ExpressionConverter`
|
|
10
15
|
- `CharacterSet::Parser`
|
|
11
16
|
- `CharacterSet::Writer`
|
|
12
17
|
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
|
@@ -37,7 +42,7 @@ CharacterSet.parse('[a-c]')
|
|
|
37
42
|
CharacterSet.parse('\U00000061-\U00000063')
|
|
38
43
|
```
|
|
39
44
|
|
|
40
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
|
|
45
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
|
41
46
|
|
|
42
47
|
```ruby
|
|
43
48
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
|
@@ -49,7 +54,7 @@ require 'character_set/core_ext/regexp_ext'
|
|
|
49
54
|
|
|
50
55
|
### Predefined utility sets
|
|
51
56
|
|
|
52
|
-
`ascii`, `ascii_alnum`, `
|
|
57
|
+
`ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
|
|
53
58
|
|
|
54
59
|
```ruby
|
|
55
60
|
CharacterSet.ascii # => #<CharacterSet (size: 128)>
|
|
@@ -60,7 +65,7 @@ CharacterSet.non_ascii
|
|
|
60
65
|
|
|
61
66
|
### Interact with Strings
|
|
62
67
|
|
|
63
|
-
CharacterSet can replace some `
|
|
68
|
+
`CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
|
|
64
69
|
|
|
65
70
|
`#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
|
|
66
71
|
|
|
@@ -71,6 +76,7 @@ CharacterSet.ascii.cover?('Tr') # => true
|
|
|
71
76
|
```
|
|
72
77
|
|
|
73
78
|
`#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
|
|
79
|
+
|
|
74
80
|
```ruby
|
|
75
81
|
string = 'Tüür'
|
|
76
82
|
|
|
@@ -84,6 +90,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
|
|
|
84
90
|
string # => ''
|
|
85
91
|
```
|
|
86
92
|
|
|
93
|
+
`#count_in` and `#scan` can replace `String#count` and `String#scan`:
|
|
94
|
+
|
|
95
|
+
```ruby
|
|
96
|
+
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
|
97
|
+
CharacterSet.non_ascii.scan('Tüür') # => ['ü', 'ü']
|
|
98
|
+
```
|
|
99
|
+
|
|
87
100
|
There is also a core extension for String interaction.
|
|
88
101
|
```ruby
|
|
89
102
|
require 'character_set/core_ext/string_ext'
|
|
@@ -100,7 +113,7 @@ require 'character_set/core_ext/string_ext'
|
|
|
100
113
|
|
|
101
114
|
### Manipulate
|
|
102
115
|
|
|
103
|
-
Use any
|
|
116
|
+
Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
|
|
104
117
|
|
|
105
118
|
Where appropriate, methods take both chars and codepoints, e.g.:
|
|
106
119
|
|
|
@@ -122,13 +135,13 @@ non_a.include?('ü') # => true
|
|
|
122
135
|
|
|
123
136
|
# surrogate pair halves are not included by default
|
|
124
137
|
CharacterSet['a'].inversion(include_surrogates: true)
|
|
125
|
-
# => #<CharacterSet (size:
|
|
138
|
+
# => #<CharacterSet (size: 1114112)>
|
|
126
139
|
```
|
|
127
140
|
|
|
128
141
|
`#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
|
|
129
142
|
|
|
130
143
|
```ruby
|
|
131
|
-
CharacterSet['1', '
|
|
144
|
+
CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
132
145
|
```
|
|
133
146
|
|
|
134
147
|
### Write
|
|
@@ -156,18 +169,44 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
|
|
156
169
|
# disable abbreviation (grouping of codepoints in ranges)
|
|
157
170
|
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
|
158
171
|
|
|
159
|
-
#
|
|
160
|
-
|
|
172
|
+
# astral members require some trickery if we want to target environments
|
|
173
|
+
# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
|
|
174
|
+
set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
|
|
175
|
+
|
|
176
|
+
# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
|
|
177
|
+
set.to_s_with_surrogate_ranges
|
|
178
|
+
# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
|
|
179
|
+
|
|
180
|
+
# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
|
|
181
|
+
# don't work in your target environment:
|
|
182
|
+
set.to_s_with_surrogate_alternation
|
|
183
|
+
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Other features
|
|
187
|
+
|
|
188
|
+
#### Secure tokens
|
|
189
|
+
|
|
190
|
+
Generate secure random strings of characters from a set:
|
|
191
|
+
|
|
192
|
+
```ruby
|
|
193
|
+
CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
|
|
194
|
+
CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
|
|
161
195
|
```
|
|
162
196
|
|
|
163
|
-
|
|
197
|
+
#### Unicode planes
|
|
164
198
|
|
|
165
|
-
There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
|
199
|
+
There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
|
166
200
|
```Ruby
|
|
201
|
+
CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
|
|
202
|
+
CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
|
|
203
|
+
CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
|
|
204
|
+
CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
|
|
167
205
|
CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
|
|
168
206
|
CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
|
|
169
207
|
CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
|
|
170
208
|
CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
|
|
209
|
+
CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
|
|
171
210
|
CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
172
211
|
CharacterSet::Character.new('a').plane # => 0
|
|
173
212
|
```
|
data/Rakefile
CHANGED
|
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
|
|
|
7
7
|
|
|
8
8
|
task default: :spec
|
|
9
9
|
|
|
10
|
+
namespace :spec do
|
|
11
|
+
task :quick do
|
|
12
|
+
ENV['SKIP_MEMSAFETY_SPECS'] = '1'
|
|
13
|
+
Rake::Task[:spec].invoke
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
10
17
|
Rake::ExtensionTask.new('character_set') do |ext|
|
|
11
18
|
ext.lib_dir = 'lib/character_set'
|
|
12
19
|
end
|
|
@@ -106,27 +113,22 @@ task :sync_casefold_data do
|
|
|
106
113
|
hash[from] = to if type == 'C'
|
|
107
114
|
end.sort
|
|
108
115
|
|
|
109
|
-
File.
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
// -*-C-*-
|
|
113
|
-
|
|
114
|
-
typedef struct casefold_mapping {
|
|
115
|
-
unsigned long from;
|
|
116
|
-
unsigned long to;
|
|
117
|
-
} casefold_mapping;
|
|
118
|
-
|
|
119
|
-
#define CASEFOLD_COUNT #{mapping.size}
|
|
116
|
+
content = File.read(dst_path + '.tmpl')
|
|
117
|
+
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
|
118
|
+
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
|
120
119
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
|
|
120
|
+
File.write(dst_path, content)
|
|
121
|
+
File.unlink(src_path)
|
|
122
|
+
end
|
|
125
123
|
|
|
126
|
-
|
|
124
|
+
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
|
125
|
+
task :sync_predefined_sets do
|
|
126
|
+
%w[assigned emoji whitespace].each do |prop|
|
|
127
|
+
require 'regexp_property_values'
|
|
128
|
+
ranges = RegexpPropertyValues[prop].matched_ranges
|
|
129
|
+
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
|
130
|
+
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
|
127
131
|
end
|
|
128
|
-
|
|
129
|
-
File.unlink(src_path)
|
|
130
132
|
end
|
|
131
133
|
|
|
132
134
|
desc 'Run all IPS benchmarks'
|
data/benchmarks/count_in.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
require_relative './shared'
|
|
2
|
+
|
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
|
4
|
+
tr = '^A-Za-z'
|
|
5
|
+
cs = CharacterSet.non_ascii_letter
|
|
6
|
+
|
|
7
|
+
benchmark(
|
|
8
|
+
caption: 'Counting non-letters',
|
|
9
|
+
cases: {
|
|
10
|
+
'String#count' => -> { str.count(tr) },
|
|
11
|
+
'CharacterSet#count_in' => -> { cs.count_in(str) },
|
|
12
|
+
}
|
|
13
|
+
)
|
data/benchmarks/delete_in.rb
CHANGED
|
@@ -14,7 +14,7 @@ benchmark(
|
|
|
14
14
|
|
|
15
15
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
|
16
16
|
rx = /[\s\p{emoji}äüö]/
|
|
17
|
-
cs = CharacterSet.whitespace + CharacterSet.emoji +
|
|
17
|
+
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
|
18
18
|
|
|
19
19
|
benchmark(
|
|
20
20
|
caption: 'Removing whitespace, emoji and umlauts',
|
data/benchmarks/scan.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
require_relative './shared'
|
|
2
|
+
|
|
3
|
+
str = 'Lorem ipsum ⛷ et dolorem'
|
|
4
|
+
rx = /\p{emoji}/
|
|
5
|
+
cs = CharacterSet.emoji
|
|
6
|
+
|
|
7
|
+
benchmark(
|
|
8
|
+
caption: 'Extracting emoji to an Array',
|
|
9
|
+
cases: {
|
|
10
|
+
'String#scan' => -> { str.scan(rx) },
|
|
11
|
+
'CharacterSet#scan' => -> { cs.scan(str) },
|
|
12
|
+
}
|
|
13
|
+
)
|
data/benchmarks/shared.rb
CHANGED
|
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
|
3
3
|
|
|
4
4
|
require 'benchmark/ips'
|
|
5
5
|
require 'character_set'
|
|
6
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
|
7
|
+
require 'sorted_set'
|
|
8
|
+
else
|
|
9
|
+
require 'set'
|
|
10
|
+
end
|
|
6
11
|
|
|
7
12
|
def benchmark(caption: nil, cases: {})
|
|
8
13
|
puts caption
|
data/benchmarks/z_add.rb
ADDED
data/benchmarks/z_delete.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require_relative './shared'
|
|
2
|
+
|
|
3
|
+
cs = CharacterSet.new(0..0x10FFFF)
|
|
4
|
+
ss = SortedSet.new(0..0x10FFFF)
|
|
5
|
+
|
|
6
|
+
benchmark(
|
|
7
|
+
caption: 'Removing entries',
|
|
8
|
+
cases: {
|
|
9
|
+
'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
|
|
10
|
+
'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
|
|
11
|
+
}
|
|
12
|
+
)
|
data/benchmarks/z_merge.rb
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require_relative './shared'
|
|
2
|
+
|
|
3
|
+
cs1 = CharacterSet.new(0...0x88000)
|
|
4
|
+
cs2 = CharacterSet.new(0x88000..0x10FFFF)
|
|
5
|
+
|
|
6
|
+
ss1 = SortedSet.new(0...0x88000)
|
|
7
|
+
ss2 = SortedSet.new(0x88000..0x10FFFF)
|
|
8
|
+
|
|
9
|
+
benchmark(
|
|
10
|
+
caption: 'Merging entries',
|
|
11
|
+
cases: {
|
|
12
|
+
'CharacterSet#merge' => -> { cs1.merge(cs2) },
|
|
13
|
+
'SortedSet#merge' => -> { ss1.merge(ss2) },
|
|
14
|
+
}
|
|
15
|
+
)
|
data/benchmarks/z_minmax.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require_relative './shared'
|
|
2
|
+
|
|
3
|
+
cs = CharacterSet.new(0..0xFFFF)
|
|
4
|
+
ss = SortedSet.new(0..0xFFFF)
|
|
5
|
+
|
|
6
|
+
benchmark(
|
|
7
|
+
caption: 'Getting the min and max',
|
|
8
|
+
cases: {
|
|
9
|
+
'CharacterSet#minmax' => -> { cs.minmax },
|
|
10
|
+
'SortedSet#minmax' => -> { ss.minmax },
|
|
11
|
+
}
|
|
12
|
+
)
|
data/bin/console
CHANGED
data/character_set.gemspec
CHANGED
|
@@ -22,11 +22,24 @@ Gem::Specification.new do |s|
|
|
|
22
22
|
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
|
24
24
|
|
|
25
|
+
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
|
26
|
+
# This dependency is only used if the C extension is unavailable.
|
|
27
|
+
# JRuby has it in the stdlib.
|
|
28
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
|
29
|
+
s.add_dependency 'sorted_set', '~> 1.0'
|
|
30
|
+
end
|
|
31
|
+
|
|
25
32
|
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
|
26
|
-
s.add_development_dependency '
|
|
27
|
-
s.add_development_dependency 'rake
|
|
33
|
+
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
|
34
|
+
s.add_development_dependency 'rake', '~> 13.0'
|
|
35
|
+
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
|
28
36
|
s.add_development_dependency 'range_compressor', '~> 1.0'
|
|
29
|
-
s.add_development_dependency 'regexp_parser', '~> 1
|
|
30
|
-
s.add_development_dependency 'regexp_property_values', '~> 0
|
|
37
|
+
s.add_development_dependency 'regexp_parser', '~> 2.1'
|
|
38
|
+
s.add_development_dependency 'regexp_property_values', '~> 1.0'
|
|
31
39
|
s.add_development_dependency 'rspec', '~> 3.8'
|
|
40
|
+
if RUBY_VERSION.to_f >= 2.7
|
|
41
|
+
s.add_development_dependency 'codecov', '~> 0.2.12'
|
|
42
|
+
s.add_development_dependency 'gouteur', '~> 1.0.0'
|
|
43
|
+
s.add_development_dependency 'rubocop', '~> 1.8'
|
|
44
|
+
end
|
|
32
45
|
end
|