character_set 1.1.1 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +11 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +47 -0
- data/README.md +38 -14
- data/Rakefile +60 -36
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -6
- data/ext/character_set/character_set.c +963 -414
- data/ext/character_set/unicode_casefold_table.h +10 -2
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +25 -27
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/shared_methods.rb +60 -49
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +88 -22
- data/.travis.yml +0 -11
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
|
|
4
|
+
data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
|
|
7
|
+
data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3
|
data/.gitattributes
ADDED
|
data/.github/workflows/lint.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# based on https://github.com/rails/rails/blob/4a78dcb/.github/workflows/rubocop.yml
|
|
2
|
+
|
|
3
|
+
name: rubocop linting
|
|
4
|
+
|
|
5
|
+
on: [push, pull_request]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v2
|
|
13
|
+
- name: Set up Ruby
|
|
14
|
+
uses: ruby/setup-ruby@v1
|
|
15
|
+
with:
|
|
16
|
+
ruby-version: 2.7
|
|
17
|
+
- name: Cache gems
|
|
18
|
+
uses: actions/cache@v1
|
|
19
|
+
with:
|
|
20
|
+
path: vendor/bundle
|
|
21
|
+
key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
|
|
22
|
+
restore-keys: |
|
|
23
|
+
${{ runner.os }}-rubocop-
|
|
24
|
+
- name: Install gems
|
|
25
|
+
run: |
|
|
26
|
+
bundle config path vendor/bundle
|
|
27
|
+
bundle install --jobs 4 --retry 3
|
|
28
|
+
- name: Run rubocop
|
|
29
|
+
run: bundle exec rubocop --lint
|
|
data/.github/workflows/tests.yml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on: [push, pull_request]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
build:
|
|
7
|
+
runs-on: ubuntu-latest
|
|
8
|
+
|
|
9
|
+
strategy:
|
|
10
|
+
matrix:
|
|
11
|
+
ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v2
|
|
15
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
|
16
|
+
uses: ruby/setup-ruby@v1
|
|
17
|
+
with:
|
|
18
|
+
ruby-version: ${{ matrix.ruby }}
|
|
19
|
+
- name: Install dependencies
|
|
20
|
+
run: bundle install --jobs 4
|
|
21
|
+
- name: Test with Rake
|
|
22
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
Exclude:
|
|
3
|
+
- '**/doc/*'
|
|
4
|
+
- '**/pkg/*'
|
|
5
|
+
- '**/spec/ruby-spec/**/*'
|
|
6
|
+
- '**/vendor/**/*' # vendored dependencies
|
|
7
|
+
NewCops: enable
|
|
8
|
+
RubyInterpreters:
|
|
9
|
+
- ruby
|
|
10
|
+
- rake
|
|
11
|
+
TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop
|
data/BENCHMARK.md
CHANGED
|
@@ -1,50 +1,86 @@
|
|
|
1
|
-
Results of `rake:benchmark` on ruby
|
|
1
|
+
Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
|
|
2
2
|
|
|
3
|
+
```
|
|
4
|
+
Counting non-letters
|
|
5
|
+
|
|
6
|
+
CharacterSet#count_in: 9472902.2 i/s
|
|
7
|
+
String#count: 2221799.9 i/s - 4.26x slower
|
|
8
|
+
```
|
|
3
9
|
```
|
|
4
10
|
Detecting non-whitespace
|
|
5
11
|
|
|
6
|
-
CharacterSet#cover?:
|
|
7
|
-
Regexp#match?:
|
|
12
|
+
CharacterSet#cover?: 12388427.2 i/s
|
|
13
|
+
Regexp#match?: 7901676.8 i/s - 1.57x slower
|
|
8
14
|
```
|
|
9
15
|
```
|
|
10
16
|
Detecting non-letters
|
|
11
17
|
|
|
12
|
-
CharacterSet#cover?:
|
|
13
|
-
Regexp#match?:
|
|
18
|
+
CharacterSet#cover?: 12263689.1 i/s
|
|
19
|
+
Regexp#match?: 4940889.9 i/s - 2.48x slower
|
|
14
20
|
```
|
|
15
21
|
```
|
|
16
22
|
Removing whitespace
|
|
17
23
|
|
|
18
|
-
CharacterSet#delete_in:
|
|
19
|
-
String#gsub:
|
|
24
|
+
CharacterSet#delete_in: 2406722.6 i/s
|
|
25
|
+
String#gsub: 235760.3 i/s - 10.21x slower
|
|
20
26
|
```
|
|
21
27
|
```
|
|
22
28
|
Removing whitespace, emoji and umlauts
|
|
23
29
|
|
|
24
|
-
CharacterSet#delete_in:
|
|
25
|
-
String#gsub:
|
|
30
|
+
CharacterSet#delete_in: 1653607.6 i/s
|
|
31
|
+
String#gsub: 272782.9 i/s - 6.06x slower
|
|
26
32
|
```
|
|
27
33
|
```
|
|
28
34
|
Removing non-whitespace
|
|
29
35
|
|
|
30
|
-
CharacterSet#keep_in:
|
|
31
|
-
String#gsub:
|
|
36
|
+
CharacterSet#keep_in: 2671038.2 i/s
|
|
37
|
+
String#gsub: 242551.0 i/s - 11.01x slower
|
|
32
38
|
```
|
|
33
39
|
```
|
|
34
40
|
Extracting emoji
|
|
35
41
|
|
|
36
|
-
CharacterSet#keep_in:
|
|
37
|
-
String#gsub:
|
|
42
|
+
CharacterSet#keep_in: 1726496.5 i/s
|
|
43
|
+
String#gsub: 215609.2 i/s - 8.01x slower
|
|
44
|
+
```
|
|
45
|
+
```
|
|
46
|
+
Extracting emoji to an Array
|
|
47
|
+
|
|
48
|
+
CharacterSet#scan: 2373856.1 i/s
|
|
49
|
+
String#scan: 480000.5 i/s - 4.95x slower
|
|
38
50
|
```
|
|
39
51
|
```
|
|
40
52
|
Detecting whitespace
|
|
41
53
|
|
|
42
|
-
CharacterSet#used_by?:
|
|
43
|
-
Regexp#match?:
|
|
54
|
+
CharacterSet#used_by?: 11988328.7 i/s
|
|
55
|
+
Regexp#match?: 6758146.8 i/s - 1.77x slower
|
|
44
56
|
```
|
|
45
57
|
```
|
|
46
58
|
Detecting emoji in a large string
|
|
47
59
|
|
|
48
|
-
CharacterSet#used_by?:
|
|
49
|
-
Regexp#match?:
|
|
60
|
+
CharacterSet#used_by?: 288223.3 i/s
|
|
61
|
+
Regexp#match?: 102384.2 i/s - 2.82x slower
|
|
62
|
+
```
|
|
63
|
+
```
|
|
64
|
+
Adding entries
|
|
65
|
+
|
|
66
|
+
CharacterSet#add: 2538251.2 i/s
|
|
67
|
+
SortedSet#add: 443925.9 i/s - 5.72x slower
|
|
68
|
+
```
|
|
69
|
+
```
|
|
70
|
+
Removing entries
|
|
71
|
+
|
|
72
|
+
CharacterSet#delete: 2487620.8 i/s
|
|
73
|
+
SortedSet#delete: 628816.1 i/s - 3.96x slower
|
|
74
|
+
```
|
|
75
|
+
```
|
|
76
|
+
Merging entries
|
|
77
|
+
|
|
78
|
+
CharacterSet#merge: 551.6 i/s
|
|
79
|
+
SortedSet#merge: 1.4 i/s - 393.59x slower
|
|
80
|
+
```
|
|
81
|
+
```
|
|
82
|
+
Getting the min and max
|
|
83
|
+
|
|
84
|
+
CharacterSet#minmax: 636890.7 i/s
|
|
85
|
+
SortedSet#minmax: 254.1 i/s - 2506.20x slower
|
|
50
86
|
```
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,53 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
+
## [1.4.1] - 2021-01-10
|
|
8
|
+
|
|
9
|
+
### Fixed
|
|
10
|
+
- multiple fixes for Ruby 3
|
|
11
|
+
- fixed segfault for some `String` manipulation cases
|
|
12
|
+
- added `sorted_set` as dependency, so `CharacterSet::Pure` (non-C fallback) works
|
|
13
|
+
- fixed error when parsing a `Regexp` with an empty intersection (e.g. `/[a&&]/`)
|
|
14
|
+
|
|
15
|
+
## [1.4.0] - 2019-06-07
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
|
|
19
|
+
- allows for much shorter astral plane representations e.g. in JavaScript
|
|
20
|
+
- thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
|
|
21
|
+
- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
|
|
22
|
+
|
|
23
|
+
### Fixed
|
|
24
|
+
- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
|
|
25
|
+
|
|
26
|
+
## [1.3.0] - 2019-04-26
|
|
27
|
+
|
|
28
|
+
### Added
|
|
29
|
+
- improved `String` manipulation speed
|
|
30
|
+
- improved initialization and `#merge` speed when passing a large `Range`
|
|
31
|
+
- reduced memory consumption by > 90% for most use cases via dynamic resizing
|
|
32
|
+
- before, every set instance required 136 KB for codepoints
|
|
33
|
+
- now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
|
|
34
|
+
- `#count_in` and `#scan_in` methods for `String` interaction
|
|
35
|
+
- new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
|
|
36
|
+
- conversion methods `#assigned_part`, `#valid_part`
|
|
37
|
+
- sectioning methods `#ascii_part`, `#plane(n)`
|
|
38
|
+
- section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
|
|
39
|
+
|
|
40
|
+
### Fixed
|
|
41
|
+
- `#count` now supports passing an argument or block as usual
|
|
42
|
+
- `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
|
|
43
|
+
|
|
44
|
+
## [1.2.0] - 2019-04-02
|
|
45
|
+
|
|
46
|
+
### Added
|
|
47
|
+
- added latest Unicode casefold data (for `#case_insensitive`)
|
|
48
|
+
|
|
49
|
+
## [1.1.2] - 2018-09-25
|
|
50
|
+
|
|
51
|
+
### Fixed
|
|
52
|
+
- restored `range_compressor` as a runtime dependency for JRuby only
|
|
53
|
+
|
|
7
54
|
## [1.1.1] - 2018-09-24
|
|
8
55
|
|
|
9
56
|
### Fixed
|
data/README.md
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
# CharacterSet
|
|
2
2
|
|
|
3
3
|
[](http://badge.fury.io/rb/character_set)
|
|
4
|
-
[](https://github.com/jaynetics/character_set/actions)
|
|
5
|
+
[](https://codecov.io/gh/jaynetics/character_set)
|
|
5
6
|
|
|
6
|
-
|
|
7
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
|
|
8
|
+
|
|
9
|
+
It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
|
|
7
10
|
|
|
8
11
|
Many parts can be used independently, e.g.:
|
|
9
12
|
- `CharacterSet::Character`
|
|
10
13
|
- `CharacterSet::Parser`
|
|
11
14
|
- `CharacterSet::Writer`
|
|
12
|
-
- [`RangeCompressor`](https://github.com/
|
|
15
|
+
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
|
13
16
|
|
|
14
17
|
## Usage
|
|
15
18
|
|
|
@@ -37,7 +40,7 @@ CharacterSet.parse('[a-c]')
|
|
|
37
40
|
CharacterSet.parse('\U00000061-\U00000063')
|
|
38
41
|
```
|
|
39
42
|
|
|
40
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/
|
|
43
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
|
41
44
|
|
|
42
45
|
```ruby
|
|
43
46
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
|
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
|
|
|
49
52
|
|
|
50
53
|
### Predefined utility sets
|
|
51
54
|
|
|
52
|
-
`ascii`, `ascii_alnum`, `
|
|
55
|
+
`ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
|
|
53
56
|
|
|
54
57
|
```ruby
|
|
55
58
|
CharacterSet.ascii # => #<CharacterSet (size: 128)>
|
|
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
|
|
|
60
63
|
|
|
61
64
|
### Interact with Strings
|
|
62
65
|
|
|
63
|
-
CharacterSet can replace some `
|
|
66
|
+
`CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
|
|
64
67
|
|
|
65
68
|
`#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
|
|
66
69
|
|
|
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
|
|
|
71
74
|
```
|
|
72
75
|
|
|
73
76
|
`#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
|
|
77
|
+
|
|
74
78
|
```ruby
|
|
75
79
|
string = 'Tüür'
|
|
76
80
|
|
|
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
|
|
|
84
88
|
string # => ''
|
|
85
89
|
```
|
|
86
90
|
|
|
91
|
+
`#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
CharacterSet.non_ascii.count_in('Tüür') # => 2
|
|
95
|
+
CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
|
|
96
|
+
```
|
|
97
|
+
|
|
87
98
|
There is also a core extension for String interaction.
|
|
88
99
|
```ruby
|
|
89
100
|
require 'character_set/core_ext/string_ext'
|
|
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
|
|
|
100
111
|
|
|
101
112
|
### Manipulate
|
|
102
113
|
|
|
103
|
-
Use any
|
|
114
|
+
Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
|
|
104
115
|
|
|
105
116
|
Where appropriate, methods take both chars and codepoints, e.g.:
|
|
106
117
|
|
|
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
|
|
|
122
133
|
|
|
123
134
|
# surrogate pair halves are not included by default
|
|
124
135
|
CharacterSet['a'].inversion(include_surrogates: true)
|
|
125
|
-
# => #<CharacterSet (size:
|
|
136
|
+
# => #<CharacterSet (size: 1114112)>
|
|
126
137
|
```
|
|
127
138
|
|
|
128
139
|
`#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
|
|
129
140
|
|
|
130
141
|
```ruby
|
|
131
|
-
CharacterSet['1', '
|
|
142
|
+
CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
132
143
|
```
|
|
133
144
|
|
|
134
145
|
### Write
|
|
@@ -156,20 +167,33 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
|
|
156
167
|
# disable abbreviation (grouping of codepoints in ranges)
|
|
157
168
|
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
|
158
169
|
|
|
159
|
-
#
|
|
160
|
-
|
|
161
|
-
|
|
170
|
+
# astral members require some trickery if we want to target environments
|
|
171
|
+
# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
|
|
172
|
+
set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
|
|
162
173
|
|
|
163
|
-
|
|
174
|
+
# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
|
|
175
|
+
set.to_s_with_surrogate_ranges
|
|
176
|
+
# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
|
|
177
|
+
|
|
178
|
+
# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
|
|
179
|
+
# don't work in your target environment:
|
|
180
|
+
set.to_s_with_surrogate_alternation
|
|
181
|
+
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
|
182
|
+
```
|
|
164
183
|
|
|
165
184
|
### Unicode plane methods
|
|
166
185
|
|
|
167
|
-
There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
|
186
|
+
There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
|
168
187
|
```Ruby
|
|
188
|
+
CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
|
|
189
|
+
CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
|
|
190
|
+
CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
|
|
191
|
+
CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
|
|
169
192
|
CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
|
|
170
193
|
CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
|
|
171
194
|
CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
|
|
172
195
|
CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
|
|
196
|
+
CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
|
|
173
197
|
CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
174
198
|
CharacterSet::Character.new('a').plane # => 0
|
|
175
199
|
```
|
data/Rakefile
CHANGED
|
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
|
|
|
7
7
|
|
|
8
8
|
task default: :spec
|
|
9
9
|
|
|
10
|
+
namespace :spec do
|
|
11
|
+
task :quick do
|
|
12
|
+
ENV['SKIP_MEMSAFETY_SPECS'] = '1'
|
|
13
|
+
Rake::Task[:spec].invoke
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
10
17
|
Rake::ExtensionTask.new('character_set') do |ext|
|
|
11
18
|
ext.lib_dir = 'lib/character_set'
|
|
12
19
|
end
|
|
@@ -16,6 +23,8 @@ namespace :java do
|
|
|
16
23
|
java_gemspec.platform = 'java'
|
|
17
24
|
java_gemspec.extensions = []
|
|
18
25
|
|
|
26
|
+
java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
|
|
27
|
+
|
|
19
28
|
Gem::PackageTask.new(java_gemspec) do |pkg|
|
|
20
29
|
pkg.need_zip = true
|
|
21
30
|
pkg.need_tar = true
|
|
@@ -33,43 +42,62 @@ task :sync_ruby_spec do
|
|
|
33
42
|
'CharacterSet' => './spec/ruby-spec/library/character_set',
|
|
34
43
|
'CharacterSet::Pure' => './spec/ruby-spec/library/character_set_pure',
|
|
35
44
|
}
|
|
45
|
+
|
|
46
|
+
# download fresh specs from ruby/spec repository
|
|
36
47
|
variants.each do |_, dir|
|
|
37
48
|
FileUtils.rm_rf(dir) if File.exist?(dir)
|
|
38
49
|
`svn export https://github.com/ruby/spec/trunk/library/set/sortedset #{dir}`
|
|
39
50
|
end
|
|
40
51
|
|
|
52
|
+
# make copies for each CharacterSet variant
|
|
41
53
|
base = variants.first[1]
|
|
42
54
|
variants.each_value { |dir| FileUtils.copy_entry(base, dir) unless dir == base }
|
|
43
55
|
|
|
44
|
-
|
|
56
|
+
# adapt specs to work with CharacterSet
|
|
57
|
+
variants.each do |class_name, dir|
|
|
45
58
|
Dir["#{dir}/**/*.rb"].each do |spec|
|
|
46
|
-
#
|
|
47
|
-
if spec =~ %r{/(flatten|initialize|pretty_print)}
|
|
59
|
+
# ignore some tests that do not apply or are covered otherwise
|
|
60
|
+
if spec =~ %r{/(classify|divide|flatten|initialize|pretty_print)}
|
|
48
61
|
File.delete(spec)
|
|
49
62
|
next
|
|
50
63
|
end
|
|
51
64
|
|
|
52
|
-
# some examples w. Strings must be adapted, "mspec" made rspec-compatible,
|
|
53
|
-
# and `i` added to shared example names or they'll override each other
|
|
54
65
|
adapted_content =
|
|
55
|
-
File
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
66
|
+
File.read(spec).
|
|
67
|
+
# adapt class name
|
|
68
|
+
gsub('SortedSet', (spec['/shared/'] ? 'variant' : class_name)).
|
|
69
|
+
gsub(/(it_behaves_like :[^,\n]+), (:[^,\n]+)/, "\\1, #{class_name}, \\2").
|
|
70
|
+
# get shared specs from a single shared dir at the parent level
|
|
71
|
+
gsub(/(require_relative ['"])(shared\/)/, '\1../\2').
|
|
72
|
+
# make 'mspec' syntax rspec-compatible
|
|
73
|
+
gsub(/describe (.*), shared.*$/, 'shared_examples \1 do |variant, method|').
|
|
74
|
+
gsub(/be_(false|true)/, 'be \1').
|
|
75
|
+
gsub('stub!', 'stub').
|
|
76
|
+
gsub('mock', 'double').
|
|
77
|
+
gsub('@method', 'method').
|
|
78
|
+
# remove unneeded requires
|
|
79
|
+
gsub(/require 'set'\n/, '').
|
|
80
|
+
gsub(/require.*spec_helper.*\n/, '').
|
|
81
|
+
gsub(/\A\n+/, '').
|
|
82
|
+
# make examples use Integers/codepoints
|
|
83
|
+
gsub(/1\.0|"cat"|"dog"|"hello"|"test"/, '0').
|
|
84
|
+
gsub('"one"', '1').
|
|
85
|
+
gsub('"two"', '2').
|
|
86
|
+
gsub('"three"', '3').
|
|
87
|
+
gsub('"four"', '4').
|
|
88
|
+
gsub('"five"', '5').
|
|
89
|
+
gsub(/x.(size|length) == 3/, 'x != 3').
|
|
90
|
+
gsub(/x.(size|length) != 3/, 'x == 3').
|
|
91
|
+
gsub(/(add)\(\d\)(\.to_a \}.should raise)/, '\1(:foo)\2')
|
|
69
92
|
|
|
70
93
|
File.open(spec, 'w') { |f| f.puts adapted_content }
|
|
71
94
|
end
|
|
72
95
|
end
|
|
96
|
+
|
|
97
|
+
# keep only one copy of the shared specs, at the parent level
|
|
98
|
+
FileUtils.rm_rf(base + '/../shared')
|
|
99
|
+
FileUtils.mv(base + '/shared', base + '/../')
|
|
100
|
+
variants.each_value { |dir| FileUtils.rm_rf(dir + '/shared') }
|
|
73
101
|
end
|
|
74
102
|
|
|
75
103
|
desc 'Download unicode casefold data and write new C header file'
|
|
@@ -85,26 +113,22 @@ task :sync_casefold_data do
|
|
|
85
113
|
hash[from] = to if type == 'C'
|
|
86
114
|
end.sort
|
|
87
115
|
|
|
88
|
-
File.
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
typedef struct casefold_mapping {
|
|
93
|
-
unsigned long from;
|
|
94
|
-
unsigned long to;
|
|
95
|
-
} casefold_mapping;
|
|
116
|
+
content = File.read(dst_path + '.tmpl')
|
|
117
|
+
.sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
|
|
118
|
+
.sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
|
|
96
119
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
C
|
|
101
|
-
|
|
102
|
-
mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
|
|
120
|
+
File.write(dst_path, content)
|
|
121
|
+
File.unlink(src_path)
|
|
122
|
+
end
|
|
103
123
|
|
|
104
|
-
|
|
124
|
+
desc 'Update codepoint data for predefined sets, based on Onigmo'
|
|
125
|
+
task :sync_predefined_sets do
|
|
126
|
+
%w[assigned emoji whitespace].each do |prop|
|
|
127
|
+
require 'regexp_property_values'
|
|
128
|
+
ranges = RegexpPropertyValues[prop].matched_ranges
|
|
129
|
+
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
|
130
|
+
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
|
105
131
|
end
|
|
106
|
-
|
|
107
|
-
File.unlink(src_path)
|
|
108
132
|
end
|
|
109
133
|
|
|
110
134
|
desc 'Run all IPS benchmarks'
|