character_set 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/tests.yml +6 -2
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +15 -0
- data/Gemfile +14 -0
- data/README.md +8 -5
- data/Rakefile +5 -2
- data/benchmarks/delete_in.rb +5 -1
- data/benchmarks/keep_in.rb +5 -1
- data/character_set.gemspec +0 -14
- data/ext/character_set/character_set.c +60 -96
- data/lib/character_set/core_ext/string_ext.rb +1 -1
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +3 -7
- data/lib/character_set/shared_methods.rb +6 -0
- data/lib/character_set/version.rb +1 -1
- metadata +3 -157
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
|
|
4
|
+
data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
|
|
7
|
+
data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
|
data/.gitattributes
CHANGED
data/.github/workflows/tests.yml
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
name: tests
|
|
2
2
|
|
|
3
|
-
on:
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
schedule:
|
|
7
|
+
- cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
|
|
4
8
|
|
|
5
9
|
jobs:
|
|
6
10
|
build:
|
|
@@ -8,7 +12,7 @@ jobs:
|
|
|
8
12
|
|
|
9
13
|
strategy:
|
|
10
14
|
matrix:
|
|
11
|
-
ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
|
|
15
|
+
ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
|
|
12
16
|
|
|
13
17
|
steps:
|
|
14
18
|
- uses: actions/checkout@v2
|
data/BENCHMARK.md
CHANGED
|
@@ -1,86 +1,90 @@
|
|
|
1
|
-
Results of `rake:benchmark` on ruby 3.
|
|
1
|
+
Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
|
|
2
2
|
|
|
3
3
|
```
|
|
4
4
|
Counting non-letters
|
|
5
5
|
|
|
6
|
-
CharacterSet#count_in:
|
|
7
|
-
String#count:
|
|
6
|
+
CharacterSet#count_in: 14794607.9 i/s
|
|
7
|
+
String#count: 3875939.3 i/s - 3.82x slower
|
|
8
8
|
```
|
|
9
9
|
```
|
|
10
10
|
Detecting non-whitespace
|
|
11
11
|
|
|
12
|
-
CharacterSet#cover?:
|
|
13
|
-
Regexp#match?:
|
|
12
|
+
CharacterSet#cover?: 17448329.0 i/s
|
|
13
|
+
Regexp#match?: 13089358.1 i/s - 1.33x slower
|
|
14
14
|
```
|
|
15
15
|
```
|
|
16
16
|
Detecting non-letters
|
|
17
17
|
|
|
18
|
-
CharacterSet#cover?:
|
|
19
|
-
Regexp#match?:
|
|
18
|
+
CharacterSet#cover?: 17565596.9 i/s
|
|
19
|
+
Regexp#match?: 7951108.0 i/s - 2.21x slower
|
|
20
20
|
```
|
|
21
21
|
```
|
|
22
|
-
Removing whitespace
|
|
22
|
+
Removing ASCII whitespace
|
|
23
23
|
|
|
24
|
-
CharacterSet#delete_in:
|
|
25
|
-
|
|
24
|
+
CharacterSet#delete_in: 6306078.2 i/s
|
|
25
|
+
String#tr: 4734401.0 i/s - 1.33x slower
|
|
26
|
+
String#gsub: 211631.8 i/s - 29.80x slower
|
|
26
27
|
```
|
|
27
28
|
```
|
|
28
29
|
Removing whitespace, emoji and umlauts
|
|
29
30
|
|
|
30
|
-
CharacterSet#delete_in:
|
|
31
|
-
|
|
31
|
+
CharacterSet#delete_in: 5984149.6 i/s
|
|
32
|
+
String#tr: 363643.1 i/s - 16.46x slower
|
|
33
|
+
String#gsub: 317201.7 i/s - 18.87x slower
|
|
32
34
|
```
|
|
33
35
|
```
|
|
34
36
|
Removing non-whitespace
|
|
35
37
|
|
|
36
|
-
CharacterSet#keep_in:
|
|
37
|
-
String#gsub:
|
|
38
|
+
CharacterSet#keep_in: 7650925.6 i/s
|
|
39
|
+
String#gsub: 207374.6 i/s - 36.89x slower
|
|
40
|
+
String#tr: 12.3 i/s - 619745.60x slower
|
|
38
41
|
```
|
|
39
42
|
```
|
|
40
|
-
|
|
43
|
+
Keeping only emoji
|
|
41
44
|
|
|
42
|
-
CharacterSet#keep_in:
|
|
43
|
-
String#gsub:
|
|
45
|
+
CharacterSet#keep_in: 7272940.1 i/s
|
|
46
|
+
String#gsub: 177993.8 i/s - 40.86x slower
|
|
47
|
+
String#tr: 12.3 i/s - 590222.71x slower
|
|
44
48
|
```
|
|
45
49
|
```
|
|
46
50
|
Extracting emoji to an Array
|
|
47
51
|
|
|
48
|
-
CharacterSet#scan:
|
|
49
|
-
String#scan:
|
|
52
|
+
CharacterSet#scan: 2978285.0 i/s
|
|
53
|
+
String#scan: 865793.8 i/s - 3.44x slower
|
|
50
54
|
```
|
|
51
55
|
```
|
|
52
56
|
Detecting whitespace
|
|
53
57
|
|
|
54
|
-
CharacterSet#used_by?:
|
|
55
|
-
Regexp#match?:
|
|
58
|
+
CharacterSet#used_by?: 17292338.4 i/s
|
|
59
|
+
Regexp#match?: 11705563.9 i/s - 1.48x slower
|
|
56
60
|
```
|
|
57
61
|
```
|
|
58
62
|
Detecting emoji in a large string
|
|
59
63
|
|
|
60
|
-
CharacterSet#used_by?:
|
|
61
|
-
Regexp#match?:
|
|
64
|
+
CharacterSet#used_by?: 340444.1 i/s
|
|
65
|
+
Regexp#match?: 180549.8 i/s - 1.89x slower
|
|
62
66
|
```
|
|
63
67
|
```
|
|
64
68
|
Adding entries
|
|
65
69
|
|
|
66
|
-
CharacterSet#add:
|
|
67
|
-
SortedSet#add:
|
|
70
|
+
CharacterSet#add: 4951781.4 i/s
|
|
71
|
+
SortedSet#add: 1019637.9 i/s - 4.86x slower
|
|
68
72
|
```
|
|
69
73
|
```
|
|
70
74
|
Removing entries
|
|
71
75
|
|
|
72
|
-
CharacterSet#delete:
|
|
73
|
-
SortedSet#delete:
|
|
76
|
+
CharacterSet#delete: 5006337.6 i/s
|
|
77
|
+
SortedSet#delete: 3922752.2 i/s - same-ish
|
|
74
78
|
```
|
|
75
79
|
```
|
|
76
80
|
Merging entries
|
|
77
81
|
|
|
78
|
-
CharacterSet#merge:
|
|
79
|
-
SortedSet#merge:
|
|
82
|
+
CharacterSet#merge: 661.8 i/s
|
|
83
|
+
SortedSet#merge: 3.9 i/s - 167.82x slower
|
|
80
84
|
```
|
|
81
85
|
```
|
|
82
86
|
Getting the min and max
|
|
83
87
|
|
|
84
|
-
CharacterSet#minmax:
|
|
85
|
-
SortedSet#minmax:
|
|
88
|
+
CharacterSet#minmax: 1212462.2 i/s
|
|
89
|
+
SortedSet#minmax: 844.4 i/s - 1435.93x slower
|
|
86
90
|
```
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,21 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
+
## [1.6.0] - 2022-02-16
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- `::of` now supports both `String` and `Regexp` arguments
|
|
12
|
+
|
|
13
|
+
### Fixed
|
|
14
|
+
|
|
15
|
+
- fixed segfault during `String` manipulation on Ruby 3.2.0-dev
|
|
16
|
+
- improved performance for `String` manipulation
|
|
17
|
+
- allow usage in Ractors
|
|
18
|
+
- predefined sets must be pre-initialized for this, though
|
|
19
|
+
- e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
|
|
20
|
+
- call them once in the main Ractor to trigger initialization
|
|
21
|
+
|
|
7
22
|
## [1.5.0] - 2021-12-05
|
|
8
23
|
|
|
9
24
|
### Added
|
data/Gemfile
CHANGED
|
@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
|
|
4
4
|
|
|
5
5
|
# Specify your gem's dependencies in character_set.gemspec
|
|
6
6
|
gemspec
|
|
7
|
+
|
|
8
|
+
gem 'benchmark-ips', '~> 2.7'
|
|
9
|
+
gem 'get_process_mem', '~> 0.2.3'
|
|
10
|
+
gem 'rake', '~> 13.0'
|
|
11
|
+
gem 'rake-compiler', '~> 1.1'
|
|
12
|
+
gem 'range_compressor', '~> 1.0'
|
|
13
|
+
gem 'regexp_parser', '~> 2.1'
|
|
14
|
+
gem 'regexp_property_values', '~> 1.0'
|
|
15
|
+
gem 'rspec', '~> 3.8'
|
|
16
|
+
if RUBY_VERSION.to_f >= 2.7
|
|
17
|
+
gem 'codecov', '~> 0.2.12'
|
|
18
|
+
gem 'gouteur', '~> 1.0.0'
|
|
19
|
+
gem 'rubocop', '~> 1.8'
|
|
20
|
+
end
|
data/README.md
CHANGED
|
@@ -5,16 +5,17 @@
|
|
|
5
5
|
[](https://github.com/jaynetics/character_set/actions)
|
|
6
6
|
[](https://codecov.io/gh/jaynetics/character_set)
|
|
7
7
|
|
|
8
|
-
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
|
8
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
|
9
9
|
|
|
10
|
-
It
|
|
10
|
+
It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
|
|
11
|
+
|
|
12
|
+
It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
|
|
11
13
|
|
|
12
14
|
Many parts can be used independently, e.g.:
|
|
13
15
|
- `CharacterSet::Character`
|
|
14
16
|
- `CharacterSet::ExpressionConverter`
|
|
15
17
|
- `CharacterSet::Parser`
|
|
16
18
|
- `CharacterSet::Writer`
|
|
17
|
-
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
|
18
19
|
|
|
19
20
|
## Usage
|
|
20
21
|
|
|
@@ -42,9 +43,10 @@ CharacterSet.parse('[a-c]')
|
|
|
42
43
|
CharacterSet.parse('\U00000061-\U00000063')
|
|
43
44
|
```
|
|
44
45
|
|
|
45
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed,
|
|
46
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
|
46
47
|
|
|
47
48
|
```ruby
|
|
49
|
+
CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
|
|
48
50
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
|
49
51
|
|
|
50
52
|
require 'character_set/core_ext/regexp_ext'
|
|
@@ -145,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
|
145
147
|
```
|
|
146
148
|
|
|
147
149
|
### Write
|
|
150
|
+
|
|
148
151
|
```ruby
|
|
149
152
|
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
|
150
153
|
|
|
@@ -211,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
|
211
214
|
CharacterSet::Character.new('a').plane # => 0
|
|
212
215
|
```
|
|
213
216
|
|
|
214
|
-
|
|
217
|
+
## Contributions
|
|
215
218
|
|
|
216
219
|
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
CHANGED
|
@@ -147,8 +147,11 @@ namespace :benchmark do
|
|
|
147
147
|
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
|
148
148
|
|
|
149
149
|
$store_comparison_results.each do |caption, result|
|
|
150
|
-
f.puts '```',
|
|
151
|
-
|
|
150
|
+
f.puts '```',
|
|
151
|
+
caption,
|
|
152
|
+
'',
|
|
153
|
+
result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
|
|
154
|
+
'```'
|
|
152
155
|
end
|
|
153
156
|
end
|
|
154
157
|
end
|
data/benchmarks/delete_in.rb
CHANGED
|
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
|
2
2
|
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
|
4
4
|
rx = /\s/
|
|
5
|
+
trt = "\t\n\v\f\r\s"
|
|
5
6
|
cs = CharacterSet.whitespace
|
|
6
7
|
|
|
7
8
|
benchmark(
|
|
8
|
-
caption: 'Removing whitespace',
|
|
9
|
+
caption: 'Removing ASCII whitespace',
|
|
9
10
|
cases: {
|
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
|
11
13
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
|
12
14
|
}
|
|
13
15
|
)
|
|
14
16
|
|
|
15
17
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
|
16
18
|
rx = /[\s\p{emoji}äüö]/
|
|
19
|
+
trt = "\t\n\v\f\r\s😀-🙏äüö"
|
|
17
20
|
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
|
18
21
|
|
|
19
22
|
benchmark(
|
|
20
23
|
caption: 'Removing whitespace, emoji and umlauts',
|
|
21
24
|
cases: {
|
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
|
23
27
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
|
24
28
|
}
|
|
25
29
|
)
|
data/benchmarks/keep_in.rb
CHANGED
|
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
|
2
2
|
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
|
4
4
|
rx = /\S/
|
|
5
|
+
trt = "\u{0080}-\u{10FFFF}" # approximation
|
|
5
6
|
cs = CharacterSet.whitespace
|
|
6
7
|
|
|
7
8
|
benchmark(
|
|
8
9
|
caption: 'Removing non-whitespace',
|
|
9
10
|
cases: {
|
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
|
11
13
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
|
12
14
|
}
|
|
13
15
|
)
|
|
14
16
|
|
|
15
17
|
str = 'Lorem ipsum ⛷ et dolorem'
|
|
16
18
|
rx = /\p{^emoji}/
|
|
19
|
+
trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}"
|
|
17
20
|
cs = CharacterSet.emoji
|
|
18
21
|
|
|
19
22
|
benchmark(
|
|
20
|
-
caption: '
|
|
23
|
+
caption: 'Keeping only emoji',
|
|
21
24
|
cases: {
|
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
|
23
27
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
|
24
28
|
}
|
|
25
29
|
)
|
data/character_set.gemspec
CHANGED
|
@@ -28,18 +28,4 @@ Gem::Specification.new do |s|
|
|
|
28
28
|
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
|
29
29
|
s.add_dependency 'sorted_set', '~> 1.0'
|
|
30
30
|
end
|
|
31
|
-
|
|
32
|
-
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
|
33
|
-
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
|
34
|
-
s.add_development_dependency 'rake', '~> 13.0'
|
|
35
|
-
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
|
36
|
-
s.add_development_dependency 'range_compressor', '~> 1.0'
|
|
37
|
-
s.add_development_dependency 'regexp_parser', '~> 2.1'
|
|
38
|
-
s.add_development_dependency 'regexp_property_values', '~> 1.0'
|
|
39
|
-
s.add_development_dependency 'rspec', '~> 3.8'
|
|
40
|
-
if RUBY_VERSION.to_f >= 2.7
|
|
41
|
-
s.add_development_dependency 'codecov', '~> 0.2.12'
|
|
42
|
-
s.add_development_dependency 'gouteur', '~> 1.0.0'
|
|
43
|
-
s.add_development_dependency 'rubocop', '~> 1.8'
|
|
44
|
-
end
|
|
45
31
|
end
|
|
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
|
|
|
82
82
|
.dsize = cs_memsize,
|
|
83
83
|
},
|
|
84
84
|
.data = NULL,
|
|
85
|
+
#ifdef RUBY_TYPED_FROZEN_SHAREABLE
|
|
86
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
|
|
87
|
+
#else
|
|
85
88
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
|
89
|
+
#endif
|
|
86
90
|
};
|
|
87
91
|
|
|
88
92
|
static inline VALUE
|
|
@@ -315,9 +319,9 @@ cs_method_minmax(VALUE self)
|
|
|
315
319
|
cs_cp cp, alen, blen; \
|
|
316
320
|
cs_ar *acps, *bcps; \
|
|
317
321
|
struct cs_data *new_data; \
|
|
318
|
-
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
|
319
322
|
acps = cs_fetch_cps(cs_a, &alen); \
|
|
320
323
|
bcps = cs_fetch_cps(cs_b, &blen); \
|
|
324
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
|
321
325
|
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
|
322
326
|
{ \
|
|
323
327
|
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
|
@@ -705,8 +709,7 @@ cs_method_ranges(VALUE self)
|
|
|
705
709
|
|
|
706
710
|
if (!previous_cp_num) {
|
|
707
711
|
current_start = cp_num;
|
|
708
|
-
} else if (previous_cp_num + 2 != cp_num)
|
|
709
|
-
{
|
|
712
|
+
} else if (previous_cp_num + 2 != cp_num) {
|
|
710
713
|
// gap found, finalize previous range
|
|
711
714
|
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
|
712
715
|
current_start = cp_num;
|
|
@@ -1047,17 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
|
|
|
1047
1050
|
}
|
|
1048
1051
|
|
|
1049
1052
|
static VALUE
|
|
1050
|
-
|
|
1053
|
+
cs_class_method_of_string(VALUE self, VALUE string)
|
|
1051
1054
|
{
|
|
1052
1055
|
VALUE new_cs;
|
|
1053
1056
|
struct cs_data *new_data;
|
|
1054
|
-
|
|
1057
|
+
|
|
1058
|
+
raise_arg_err_unless_string(string);
|
|
1055
1059
|
new_cs = cs_alloc(self, &new_data);
|
|
1056
|
-
|
|
1057
|
-
{
|
|
1058
|
-
raise_arg_err_unless_string(argv[i]);
|
|
1059
|
-
each_cp(argv[i], add_str_cp_to_arr, 0, 0, new_data, 0);
|
|
1060
|
-
}
|
|
1060
|
+
each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
|
|
1061
1061
|
return new_cs;
|
|
1062
1062
|
}
|
|
1063
1063
|
|
|
@@ -1138,116 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
|
|
|
1138
1138
|
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
|
1139
1139
|
}
|
|
1140
1140
|
|
|
1141
|
-
|
|
1142
|
-
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
|
1143
|
-
{
|
|
1144
|
-
long total, olen;
|
|
1145
|
-
char *sptr;
|
|
1146
|
-
|
|
1147
|
-
RSTRING_GETMEM(str, sptr, olen);
|
|
1148
|
-
sptr = RSTRING(str)->as.heap.ptr;
|
|
1149
|
-
olen = RSTRING(str)->as.heap.len;
|
|
1150
|
-
total = olen + len;
|
|
1151
|
-
memcpy(sptr + olen, ptr, len);
|
|
1152
|
-
RSTRING(str)->as.heap.len = total;
|
|
1153
|
-
}
|
|
1154
|
-
|
|
1155
|
-
#ifndef TERM_FILL
|
|
1156
|
-
#define TERM_FILL(ptr, termlen) \
|
|
1157
|
-
do \
|
|
1158
|
-
{ \
|
|
1159
|
-
char *const term_fill_ptr = (ptr); \
|
|
1160
|
-
const int term_fill_len = (termlen); \
|
|
1161
|
-
*term_fill_ptr = '\0'; \
|
|
1162
|
-
if (__builtin_expect(!!(term_fill_len > 1), 0)) \
|
|
1163
|
-
memset(term_fill_ptr, 0, term_fill_len); \
|
|
1164
|
-
} while (0)
|
|
1165
|
-
#endif
|
|
1166
|
-
|
|
1167
|
-
static void
|
|
1168
|
-
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
|
1169
|
-
{
|
|
1170
|
-
char *ptr;
|
|
1171
|
-
long len;
|
|
1172
|
-
|
|
1173
|
-
ptr = RSTRING(str)->as.heap.ptr;
|
|
1174
|
-
len = RSTRING(str)->as.heap.len;
|
|
1175
|
-
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
|
1176
|
-
}
|
|
1177
|
-
|
|
1141
|
+
// partially based on rb_str_delete_bang
|
|
1178
1142
|
static inline VALUE
|
|
1179
1143
|
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
|
1180
1144
|
{
|
|
1181
1145
|
cs_ar *cps;
|
|
1182
|
-
cs_cp
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1146
|
+
cs_cp cs_len;
|
|
1147
|
+
VALUE orig_str_len;
|
|
1148
|
+
|
|
1149
|
+
rb_encoding *enc;
|
|
1150
|
+
char *s, *send, *t;
|
|
1151
|
+
int ascompat, cr;
|
|
1188
1152
|
|
|
1189
1153
|
raise_arg_err_unless_string(str);
|
|
1190
1154
|
|
|
1191
|
-
|
|
1155
|
+
orig_str_len = RSTRING_LEN(str);
|
|
1192
1156
|
|
|
1193
|
-
|
|
1194
|
-
if (orig_len < 1) // empty string, will never change
|
|
1157
|
+
if (orig_str_len == 0)
|
|
1195
1158
|
{
|
|
1196
|
-
|
|
1197
|
-
{
|
|
1198
|
-
return Qnil;
|
|
1199
|
-
}
|
|
1200
|
-
return rb_str_dup(str);
|
|
1159
|
+
return bang ? Qnil : str;
|
|
1201
1160
|
}
|
|
1202
1161
|
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
|
1208
|
-
|
|
1209
|
-
ptr = RSTRING_PTR(str);
|
|
1210
|
-
end = RSTRING_END(str);
|
|
1162
|
+
if (!bang)
|
|
1163
|
+
{
|
|
1164
|
+
str = rb_str_dup(str);
|
|
1165
|
+
}
|
|
1211
1166
|
|
|
1212
|
-
|
|
1167
|
+
cps = cs_fetch_cps(set, &cs_len);
|
|
1168
|
+
rb_str_modify(str);
|
|
1169
|
+
enc = rb_enc_get(str);
|
|
1170
|
+
ascompat = rb_enc_asciicompat(enc);
|
|
1171
|
+
s = t = RSTRING_PTR(str);
|
|
1172
|
+
send = RSTRING_END(str);
|
|
1173
|
+
cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
|
|
1174
|
+
while (s < send)
|
|
1213
1175
|
{
|
|
1214
|
-
|
|
1176
|
+
unsigned int c;
|
|
1177
|
+
int clen;
|
|
1178
|
+
|
|
1179
|
+
if (ascompat && (c = *(unsigned char *)s) < 0x80)
|
|
1215
1180
|
{
|
|
1216
|
-
|
|
1217
|
-
if ((!tst_cp(cps, len, str_cp)) == delete)
|
|
1181
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
|
1218
1182
|
{
|
|
1219
|
-
|
|
1183
|
+
if (t != s)
|
|
1184
|
+
*t = c;
|
|
1185
|
+
t++;
|
|
1220
1186
|
}
|
|
1221
|
-
|
|
1187
|
+
s++;
|
|
1222
1188
|
}
|
|
1223
|
-
|
|
1224
|
-
else // likely to be multibyte string
|
|
1225
|
-
{
|
|
1226
|
-
while (ptr < end)
|
|
1189
|
+
else
|
|
1227
1190
|
{
|
|
1228
|
-
|
|
1229
|
-
|
|
1191
|
+
c = rb_enc_codepoint_len(s, send, &clen, enc);
|
|
1192
|
+
|
|
1193
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
|
1230
1194
|
{
|
|
1231
|
-
|
|
1195
|
+
if (t != s)
|
|
1196
|
+
rb_enc_mbcput(c, t, enc);
|
|
1197
|
+
t += clen;
|
|
1198
|
+
if (cr == ENC_CODERANGE_7BIT)
|
|
1199
|
+
cr = ENC_CODERANGE_VALID;
|
|
1232
1200
|
}
|
|
1233
|
-
|
|
1201
|
+
s += clen;
|
|
1234
1202
|
}
|
|
1235
1203
|
}
|
|
1236
1204
|
|
|
1237
|
-
|
|
1205
|
+
rb_str_set_len(str, t - RSTRING_PTR(str));
|
|
1206
|
+
ENC_CODERANGE_SET(str, cr);
|
|
1238
1207
|
|
|
1239
|
-
if (bang)
|
|
1208
|
+
if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
|
|
1240
1209
|
{
|
|
1241
|
-
|
|
1242
|
-
{
|
|
1243
|
-
return Qnil;
|
|
1244
|
-
}
|
|
1245
|
-
rb_str_shared_replace(str, new_str_buf);
|
|
1246
|
-
}
|
|
1247
|
-
else
|
|
1248
|
-
{
|
|
1249
|
-
RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
|
|
1250
|
-
str = new_str_buf;
|
|
1210
|
+
return Qnil;
|
|
1251
1211
|
}
|
|
1252
1212
|
|
|
1253
1213
|
return str;
|
|
@@ -1289,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
|
|
|
1289
1249
|
|
|
1290
1250
|
void Init_character_set()
|
|
1291
1251
|
{
|
|
1252
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
|
1253
|
+
rb_ext_ractor_safe(true);
|
|
1254
|
+
#endif
|
|
1255
|
+
|
|
1292
1256
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
|
1293
1257
|
|
|
1294
1258
|
rb_define_alloc_func(cs, cs_method_allocate);
|
|
@@ -1343,7 +1307,7 @@ void Init_character_set()
|
|
|
1343
1307
|
// `CharacterSet`-specific methods
|
|
1344
1308
|
|
|
1345
1309
|
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
|
1346
|
-
rb_define_singleton_method(cs, "
|
|
1310
|
+
rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
|
|
1347
1311
|
|
|
1348
1312
|
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
|
1349
1313
|
rb_define_method(cs, "sample", cs_method_sample, -1);
|
|
@@ -22,6 +22,17 @@ class CharacterSet
|
|
|
22
22
|
alias valid unicode
|
|
23
23
|
|
|
24
24
|
def build_from_cps_file(path)
|
|
25
|
+
if defined?(Ractor) && Ractor.current != Ractor.main
|
|
26
|
+
raise <<-EOS.gsub(/^ */, '')
|
|
27
|
+
CharacterSet's predefined sets are lazy-loaded.
|
|
28
|
+
Pre-load them to use them in Ractors. E.g.:
|
|
29
|
+
|
|
30
|
+
CharacterSet.ascii # pre-load
|
|
31
|
+
Ractor.new { CharacterSet.ascii.size }.take # => 128
|
|
32
|
+
Ractor.new { 'abc'.keep_character_set(:ascii) }.take # => 'abc'
|
|
33
|
+
EOS
|
|
34
|
+
end
|
|
35
|
+
|
|
25
36
|
File.readlines(path).inject(new) do |set, line|
|
|
26
37
|
range_start, range_end = line.split(',')
|
|
27
38
|
set.merge((range_start.to_i(16))..(range_end.to_i(16)))
|
|
@@ -6,13 +6,9 @@ class CharacterSet
|
|
|
6
6
|
new(Array(ranges).flat_map(&:to_a))
|
|
7
7
|
end
|
|
8
8
|
|
|
9
|
-
def
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
|
|
13
|
-
str.codepoints.each { |cp| new_set << cp }
|
|
14
|
-
end
|
|
15
|
-
new_set
|
|
9
|
+
def of_string(str)
|
|
10
|
+
raise ArgumentError, 'pass a String' unless str.respond_to?(:codepoints)
|
|
11
|
+
str.codepoints.each_with_object(new) { |cp, set| set << cp }
|
|
16
12
|
end
|
|
17
13
|
end
|
|
18
14
|
|
|
@@ -15,6 +15,12 @@ class CharacterSet
|
|
|
15
15
|
new(Array(args))
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
+
def of(*args)
|
|
19
|
+
args.map do |arg|
|
|
20
|
+
arg.is_a?(Regexp) ? of_regexp(arg) : of_string(arg)
|
|
21
|
+
end.reduce(:merge) || new
|
|
22
|
+
end
|
|
23
|
+
|
|
18
24
|
def parse(string)
|
|
19
25
|
codepoints = Parser.codepoints_from_bracket_expression(string)
|
|
20
26
|
result = new(codepoints)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: character_set
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.
|
|
4
|
+
version: 1.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Janosch Müller
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2022-02-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: sorted_set
|
|
@@ -24,160 +24,6 @@ dependencies:
|
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '1.0'
|
|
27
|
-
- !ruby/object:Gem::Dependency
|
|
28
|
-
name: benchmark-ips
|
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
|
30
|
-
requirements:
|
|
31
|
-
- - "~>"
|
|
32
|
-
- !ruby/object:Gem::Version
|
|
33
|
-
version: '2.7'
|
|
34
|
-
type: :development
|
|
35
|
-
prerelease: false
|
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
-
requirements:
|
|
38
|
-
- - "~>"
|
|
39
|
-
- !ruby/object:Gem::Version
|
|
40
|
-
version: '2.7'
|
|
41
|
-
- !ruby/object:Gem::Dependency
|
|
42
|
-
name: get_process_mem
|
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
|
44
|
-
requirements:
|
|
45
|
-
- - "~>"
|
|
46
|
-
- !ruby/object:Gem::Version
|
|
47
|
-
version: 0.2.3
|
|
48
|
-
type: :development
|
|
49
|
-
prerelease: false
|
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
-
requirements:
|
|
52
|
-
- - "~>"
|
|
53
|
-
- !ruby/object:Gem::Version
|
|
54
|
-
version: 0.2.3
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: rake
|
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
|
58
|
-
requirements:
|
|
59
|
-
- - "~>"
|
|
60
|
-
- !ruby/object:Gem::Version
|
|
61
|
-
version: '13.0'
|
|
62
|
-
type: :development
|
|
63
|
-
prerelease: false
|
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
-
requirements:
|
|
66
|
-
- - "~>"
|
|
67
|
-
- !ruby/object:Gem::Version
|
|
68
|
-
version: '13.0'
|
|
69
|
-
- !ruby/object:Gem::Dependency
|
|
70
|
-
name: rake-compiler
|
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
|
72
|
-
requirements:
|
|
73
|
-
- - "~>"
|
|
74
|
-
- !ruby/object:Gem::Version
|
|
75
|
-
version: '1.1'
|
|
76
|
-
type: :development
|
|
77
|
-
prerelease: false
|
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
-
requirements:
|
|
80
|
-
- - "~>"
|
|
81
|
-
- !ruby/object:Gem::Version
|
|
82
|
-
version: '1.1'
|
|
83
|
-
- !ruby/object:Gem::Dependency
|
|
84
|
-
name: range_compressor
|
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
|
86
|
-
requirements:
|
|
87
|
-
- - "~>"
|
|
88
|
-
- !ruby/object:Gem::Version
|
|
89
|
-
version: '1.0'
|
|
90
|
-
type: :development
|
|
91
|
-
prerelease: false
|
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
-
requirements:
|
|
94
|
-
- - "~>"
|
|
95
|
-
- !ruby/object:Gem::Version
|
|
96
|
-
version: '1.0'
|
|
97
|
-
- !ruby/object:Gem::Dependency
|
|
98
|
-
name: regexp_parser
|
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
|
100
|
-
requirements:
|
|
101
|
-
- - "~>"
|
|
102
|
-
- !ruby/object:Gem::Version
|
|
103
|
-
version: '2.1'
|
|
104
|
-
type: :development
|
|
105
|
-
prerelease: false
|
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
-
requirements:
|
|
108
|
-
- - "~>"
|
|
109
|
-
- !ruby/object:Gem::Version
|
|
110
|
-
version: '2.1'
|
|
111
|
-
- !ruby/object:Gem::Dependency
|
|
112
|
-
name: regexp_property_values
|
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
|
114
|
-
requirements:
|
|
115
|
-
- - "~>"
|
|
116
|
-
- !ruby/object:Gem::Version
|
|
117
|
-
version: '1.0'
|
|
118
|
-
type: :development
|
|
119
|
-
prerelease: false
|
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
-
requirements:
|
|
122
|
-
- - "~>"
|
|
123
|
-
- !ruby/object:Gem::Version
|
|
124
|
-
version: '1.0'
|
|
125
|
-
- !ruby/object:Gem::Dependency
|
|
126
|
-
name: rspec
|
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
|
128
|
-
requirements:
|
|
129
|
-
- - "~>"
|
|
130
|
-
- !ruby/object:Gem::Version
|
|
131
|
-
version: '3.8'
|
|
132
|
-
type: :development
|
|
133
|
-
prerelease: false
|
|
134
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
-
requirements:
|
|
136
|
-
- - "~>"
|
|
137
|
-
- !ruby/object:Gem::Version
|
|
138
|
-
version: '3.8'
|
|
139
|
-
- !ruby/object:Gem::Dependency
|
|
140
|
-
name: codecov
|
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
|
142
|
-
requirements:
|
|
143
|
-
- - "~>"
|
|
144
|
-
- !ruby/object:Gem::Version
|
|
145
|
-
version: 0.2.12
|
|
146
|
-
type: :development
|
|
147
|
-
prerelease: false
|
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
-
requirements:
|
|
150
|
-
- - "~>"
|
|
151
|
-
- !ruby/object:Gem::Version
|
|
152
|
-
version: 0.2.12
|
|
153
|
-
- !ruby/object:Gem::Dependency
|
|
154
|
-
name: gouteur
|
|
155
|
-
requirement: !ruby/object:Gem::Requirement
|
|
156
|
-
requirements:
|
|
157
|
-
- - "~>"
|
|
158
|
-
- !ruby/object:Gem::Version
|
|
159
|
-
version: 1.0.0
|
|
160
|
-
type: :development
|
|
161
|
-
prerelease: false
|
|
162
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
163
|
-
requirements:
|
|
164
|
-
- - "~>"
|
|
165
|
-
- !ruby/object:Gem::Version
|
|
166
|
-
version: 1.0.0
|
|
167
|
-
- !ruby/object:Gem::Dependency
|
|
168
|
-
name: rubocop
|
|
169
|
-
requirement: !ruby/object:Gem::Requirement
|
|
170
|
-
requirements:
|
|
171
|
-
- - "~>"
|
|
172
|
-
- !ruby/object:Gem::Version
|
|
173
|
-
version: '1.8'
|
|
174
|
-
type: :development
|
|
175
|
-
prerelease: false
|
|
176
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
177
|
-
requirements:
|
|
178
|
-
- - "~>"
|
|
179
|
-
- !ruby/object:Gem::Version
|
|
180
|
-
version: '1.8'
|
|
181
27
|
description:
|
|
182
28
|
email:
|
|
183
29
|
- janosch84@gmail.com
|
|
@@ -269,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
269
115
|
- !ruby/object:Gem::Version
|
|
270
116
|
version: '0'
|
|
271
117
|
requirements: []
|
|
272
|
-
rubygems_version: 3.
|
|
118
|
+
rubygems_version: 3.4.0.dev
|
|
273
119
|
signing_key:
|
|
274
120
|
specification_version: 4
|
|
275
121
|
summary: Build, read, write and compare sets of Unicode codepoints.
|