character_set 1.4.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +1 -1
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/tests.yml +6 -2
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +7 -1
- data/BENCHMARK.md +35 -31
- data/CHANGELOG.md +32 -0
- data/Gemfile +14 -0
- data/README.md +22 -6
- data/Rakefile +5 -2
- data/benchmarks/delete_in.rb +5 -1
- data/benchmarks/keep_in.rb +5 -1
- data/character_set.gemspec +0 -13
- data/ext/character_set/character_set.c +59 -90
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/lib/character_set/core_ext/string_ext.rb +1 -1
- data/lib/character_set/expression_converter.rb +23 -23
- data/lib/character_set/predefined_sets/assigned.cps +51 -40
- data/lib/character_set/predefined_sets/emoji.cps +12 -11
- data/lib/character_set/predefined_sets.rb +11 -0
- data/lib/character_set/ruby_fallback/character_set_methods.rb +3 -3
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +15 -1
- data/lib/character_set/version.rb +1 -1
- metadata +5 -143
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
|
4
|
+
data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
|
7
|
+
data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
|
data/.gitattributes
CHANGED
data/.github/workflows/gouteur.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
name: gouteur
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
|
9
|
+
steps:
|
10
|
+
- uses: actions/checkout@v2
|
11
|
+
- name: Set up Ruby
|
12
|
+
uses: ruby/setup-ruby@v1
|
13
|
+
with:
|
14
|
+
ruby-version: 2.7
|
15
|
+
- name: Prepare
|
16
|
+
run: |
|
17
|
+
bundle install --jobs 4
|
18
|
+
bundle exec rake compile
|
19
|
+
- name: Test
|
20
|
+
run: bundle exec gouteur
|
data/.github/workflows/tests.yml
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
name: tests
|
2
2
|
|
3
|
-
on:
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
pull_request:
|
6
|
+
schedule:
|
7
|
+
- cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
|
4
8
|
|
5
9
|
jobs:
|
6
10
|
build:
|
@@ -8,7 +12,7 @@ jobs:
|
|
8
12
|
|
9
13
|
strategy:
|
10
14
|
matrix:
|
11
|
-
ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
|
15
|
+
ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
|
12
16
|
|
13
17
|
steps:
|
14
18
|
- uses: actions/checkout@v2
|
data/.gitignore
CHANGED
data/.gouteur.yml
ADDED
data/.rubocop.yml
CHANGED
@@ -8,4 +8,10 @@ AllCops:
|
|
8
8
|
RubyInterpreters:
|
9
9
|
- ruby
|
10
10
|
- rake
|
11
|
-
TargetRubyVersion: 2.
|
11
|
+
TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
|
12
|
+
|
13
|
+
Lint/AmbiguousOperatorPrecedence:
|
14
|
+
Enabled: false
|
15
|
+
|
16
|
+
Lint/AmbiguousRegexpLiteral:
|
17
|
+
Enabled: false
|
data/BENCHMARK.md
CHANGED
@@ -1,86 +1,90 @@
|
|
1
|
-
Results of `rake:benchmark` on ruby 3.
|
1
|
+
Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
|
2
2
|
|
3
3
|
```
|
4
4
|
Counting non-letters
|
5
5
|
|
6
|
-
CharacterSet#count_in:
|
7
|
-
String#count:
|
6
|
+
CharacterSet#count_in: 14794607.9 i/s
|
7
|
+
String#count: 3875939.3 i/s - 3.82x slower
|
8
8
|
```
|
9
9
|
```
|
10
10
|
Detecting non-whitespace
|
11
11
|
|
12
|
-
CharacterSet#cover?:
|
13
|
-
Regexp#match?:
|
12
|
+
CharacterSet#cover?: 17448329.0 i/s
|
13
|
+
Regexp#match?: 13089358.1 i/s - 1.33x slower
|
14
14
|
```
|
15
15
|
```
|
16
16
|
Detecting non-letters
|
17
17
|
|
18
|
-
CharacterSet#cover?:
|
19
|
-
Regexp#match?:
|
18
|
+
CharacterSet#cover?: 17565596.9 i/s
|
19
|
+
Regexp#match?: 7951108.0 i/s - 2.21x slower
|
20
20
|
```
|
21
21
|
```
|
22
|
-
Removing whitespace
|
22
|
+
Removing ASCII whitespace
|
23
23
|
|
24
|
-
CharacterSet#delete_in:
|
25
|
-
|
24
|
+
CharacterSet#delete_in: 6306078.2 i/s
|
25
|
+
String#tr: 4734401.0 i/s - 1.33x slower
|
26
|
+
String#gsub: 211631.8 i/s - 29.80x slower
|
26
27
|
```
|
27
28
|
```
|
28
29
|
Removing whitespace, emoji and umlauts
|
29
30
|
|
30
|
-
CharacterSet#delete_in:
|
31
|
-
|
31
|
+
CharacterSet#delete_in: 5984149.6 i/s
|
32
|
+
String#tr: 363643.1 i/s - 16.46x slower
|
33
|
+
String#gsub: 317201.7 i/s - 18.87x slower
|
32
34
|
```
|
33
35
|
```
|
34
36
|
Removing non-whitespace
|
35
37
|
|
36
|
-
CharacterSet#keep_in:
|
37
|
-
String#gsub:
|
38
|
+
CharacterSet#keep_in: 7650925.6 i/s
|
39
|
+
String#gsub: 207374.6 i/s - 36.89x slower
|
40
|
+
String#tr: 12.3 i/s - 619745.60x slower
|
38
41
|
```
|
39
42
|
```
|
40
|
-
|
43
|
+
Keeping only emoji
|
41
44
|
|
42
|
-
CharacterSet#keep_in:
|
43
|
-
String#gsub:
|
45
|
+
CharacterSet#keep_in: 7272940.1 i/s
|
46
|
+
String#gsub: 177993.8 i/s - 40.86x slower
|
47
|
+
String#tr: 12.3 i/s - 590222.71x slower
|
44
48
|
```
|
45
49
|
```
|
46
50
|
Extracting emoji to an Array
|
47
51
|
|
48
|
-
CharacterSet#scan:
|
49
|
-
String#scan:
|
52
|
+
CharacterSet#scan: 2978285.0 i/s
|
53
|
+
String#scan: 865793.8 i/s - 3.44x slower
|
50
54
|
```
|
51
55
|
```
|
52
56
|
Detecting whitespace
|
53
57
|
|
54
|
-
CharacterSet#used_by?:
|
55
|
-
Regexp#match?:
|
58
|
+
CharacterSet#used_by?: 17292338.4 i/s
|
59
|
+
Regexp#match?: 11705563.9 i/s - 1.48x slower
|
56
60
|
```
|
57
61
|
```
|
58
62
|
Detecting emoji in a large string
|
59
63
|
|
60
|
-
CharacterSet#used_by?:
|
61
|
-
Regexp#match?:
|
64
|
+
CharacterSet#used_by?: 340444.1 i/s
|
65
|
+
Regexp#match?: 180549.8 i/s - 1.89x slower
|
62
66
|
```
|
63
67
|
```
|
64
68
|
Adding entries
|
65
69
|
|
66
|
-
CharacterSet#add:
|
67
|
-
SortedSet#add:
|
70
|
+
CharacterSet#add: 4951781.4 i/s
|
71
|
+
SortedSet#add: 1019637.9 i/s - 4.86x slower
|
68
72
|
```
|
69
73
|
```
|
70
74
|
Removing entries
|
71
75
|
|
72
|
-
CharacterSet#delete:
|
73
|
-
SortedSet#delete:
|
76
|
+
CharacterSet#delete: 5006337.6 i/s
|
77
|
+
SortedSet#delete: 3922752.2 i/s - same-ish
|
74
78
|
```
|
75
79
|
```
|
76
80
|
Merging entries
|
77
81
|
|
78
|
-
CharacterSet#merge:
|
79
|
-
SortedSet#merge:
|
82
|
+
CharacterSet#merge: 661.8 i/s
|
83
|
+
SortedSet#merge: 3.9 i/s - 167.82x slower
|
80
84
|
```
|
81
85
|
```
|
82
86
|
Getting the min and max
|
83
87
|
|
84
|
-
CharacterSet#minmax:
|
85
|
-
SortedSet#minmax:
|
88
|
+
CharacterSet#minmax: 1212462.2 i/s
|
89
|
+
SortedSet#minmax: 844.4 i/s - 1435.93x slower
|
86
90
|
```
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,38 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [1.6.0] - 2022-02-16
|
8
|
+
|
9
|
+
### Added
|
10
|
+
|
11
|
+
- `::of` now supports both `String` and `Regexp` arguments
|
12
|
+
|
13
|
+
### Fixed
|
14
|
+
|
15
|
+
- fixed segfault during `String` manipulation on Ruby 3.2.0-dev
|
16
|
+
- improved performance for `String` manipulation
|
17
|
+
- allow usage in Ractors
|
18
|
+
- predefined sets must be pre-initialized for this, though
|
19
|
+
- e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
|
20
|
+
- call them once in the main Ractor to trigger initialization
|
21
|
+
|
22
|
+
## [1.5.0] - 2021-12-05
|
23
|
+
|
24
|
+
### Added
|
25
|
+
|
26
|
+
- new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
|
27
|
+
- latest unicode case-folding data (for `#case_insensitive`)
|
28
|
+
- support for passing any Enumerable to `#disjoint?`, `#intersect?`
|
29
|
+
- this matches recent broadening of these methods in `ruby/set`
|
30
|
+
- new instance method `#secure_token` (see README)
|
31
|
+
- class method `::of` now accepts more than one `String`
|
32
|
+
- `CharacterSet::ExpressionConverter` can now build output of any Set-like class
|
33
|
+
|
34
|
+
### Fixed
|
35
|
+
|
36
|
+
- `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
|
37
|
+
- it used to return a regular `CharacterSet`
|
38
|
+
|
7
39
|
## [1.4.1] - 2020-01-10
|
8
40
|
|
9
41
|
### Fixed
|
data/Gemfile
CHANGED
@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in character_set.gemspec
|
6
6
|
gemspec
|
7
|
+
|
8
|
+
gem 'benchmark-ips', '~> 2.7'
|
9
|
+
gem 'get_process_mem', '~> 0.2.3'
|
10
|
+
gem 'rake', '~> 13.0'
|
11
|
+
gem 'rake-compiler', '~> 1.1'
|
12
|
+
gem 'range_compressor', '~> 1.0'
|
13
|
+
gem 'regexp_parser', '~> 2.1'
|
14
|
+
gem 'regexp_property_values', '~> 1.0'
|
15
|
+
gem 'rspec', '~> 3.8'
|
16
|
+
if RUBY_VERSION.to_f >= 2.7
|
17
|
+
gem 'codecov', '~> 0.2.12'
|
18
|
+
gem 'gouteur', '~> 1.0.0'
|
19
|
+
gem 'rubocop', '~> 1.8'
|
20
|
+
end
|
data/README.md
CHANGED
@@ -2,17 +2,20 @@
|
|
2
2
|
|
3
3
|
[](http://badge.fury.io/rb/character_set)
|
4
4
|
[](https://github.com/jaynetics/character_set/actions)
|
5
|
+
[](https://github.com/jaynetics/character_set/actions)
|
5
6
|
[](https://codecov.io/gh/jaynetics/character_set)
|
6
7
|
|
7
|
-
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
8
|
+
This is a C-extended Ruby gem to work with sets of Unicode codepoints.
|
8
9
|
|
9
|
-
It
|
10
|
+
It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
|
11
|
+
|
12
|
+
It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
|
10
13
|
|
11
14
|
Many parts can be used independently, e.g.:
|
12
15
|
- `CharacterSet::Character`
|
16
|
+
- `CharacterSet::ExpressionConverter`
|
13
17
|
- `CharacterSet::Parser`
|
14
18
|
- `CharacterSet::Writer`
|
15
|
-
- [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
|
16
19
|
|
17
20
|
## Usage
|
18
21
|
|
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
|
|
40
43
|
CharacterSet.parse('\U00000061-\U00000063')
|
41
44
|
```
|
42
45
|
|
43
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed,
|
46
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
44
47
|
|
45
48
|
```ruby
|
49
|
+
CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
|
46
50
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
47
51
|
|
48
52
|
require 'character_set/core_ext/regexp_ext'
|
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
|
|
143
147
|
```
|
144
148
|
|
145
149
|
### Write
|
150
|
+
|
146
151
|
```ruby
|
147
152
|
set = CharacterSet['a', 'b', 'c', 'j', '-']
|
148
153
|
|
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
|
|
181
186
|
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
182
187
|
```
|
183
188
|
|
184
|
-
###
|
189
|
+
### Other features
|
190
|
+
|
191
|
+
#### Secure tokens
|
192
|
+
|
193
|
+
Generate secure random strings of characters from a set:
|
194
|
+
|
195
|
+
```ruby
|
196
|
+
CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
|
197
|
+
CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
|
198
|
+
```
|
199
|
+
|
200
|
+
#### Unicode planes
|
185
201
|
|
186
202
|
There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
|
187
203
|
```Ruby
|
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
|
|
198
214
|
CharacterSet::Character.new('a').plane # => 0
|
199
215
|
```
|
200
216
|
|
201
|
-
|
217
|
+
## Contributions
|
202
218
|
|
203
219
|
Feel free to send suggestions, point out issues, or submit pull requests.
|
data/Rakefile
CHANGED
@@ -147,8 +147,11 @@ namespace :benchmark do
|
|
147
147
|
f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
|
148
148
|
|
149
149
|
$store_comparison_results.each do |caption, result|
|
150
|
-
f.puts '```',
|
151
|
-
|
150
|
+
f.puts '```',
|
151
|
+
caption,
|
152
|
+
'',
|
153
|
+
result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
|
154
|
+
'```'
|
152
155
|
end
|
153
156
|
end
|
154
157
|
end
|
data/benchmarks/delete_in.rb
CHANGED
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
2
2
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
4
4
|
rx = /\s/
|
5
|
+
trt = "\t\n\v\f\r\s"
|
5
6
|
cs = CharacterSet.whitespace
|
6
7
|
|
7
8
|
benchmark(
|
8
|
-
caption: 'Removing whitespace',
|
9
|
+
caption: 'Removing ASCII whitespace',
|
9
10
|
cases: {
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
11
13
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
12
14
|
}
|
13
15
|
)
|
14
16
|
|
15
17
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
16
18
|
rx = /[\s\p{emoji}äüö]/
|
19
|
+
trt = "\t\n\v\f\r\s😀-🙏äüö"
|
17
20
|
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
18
21
|
|
19
22
|
benchmark(
|
20
23
|
caption: 'Removing whitespace, emoji and umlauts',
|
21
24
|
cases: {
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
23
27
|
'CharacterSet#delete_in' => -> { cs.delete_in(str) },
|
24
28
|
}
|
25
29
|
)
|
data/benchmarks/keep_in.rb
CHANGED
@@ -2,24 +2,28 @@ require_relative './shared'
|
|
2
2
|
|
3
3
|
str = 'Lorem ipsum et dolorem'
|
4
4
|
rx = /\S/
|
5
|
+
trt = "\u{0080}-\u{10FFFF}" # approximation
|
5
6
|
cs = CharacterSet.whitespace
|
6
7
|
|
7
8
|
benchmark(
|
8
9
|
caption: 'Removing non-whitespace',
|
9
10
|
cases: {
|
10
11
|
'String#gsub' => -> { str.gsub(rx, '') },
|
12
|
+
'String#tr' => -> { str.tr(trt, '') },
|
11
13
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
12
14
|
}
|
13
15
|
)
|
14
16
|
|
15
17
|
str = 'Lorem ipsum ⛷ et dolorem'
|
16
18
|
rx = /\p{^emoji}/
|
19
|
+
trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}"
|
17
20
|
cs = CharacterSet.emoji
|
18
21
|
|
19
22
|
benchmark(
|
20
|
-
caption: '
|
23
|
+
caption: 'Keeping only emoji',
|
21
24
|
cases: {
|
22
25
|
'String#gsub' => -> { str.gsub(rx, '') },
|
26
|
+
'String#tr' => -> { str.tr(trt, '') },
|
23
27
|
'CharacterSet#keep_in' => -> { cs.keep_in(str) },
|
24
28
|
}
|
25
29
|
)
|
data/character_set.gemspec
CHANGED
@@ -28,17 +28,4 @@ Gem::Specification.new do |s|
|
|
28
28
|
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
29
|
s.add_dependency 'sorted_set', '~> 1.0'
|
30
30
|
end
|
31
|
-
|
32
|
-
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
33
|
-
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
34
|
-
s.add_development_dependency 'rake', '~> 13.0'
|
35
|
-
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
36
|
-
s.add_development_dependency 'range_compressor', '~> 1.0'
|
37
|
-
s.add_development_dependency 'regexp_parser', '~> 1.6'
|
38
|
-
s.add_development_dependency 'regexp_property_values', '~> 1.0'
|
39
|
-
s.add_development_dependency 'rspec', '~> 3.8'
|
40
|
-
if RUBY_VERSION.to_f >= 2.7
|
41
|
-
s.add_development_dependency 'codecov', '~> 0.2.12'
|
42
|
-
s.add_development_dependency 'rubocop', '~> 1.8'
|
43
|
-
end
|
44
31
|
end
|
data/ext/character_set/character_set.c
CHANGED
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
|
|
82
82
|
.dsize = cs_memsize,
|
83
83
|
},
|
84
84
|
.data = NULL,
|
85
|
+
#ifdef RUBY_TYPED_FROZEN_SHAREABLE
|
86
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
|
87
|
+
#else
|
85
88
|
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
89
|
+
#endif
|
86
90
|
};
|
87
91
|
|
88
92
|
static inline VALUE
|
@@ -315,9 +319,9 @@ cs_method_minmax(VALUE self)
|
|
315
319
|
cs_cp cp, alen, blen; \
|
316
320
|
cs_ar *acps, *bcps; \
|
317
321
|
struct cs_data *new_data; \
|
318
|
-
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
319
322
|
acps = cs_fetch_cps(cs_a, &alen); \
|
320
323
|
bcps = cs_fetch_cps(cs_b, &blen); \
|
324
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
321
325
|
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
322
326
|
{ \
|
323
327
|
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
@@ -1046,13 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
|
|
1046
1050
|
}
|
1047
1051
|
|
1048
1052
|
static VALUE
|
1049
|
-
|
1053
|
+
cs_class_method_of_string(VALUE self, VALUE string)
|
1050
1054
|
{
|
1051
1055
|
VALUE new_cs;
|
1052
1056
|
struct cs_data *new_data;
|
1057
|
+
|
1058
|
+
raise_arg_err_unless_string(string);
|
1053
1059
|
new_cs = cs_alloc(self, &new_data);
|
1054
|
-
|
1055
|
-
each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1060
|
+
each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1056
1061
|
return new_cs;
|
1057
1062
|
}
|
1058
1063
|
|
@@ -1133,116 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
|
|
1133
1138
|
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
1134
1139
|
}
|
1135
1140
|
|
1136
|
-
|
1137
|
-
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
1138
|
-
{
|
1139
|
-
long total, olen;
|
1140
|
-
char *sptr;
|
1141
|
-
|
1142
|
-
RSTRING_GETMEM(str, sptr, olen);
|
1143
|
-
sptr = RSTRING(str)->as.heap.ptr;
|
1144
|
-
olen = RSTRING(str)->as.heap.len;
|
1145
|
-
total = olen + len;
|
1146
|
-
memcpy(sptr + olen, ptr, len);
|
1147
|
-
RSTRING(str)->as.heap.len = total;
|
1148
|
-
}
|
1149
|
-
|
1150
|
-
#ifndef TERM_FILL
|
1151
|
-
#define TERM_FILL(ptr, termlen) \
|
1152
|
-
do \
|
1153
|
-
{ \
|
1154
|
-
char *const term_fill_ptr = (ptr); \
|
1155
|
-
const int term_fill_len = (termlen); \
|
1156
|
-
*term_fill_ptr = '\0'; \
|
1157
|
-
if (__builtin_expect(!!(term_fill_len > 1), 0)) \
|
1158
|
-
memset(term_fill_ptr, 0, term_fill_len); \
|
1159
|
-
} while (0)
|
1160
|
-
#endif
|
1161
|
-
|
1162
|
-
static void
|
1163
|
-
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
1164
|
-
{
|
1165
|
-
char *ptr;
|
1166
|
-
long len;
|
1167
|
-
|
1168
|
-
ptr = RSTRING(str)->as.heap.ptr;
|
1169
|
-
len = RSTRING(str)->as.heap.len;
|
1170
|
-
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
1171
|
-
}
|
1172
|
-
|
1141
|
+
// partially based on rb_str_delete_bang
|
1173
1142
|
static inline VALUE
|
1174
1143
|
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
1175
1144
|
{
|
1176
1145
|
cs_ar *cps;
|
1177
|
-
cs_cp
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1146
|
+
cs_cp cs_len;
|
1147
|
+
VALUE orig_str_len;
|
1148
|
+
|
1149
|
+
rb_encoding *enc;
|
1150
|
+
char *s, *send, *t;
|
1151
|
+
int ascompat, cr;
|
1183
1152
|
|
1184
1153
|
raise_arg_err_unless_string(str);
|
1185
1154
|
|
1186
|
-
|
1155
|
+
orig_str_len = RSTRING_LEN(str);
|
1187
1156
|
|
1188
|
-
|
1189
|
-
if (orig_len < 1) // empty string, will never change
|
1157
|
+
if (orig_str_len == 0)
|
1190
1158
|
{
|
1191
|
-
|
1192
|
-
{
|
1193
|
-
return Qnil;
|
1194
|
-
}
|
1195
|
-
return rb_str_dup(str);
|
1159
|
+
return bang ? Qnil : str;
|
1196
1160
|
}
|
1197
1161
|
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
1203
|
-
|
1204
|
-
ptr = RSTRING_PTR(str);
|
1205
|
-
end = RSTRING_END(str);
|
1162
|
+
if (!bang)
|
1163
|
+
{
|
1164
|
+
str = rb_str_dup(str);
|
1165
|
+
}
|
1206
1166
|
|
1207
|
-
|
1167
|
+
cps = cs_fetch_cps(set, &cs_len);
|
1168
|
+
rb_str_modify(str);
|
1169
|
+
enc = rb_enc_get(str);
|
1170
|
+
ascompat = rb_enc_asciicompat(enc);
|
1171
|
+
s = t = RSTRING_PTR(str);
|
1172
|
+
send = RSTRING_END(str);
|
1173
|
+
cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
|
1174
|
+
while (s < send)
|
1208
1175
|
{
|
1209
|
-
|
1176
|
+
unsigned int c;
|
1177
|
+
int clen;
|
1178
|
+
|
1179
|
+
if (ascompat && (c = *(unsigned char *)s) < 0x80)
|
1210
1180
|
{
|
1211
|
-
|
1212
|
-
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1181
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
1213
1182
|
{
|
1214
|
-
|
1183
|
+
if (t != s)
|
1184
|
+
*t = c;
|
1185
|
+
t++;
|
1215
1186
|
}
|
1216
|
-
|
1187
|
+
s++;
|
1217
1188
|
}
|
1218
|
-
|
1219
|
-
else // likely to be multibyte string
|
1220
|
-
{
|
1221
|
-
while (ptr < end)
|
1189
|
+
else
|
1222
1190
|
{
|
1223
|
-
|
1224
|
-
|
1191
|
+
c = rb_enc_codepoint_len(s, send, &clen, enc);
|
1192
|
+
|
1193
|
+
if (tst_cp(cps, cs_len, c) != delete)
|
1225
1194
|
{
|
1226
|
-
|
1195
|
+
if (t != s)
|
1196
|
+
rb_enc_mbcput(c, t, enc);
|
1197
|
+
t += clen;
|
1198
|
+
if (cr == ENC_CODERANGE_7BIT)
|
1199
|
+
cr = ENC_CODERANGE_VALID;
|
1227
1200
|
}
|
1228
|
-
|
1201
|
+
s += clen;
|
1229
1202
|
}
|
1230
1203
|
}
|
1231
1204
|
|
1232
|
-
|
1205
|
+
rb_str_set_len(str, t - RSTRING_PTR(str));
|
1206
|
+
ENC_CODERANGE_SET(str, cr);
|
1233
1207
|
|
1234
|
-
if (bang)
|
1235
|
-
{
|
1236
|
-
if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
|
1237
|
-
{
|
1238
|
-
return Qnil;
|
1239
|
-
}
|
1240
|
-
rb_str_shared_replace(str, new_str_buf);
|
1241
|
-
}
|
1242
|
-
else
|
1208
|
+
if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
|
1243
1209
|
{
|
1244
|
-
|
1245
|
-
str = new_str_buf;
|
1210
|
+
return Qnil;
|
1246
1211
|
}
|
1247
1212
|
|
1248
1213
|
return str;
|
@@ -1284,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
|
|
1284
1249
|
|
1285
1250
|
void Init_character_set()
|
1286
1251
|
{
|
1252
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
1253
|
+
rb_ext_ractor_safe(true);
|
1254
|
+
#endif
|
1255
|
+
|
1287
1256
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
1288
1257
|
|
1289
1258
|
rb_define_alloc_func(cs, cs_method_allocate);
|
@@ -1338,7 +1307,7 @@ void Init_character_set()
|
|
1338
1307
|
// `CharacterSet`-specific methods
|
1339
1308
|
|
1340
1309
|
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
1341
|
-
rb_define_singleton_method(cs, "
|
1310
|
+
rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
|
1342
1311
|
|
1343
1312
|
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
1344
1313
|
rb_define_method(cs, "sample", cs_method_sample, -1);
|