character_set 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
4
- data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
3
+ metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
4
+ data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
5
5
  SHA512:
6
- metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
7
- data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3
6
+ metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
7
+ data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
data/.gitattributes CHANGED
@@ -1,3 +1,3 @@
1
1
  *.cps linguist-detectable=false
2
2
  benchmarks/* linguist-detectable=false
3
- spec/ruby-spec/* linguist-vendored
3
+ spec/* linguist-detectable=false
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 2.7
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
@@ -1,6 +1,10 @@
1
1
  name: tests
2
2
 
3
- on: [push, pull_request]
3
+ on:
4
+ push:
5
+ pull_request:
6
+ schedule:
7
+ - cron: '11 11 14 * *' # at 11:11 am on the 14th of every month
4
8
 
5
9
  jobs:
6
10
  build:
@@ -8,7 +12,7 @@ jobs:
8
12
 
9
13
  strategy:
10
14
  matrix:
11
- ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
15
+ ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
12
16
 
13
17
  steps:
14
18
  - uses: actions/checkout@v2
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml CHANGED
@@ -8,4 +8,10 @@ AllCops:
8
8
  RubyInterpreters:
9
9
  - ruby
10
10
  - rake
11
- TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,86 +1,90 @@
1
- Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
1
+ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
2
2
 
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 9472902.2 i/s
7
- String#count: 2221799.9 i/s - 4.26x slower
6
+ CharacterSet#count_in: 14794607.9 i/s
7
+ String#count: 3875939.3 i/s - 3.82x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 12388427.2 i/s
13
- Regexp#match?: 7901676.8 i/s - 1.57x slower
12
+ CharacterSet#cover?: 17448329.0 i/s
13
+ Regexp#match?: 13089358.1 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 12263689.1 i/s
19
- Regexp#match?: 4940889.9 i/s - 2.48x slower
18
+ CharacterSet#cover?: 17565596.9 i/s
19
+ Regexp#match?: 7951108.0 i/s - 2.21x slower
20
20
  ```
21
21
  ```
22
- Removing whitespace
22
+ Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 2406722.6 i/s
25
- String#gsub: 235760.3 i/s - 10.21x slower
24
+ CharacterSet#delete_in: 6306078.2 i/s
25
+ String#tr: 4734401.0 i/s - 1.33x slower
26
+ String#gsub: 211631.8 i/s - 29.80x slower
26
27
  ```
27
28
  ```
28
29
  Removing whitespace, emoji and umlauts
29
30
 
30
- CharacterSet#delete_in: 1653607.6 i/s
31
- String#gsub: 272782.9 i/s - 6.06x slower
31
+ CharacterSet#delete_in: 5984149.6 i/s
32
+ String#tr: 363643.1 i/s - 16.46x slower
33
+ String#gsub: 317201.7 i/s - 18.87x slower
32
34
  ```
33
35
  ```
34
36
  Removing non-whitespace
35
37
 
36
- CharacterSet#keep_in: 2671038.2 i/s
37
- String#gsub: 242551.0 i/s - 11.01x slower
38
+ CharacterSet#keep_in: 7650925.6 i/s
39
+ String#gsub: 207374.6 i/s - 36.89x slower
40
+ String#tr: 12.3 i/s - 619745.60x slower
38
41
  ```
39
42
  ```
40
- Extracting emoji
43
+ Keeping only emoji
41
44
 
42
- CharacterSet#keep_in: 1726496.5 i/s
43
- String#gsub: 215609.2 i/s - 8.01x slower
45
+ CharacterSet#keep_in: 7272940.1 i/s
46
+ String#gsub: 177993.8 i/s - 40.86x slower
47
+ String#tr: 12.3 i/s - 590222.71x slower
44
48
  ```
45
49
  ```
46
50
  Extracting emoji to an Array
47
51
 
48
- CharacterSet#scan: 2373856.1 i/s
49
- String#scan: 480000.5 i/s - 4.95x slower
52
+ CharacterSet#scan: 2978285.0 i/s
53
+ String#scan: 865793.8 i/s - 3.44x slower
50
54
  ```
51
55
  ```
52
56
  Detecting whitespace
53
57
 
54
- CharacterSet#used_by?: 11988328.7 i/s
55
- Regexp#match?: 6758146.8 i/s - 1.77x slower
58
+ CharacterSet#used_by?: 17292338.4 i/s
59
+ Regexp#match?: 11705563.9 i/s - 1.48x slower
56
60
  ```
57
61
  ```
58
62
  Detecting emoji in a large string
59
63
 
60
- CharacterSet#used_by?: 288223.3 i/s
61
- Regexp#match?: 102384.2 i/s - 2.82x slower
64
+ CharacterSet#used_by?: 340444.1 i/s
65
+ Regexp#match?: 180549.8 i/s - 1.89x slower
62
66
  ```
63
67
  ```
64
68
  Adding entries
65
69
 
66
- CharacterSet#add: 2538251.2 i/s
67
- SortedSet#add: 443925.9 i/s - 5.72x slower
70
+ CharacterSet#add: 4951781.4 i/s
71
+ SortedSet#add: 1019637.9 i/s - 4.86x slower
68
72
  ```
69
73
  ```
70
74
  Removing entries
71
75
 
72
- CharacterSet#delete: 2487620.8 i/s
73
- SortedSet#delete: 628816.1 i/s - 3.96x slower
76
+ CharacterSet#delete: 5006337.6 i/s
77
+ SortedSet#delete: 3922752.2 i/s - same-ish
74
78
  ```
75
79
  ```
76
80
  Merging entries
77
81
 
78
- CharacterSet#merge: 551.6 i/s
79
- SortedSet#merge: 1.4 i/s - 393.59x slower
82
+ CharacterSet#merge: 661.8 i/s
83
+ SortedSet#merge: 3.9 i/s - 167.82x slower
80
84
  ```
81
85
  ```
82
86
  Getting the min and max
83
87
 
84
- CharacterSet#minmax: 636890.7 i/s
85
- SortedSet#minmax: 254.1 i/s - 2506.20x slower
88
+ CharacterSet#minmax: 1212462.2 i/s
89
+ SortedSet#minmax: 844.4 i/s - 1435.93x slower
86
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,38 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.6.0] - 2022-02-16
8
+
9
+ ### Added
10
+
11
+ - `::of` now supports both `String` and `Regexp` arguments
12
+
13
+ ### Fixed
14
+
15
+ - fixed segfault during `String` manipulation on Ruby 3.2.0-dev
16
+ - improved performance for `String` manipulation
17
+ - allow usage in Ractors
18
+ - predefined sets must be pre-initialized for this, though
19
+ - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
20
+ - call them once in the main Ractor to trigger initialization
21
+
22
+ ## [1.5.0] - 2021-12-05
23
+
24
+ ### Added
25
+
26
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
27
+ - latest unicode case-folding data (for `#case_insensitive`)
28
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
29
+ - this matches recent broadening of these methods in `ruby/set`
30
+ - new instance method `#secure_token` (see README)
31
+ - class method `::of` now accepts more than one `String`
32
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
33
+
34
+ ### Fixed
35
+
36
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
37
+ - it used to return a regular `CharacterSet`
38
+
7
39
  ## [1.4.1] - 2020-01-10
8
40
 
9
41
  ### Fixed
data/Gemfile CHANGED
@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
5
  # Specify your gem's dependencies in character_set.gemspec
6
6
  gemspec
7
+
8
+ gem 'benchmark-ips', '~> 2.7'
9
+ gem 'get_process_mem', '~> 0.2.3'
10
+ gem 'rake', '~> 13.0'
11
+ gem 'rake-compiler', '~> 1.1'
12
+ gem 'range_compressor', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.1'
14
+ gem 'regexp_property_values', '~> 1.0'
15
+ gem 'rspec', '~> 3.8'
16
+ if RUBY_VERSION.to_f >= 2.7
17
+ gem 'codecov', '~> 0.2.12'
18
+ gem 'gouteur', '~> 1.0.0'
19
+ gem 'rubocop', '~> 1.8'
20
+ end
data/README.md CHANGED
@@ -2,17 +2,20 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
6
  [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
6
7
 
7
- This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints.
8
9
 
9
- It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
10
+ It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
11
+
12
+ It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic and consistently offers better performance than `Regexp` and `String` methods from the stdlib for this (see [benchmarks](./BENCHMARK.md)).
10
13
 
11
14
  Many parts can be used independently, e.g.:
12
15
  - `CharacterSet::Character`
16
+ - `CharacterSet::ExpressionConverter`
13
17
  - `CharacterSet::Parser`
14
18
  - `CharacterSet::Writer`
15
- - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
16
19
 
17
20
  ## Usage
18
21
 
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
40
43
  CharacterSet.parse('\U00000061-\U00000063')
41
44
  ```
42
45
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp` and unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
44
47
 
45
48
  ```ruby
49
+ CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
46
50
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
47
51
 
48
52
  require 'character_set/core_ext/regexp_ext'
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
143
147
  ```
144
148
 
145
149
  ### Write
150
+
146
151
  ```ruby
147
152
  set = CharacterSet['a', 'b', 'c', 'j', '-']
148
153
 
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
181
186
  # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
187
  ```
183
188
 
184
- ### Unicode plane methods
189
+ ### Other features
190
+
191
+ #### Secure tokens
192
+
193
+ Generate secure random strings of characters from a set:
194
+
195
+ ```ruby
196
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
197
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
198
+ ```
199
+
200
+ #### Unicode planes
185
201
 
186
202
  There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
187
203
  ```Ruby
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
198
214
  CharacterSet::Character.new('a').plane # => 0
199
215
  ```
200
216
 
201
- ### Contributions
217
+ ## Contributions
202
218
 
203
219
  Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile CHANGED
@@ -147,8 +147,11 @@ namespace :benchmark do
147
147
  f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
148
 
149
149
  $store_comparison_results.each do |caption, result|
150
- f.puts '```', caption, '',
151
- result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
150
+ f.puts '```',
151
+ caption,
152
+ '',
153
+ result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
+ '```'
152
155
  end
153
156
  end
154
157
  end
@@ -2,24 +2,28 @@ require_relative './shared'
2
2
 
3
3
  str = 'Lorem ipsum et dolorem'
4
4
  rx = /\s/
5
+ trt = "\t\n\v\f\r\s"
5
6
  cs = CharacterSet.whitespace
6
7
 
7
8
  benchmark(
8
- caption: 'Removing whitespace',
9
+ caption: 'Removing ASCII whitespace',
9
10
  cases: {
10
11
  'String#gsub' => -> { str.gsub(rx, '') },
12
+ 'String#tr' => -> { str.tr(trt, '') },
11
13
  'CharacterSet#delete_in' => -> { cs.delete_in(str) },
12
14
  }
13
15
  )
14
16
 
15
17
  str = 'Lörem ipsüm ⛷ et dölörem'
16
18
  rx = /[\s\p{emoji}äüö]/
19
+ trt = "\t\n\v\f\r\s😀-🙏äüö"
17
20
  cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
21
 
19
22
  benchmark(
20
23
  caption: 'Removing whitespace, emoji and umlauts',
21
24
  cases: {
22
25
  'String#gsub' => -> { str.gsub(rx, '') },
26
+ 'String#tr' => -> { str.tr(trt, '') },
23
27
  'CharacterSet#delete_in' => -> { cs.delete_in(str) },
24
28
  }
25
29
  )
@@ -2,24 +2,28 @@ require_relative './shared'
2
2
 
3
3
  str = 'Lorem ipsum et dolorem'
4
4
  rx = /\S/
5
+ trt = "\u{0080}-\u{10FFFF}" # approximation
5
6
  cs = CharacterSet.whitespace
6
7
 
7
8
  benchmark(
8
9
  caption: 'Removing non-whitespace',
9
10
  cases: {
10
11
  'String#gsub' => -> { str.gsub(rx, '') },
12
+ 'String#tr' => -> { str.tr(trt, '') },
11
13
  'CharacterSet#keep_in' => -> { cs.keep_in(str) },
12
14
  }
13
15
  )
14
16
 
15
17
  str = 'Lorem ipsum ⛷ et dolorem'
16
18
  rx = /\p{^emoji}/
19
+ trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}"
17
20
  cs = CharacterSet.emoji
18
21
 
19
22
  benchmark(
20
- caption: 'Extracting emoji',
23
+ caption: 'Keeping only emoji',
21
24
  cases: {
22
25
  'String#gsub' => -> { str.gsub(rx, '') },
26
+ 'String#tr' => -> { str.tr(trt, '') },
23
27
  'CharacterSet#keep_in' => -> { cs.keep_in(str) },
24
28
  }
25
29
  )
@@ -28,17 +28,4 @@ Gem::Specification.new do |s|
28
28
  if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
29
  s.add_dependency 'sorted_set', '~> 1.0'
30
30
  end
31
-
32
- s.add_development_dependency 'benchmark-ips', '~> 2.7'
33
- s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
- s.add_development_dependency 'rake', '~> 13.0'
35
- s.add_development_dependency 'rake-compiler', '~> 1.1'
36
- s.add_development_dependency 'range_compressor', '~> 1.0'
37
- s.add_development_dependency 'regexp_parser', '~> 1.6'
38
- s.add_development_dependency 'regexp_property_values', '~> 1.0'
39
- s.add_development_dependency 'rspec', '~> 3.8'
40
- if RUBY_VERSION.to_f >= 2.7
41
- s.add_development_dependency 'codecov', '~> 0.2.12'
42
- s.add_development_dependency 'rubocop', '~> 1.8'
43
- end
44
31
  end
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
82
82
  .dsize = cs_memsize,
83
83
  },
84
84
  .data = NULL,
85
+ #ifdef RUBY_TYPED_FROZEN_SHAREABLE
86
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
87
+ #else
85
88
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
89
+ #endif
86
90
  };
87
91
 
88
92
  static inline VALUE
@@ -315,9 +319,9 @@ cs_method_minmax(VALUE self)
315
319
  cs_cp cp, alen, blen; \
316
320
  cs_ar *acps, *bcps; \
317
321
  struct cs_data *new_data; \
318
- new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
322
  acps = cs_fetch_cps(cs_a, &alen); \
320
323
  bcps = cs_fetch_cps(cs_b, &blen); \
324
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
321
325
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
326
  { \
323
327
  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
@@ -1046,13 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
1046
1050
  }
1047
1051
 
1048
1052
  static VALUE
1049
- cs_class_method_of(VALUE self, VALUE str)
1053
+ cs_class_method_of_string(VALUE self, VALUE string)
1050
1054
  {
1051
1055
  VALUE new_cs;
1052
1056
  struct cs_data *new_data;
1057
+
1058
+ raise_arg_err_unless_string(string);
1053
1059
  new_cs = cs_alloc(self, &new_data);
1054
- raise_arg_err_unless_string(str);
1055
- each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1060
+ each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
1056
1061
  return new_cs;
1057
1062
  }
1058
1063
 
@@ -1133,116 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
1133
1138
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1134
1139
  }
1135
1140
 
1136
- static void
1137
- cs_str_buf_cat(VALUE str, const char *ptr, long len)
1138
- {
1139
- long total, olen;
1140
- char *sptr;
1141
-
1142
- RSTRING_GETMEM(str, sptr, olen);
1143
- sptr = RSTRING(str)->as.heap.ptr;
1144
- olen = RSTRING(str)->as.heap.len;
1145
- total = olen + len;
1146
- memcpy(sptr + olen, ptr, len);
1147
- RSTRING(str)->as.heap.len = total;
1148
- }
1149
-
1150
- #ifndef TERM_FILL
1151
- #define TERM_FILL(ptr, termlen) \
1152
- do \
1153
- { \
1154
- char *const term_fill_ptr = (ptr); \
1155
- const int term_fill_len = (termlen); \
1156
- *term_fill_ptr = '\0'; \
1157
- if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1158
- memset(term_fill_ptr, 0, term_fill_len); \
1159
- } while (0)
1160
- #endif
1161
-
1162
- static void
1163
- cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1164
- {
1165
- char *ptr;
1166
- long len;
1167
-
1168
- ptr = RSTRING(str)->as.heap.ptr;
1169
- len = RSTRING(str)->as.heap.len;
1170
- TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
1171
- }
1172
-
1141
+ // partially based on rb_str_delete_bang
1173
1142
  static inline VALUE
1174
1143
  cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1175
1144
  {
1176
1145
  cs_ar *cps;
1177
- cs_cp len;
1178
- rb_encoding *str_enc;
1179
- VALUE orig_len, new_str_buf;
1180
- int cp_len;
1181
- unsigned int str_cp;
1182
- const char *ptr, *end;
1146
+ cs_cp cs_len;
1147
+ VALUE orig_str_len;
1148
+
1149
+ rb_encoding *enc;
1150
+ char *s, *send, *t;
1151
+ int ascompat, cr;
1183
1152
 
1184
1153
  raise_arg_err_unless_string(str);
1185
1154
 
1186
- cps = cs_fetch_cps(set, &len);
1155
+ orig_str_len = RSTRING_LEN(str);
1187
1156
 
1188
- orig_len = RSTRING_LEN(str);
1189
- if (orig_len < 1) // empty string, will never change
1157
+ if (orig_str_len == 0)
1190
1158
  {
1191
- if (bang)
1192
- {
1193
- return Qnil;
1194
- }
1195
- return rb_str_dup(str);
1159
+ return bang ? Qnil : str;
1196
1160
  }
1197
1161
 
1198
- new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
1199
- str_enc = rb_enc_get(str);
1200
- rb_enc_associate(new_str_buf, str_enc);
1201
- rb_str_modify(new_str_buf);
1202
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1203
-
1204
- ptr = RSTRING_PTR(str);
1205
- end = RSTRING_END(str);
1162
+ if (!bang)
1163
+ {
1164
+ str = rb_str_dup(str);
1165
+ }
1206
1166
 
1207
- if (single_byte_optimizable(str))
1167
+ cps = cs_fetch_cps(set, &cs_len);
1168
+ rb_str_modify(str);
1169
+ enc = rb_enc_get(str);
1170
+ ascompat = rb_enc_asciicompat(enc);
1171
+ s = t = RSTRING_PTR(str);
1172
+ send = RSTRING_END(str);
1173
+ cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1174
+ while (s < send)
1208
1175
  {
1209
- while (ptr < end)
1176
+ unsigned int c;
1177
+ int clen;
1178
+
1179
+ if (ascompat && (c = *(unsigned char *)s) < 0x80)
1210
1180
  {
1211
- str_cp = *ptr & 0xff;
1212
- if ((!tst_cp(cps, len, str_cp)) == delete)
1181
+ if (tst_cp(cps, cs_len, c) != delete)
1213
1182
  {
1214
- cs_str_buf_cat(new_str_buf, ptr, 1);
1183
+ if (t != s)
1184
+ *t = c;
1185
+ t++;
1215
1186
  }
1216
- ptr++;
1187
+ s++;
1217
1188
  }
1218
- }
1219
- else // likely to be multibyte string
1220
- {
1221
- while (ptr < end)
1189
+ else
1222
1190
  {
1223
- str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1224
- if ((!tst_cp(cps, len, str_cp)) == delete)
1191
+ c = rb_enc_codepoint_len(s, send, &clen, enc);
1192
+
1193
+ if (tst_cp(cps, cs_len, c) != delete)
1225
1194
  {
1226
- cs_str_buf_cat(new_str_buf, ptr, cp_len);
1195
+ if (t != s)
1196
+ rb_enc_mbcput(c, t, enc);
1197
+ t += clen;
1198
+ if (cr == ENC_CODERANGE_7BIT)
1199
+ cr = ENC_CODERANGE_VALID;
1227
1200
  }
1228
- ptr += cp_len;
1201
+ s += clen;
1229
1202
  }
1230
1203
  }
1231
1204
 
1232
- cs_str_buf_terminate(new_str_buf, str_enc);
1205
+ rb_str_set_len(str, t - RSTRING_PTR(str));
1206
+ ENC_CODERANGE_SET(str, cr);
1233
1207
 
1234
- if (bang)
1235
- {
1236
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1237
- {
1238
- return Qnil;
1239
- }
1240
- rb_str_shared_replace(str, new_str_buf);
1241
- }
1242
- else
1208
+ if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
1243
1209
  {
1244
- RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
1245
- str = new_str_buf;
1210
+ return Qnil;
1246
1211
  }
1247
1212
 
1248
1213
  return str;
@@ -1284,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
1284
1249
 
1285
1250
  void Init_character_set()
1286
1251
  {
1252
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
1253
+ rb_ext_ractor_safe(true);
1254
+ #endif
1255
+
1287
1256
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
1288
1257
 
1289
1258
  rb_define_alloc_func(cs, cs_method_allocate);
@@ -1338,7 +1307,7 @@ void Init_character_set()
1338
1307
  // `CharacterSet`-specific methods
1339
1308
 
1340
1309
  rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1341
- rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1310
+ rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
1342
1311
 
1343
1312
  rb_define_method(cs, "ranges", cs_method_ranges, 0);
1344
1313
  rb_define_method(cs, "sample", cs_method_sample, -1);