character_set 1.4.1 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a91fd10258c312d27d3fa84f99f1a97168d12ca08a3911fe31485565a999246
4
- data.tar.gz: 2f16c02b72302259bccda6f2bf731950bd6dc8c679af8812c414ac313f1d8fc2
3
+ metadata.gz: e216e6c199ac9443cda9180a9e35d5ed92b50b45c12e7f64f45d74ecd2cf08d6
4
+ data.tar.gz: 5f3634d426dc33875d6c197ce75466544d97808b1e8b1858ac56d93422b226e8
5
5
  SHA512:
6
- metadata.gz: cab6e94ec0a7efc2f26eba33dd1b4d5af639905d23422ec61420411325832a998c07359a4bf50c24379ec4550784ebc6da0effec4c917e7859392345ce9b8db0
7
- data.tar.gz: a2dc319a9f8085e85624f25cc6f12dc03992b50f3f1a8d2000e1b69dadfdc4219c887452bdffbb213a91e1cad2011f237f604aa6fdb7e93243304d22fb5adfa3
6
+ metadata.gz: d24cfaa40b6e4e472e1f76cc8b6f7f3f1282e6830c0cbf76c4810c0f6f365c7419a19816d0b741cee99eb428dae03fc1d60eecab7d1ba6d210015f0cf2d5ee14
7
+ data.tar.gz: 2bd7ea63b286e106358293b1428a687374d0cd2cdc985b2da5b5cf1f45c6c541cb0ddde5b06477243cf4011065cfac7fa6bb8a521fb144a750c90039d268f03b
data/.gitattributes CHANGED
@@ -1,3 +1,3 @@
1
1
  *.cps linguist-detectable=false
2
2
  benchmarks/* linguist-detectable=false
3
- spec/ruby-spec/* linguist-vendored
3
+ spec/* linguist-detectable=false
@@ -0,0 +1,20 @@
1
+ name: gouteur
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ steps:
10
+ - uses: actions/checkout@v2
11
+ - name: Set up Ruby
12
+ uses: ruby/setup-ruby@v1
13
+ with:
14
+ ruby-version: 2.7
15
+ - name: Prepare
16
+ run: |
17
+ bundle install --jobs 4
18
+ bundle exec rake compile
19
+ - name: Test
20
+ run: bundle exec gouteur
@@ -1,6 +1,10 @@
1
1
  name: tests
2
2
 
3
- on: [push, pull_request]
3
+ on:
4
+ push:
5
+ pull_request:
6
+ schedule:
7
+ - cron: '11 11 14 * *' # at 11:11 UTC on the 14th of every month
4
8
 
5
9
  jobs:
6
10
  build:
@@ -8,7 +12,7 @@ jobs:
8
12
 
9
13
  strategy:
10
14
  matrix:
11
- ruby: [ '2.2', '2.7', '3.0', 'ruby-head', 'jruby-head' ]
15
+ ruby: [ '2.2', '2.7', '3.0', '3.1', 'ruby-head', 'jruby-head' ]
12
16
 
13
17
  steps:
14
18
  - uses: actions/checkout@v2
data/.gitignore CHANGED
@@ -15,6 +15,7 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  .vscode
19
20
  bbin/
20
21
  binstubs/*
data/.gouteur.yml ADDED
@@ -0,0 +1,2 @@
1
+ repos:
2
+ - uri: https://github.com/jaynetics/js_regex
data/.rubocop.yml CHANGED
@@ -8,4 +8,10 @@ AllCops:
8
8
  RubyInterpreters:
9
9
  - ruby
10
10
  - rake
11
- TargetRubyVersion: 2.4 # really 2.1, but 2.4 is lowest supported by rubocop
11
+ TargetRubyVersion: 2.5 # really 2.1, but 2.5 is lowest supported by rubocop
12
+
13
+ Lint/AmbiguousOperatorPrecedence:
14
+ Enabled: false
15
+
16
+ Lint/AmbiguousRegexpLiteral:
17
+ Enabled: false
data/BENCHMARK.md CHANGED
@@ -1,86 +1,90 @@
1
- Results of `rake:benchmark` on ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [x86_64-darwin19]
1
+ Results of `rake:benchmark` on ruby 3.2.0dev (2022-02-14T14:35:54Z master 26187a8520) [arm64-darwin21]
2
2
 
3
3
  ```
4
4
  Counting non-letters
5
5
 
6
- CharacterSet#count_in: 9472902.2 i/s
7
- String#count: 2221799.9 i/s - 4.26x slower
6
+ CharacterSet#count_in: 14794607.9 i/s
7
+ String#count: 3875939.3 i/s - 3.82x slower
8
8
  ```
9
9
  ```
10
10
  Detecting non-whitespace
11
11
 
12
- CharacterSet#cover?: 12388427.2 i/s
13
- Regexp#match?: 7901676.8 i/s - 1.57x slower
12
+ CharacterSet#cover?: 17448329.0 i/s
13
+ Regexp#match?: 13089358.1 i/s - 1.33x slower
14
14
  ```
15
15
  ```
16
16
  Detecting non-letters
17
17
 
18
- CharacterSet#cover?: 12263689.1 i/s
19
- Regexp#match?: 4940889.9 i/s - 2.48x slower
18
+ CharacterSet#cover?: 17565596.9 i/s
19
+ Regexp#match?: 7951108.0 i/s - 2.21x slower
20
20
  ```
21
21
  ```
22
- Removing whitespace
22
+ Removing ASCII whitespace
23
23
 
24
- CharacterSet#delete_in: 2406722.6 i/s
25
- String#gsub: 235760.3 i/s - 10.21x slower
24
+ CharacterSet#delete_in: 6306078.2 i/s
25
+ String#tr: 4734401.0 i/s - 1.33x slower
26
+ String#gsub: 211631.8 i/s - 29.80x slower
26
27
  ```
27
28
  ```
28
29
  Removing whitespace, emoji and umlauts
29
30
 
30
- CharacterSet#delete_in: 1653607.6 i/s
31
- String#gsub: 272782.9 i/s - 6.06x slower
31
+ CharacterSet#delete_in: 5984149.6 i/s
32
+ String#tr: 363643.1 i/s - 16.46x slower
33
+ String#gsub: 317201.7 i/s - 18.87x slower
32
34
  ```
33
35
  ```
34
36
  Removing non-whitespace
35
37
 
36
- CharacterSet#keep_in: 2671038.2 i/s
37
- String#gsub: 242551.0 i/s - 11.01x slower
38
+ CharacterSet#keep_in: 7650925.6 i/s
39
+ String#gsub: 207374.6 i/s - 36.89x slower
40
+ String#tr: 12.3 i/s - 619745.60x slower
38
41
  ```
39
42
  ```
40
- Extracting emoji
43
+ Keeping only emoji
41
44
 
42
- CharacterSet#keep_in: 1726496.5 i/s
43
- String#gsub: 215609.2 i/s - 8.01x slower
45
+ CharacterSet#keep_in: 7272940.1 i/s
46
+ String#gsub: 177993.8 i/s - 40.86x slower
47
+ String#tr: 12.3 i/s - 590222.71x slower
44
48
  ```
45
49
  ```
46
50
  Extracting emoji to an Array
47
51
 
48
- CharacterSet#scan: 2373856.1 i/s
49
- String#scan: 480000.5 i/s - 4.95x slower
52
+ CharacterSet#scan: 2978285.0 i/s
53
+ String#scan: 865793.8 i/s - 3.44x slower
50
54
  ```
51
55
  ```
52
56
  Detecting whitespace
53
57
 
54
- CharacterSet#used_by?: 11988328.7 i/s
55
- Regexp#match?: 6758146.8 i/s - 1.77x slower
58
+ CharacterSet#used_by?: 17292338.4 i/s
59
+ Regexp#match?: 11705563.9 i/s - 1.48x slower
56
60
  ```
57
61
  ```
58
62
  Detecting emoji in a large string
59
63
 
60
- CharacterSet#used_by?: 288223.3 i/s
61
- Regexp#match?: 102384.2 i/s - 2.82x slower
64
+ CharacterSet#used_by?: 340444.1 i/s
65
+ Regexp#match?: 180549.8 i/s - 1.89x slower
62
66
  ```
63
67
  ```
64
68
  Adding entries
65
69
 
66
- CharacterSet#add: 2538251.2 i/s
67
- SortedSet#add: 443925.9 i/s - 5.72x slower
70
+ CharacterSet#add: 4951781.4 i/s
71
+ SortedSet#add: 1019637.9 i/s - 4.86x slower
68
72
  ```
69
73
  ```
70
74
  Removing entries
71
75
 
72
- CharacterSet#delete: 2487620.8 i/s
73
- SortedSet#delete: 628816.1 i/s - 3.96x slower
76
+ CharacterSet#delete: 5006337.6 i/s
77
+ SortedSet#delete: 3922752.2 i/s - same-ish
74
78
  ```
75
79
  ```
76
80
  Merging entries
77
81
 
78
- CharacterSet#merge: 551.6 i/s
79
- SortedSet#merge: 1.4 i/s - 393.59x slower
82
+ CharacterSet#merge: 661.8 i/s
83
+ SortedSet#merge: 3.9 i/s - 167.82x slower
80
84
  ```
81
85
  ```
82
86
  Getting the min and max
83
87
 
84
- CharacterSet#minmax: 636890.7 i/s
85
- SortedSet#minmax: 254.1 i/s - 2506.20x slower
88
+ CharacterSet#minmax: 1212462.2 i/s
89
+ SortedSet#minmax: 844.4 i/s - 1435.93x slower
86
90
  ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,38 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [1.6.0] - 2022-02-16
8
+
9
+ ### Added
10
+
11
+ - `::of` now supports both `String` and `Regexp` arguments
12
+
13
+ ### Fixed
14
+
15
+ - fixed segfault during `String` manipulation on Ruby 3.2.0-dev
16
+ - improved performance for `String` manipulation
17
+ - allow usage in Ractors
18
+ - predefined sets must be pre-initialized for this, though
19
+ - e.g. `CharacterSet.ascii`, `keep_character_set(:ascii)` etc.
20
+ - call them once in the main Ractor to trigger initialization
21
+
22
+ ## [1.5.0] - 2021-12-05
23
+
24
+ ### Added
25
+
26
+ - new codepoints for `::assigned` and `::emoji` predefined sets, as in Ruby 3.1.0
27
+ - latest Unicode case-folding data (for `#case_insensitive`)
28
+ - support for passing any Enumerable to `#disjoint?`, `#intersect?`
29
+ - this matches recent broadening of these methods in `ruby/set`
30
+ - new instance method `#secure_token` (see README)
31
+ - class method `::of` now accepts more than one `String`
32
+ - `CharacterSet::ExpressionConverter` can now build output of any Set-like class
33
+
34
+ ### Fixed
35
+
36
+ - `CharacterSet::Pure::of_expression` now returns a `CharacterSet::Pure`
37
+ - it used to return a regular `CharacterSet`
38
+
7
39
  ## [1.4.1] - 2020-01-10
8
40
 
9
41
  ### Fixed
data/Gemfile CHANGED
@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
5
  # Specify your gem's dependencies in character_set.gemspec
6
6
  gemspec
7
+
8
+ gem 'benchmark-ips', '~> 2.7'
9
+ gem 'get_process_mem', '~> 0.2.3'
10
+ gem 'rake', '~> 13.0'
11
+ gem 'rake-compiler', '~> 1.1'
12
+ gem 'range_compressor', '~> 1.0'
13
+ gem 'regexp_parser', '~> 2.1'
14
+ gem 'regexp_property_values', '~> 1.0'
15
+ gem 'rspec', '~> 3.8'
16
+ if RUBY_VERSION.to_f >= 2.7
17
+ gem 'codecov', '~> 0.2.12'
18
+ gem 'gouteur', '~> 1.0.0'
19
+ gem 'rubocop', '~> 1.8'
20
+ end
data/README.md CHANGED
@@ -2,17 +2,20 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://github.com/jaynetics/character_set/workflows/tests/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
+ [![Build Status](https://github.com/jaynetics/character_set/workflows/gouteur/badge.svg)](https://github.com/jaynetics/character_set/actions)
5
6
  [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
6
7
 
7
- This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints.
8
9
 
9
- It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
10
+ It can [read](#parseinitialize) and [write](#write) sets of codepoints in various formats and it implements the stdlib `Set` interface for them.
11
+
12
+ It also offers a [way of scrubbing and scanning characters in Strings](#interact-with-strings) that is more semantic than the equivalent `Regexp` and `String` methods from the stdlib and consistently outperforms them (see [benchmarks](./BENCHMARK.md)).
10
13
 
11
14
  Many parts can be used independently, e.g.:
12
15
  - `CharacterSet::Character`
16
+ - `CharacterSet::ExpressionConverter`
13
17
  - `CharacterSet::Parser`
14
18
  - `CharacterSet::Writer`
15
- - [`RangeCompressor`](https://github.com/jaynetics/range_compressor)
16
19
 
17
20
  ## Usage
18
21
 
@@ -40,9 +43,10 @@ CharacterSet.parse('[a-c]')
40
43
  CharacterSet.parse('\U00000061-\U00000063')
41
44
  ```
42
45
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
46
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `Regexp`s and Unicode property names can also be read. Regexp intersections, negations, and set nesting are covered, but the `i`-flag is ignored; call `#case_insensitive` on the result if needed.
44
47
 
45
48
  ```ruby
49
+ CharacterSet.of(/./) # => #<CharacterSet (size: 1112064)>
46
50
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
47
51
 
48
52
  require 'character_set/core_ext/regexp_ext'
@@ -143,6 +147,7 @@ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
143
147
  ```
144
148
 
145
149
  ### Write
150
+
146
151
  ```ruby
147
152
  set = CharacterSet['a', 'b', 'c', 'j', '-']
148
153
 
@@ -181,7 +186,18 @@ set.to_s_with_surrogate_alternation
181
186
  # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
182
187
  ```
183
188
 
184
- ### Unicode plane methods
189
+ ### Other features
190
+
191
+ #### Secure tokens
192
+
193
+ Generate secure random strings of characters from a set:
194
+
195
+ ```ruby
196
+ CharacterSet.new('a'..'z').secure_token(8) # => "ugwpujmt"
197
+ CharacterSet.crypt.secure_token # => "8.1w7aBT737/pMfcMoO4y2y8/=0xtmo:"
198
+ ```
199
+
200
+ #### Unicode planes
185
201
 
186
202
  There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
187
203
  ```Ruby
@@ -198,6 +214,6 @@ CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
198
214
  CharacterSet::Character.new('a').plane # => 0
199
215
  ```
200
216
 
201
- ### Contributions
217
+ ## Contributions
202
218
 
203
219
  Feel free to send suggestions, point out issues, or submit pull requests.
data/Rakefile CHANGED
@@ -147,8 +147,11 @@ namespace :benchmark do
147
147
  f.puts "Results of `rake:benchmark` on #{RUBY_DESCRIPTION}", ''
148
148
 
149
149
  $store_comparison_results.each do |caption, result|
150
- f.puts '```', caption, '',
151
- result.strip.gsub(/(same-ish).*$/, '\1').lines[1..-1], '```'
150
+ f.puts '```',
151
+ caption,
152
+ '',
153
+ result.strip.gsub(/ \(±[^)]+\) /, '').gsub(/(same-ish).*$/, '\1').lines[1..-1],
154
+ '```'
152
155
  end
153
156
  end
154
157
  end
@@ -2,24 +2,28 @@ require_relative './shared'
2
2
 
3
3
  str = 'Lorem ipsum et dolorem'
4
4
  rx = /\s/
5
+ trt = "\t\n\v\f\r\s"
5
6
  cs = CharacterSet.whitespace
6
7
 
7
8
  benchmark(
8
- caption: 'Removing whitespace',
9
+ caption: 'Removing ASCII whitespace',
9
10
  cases: {
10
11
  'String#gsub' => -> { str.gsub(rx, '') },
12
+ 'String#tr' => -> { str.tr(trt, '') },
11
13
  'CharacterSet#delete_in' => -> { cs.delete_in(str) },
12
14
  }
13
15
  )
14
16
 
15
17
  str = 'Lörem ipsüm ⛷ et dölörem'
16
18
  rx = /[\s\p{emoji}äüö]/
19
+ trt = "\t\n\v\f\r\s😀-🙏äüö"
17
20
  cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
21
 
19
22
  benchmark(
20
23
  caption: 'Removing whitespace, emoji and umlauts',
21
24
  cases: {
22
25
  'String#gsub' => -> { str.gsub(rx, '') },
26
+ 'String#tr' => -> { str.tr(trt, '') },
23
27
  'CharacterSet#delete_in' => -> { cs.delete_in(str) },
24
28
  }
25
29
  )
@@ -2,24 +2,28 @@ require_relative './shared'
2
2
 
3
3
  str = 'Lorem ipsum et dolorem'
4
4
  rx = /\S/
5
+ trt = "\u{0080}-\u{10FFFF}" # approximation
5
6
  cs = CharacterSet.whitespace
6
7
 
7
8
  benchmark(
8
9
  caption: 'Removing non-whitespace',
9
10
  cases: {
10
11
  'String#gsub' => -> { str.gsub(rx, '') },
12
+ 'String#tr' => -> { str.tr(trt, '') },
11
13
  'CharacterSet#keep_in' => -> { cs.keep_in(str) },
12
14
  }
13
15
  )
14
16
 
15
17
  str = 'Lorem ipsum ⛷ et dolorem'
16
18
  rx = /\p{^emoji}/
19
+ trt = "\u0000-\u{1F599}\u{1F650}-\u{10FFFF}"
17
20
  cs = CharacterSet.emoji
18
21
 
19
22
  benchmark(
20
- caption: 'Extracting emoji',
23
+ caption: 'Keeping only emoji',
21
24
  cases: {
22
25
  'String#gsub' => -> { str.gsub(rx, '') },
26
+ 'String#tr' => -> { str.tr(trt, '') },
23
27
  'CharacterSet#keep_in' => -> { cs.keep_in(str) },
24
28
  }
25
29
  )
@@ -28,17 +28,4 @@ Gem::Specification.new do |s|
28
28
  if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
29
29
  s.add_dependency 'sorted_set', '~> 1.0'
30
30
  end
31
-
32
- s.add_development_dependency 'benchmark-ips', '~> 2.7'
33
- s.add_development_dependency 'get_process_mem', '~> 0.2.3'
34
- s.add_development_dependency 'rake', '~> 13.0'
35
- s.add_development_dependency 'rake-compiler', '~> 1.1'
36
- s.add_development_dependency 'range_compressor', '~> 1.0'
37
- s.add_development_dependency 'regexp_parser', '~> 1.6'
38
- s.add_development_dependency 'regexp_property_values', '~> 1.0'
39
- s.add_development_dependency 'rspec', '~> 3.8'
40
- if RUBY_VERSION.to_f >= 2.7
41
- s.add_development_dependency 'codecov', '~> 0.2.12'
42
- s.add_development_dependency 'rubocop', '~> 1.8'
43
- end
44
31
  end
@@ -82,7 +82,11 @@ static const rb_data_type_t cs_type = {
82
82
  .dsize = cs_memsize,
83
83
  },
84
84
  .data = NULL,
85
+ #ifdef RUBY_TYPED_FROZEN_SHAREABLE
86
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_FROZEN_SHAREABLE,
87
+ #else
85
88
  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
89
+ #endif
86
90
  };
87
91
 
88
92
  static inline VALUE
@@ -315,9 +319,9 @@ cs_method_minmax(VALUE self)
315
319
  cs_cp cp, alen, blen; \
316
320
  cs_ar *acps, *bcps; \
317
321
  struct cs_data *new_data; \
318
- new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
319
322
  acps = cs_fetch_cps(cs_a, &alen); \
320
323
  bcps = cs_fetch_cps(cs_b, &blen); \
324
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
321
325
  for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
322
326
  { \
323
327
  if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
@@ -1046,13 +1050,14 @@ raise_arg_err_unless_string(VALUE val)
1046
1050
  }
1047
1051
 
1048
1052
  static VALUE
1049
- cs_class_method_of(VALUE self, VALUE str)
1053
+ cs_class_method_of_string(VALUE self, VALUE string)
1050
1054
  {
1051
1055
  VALUE new_cs;
1052
1056
  struct cs_data *new_data;
1057
+
1058
+ raise_arg_err_unless_string(string);
1053
1059
  new_cs = cs_alloc(self, &new_data);
1054
- raise_arg_err_unless_string(str);
1055
- each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1060
+ each_cp(string, add_str_cp_to_arr, 0, 0, new_data, 0);
1056
1061
  return new_cs;
1057
1062
  }
1058
1063
 
@@ -1133,116 +1138,76 @@ cs_method_used_by_p(VALUE self, VALUE str)
1133
1138
  return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1134
1139
  }
1135
1140
 
1136
- static void
1137
- cs_str_buf_cat(VALUE str, const char *ptr, long len)
1138
- {
1139
- long total, olen;
1140
- char *sptr;
1141
-
1142
- RSTRING_GETMEM(str, sptr, olen);
1143
- sptr = RSTRING(str)->as.heap.ptr;
1144
- olen = RSTRING(str)->as.heap.len;
1145
- total = olen + len;
1146
- memcpy(sptr + olen, ptr, len);
1147
- RSTRING(str)->as.heap.len = total;
1148
- }
1149
-
1150
- #ifndef TERM_FILL
1151
- #define TERM_FILL(ptr, termlen) \
1152
- do \
1153
- { \
1154
- char *const term_fill_ptr = (ptr); \
1155
- const int term_fill_len = (termlen); \
1156
- *term_fill_ptr = '\0'; \
1157
- if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1158
- memset(term_fill_ptr, 0, term_fill_len); \
1159
- } while (0)
1160
- #endif
1161
-
1162
- static void
1163
- cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1164
- {
1165
- char *ptr;
1166
- long len;
1167
-
1168
- ptr = RSTRING(str)->as.heap.ptr;
1169
- len = RSTRING(str)->as.heap.len;
1170
- TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
1171
- }
1172
-
1141
+ // partially based on rb_str_delete_bang
1173
1142
  static inline VALUE
1174
1143
  cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1175
1144
  {
1176
1145
  cs_ar *cps;
1177
- cs_cp len;
1178
- rb_encoding *str_enc;
1179
- VALUE orig_len, new_str_buf;
1180
- int cp_len;
1181
- unsigned int str_cp;
1182
- const char *ptr, *end;
1146
+ cs_cp cs_len;
1147
+ VALUE orig_str_len;
1148
+
1149
+ rb_encoding *enc;
1150
+ char *s, *send, *t;
1151
+ int ascompat, cr;
1183
1152
 
1184
1153
  raise_arg_err_unless_string(str);
1185
1154
 
1186
- cps = cs_fetch_cps(set, &len);
1155
+ orig_str_len = RSTRING_LEN(str);
1187
1156
 
1188
- orig_len = RSTRING_LEN(str);
1189
- if (orig_len < 1) // empty string, will never change
1157
+ if (orig_str_len == 0)
1190
1158
  {
1191
- if (bang)
1192
- {
1193
- return Qnil;
1194
- }
1195
- return rb_str_dup(str);
1159
+ return bang ? Qnil : str;
1196
1160
  }
1197
1161
 
1198
- new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
1199
- str_enc = rb_enc_get(str);
1200
- rb_enc_associate(new_str_buf, str_enc);
1201
- rb_str_modify(new_str_buf);
1202
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1203
-
1204
- ptr = RSTRING_PTR(str);
1205
- end = RSTRING_END(str);
1162
+ if (!bang)
1163
+ {
1164
+ str = rb_str_dup(str);
1165
+ }
1206
1166
 
1207
- if (single_byte_optimizable(str))
1167
+ cps = cs_fetch_cps(set, &cs_len);
1168
+ rb_str_modify(str);
1169
+ enc = rb_enc_get(str);
1170
+ ascompat = rb_enc_asciicompat(enc);
1171
+ s = t = RSTRING_PTR(str);
1172
+ send = RSTRING_END(str);
1173
+ cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1174
+ while (s < send)
1208
1175
  {
1209
- while (ptr < end)
1176
+ unsigned int c;
1177
+ int clen;
1178
+
1179
+ if (ascompat && (c = *(unsigned char *)s) < 0x80)
1210
1180
  {
1211
- str_cp = *ptr & 0xff;
1212
- if ((!tst_cp(cps, len, str_cp)) == delete)
1181
+ if (tst_cp(cps, cs_len, c) != delete)
1213
1182
  {
1214
- cs_str_buf_cat(new_str_buf, ptr, 1);
1183
+ if (t != s)
1184
+ *t = c;
1185
+ t++;
1215
1186
  }
1216
- ptr++;
1187
+ s++;
1217
1188
  }
1218
- }
1219
- else // likely to be multibyte string
1220
- {
1221
- while (ptr < end)
1189
+ else
1222
1190
  {
1223
- str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1224
- if ((!tst_cp(cps, len, str_cp)) == delete)
1191
+ c = rb_enc_codepoint_len(s, send, &clen, enc);
1192
+
1193
+ if (tst_cp(cps, cs_len, c) != delete)
1225
1194
  {
1226
- cs_str_buf_cat(new_str_buf, ptr, cp_len);
1195
+ if (t != s)
1196
+ rb_enc_mbcput(c, t, enc);
1197
+ t += clen;
1198
+ if (cr == ENC_CODERANGE_7BIT)
1199
+ cr = ENC_CODERANGE_VALID;
1227
1200
  }
1228
- ptr += cp_len;
1201
+ s += clen;
1229
1202
  }
1230
1203
  }
1231
1204
 
1232
- cs_str_buf_terminate(new_str_buf, str_enc);
1205
+ rb_str_set_len(str, t - RSTRING_PTR(str));
1206
+ ENC_CODERANGE_SET(str, cr);
1233
1207
 
1234
- if (bang)
1235
- {
1236
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1237
- {
1238
- return Qnil;
1239
- }
1240
- rb_str_shared_replace(str, new_str_buf);
1241
- }
1242
- else
1208
+ if (bang && (RSTRING_LEN(str) == (long)orig_str_len)) // string unchanged
1243
1209
  {
1244
- RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
1245
- str = new_str_buf;
1210
+ return Qnil;
1246
1211
  }
1247
1212
 
1248
1213
  return str;
@@ -1284,6 +1249,10 @@ cs_method_allocated_length(VALUE self)
1284
1249
 
1285
1250
  void Init_character_set()
1286
1251
  {
1252
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
1253
+ rb_ext_ractor_safe(true);
1254
+ #endif
1255
+
1287
1256
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
1288
1257
 
1289
1258
  rb_define_alloc_func(cs, cs_method_allocate);
@@ -1338,7 +1307,7 @@ void Init_character_set()
1338
1307
  // `CharacterSet`-specific methods
1339
1308
 
1340
1309
  rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1341
- rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1310
+ rb_define_singleton_method(cs, "of_string", cs_class_method_of_string, 1);
1342
1311
 
1343
1312
  rb_define_method(cs, "ranges", cs_method_ranges, 0);
1344
1313
  rb_define_method(cs, "sample", cs_method_sample, -1);