character_set 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.travis.yml +1 -0
  4. data/BENCHMARK.md +51 -15
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +24 -8
  7. data/Rakefile +20 -18
  8. data/benchmarks/count_in.rb +13 -0
  9. data/benchmarks/delete_in.rb +1 -1
  10. data/benchmarks/scan.rb +13 -0
  11. data/benchmarks/shared.rb +1 -0
  12. data/benchmarks/z_add.rb +12 -0
  13. data/benchmarks/z_delete.rb +12 -0
  14. data/benchmarks/z_merge.rb +15 -0
  15. data/benchmarks/z_minmax.rb +12 -0
  16. data/bin/console +2 -0
  17. data/character_set.gemspec +2 -0
  18. data/ext/character_set/character_set.c +963 -413
  19. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  20. data/lib/character_set/core_ext/string_ext.rb +2 -0
  21. data/lib/character_set/expression_converter.rb +21 -24
  22. data/lib/character_set/predefined_sets.rb +25 -260
  23. data/lib/character_set/predefined_sets/any.cps +1 -0
  24. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  25. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  26. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  27. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  28. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  29. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  30. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  31. data/lib/character_set/predefined_sets/newline.cps +3 -0
  32. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  33. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  34. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  35. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  36. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  37. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  38. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  39. data/lib/character_set/ruby_fallback.rb +0 -2
  40. data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
  41. data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
  42. data/lib/character_set/shared_methods.rb +51 -40
  43. data/lib/character_set/version.rb +1 -1
  44. metadata +54 -3
  45. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 434323b3b99246a17ea5e062afd87d3edc3c09927b2231b4409b295ff63c7d6c
4
- data.tar.gz: 174c6dc751b03e49cf87045fad9a48100460244b7d7e25deef27066bd4aef92c
3
+ metadata.gz: ae7ec84b0727a804bf4d82564e6609fdd0bf070fd0e20c0a5688b579e320bc30
4
+ data.tar.gz: b73dec9fbd4abf83fae5881de89e4e1876e48bcefc3ef935401d5adbeb9c6c8e
5
5
  SHA512:
6
- metadata.gz: d9fa059ea3171209af537f0bd7636e3a65b962f30029ca399fe2fa0bd6168dd692b7bc5fb1014590a830b2e9aede9c26ae00ae8fe4a2eae4a86cf95e208b507d
7
- data.tar.gz: 692f4596b6adc9b44879b69fb82e55dc90d107156ecabb96c14ea91b4dc0c7dc706724b42093d0ef762cdac697f05ef855c5f462451015e1d06022ab06bc1c8d
6
+ metadata.gz: 2b84916c89dcd6a234cc5acedfc604f664a9e285c92b3bae6bade748ad3d9c275fb3307fb5721142e52dbedc9b16da65285a8ebd87cd686b55391f222ef1b4f8
7
+ data.tar.gz: 25147010da0adfd869891d50d51e265c2b4f28e1b0cb70727d9784b11c3944b9a06a9844a2068f529e487028c214f44e2ab60271a9a5730cdd40bb04dd989aaf
data/.gitattributes ADDED
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
data/.travis.yml CHANGED
@@ -5,4 +5,5 @@ rvm:
5
5
  - 2.4
6
6
  - 2.5
7
7
  - 2.6
8
+ - ruby-head
8
9
  - jruby-9.1.9.0
data/BENCHMARK.md CHANGED
@@ -1,46 +1,58 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 2.6.2p47 (2019-03-13 revision 67232) [x86_64-darwin18]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 12253693.8 i/s
7
+ String#count: 1737741.7 i/s - 7.05x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 14058351.9 i/s
13
+ Regexp#match?: 7907608.1 i/s - 1.78x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 13341301.6 i/s
19
+ Regexp#match?: 5187453.3 i/s - 2.57x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2523184.0 i/s
25
+ String#gsub: 225804.7 i/s - 11.17x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1712208.6 i/s
31
+ String#gsub: 278508.8 i/s - 6.15x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2760158.1 i/s
37
+ String#gsub: 232797.7 i/s - 11.86x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1775758.8 i/s
43
+ String#gsub: 217649.9 i/s - 8.16x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2579030.8 i/s
49
+ String#scan: 545107.0 i/s - 4.73x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 13847689.0 i/s
55
+ Regexp#match?: 7533275.2 i/s - 1.84x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
@@ -48,3 +60,27 @@ Detecting emoji in a large string
48
60
  CharacterSet#used_by?: 246527.7 i/s
49
61
  Regexp#match?: 92956.5 i/s - 2.65x slower
50
62
  ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 3102081.7 i/s
67
+ SortedSet#add: 1897464.8 i/s - 1.63x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 3240924.1 i/s
73
+ SortedSet#delete: 2887493.9 i/s - 1.12x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 536.8 i/s
79
+ SortedSet#merge: 12.5 i/s - 42.78x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 4111960.8 i/s
85
+ SortedSet#minmax: 756.4 i/s - 5436.39x slower
86
+ ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## UNRELEASED
8
+
9
+ ## [1.3.0] - 2019-04-26
10
+
11
+ ### Added
12
+ - improved `String` manipulation speed
13
+ - improved initialization and `#merge` speed when passing a large `Range`
14
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
15
+ - before, every set instance required 136 KB for codepoints
16
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
17
+ - `#count_in` and `#scan_in` methods for `String` interaction
18
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
19
+ - conversion methods `#assigned_part`, `#valid_part`
20
+ - sectioning methods `#ascii_part`, `#plane(n)`
21
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
22
+
23
+ ### Fixed
24
+ - `#count` now supports passing an argument or block as usual
25
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
26
+
7
27
  ## [1.2.0] - 2019-04-02
8
28
 
9
29
  ### Added
data/README.md CHANGED
@@ -2,8 +2,11 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
5
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
6
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
7
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+
9
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
10
 
8
11
  Many parts can be used independently, e.g.:
9
12
  - `CharacterSet::Character`
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
49
52
 
50
53
  ### Predefined utility sets
51
54
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
55
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
56
 
54
57
  ```ruby
55
58
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
60
63
 
61
64
  ### Interact with Strings
62
65
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
66
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
67
 
65
68
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
69
 
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
74
  ```
72
75
 
73
76
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
77
+
74
78
  ```ruby
75
79
  string = 'Tüür'
76
80
 
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
88
  string # => ''
85
89
  ```
86
90
 
91
+ `#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
92
+
93
+ ```ruby
94
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
95
+ CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
96
+ ```
97
+
87
98
  There is also a core extension for String interaction.
88
99
  ```ruby
89
100
  require 'character_set/core_ext/string_ext'
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
100
111
 
101
112
  ### Manipulate
102
113
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
114
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
115
 
105
116
  Where appropriate, methods take both chars and codepoints, e.g.:
106
117
 
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
122
133
 
123
134
  # surrogate pair halves are not included by default
124
135
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
136
+ # => #<CharacterSet (size: 1114112)>
126
137
  ```
127
138
 
128
139
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
140
 
130
141
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
142
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
143
  ```
133
144
 
134
145
  ### Write
@@ -157,17 +168,22 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
157
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
169
 
159
170
  # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
171
+ set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
161
172
  ```
162
173
 
163
174
  ### Unicode plane methods
164
175
 
165
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
176
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
166
177
  ```Ruby
178
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
179
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
180
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
181
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
167
182
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
168
183
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
169
184
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
170
185
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
186
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
171
187
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
172
188
  CharacterSet::Character.new('a').plane # => 0
173
189
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -106,27 +113,22 @@ task :sync_casefold_data do
106
113
  hash[from] = to if type == 'C'
107
114
  end.sort
108
115
 
109
- File.open(dst_path, 'w') do |f|
110
- f.puts <<-C
111
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
112
- // -*-C-*-
113
-
114
- typedef struct casefold_mapping {
115
- unsigned long from;
116
- unsigned long to;
117
- } casefold_mapping;
118
-
119
- #define CASEFOLD_COUNT #{mapping.size}
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
120
119
 
121
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
122
- C
123
-
124
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
125
123
 
126
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
127
131
  end
128
-
129
- File.unlink(src_path)
130
132
  end
131
133
 
132
134
  desc 'Run all IPS benchmarks'
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
data/benchmarks/shared.rb CHANGED
@@ -3,6 +3,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ require 'set'
6
7
 
7
8
  def benchmark(caption: nil, cases: {})
8
9
  puts caption
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
data/bin/console CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
@@ -23,6 +23,8 @@ Gem::Specification.new do |s|
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
25
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
+ s.add_development_dependency 'codecov', '~> 0.1'
27
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
26
28
  s.add_development_dependency 'rake', '~> 12.0'
27
29
  s.add_development_dependency 'rake-compiler', '~> 1.0'
28
30
  s.add_development_dependency 'range_compressor', '~> 1.0'
@@ -2,81 +2,180 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "unicode_casefold_table.h"
4
4
 
5
- #define SETBIT(byte_arr, bit) (byte_arr[bit >> 3] |= (1 << (bit & 0x07)))
6
- #define CLRBIT(byte_arr, bit) (byte_arr[bit >> 3] &= ~(1 << (bit & 0x07)))
7
- #define TSTBIT(byte_arr, bit) (byte_arr[bit >> 3] & (1 << (bit & 0x07)))
5
+ #define UNICODE_PLANE_SIZE 0x10000
6
+ #define UNICODE_PLANE_COUNT 17
7
+ #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
8
8
 
9
- typedef char cp_byte;
10
- typedef unsigned long cp_index;
9
+ // start at ascii size
10
+ #define CS_DEFAULT_INITIAL_LEN 128
11
11
 
12
- #define UNICODE_CP_COUNT 0x110000
13
- #define UNICODE_BYTES UNICODE_CP_COUNT / 8
14
- #define UNICODE_PLANE_SIZE 0x10000
15
- #define UNICODE_PLANE_COUNT UNICODE_CP_COUNT / UNICODE_PLANE_SIZE
12
+ typedef char cs_ar;
13
+ typedef unsigned long cs_cp;
14
+
15
+ struct cs_data
16
+ {
17
+ cs_ar *cps;
18
+ cs_cp len;
19
+ };
20
+
21
+ #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
22
+
23
+ static inline void
24
+ add_memspace_for_another_plane(struct cs_data *data)
25
+ {
26
+ data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
27
+ memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
28
+ data->len += UNICODE_PLANE_SIZE;
29
+ }
30
+
31
+ static inline void
32
+ ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
33
+ {
34
+ while (target_cp >= data->len)
35
+ {
36
+ add_memspace_for_another_plane(data);
37
+ }
38
+ }
39
+
40
+ static inline void
41
+ set_cp(struct cs_data *data, cs_cp cp)
42
+ {
43
+ ensure_memsize_fits(data, cp);
44
+ data->cps[cp >> 3] |= (1 << (cp & 0x07));
45
+ }
46
+
47
+ static inline int
48
+ tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
49
+ {
50
+ return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
51
+ }
52
+
53
+ static inline void
54
+ clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
55
+ {
56
+ if (cp < len)
57
+ {
58
+ cps[cp >> 3] &= ~(1 << (cp & 0x07));
59
+ }
60
+ }
16
61
 
17
62
  static void
18
- free_character_set(void* codepoints) {
19
- free(codepoints);
63
+ cs_free(void *ptr)
64
+ {
65
+ struct cs_data *data = ptr;
66
+ ruby_xfree(data->cps);
67
+ ruby_xfree(data);
20
68
  }
21
69
 
22
70
  static size_t
23
- memsize_character_set(const void* codepoints) {
24
- return sizeof(cp_byte) * UNICODE_BYTES;
25
- }
26
-
27
- static const rb_data_type_t
28
- character_set_type = {
29
- .wrap_struct_name = "character_set",
30
- .function = {
31
- .dmark = NULL,
32
- .dfree = free_character_set,
33
- .dsize = memsize_character_set,
34
- },
35
- .data = NULL,
36
- .flags = RUBY_TYPED_FREE_IMMEDIATELY,
71
+ cs_memsize(const void *ptr)
72
+ {
73
+ const struct cs_data *data = ptr;
74
+ return sizeof(*data) + CS_MSIZE(data->len);
75
+ }
76
+
77
+ static const rb_data_type_t cs_type = {
78
+ .wrap_struct_name = "character_set",
79
+ .function = {
80
+ .dmark = NULL,
81
+ .dfree = cs_free,
82
+ .dsize = cs_memsize,
83
+ },
84
+ .data = NULL,
85
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
86
  };
38
87
 
39
- #define FETCH_CODEPOINTS(set, cps)\
40
- TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
88
+ static inline VALUE
89
+ cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
90
+ {
91
+ VALUE cs;
92
+ struct cs_data *data;
93
+ cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
94
+ data->cps = ruby_xmalloc(CS_MSIZE(len));
95
+ memset(data->cps, 0, CS_MSIZE(len));
96
+ data->len = len;
97
+
98
+ if (data_ptr)
99
+ {
100
+ *data_ptr = data;
101
+ }
41
102
 
42
- #define NEW_CHARACTER_SET(klass, cps)\
43
- TypedData_Wrap_Struct(klass, &character_set_type, cps)
103
+ return cs;
104
+ }
44
105
 
45
- static VALUE
46
- method_allocate(VALUE self) {
47
- cp_byte *cp_arr;
48
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
- return NEW_CHARACTER_SET(self, cp_arr);
106
+ static inline VALUE
107
+ cs_alloc(VALUE klass, struct cs_data **data_ptr)
108
+ {
109
+ return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
50
110
  }
51
111
 
52
- #define FOR_EACH_ACTIVE_CODEPOINT(action)\
53
- cp_index cp;\
54
- cp_byte *cps;\
55
- FETCH_CODEPOINTS(self, cps);\
56
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
57
- if (TSTBIT(cps, cp)) { action; }\
58
- }
112
+ static inline struct cs_data *
113
+ cs_fetch_data(VALUE cs)
114
+ {
115
+ struct cs_data *data;
116
+ TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
117
+ return data;
118
+ }
119
+
120
+ static inline cs_ar *
121
+ cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
122
+ {
123
+ struct cs_data *data;
124
+ data = cs_fetch_data(cs);
125
+ *len_ptr = data->len;
126
+ return data->cps;
127
+ }
128
+
129
+ static VALUE
130
+ cs_method_allocate(VALUE self)
131
+ {
132
+ return cs_alloc(self, 0);
133
+ }
134
+
135
+ #define FOR_EACH_ACTIVE_CODEPOINT(action) \
136
+ do \
137
+ { \
138
+ cs_cp cp, len; \
139
+ cs_ar *cps; \
140
+ cps = cs_fetch_cps(self, &len); \
141
+ for (cp = 0; cp < len; cp++) \
142
+ { \
143
+ if (tst_cp(cps, len, cp)) \
144
+ { \
145
+ action; \
146
+ } \
147
+ } \
148
+ } while (0)
59
149
 
60
150
  // ***************************
61
151
  // `Set` compatibility methods
62
152
  // ***************************
63
153
 
64
- static inline VALUE
65
- enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
- cp_index count;
154
+ static inline cs_cp
155
+ cs_active_cp_count(VALUE self)
156
+ {
157
+ cs_cp count;
67
158
  count = 0;
68
159
  FOR_EACH_ACTIVE_CODEPOINT(count++);
69
- return LONG2FIX(count);
160
+ return count;
70
161
  }
71
162
 
72
163
  static VALUE
73
- method_length(VALUE self) {
74
- return enumerator_length(self, 0, 0);
164
+ cs_method_length(VALUE self)
165
+ {
166
+ return LONG2FIX(cs_active_cp_count(self));
167
+ }
168
+
169
+ static inline VALUE
170
+ cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
171
+ {
172
+ return LONG2FIX(cs_active_cp_count(self));
75
173
  }
76
174
 
77
175
  static VALUE
78
- method_each(VALUE self) {
79
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
176
+ cs_method_each(VALUE self)
177
+ {
178
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
80
179
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
180
  return self;
82
181
  }
@@ -84,16 +183,19 @@ method_each(VALUE self) {
84
183
  // returns an Array of codepoint Integers by default.
85
184
  // returns an Array of Strings of length 1 if passed `true`.
86
185
  static VALUE
87
- method_to_a(int argc, VALUE *argv, VALUE self) {
186
+ cs_method_to_a(int argc, VALUE *argv, VALUE self)
187
+ {
88
188
  VALUE arr;
89
189
  rb_encoding *enc;
90
190
  rb_check_arity(argc, 0, 1);
91
191
 
92
192
  arr = rb_ary_new();
93
- if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
193
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
194
+ {
94
195
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
196
  }
96
- else {
197
+ else
198
+ {
97
199
  enc = rb_utf8_encoding();
98
200
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
201
  }
@@ -102,302 +204,472 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
102
204
  }
103
205
 
104
206
  static VALUE
105
- method_empty_p(VALUE self) {
207
+ cs_method_empty_p(VALUE self)
208
+ {
106
209
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
210
  return Qtrue;
108
211
  }
109
212
 
110
213
  static VALUE
111
- method_hash(VALUE self) {
112
- cp_index cp, hash, four_byte_value;
113
- cp_byte *cps;
114
- FETCH_CODEPOINTS(self, cps);
214
+ cs_method_hash(VALUE self)
215
+ {
216
+ cs_cp cp, len, hash, four_byte_value;
217
+ cs_ar *cps;
218
+ cps = cs_fetch_cps(self, &len);
115
219
 
116
220
  hash = 17;
117
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
- if (cp % 32 == 0) {
119
- if (cp != 0) { hash = hash * 23 + four_byte_value; }
221
+ for (cp = 0; cp < len; cp++)
222
+ {
223
+ if (cp % 32 == 0)
224
+ {
225
+ if (cp != 0)
226
+ {
227
+ hash = hash * 23 + four_byte_value;
228
+ }
120
229
  four_byte_value = 0;
121
230
  }
122
- if (TSTBIT(cps, cp)) four_byte_value++;
231
+ if (tst_cp(cps, len, cp))
232
+ {
233
+ four_byte_value++;
234
+ }
123
235
  }
124
236
 
125
237
  return LONG2FIX(hash);
126
238
  }
127
239
 
128
240
  static inline VALUE
129
- delete_if_block_result(VALUE self, int truthy) {
241
+ cs_delete_if_block_result(VALUE self, int truthy)
242
+ {
130
243
  VALUE result;
131
244
  rb_need_block();
132
245
  rb_check_frozen(self);
133
246
  FOR_EACH_ACTIVE_CODEPOINT(
134
- result = rb_yield(LONG2FIX(cp));
135
- if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
- );
247
+ result = rb_yield(LONG2FIX(cp));
248
+ if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
137
249
  return self;
138
250
  }
139
251
 
140
252
  static VALUE
141
- method_delete_if(VALUE self) {
142
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
- return delete_if_block_result(self, 1);
253
+ cs_method_delete_if(VALUE self)
254
+ {
255
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
256
+ return cs_delete_if_block_result(self, 1);
144
257
  }
145
258
 
146
259
  static VALUE
147
- method_keep_if(VALUE self) {
148
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
- return delete_if_block_result(self, 0);
260
+ cs_method_keep_if(VALUE self)
261
+ {
262
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
263
+ return cs_delete_if_block_result(self, 0);
150
264
  }
151
265
 
152
266
  static VALUE
153
- method_clear(VALUE self) {
154
- cp_index cp;
155
- cp_byte *cps;
267
+ cs_method_clear(VALUE self)
268
+ {
269
+ struct cs_data *data;
156
270
  rb_check_frozen(self);
157
- FETCH_CODEPOINTS(self, cps);
158
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
- CLRBIT(cps, cp);
160
- }
271
+ data = cs_fetch_data(self);
272
+ memset(data->cps, 0, CS_MSIZE(data->len));
161
273
  return self;
162
274
  }
163
275
 
164
- #define RETURN_NEW_SET_BASED_ON(condition)\
165
- cp_index cp;\
166
- cp_byte *a, *b, *new_cps;\
167
- FETCH_CODEPOINTS(self, a);\
168
- if (other) FETCH_CODEPOINTS(other, b);\
169
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
170
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
171
- if (condition) SETBIT(new_cps, cp);\
172
- }\
173
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\
276
+ static VALUE
277
+ cs_method_min(VALUE self)
278
+ {
279
+ FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
280
+ return Qnil;
281
+ }
282
+
283
+ static VALUE
284
+ cs_method_max(VALUE self)
285
+ {
286
+ cs_cp len;
287
+ long reverse_idx;
288
+ cs_ar *cps;
289
+ cps = cs_fetch_cps(self, &len);
290
+ for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
291
+ {
292
+ if (tst_cp(cps, len, reverse_idx))
293
+ {
294
+ return LONG2FIX(reverse_idx);
295
+ }
296
+ }
297
+ return Qnil;
298
+ }
299
+
300
+ static VALUE
301
+ cs_method_minmax(VALUE self)
302
+ {
303
+ VALUE arr;
304
+ arr = rb_ary_new2(2);
305
+ rb_ary_push(arr, cs_method_min(self));
306
+ rb_ary_push(arr, cs_method_max(self));
307
+ return arr;
308
+ }
309
+
310
+ #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
311
+ do \
312
+ { \
313
+ VALUE new_cs; \
314
+ cs_cp cp, alen, blen; \
315
+ cs_ar *acps, *bcps; \
316
+ struct cs_data *new_data; \
317
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
318
+ acps = cs_fetch_cps(cs_a, &alen); \
319
+ bcps = cs_fetch_cps(cs_b, &blen); \
320
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
321
+ { \
322
+ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
323
+ { \
324
+ set_cp(new_data, cp); \
325
+ } \
326
+ } \
327
+ return new_cs; \
328
+ } while (0)
174
329
 
175
330
  static VALUE
176
- method_intersection(VALUE self, VALUE other) {
177
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
331
+ cs_method_intersection(VALUE self, VALUE other)
332
+ {
333
+ RETURN_COMBINED_CS(self, other, &&);
178
334
  }
179
335
 
180
336
  static VALUE
181
- method_exclusion(VALUE self, VALUE other) {
182
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
337
+ cs_method_exclusion(VALUE self, VALUE other)
338
+ {
339
+ RETURN_COMBINED_CS(self, other, ^);
183
340
  }
184
341
 
185
342
  static VALUE
186
- method_union(VALUE self, VALUE other) {
187
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
343
+ cs_method_union(VALUE self, VALUE other)
344
+ {
345
+ RETURN_COMBINED_CS(self, other, ||);
188
346
  }
189
347
 
190
348
  static VALUE
191
- method_difference(VALUE self, VALUE other) {
192
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
349
+ cs_method_difference(VALUE self, VALUE other)
350
+ {
351
+ RETURN_COMBINED_CS(self, other, >);
193
352
  }
194
353
 
195
354
  static VALUE
196
- method_include_p(VALUE self, VALUE num) {
197
- cp_byte *cps;
198
- FETCH_CODEPOINTS(self, cps);
199
- return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
355
+ cs_method_include_p(VALUE self, VALUE num)
356
+ {
357
+ cs_ar *cps;
358
+ cs_cp len;
359
+ cps = cs_fetch_cps(self, &len);
360
+ return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
361
  }
201
362
 
202
- static inline int
203
- toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
- cp_index cp;
205
- cp_byte *cps;
206
- rb_check_frozen(set);
207
- FETCH_CODEPOINTS(set, cps);
363
+ static inline VALUE
364
+ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
365
+ {
366
+ cs_cp cp, len;
367
+ cs_ar *cps;
368
+ struct cs_data *data;
369
+ rb_check_frozen(cs);
370
+ data = cs_fetch_data(cs);
371
+ cps = data->cps;
372
+ len = data->len;
208
373
  cp = FIX2ULONG(cp_num);
209
- if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
- return 0;
374
+ if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
375
+ {
376
+ return Qnil;
211
377
  }
212
- else {
213
- if (on) { SETBIT(cps, cp); }
214
- else { CLRBIT(cps, cp); }
215
- return 1;
378
+ else
379
+ {
380
+ if (on)
381
+ {
382
+ set_cp(data, cp);
383
+ }
384
+ else
385
+ {
386
+ clr_cp(cps, len, cp);
387
+ }
388
+ return cs;
216
389
  }
217
390
  }
218
391
 
219
392
  static VALUE
220
- method_add(VALUE self, VALUE cp_num) {
221
- return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
393
+ cs_method_add(VALUE self, VALUE cp_num)
394
+ {
395
+ return cs_toggle_codepoint(self, cp_num, 1, 0);
222
396
  }
223
397
 
224
398
  static VALUE
225
- method_add_p(VALUE self, VALUE cp_num) {
226
- return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
399
+ cs_method_add_p(VALUE self, VALUE cp_num)
400
+ {
401
+ return cs_toggle_codepoint(self, cp_num, 1, 1);
227
402
  }
228
403
 
229
404
  static VALUE
230
- method_delete(VALUE self, VALUE cp_num) {
231
- return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
405
+ cs_method_delete(VALUE self, VALUE cp_num)
406
+ {
407
+ return cs_toggle_codepoint(self, cp_num, 0, 0);
232
408
  }
233
409
 
234
410
  static VALUE
235
- method_delete_p(VALUE self, VALUE cp_num) {
236
- return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
411
+ cs_method_delete_p(VALUE self, VALUE cp_num)
412
+ {
413
+ return cs_toggle_codepoint(self, cp_num, 0, 1);
237
414
  }
238
415
 
239
- #define COMPARE_SETS(action)\
240
- cp_index cp;\
241
- cp_byte *cps, *other_cps;\
242
- FETCH_CODEPOINTS(self, cps);\
243
- FETCH_CODEPOINTS(other, other_cps);\
244
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
245
-
246
416
  static VALUE
247
- method_intersect_p(VALUE self, VALUE other) {
248
- COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
417
+ cs_method_intersect_p(VALUE self, VALUE other)
418
+ {
419
+ cs_cp cp, alen, blen;
420
+ cs_ar *acps, *bcps;
421
+ acps = cs_fetch_cps(self, &alen);
422
+ bcps = cs_fetch_cps(other, &blen);
423
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
424
+ {
425
+ if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
426
+ {
427
+ return Qtrue;
428
+ }
429
+ }
249
430
  return Qfalse;
250
431
  }
251
432
 
252
433
  static VALUE
253
- method_disjoint_p(VALUE self, VALUE other) {
254
- return method_intersect_p(self, other) ? Qfalse : Qtrue;
434
+ cs_method_disjoint_p(VALUE self, VALUE other)
435
+ {
436
+ return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
255
437
  }
256
438
 
257
439
  static inline int
258
- is_character_set(VALUE obj) {
259
- return rb_typeddata_is_kind_of(obj, &character_set_type);
440
+ cs_check_type(VALUE obj)
441
+ {
442
+ return rb_typeddata_is_kind_of(obj, &cs_type);
260
443
  }
261
444
 
262
445
  static VALUE
263
- method_eql_p(VALUE self, VALUE other) {
264
- if (!is_character_set(other)) return Qfalse;
265
- if (self == other) return Qtrue; // same object_id
266
-
267
- COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
-
446
+ cs_cps_eql(VALUE cs_a, VALUE cs_b)
447
+ {
448
+ cs_cp cp, alen, blen;
449
+ cs_ar *acps, *bcps;
450
+ acps = cs_fetch_cps(cs_a, &alen);
451
+ bcps = cs_fetch_cps(cs_b, &blen);
452
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
453
+ {
454
+ if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
455
+ {
456
+ return Qfalse;
457
+ }
458
+ }
269
459
  return Qtrue;
270
460
  }
271
461
 
462
+ static VALUE
463
+ cs_method_eql_p(VALUE self, VALUE other)
464
+ {
465
+ if (!cs_check_type(other))
466
+ {
467
+ return Qfalse;
468
+ }
469
+ if (self == other) // same object_id
470
+ {
471
+ return Qtrue;
472
+ }
473
+ return cs_cps_eql(self, other);
474
+ }
475
+
272
476
  static inline VALUE
273
- merge_character_set(VALUE self, VALUE other) {
274
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
- return self;
477
+ cs_merge_cs(VALUE recipient, VALUE source)
478
+ {
479
+ cs_cp cp, source_len;
480
+ struct cs_data *data;
481
+ cs_ar *source_cps;
482
+ data = cs_fetch_data(recipient);
483
+ source_cps = cs_fetch_cps(source, &source_len);
484
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
485
+ {
486
+ if (tst_cp(source_cps, source_len, cp))
487
+ {
488
+ set_cp(data, cp);
489
+ }
490
+ }
491
+ return recipient;
276
492
  }
277
493
 
278
- static inline void
279
- raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
- if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
494
+ static inline cs_cp
495
+ cs_checked_cp(VALUE object_id)
496
+ {
497
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
498
+ {
499
+ return FIX2ULONG(object_id);
500
+ }
281
501
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
502
  }
283
503
 
284
504
  static inline VALUE
285
- merge_rb_range(VALUE self, VALUE rb_range) {
505
+ cs_merge_rb_range(VALUE self, VALUE rb_range)
506
+ {
286
507
  VALUE from_id, upto_id;
508
+ cs_cp from_cp, upto_cp, cont_len, rem;
287
509
  int excl;
288
- cp_index cp;
289
- cp_byte *cps;
290
- FETCH_CODEPOINTS(self, cps);
510
+ struct cs_data *data;
511
+ data = cs_fetch_data(self);
291
512
 
292
- if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
513
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
514
+ {
293
515
  rb_raise(rb_eArgError, "pass a Range");
294
516
  }
295
- if (excl) upto_id -= 2;
517
+ if (excl)
518
+ {
519
+ upto_id -= 2;
520
+ }
521
+
522
+ from_cp = cs_checked_cp(from_id);
523
+ upto_cp = cs_checked_cp(upto_id);
296
524
 
297
- raise_arg_err_unless_valid_as_cp(from_id);
298
- raise_arg_err_unless_valid_as_cp(upto_id);
525
+ if (upto_cp > from_cp && (upto_cp - from_cp > 6))
526
+ {
527
+ // set bits in preceding partially toggled bytes individually
528
+ for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
529
+ {
530
+ set_cp(data, from_cp);
531
+ }
532
+ // memset contiguous bits directly
533
+ cont_len = upto_cp - from_cp + 1;
534
+ rem = cont_len % 8;
535
+ ensure_memsize_fits(data, upto_cp);
536
+ memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
537
+ from_cp = upto_cp - rem + 1;
538
+ }
299
539
 
300
- for (/* */; from_id <= upto_id; from_id += 2) {
301
- cp = FIX2ULONG(from_id);
302
- SETBIT(cps, cp);
540
+ // set bits in partially toggled bytes individually
541
+ for (/* */; from_cp <= upto_cp; from_cp++)
542
+ {
543
+ set_cp(data, from_cp);
303
544
  }
545
+
304
546
  return self;
305
547
  }
306
548
 
307
549
  static inline VALUE
308
- merge_rb_array(VALUE self, VALUE rb_array) {
309
- VALUE el;
310
- cp_byte *cps;
311
- VALUE array_length, i;
312
- FETCH_CODEPOINTS(self, cps);
550
+ cs_merge_rb_array(VALUE self, VALUE rb_array)
551
+ {
552
+ VALUE el, array_length, i;
553
+ struct cs_data *data;
313
554
  Check_Type(rb_array, T_ARRAY);
555
+ data = cs_fetch_data(self);
314
556
  array_length = RARRAY_LEN(rb_array);
315
- for (i = 0; i < array_length; i++) {
557
+ for (i = 0; i < array_length; i++)
558
+ {
316
559
  el = RARRAY_AREF(rb_array, i);
317
- raise_arg_err_unless_valid_as_cp(el);
318
- SETBIT(cps, FIX2ULONG(el));
560
+ set_cp(data, cs_checked_cp(el));
319
561
  }
320
562
  return self;
321
563
  }
322
564
 
323
565
  static VALUE
324
- method_merge(VALUE self, VALUE other) {
566
+ cs_method_merge(VALUE self, VALUE other)
567
+ {
325
568
  rb_check_frozen(self);
326
- if (is_character_set(other)) {
327
- return merge_character_set(self, other);
569
+ if (cs_check_type(other))
570
+ {
571
+ return cs_merge_cs(self, other);
328
572
  }
329
- else if (TYPE(other) == T_ARRAY) {
330
- return merge_rb_array(self, other);
573
+ else if (TYPE(other) == T_ARRAY)
574
+ {
575
+ return cs_merge_rb_array(self, other);
331
576
  }
332
- return merge_rb_range(self, other);
577
+ return cs_merge_rb_range(self, other);
333
578
  }
334
579
 
335
580
  static VALUE
336
- method_initialize_copy(VALUE self, VALUE other) {
337
- merge_character_set(self, other);
338
- return other;
581
+ cs_method_initialize_copy(VALUE self, VALUE orig)
582
+ {
583
+ cs_merge_cs(self, orig);
584
+ return self;
339
585
  }
340
586
 
341
587
  static VALUE
342
- method_subtract(VALUE self, VALUE other) {
588
+ cs_method_subtract(VALUE self, VALUE other)
589
+ {
590
+ cs_cp cp, len, other_len;
591
+ cs_ar *cps, *other_cps;
343
592
  rb_check_frozen(self);
344
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
593
+ cps = cs_fetch_cps(self, &len);
594
+ other_cps = cs_fetch_cps(other, &other_len);
595
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
596
+ {
597
+ if (tst_cp(other_cps, other_len, cp))
598
+ {
599
+ clr_cp(cps, len, cp);
600
+ }
601
+ }
345
602
  return self;
346
603
  }
347
604
 
348
605
  static inline int
349
- a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
- cp_byte *cps_a, *cps_b;
351
- cp_index cp, size_a, size_b;
606
+ cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
607
+ {
608
+ cs_ar *a, *b;
609
+ cs_cp cp, alen, blen, count_a, count_b;
352
610
 
353
- if (!is_character_set(set_a) || !is_character_set(set_b)) {
611
+ if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
612
+ {
354
613
  rb_raise(rb_eArgError, "pass a CharacterSet");
355
614
  }
356
615
 
357
- FETCH_CODEPOINTS(set_a, cps_a);
358
- FETCH_CODEPOINTS(set_b, cps_b);
359
-
360
- *is_proper = 0;
361
- size_a = 0;
362
- size_b = 0;
363
-
364
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
- if (TSTBIT(cps_a, cp)) {
366
- if (!TSTBIT(cps_b, cp)) return 0;
367
- size_a++;
368
- size_b++;
616
+ a = cs_fetch_cps(cs_a, &alen);
617
+ b = cs_fetch_cps(cs_b, &blen);
618
+
619
+ count_a = 0;
620
+ count_b = 0;
621
+
622
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
623
+ {
624
+ if (tst_cp(a, alen, cp))
625
+ {
626
+ if (!tst_cp(b, blen, cp))
627
+ {
628
+ return 0;
629
+ }
630
+ count_a++;
631
+ count_b++;
632
+ }
633
+ else if (tst_cp(b, blen, cp))
634
+ {
635
+ count_b++;
369
636
  }
370
- else if (TSTBIT(cps_b, cp)) size_b++;
371
637
  }
372
638
 
373
- if (size_b > size_a) *is_proper = 1;
639
+ if (is_proper_ptr)
640
+ {
641
+ *is_proper_ptr = count_b > count_a;
642
+ }
643
+
374
644
  return 1;
375
645
  }
376
646
 
377
647
  static VALUE
378
- method_subset_p(VALUE self, VALUE other) {
379
- int is_proper;
380
- return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
648
+ cs_method_subset_p(VALUE self, VALUE other)
649
+ {
650
+ return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
381
651
  }
382
652
 
383
653
  static VALUE
384
- method_proper_subset_p(VALUE self, VALUE other) {
385
- int is, is_proper;
386
- is = a_subset_of_b(self, other, &is_proper);
387
- return (is && is_proper) ? Qtrue : Qfalse;
654
+ cs_method_proper_subset_p(VALUE self, VALUE other)
655
+ {
656
+ int is_subset, is_proper;
657
+ is_subset = cs_a_subset_of_b(self, other, &is_proper);
658
+ return (is_subset && is_proper) ? Qtrue : Qfalse;
388
659
  }
389
660
 
390
661
  static VALUE
391
- method_superset_p(VALUE self, VALUE other) {
392
- int is_proper;
393
- return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
662
+ cs_method_superset_p(VALUE self, VALUE other)
663
+ {
664
+ return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
394
665
  }
395
666
 
396
667
  static VALUE
397
- method_proper_superset_p(VALUE self, VALUE other) {
398
- int is, is_proper;
399
- is = a_subset_of_b(other, self, &is_proper);
400
- return (is && is_proper) ? Qtrue : Qfalse;
668
+ cs_method_proper_superset_p(VALUE self, VALUE other)
669
+ {
670
+ int is_superset, is_proper;
671
+ is_superset = cs_a_subset_of_b(other, self, &is_proper);
672
+ return (is_superset && is_proper) ? Qtrue : Qfalse;
401
673
  }
402
674
 
403
675
  // *******************************
@@ -405,42 +677,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
405
677
  // *******************************
406
678
 
407
679
  static VALUE
408
- class_method_from_ranges(VALUE self, VALUE ranges) {
409
- VALUE new_set, range_count, i;
410
- new_set = rb_class_new_instance(0, 0, self);
680
+ cs_class_method_from_ranges(VALUE self, VALUE ranges)
681
+ {
682
+ VALUE new_cs, range_count, i;
683
+ new_cs = rb_class_new_instance(0, 0, self);
411
684
  range_count = RARRAY_LEN(ranges);
412
- for (i = 0; i < range_count; i++) {
413
- merge_rb_range(new_set, RARRAY_AREF(ranges, i));
685
+ for (i = 0; i < range_count; i++)
686
+ {
687
+ cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
414
688
  }
415
- return new_set;
689
+ return new_cs;
416
690
  }
417
691
 
418
692
  static VALUE
419
- method_ranges(VALUE self) {
420
- VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
693
+ cs_method_ranges(VALUE self)
694
+ {
695
+ VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
421
696
 
422
697
  ranges = rb_ary_new();
423
- previous_codepoint = 0;
698
+ previous_cp_num = 0;
424
699
  current_start = 0;
425
700
  current_end = 0;
426
701
 
427
702
  FOR_EACH_ACTIVE_CODEPOINT(
428
- codepoint = LONG2FIX(cp);
703
+ cp_num = LONG2FIX(cp);
429
704
 
430
- if (!previous_codepoint) {
431
- current_start = codepoint;
432
- }
433
- else if (previous_codepoint + 2 != codepoint) {
434
- // gap found, finalize previous range
435
- rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
- current_start = codepoint;
437
- }
438
- current_end = codepoint;
439
- previous_codepoint = codepoint;
440
- );
705
+ if (!previous_cp_num) {
706
+ current_start = cp_num;
707
+ } else if (previous_cp_num + 2 != cp_num) {
708
+ // gap found, finalize previous range
709
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
710
+ current_start = cp_num;
711
+ } current_end = cp_num;
712
+ previous_cp_num = cp_num;);
441
713
 
442
714
  // add final range
443
- if (current_start) {
715
+ if (current_start)
716
+ {
444
717
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
718
  }
446
719
 
@@ -448,117 +721,233 @@ method_ranges(VALUE self) {
448
721
  }
449
722
 
450
723
  static VALUE
451
- method_sample(int argc, VALUE *argv, VALUE self) {
452
- VALUE to_a_args[1], array;
724
+ cs_method_sample(int argc, VALUE *argv, VALUE self)
725
+ {
726
+ VALUE array, to_a_args[1] = {Qtrue};
453
727
  rb_check_arity(argc, 0, 1);
454
- to_a_args[0] = Qtrue;
455
- array = method_to_a(1, to_a_args, self);
728
+ array = cs_method_to_a(1, to_a_args, self);
456
729
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
730
  }
458
731
 
459
732
  static inline VALUE
460
- new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
- cp_byte *cps, *new_cps;
462
- cp_index cp;
463
- FETCH_CODEPOINTS(set, cps);
464
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
- for (cp = from; cp <= upto; cp++) {
466
- if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
733
+ cs_from_section(VALUE set, cs_cp from, cs_cp upto)
734
+ {
735
+ VALUE new_cs;
736
+ cs_ar *cps;
737
+ cs_cp cp, len;
738
+ struct cs_data *new_data;
739
+ new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
740
+ cps = cs_fetch_cps(set, &len);
741
+ for (cp = from; cp <= upto; cp++)
742
+ {
743
+ if (tst_cp(cps, len, cp))
744
+ {
745
+ set_cp(new_data, cp);
746
+ }
467
747
  }
468
- return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
748
+ return new_cs;
469
749
  }
470
750
 
471
751
  static VALUE
472
- method_bmp_part(VALUE self) {
473
- return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
752
+ cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
753
+ {
754
+ return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
755
+ }
756
+
757
+ static inline cs_cp
758
+ cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
759
+ {
760
+ cs_ar *cps;
761
+ cs_cp cp, count, len;
762
+ cps = cs_fetch_cps(set, &len);
763
+ for (count = 0, cp = from; cp <= upto; cp++)
764
+ {
765
+ if (tst_cp(cps, len, cp))
766
+ {
767
+ count++;
768
+ }
769
+ }
770
+ return count;
474
771
  }
475
772
 
476
773
  static VALUE
477
- method_astral_part(VALUE self) {
478
- return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
774
+ cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
775
+ {
776
+ cs_cp count;
777
+ count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
778
+ return LONG2FIX(count);
479
779
  }
480
780
 
481
781
  static inline VALUE
482
- set_has_member_in_plane(VALUE set, unsigned int plane) {
483
- cp_byte *cps;
484
- cp_index cp, max_cp;
485
- FETCH_CODEPOINTS(set, cps);
486
- cp = plane * UNICODE_PLANE_SIZE;
487
- max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
- for (/* */; cp <= max_cp; cp++) {
489
- if (TSTBIT(cps, cp)) return Qtrue;
782
+ cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
783
+ {
784
+ cs_cp cp;
785
+ for (cp = from; cp <= upto; cp++)
786
+ {
787
+ if (tst_cp(cps, len, cp))
788
+ {
789
+ return Qtrue;
790
+ }
490
791
  }
491
792
  return Qfalse;
492
793
  }
493
794
 
494
795
  static VALUE
495
- method_planes(VALUE self) {
796
+ cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
797
+ {
798
+ cs_ar *cps;
799
+ cs_cp len;
800
+ cps = cs_fetch_cps(self, &len);
801
+ return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
802
+ }
803
+
804
+ static inline VALUE
805
+ cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
806
+ {
807
+ double section_count, total_count;
808
+ section_count = (double)cs_active_cp_count_in_section(set, from, upto);
809
+ total_count = (double)cs_active_cp_count(set);
810
+ return DBL2NUM(section_count / total_count);
811
+ }
812
+
813
+ static VALUE
814
+ cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
815
+ {
816
+ return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
817
+ }
818
+
// Codepoint boundaries
#define MAX_CP 0x10FFFF        // highest codepoint
#define MAX_ASCII_CP 0x7F      // last ASCII codepoint
#define MAX_BMP_CP 0xFFFF      // last codepoint of the basic multilingual plane
#define MIN_ASTRAL_CP 0x10000  // first codepoint after the BMP
823
+
824
+ static inline VALUE
825
+ cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
826
+ {
827
+ cs_cp plane_beg, plane_end;
828
+ plane_beg = plane * UNICODE_PLANE_SIZE;
829
+ plane_end = (plane + 1) * MAX_BMP_CP;
830
+ return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
831
+ }
832
+
833
+ static VALUE
834
+ cs_method_planes(VALUE self)
835
+ {
836
+ cs_ar *cps;
837
+ cs_cp len;
496
838
  unsigned int i;
497
839
  VALUE planes;
840
+ cps = cs_fetch_cps(self, &len);
498
841
  planes = rb_ary_new();
499
- for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
- if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
842
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++)
843
+ {
844
+ if (cs_has_cp_in_plane(cps, len, i))
845
+ {
846
+ rb_ary_push(planes, INT2FIX(i));
847
+ }
501
848
  }
502
849
  return planes;
503
850
  }
504
851
 
505
- static VALUE
506
- method_member_in_plane_p(VALUE self, VALUE plane_num) {
852
+ static inline int
853
+ cs_valid_plane_num(VALUE num)
854
+ {
507
855
  int plane;
508
- Check_Type(plane_num, T_FIXNUM);
509
- plane = FIX2INT(plane_num);
510
- if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
- rb_raise(rb_eArgError, "plane must be between 0 and 16");
856
+ Check_Type(num, T_FIXNUM);
857
+ plane = FIX2INT(num);
858
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
859
+ {
860
+ rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
512
861
  }
513
- return set_has_member_in_plane(self, plane);
862
+ return plane;
863
+ }
864
+
865
+ static VALUE
866
+ cs_method_plane(VALUE self, VALUE plane_num)
867
+ {
868
+ cs_cp plane, plane_beg, plane_end;
869
+ plane = cs_valid_plane_num(plane_num);
870
+ plane_beg = plane * UNICODE_PLANE_SIZE;
871
+ plane_end = (plane + 1) * MAX_BMP_CP;
872
+ return cs_from_section(self, plane_beg, plane_end);
873
+ }
874
+
875
+ static VALUE
876
+ cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
877
+ {
878
+ cs_ar *cps;
879
+ cs_cp len;
880
+ unsigned int plane;
881
+ plane = cs_valid_plane_num(plane_num);
882
+ cps = cs_fetch_cps(self, &len);
883
+ return cs_has_cp_in_plane(cps, len, plane);
514
884
  }
515
885
 
// True unless `cp` lies in the surrogate range 0xD800..0xDFFF.
// The argument is parenthesized so that expression arguments (e.g. `x & m`)
// are not torn apart by operator precedence; it is evaluated up to twice.
#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)
517
887
 
518
888
  static VALUE
519
- method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
- int include_surrogates;
521
- cp_index upto;
522
- VALUE other;
523
- other = 0;
889
+ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
890
+ {
891
+ int inc_surr;
892
+ cs_cp upto, cp, len;
893
+ cs_ar *cps;
894
+ VALUE new_cs;
895
+ struct cs_data *new_data;
896
+
524
897
  rb_check_arity(argc, 0, 2);
525
- include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
- if ((argc > 1) && FIXNUM_P(argv[1])) {
527
- upto = FIX2ULONG(argv[1]);
528
- RETURN_NEW_SET_BASED_ON(
529
- cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
- );
898
+
899
+ cps = cs_fetch_cps(self, &len);
900
+ inc_surr = argc && argv[0] == Qtrue;
901
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
902
+ upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
903
+
904
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
905
+ {
906
+ if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
907
+ {
908
+ set_cp(new_data, cp);
909
+ }
531
910
  }
532
- RETURN_NEW_SET_BASED_ON(
533
- !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
- );
911
+
912
+ return new_cs;
535
913
  }
536
914
 
537
- typedef int(*str_cp_handler)(unsigned int, cp_byte*);
915
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
538
916
 
539
917
  static inline int
540
- add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
- SETBIT(cp_arr, str_cp);
918
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
919
+ {
920
+ set_cp(data, str_cp);
542
921
  return 1;
543
922
  }
544
923
 
545
924
  static VALUE
546
- method_case_insensitive(VALUE self) {
547
- cp_index i;
548
- cp_byte *new_cps;
549
-
550
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
925
+ cs_method_case_insensitive(VALUE self)
926
+ {
927
+ cs_cp i, len;
928
+ cs_ar *cps;
929
+ VALUE new_cs;
930
+ struct cs_data *new_data;
551
931
 
552
- FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
932
+ cps = cs_fetch_cps(self, &len);
933
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
934
+ cs_merge_cs(new_cs, self);
553
935
 
554
- for (i = 0; i < CASEFOLD_COUNT; i++) {
936
+ for (i = 0; i < CASEFOLD_COUNT; i++)
937
+ {
555
938
  casefold_mapping m = unicode_casefold_table[i];
556
939
 
557
- if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
- else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
940
+ if (tst_cp(cps, len, m.from))
941
+ {
942
+ set_cp(new_data, m.to);
943
+ }
944
+ else if (tst_cp(cps, len, m.to))
945
+ {
946
+ set_cp(new_data, m.from);
947
+ }
559
948
  }
560
949
 
561
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
950
+ return new_cs;
562
951
 
563
952
  // OnigCaseFoldType flags;
564
953
  // rb_encoding *enc;
@@ -573,20 +962,27 @@ method_case_insensitive(VALUE self) {
573
962
  }
574
963
 
575
964
  static inline VALUE
576
- each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
- long i;
965
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
966
+ {
967
+ long i, str_len;
578
968
  unsigned int str_cp;
969
+ str_len = RSTRING_LEN(str);
579
970
 
580
- for (i = 0; i < RSTRING_LEN(str); i++) {
971
+ for (i = 0; i < str_len; i++)
972
+ {
581
973
  str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
974
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
975
+ {
976
+ return Qfalse;
977
+ }
583
978
  }
584
979
 
585
980
  return Qtrue;
586
981
  }
587
982
 
588
983
  static inline VALUE
589
- each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
984
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
985
+ {
590
986
  int n;
591
987
  unsigned int str_cp;
592
988
  const char *ptr, *end;
@@ -597,9 +993,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
597
993
  end = RSTRING_END(str);
598
994
  enc = rb_enc_get(str);
599
995
 
600
- while (ptr < end) {
996
+ while (ptr < end)
997
+ {
601
998
  str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
999
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
1000
+ {
1001
+ return Qfalse;
1002
+ }
603
1003
  ptr += n;
604
1004
  }
605
1005
 
@@ -611,105 +1011,238 @@ static inline int
611
1011
  single_byte_optimizable(VALUE str)
612
1012
  {
613
1013
  rb_encoding *enc;
614
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
1014
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
1015
+ {
1016
+ return 1;
1017
+ }
615
1018
 
616
1019
  enc = rb_enc_get(str);
617
- if (rb_enc_mbmaxlen(enc) == 1) return 1;
1020
+ if (rb_enc_mbmaxlen(enc) == 1)
1021
+ {
1022
+ return 1;
1023
+ }
618
1024
 
619
1025
  return 0;
620
1026
  }
621
1027
 
622
1028
  static inline VALUE
623
- each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
- if (single_byte_optimizable(str)) {
625
- return each_sb_cp(str, func, cp_arr);
1029
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1030
+ {
1031
+ if (single_byte_optimizable(str))
1032
+ {
1033
+ return each_sb_cp(str, func, cp_arr, len, data, memo);
626
1034
  }
627
- return each_mb_cp(str, func, cp_arr);
1035
+ return each_mb_cp(str, func, cp_arr, len, data, memo);
628
1036
  }
629
1037
 
630
1038
  static inline void
631
- raise_arg_err_unless_string(VALUE val) {
632
- if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
1039
+ raise_arg_err_unless_string(VALUE val)
1040
+ {
1041
+ if (!RB_TYPE_P(val, T_STRING))
1042
+ {
1043
+ rb_raise(rb_eArgError, "pass a String");
1044
+ }
633
1045
  }
634
1046
 
635
1047
  static VALUE
636
- class_method_of(VALUE self, VALUE str) {
637
- cp_byte *cp_arr;
1048
+ cs_class_method_of(VALUE self, VALUE str)
1049
+ {
1050
+ VALUE new_cs;
1051
+ struct cs_data *new_data;
1052
+ new_cs = cs_alloc(self, &new_data);
638
1053
  raise_arg_err_unless_string(str);
639
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
- each_cp(str, add_str_cp_to_arr, cp_arr);
641
- return NEW_CHARACTER_SET(self, cp_arr);
1054
+ each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1055
+ return new_cs;
642
1056
  }
643
1057
 
644
1058
  static inline int
645
- str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
- return !TSTBIT(cp_arr, str_cp);
1059
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1060
+ {
1061
+ if (tst_cp(cp_arr, len, str_cp))
1062
+ {
1063
+ *memo += 1;
1064
+ }
1065
+ return 1;
647
1066
  }
648
1067
 
649
1068
  static VALUE
650
- method_used_by_p(VALUE self, VALUE str) {
651
- cp_byte *cps;
652
- VALUE only_uses_other_cps;
1069
+ cs_method_count_in(VALUE self, VALUE str)
1070
+ {
1071
+ VALUE count;
1072
+ struct cs_data *data;
653
1073
  raise_arg_err_unless_string(str);
654
- FETCH_CODEPOINTS(self, cps);
655
- only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
- return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1074
+ data = cs_fetch_data(self);
1075
+ count = 0;
1076
+ each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1077
+ return INT2NUM(count);
1078
+ }
1079
+
1080
+ static inline int
1081
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1082
+ {
1083
+ return tst_cp(cp_arr, len, str_cp);
1084
+ }
1085
+
1086
+ static VALUE
1087
+ cs_method_cover_p(VALUE self, VALUE str)
1088
+ {
1089
+ struct cs_data *data;
1090
+ raise_arg_err_unless_string(str);
1091
+ data = cs_fetch_data(self);
1092
+ return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
1093
+ }
1094
+
1095
+ static inline int
1096
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1097
+ {
1098
+ if (tst_cp(cp_arr, len, str_cp))
1099
+ {
1100
+ rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1101
+ }
1102
+ return 1;
1103
+ }
1104
+
1105
+ static VALUE
1106
+ cs_method_scan(VALUE self, VALUE str)
1107
+ {
1108
+ VALUE memo[2];
1109
+ struct cs_data *data;
1110
+ raise_arg_err_unless_string(str);
1111
+ data = cs_fetch_data(self);
1112
+ memo[0] = rb_ary_new();
1113
+ memo[1] = (VALUE)rb_enc_get(str);
1114
+ each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1115
+ return memo[0];
657
1116
  }
658
1117
 
659
1118
  static inline int
660
- str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
- return TSTBIT(cp_arr, str_cp);
1119
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1120
+ {
1121
+ return !tst_cp(cp_arr, len, str_cp);
662
1122
  }
663
1123
 
664
1124
  static VALUE
665
- method_cover_p(VALUE self, VALUE str) {
666
- cp_byte *cps;
1125
+ cs_method_used_by_p(VALUE self, VALUE str)
1126
+ {
1127
+ VALUE only_uses_other_cps;
1128
+ struct cs_data *data;
667
1129
  raise_arg_err_unless_string(str);
668
- FETCH_CODEPOINTS(self, cps);
669
- return each_cp(str, str_cp_in_arr, cps);
1130
+ data = cs_fetch_data(self);
1131
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
1132
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1133
+ }
1134
+
1135
+ static void
1136
+ cs_str_buf_cat(VALUE str, const char *ptr, long len)
1137
+ {
1138
+ long total, olen;
1139
+ char *sptr;
1140
+
1141
+ RSTRING_GETMEM(str, sptr, olen);
1142
+ sptr = RSTRING(str)->as.heap.ptr;
1143
+ olen = RSTRING(str)->as.heap.len;
1144
+ total = olen + len;
1145
+ memcpy(sptr + olen, ptr, len);
1146
+ RSTRING(str)->as.heap.len = total;
1147
+ }
1148
+
// Fallback definition, used only if TERM_FILL is not already defined.
// Writes a terminator of `termlen` bytes at `ptr`: always one NUL byte, and
// `termlen` zero bytes if the terminator is wider than one byte.
#ifndef TERM_FILL
#define TERM_FILL(ptr, termlen)                  \
  do {                                           \
    char *const term_fill_ptr = (ptr);           \
    const int term_fill_len = (termlen);         \
    *term_fill_ptr = '\0';                       \
    if (__builtin_expect(!!(term_fill_len > 1), 0)) { \
      memset(term_fill_ptr, 0, term_fill_len);   \
    }                                            \
  } while (0)
#endif
1160
+
1161
+ static void
1162
+ cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1163
+ {
1164
+ char *ptr;
1165
+ long len;
1166
+
1167
+ ptr = RSTRING(str)->as.heap.ptr;
1168
+ len = RSTRING(str)->as.heap.len;
1169
+ TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
670
1170
  }
671
1171
 
672
1172
  static inline VALUE
673
- apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
- cp_byte *cps;
1173
+ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1174
+ {
1175
+ cs_ar *cps;
1176
+ cs_cp len;
675
1177
  rb_encoding *str_enc;
676
- VALUE orig_len, blen, new_str_buf, chr;
677
- int n;
1178
+ VALUE orig_len, new_str_buf;
1179
+ int cp_len;
678
1180
  unsigned int str_cp;
679
1181
  const char *ptr, *end;
680
1182
 
681
1183
  raise_arg_err_unless_string(str);
682
1184
 
683
- FETCH_CODEPOINTS(set, cps);
1185
+ cps = cs_fetch_cps(set, &len);
684
1186
 
685
1187
  orig_len = RSTRING_LEN(str);
686
- blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
- new_str_buf = rb_str_buf_new(blen);
1188
+ if (orig_len < 1) // empty string, will never change
1189
+ {
1190
+ if (bang)
1191
+ {
1192
+ return Qnil;
1193
+ }
1194
+ return rb_str_dup(str);
1195
+ }
1196
+
1197
+ new_str_buf = rb_str_buf_new(orig_len);
688
1198
  str_enc = rb_enc_get(str);
689
1199
  rb_enc_associate(new_str_buf, str_enc);
690
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
- ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1200
+ rb_str_modify(new_str_buf);
1201
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
1202
 
693
1203
  ptr = RSTRING_PTR(str);
694
1204
  end = RSTRING_END(str);
695
1205
 
696
- while (ptr < end) {
697
- str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
- if (!TSTBIT(cps, str_cp) != !delete) {
699
- chr = rb_enc_uint_chr(str_cp, str_enc);
700
- rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
1206
+ if (single_byte_optimizable(str))
1207
+ {
1208
+ while (ptr < end)
1209
+ {
1210
+ str_cp = *ptr & 0xff;
1211
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1212
+ {
1213
+ cs_str_buf_cat(new_str_buf, ptr, 1);
1214
+ }
1215
+ ptr++;
1216
+ }
1217
+ }
1218
+ else // likely to be multibyte string
1219
+ {
1220
+ while (ptr < end)
1221
+ {
1222
+ str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1223
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1224
+ {
1225
+ cs_str_buf_cat(new_str_buf, ptr, cp_len);
1226
+ }
1227
+ ptr += cp_len;
701
1228
  }
702
- ptr += n;
703
1229
  }
704
1230
 
705
- if (bang) {
706
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
1231
+ cs_str_buf_terminate(new_str_buf, str_enc);
1232
+
1233
+ if (bang)
1234
+ {
1235
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1236
+ {
1237
+ return Qnil;
1238
+ }
707
1239
  rb_str_shared_replace(str, new_str_buf);
708
1240
  }
709
- else {
1241
+ else
1242
+ {
710
1243
  RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
1244
  // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
1245
+ RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
713
1246
  str = new_str_buf;
714
1247
  }
715
1248
 
@@ -717,98 +1250,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
717
1250
  }
718
1251
 
719
1252
  static VALUE
720
- method_delete_in(VALUE self, VALUE str) {
721
- return apply_to_str(self, str, 1, 0);
1253
+ cs_method_delete_in(VALUE self, VALUE str)
1254
+ {
1255
+ return cs_apply_to_str(self, str, 1, 0);
1256
+ }
1257
+
1258
+ static VALUE
1259
+ cs_method_delete_in_bang(VALUE self, VALUE str)
1260
+ {
1261
+ return cs_apply_to_str(self, str, 1, 1);
722
1262
  }
723
1263
 
724
1264
  static VALUE
725
- method_delete_in_bang(VALUE self, VALUE str) {
726
- return apply_to_str(self, str, 1, 1);
1265
+ cs_method_keep_in(VALUE self, VALUE str)
1266
+ {
1267
+ return cs_apply_to_str(self, str, 0, 0);
727
1268
  }
728
1269
 
729
1270
  static VALUE
730
- method_keep_in(VALUE self, VALUE str) {
731
- return apply_to_str(self, str, 0, 0);
1271
+ cs_method_keep_in_bang(VALUE self, VALUE str)
1272
+ {
1273
+ return cs_apply_to_str(self, str, 0, 1);
732
1274
  }
733
1275
 
734
1276
  static VALUE
735
- method_keep_in_bang(VALUE self, VALUE str) {
736
- return apply_to_str(self, str, 0, 1);
1277
+ cs_method_allocated_length(VALUE self)
1278
+ {
1279
+ return LONG2FIX(cs_fetch_data(self)->len);
737
1280
  }
738
1281
 
739
1282
  // ****
740
1283
  // init
741
1284
  // ****
742
1285
 
743
- void
744
- Init_character_set()
1286
+ void Init_character_set()
745
1287
  {
746
1288
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
1289
 
748
- rb_define_alloc_func(cs, method_allocate);
1290
+ rb_define_alloc_func(cs, cs_method_allocate);
749
1291
 
750
1292
  // `Set` compatibility methods
751
1293
 
752
- rb_define_method(cs, "each", method_each, 0);
753
- rb_define_method(cs, "to_a", method_to_a, -1);
754
- rb_define_method(cs, "length", method_length, 0);
755
- rb_define_method(cs, "size", method_length, 0);
756
- rb_define_method(cs, "count", method_length, 0);
757
- rb_define_method(cs, "empty?", method_empty_p, 0);
758
- rb_define_method(cs, "hash", method_hash, 0);
759
- rb_define_method(cs, "keep_if", method_keep_if, 0);
760
- rb_define_method(cs, "delete_if", method_delete_if, 0);
761
- rb_define_method(cs, "clear", method_clear, 0);
762
- rb_define_method(cs, "intersection", method_intersection, 1);
763
- rb_define_method(cs, "&", method_intersection, 1);
764
- rb_define_method(cs, "union", method_union, 1);
765
- rb_define_method(cs, "+", method_union, 1);
766
- rb_define_method(cs, "|", method_union, 1);
767
- rb_define_method(cs, "difference", method_difference, 1);
768
- rb_define_method(cs, "-", method_difference, 1);
769
- rb_define_method(cs, "^", method_exclusion, 1);
770
- rb_define_method(cs, "include?", method_include_p, 1);
771
- rb_define_method(cs, "member?", method_include_p, 1);
772
- rb_define_method(cs, "===", method_include_p, 1);
773
- rb_define_method(cs, "add", method_add, 1);
774
- rb_define_method(cs, "<<", method_add, 1);
775
- rb_define_method(cs, "add?", method_add_p, 1);
776
- rb_define_method(cs, "delete", method_delete, 1);
777
- rb_define_method(cs, "delete?", method_delete_p, 1);
778
- rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
- rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
- rb_define_method(cs, "eql?", method_eql_p, 1);
781
- rb_define_method(cs, "==", method_eql_p, 1);
782
- rb_define_method(cs, "merge", method_merge, 1);
783
- rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
- rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
- rb_define_method(cs, "subtract", method_subtract, 1);
786
- rb_define_method(cs, "subset?", method_subset_p, 1);
787
- rb_define_method(cs, "<=", method_subset_p, 1);
788
- rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
- rb_define_method(cs, "<", method_proper_subset_p, 1);
790
- rb_define_method(cs, "superset?", method_superset_p, 1);
791
- rb_define_method(cs, ">=", method_superset_p, 1);
792
- rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
- rb_define_method(cs, ">", method_proper_superset_p, 1);
1294
+ rb_define_method(cs, "each", cs_method_each, 0);
1295
+ rb_define_method(cs, "to_a", cs_method_to_a, -1);
1296
+ rb_define_method(cs, "length", cs_method_length, 0);
1297
+ rb_define_method(cs, "size", cs_method_length, 0);
1298
+ rb_define_method(cs, "empty?", cs_method_empty_p, 0);
1299
+ rb_define_method(cs, "hash", cs_method_hash, 0);
1300
+ rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
1301
+ rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
1302
+ rb_define_method(cs, "clear", cs_method_clear, 0);
1303
+ rb_define_method(cs, "min", cs_method_min, 0);
1304
+ rb_define_method(cs, "max", cs_method_max, 0);
1305
+ rb_define_method(cs, "minmax", cs_method_minmax, 0);
1306
+ rb_define_method(cs, "intersection", cs_method_intersection, 1);
1307
+ rb_define_method(cs, "&", cs_method_intersection, 1);
1308
+ rb_define_method(cs, "union", cs_method_union, 1);
1309
+ rb_define_method(cs, "+", cs_method_union, 1);
1310
+ rb_define_method(cs, "|", cs_method_union, 1);
1311
+ rb_define_method(cs, "difference", cs_method_difference, 1);
1312
+ rb_define_method(cs, "-", cs_method_difference, 1);
1313
+ rb_define_method(cs, "^", cs_method_exclusion, 1);
1314
+ rb_define_method(cs, "include?", cs_method_include_p, 1);
1315
+ rb_define_method(cs, "member?", cs_method_include_p, 1);
1316
+ rb_define_method(cs, "===", cs_method_include_p, 1);
1317
+ rb_define_method(cs, "add", cs_method_add, 1);
1318
+ rb_define_method(cs, "<<", cs_method_add, 1);
1319
+ rb_define_method(cs, "add?", cs_method_add_p, 1);
1320
+ rb_define_method(cs, "delete", cs_method_delete, 1);
1321
+ rb_define_method(cs, "delete?", cs_method_delete_p, 1);
1322
+ rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
1323
+ rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
1324
+ rb_define_method(cs, "eql?", cs_method_eql_p, 1);
1325
+ rb_define_method(cs, "==", cs_method_eql_p, 1);
1326
+ rb_define_method(cs, "merge", cs_method_merge, 1);
1327
+ rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
1328
+ rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
1329
+ rb_define_method(cs, "subtract", cs_method_subtract, 1);
1330
+ rb_define_method(cs, "subset?", cs_method_subset_p, 1);
1331
+ rb_define_method(cs, "<=", cs_method_subset_p, 1);
1332
+ rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
1333
+ rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
1334
+ rb_define_method(cs, "superset?", cs_method_superset_p, 1);
1335
+ rb_define_method(cs, ">=", cs_method_superset_p, 1);
1336
+ rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1337
+ rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
794
1338
 
795
1339
  // `CharacterSet`-specific methods
796
1340
 
797
- rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
- rb_define_singleton_method(cs, "of", class_method_of, 1);
799
-
800
- rb_define_method(cs, "ranges", method_ranges, 0);
801
- rb_define_method(cs, "sample", method_sample, -1);
802
- rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
- rb_define_method(cs, "astral_part", method_astral_part, 0);
804
- rb_define_method(cs, "planes", method_planes, 0);
805
- rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
- rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
- rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
- rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
- rb_define_method(cs, "cover?", method_cover_p, 1);
810
- rb_define_method(cs, "delete_in", method_delete_in, 1);
811
- rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
- rb_define_method(cs, "keep_in", method_keep_in, 1);
813
- rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
1341
+ rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1342
+ rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1343
+
1344
+ rb_define_method(cs, "ranges", cs_method_ranges, 0);
1345
+ rb_define_method(cs, "sample", cs_method_sample, -1);
1346
+ rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
1347
+ rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
1348
+ rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
1349
+ rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
1350
+ rb_define_method(cs, "planes", cs_method_planes, 0);
1351
+ rb_define_method(cs, "plane", cs_method_plane, 1);
1352
+ rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
1353
+ rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
1354
+ rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
1355
+ rb_define_method(cs, "count_in", cs_method_count_in, 1);
1356
+ rb_define_method(cs, "cover?", cs_method_cover_p, 1);
1357
+ rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
1358
+ rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
1359
+ rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
1360
+ rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
1361
+ rb_define_method(cs, "scan", cs_method_scan, 1);
1362
+ rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
1363
+ rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
814
1364
  }