character_set 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.gitattributes +3 -0
  3. data/.travis.yml +1 -0
  4. data/BENCHMARK.md +51 -15
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +24 -8
  7. data/Rakefile +20 -18
  8. data/benchmarks/count_in.rb +13 -0
  9. data/benchmarks/delete_in.rb +1 -1
  10. data/benchmarks/scan.rb +13 -0
  11. data/benchmarks/shared.rb +1 -0
  12. data/benchmarks/z_add.rb +12 -0
  13. data/benchmarks/z_delete.rb +12 -0
  14. data/benchmarks/z_merge.rb +15 -0
  15. data/benchmarks/z_minmax.rb +12 -0
  16. data/bin/console +2 -0
  17. data/character_set.gemspec +2 -0
  18. data/ext/character_set/character_set.c +963 -413
  19. data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
  20. data/lib/character_set/core_ext/string_ext.rb +2 -0
  21. data/lib/character_set/expression_converter.rb +21 -24
  22. data/lib/character_set/predefined_sets.rb +25 -260
  23. data/lib/character_set/predefined_sets/any.cps +1 -0
  24. data/lib/character_set/predefined_sets/ascii.cps +1 -0
  25. data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
  26. data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
  27. data/lib/character_set/predefined_sets/assigned.cps +666 -0
  28. data/lib/character_set/predefined_sets/bmp.cps +2 -0
  29. data/lib/character_set/predefined_sets/crypt.cps +2 -0
  30. data/lib/character_set/predefined_sets/emoji.cps +151 -0
  31. data/lib/character_set/predefined_sets/newline.cps +3 -0
  32. data/lib/character_set/predefined_sets/surrogate.cps +1 -0
  33. data/lib/character_set/predefined_sets/unicode.cps +2 -0
  34. data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
  35. data/lib/character_set/predefined_sets/url_host.cps +10 -0
  36. data/lib/character_set/predefined_sets/url_path.cps +7 -0
  37. data/lib/character_set/predefined_sets/url_query.cps +8 -0
  38. data/lib/character_set/predefined_sets/whitespace.cps +10 -0
  39. data/lib/character_set/ruby_fallback.rb +0 -2
  40. data/lib/character_set/ruby_fallback/character_set_methods.rb +52 -4
  41. data/lib/character_set/ruby_fallback/set_methods.rb +2 -2
  42. data/lib/character_set/shared_methods.rb +51 -40
  43. data/lib/character_set/version.rb +1 -1
  44. metadata +54 -3
  45. data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 434323b3b99246a17ea5e062afd87d3edc3c09927b2231b4409b295ff63c7d6c
4
- data.tar.gz: 174c6dc751b03e49cf87045fad9a48100460244b7d7e25deef27066bd4aef92c
3
+ metadata.gz: ae7ec84b0727a804bf4d82564e6609fdd0bf070fd0e20c0a5688b579e320bc30
4
+ data.tar.gz: b73dec9fbd4abf83fae5881de89e4e1876e48bcefc3ef935401d5adbeb9c6c8e
5
5
  SHA512:
6
- metadata.gz: d9fa059ea3171209af537f0bd7636e3a65b962f30029ca399fe2fa0bd6168dd692b7bc5fb1014590a830b2e9aede9c26ae00ae8fe4a2eae4a86cf95e208b507d
7
- data.tar.gz: 692f4596b6adc9b44879b69fb82e55dc90d107156ecabb96c14ea91b4dc0c7dc706724b42093d0ef762cdac697f05ef855c5f462451015e1d06022ab06bc1c8d
6
+ metadata.gz: 2b84916c89dcd6a234cc5acedfc604f664a9e285c92b3bae6bade748ad3d9c275fb3307fb5721142e52dbedc9b16da65285a8ebd87cd686b55391f222ef1b4f8
7
+ data.tar.gz: 25147010da0adfd869891d50d51e265c2b4f28e1b0cb70727d9784b11c3944b9a06a9844a2068f529e487028c214f44e2ab60271a9a5730cdd40bb04dd989aaf
data/.gitattributes ADDED
@@ -0,0 +1,3 @@
1
+ *.cps linguist-detectable=false
2
+ benchmarks/* linguist-detectable=false
3
+ spec/ruby-spec/* linguist-vendored
data/.travis.yml CHANGED
@@ -5,4 +5,5 @@ rvm:
5
5
  - 2.4
6
6
  - 2.5
7
7
  - 2.6
8
+ - ruby-head
8
9
  - jruby-9.1.9.0
data/BENCHMARK.md CHANGED
@@ -1,46 +1,58 @@
1
- Results of `rake:benchmark` on ruby 2.6.0preview1 (2018-02-24 trunk 62554) [x86_64-darwin17]
1
+ Results of `rake:benchmark` on ruby 2.6.2p47 (2019-03-13 revision 67232) [x86_64-darwin18]
2
2
 
3
+ ```
4
+ Counting non-letters
5
+
6
+ CharacterSet#count_in: 12253693.8 i/s
7
+ String#count: 1737741.7 i/s - 7.05x slower
8
+ ```
3
9
  ```
4
10
  Detecting non-whitespace
5
11
 
6
- CharacterSet#cover?: 13244577.7 i/s
7
- Regexp#match?: 8027017.5 i/s - 1.65x slower
12
+ CharacterSet#cover?: 14058351.9 i/s
13
+ Regexp#match?: 7907608.1 i/s - 1.78x slower
8
14
  ```
9
15
  ```
10
16
  Detecting non-letters
11
17
 
12
- CharacterSet#cover?: 13082940.8 i/s
13
- Regexp#match?: 5372589.2 i/s - 2.44x slower
18
+ CharacterSet#cover?: 13341301.6 i/s
19
+ Regexp#match?: 5187453.3 i/s - 2.57x slower
14
20
  ```
15
21
  ```
16
22
  Removing whitespace
17
23
 
18
- CharacterSet#delete_in: 389315.6 i/s
19
- String#gsub: 223773.5 i/s - 1.74x slower
24
+ CharacterSet#delete_in: 2523184.0 i/s
25
+ String#gsub: 225804.7 i/s - 11.17x slower
20
26
  ```
21
27
  ```
22
28
  Removing whitespace, emoji and umlauts
23
29
 
24
- CharacterSet#delete_in: 470239.3 i/s
25
- String#gsub: 278679.4 i/s - 1.69x slower
30
+ CharacterSet#delete_in: 1712208.6 i/s
31
+ String#gsub: 278508.8 i/s - 6.15x slower
26
32
  ```
27
33
  ```
28
34
  Removing non-whitespace
29
35
 
30
- CharacterSet#keep_in: 1138461.0 i/s
31
- String#gsub: 235287.4 i/s - 4.84x slower
36
+ CharacterSet#keep_in: 2760158.1 i/s
37
+ String#gsub: 232797.7 i/s - 11.86x slower
32
38
  ```
33
39
  ```
34
40
  Extracting emoji
35
41
 
36
- CharacterSet#keep_in: 1474472.0 i/s
37
- String#gsub: 212269.6 i/s - 6.95x slower
42
+ CharacterSet#keep_in: 1775758.8 i/s
43
+ String#gsub: 217649.9 i/s - 8.16x slower
44
+ ```
45
+ ```
46
+ Extracting emoji to an Array
47
+
48
+ CharacterSet#scan: 2579030.8 i/s
49
+ String#scan: 545107.0 i/s - 4.73x slower
38
50
  ```
39
51
  ```
40
52
  Detecting whitespace
41
53
 
42
- CharacterSet#used_by?: 13063108.7 i/s
43
- Regexp#match?: 7215075.0 i/s - 1.81x slower
54
+ CharacterSet#used_by?: 13847689.0 i/s
55
+ Regexp#match?: 7533275.2 i/s - 1.84x slower
44
56
  ```
45
57
  ```
46
58
  Detecting emoji in a large string
@@ -48,3 +60,27 @@ Detecting emoji in a large string
48
60
  CharacterSet#used_by?: 246527.7 i/s
49
61
  Regexp#match?: 92956.5 i/s - 2.65x slower
50
62
  ```
63
+ ```
64
+ Adding entries
65
+
66
+ CharacterSet#add: 3102081.7 i/s
67
+ SortedSet#add: 1897464.8 i/s - 1.63x slower
68
+ ```
69
+ ```
70
+ Removing entries
71
+
72
+ CharacterSet#delete: 3240924.1 i/s
73
+ SortedSet#delete: 2887493.9 i/s - 1.12x slower
74
+ ```
75
+ ```
76
+ Merging entries
77
+
78
+ CharacterSet#merge: 536.8 i/s
79
+ SortedSet#merge: 12.5 i/s - 42.78x slower
80
+ ```
81
+ ```
82
+ Getting the min and max
83
+
84
+ CharacterSet#minmax: 4111960.8 i/s
85
+ SortedSet#minmax: 756.4 i/s - 5436.39x slower
86
+ ```
data/CHANGELOG.md CHANGED
@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## UNRELEASED
8
+
9
+ ## [1.3.0] - 2019-04-26
10
+
11
+ ### Added
12
+ - improved `String` manipulation speed
13
+ - improved initialization and `#merge` speed when passing a large `Range`
14
+ - reduced memory consumption by > 90% for most use cases via dynamic resizing
15
+ - before, every set instance required 136 KB for codepoints
16
+ - now, 16 bytes for a CharacterSet in ASCII space, 8 KB for one in BMP space etc.
17
+ - `#count_in` and `#scan_in` methods for `String` interaction
18
+ - new predefined sets `::any`/`::all`, `::assigned`, `::surrogate`
19
+ - conversion methods `#assigned_part`, `#valid_part`
20
+ - sectioning methods `#ascii_part`, `#plane(n)`
21
+ - section test methods `#ascii_part?`, `#ascii_ratio`, `#ascii_only?`, `#astral_only?`
22
+
23
+ ### Fixed
24
+ - `#count` now supports passing an argument or block as usual
25
+ - `CharacterSet::Pure#keep_in`, `#delete_in` now preserve the original encoding
26
+
7
27
  ## [1.2.0] - 2019-04-02
8
28
 
9
29
  ### Added
data/README.md CHANGED
@@ -2,8 +2,11 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/character_set.svg)](http://badge.fury.io/rb/character_set)
4
4
  [![Build Status](https://travis-ci.org/jaynetics/character_set.svg?branch=master)](https://travis-ci.org/jaynetics/character_set)
5
+ [![codecov](https://codecov.io/gh/jaynetics/character_set/branch/master/graph/badge.svg)](https://codecov.io/gh/jaynetics/character_set)
5
6
 
6
- A gem to build, read, write and compare sets of Unicode codepoints.
7
+ This is a C-extended Ruby gem to work with sets of Unicode codepoints. It can read and write these sets in various formats and implements the stdlib `Set` interface for them.
8
+
9
+ It also offers an alternate paradigm of `String` processing which grants much better performance than `Regexp` and `String` methods from the stdlib where applicable (see [benchmarks](./BENCHMARK.md)).
7
10
 
8
11
  Many parts can be used independently, e.g.:
9
12
  - `CharacterSet::Character`
@@ -49,7 +52,7 @@ require 'character_set/core_ext/regexp_ext'
49
52
 
50
53
  ### Predefined utility sets
51
54
 
52
- `ascii`, `ascii_alnum`, `ascii_letters`, `bmp`, `crypt`, `emoji`, `newline`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
55
+ `ascii`, `ascii_alnum`, `ascii_letter`, `assigned`, `bmp`, `crypt`, `emoji`, `newline`, `surrogate`, `unicode`, `url_fragment`, `url_host`, `url_path`, `url_query`, `whitespace`
53
56
 
54
57
  ```ruby
55
58
  CharacterSet.ascii # => #<CharacterSet (size: 128)>
@@ -60,7 +63,7 @@ CharacterSet.non_ascii
60
63
 
61
64
  ### Interact with Strings
62
65
 
63
- CharacterSet can replace some `Regexp` actions on Strings, at better speed (see [benchmarks](./BENCHMARK.md)).
66
+ `CharacterSet` can replace some types of `String` handling with better performance than the stdlib.
64
67
 
65
68
  `#used_by?` and `#cover?` can replace some `Regexp#match?` calls:
66
69
 
@@ -71,6 +74,7 @@ CharacterSet.ascii.cover?('Tr') # => true
71
74
  ```
72
75
 
73
76
  `#delete_in(!)` and `#keep_in(!)` can replace `String#gsub(!)` and the like:
77
+
74
78
  ```ruby
75
79
  string = 'Tüür'
76
80
 
@@ -84,6 +88,13 @@ CharacterSet.ascii.keep_in!(string) # => ''
84
88
  string # => ''
85
89
  ```
86
90
 
91
+ `#count_in` and `#scan_in` can replace `String#count` and `String#scan`:
92
+
93
+ ```ruby
94
+ CharacterSet.non_ascii.count_in('Tüür') # => 2
95
+ CharacterSet.non_ascii.scan_in('Tüür') # => ['ü', 'ü']
96
+ ```
97
+
87
98
  There is also a core extension for String interaction.
88
99
  ```ruby
89
100
  require 'character_set/core_ext/string_ext'
@@ -100,7 +111,7 @@ require 'character_set/core_ext/string_ext'
100
111
 
101
112
  ### Manipulate
102
113
 
103
- Use any [Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
114
+ Use [any Ruby Set method](https://ruby-doc.org/stdlib-2.5.1/libdoc/set/rdoc/Set.html), e.g. `#+`, `#-`, `#&`, `#^`, `#intersect?`, `#<`, `#>` etc. to interact with other sets. Use `#add`, `#delete`, `#include?` etc. to change or check for members.
104
115
 
105
116
  Where appropriate, methods take both chars and codepoints, e.g.:
106
117
 
@@ -122,13 +133,13 @@ non_a.include?('ü') # => true
122
133
 
123
134
  # surrogate pair halves are not included by default
124
135
  CharacterSet['a'].inversion(include_surrogates: true)
125
- # => #<CharacterSet (size: 1114111)>
136
+ # => #<CharacterSet (size: 1114112)>
126
137
  ```
127
138
 
128
139
  `#case_insensitive` can be used to create a `CharacterSet` where upper/lower case codepoints are supplemented:
129
140
 
130
141
  ```ruby
131
- CharacterSet['1', 'a'].case_insensitive # => CharacterSet['1', 'A', 'a']
142
+ CharacterSet['1', 'A'].case_insensitive # => CharacterSet['1', 'A', 'a']
132
143
  ```
133
144
 
134
145
  ### Write
@@ -157,17 +168,22 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
157
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
158
169
 
159
170
  # for full js regex compatibility in case of astral members:
160
- set.to_s_with_surrogate_alternation # => '(?:[\u0258]|\ud83e\udd29)'
171
+ set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
161
172
  ```
162
173
 
163
174
  ### Unicode plane methods
164
175
 
165
- There are some methods to check for planes and to handle [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
176
+ There are some methods to check for planes and to handle ASCII, [BMP](https://en.wikipedia.org/wiki/Plane_%28Unicode%29#Basic_Multilingual_Plane) and astral parts:
166
177
  ```Ruby
178
+ CharacterSet['a', 'ü', '🤩'].ascii_part # => CharacterSet['a']
179
+ CharacterSet['a', 'ü', '🤩'].ascii_part? # => true
180
+ CharacterSet['a', 'ü', '🤩'].ascii_only? # => false
181
+ CharacterSet['a', 'ü', '🤩'].ascii_ratio # => 0.3333333
167
182
  CharacterSet['a', 'ü', '🤩'].bmp_part # => CharacterSet['a', 'ü']
168
183
  CharacterSet['a', 'ü', '🤩'].astral_part # => CharacterSet['🤩']
169
184
  CharacterSet['a', 'ü', '🤩'].bmp_ratio # => 0.6666666
170
185
  CharacterSet['a', 'ü', '🤩'].planes # => [0, 1]
186
+ CharacterSet['a', 'ü', '🤩'].plane(1) # => CharacterSet['🤩']
171
187
  CharacterSet['a', 'ü', '🤩'].member_in_plane?(7) # => false
172
188
  CharacterSet::Character.new('a').plane # => 0
173
189
  ```
data/Rakefile CHANGED
@@ -7,6 +7,13 @@ RSpec::Core::RakeTask.new(:spec)
7
7
 
8
8
  task default: :spec
9
9
 
10
+ namespace :spec do
11
+ task :quick do
12
+ ENV['SKIP_MEMSAFETY_SPECS'] = '1'
13
+ Rake::Task[:spec].invoke
14
+ end
15
+ end
16
+
10
17
  Rake::ExtensionTask.new('character_set') do |ext|
11
18
  ext.lib_dir = 'lib/character_set'
12
19
  end
@@ -106,27 +113,22 @@ task :sync_casefold_data do
106
113
  hash[from] = to if type == 'C'
107
114
  end.sort
108
115
 
109
- File.open(dst_path, 'w') do |f|
110
- f.puts <<-C
111
- // THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
112
- // -*-C-*-
113
-
114
- typedef struct casefold_mapping {
115
- unsigned long from;
116
- unsigned long to;
117
- } casefold_mapping;
118
-
119
- #define CASEFOLD_COUNT #{mapping.size}
116
+ content = File.read(dst_path + '.tmpl')
117
+ .sub(/(CASEFOLD_COUNT )0/, "\\1#{mapping.count}")
118
+ .sub('{}', ['{', mapping.map { |a, b| "{0x#{a},0x#{b}}," }, '}'].join("\n"))
120
119
 
121
- static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
122
- C
123
-
124
- mapping.each { |from, to| f.puts "{0x#{from},0x#{to}}," }
120
+ File.write(dst_path, content)
121
+ File.unlink(src_path)
122
+ end
125
123
 
126
- f.puts '};'
124
+ desc 'Update codepoint data for predefined sets, based on Onigmo'
125
+ task :sync_predefined_sets do
126
+ %w[assigned emoji whitespace].each do |prop|
127
+ require 'regexp_property_values'
128
+ ranges = RegexpPropertyValues[prop].matched_ranges
129
+ str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
130
+ File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
127
131
  end
128
-
129
- File.unlink(src_path)
130
132
  end
131
133
 
132
134
  desc 'Run all IPS benchmarks'
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum et dolorem'
4
+ tr = '^A-Za-z'
5
+ cs = CharacterSet.non_ascii_letter
6
+
7
+ benchmark(
8
+ caption: 'Counting non-letters',
9
+ cases: {
10
+ 'String#count' => -> { str.count(tr) },
11
+ 'CharacterSet#count_in' => -> { cs.count_in(str) },
12
+ }
13
+ )
@@ -14,7 +14,7 @@ benchmark(
14
14
 
15
15
  str = 'Lörem ipsüm ⛷ et dölörem'
16
16
  rx = /[\s\p{emoji}äüö]/
17
- cs = CharacterSet.whitespace + CharacterSet.emoji + CS['ä', 'ü', 'ö']
17
+ cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
18
18
 
19
19
  benchmark(
20
20
  caption: 'Removing whitespace, emoji and umlauts',
@@ -0,0 +1,13 @@
1
+ require_relative './shared'
2
+
3
+ str = 'Lorem ipsum ⛷ et dolorem'
4
+ rx = /\p{emoji}/
5
+ cs = CharacterSet.emoji
6
+
7
+ benchmark(
8
+ caption: 'Extracting emoji to an Array',
9
+ cases: {
10
+ 'String#scan' => -> { str.scan(rx) },
11
+ 'CharacterSet#scan' => -> { cs.scan(str) },
12
+ }
13
+ )
data/benchmarks/shared.rb CHANGED
@@ -3,6 +3,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  require 'benchmark/ips'
5
5
  require 'character_set'
6
+ require 'set'
6
7
 
7
8
  def benchmark(caption: nil, cases: {})
8
9
  puts caption
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet[]
4
+ ss = SortedSet[]
5
+
6
+ benchmark(
7
+ caption: 'Adding entries',
8
+ cases: {
9
+ 'CharacterSet#add' => -> { cs.add(rand(0x10FFFF)) },
10
+ 'SortedSet#add' => -> { ss.add(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0x10FFFF)
4
+ ss = SortedSet.new(0..0x10FFFF)
5
+
6
+ benchmark(
7
+ caption: 'Removing entries',
8
+ cases: {
9
+ 'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
10
+ 'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
11
+ }
12
+ )
@@ -0,0 +1,15 @@
1
+ require_relative './shared'
2
+
3
+ cs1 = CharacterSet.new(0...0x88000)
4
+ cs2 = CharacterSet.new(0x88000..0x10FFFF)
5
+
6
+ ss1 = SortedSet.new(0...0x88000)
7
+ ss2 = SortedSet.new(0x88000..0x10FFFF)
8
+
9
+ benchmark(
10
+ caption: 'Merging entries',
11
+ cases: {
12
+ 'CharacterSet#merge' => -> { cs1.merge(cs2) },
13
+ 'SortedSet#merge' => -> { ss1.merge(ss2) },
14
+ }
15
+ )
@@ -0,0 +1,12 @@
1
+ require_relative './shared'
2
+
3
+ cs = CharacterSet.new(0..0xFFFF)
4
+ ss = SortedSet.new(0..0xFFFF)
5
+
6
+ benchmark(
7
+ caption: 'Getting the min and max',
8
+ cases: {
9
+ 'CharacterSet#minmax' => -> { cs.minmax },
10
+ 'SortedSet#minmax' => -> { ss.minmax },
11
+ }
12
+ )
data/bin/console CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require 'bundler/setup'
4
4
 
5
+ `bundle exec rake compile`
6
+
5
7
  require 'character_set'
6
8
  require 'character_set/core_ext'
7
9
  require 'character_set/pure'
@@ -23,6 +23,8 @@ Gem::Specification.new do |s|
23
23
  s.required_ruby_version = '>= 2.1.0'
24
24
 
25
25
  s.add_development_dependency 'benchmark-ips', '~> 2.7'
26
+ s.add_development_dependency 'codecov', '~> 0.1'
27
+ s.add_development_dependency 'get_process_mem', '~> 0.2.3'
26
28
  s.add_development_dependency 'rake', '~> 12.0'
27
29
  s.add_development_dependency 'rake-compiler', '~> 1.0'
28
30
  s.add_development_dependency 'range_compressor', '~> 1.0'
@@ -2,81 +2,180 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "unicode_casefold_table.h"
4
4
 
5
- #define SETBIT(byte_arr, bit) (byte_arr[bit >> 3] |= (1 << (bit & 0x07)))
6
- #define CLRBIT(byte_arr, bit) (byte_arr[bit >> 3] &= ~(1 << (bit & 0x07)))
7
- #define TSTBIT(byte_arr, bit) (byte_arr[bit >> 3] & (1 << (bit & 0x07)))
5
+ #define UNICODE_PLANE_SIZE 0x10000
6
+ #define UNICODE_PLANE_COUNT 17
7
+ #define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
8
8
 
9
- typedef char cp_byte;
10
- typedef unsigned long cp_index;
9
+ // start at ascii size
10
+ #define CS_DEFAULT_INITIAL_LEN 128
11
11
 
12
- #define UNICODE_CP_COUNT 0x110000
13
- #define UNICODE_BYTES UNICODE_CP_COUNT / 8
14
- #define UNICODE_PLANE_SIZE 0x10000
15
- #define UNICODE_PLANE_COUNT UNICODE_CP_COUNT / UNICODE_PLANE_SIZE
12
+ typedef char cs_ar;
13
+ typedef unsigned long cs_cp;
14
+
15
+ struct cs_data
16
+ {
17
+ cs_ar *cps;
18
+ cs_cp len;
19
+ };
20
+
21
+ #define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
22
+
23
+ static inline void
24
+ add_memspace_for_another_plane(struct cs_data *data)
25
+ {
26
+ data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
27
+ memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
28
+ data->len += UNICODE_PLANE_SIZE;
29
+ }
30
+
31
+ static inline void
32
+ ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
33
+ {
34
+ while (target_cp >= data->len)
35
+ {
36
+ add_memspace_for_another_plane(data);
37
+ }
38
+ }
39
+
40
+ static inline void
41
+ set_cp(struct cs_data *data, cs_cp cp)
42
+ {
43
+ ensure_memsize_fits(data, cp);
44
+ data->cps[cp >> 3] |= (1 << (cp & 0x07));
45
+ }
46
+
47
+ static inline int
48
+ tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
49
+ {
50
+ return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
51
+ }
52
+
53
+ static inline void
54
+ clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
55
+ {
56
+ if (cp < len)
57
+ {
58
+ cps[cp >> 3] &= ~(1 << (cp & 0x07));
59
+ }
60
+ }
16
61
 
17
62
  static void
18
- free_character_set(void* codepoints) {
19
- free(codepoints);
63
+ cs_free(void *ptr)
64
+ {
65
+ struct cs_data *data = ptr;
66
+ ruby_xfree(data->cps);
67
+ ruby_xfree(data);
20
68
  }
21
69
 
22
70
  static size_t
23
- memsize_character_set(const void* codepoints) {
24
- return sizeof(cp_byte) * UNICODE_BYTES;
25
- }
26
-
27
- static const rb_data_type_t
28
- character_set_type = {
29
- .wrap_struct_name = "character_set",
30
- .function = {
31
- .dmark = NULL,
32
- .dfree = free_character_set,
33
- .dsize = memsize_character_set,
34
- },
35
- .data = NULL,
36
- .flags = RUBY_TYPED_FREE_IMMEDIATELY,
71
+ cs_memsize(const void *ptr)
72
+ {
73
+ const struct cs_data *data = ptr;
74
+ return sizeof(*data) + CS_MSIZE(data->len);
75
+ }
76
+
77
+ static const rb_data_type_t cs_type = {
78
+ .wrap_struct_name = "character_set",
79
+ .function = {
80
+ .dmark = NULL,
81
+ .dfree = cs_free,
82
+ .dsize = cs_memsize,
83
+ },
84
+ .data = NULL,
85
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
37
86
  };
38
87
 
39
- #define FETCH_CODEPOINTS(set, cps)\
40
- TypedData_Get_Struct(set, cp_byte, &character_set_type, cps)
88
+ static inline VALUE
89
+ cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
90
+ {
91
+ VALUE cs;
92
+ struct cs_data *data;
93
+ cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
94
+ data->cps = ruby_xmalloc(CS_MSIZE(len));
95
+ memset(data->cps, 0, CS_MSIZE(len));
96
+ data->len = len;
97
+
98
+ if (data_ptr)
99
+ {
100
+ *data_ptr = data;
101
+ }
41
102
 
42
- #define NEW_CHARACTER_SET(klass, cps)\
43
- TypedData_Wrap_Struct(klass, &character_set_type, cps)
103
+ return cs;
104
+ }
44
105
 
45
- static VALUE
46
- method_allocate(VALUE self) {
47
- cp_byte *cp_arr;
48
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
49
- return NEW_CHARACTER_SET(self, cp_arr);
106
+ static inline VALUE
107
+ cs_alloc(VALUE klass, struct cs_data **data_ptr)
108
+ {
109
+ return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
50
110
  }
51
111
 
52
- #define FOR_EACH_ACTIVE_CODEPOINT(action)\
53
- cp_index cp;\
54
- cp_byte *cps;\
55
- FETCH_CODEPOINTS(self, cps);\
56
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
57
- if (TSTBIT(cps, cp)) { action; }\
58
- }
112
+ static inline struct cs_data *
113
+ cs_fetch_data(VALUE cs)
114
+ {
115
+ struct cs_data *data;
116
+ TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
117
+ return data;
118
+ }
119
+
120
+ static inline cs_ar *
121
+ cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
122
+ {
123
+ struct cs_data *data;
124
+ data = cs_fetch_data(cs);
125
+ *len_ptr = data->len;
126
+ return data->cps;
127
+ }
128
+
129
+ static VALUE
130
+ cs_method_allocate(VALUE self)
131
+ {
132
+ return cs_alloc(self, 0);
133
+ }
134
+
135
+ #define FOR_EACH_ACTIVE_CODEPOINT(action) \
136
+ do \
137
+ { \
138
+ cs_cp cp, len; \
139
+ cs_ar *cps; \
140
+ cps = cs_fetch_cps(self, &len); \
141
+ for (cp = 0; cp < len; cp++) \
142
+ { \
143
+ if (tst_cp(cps, len, cp)) \
144
+ { \
145
+ action; \
146
+ } \
147
+ } \
148
+ } while (0)
59
149
 
60
150
  // ***************************
61
151
  // `Set` compatibility methods
62
152
  // ***************************
63
153
 
64
- static inline VALUE
65
- enumerator_length(VALUE self, VALUE args, VALUE eobj) {
66
- cp_index count;
154
+ static inline cs_cp
155
+ cs_active_cp_count(VALUE self)
156
+ {
157
+ cs_cp count;
67
158
  count = 0;
68
159
  FOR_EACH_ACTIVE_CODEPOINT(count++);
69
- return LONG2FIX(count);
160
+ return count;
70
161
  }
71
162
 
72
163
  static VALUE
73
- method_length(VALUE self) {
74
- return enumerator_length(self, 0, 0);
164
+ cs_method_length(VALUE self)
165
+ {
166
+ return LONG2FIX(cs_active_cp_count(self));
167
+ }
168
+
169
+ static inline VALUE
170
+ cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
171
+ {
172
+ return LONG2FIX(cs_active_cp_count(self));
75
173
  }
76
174
 
77
175
  static VALUE
78
- method_each(VALUE self) {
79
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
176
+ cs_method_each(VALUE self)
177
+ {
178
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
80
179
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
81
180
  return self;
82
181
  }
@@ -84,16 +183,19 @@ method_each(VALUE self) {
84
183
  // returns an Array of codepoint Integers by default.
85
184
  // returns an Array of Strings of length 1 if passed `true`.
86
185
  static VALUE
87
- method_to_a(int argc, VALUE *argv, VALUE self) {
186
+ cs_method_to_a(int argc, VALUE *argv, VALUE self)
187
+ {
88
188
  VALUE arr;
89
189
  rb_encoding *enc;
90
190
  rb_check_arity(argc, 0, 1);
91
191
 
92
192
  arr = rb_ary_new();
93
- if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse) {
193
+ if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
194
+ {
94
195
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
95
196
  }
96
- else {
197
+ else
198
+ {
97
199
  enc = rb_utf8_encoding();
98
200
  FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
99
201
  }
@@ -102,302 +204,472 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
102
204
  }
103
205
 
104
206
  static VALUE
105
- method_empty_p(VALUE self) {
207
+ cs_method_empty_p(VALUE self)
208
+ {
106
209
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
107
210
  return Qtrue;
108
211
  }
109
212
 
110
213
  static VALUE
111
- method_hash(VALUE self) {
112
- cp_index cp, hash, four_byte_value;
113
- cp_byte *cps;
114
- FETCH_CODEPOINTS(self, cps);
214
+ cs_method_hash(VALUE self)
215
+ {
216
+ cs_cp cp, len, hash, four_byte_value;
217
+ cs_ar *cps;
218
+ cps = cs_fetch_cps(self, &len);
115
219
 
116
220
  hash = 17;
117
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
118
- if (cp % 32 == 0) {
119
- if (cp != 0) { hash = hash * 23 + four_byte_value; }
221
+ for (cp = 0; cp < len; cp++)
222
+ {
223
+ if (cp % 32 == 0)
224
+ {
225
+ if (cp != 0)
226
+ {
227
+ hash = hash * 23 + four_byte_value;
228
+ }
120
229
  four_byte_value = 0;
121
230
  }
122
- if (TSTBIT(cps, cp)) four_byte_value++;
231
+ if (tst_cp(cps, len, cp))
232
+ {
233
+ four_byte_value++;
234
+ }
123
235
  }
124
236
 
125
237
  return LONG2FIX(hash);
126
238
  }
127
239
 
128
240
  static inline VALUE
129
- delete_if_block_result(VALUE self, int truthy) {
241
+ cs_delete_if_block_result(VALUE self, int truthy)
242
+ {
130
243
  VALUE result;
131
244
  rb_need_block();
132
245
  rb_check_frozen(self);
133
246
  FOR_EACH_ACTIVE_CODEPOINT(
134
- result = rb_yield(LONG2FIX(cp));
135
- if ((NIL_P(result) || result == Qfalse) != truthy) CLRBIT(cps, cp);
136
- );
247
+ result = rb_yield(LONG2FIX(cp));
248
+ if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
137
249
  return self;
138
250
  }
139
251
 
140
252
  static VALUE
141
- method_delete_if(VALUE self) {
142
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
143
- return delete_if_block_result(self, 1);
253
+ cs_method_delete_if(VALUE self)
254
+ {
255
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
256
+ return cs_delete_if_block_result(self, 1);
144
257
  }
145
258
 
146
259
  static VALUE
147
- method_keep_if(VALUE self) {
148
- RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
149
- return delete_if_block_result(self, 0);
260
+ cs_method_keep_if(VALUE self)
261
+ {
262
+ RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
263
+ return cs_delete_if_block_result(self, 0);
150
264
  }
151
265
 
152
266
  static VALUE
153
- method_clear(VALUE self) {
154
- cp_index cp;
155
- cp_byte *cps;
267
+ cs_method_clear(VALUE self)
268
+ {
269
+ struct cs_data *data;
156
270
  rb_check_frozen(self);
157
- FETCH_CODEPOINTS(self, cps);
158
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
159
- CLRBIT(cps, cp);
160
- }
271
+ data = cs_fetch_data(self);
272
+ memset(data->cps, 0, CS_MSIZE(data->len));
161
273
  return self;
162
274
  }
163
275
 
164
- #define RETURN_NEW_SET_BASED_ON(condition)\
165
- cp_index cp;\
166
- cp_byte *a, *b, *new_cps;\
167
- FETCH_CODEPOINTS(self, a);\
168
- if (other) FETCH_CODEPOINTS(other, b);\
169
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));\
170
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {\
171
- if (condition) SETBIT(new_cps, cp);\
172
- }\
173
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);\
276
+ static VALUE
277
+ cs_method_min(VALUE self)
278
+ {
279
+ FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
280
+ return Qnil;
281
+ }
282
+
283
+ static VALUE
284
+ cs_method_max(VALUE self)
285
+ {
286
+ cs_cp len;
287
+ long reverse_idx;
288
+ cs_ar *cps;
289
+ cps = cs_fetch_cps(self, &len);
290
+ for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
291
+ {
292
+ if (tst_cp(cps, len, reverse_idx))
293
+ {
294
+ return LONG2FIX(reverse_idx);
295
+ }
296
+ }
297
+ return Qnil;
298
+ }
299
+
300
+ static VALUE
301
+ cs_method_minmax(VALUE self)
302
+ {
303
+ VALUE arr;
304
+ arr = rb_ary_new2(2);
305
+ rb_ary_push(arr, cs_method_min(self));
306
+ rb_ary_push(arr, cs_method_max(self));
307
+ return arr;
308
+ }
309
+
310
+ #define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
311
+ do \
312
+ { \
313
+ VALUE new_cs; \
314
+ cs_cp cp, alen, blen; \
315
+ cs_ar *acps, *bcps; \
316
+ struct cs_data *new_data; \
317
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
318
+ acps = cs_fetch_cps(cs_a, &alen); \
319
+ bcps = cs_fetch_cps(cs_b, &blen); \
320
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
321
+ { \
322
+ if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
323
+ { \
324
+ set_cp(new_data, cp); \
325
+ } \
326
+ } \
327
+ return new_cs; \
328
+ } while (0)
174
329
 
175
330
  static VALUE
176
- method_intersection(VALUE self, VALUE other) {
177
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
331
+ cs_method_intersection(VALUE self, VALUE other)
332
+ {
333
+ RETURN_COMBINED_CS(self, other, &&);
178
334
  }
179
335
 
180
336
  static VALUE
181
- method_exclusion(VALUE self, VALUE other) {
182
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
337
+ cs_method_exclusion(VALUE self, VALUE other)
338
+ {
339
+ RETURN_COMBINED_CS(self, other, ^);
183
340
  }
184
341
 
185
342
  static VALUE
186
- method_union(VALUE self, VALUE other) {
187
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
343
+ cs_method_union(VALUE self, VALUE other)
344
+ {
345
+ RETURN_COMBINED_CS(self, other, ||);
188
346
  }
189
347
 
190
348
  static VALUE
191
- method_difference(VALUE self, VALUE other) {
192
- RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
349
+ cs_method_difference(VALUE self, VALUE other)
350
+ {
351
+ RETURN_COMBINED_CS(self, other, >);
193
352
  }
194
353
 
195
354
  static VALUE
196
- method_include_p(VALUE self, VALUE num) {
197
- cp_byte *cps;
198
- FETCH_CODEPOINTS(self, cps);
199
- return (TSTBIT(cps, FIX2ULONG(num)) ? Qtrue : Qfalse);
355
+ cs_method_include_p(VALUE self, VALUE num)
356
+ {
357
+ cs_ar *cps;
358
+ cs_cp len;
359
+ cps = cs_fetch_cps(self, &len);
360
+ return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
200
361
  }
201
362
 
202
- static inline int
203
- toggle_codepoint(VALUE set, VALUE cp_num, unsigned int on, int check_if_noop) {
204
- cp_index cp;
205
- cp_byte *cps;
206
- rb_check_frozen(set);
207
- FETCH_CODEPOINTS(set, cps);
363
+ static inline VALUE
364
+ cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
365
+ {
366
+ cs_cp cp, len;
367
+ cs_ar *cps;
368
+ struct cs_data *data;
369
+ rb_check_frozen(cs);
370
+ data = cs_fetch_data(cs);
371
+ cps = data->cps;
372
+ len = data->len;
208
373
  cp = FIX2ULONG(cp_num);
209
- if (check_if_noop && (!TSTBIT(cps, cp) == !on)) {
210
- return 0;
374
+ if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
375
+ {
376
+ return Qnil;
211
377
  }
212
- else {
213
- if (on) { SETBIT(cps, cp); }
214
- else { CLRBIT(cps, cp); }
215
- return 1;
378
+ else
379
+ {
380
+ if (on)
381
+ {
382
+ set_cp(data, cp);
383
+ }
384
+ else
385
+ {
386
+ clr_cp(cps, len, cp);
387
+ }
388
+ return cs;
216
389
  }
217
390
  }
218
391
 
219
392
  static VALUE
220
- method_add(VALUE self, VALUE cp_num) {
221
- return toggle_codepoint(self, cp_num, 1, 0) ? self : Qnil;
393
+ cs_method_add(VALUE self, VALUE cp_num)
394
+ {
395
+ return cs_toggle_codepoint(self, cp_num, 1, 0);
222
396
  }
223
397
 
224
398
  static VALUE
225
- method_add_p(VALUE self, VALUE cp_num) {
226
- return toggle_codepoint(self, cp_num, 1, 1) ? self : Qnil;
399
+ cs_method_add_p(VALUE self, VALUE cp_num)
400
+ {
401
+ return cs_toggle_codepoint(self, cp_num, 1, 1);
227
402
  }
228
403
 
229
404
  static VALUE
230
- method_delete(VALUE self, VALUE cp_num) {
231
- return toggle_codepoint(self, cp_num, 0, 0) ? self : Qnil;
405
+ cs_method_delete(VALUE self, VALUE cp_num)
406
+ {
407
+ return cs_toggle_codepoint(self, cp_num, 0, 0);
232
408
  }
233
409
 
234
410
  static VALUE
235
- method_delete_p(VALUE self, VALUE cp_num) {
236
- return toggle_codepoint(self, cp_num, 0, 1) ? self : Qnil;
411
+ cs_method_delete_p(VALUE self, VALUE cp_num)
412
+ {
413
+ return cs_toggle_codepoint(self, cp_num, 0, 1);
237
414
  }
238
415
 
239
- #define COMPARE_SETS(action)\
240
- cp_index cp;\
241
- cp_byte *cps, *other_cps;\
242
- FETCH_CODEPOINTS(self, cps);\
243
- FETCH_CODEPOINTS(other, other_cps);\
244
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
245
-
246
416
  static VALUE
247
- method_intersect_p(VALUE self, VALUE other) {
248
- COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
417
+ cs_method_intersect_p(VALUE self, VALUE other)
418
+ {
419
+ cs_cp cp, alen, blen;
420
+ cs_ar *acps, *bcps;
421
+ acps = cs_fetch_cps(self, &alen);
422
+ bcps = cs_fetch_cps(other, &blen);
423
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
424
+ {
425
+ if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
426
+ {
427
+ return Qtrue;
428
+ }
429
+ }
249
430
  return Qfalse;
250
431
  }
251
432
 
252
433
  static VALUE
253
- method_disjoint_p(VALUE self, VALUE other) {
254
- return method_intersect_p(self, other) ? Qfalse : Qtrue;
434
+ cs_method_disjoint_p(VALUE self, VALUE other)
435
+ {
436
+ return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
255
437
  }
256
438
 
257
439
  static inline int
258
- is_character_set(VALUE obj) {
259
- return rb_typeddata_is_kind_of(obj, &character_set_type);
440
+ cs_check_type(VALUE obj)
441
+ {
442
+ return rb_typeddata_is_kind_of(obj, &cs_type);
260
443
  }
261
444
 
262
445
  static VALUE
263
- method_eql_p(VALUE self, VALUE other) {
264
- if (!is_character_set(other)) return Qfalse;
265
- if (self == other) return Qtrue; // same object_id
266
-
267
- COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);
268
-
446
+ cs_cps_eql(VALUE cs_a, VALUE cs_b)
447
+ {
448
+ cs_cp cp, alen, blen;
449
+ cs_ar *acps, *bcps;
450
+ acps = cs_fetch_cps(cs_a, &alen);
451
+ bcps = cs_fetch_cps(cs_b, &blen);
452
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
453
+ {
454
+ if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
455
+ {
456
+ return Qfalse;
457
+ }
458
+ }
269
459
  return Qtrue;
270
460
  }
271
461
 
462
+ static VALUE
463
+ cs_method_eql_p(VALUE self, VALUE other)
464
+ {
465
+ if (!cs_check_type(other))
466
+ {
467
+ return Qfalse;
468
+ }
469
+ if (self == other) // same object_id
470
+ {
471
+ return Qtrue;
472
+ }
473
+ return cs_cps_eql(self, other);
474
+ }
475
+
272
476
  static inline VALUE
273
- merge_character_set(VALUE self, VALUE other) {
274
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) SETBIT(cps, cp));
275
- return self;
477
+ cs_merge_cs(VALUE recipient, VALUE source)
478
+ {
479
+ cs_cp cp, source_len;
480
+ struct cs_data *data;
481
+ cs_ar *source_cps;
482
+ data = cs_fetch_data(recipient);
483
+ source_cps = cs_fetch_cps(source, &source_len);
484
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
485
+ {
486
+ if (tst_cp(source_cps, source_len, cp))
487
+ {
488
+ set_cp(data, cp);
489
+ }
490
+ }
491
+ return recipient;
276
492
  }
277
493
 
278
- static inline void
279
- raise_arg_err_unless_valid_as_cp(VALUE object_id) {
280
- if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001) return;
494
+ static inline cs_cp
495
+ cs_checked_cp(VALUE object_id)
496
+ {
497
+ if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
498
+ {
499
+ return FIX2ULONG(object_id);
500
+ }
281
501
  rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
282
502
  }
283
503
 
284
504
  static inline VALUE
285
- merge_rb_range(VALUE self, VALUE rb_range) {
505
+ cs_merge_rb_range(VALUE self, VALUE rb_range)
506
+ {
286
507
  VALUE from_id, upto_id;
508
+ cs_cp from_cp, upto_cp, cont_len, rem;
287
509
  int excl;
288
- cp_index cp;
289
- cp_byte *cps;
290
- FETCH_CODEPOINTS(self, cps);
510
+ struct cs_data *data;
511
+ data = cs_fetch_data(self);
291
512
 
292
- if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl))) {
513
+ if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
514
+ {
293
515
  rb_raise(rb_eArgError, "pass a Range");
294
516
  }
295
- if (excl) upto_id -= 2;
517
+ if (excl)
518
+ {
519
+ upto_id -= 2;
520
+ }
521
+
522
+ from_cp = cs_checked_cp(from_id);
523
+ upto_cp = cs_checked_cp(upto_id);
296
524
 
297
- raise_arg_err_unless_valid_as_cp(from_id);
298
- raise_arg_err_unless_valid_as_cp(upto_id);
525
+ if (upto_cp > from_cp && (upto_cp - from_cp > 6))
526
+ {
527
+ // set bits in preceding partially toggled bytes individually
528
+ for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
529
+ {
530
+ set_cp(data, from_cp);
531
+ }
532
+ // memset contiguous bits directly
533
+ cont_len = upto_cp - from_cp + 1;
534
+ rem = cont_len % 8;
535
+ ensure_memsize_fits(data, upto_cp);
536
+ memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
537
+ from_cp = upto_cp - rem + 1;
538
+ }
299
539
 
300
- for (/* */; from_id <= upto_id; from_id += 2) {
301
- cp = FIX2ULONG(from_id);
302
- SETBIT(cps, cp);
540
+ // set bits in partially toggled bytes individually
541
+ for (/* */; from_cp <= upto_cp; from_cp++)
542
+ {
543
+ set_cp(data, from_cp);
303
544
  }
545
+
304
546
  return self;
305
547
  }
306
548
 
307
549
  static inline VALUE
308
- merge_rb_array(VALUE self, VALUE rb_array) {
309
- VALUE el;
310
- cp_byte *cps;
311
- VALUE array_length, i;
312
- FETCH_CODEPOINTS(self, cps);
550
+ cs_merge_rb_array(VALUE self, VALUE rb_array)
551
+ {
552
+ VALUE el, array_length, i;
553
+ struct cs_data *data;
313
554
  Check_Type(rb_array, T_ARRAY);
555
+ data = cs_fetch_data(self);
314
556
  array_length = RARRAY_LEN(rb_array);
315
- for (i = 0; i < array_length; i++) {
557
+ for (i = 0; i < array_length; i++)
558
+ {
316
559
  el = RARRAY_AREF(rb_array, i);
317
- raise_arg_err_unless_valid_as_cp(el);
318
- SETBIT(cps, FIX2ULONG(el));
560
+ set_cp(data, cs_checked_cp(el));
319
561
  }
320
562
  return self;
321
563
  }
322
564
 
323
565
  static VALUE
324
- method_merge(VALUE self, VALUE other) {
566
+ cs_method_merge(VALUE self, VALUE other)
567
+ {
325
568
  rb_check_frozen(self);
326
- if (is_character_set(other)) {
327
- return merge_character_set(self, other);
569
+ if (cs_check_type(other))
570
+ {
571
+ return cs_merge_cs(self, other);
328
572
  }
329
- else if (TYPE(other) == T_ARRAY) {
330
- return merge_rb_array(self, other);
573
+ else if (TYPE(other) == T_ARRAY)
574
+ {
575
+ return cs_merge_rb_array(self, other);
331
576
  }
332
- return merge_rb_range(self, other);
577
+ return cs_merge_rb_range(self, other);
333
578
  }
334
579
 
335
580
  static VALUE
336
- method_initialize_copy(VALUE self, VALUE other) {
337
- merge_character_set(self, other);
338
- return other;
581
+ cs_method_initialize_copy(VALUE self, VALUE orig)
582
+ {
583
+ cs_merge_cs(self, orig);
584
+ return self;
339
585
  }
340
586
 
341
587
  static VALUE
342
- method_subtract(VALUE self, VALUE other) {
588
+ cs_method_subtract(VALUE self, VALUE other)
589
+ {
590
+ cs_cp cp, len, other_len;
591
+ cs_ar *cps, *other_cps;
343
592
  rb_check_frozen(self);
344
- COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
593
+ cps = cs_fetch_cps(self, &len);
594
+ other_cps = cs_fetch_cps(other, &other_len);
595
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
596
+ {
597
+ if (tst_cp(other_cps, other_len, cp))
598
+ {
599
+ clr_cp(cps, len, cp);
600
+ }
601
+ }
345
602
  return self;
346
603
  }
347
604
 
348
605
  static inline int
349
- a_subset_of_b(VALUE set_a, VALUE set_b, int *is_proper) {
350
- cp_byte *cps_a, *cps_b;
351
- cp_index cp, size_a, size_b;
606
+ cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
607
+ {
608
+ cs_ar *a, *b;
609
+ cs_cp cp, alen, blen, count_a, count_b;
352
610
 
353
- if (!is_character_set(set_a) || !is_character_set(set_b)) {
611
+ if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
612
+ {
354
613
  rb_raise(rb_eArgError, "pass a CharacterSet");
355
614
  }
356
615
 
357
- FETCH_CODEPOINTS(set_a, cps_a);
358
- FETCH_CODEPOINTS(set_b, cps_b);
359
-
360
- *is_proper = 0;
361
- size_a = 0;
362
- size_b = 0;
363
-
364
- for (cp = 0; cp < UNICODE_CP_COUNT; cp++) {
365
- if (TSTBIT(cps_a, cp)) {
366
- if (!TSTBIT(cps_b, cp)) return 0;
367
- size_a++;
368
- size_b++;
616
+ a = cs_fetch_cps(cs_a, &alen);
617
+ b = cs_fetch_cps(cs_b, &blen);
618
+
619
+ count_a = 0;
620
+ count_b = 0;
621
+
622
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
623
+ {
624
+ if (tst_cp(a, alen, cp))
625
+ {
626
+ if (!tst_cp(b, blen, cp))
627
+ {
628
+ return 0;
629
+ }
630
+ count_a++;
631
+ count_b++;
632
+ }
633
+ else if (tst_cp(b, blen, cp))
634
+ {
635
+ count_b++;
369
636
  }
370
- else if (TSTBIT(cps_b, cp)) size_b++;
371
637
  }
372
638
 
373
- if (size_b > size_a) *is_proper = 1;
639
+ if (is_proper_ptr)
640
+ {
641
+ *is_proper_ptr = count_b > count_a;
642
+ }
643
+
374
644
  return 1;
375
645
  }
376
646
 
377
647
  static VALUE
378
- method_subset_p(VALUE self, VALUE other) {
379
- int is_proper;
380
- return a_subset_of_b(self, other, &is_proper) ? Qtrue : Qfalse;
648
+ cs_method_subset_p(VALUE self, VALUE other)
649
+ {
650
+ return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
381
651
  }
382
652
 
383
653
  static VALUE
384
- method_proper_subset_p(VALUE self, VALUE other) {
385
- int is, is_proper;
386
- is = a_subset_of_b(self, other, &is_proper);
387
- return (is && is_proper) ? Qtrue : Qfalse;
654
+ cs_method_proper_subset_p(VALUE self, VALUE other)
655
+ {
656
+ int is_subset, is_proper;
657
+ is_subset = cs_a_subset_of_b(self, other, &is_proper);
658
+ return (is_subset && is_proper) ? Qtrue : Qfalse;
388
659
  }
389
660
 
390
661
  static VALUE
391
- method_superset_p(VALUE self, VALUE other) {
392
- int is_proper;
393
- return a_subset_of_b(other, self, &is_proper) ? Qtrue : Qfalse;
662
+ cs_method_superset_p(VALUE self, VALUE other)
663
+ {
664
+ return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
394
665
  }
395
666
 
396
667
  static VALUE
397
- method_proper_superset_p(VALUE self, VALUE other) {
398
- int is, is_proper;
399
- is = a_subset_of_b(other, self, &is_proper);
400
- return (is && is_proper) ? Qtrue : Qfalse;
668
+ cs_method_proper_superset_p(VALUE self, VALUE other)
669
+ {
670
+ int is_superset, is_proper;
671
+ is_superset = cs_a_subset_of_b(other, self, &is_proper);
672
+ return (is_superset && is_proper) ? Qtrue : Qfalse;
401
673
  }
402
674
 
403
675
  // *******************************
@@ -405,42 +677,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
405
677
  // *******************************
406
678
 
407
679
  static VALUE
408
- class_method_from_ranges(VALUE self, VALUE ranges) {
409
- VALUE new_set, range_count, i;
410
- new_set = rb_class_new_instance(0, 0, self);
680
+ cs_class_method_from_ranges(VALUE self, VALUE ranges)
681
+ {
682
+ VALUE new_cs, range_count, i;
683
+ new_cs = rb_class_new_instance(0, 0, self);
411
684
  range_count = RARRAY_LEN(ranges);
412
- for (i = 0; i < range_count; i++) {
413
- merge_rb_range(new_set, RARRAY_AREF(ranges, i));
685
+ for (i = 0; i < range_count; i++)
686
+ {
687
+ cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
414
688
  }
415
- return new_set;
689
+ return new_cs;
416
690
  }
417
691
 
418
692
  static VALUE
419
- method_ranges(VALUE self) {
420
- VALUE ranges, codepoint, previous_codepoint, current_start, current_end;
693
+ cs_method_ranges(VALUE self)
694
+ {
695
+ VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
421
696
 
422
697
  ranges = rb_ary_new();
423
- previous_codepoint = 0;
698
+ previous_cp_num = 0;
424
699
  current_start = 0;
425
700
  current_end = 0;
426
701
 
427
702
  FOR_EACH_ACTIVE_CODEPOINT(
428
- codepoint = LONG2FIX(cp);
703
+ cp_num = LONG2FIX(cp);
429
704
 
430
- if (!previous_codepoint) {
431
- current_start = codepoint;
432
- }
433
- else if (previous_codepoint + 2 != codepoint) {
434
- // gap found, finalize previous range
435
- rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
436
- current_start = codepoint;
437
- }
438
- current_end = codepoint;
439
- previous_codepoint = codepoint;
440
- );
705
+ if (!previous_cp_num) {
706
+ current_start = cp_num;
707
+ } else if (previous_cp_num + 2 != cp_num) {
708
+ // gap found, finalize previous range
709
+ rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
710
+ current_start = cp_num;
711
+ } current_end = cp_num;
712
+ previous_cp_num = cp_num;);
441
713
 
442
714
  // add final range
443
- if (current_start) {
715
+ if (current_start)
716
+ {
444
717
  rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
445
718
  }
446
719
 
@@ -448,117 +721,233 @@ method_ranges(VALUE self) {
448
721
  }
449
722
 
450
723
  static VALUE
451
- method_sample(int argc, VALUE *argv, VALUE self) {
452
- VALUE to_a_args[1], array;
724
+ cs_method_sample(int argc, VALUE *argv, VALUE self)
725
+ {
726
+ VALUE array, to_a_args[1] = {Qtrue};
453
727
  rb_check_arity(argc, 0, 1);
454
- to_a_args[0] = Qtrue;
455
- array = method_to_a(1, to_a_args, self);
728
+ array = cs_method_to_a(1, to_a_args, self);
456
729
  return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
457
730
  }
458
731
 
459
732
  static inline VALUE
460
- new_set_from_section(VALUE set, cp_index from, cp_index upto) {
461
- cp_byte *cps, *new_cps;
462
- cp_index cp;
463
- FETCH_CODEPOINTS(set, cps);
464
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
465
- for (cp = from; cp <= upto; cp++) {
466
- if (TSTBIT(cps, cp)) SETBIT(new_cps, cp);
733
+ cs_from_section(VALUE set, cs_cp from, cs_cp upto)
734
+ {
735
+ VALUE new_cs;
736
+ cs_ar *cps;
737
+ cs_cp cp, len;
738
+ struct cs_data *new_data;
739
+ new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
740
+ cps = cs_fetch_cps(set, &len);
741
+ for (cp = from; cp <= upto; cp++)
742
+ {
743
+ if (tst_cp(cps, len, cp))
744
+ {
745
+ set_cp(new_data, cp);
746
+ }
467
747
  }
468
- return NEW_CHARACTER_SET(RBASIC(set)->klass, new_cps);
748
+ return new_cs;
469
749
  }
470
750
 
471
751
  static VALUE
472
- method_bmp_part(VALUE self) {
473
- return new_set_from_section(self, 0, UNICODE_PLANE_SIZE - 1);
752
+ cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
753
+ {
754
+ return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
755
+ }
756
+
757
+ static inline cs_cp
758
+ cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
759
+ {
760
+ cs_ar *cps;
761
+ cs_cp cp, count, len;
762
+ cps = cs_fetch_cps(set, &len);
763
+ for (count = 0, cp = from; cp <= upto; cp++)
764
+ {
765
+ if (tst_cp(cps, len, cp))
766
+ {
767
+ count++;
768
+ }
769
+ }
770
+ return count;
474
771
  }
475
772
 
476
773
  static VALUE
477
- method_astral_part(VALUE self) {
478
- return new_set_from_section(self, UNICODE_PLANE_SIZE, UNICODE_CP_COUNT - 1);
774
+ cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
775
+ {
776
+ cs_cp count;
777
+ count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
778
+ return LONG2FIX(count);
479
779
  }
480
780
 
481
781
  static inline VALUE
482
- set_has_member_in_plane(VALUE set, unsigned int plane) {
483
- cp_byte *cps;
484
- cp_index cp, max_cp;
485
- FETCH_CODEPOINTS(set, cps);
486
- cp = plane * UNICODE_PLANE_SIZE;
487
- max_cp = (plane + 1) * UNICODE_PLANE_SIZE - 1;
488
- for (/* */; cp <= max_cp; cp++) {
489
- if (TSTBIT(cps, cp)) return Qtrue;
782
+ cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
783
+ {
784
+ cs_cp cp;
785
+ for (cp = from; cp <= upto; cp++)
786
+ {
787
+ if (tst_cp(cps, len, cp))
788
+ {
789
+ return Qtrue;
790
+ }
490
791
  }
491
792
  return Qfalse;
492
793
  }
493
794
 
494
795
  static VALUE
495
- method_planes(VALUE self) {
796
+ cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
797
+ {
798
+ cs_ar *cps;
799
+ cs_cp len;
800
+ cps = cs_fetch_cps(self, &len);
801
+ return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
802
+ }
803
+
804
+ static inline VALUE
805
+ cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
806
+ {
807
+ double section_count, total_count;
808
+ section_count = (double)cs_active_cp_count_in_section(set, from, upto);
809
+ total_count = (double)cs_active_cp_count(set);
810
+ return DBL2NUM(section_count / total_count);
811
+ }
812
+
813
+ static VALUE
814
+ cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
815
+ {
816
+ return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
817
+ }
818
+
819
+ #define MAX_CP 0x10FFFF
820
+ #define MAX_ASCII_CP 0x7F
821
+ #define MAX_BMP_CP 0xFFFF
822
+ #define MIN_ASTRAL_CP 0x10000
823
+
824
+ static inline VALUE
825
+ cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
826
+ {
827
+ cs_cp plane_beg, plane_end;
828
+ plane_beg = plane * UNICODE_PLANE_SIZE;
829
+ plane_end = (plane + 1) * MAX_BMP_CP;
830
+ return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
831
+ }
832
+
833
+ static VALUE
834
+ cs_method_planes(VALUE self)
835
+ {
836
+ cs_ar *cps;
837
+ cs_cp len;
496
838
  unsigned int i;
497
839
  VALUE planes;
840
+ cps = cs_fetch_cps(self, &len);
498
841
  planes = rb_ary_new();
499
- for (i = 0; i < UNICODE_PLANE_COUNT; i++) {
500
- if (set_has_member_in_plane(self, i)) rb_ary_push(planes, INT2FIX(i));
842
+ for (i = 0; i < UNICODE_PLANE_COUNT; i++)
843
+ {
844
+ if (cs_has_cp_in_plane(cps, len, i))
845
+ {
846
+ rb_ary_push(planes, INT2FIX(i));
847
+ }
501
848
  }
502
849
  return planes;
503
850
  }
504
851
 
505
- static VALUE
506
- method_member_in_plane_p(VALUE self, VALUE plane_num) {
852
+ static inline int
853
+ cs_valid_plane_num(VALUE num)
854
+ {
507
855
  int plane;
508
- Check_Type(plane_num, T_FIXNUM);
509
- plane = FIX2INT(plane_num);
510
- if (plane < 0 || plane >= UNICODE_PLANE_COUNT) {
511
- rb_raise(rb_eArgError, "plane must be between 0 and 16");
856
+ Check_Type(num, T_FIXNUM);
857
+ plane = FIX2INT(num);
858
+ if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
859
+ {
860
+ rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
512
861
  }
513
- return set_has_member_in_plane(self, plane);
862
+ return plane;
863
+ }
864
+
865
+ static VALUE
866
+ cs_method_plane(VALUE self, VALUE plane_num)
867
+ {
868
+ cs_cp plane, plane_beg, plane_end;
869
+ plane = cs_valid_plane_num(plane_num);
870
+ plane_beg = plane * UNICODE_PLANE_SIZE;
871
+ plane_end = (plane + 1) * MAX_BMP_CP;
872
+ return cs_from_section(self, plane_beg, plane_end);
873
+ }
874
+
875
+ static VALUE
876
+ cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
877
+ {
878
+ cs_ar *cps;
879
+ cs_cp len;
880
+ unsigned int plane;
881
+ plane = cs_valid_plane_num(plane_num);
882
+ cps = cs_fetch_cps(self, &len);
883
+ return cs_has_cp_in_plane(cps, len, plane);
514
884
  }
515
885
 
516
886
  #define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
517
887
 
518
888
  static VALUE
519
- method_ext_inversion(int argc, VALUE *argv, VALUE self) {
520
- int include_surrogates;
521
- cp_index upto;
522
- VALUE other;
523
- other = 0;
889
+ cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
890
+ {
891
+ int inc_surr;
892
+ cs_cp upto, cp, len;
893
+ cs_ar *cps;
894
+ VALUE new_cs;
895
+ struct cs_data *new_data;
896
+
524
897
  rb_check_arity(argc, 0, 2);
525
- include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
526
- if ((argc > 1) && FIXNUM_P(argv[1])) {
527
- upto = FIX2ULONG(argv[1]);
528
- RETURN_NEW_SET_BASED_ON(
529
- cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
530
- );
898
+
899
+ cps = cs_fetch_cps(self, &len);
900
+ inc_surr = argc && argv[0] == Qtrue;
901
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
902
+ upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
903
+
904
+ for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
905
+ {
906
+ if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
907
+ {
908
+ set_cp(new_data, cp);
909
+ }
531
910
  }
532
- RETURN_NEW_SET_BASED_ON(
533
- !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
534
- );
911
+
912
+ return new_cs;
535
913
  }
536
914
 
537
- typedef int(*str_cp_handler)(unsigned int, cp_byte*);
915
+ typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
538
916
 
539
917
  static inline int
540
- add_str_cp_to_arr(unsigned int str_cp, cp_byte *cp_arr) {
541
- SETBIT(cp_arr, str_cp);
918
+ add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
919
+ {
920
+ set_cp(data, str_cp);
542
921
  return 1;
543
922
  }
544
923
 
545
924
  static VALUE
546
- method_case_insensitive(VALUE self) {
547
- cp_index i;
548
- cp_byte *new_cps;
549
-
550
- new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
925
+ cs_method_case_insensitive(VALUE self)
926
+ {
927
+ cs_cp i, len;
928
+ cs_ar *cps;
929
+ VALUE new_cs;
930
+ struct cs_data *new_data;
551
931
 
552
- FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));
932
+ cps = cs_fetch_cps(self, &len);
933
+ new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
934
+ cs_merge_cs(new_cs, self);
553
935
 
554
- for (i = 0; i < CASEFOLD_COUNT; i++) {
936
+ for (i = 0; i < CASEFOLD_COUNT; i++)
937
+ {
555
938
  casefold_mapping m = unicode_casefold_table[i];
556
939
 
557
- if (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
558
- else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
940
+ if (tst_cp(cps, len, m.from))
941
+ {
942
+ set_cp(new_data, m.to);
943
+ }
944
+ else if (tst_cp(cps, len, m.to))
945
+ {
946
+ set_cp(new_data, m.from);
947
+ }
559
948
  }
560
949
 
561
- return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
950
+ return new_cs;
562
951
 
563
952
  // OnigCaseFoldType flags;
564
953
  // rb_encoding *enc;
@@ -573,20 +962,27 @@ method_case_insensitive(VALUE self) {
573
962
  }
574
963
 
575
964
  static inline VALUE
576
- each_sb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
577
- long i;
965
+ each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
966
+ {
967
+ long i, str_len;
578
968
  unsigned int str_cp;
969
+ str_len = RSTRING_LEN(str);
579
970
 
580
- for (i = 0; i < RSTRING_LEN(str); i++) {
971
+ for (i = 0; i < str_len; i++)
972
+ {
581
973
  str_cp = (RSTRING_PTR(str)[i] & 0xff);
582
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
974
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
975
+ {
976
+ return Qfalse;
977
+ }
583
978
  }
584
979
 
585
980
  return Qtrue;
586
981
  }
587
982
 
588
983
  static inline VALUE
589
- each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
984
+ each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
985
+ {
590
986
  int n;
591
987
  unsigned int str_cp;
592
988
  const char *ptr, *end;
@@ -597,9 +993,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
597
993
  end = RSTRING_END(str);
598
994
  enc = rb_enc_get(str);
599
995
 
600
- while (ptr < end) {
996
+ while (ptr < end)
997
+ {
601
998
  str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
602
- if (!(*func)(str_cp, cp_arr)) return Qfalse;
999
+ if (!(*func)(str_cp, cp_arr, len, data, memo))
1000
+ {
1001
+ return Qfalse;
1002
+ }
603
1003
  ptr += n;
604
1004
  }
605
1005
 
@@ -611,105 +1011,238 @@ static inline int
611
1011
  single_byte_optimizable(VALUE str)
612
1012
  {
613
1013
  rb_encoding *enc;
614
- if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
1014
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
1015
+ {
1016
+ return 1;
1017
+ }
615
1018
 
616
1019
  enc = rb_enc_get(str);
617
- if (rb_enc_mbmaxlen(enc) == 1) return 1;
1020
+ if (rb_enc_mbmaxlen(enc) == 1)
1021
+ {
1022
+ return 1;
1023
+ }
618
1024
 
619
1025
  return 0;
620
1026
  }
621
1027
 
622
1028
  static inline VALUE
623
- each_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
624
- if (single_byte_optimizable(str)) {
625
- return each_sb_cp(str, func, cp_arr);
1029
+ each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1030
+ {
1031
+ if (single_byte_optimizable(str))
1032
+ {
1033
+ return each_sb_cp(str, func, cp_arr, len, data, memo);
626
1034
  }
627
- return each_mb_cp(str, func, cp_arr);
1035
+ return each_mb_cp(str, func, cp_arr, len, data, memo);
628
1036
  }
629
1037
 
630
1038
  static inline void
631
- raise_arg_err_unless_string(VALUE val) {
632
- if (!RB_TYPE_P(val, T_STRING)) rb_raise(rb_eArgError, "pass a String");
1039
+ raise_arg_err_unless_string(VALUE val)
1040
+ {
1041
+ if (!RB_TYPE_P(val, T_STRING))
1042
+ {
1043
+ rb_raise(rb_eArgError, "pass a String");
1044
+ }
633
1045
  }
634
1046
 
635
1047
  static VALUE
636
- class_method_of(VALUE self, VALUE str) {
637
- cp_byte *cp_arr;
1048
+ cs_class_method_of(VALUE self, VALUE str)
1049
+ {
1050
+ VALUE new_cs;
1051
+ struct cs_data *new_data;
1052
+ new_cs = cs_alloc(self, &new_data);
638
1053
  raise_arg_err_unless_string(str);
639
- cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
640
- each_cp(str, add_str_cp_to_arr, cp_arr);
641
- return NEW_CHARACTER_SET(self, cp_arr);
1054
+ each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
1055
+ return new_cs;
642
1056
  }
643
1057
 
644
1058
  static inline int
645
- str_cp_not_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
646
- return !TSTBIT(cp_arr, str_cp);
1059
+ count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1060
+ {
1061
+ if (tst_cp(cp_arr, len, str_cp))
1062
+ {
1063
+ *memo += 1;
1064
+ }
1065
+ return 1;
647
1066
  }
648
1067
 
649
1068
  static VALUE
650
- method_used_by_p(VALUE self, VALUE str) {
651
- cp_byte *cps;
652
- VALUE only_uses_other_cps;
1069
+ cs_method_count_in(VALUE self, VALUE str)
1070
+ {
1071
+ VALUE count;
1072
+ struct cs_data *data;
653
1073
  raise_arg_err_unless_string(str);
654
- FETCH_CODEPOINTS(self, cps);
655
- only_uses_other_cps = each_cp(str, str_cp_not_in_arr, cps);
656
- return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1074
+ data = cs_fetch_data(self);
1075
+ count = 0;
1076
+ each_cp(str, count_str_cp, data->cps, data->len, data, &count);
1077
+ return INT2NUM(count);
1078
+ }
1079
+
1080
+ static inline int
1081
+ str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1082
+ {
1083
+ return tst_cp(cp_arr, len, str_cp);
1084
+ }
1085
+
1086
+ static VALUE
1087
+ cs_method_cover_p(VALUE self, VALUE str)
1088
+ {
1089
+ struct cs_data *data;
1090
+ raise_arg_err_unless_string(str);
1091
+ data = cs_fetch_data(self);
1092
+ return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
1093
+ }
1094
+
1095
+ static inline int
1096
+ add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1097
+ {
1098
+ if (tst_cp(cp_arr, len, str_cp))
1099
+ {
1100
+ rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
1101
+ }
1102
+ return 1;
1103
+ }
1104
+
1105
+ static VALUE
1106
+ cs_method_scan(VALUE self, VALUE str)
1107
+ {
1108
+ VALUE memo[2];
1109
+ struct cs_data *data;
1110
+ raise_arg_err_unless_string(str);
1111
+ data = cs_fetch_data(self);
1112
+ memo[0] = rb_ary_new();
1113
+ memo[1] = (VALUE)rb_enc_get(str);
1114
+ each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
1115
+ return memo[0];
657
1116
  }
658
1117
 
659
1118
  static inline int
660
- str_cp_in_arr(unsigned int str_cp, cp_byte *cp_arr) {
661
- return TSTBIT(cp_arr, str_cp);
1119
+ str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
1120
+ {
1121
+ return !tst_cp(cp_arr, len, str_cp);
662
1122
  }
663
1123
 
664
1124
  static VALUE
665
- method_cover_p(VALUE self, VALUE str) {
666
- cp_byte *cps;
1125
+ cs_method_used_by_p(VALUE self, VALUE str)
1126
+ {
1127
+ VALUE only_uses_other_cps;
1128
+ struct cs_data *data;
667
1129
  raise_arg_err_unless_string(str);
668
- FETCH_CODEPOINTS(self, cps);
669
- return each_cp(str, str_cp_in_arr, cps);
1130
+ data = cs_fetch_data(self);
1131
+ only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
1132
+ return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
1133
+ }
1134
+
1135
+ static void
1136
+ cs_str_buf_cat(VALUE str, const char *ptr, long len)
1137
+ {
1138
+ long total, olen;
1139
+ char *sptr;
1140
+
1141
+ RSTRING_GETMEM(str, sptr, olen);
1142
+ sptr = RSTRING(str)->as.heap.ptr;
1143
+ olen = RSTRING(str)->as.heap.len;
1144
+ total = olen + len;
1145
+ memcpy(sptr + olen, ptr, len);
1146
+ RSTRING(str)->as.heap.len = total;
1147
+ }
1148
+
1149
+ #ifndef TERM_FILL
1150
+ #define TERM_FILL(ptr, termlen) \
1151
+ do \
1152
+ { \
1153
+ char *const term_fill_ptr = (ptr); \
1154
+ const int term_fill_len = (termlen); \
1155
+ *term_fill_ptr = '\0'; \
1156
+ if (__builtin_expect(!!(term_fill_len > 1), 0)) \
1157
+ memset(term_fill_ptr, 0, term_fill_len); \
1158
+ } while (0)
1159
+ #endif
1160
+
1161
+ static void
1162
+ cs_str_buf_terminate(VALUE str, rb_encoding *enc)
1163
+ {
1164
+ char *ptr;
1165
+ long len;
1166
+
1167
+ ptr = RSTRING(str)->as.heap.ptr;
1168
+ len = RSTRING(str)->as.heap.len;
1169
+ TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
670
1170
  }
671
1171
 
672
1172
  static inline VALUE
673
- apply_to_str(VALUE set, VALUE str, int delete, int bang) {
674
- cp_byte *cps;
1173
+ cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
1174
+ {
1175
+ cs_ar *cps;
1176
+ cs_cp len;
675
1177
  rb_encoding *str_enc;
676
- VALUE orig_len, blen, new_str_buf, chr;
677
- int n;
1178
+ VALUE orig_len, new_str_buf;
1179
+ int cp_len;
678
1180
  unsigned int str_cp;
679
1181
  const char *ptr, *end;
680
1182
 
681
1183
  raise_arg_err_unless_string(str);
682
1184
 
683
- FETCH_CODEPOINTS(set, cps);
1185
+ cps = cs_fetch_cps(set, &len);
684
1186
 
685
1187
  orig_len = RSTRING_LEN(str);
686
- blen = orig_len + 30; /* len + margin */ // not sure why, copied from string.c
687
- new_str_buf = rb_str_buf_new(blen);
1188
+ if (orig_len < 1) // empty string, will never change
1189
+ {
1190
+ if (bang)
1191
+ {
1192
+ return Qnil;
1193
+ }
1194
+ return rb_str_dup(str);
1195
+ }
1196
+
1197
+ new_str_buf = rb_str_buf_new(orig_len);
688
1198
  str_enc = rb_enc_get(str);
689
1199
  rb_enc_associate(new_str_buf, str_enc);
690
- ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ?
691
- ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1200
+ rb_str_modify(new_str_buf);
1201
+ ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
692
1202
 
693
1203
  ptr = RSTRING_PTR(str);
694
1204
  end = RSTRING_END(str);
695
1205
 
696
- while (ptr < end) {
697
- str_cp = rb_enc_codepoint_len(ptr, end, &n, str_enc);
698
- if (!TSTBIT(cps, str_cp) != !delete) {
699
- chr = rb_enc_uint_chr(str_cp, str_enc);
700
- rb_enc_str_buf_cat(new_str_buf, RSTRING_PTR(chr), n, str_enc);
1206
+ if (single_byte_optimizable(str))
1207
+ {
1208
+ while (ptr < end)
1209
+ {
1210
+ str_cp = *ptr & 0xff;
1211
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1212
+ {
1213
+ cs_str_buf_cat(new_str_buf, ptr, 1);
1214
+ }
1215
+ ptr++;
1216
+ }
1217
+ }
1218
+ else // likely to be multibyte string
1219
+ {
1220
+ while (ptr < end)
1221
+ {
1222
+ str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
1223
+ if ((!tst_cp(cps, len, str_cp)) == delete)
1224
+ {
1225
+ cs_str_buf_cat(new_str_buf, ptr, cp_len);
1226
+ }
1227
+ ptr += cp_len;
701
1228
  }
702
- ptr += n;
703
1229
  }
704
1230
 
705
- if (bang) {
706
- if (RSTRING_LEN(new_str_buf) == (long)orig_len) return Qnil; // unchanged
1231
+ cs_str_buf_terminate(new_str_buf, str_enc);
1232
+
1233
+ if (bang)
1234
+ {
1235
+ if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
1236
+ {
1237
+ return Qnil;
1238
+ }
707
1239
  rb_str_shared_replace(str, new_str_buf);
708
1240
  }
709
- else {
1241
+ else
1242
+ {
710
1243
  RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
711
1244
  // slightly cumbersome approach needed for compatibility with Ruby < 2.3:
712
- RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
1245
+ RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags & (FL_TAINT));
713
1246
  str = new_str_buf;
714
1247
  }
715
1248
 
@@ -717,98 +1250,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
717
1250
  }
718
1251
 
719
1252
  static VALUE
720
- method_delete_in(VALUE self, VALUE str) {
721
- return apply_to_str(self, str, 1, 0);
1253
+ cs_method_delete_in(VALUE self, VALUE str)
1254
+ {
1255
+ return cs_apply_to_str(self, str, 1, 0);
1256
+ }
1257
+
1258
+ static VALUE
1259
+ cs_method_delete_in_bang(VALUE self, VALUE str)
1260
+ {
1261
+ return cs_apply_to_str(self, str, 1, 1);
722
1262
  }
723
1263
 
724
1264
  static VALUE
725
- method_delete_in_bang(VALUE self, VALUE str) {
726
- return apply_to_str(self, str, 1, 1);
1265
+ cs_method_keep_in(VALUE self, VALUE str)
1266
+ {
1267
+ return cs_apply_to_str(self, str, 0, 0);
727
1268
  }
728
1269
 
729
1270
  static VALUE
730
- method_keep_in(VALUE self, VALUE str) {
731
- return apply_to_str(self, str, 0, 0);
1271
+ cs_method_keep_in_bang(VALUE self, VALUE str)
1272
+ {
1273
+ return cs_apply_to_str(self, str, 0, 1);
732
1274
  }
733
1275
 
734
1276
  static VALUE
735
- method_keep_in_bang(VALUE self, VALUE str) {
736
- return apply_to_str(self, str, 0, 1);
1277
+ cs_method_allocated_length(VALUE self)
1278
+ {
1279
+ return LONG2FIX(cs_fetch_data(self)->len);
737
1280
  }
738
1281
 
739
1282
  // ****
740
1283
  // init
741
1284
  // ****
742
1285
 
743
- void
744
- Init_character_set()
1286
+ void Init_character_set()
745
1287
  {
746
1288
  VALUE cs = rb_define_class("CharacterSet", rb_cObject);
747
1289
 
748
- rb_define_alloc_func(cs, method_allocate);
1290
+ rb_define_alloc_func(cs, cs_method_allocate);
749
1291
 
750
1292
  // `Set` compatibility methods
751
1293
 
752
- rb_define_method(cs, "each", method_each, 0);
753
- rb_define_method(cs, "to_a", method_to_a, -1);
754
- rb_define_method(cs, "length", method_length, 0);
755
- rb_define_method(cs, "size", method_length, 0);
756
- rb_define_method(cs, "count", method_length, 0);
757
- rb_define_method(cs, "empty?", method_empty_p, 0);
758
- rb_define_method(cs, "hash", method_hash, 0);
759
- rb_define_method(cs, "keep_if", method_keep_if, 0);
760
- rb_define_method(cs, "delete_if", method_delete_if, 0);
761
- rb_define_method(cs, "clear", method_clear, 0);
762
- rb_define_method(cs, "intersection", method_intersection, 1);
763
- rb_define_method(cs, "&", method_intersection, 1);
764
- rb_define_method(cs, "union", method_union, 1);
765
- rb_define_method(cs, "+", method_union, 1);
766
- rb_define_method(cs, "|", method_union, 1);
767
- rb_define_method(cs, "difference", method_difference, 1);
768
- rb_define_method(cs, "-", method_difference, 1);
769
- rb_define_method(cs, "^", method_exclusion, 1);
770
- rb_define_method(cs, "include?", method_include_p, 1);
771
- rb_define_method(cs, "member?", method_include_p, 1);
772
- rb_define_method(cs, "===", method_include_p, 1);
773
- rb_define_method(cs, "add", method_add, 1);
774
- rb_define_method(cs, "<<", method_add, 1);
775
- rb_define_method(cs, "add?", method_add_p, 1);
776
- rb_define_method(cs, "delete", method_delete, 1);
777
- rb_define_method(cs, "delete?", method_delete_p, 1);
778
- rb_define_method(cs, "intersect?", method_intersect_p, 1);
779
- rb_define_method(cs, "disjoint?", method_disjoint_p, 1);
780
- rb_define_method(cs, "eql?", method_eql_p, 1);
781
- rb_define_method(cs, "==", method_eql_p, 1);
782
- rb_define_method(cs, "merge", method_merge, 1);
783
- rb_define_method(cs, "initialize_clone", method_initialize_copy, 1);
784
- rb_define_method(cs, "initialize_dup", method_initialize_copy, 1);
785
- rb_define_method(cs, "subtract", method_subtract, 1);
786
- rb_define_method(cs, "subset?", method_subset_p, 1);
787
- rb_define_method(cs, "<=", method_subset_p, 1);
788
- rb_define_method(cs, "proper_subset?", method_proper_subset_p, 1);
789
- rb_define_method(cs, "<", method_proper_subset_p, 1);
790
- rb_define_method(cs, "superset?", method_superset_p, 1);
791
- rb_define_method(cs, ">=", method_superset_p, 1);
792
- rb_define_method(cs, "proper_superset?", method_proper_superset_p, 1);
793
- rb_define_method(cs, ">", method_proper_superset_p, 1);
1294
+ rb_define_method(cs, "each", cs_method_each, 0);
1295
+ rb_define_method(cs, "to_a", cs_method_to_a, -1);
1296
+ rb_define_method(cs, "length", cs_method_length, 0);
1297
+ rb_define_method(cs, "size", cs_method_length, 0);
1298
+ rb_define_method(cs, "empty?", cs_method_empty_p, 0);
1299
+ rb_define_method(cs, "hash", cs_method_hash, 0);
1300
+ rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
1301
+ rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
1302
+ rb_define_method(cs, "clear", cs_method_clear, 0);
1303
+ rb_define_method(cs, "min", cs_method_min, 0);
1304
+ rb_define_method(cs, "max", cs_method_max, 0);
1305
+ rb_define_method(cs, "minmax", cs_method_minmax, 0);
1306
+ rb_define_method(cs, "intersection", cs_method_intersection, 1);
1307
+ rb_define_method(cs, "&", cs_method_intersection, 1);
1308
+ rb_define_method(cs, "union", cs_method_union, 1);
1309
+ rb_define_method(cs, "+", cs_method_union, 1);
1310
+ rb_define_method(cs, "|", cs_method_union, 1);
1311
+ rb_define_method(cs, "difference", cs_method_difference, 1);
1312
+ rb_define_method(cs, "-", cs_method_difference, 1);
1313
+ rb_define_method(cs, "^", cs_method_exclusion, 1);
1314
+ rb_define_method(cs, "include?", cs_method_include_p, 1);
1315
+ rb_define_method(cs, "member?", cs_method_include_p, 1);
1316
+ rb_define_method(cs, "===", cs_method_include_p, 1);
1317
+ rb_define_method(cs, "add", cs_method_add, 1);
1318
+ rb_define_method(cs, "<<", cs_method_add, 1);
1319
+ rb_define_method(cs, "add?", cs_method_add_p, 1);
1320
+ rb_define_method(cs, "delete", cs_method_delete, 1);
1321
+ rb_define_method(cs, "delete?", cs_method_delete_p, 1);
1322
+ rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
1323
+ rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
1324
+ rb_define_method(cs, "eql?", cs_method_eql_p, 1);
1325
+ rb_define_method(cs, "==", cs_method_eql_p, 1);
1326
+ rb_define_method(cs, "merge", cs_method_merge, 1);
1327
+ rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
1328
+ rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
1329
+ rb_define_method(cs, "subtract", cs_method_subtract, 1);
1330
+ rb_define_method(cs, "subset?", cs_method_subset_p, 1);
1331
+ rb_define_method(cs, "<=", cs_method_subset_p, 1);
1332
+ rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
1333
+ rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
1334
+ rb_define_method(cs, "superset?", cs_method_superset_p, 1);
1335
+ rb_define_method(cs, ">=", cs_method_superset_p, 1);
1336
+ rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
1337
+ rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
794
1338
 
795
1339
  // `CharacterSet`-specific methods
796
1340
 
797
- rb_define_singleton_method(cs, "from_ranges", class_method_from_ranges, -2);
798
- rb_define_singleton_method(cs, "of", class_method_of, 1);
799
-
800
- rb_define_method(cs, "ranges", method_ranges, 0);
801
- rb_define_method(cs, "sample", method_sample, -1);
802
- rb_define_method(cs, "bmp_part", method_bmp_part, 0);
803
- rb_define_method(cs, "astral_part", method_astral_part, 0);
804
- rb_define_method(cs, "planes", method_planes, 0);
805
- rb_define_method(cs, "member_in_plane?", method_member_in_plane_p, 1);
806
- rb_define_method(cs, "ext_inversion", method_ext_inversion, -1);
807
- rb_define_method(cs, "case_insensitive", method_case_insensitive, 0);
808
- rb_define_method(cs, "used_by?", method_used_by_p, 1);
809
- rb_define_method(cs, "cover?", method_cover_p, 1);
810
- rb_define_method(cs, "delete_in", method_delete_in, 1);
811
- rb_define_method(cs, "delete_in!", method_delete_in_bang, 1);
812
- rb_define_method(cs, "keep_in", method_keep_in, 1);
813
- rb_define_method(cs, "keep_in!", method_keep_in_bang, 1);
1341
+ rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
1342
+ rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
1343
+
1344
+ rb_define_method(cs, "ranges", cs_method_ranges, 0);
1345
+ rb_define_method(cs, "sample", cs_method_sample, -1);
1346
+ rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
1347
+ rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
1348
+ rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
1349
+ rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
1350
+ rb_define_method(cs, "planes", cs_method_planes, 0);
1351
+ rb_define_method(cs, "plane", cs_method_plane, 1);
1352
+ rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
1353
+ rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
1354
+ rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
1355
+ rb_define_method(cs, "count_in", cs_method_count_in, 1);
1356
+ rb_define_method(cs, "cover?", cs_method_cover_p, 1);
1357
+ rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
1358
+ rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
1359
+ rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
1360
+ rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
1361
+ rb_define_method(cs, "scan", cs_method_scan, 1);
1362
+ rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
1363
+ rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
814
1364
  }