character_set 1.1.1 → 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +11 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +47 -0
- data/README.md +38 -14
- data/Rakefile +60 -36
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -6
- data/ext/character_set/character_set.c +963 -414
- data/ext/character_set/unicode_casefold_table.h +10 -2
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +25 -27
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/shared_methods.rb +60 -49
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +88 -22
- data/.travis.yml +0 -11
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
tr = '^A-Za-z'
|
5
|
+
cs = CharacterSet.non_ascii_letter
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Counting non-letters',
|
9
|
+
cases: {
|
10
|
+
'String#count' => -> { str.count(tr) },
|
11
|
+
'CharacterSet#count_in' => -> { cs.count_in(str) },
|
12
|
+
}
|
13
|
+
)
|
data/benchmarks/delete_in.rb
CHANGED
@@ -14,7 +14,7 @@ benchmark(
|
|
14
14
|
|
15
15
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
16
16
|
rx = /[\s\p{emoji}äüö]/
|
17
|
-
cs = CharacterSet.whitespace + CharacterSet.emoji +
|
17
|
+
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
18
18
|
|
19
19
|
benchmark(
|
20
20
|
caption: 'Removing whitespace, emoji and umlauts',
|
data/benchmarks/scan.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum ⛷ et dolorem'
|
4
|
+
rx = /\p{emoji}/
|
5
|
+
cs = CharacterSet.emoji
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Extracting emoji to an Array',
|
9
|
+
cases: {
|
10
|
+
'String#scan' => -> { str.scan(rx) },
|
11
|
+
'CharacterSet#scan' => -> { cs.scan(str) },
|
12
|
+
}
|
13
|
+
)
|
data/benchmarks/shared.rb
CHANGED
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
|
4
4
|
require 'benchmark/ips'
|
5
5
|
require 'character_set'
|
6
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
7
|
+
require 'sorted_set'
|
8
|
+
else
|
9
|
+
require 'set'
|
10
|
+
end
|
6
11
|
|
7
12
|
def benchmark(caption: nil, cases: {})
|
8
13
|
puts caption
|
data/benchmarks/z_add.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs = CharacterSet.new(0..0x10FFFF)
|
4
|
+
ss = SortedSet.new(0..0x10FFFF)
|
5
|
+
|
6
|
+
benchmark(
|
7
|
+
caption: 'Removing entries',
|
8
|
+
cases: {
|
9
|
+
'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
|
10
|
+
'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
|
11
|
+
}
|
12
|
+
)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs1 = CharacterSet.new(0...0x88000)
|
4
|
+
cs2 = CharacterSet.new(0x88000..0x10FFFF)
|
5
|
+
|
6
|
+
ss1 = SortedSet.new(0...0x88000)
|
7
|
+
ss2 = SortedSet.new(0x88000..0x10FFFF)
|
8
|
+
|
9
|
+
benchmark(
|
10
|
+
caption: 'Merging entries',
|
11
|
+
cases: {
|
12
|
+
'CharacterSet#merge' => -> { cs1.merge(cs2) },
|
13
|
+
'SortedSet#merge' => -> { ss1.merge(ss2) },
|
14
|
+
}
|
15
|
+
)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs = CharacterSet.new(0..0xFFFF)
|
4
|
+
ss = SortedSet.new(0..0xFFFF)
|
5
|
+
|
6
|
+
benchmark(
|
7
|
+
caption: 'Getting the min and max',
|
8
|
+
cases: {
|
9
|
+
'CharacterSet#minmax' => -> { cs.minmax },
|
10
|
+
'SortedSet#minmax' => -> { ss.minmax },
|
11
|
+
}
|
12
|
+
)
|
data/bin/console
CHANGED
data/character_set.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.email = ['janosch84@gmail.com']
|
11
11
|
|
12
12
|
s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
|
13
|
-
s.homepage = 'https://github.com/
|
13
|
+
s.homepage = 'https://github.com/jaynetics/character_set'
|
14
14
|
s.license = 'MIT'
|
15
15
|
|
16
16
|
s.files = `git ls-files -z`.split("\x0").reject do |f|
|
@@ -22,12 +22,23 @@ Gem::Specification.new do |s|
|
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
24
|
|
25
|
+
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
26
|
+
# This dependency is only used if the C extension is unavailable.
|
27
|
+
# JRuby has it in the stdlib.
|
28
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
|
+
s.add_dependency 'sorted_set', '~> 1.0'
|
30
|
+
end
|
31
|
+
|
25
32
|
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
26
|
-
s.add_development_dependency '
|
27
|
-
s.add_development_dependency 'rake', '~>
|
28
|
-
s.add_development_dependency 'rake-compiler', '~> 1.
|
33
|
+
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
34
|
+
s.add_development_dependency 'rake', '~> 13.0'
|
35
|
+
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
29
36
|
s.add_development_dependency 'range_compressor', '~> 1.0'
|
30
|
-
s.add_development_dependency 'regexp_parser', '~> 1.
|
31
|
-
s.add_development_dependency 'regexp_property_values', '~> 0
|
37
|
+
s.add_development_dependency 'regexp_parser', '~> 1.6'
|
38
|
+
s.add_development_dependency 'regexp_property_values', '~> 1.0'
|
32
39
|
s.add_development_dependency 'rspec', '~> 3.8'
|
40
|
+
if RUBY_VERSION.to_f >= 2.7
|
41
|
+
s.add_development_dependency 'codecov', '~> 0.2.12'
|
42
|
+
s.add_development_dependency 'rubocop', '~> 1.8'
|
43
|
+
end
|
33
44
|
end
|
@@ -2,81 +2,180 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include "unicode_casefold_table.h"
|
4
4
|
|
5
|
-
#define
|
6
|
-
#define
|
7
|
-
#define
|
5
|
+
#define UNICODE_PLANE_SIZE 0x10000
|
6
|
+
#define UNICODE_PLANE_COUNT 17
|
7
|
+
#define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
// start at ascii size
|
10
|
+
#define CS_DEFAULT_INITIAL_LEN 128
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
typedef char cs_ar;
|
13
|
+
typedef unsigned long cs_cp;
|
14
|
+
|
15
|
+
struct cs_data
|
16
|
+
{
|
17
|
+
cs_ar *cps;
|
18
|
+
cs_cp len;
|
19
|
+
};
|
20
|
+
|
21
|
+
#define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
|
22
|
+
|
23
|
+
static inline void
|
24
|
+
add_memspace_for_another_plane(struct cs_data *data)
|
25
|
+
{
|
26
|
+
data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
|
27
|
+
memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
|
28
|
+
data->len += UNICODE_PLANE_SIZE;
|
29
|
+
}
|
30
|
+
|
31
|
+
static inline void
|
32
|
+
ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
|
33
|
+
{
|
34
|
+
while (target_cp >= data->len)
|
35
|
+
{
|
36
|
+
add_memspace_for_another_plane(data);
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
static inline void
|
41
|
+
set_cp(struct cs_data *data, cs_cp cp)
|
42
|
+
{
|
43
|
+
ensure_memsize_fits(data, cp);
|
44
|
+
data->cps[cp >> 3] |= (1 << (cp & 0x07));
|
45
|
+
}
|
46
|
+
|
47
|
+
static inline int
|
48
|
+
tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
|
49
|
+
{
|
50
|
+
return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
|
51
|
+
}
|
52
|
+
|
53
|
+
static inline void
|
54
|
+
clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
|
55
|
+
{
|
56
|
+
if (cp < len)
|
57
|
+
{
|
58
|
+
cps[cp >> 3] &= ~(1 << (cp & 0x07));
|
59
|
+
}
|
60
|
+
}
|
16
61
|
|
17
62
|
static void
|
18
|
-
|
19
|
-
|
63
|
+
cs_free(void *ptr)
|
64
|
+
{
|
65
|
+
struct cs_data *data = ptr;
|
66
|
+
ruby_xfree(data->cps);
|
67
|
+
ruby_xfree(data);
|
20
68
|
}
|
21
69
|
|
22
70
|
static size_t
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
.
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
71
|
+
cs_memsize(const void *ptr)
|
72
|
+
{
|
73
|
+
const struct cs_data *data = ptr;
|
74
|
+
return sizeof(*data) + CS_MSIZE(data->len);
|
75
|
+
}
|
76
|
+
|
77
|
+
static const rb_data_type_t cs_type = {
|
78
|
+
.wrap_struct_name = "character_set",
|
79
|
+
.function = {
|
80
|
+
.dmark = NULL,
|
81
|
+
.dfree = cs_free,
|
82
|
+
.dsize = cs_memsize,
|
83
|
+
},
|
84
|
+
.data = NULL,
|
85
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
37
86
|
};
|
38
87
|
|
39
|
-
|
40
|
-
|
88
|
+
static inline VALUE
|
89
|
+
cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
|
90
|
+
{
|
91
|
+
VALUE cs;
|
92
|
+
struct cs_data *data;
|
93
|
+
cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
|
94
|
+
data->cps = ruby_xmalloc(CS_MSIZE(len));
|
95
|
+
memset(data->cps, 0, CS_MSIZE(len));
|
96
|
+
data->len = len;
|
97
|
+
|
98
|
+
if (data_ptr)
|
99
|
+
{
|
100
|
+
*data_ptr = data;
|
101
|
+
}
|
41
102
|
|
42
|
-
|
43
|
-
|
103
|
+
return cs;
|
104
|
+
}
|
44
105
|
|
45
|
-
static VALUE
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
return NEW_CHARACTER_SET(self, cp_arr);
|
106
|
+
static inline VALUE
|
107
|
+
cs_alloc(VALUE klass, struct cs_data **data_ptr)
|
108
|
+
{
|
109
|
+
return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
|
50
110
|
}
|
51
111
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
112
|
+
static inline struct cs_data *
|
113
|
+
cs_fetch_data(VALUE cs)
|
114
|
+
{
|
115
|
+
struct cs_data *data;
|
116
|
+
TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
|
117
|
+
return data;
|
118
|
+
}
|
119
|
+
|
120
|
+
static inline cs_ar *
|
121
|
+
cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
|
122
|
+
{
|
123
|
+
struct cs_data *data;
|
124
|
+
data = cs_fetch_data(cs);
|
125
|
+
*len_ptr = data->len;
|
126
|
+
return data->cps;
|
127
|
+
}
|
128
|
+
|
129
|
+
static VALUE
|
130
|
+
cs_method_allocate(VALUE self)
|
131
|
+
{
|
132
|
+
return cs_alloc(self, 0);
|
133
|
+
}
|
134
|
+
|
135
|
+
#define FOR_EACH_ACTIVE_CODEPOINT(action) \
|
136
|
+
do \
|
137
|
+
{ \
|
138
|
+
cs_cp cp, len; \
|
139
|
+
cs_ar *cps; \
|
140
|
+
cps = cs_fetch_cps(self, &len); \
|
141
|
+
for (cp = 0; cp < len; cp++) \
|
142
|
+
{ \
|
143
|
+
if (tst_cp(cps, len, cp)) \
|
144
|
+
{ \
|
145
|
+
action; \
|
146
|
+
} \
|
147
|
+
} \
|
148
|
+
} while (0)
|
59
149
|
|
60
150
|
// ***************************
|
61
151
|
// `Set` compatibility methods
|
62
152
|
// ***************************
|
63
153
|
|
64
|
-
static inline
|
65
|
-
|
66
|
-
|
154
|
+
static inline cs_cp
|
155
|
+
cs_active_cp_count(VALUE self)
|
156
|
+
{
|
157
|
+
cs_cp count;
|
67
158
|
count = 0;
|
68
159
|
FOR_EACH_ACTIVE_CODEPOINT(count++);
|
69
|
-
return
|
160
|
+
return count;
|
70
161
|
}
|
71
162
|
|
72
163
|
static VALUE
|
73
|
-
|
74
|
-
|
164
|
+
cs_method_length(VALUE self)
|
165
|
+
{
|
166
|
+
return LONG2FIX(cs_active_cp_count(self));
|
167
|
+
}
|
168
|
+
|
169
|
+
static inline VALUE
|
170
|
+
cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
|
171
|
+
{
|
172
|
+
return LONG2FIX(cs_active_cp_count(self));
|
75
173
|
}
|
76
174
|
|
77
175
|
static VALUE
|
78
|
-
|
79
|
-
|
176
|
+
cs_method_each(VALUE self)
|
177
|
+
{
|
178
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
80
179
|
FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
|
81
180
|
return self;
|
82
181
|
}
|
@@ -84,16 +183,19 @@ method_each(VALUE self) {
|
|
84
183
|
// returns an Array of codepoint Integers by default.
|
85
184
|
// returns an Array of Strings of length 1 if passed `true`.
|
86
185
|
static VALUE
|
87
|
-
|
186
|
+
cs_method_to_a(int argc, VALUE *argv, VALUE self)
|
187
|
+
{
|
88
188
|
VALUE arr;
|
89
189
|
rb_encoding *enc;
|
90
190
|
rb_check_arity(argc, 0, 1);
|
91
191
|
|
92
192
|
arr = rb_ary_new();
|
93
|
-
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
|
193
|
+
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
|
194
|
+
{
|
94
195
|
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
|
95
196
|
}
|
96
|
-
else
|
197
|
+
else
|
198
|
+
{
|
97
199
|
enc = rb_utf8_encoding();
|
98
200
|
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
|
99
201
|
}
|
@@ -102,302 +204,473 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
|
|
102
204
|
}
|
103
205
|
|
104
206
|
static VALUE
|
105
|
-
|
207
|
+
cs_method_empty_p(VALUE self)
|
208
|
+
{
|
106
209
|
FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
|
107
210
|
return Qtrue;
|
108
211
|
}
|
109
212
|
|
110
213
|
static VALUE
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
214
|
+
cs_method_hash(VALUE self)
|
215
|
+
{
|
216
|
+
cs_cp cp, len, hash, four_byte_value;
|
217
|
+
cs_ar *cps;
|
218
|
+
cps = cs_fetch_cps(self, &len);
|
219
|
+
four_byte_value = 0;
|
115
220
|
|
116
221
|
hash = 17;
|
117
|
-
for (cp = 0; cp <
|
118
|
-
|
119
|
-
|
222
|
+
for (cp = 0; cp < len; cp++)
|
223
|
+
{
|
224
|
+
if (cp % 32 == 0)
|
225
|
+
{
|
226
|
+
if (cp != 0)
|
227
|
+
{
|
228
|
+
hash = hash * 23 + four_byte_value;
|
229
|
+
}
|
120
230
|
four_byte_value = 0;
|
121
231
|
}
|
122
|
-
if (
|
232
|
+
if (tst_cp(cps, len, cp))
|
233
|
+
{
|
234
|
+
four_byte_value++;
|
235
|
+
}
|
123
236
|
}
|
124
237
|
|
125
238
|
return LONG2FIX(hash);
|
126
239
|
}
|
127
240
|
|
128
241
|
static inline VALUE
|
129
|
-
|
242
|
+
cs_delete_if_block_result(VALUE self, int truthy)
|
243
|
+
{
|
130
244
|
VALUE result;
|
131
245
|
rb_need_block();
|
132
246
|
rb_check_frozen(self);
|
133
247
|
FOR_EACH_ACTIVE_CODEPOINT(
|
134
|
-
|
135
|
-
|
136
|
-
);
|
248
|
+
result = rb_yield(LONG2FIX(cp));
|
249
|
+
if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
|
137
250
|
return self;
|
138
251
|
}
|
139
252
|
|
140
253
|
static VALUE
|
141
|
-
|
142
|
-
|
143
|
-
|
254
|
+
cs_method_delete_if(VALUE self)
|
255
|
+
{
|
256
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
257
|
+
return cs_delete_if_block_result(self, 1);
|
144
258
|
}
|
145
259
|
|
146
260
|
static VALUE
|
147
|
-
|
148
|
-
|
149
|
-
|
261
|
+
cs_method_keep_if(VALUE self)
|
262
|
+
{
|
263
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
264
|
+
return cs_delete_if_block_result(self, 0);
|
150
265
|
}
|
151
266
|
|
152
267
|
static VALUE
|
153
|
-
|
154
|
-
|
155
|
-
|
268
|
+
cs_method_clear(VALUE self)
|
269
|
+
{
|
270
|
+
struct cs_data *data;
|
156
271
|
rb_check_frozen(self);
|
157
|
-
|
158
|
-
|
159
|
-
CLRBIT(cps, cp);
|
160
|
-
}
|
272
|
+
data = cs_fetch_data(self);
|
273
|
+
memset(data->cps, 0, CS_MSIZE(data->len));
|
161
274
|
return self;
|
162
275
|
}
|
163
276
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
277
|
+
static VALUE
|
278
|
+
cs_method_min(VALUE self)
|
279
|
+
{
|
280
|
+
FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
|
281
|
+
return Qnil;
|
282
|
+
}
|
283
|
+
|
284
|
+
static VALUE
|
285
|
+
cs_method_max(VALUE self)
|
286
|
+
{
|
287
|
+
cs_cp len;
|
288
|
+
long reverse_idx;
|
289
|
+
cs_ar *cps;
|
290
|
+
cps = cs_fetch_cps(self, &len);
|
291
|
+
for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
|
292
|
+
{
|
293
|
+
if (tst_cp(cps, len, reverse_idx))
|
294
|
+
{
|
295
|
+
return LONG2FIX(reverse_idx);
|
296
|
+
}
|
297
|
+
}
|
298
|
+
return Qnil;
|
299
|
+
}
|
300
|
+
|
301
|
+
static VALUE
|
302
|
+
cs_method_minmax(VALUE self)
|
303
|
+
{
|
304
|
+
VALUE arr;
|
305
|
+
arr = rb_ary_new2(2);
|
306
|
+
rb_ary_push(arr, cs_method_min(self));
|
307
|
+
rb_ary_push(arr, cs_method_max(self));
|
308
|
+
return arr;
|
309
|
+
}
|
310
|
+
|
311
|
+
#define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
|
312
|
+
do \
|
313
|
+
{ \
|
314
|
+
VALUE new_cs; \
|
315
|
+
cs_cp cp, alen, blen; \
|
316
|
+
cs_ar *acps, *bcps; \
|
317
|
+
struct cs_data *new_data; \
|
318
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
319
|
+
acps = cs_fetch_cps(cs_a, &alen); \
|
320
|
+
bcps = cs_fetch_cps(cs_b, &blen); \
|
321
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
322
|
+
{ \
|
323
|
+
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
324
|
+
{ \
|
325
|
+
set_cp(new_data, cp); \
|
326
|
+
} \
|
327
|
+
} \
|
328
|
+
return new_cs; \
|
329
|
+
} while (0)
|
174
330
|
|
175
331
|
static VALUE
|
176
|
-
|
177
|
-
|
332
|
+
cs_method_intersection(VALUE self, VALUE other)
|
333
|
+
{
|
334
|
+
RETURN_COMBINED_CS(self, other, &&);
|
178
335
|
}
|
179
336
|
|
180
337
|
static VALUE
|
181
|
-
|
182
|
-
|
338
|
+
cs_method_exclusion(VALUE self, VALUE other)
|
339
|
+
{
|
340
|
+
RETURN_COMBINED_CS(self, other, ^);
|
183
341
|
}
|
184
342
|
|
185
343
|
static VALUE
|
186
|
-
|
187
|
-
|
344
|
+
cs_method_union(VALUE self, VALUE other)
|
345
|
+
{
|
346
|
+
RETURN_COMBINED_CS(self, other, ||);
|
188
347
|
}
|
189
348
|
|
190
349
|
static VALUE
|
191
|
-
|
192
|
-
|
350
|
+
cs_method_difference(VALUE self, VALUE other)
|
351
|
+
{
|
352
|
+
RETURN_COMBINED_CS(self, other, >);
|
193
353
|
}
|
194
354
|
|
195
355
|
static VALUE
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
356
|
+
cs_method_include_p(VALUE self, VALUE num)
|
357
|
+
{
|
358
|
+
cs_ar *cps;
|
359
|
+
cs_cp len;
|
360
|
+
cps = cs_fetch_cps(self, &len);
|
361
|
+
return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
|
200
362
|
}
|
201
363
|
|
202
|
-
static inline
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
364
|
+
static inline VALUE
|
365
|
+
cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
366
|
+
{
|
367
|
+
cs_cp cp, len;
|
368
|
+
cs_ar *cps;
|
369
|
+
struct cs_data *data;
|
370
|
+
rb_check_frozen(cs);
|
371
|
+
data = cs_fetch_data(cs);
|
372
|
+
cps = data->cps;
|
373
|
+
len = data->len;
|
208
374
|
cp = FIX2ULONG(cp_num);
|
209
|
-
if (
|
210
|
-
|
375
|
+
if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
|
376
|
+
{
|
377
|
+
return Qnil;
|
211
378
|
}
|
212
|
-
else
|
213
|
-
|
214
|
-
|
215
|
-
|
379
|
+
else
|
380
|
+
{
|
381
|
+
if (on)
|
382
|
+
{
|
383
|
+
set_cp(data, cp);
|
384
|
+
}
|
385
|
+
else
|
386
|
+
{
|
387
|
+
clr_cp(cps, len, cp);
|
388
|
+
}
|
389
|
+
return cs;
|
216
390
|
}
|
217
391
|
}
|
218
392
|
|
219
393
|
static VALUE
|
220
|
-
|
221
|
-
|
394
|
+
cs_method_add(VALUE self, VALUE cp_num)
|
395
|
+
{
|
396
|
+
return cs_toggle_codepoint(self, cp_num, 1, 0);
|
222
397
|
}
|
223
398
|
|
224
399
|
static VALUE
|
225
|
-
|
226
|
-
|
400
|
+
cs_method_add_p(VALUE self, VALUE cp_num)
|
401
|
+
{
|
402
|
+
return cs_toggle_codepoint(self, cp_num, 1, 1);
|
227
403
|
}
|
228
404
|
|
229
405
|
static VALUE
|
230
|
-
|
231
|
-
|
406
|
+
cs_method_delete(VALUE self, VALUE cp_num)
|
407
|
+
{
|
408
|
+
return cs_toggle_codepoint(self, cp_num, 0, 0);
|
232
409
|
}
|
233
410
|
|
234
411
|
static VALUE
|
235
|
-
|
236
|
-
|
412
|
+
cs_method_delete_p(VALUE self, VALUE cp_num)
|
413
|
+
{
|
414
|
+
return cs_toggle_codepoint(self, cp_num, 0, 1);
|
237
415
|
}
|
238
416
|
|
239
|
-
#define COMPARE_SETS(action)\
|
240
|
-
cp_index cp;\
|
241
|
-
cp_byte *cps, *other_cps;\
|
242
|
-
FETCH_CODEPOINTS(self, cps);\
|
243
|
-
FETCH_CODEPOINTS(other, other_cps);\
|
244
|
-
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
|
245
|
-
|
246
417
|
static VALUE
|
247
|
-
|
248
|
-
|
418
|
+
cs_method_intersect_p(VALUE self, VALUE other)
|
419
|
+
{
|
420
|
+
cs_cp cp, alen, blen;
|
421
|
+
cs_ar *acps, *bcps;
|
422
|
+
acps = cs_fetch_cps(self, &alen);
|
423
|
+
bcps = cs_fetch_cps(other, &blen);
|
424
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
425
|
+
{
|
426
|
+
if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
|
427
|
+
{
|
428
|
+
return Qtrue;
|
429
|
+
}
|
430
|
+
}
|
249
431
|
return Qfalse;
|
250
432
|
}
|
251
433
|
|
252
434
|
static VALUE
|
253
|
-
|
254
|
-
|
435
|
+
cs_method_disjoint_p(VALUE self, VALUE other)
|
436
|
+
{
|
437
|
+
return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
|
255
438
|
}
|
256
439
|
|
257
440
|
static inline int
|
258
|
-
|
259
|
-
|
441
|
+
cs_check_type(VALUE obj)
|
442
|
+
{
|
443
|
+
return rb_typeddata_is_kind_of(obj, &cs_type);
|
260
444
|
}
|
261
445
|
|
262
446
|
static VALUE
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
447
|
+
cs_cps_eql(VALUE cs_a, VALUE cs_b)
|
448
|
+
{
|
449
|
+
cs_cp cp, alen, blen;
|
450
|
+
cs_ar *acps, *bcps;
|
451
|
+
acps = cs_fetch_cps(cs_a, &alen);
|
452
|
+
bcps = cs_fetch_cps(cs_b, &blen);
|
453
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
454
|
+
{
|
455
|
+
if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
|
456
|
+
{
|
457
|
+
return Qfalse;
|
458
|
+
}
|
459
|
+
}
|
269
460
|
return Qtrue;
|
270
461
|
}
|
271
462
|
|
463
|
+
static VALUE
|
464
|
+
cs_method_eql_p(VALUE self, VALUE other)
|
465
|
+
{
|
466
|
+
if (!cs_check_type(other))
|
467
|
+
{
|
468
|
+
return Qfalse;
|
469
|
+
}
|
470
|
+
if (self == other) // same object_id
|
471
|
+
{
|
472
|
+
return Qtrue;
|
473
|
+
}
|
474
|
+
return cs_cps_eql(self, other);
|
475
|
+
}
|
476
|
+
|
272
477
|
static inline VALUE
|
273
|
-
|
274
|
-
|
275
|
-
|
478
|
+
cs_merge_cs(VALUE recipient, VALUE source)
|
479
|
+
{
|
480
|
+
cs_cp cp, source_len;
|
481
|
+
struct cs_data *data;
|
482
|
+
cs_ar *source_cps;
|
483
|
+
data = cs_fetch_data(recipient);
|
484
|
+
source_cps = cs_fetch_cps(source, &source_len);
|
485
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
486
|
+
{
|
487
|
+
if (tst_cp(source_cps, source_len, cp))
|
488
|
+
{
|
489
|
+
set_cp(data, cp);
|
490
|
+
}
|
491
|
+
}
|
492
|
+
return recipient;
|
276
493
|
}
|
277
494
|
|
278
|
-
static inline
|
279
|
-
|
280
|
-
|
495
|
+
static inline cs_cp
|
496
|
+
cs_checked_cp(VALUE object_id)
|
497
|
+
{
|
498
|
+
if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
|
499
|
+
{
|
500
|
+
return FIX2ULONG(object_id);
|
501
|
+
}
|
281
502
|
rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
|
282
503
|
}
|
283
504
|
|
284
505
|
static inline VALUE
|
285
|
-
|
506
|
+
cs_merge_rb_range(VALUE self, VALUE rb_range)
|
507
|
+
{
|
286
508
|
VALUE from_id, upto_id;
|
509
|
+
cs_cp from_cp, upto_cp, cont_len, rem;
|
287
510
|
int excl;
|
288
|
-
|
289
|
-
|
290
|
-
FETCH_CODEPOINTS(self, cps);
|
511
|
+
struct cs_data *data;
|
512
|
+
data = cs_fetch_data(self);
|
291
513
|
|
292
|
-
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
|
514
|
+
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
|
515
|
+
{
|
293
516
|
rb_raise(rb_eArgError, "pass a Range");
|
294
517
|
}
|
295
|
-
if (excl)
|
518
|
+
if (excl)
|
519
|
+
{
|
520
|
+
upto_id -= 2;
|
521
|
+
}
|
522
|
+
|
523
|
+
from_cp = cs_checked_cp(from_id);
|
524
|
+
upto_cp = cs_checked_cp(upto_id);
|
296
525
|
|
297
|
-
|
298
|
-
|
526
|
+
if (upto_cp > from_cp && (upto_cp - from_cp > 6))
|
527
|
+
{
|
528
|
+
// set bits in preceding partially toggled bytes individually
|
529
|
+
for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
|
530
|
+
{
|
531
|
+
set_cp(data, from_cp);
|
532
|
+
}
|
533
|
+
// memset contiguous bits directly
|
534
|
+
cont_len = upto_cp - from_cp + 1;
|
535
|
+
rem = cont_len % 8;
|
536
|
+
ensure_memsize_fits(data, upto_cp);
|
537
|
+
memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
|
538
|
+
from_cp = upto_cp - rem + 1;
|
539
|
+
}
|
299
540
|
|
300
|
-
|
301
|
-
|
302
|
-
|
541
|
+
// set bits in partially toggled bytes individually
|
542
|
+
for (/* */; from_cp <= upto_cp; from_cp++)
|
543
|
+
{
|
544
|
+
set_cp(data, from_cp);
|
303
545
|
}
|
546
|
+
|
304
547
|
return self;
|
305
548
|
}
|
306
549
|
|
307
550
|
static inline VALUE
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
FETCH_CODEPOINTS(self, cps);
|
551
|
+
cs_merge_rb_array(VALUE self, VALUE rb_array)
|
552
|
+
{
|
553
|
+
VALUE el, array_length, i;
|
554
|
+
struct cs_data *data;
|
313
555
|
Check_Type(rb_array, T_ARRAY);
|
556
|
+
data = cs_fetch_data(self);
|
314
557
|
array_length = RARRAY_LEN(rb_array);
|
315
|
-
for (i = 0; i < array_length; i++)
|
558
|
+
for (i = 0; i < array_length; i++)
|
559
|
+
{
|
316
560
|
el = RARRAY_AREF(rb_array, i);
|
317
|
-
|
318
|
-
SETBIT(cps, FIX2ULONG(el));
|
561
|
+
set_cp(data, cs_checked_cp(el));
|
319
562
|
}
|
320
563
|
return self;
|
321
564
|
}
|
322
565
|
|
323
566
|
static VALUE
|
324
|
-
|
567
|
+
cs_method_merge(VALUE self, VALUE other)
|
568
|
+
{
|
325
569
|
rb_check_frozen(self);
|
326
|
-
if (
|
327
|
-
|
570
|
+
if (cs_check_type(other))
|
571
|
+
{
|
572
|
+
return cs_merge_cs(self, other);
|
328
573
|
}
|
329
|
-
else if (TYPE(other) == T_ARRAY)
|
330
|
-
|
574
|
+
else if (TYPE(other) == T_ARRAY)
|
575
|
+
{
|
576
|
+
return cs_merge_rb_array(self, other);
|
331
577
|
}
|
332
|
-
return
|
578
|
+
return cs_merge_rb_range(self, other);
|
333
579
|
}
|
334
580
|
|
335
581
|
static VALUE
|
336
|
-
|
337
|
-
|
338
|
-
|
582
|
+
cs_method_initialize_copy(VALUE self, VALUE orig)
|
583
|
+
{
|
584
|
+
cs_merge_cs(self, orig);
|
585
|
+
return self;
|
339
586
|
}
|
340
587
|
|
341
588
|
static VALUE
|
342
|
-
|
589
|
+
cs_method_subtract(VALUE self, VALUE other)
|
590
|
+
{
|
591
|
+
cs_cp cp, len, other_len;
|
592
|
+
cs_ar *cps, *other_cps;
|
343
593
|
rb_check_frozen(self);
|
344
|
-
|
594
|
+
cps = cs_fetch_cps(self, &len);
|
595
|
+
other_cps = cs_fetch_cps(other, &other_len);
|
596
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
597
|
+
{
|
598
|
+
if (tst_cp(other_cps, other_len, cp))
|
599
|
+
{
|
600
|
+
clr_cp(cps, len, cp);
|
601
|
+
}
|
602
|
+
}
|
345
603
|
return self;
|
346
604
|
}
|
347
605
|
|
348
606
|
static inline int
|
349
|
-
|
350
|
-
|
351
|
-
|
607
|
+
cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
|
608
|
+
{
|
609
|
+
cs_ar *a, *b;
|
610
|
+
cs_cp cp, alen, blen, count_a, count_b;
|
352
611
|
|
353
|
-
if (!
|
612
|
+
if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
|
613
|
+
{
|
354
614
|
rb_raise(rb_eArgError, "pass a CharacterSet");
|
355
615
|
}
|
356
616
|
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
if (
|
366
|
-
|
367
|
-
|
368
|
-
|
617
|
+
a = cs_fetch_cps(cs_a, &alen);
|
618
|
+
b = cs_fetch_cps(cs_b, &blen);
|
619
|
+
|
620
|
+
count_a = 0;
|
621
|
+
count_b = 0;
|
622
|
+
|
623
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
624
|
+
{
|
625
|
+
if (tst_cp(a, alen, cp))
|
626
|
+
{
|
627
|
+
if (!tst_cp(b, blen, cp))
|
628
|
+
{
|
629
|
+
return 0;
|
630
|
+
}
|
631
|
+
count_a++;
|
632
|
+
count_b++;
|
633
|
+
}
|
634
|
+
else if (tst_cp(b, blen, cp))
|
635
|
+
{
|
636
|
+
count_b++;
|
369
637
|
}
|
370
|
-
else if (TSTBIT(cps_b, cp)) size_b++;
|
371
638
|
}
|
372
639
|
|
373
|
-
if (
|
640
|
+
if (is_proper_ptr)
|
641
|
+
{
|
642
|
+
*is_proper_ptr = count_b > count_a;
|
643
|
+
}
|
644
|
+
|
374
645
|
return 1;
|
375
646
|
}
|
376
647
|
|
377
648
|
static VALUE
|
378
|
-
|
379
|
-
|
380
|
-
return
|
649
|
+
cs_method_subset_p(VALUE self, VALUE other)
|
650
|
+
{
|
651
|
+
return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
|
381
652
|
}
|
382
653
|
|
383
654
|
static VALUE
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
655
|
+
cs_method_proper_subset_p(VALUE self, VALUE other)
|
656
|
+
{
|
657
|
+
int is_subset, is_proper;
|
658
|
+
is_subset = cs_a_subset_of_b(self, other, &is_proper);
|
659
|
+
return (is_subset && is_proper) ? Qtrue : Qfalse;
|
388
660
|
}
|
389
661
|
|
390
662
|
static VALUE
|
391
|
-
|
392
|
-
|
393
|
-
return
|
663
|
+
cs_method_superset_p(VALUE self, VALUE other)
|
664
|
+
{
|
665
|
+
return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
|
394
666
|
}
|
395
667
|
|
396
668
|
static VALUE
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
669
|
+
cs_method_proper_superset_p(VALUE self, VALUE other)
|
670
|
+
{
|
671
|
+
int is_superset, is_proper;
|
672
|
+
is_superset = cs_a_subset_of_b(other, self, &is_proper);
|
673
|
+
return (is_superset && is_proper) ? Qtrue : Qfalse;
|
401
674
|
}
|
402
675
|
|
403
676
|
// *******************************
|
@@ -405,42 +678,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
|
|
405
678
|
// *******************************
|
406
679
|
|
407
680
|
static VALUE
|
408
|
-
|
409
|
-
|
410
|
-
|
681
|
+
cs_class_method_from_ranges(VALUE self, VALUE ranges)
|
682
|
+
{
|
683
|
+
VALUE new_cs, range_count, i;
|
684
|
+
new_cs = rb_class_new_instance(0, 0, self);
|
411
685
|
range_count = RARRAY_LEN(ranges);
|
412
|
-
for (i = 0; i < range_count; i++)
|
413
|
-
|
686
|
+
for (i = 0; i < range_count; i++)
|
687
|
+
{
|
688
|
+
cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
|
414
689
|
}
|
415
|
-
return
|
690
|
+
return new_cs;
|
416
691
|
}
|
417
692
|
|
418
693
|
static VALUE
|
419
|
-
|
420
|
-
|
694
|
+
cs_method_ranges(VALUE self)
|
695
|
+
{
|
696
|
+
VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
|
421
697
|
|
422
698
|
ranges = rb_ary_new();
|
423
|
-
|
699
|
+
previous_cp_num = 0;
|
424
700
|
current_start = 0;
|
425
701
|
current_end = 0;
|
426
702
|
|
427
703
|
FOR_EACH_ACTIVE_CODEPOINT(
|
428
|
-
|
704
|
+
cp_num = LONG2FIX(cp);
|
429
705
|
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
current_end = codepoint;
|
439
|
-
previous_codepoint = codepoint;
|
440
|
-
);
|
706
|
+
if (!previous_cp_num) {
|
707
|
+
current_start = cp_num;
|
708
|
+
} else if (previous_cp_num + 2 != cp_num) {
|
709
|
+
// gap found, finalize previous range
|
710
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
711
|
+
current_start = cp_num;
|
712
|
+
} current_end = cp_num;
|
713
|
+
previous_cp_num = cp_num;);
|
441
714
|
|
442
715
|
// add final range
|
443
|
-
if (current_start)
|
716
|
+
if (current_start)
|
717
|
+
{
|
444
718
|
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
445
719
|
}
|
446
720
|
|
@@ -448,117 +722,233 @@ method_ranges(VALUE self) {
|
|
448
722
|
}
|
449
723
|
|
450
724
|
static VALUE
|
451
|
-
|
452
|
-
|
725
|
+
cs_method_sample(int argc, VALUE *argv, VALUE self)
|
726
|
+
{
|
727
|
+
VALUE array, to_a_args[1] = {Qtrue};
|
453
728
|
rb_check_arity(argc, 0, 1);
|
454
|
-
|
455
|
-
array = method_to_a(1, to_a_args, self);
|
729
|
+
array = cs_method_to_a(1, to_a_args, self);
|
456
730
|
return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
|
457
731
|
}
|
458
732
|
|
459
733
|
static inline VALUE
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
734
|
+
cs_from_section(VALUE set, cs_cp from, cs_cp upto)
|
735
|
+
{
|
736
|
+
VALUE new_cs;
|
737
|
+
cs_ar *cps;
|
738
|
+
cs_cp cp, len;
|
739
|
+
struct cs_data *new_data;
|
740
|
+
new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
|
741
|
+
cps = cs_fetch_cps(set, &len);
|
742
|
+
for (cp = from; cp <= upto; cp++)
|
743
|
+
{
|
744
|
+
if (tst_cp(cps, len, cp))
|
745
|
+
{
|
746
|
+
set_cp(new_data, cp);
|
747
|
+
}
|
467
748
|
}
|
468
|
-
return
|
749
|
+
return new_cs;
|
469
750
|
}
|
470
751
|
|
471
752
|
static VALUE
|
472
|
-
|
473
|
-
|
753
|
+
cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
|
754
|
+
{
|
755
|
+
return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
756
|
+
}
|
757
|
+
|
758
|
+
static inline cs_cp
|
759
|
+
cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
|
760
|
+
{
|
761
|
+
cs_ar *cps;
|
762
|
+
cs_cp cp, count, len;
|
763
|
+
cps = cs_fetch_cps(set, &len);
|
764
|
+
for (count = 0, cp = from; cp <= upto; cp++)
|
765
|
+
{
|
766
|
+
if (tst_cp(cps, len, cp))
|
767
|
+
{
|
768
|
+
count++;
|
769
|
+
}
|
770
|
+
}
|
771
|
+
return count;
|
474
772
|
}
|
475
773
|
|
476
774
|
static VALUE
|
477
|
-
|
478
|
-
|
775
|
+
cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
|
776
|
+
{
|
777
|
+
cs_cp count;
|
778
|
+
count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
779
|
+
return LONG2FIX(count);
|
479
780
|
}
|
480
781
|
|
481
782
|
static inline VALUE
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
783
|
+
cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
|
784
|
+
{
|
785
|
+
cs_cp cp;
|
786
|
+
for (cp = from; cp <= upto; cp++)
|
787
|
+
{
|
788
|
+
if (tst_cp(cps, len, cp))
|
789
|
+
{
|
790
|
+
return Qtrue;
|
791
|
+
}
|
490
792
|
}
|
491
793
|
return Qfalse;
|
492
794
|
}
|
493
795
|
|
494
796
|
static VALUE
|
495
|
-
|
797
|
+
cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
|
798
|
+
{
|
799
|
+
cs_ar *cps;
|
800
|
+
cs_cp len;
|
801
|
+
cps = cs_fetch_cps(self, &len);
|
802
|
+
return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
|
803
|
+
}
|
804
|
+
|
805
|
+
static inline VALUE
|
806
|
+
cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
|
807
|
+
{
|
808
|
+
double section_count, total_count;
|
809
|
+
section_count = (double)cs_active_cp_count_in_section(set, from, upto);
|
810
|
+
total_count = (double)cs_active_cp_count(set);
|
811
|
+
return DBL2NUM(section_count / total_count);
|
812
|
+
}
|
813
|
+
|
814
|
+
static VALUE
|
815
|
+
cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
|
816
|
+
{
|
817
|
+
return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
818
|
+
}
|
819
|
+
|
820
|
+
#define MAX_CP 0x10FFFF
|
821
|
+
#define MAX_ASCII_CP 0x7F
|
822
|
+
#define MAX_BMP_CP 0xFFFF
|
823
|
+
#define MIN_ASTRAL_CP 0x10000
|
824
|
+
|
825
|
+
static inline VALUE
|
826
|
+
cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
|
827
|
+
{
|
828
|
+
cs_cp plane_beg, plane_end;
|
829
|
+
plane_beg = plane * UNICODE_PLANE_SIZE;
|
830
|
+
plane_end = (plane + 1) * MAX_BMP_CP;
|
831
|
+
return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
|
832
|
+
}
|
833
|
+
|
834
|
+
static VALUE
|
835
|
+
cs_method_planes(VALUE self)
|
836
|
+
{
|
837
|
+
cs_ar *cps;
|
838
|
+
cs_cp len;
|
496
839
|
unsigned int i;
|
497
840
|
VALUE planes;
|
841
|
+
cps = cs_fetch_cps(self, &len);
|
498
842
|
planes = rb_ary_new();
|
499
|
-
for (i = 0; i < UNICODE_PLANE_COUNT; i++)
|
500
|
-
|
843
|
+
for (i = 0; i < UNICODE_PLANE_COUNT; i++)
|
844
|
+
{
|
845
|
+
if (cs_has_cp_in_plane(cps, len, i))
|
846
|
+
{
|
847
|
+
rb_ary_push(planes, INT2FIX(i));
|
848
|
+
}
|
501
849
|
}
|
502
850
|
return planes;
|
503
851
|
}
|
504
852
|
|
505
|
-
static
|
506
|
-
|
853
|
+
static inline int
|
854
|
+
cs_valid_plane_num(VALUE num)
|
855
|
+
{
|
507
856
|
int plane;
|
508
|
-
Check_Type(
|
509
|
-
plane = FIX2INT(
|
510
|
-
if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
|
511
|
-
|
857
|
+
Check_Type(num, T_FIXNUM);
|
858
|
+
plane = FIX2INT(num);
|
859
|
+
if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
|
860
|
+
{
|
861
|
+
rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
|
512
862
|
}
|
513
|
-
return
|
863
|
+
return plane;
|
864
|
+
}
|
865
|
+
|
866
|
+
static VALUE
|
867
|
+
cs_method_plane(VALUE self, VALUE plane_num)
|
868
|
+
{
|
869
|
+
cs_cp plane, plane_beg, plane_end;
|
870
|
+
plane = cs_valid_plane_num(plane_num);
|
871
|
+
plane_beg = plane * UNICODE_PLANE_SIZE;
|
872
|
+
plane_end = (plane + 1) * MAX_BMP_CP;
|
873
|
+
return cs_from_section(self, plane_beg, plane_end);
|
874
|
+
}
|
875
|
+
|
876
|
+
static VALUE
|
877
|
+
cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
|
878
|
+
{
|
879
|
+
cs_ar *cps;
|
880
|
+
cs_cp len;
|
881
|
+
unsigned int plane;
|
882
|
+
plane = cs_valid_plane_num(plane_num);
|
883
|
+
cps = cs_fetch_cps(self, &len);
|
884
|
+
return cs_has_cp_in_plane(cps, len, plane);
|
514
885
|
}
|
515
886
|
|
516
887
|
#define NON_SURROGATE(cp) (cp > 0xDFFF || cp < 0xD800)
|
517
888
|
|
518
889
|
static VALUE
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
890
|
+
cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
891
|
+
{
|
892
|
+
int inc_surr;
|
893
|
+
cs_cp upto, cp, len;
|
894
|
+
cs_ar *cps;
|
895
|
+
VALUE new_cs;
|
896
|
+
struct cs_data *new_data;
|
897
|
+
|
524
898
|
rb_check_arity(argc, 0, 2);
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
899
|
+
|
900
|
+
cps = cs_fetch_cps(self, &len);
|
901
|
+
inc_surr = argc && argv[0] == Qtrue;
|
902
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
|
903
|
+
upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
|
904
|
+
|
905
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
906
|
+
{
|
907
|
+
if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
|
908
|
+
{
|
909
|
+
set_cp(new_data, cp);
|
910
|
+
}
|
531
911
|
}
|
532
|
-
|
533
|
-
|
534
|
-
);
|
912
|
+
|
913
|
+
return new_cs;
|
535
914
|
}
|
536
915
|
|
537
|
-
typedef int(*str_cp_handler)(unsigned int,
|
916
|
+
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
|
538
917
|
|
539
918
|
static inline int
|
540
|
-
add_str_cp_to_arr(unsigned int str_cp,
|
541
|
-
|
919
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
920
|
+
{
|
921
|
+
set_cp(data, str_cp);
|
542
922
|
return 1;
|
543
923
|
}
|
544
924
|
|
545
925
|
static VALUE
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
926
|
+
cs_method_case_insensitive(VALUE self)
|
927
|
+
{
|
928
|
+
cs_cp i, len;
|
929
|
+
cs_ar *cps;
|
930
|
+
VALUE new_cs;
|
931
|
+
struct cs_data *new_data;
|
551
932
|
|
552
|
-
|
933
|
+
cps = cs_fetch_cps(self, &len);
|
934
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
|
935
|
+
cs_merge_cs(new_cs, self);
|
553
936
|
|
554
|
-
for (i = 0; i < CASEFOLD_COUNT; i++)
|
937
|
+
for (i = 0; i < CASEFOLD_COUNT; i++)
|
938
|
+
{
|
555
939
|
casefold_mapping m = unicode_casefold_table[i];
|
556
940
|
|
557
|
-
if
|
558
|
-
|
941
|
+
if (tst_cp(cps, len, m.from))
|
942
|
+
{
|
943
|
+
set_cp(new_data, m.to);
|
944
|
+
}
|
945
|
+
else if (tst_cp(cps, len, m.to))
|
946
|
+
{
|
947
|
+
set_cp(new_data, m.from);
|
948
|
+
}
|
559
949
|
}
|
560
950
|
|
561
|
-
return
|
951
|
+
return new_cs;
|
562
952
|
|
563
953
|
// OnigCaseFoldType flags;
|
564
954
|
// rb_encoding *enc;
|
@@ -573,20 +963,27 @@ method_case_insensitive(VALUE self) {
|
|
573
963
|
}
|
574
964
|
|
575
965
|
static inline VALUE
|
576
|
-
each_sb_cp(VALUE str, str_cp_handler func,
|
577
|
-
|
966
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
967
|
+
{
|
968
|
+
long i, str_len;
|
578
969
|
unsigned int str_cp;
|
970
|
+
str_len = RSTRING_LEN(str);
|
579
971
|
|
580
|
-
for (i = 0; i <
|
972
|
+
for (i = 0; i < str_len; i++)
|
973
|
+
{
|
581
974
|
str_cp = (RSTRING_PTR(str)[i] & 0xff);
|
582
|
-
if (!(*func)(str_cp, cp_arr))
|
975
|
+
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
976
|
+
{
|
977
|
+
return Qfalse;
|
978
|
+
}
|
583
979
|
}
|
584
980
|
|
585
981
|
return Qtrue;
|
586
982
|
}
|
587
983
|
|
588
984
|
static inline VALUE
|
589
|
-
each_mb_cp(VALUE str, str_cp_handler func,
|
985
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
986
|
+
{
|
590
987
|
int n;
|
591
988
|
unsigned int str_cp;
|
592
989
|
const char *ptr, *end;
|
@@ -597,9 +994,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
|
597
994
|
end = RSTRING_END(str);
|
598
995
|
enc = rb_enc_get(str);
|
599
996
|
|
600
|
-
while (ptr < end)
|
997
|
+
while (ptr < end)
|
998
|
+
{
|
601
999
|
str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
|
602
|
-
if (!(*func)(str_cp, cp_arr))
|
1000
|
+
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1001
|
+
{
|
1002
|
+
return Qfalse;
|
1003
|
+
}
|
603
1004
|
ptr += n;
|
604
1005
|
}
|
605
1006
|
|
@@ -611,105 +1012,236 @@ static inline int
|
|
611
1012
|
single_byte_optimizable(VALUE str)
|
612
1013
|
{
|
613
1014
|
rb_encoding *enc;
|
614
|
-
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
1015
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
1016
|
+
{
|
1017
|
+
return 1;
|
1018
|
+
}
|
615
1019
|
|
616
1020
|
enc = rb_enc_get(str);
|
617
|
-
if (rb_enc_mbmaxlen(enc) == 1)
|
1021
|
+
if (rb_enc_mbmaxlen(enc) == 1)
|
1022
|
+
{
|
1023
|
+
return 1;
|
1024
|
+
}
|
618
1025
|
|
619
1026
|
return 0;
|
620
1027
|
}
|
621
1028
|
|
622
1029
|
static inline VALUE
|
623
|
-
each_cp(VALUE str, str_cp_handler func,
|
624
|
-
|
625
|
-
|
1030
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1031
|
+
{
|
1032
|
+
if (single_byte_optimizable(str))
|
1033
|
+
{
|
1034
|
+
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
626
1035
|
}
|
627
|
-
return each_mb_cp(str, func, cp_arr);
|
1036
|
+
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
628
1037
|
}
|
629
1038
|
|
630
1039
|
static inline void
|
631
|
-
raise_arg_err_unless_string(VALUE val)
|
632
|
-
|
1040
|
+
raise_arg_err_unless_string(VALUE val)
|
1041
|
+
{
|
1042
|
+
if (!RB_TYPE_P(val, T_STRING))
|
1043
|
+
{
|
1044
|
+
rb_raise(rb_eArgError, "pass a String");
|
1045
|
+
}
|
633
1046
|
}
|
634
1047
|
|
635
1048
|
static VALUE
|
636
|
-
|
637
|
-
|
1049
|
+
cs_class_method_of(VALUE self, VALUE str)
|
1050
|
+
{
|
1051
|
+
VALUE new_cs;
|
1052
|
+
struct cs_data *new_data;
|
1053
|
+
new_cs = cs_alloc(self, &new_data);
|
638
1054
|
raise_arg_err_unless_string(str);
|
639
|
-
|
640
|
-
|
641
|
-
return NEW_CHARACTER_SET(self, cp_arr);
|
1055
|
+
each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1056
|
+
return new_cs;
|
642
1057
|
}
|
643
1058
|
|
644
1059
|
static inline int
|
645
|
-
|
646
|
-
|
1060
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1061
|
+
{
|
1062
|
+
if (tst_cp(cp_arr, len, str_cp))
|
1063
|
+
{
|
1064
|
+
*memo += 1;
|
1065
|
+
}
|
1066
|
+
return 1;
|
647
1067
|
}
|
648
1068
|
|
649
1069
|
static VALUE
|
650
|
-
|
651
|
-
|
652
|
-
VALUE
|
1070
|
+
cs_method_count_in(VALUE self, VALUE str)
|
1071
|
+
{
|
1072
|
+
VALUE count;
|
1073
|
+
struct cs_data *data;
|
653
1074
|
raise_arg_err_unless_string(str);
|
654
|
-
|
655
|
-
|
656
|
-
|
1075
|
+
data = cs_fetch_data(self);
|
1076
|
+
count = 0;
|
1077
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1078
|
+
return INT2NUM((int)count);
|
1079
|
+
}
|
1080
|
+
|
1081
|
+
static inline int
|
1082
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1083
|
+
{
|
1084
|
+
return tst_cp(cp_arr, len, str_cp);
|
1085
|
+
}
|
1086
|
+
|
1087
|
+
static VALUE
|
1088
|
+
cs_method_cover_p(VALUE self, VALUE str)
|
1089
|
+
{
|
1090
|
+
struct cs_data *data;
|
1091
|
+
raise_arg_err_unless_string(str);
|
1092
|
+
data = cs_fetch_data(self);
|
1093
|
+
return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
|
1094
|
+
}
|
1095
|
+
|
1096
|
+
static inline int
|
1097
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1098
|
+
{
|
1099
|
+
if (tst_cp(cp_arr, len, str_cp))
|
1100
|
+
{
|
1101
|
+
rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
|
1102
|
+
}
|
1103
|
+
return 1;
|
1104
|
+
}
|
1105
|
+
|
1106
|
+
static VALUE
|
1107
|
+
cs_method_scan(VALUE self, VALUE str)
|
1108
|
+
{
|
1109
|
+
VALUE memo[2];
|
1110
|
+
struct cs_data *data;
|
1111
|
+
raise_arg_err_unless_string(str);
|
1112
|
+
data = cs_fetch_data(self);
|
1113
|
+
memo[0] = rb_ary_new();
|
1114
|
+
memo[1] = (VALUE)rb_enc_get(str);
|
1115
|
+
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1116
|
+
return memo[0];
|
657
1117
|
}
|
658
1118
|
|
659
1119
|
static inline int
|
660
|
-
|
661
|
-
|
1120
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1121
|
+
{
|
1122
|
+
return !tst_cp(cp_arr, len, str_cp);
|
662
1123
|
}
|
663
1124
|
|
664
1125
|
static VALUE
|
665
|
-
|
666
|
-
|
1126
|
+
cs_method_used_by_p(VALUE self, VALUE str)
|
1127
|
+
{
|
1128
|
+
VALUE only_uses_other_cps;
|
1129
|
+
struct cs_data *data;
|
667
1130
|
raise_arg_err_unless_string(str);
|
668
|
-
|
669
|
-
|
1131
|
+
data = cs_fetch_data(self);
|
1132
|
+
only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
|
1133
|
+
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
1134
|
+
}
|
1135
|
+
|
1136
|
+
static void
|
1137
|
+
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
1138
|
+
{
|
1139
|
+
long total, olen;
|
1140
|
+
char *sptr;
|
1141
|
+
|
1142
|
+
RSTRING_GETMEM(str, sptr, olen);
|
1143
|
+
sptr = RSTRING(str)->as.heap.ptr;
|
1144
|
+
olen = RSTRING(str)->as.heap.len;
|
1145
|
+
total = olen + len;
|
1146
|
+
memcpy(sptr + olen, ptr, len);
|
1147
|
+
RSTRING(str)->as.heap.len = total;
|
1148
|
+
}
|
1149
|
+
|
1150
|
+
#ifndef TERM_FILL
|
1151
|
+
#define TERM_FILL(ptr, termlen) \
|
1152
|
+
do \
|
1153
|
+
{ \
|
1154
|
+
char *const term_fill_ptr = (ptr); \
|
1155
|
+
const int term_fill_len = (termlen); \
|
1156
|
+
*term_fill_ptr = '\0'; \
|
1157
|
+
if (__builtin_expect(!!(term_fill_len > 1), 0)) \
|
1158
|
+
memset(term_fill_ptr, 0, term_fill_len); \
|
1159
|
+
} while (0)
|
1160
|
+
#endif
|
1161
|
+
|
1162
|
+
static void
|
1163
|
+
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
1164
|
+
{
|
1165
|
+
char *ptr;
|
1166
|
+
long len;
|
1167
|
+
|
1168
|
+
ptr = RSTRING(str)->as.heap.ptr;
|
1169
|
+
len = RSTRING(str)->as.heap.len;
|
1170
|
+
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
670
1171
|
}
|
671
1172
|
|
672
1173
|
static inline VALUE
|
673
|
-
|
674
|
-
|
1174
|
+
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
1175
|
+
{
|
1176
|
+
cs_ar *cps;
|
1177
|
+
cs_cp len;
|
675
1178
|
rb_encoding *str_enc;
|
676
|
-
VALUE orig_len,
|
677
|
-
int
|
1179
|
+
VALUE orig_len, new_str_buf;
|
1180
|
+
int cp_len;
|
678
1181
|
unsigned int str_cp;
|
679
1182
|
const char *ptr, *end;
|
680
1183
|
|
681
1184
|
raise_arg_err_unless_string(str);
|
682
1185
|
|
683
|
-
|
1186
|
+
cps = cs_fetch_cps(set, &len);
|
684
1187
|
|
685
1188
|
orig_len = RSTRING_LEN(str);
|
686
|
-
|
687
|
-
|
1189
|
+
if (orig_len < 1) // empty string, will never change
|
1190
|
+
{
|
1191
|
+
if (bang)
|
1192
|
+
{
|
1193
|
+
return Qnil;
|
1194
|
+
}
|
1195
|
+
return rb_str_dup(str);
|
1196
|
+
}
|
1197
|
+
|
1198
|
+
new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
|
688
1199
|
str_enc = rb_enc_get(str);
|
689
1200
|
rb_enc_associate(new_str_buf, str_enc);
|
690
|
-
|
691
|
-
|
1201
|
+
rb_str_modify(new_str_buf);
|
1202
|
+
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
692
1203
|
|
693
1204
|
ptr = RSTRING_PTR(str);
|
694
1205
|
end = RSTRING_END(str);
|
695
1206
|
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
1207
|
+
if (single_byte_optimizable(str))
|
1208
|
+
{
|
1209
|
+
while (ptr < end)
|
1210
|
+
{
|
1211
|
+
str_cp = *ptr & 0xff;
|
1212
|
+
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1213
|
+
{
|
1214
|
+
cs_str_buf_cat(new_str_buf, ptr, 1);
|
1215
|
+
}
|
1216
|
+
ptr++;
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
else // likely to be multibyte string
|
1220
|
+
{
|
1221
|
+
while (ptr < end)
|
1222
|
+
{
|
1223
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
|
1224
|
+
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1225
|
+
{
|
1226
|
+
cs_str_buf_cat(new_str_buf, ptr, cp_len);
|
1227
|
+
}
|
1228
|
+
ptr += cp_len;
|
701
1229
|
}
|
702
|
-
ptr += n;
|
703
1230
|
}
|
704
1231
|
|
705
|
-
|
706
|
-
|
1232
|
+
cs_str_buf_terminate(new_str_buf, str_enc);
|
1233
|
+
|
1234
|
+
if (bang)
|
1235
|
+
{
|
1236
|
+
if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
|
1237
|
+
{
|
1238
|
+
return Qnil;
|
1239
|
+
}
|
707
1240
|
rb_str_shared_replace(str, new_str_buf);
|
708
1241
|
}
|
709
|
-
else
|
1242
|
+
else
|
1243
|
+
{
|
710
1244
|
RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
|
711
|
-
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
712
|
-
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
|
713
1245
|
str = new_str_buf;
|
714
1246
|
}
|
715
1247
|
|
@@ -717,98 +1249,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
|
|
717
1249
|
}
|
718
1250
|
|
719
1251
|
static VALUE
|
720
|
-
|
721
|
-
|
1252
|
+
cs_method_delete_in(VALUE self, VALUE str)
|
1253
|
+
{
|
1254
|
+
return cs_apply_to_str(self, str, 1, 0);
|
1255
|
+
}
|
1256
|
+
|
1257
|
+
static VALUE
|
1258
|
+
cs_method_delete_in_bang(VALUE self, VALUE str)
|
1259
|
+
{
|
1260
|
+
return cs_apply_to_str(self, str, 1, 1);
|
722
1261
|
}
|
723
1262
|
|
724
1263
|
static VALUE
|
725
|
-
|
726
|
-
|
1264
|
+
cs_method_keep_in(VALUE self, VALUE str)
|
1265
|
+
{
|
1266
|
+
return cs_apply_to_str(self, str, 0, 0);
|
727
1267
|
}
|
728
1268
|
|
729
1269
|
static VALUE
|
730
|
-
|
731
|
-
|
1270
|
+
cs_method_keep_in_bang(VALUE self, VALUE str)
|
1271
|
+
{
|
1272
|
+
return cs_apply_to_str(self, str, 0, 1);
|
732
1273
|
}
|
733
1274
|
|
734
1275
|
static VALUE
|
735
|
-
|
736
|
-
|
1276
|
+
cs_method_allocated_length(VALUE self)
|
1277
|
+
{
|
1278
|
+
return LONG2FIX(cs_fetch_data(self)->len);
|
737
1279
|
}
|
738
1280
|
|
739
1281
|
// ****
|
740
1282
|
// init
|
741
1283
|
// ****
|
742
1284
|
|
743
|
-
void
|
744
|
-
Init_character_set()
|
1285
|
+
void Init_character_set()
|
745
1286
|
{
|
746
1287
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
747
1288
|
|
748
|
-
rb_define_alloc_func(cs,
|
1289
|
+
rb_define_alloc_func(cs, cs_method_allocate);
|
749
1290
|
|
750
1291
|
// `Set` compatibility methods
|
751
1292
|
|
752
|
-
rb_define_method(cs, "each",
|
753
|
-
rb_define_method(cs, "to_a",
|
754
|
-
rb_define_method(cs, "length",
|
755
|
-
rb_define_method(cs, "size",
|
756
|
-
rb_define_method(cs, "
|
757
|
-
rb_define_method(cs, "
|
758
|
-
rb_define_method(cs, "
|
759
|
-
rb_define_method(cs, "
|
760
|
-
rb_define_method(cs, "
|
761
|
-
rb_define_method(cs, "
|
762
|
-
rb_define_method(cs, "
|
763
|
-
rb_define_method(cs, "
|
764
|
-
rb_define_method(cs, "
|
765
|
-
rb_define_method(cs, "
|
766
|
-
rb_define_method(cs, "
|
767
|
-
rb_define_method(cs, "
|
768
|
-
rb_define_method(cs, "
|
769
|
-
rb_define_method(cs, "
|
770
|
-
rb_define_method(cs, "
|
771
|
-
rb_define_method(cs, "
|
772
|
-
rb_define_method(cs, "
|
773
|
-
rb_define_method(cs, "
|
774
|
-
rb_define_method(cs, "
|
775
|
-
rb_define_method(cs, "add
|
776
|
-
rb_define_method(cs, "
|
777
|
-
rb_define_method(cs, "
|
778
|
-
rb_define_method(cs, "
|
779
|
-
rb_define_method(cs, "
|
780
|
-
rb_define_method(cs, "
|
781
|
-
rb_define_method(cs, "
|
782
|
-
rb_define_method(cs, "
|
783
|
-
rb_define_method(cs, "
|
784
|
-
rb_define_method(cs, "
|
785
|
-
rb_define_method(cs, "
|
786
|
-
rb_define_method(cs, "
|
787
|
-
rb_define_method(cs, "
|
788
|
-
rb_define_method(cs, "
|
789
|
-
rb_define_method(cs, "
|
790
|
-
rb_define_method(cs, "
|
791
|
-
rb_define_method(cs, "
|
792
|
-
rb_define_method(cs, "
|
793
|
-
rb_define_method(cs, "
|
1293
|
+
rb_define_method(cs, "each", cs_method_each, 0);
|
1294
|
+
rb_define_method(cs, "to_a", cs_method_to_a, -1);
|
1295
|
+
rb_define_method(cs, "length", cs_method_length, 0);
|
1296
|
+
rb_define_method(cs, "size", cs_method_length, 0);
|
1297
|
+
rb_define_method(cs, "empty?", cs_method_empty_p, 0);
|
1298
|
+
rb_define_method(cs, "hash", cs_method_hash, 0);
|
1299
|
+
rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
|
1300
|
+
rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
|
1301
|
+
rb_define_method(cs, "clear", cs_method_clear, 0);
|
1302
|
+
rb_define_method(cs, "min", cs_method_min, 0);
|
1303
|
+
rb_define_method(cs, "max", cs_method_max, 0);
|
1304
|
+
rb_define_method(cs, "minmax", cs_method_minmax, 0);
|
1305
|
+
rb_define_method(cs, "intersection", cs_method_intersection, 1);
|
1306
|
+
rb_define_method(cs, "&", cs_method_intersection, 1);
|
1307
|
+
rb_define_method(cs, "union", cs_method_union, 1);
|
1308
|
+
rb_define_method(cs, "+", cs_method_union, 1);
|
1309
|
+
rb_define_method(cs, "|", cs_method_union, 1);
|
1310
|
+
rb_define_method(cs, "difference", cs_method_difference, 1);
|
1311
|
+
rb_define_method(cs, "-", cs_method_difference, 1);
|
1312
|
+
rb_define_method(cs, "^", cs_method_exclusion, 1);
|
1313
|
+
rb_define_method(cs, "include?", cs_method_include_p, 1);
|
1314
|
+
rb_define_method(cs, "member?", cs_method_include_p, 1);
|
1315
|
+
rb_define_method(cs, "===", cs_method_include_p, 1);
|
1316
|
+
rb_define_method(cs, "add", cs_method_add, 1);
|
1317
|
+
rb_define_method(cs, "<<", cs_method_add, 1);
|
1318
|
+
rb_define_method(cs, "add?", cs_method_add_p, 1);
|
1319
|
+
rb_define_method(cs, "delete", cs_method_delete, 1);
|
1320
|
+
rb_define_method(cs, "delete?", cs_method_delete_p, 1);
|
1321
|
+
rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
|
1322
|
+
rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
|
1323
|
+
rb_define_method(cs, "eql?", cs_method_eql_p, 1);
|
1324
|
+
rb_define_method(cs, "==", cs_method_eql_p, 1);
|
1325
|
+
rb_define_method(cs, "merge", cs_method_merge, 1);
|
1326
|
+
rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
|
1327
|
+
rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
|
1328
|
+
rb_define_method(cs, "subtract", cs_method_subtract, 1);
|
1329
|
+
rb_define_method(cs, "subset?", cs_method_subset_p, 1);
|
1330
|
+
rb_define_method(cs, "<=", cs_method_subset_p, 1);
|
1331
|
+
rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
|
1332
|
+
rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
|
1333
|
+
rb_define_method(cs, "superset?", cs_method_superset_p, 1);
|
1334
|
+
rb_define_method(cs, ">=", cs_method_superset_p, 1);
|
1335
|
+
rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
|
1336
|
+
rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
|
794
1337
|
|
795
1338
|
// `CharacterSet`-specific methods
|
796
1339
|
|
797
|
-
rb_define_singleton_method(cs, "from_ranges",
|
798
|
-
rb_define_singleton_method(cs, "of",
|
799
|
-
|
800
|
-
rb_define_method(cs, "ranges",
|
801
|
-
rb_define_method(cs, "sample",
|
802
|
-
rb_define_method(cs, "
|
803
|
-
rb_define_method(cs, "
|
804
|
-
rb_define_method(cs, "
|
805
|
-
rb_define_method(cs, "
|
806
|
-
rb_define_method(cs, "
|
807
|
-
rb_define_method(cs, "
|
808
|
-
rb_define_method(cs, "
|
809
|
-
rb_define_method(cs, "
|
810
|
-
rb_define_method(cs, "
|
811
|
-
rb_define_method(cs, "
|
812
|
-
rb_define_method(cs, "
|
813
|
-
rb_define_method(cs, "
|
1340
|
+
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
1341
|
+
rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
|
1342
|
+
|
1343
|
+
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
1344
|
+
rb_define_method(cs, "sample", cs_method_sample, -1);
|
1345
|
+
rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
|
1346
|
+
rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
|
1347
|
+
rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
|
1348
|
+
rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
|
1349
|
+
rb_define_method(cs, "planes", cs_method_planes, 0);
|
1350
|
+
rb_define_method(cs, "plane", cs_method_plane, 1);
|
1351
|
+
rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
|
1352
|
+
rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
|
1353
|
+
rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
|
1354
|
+
rb_define_method(cs, "count_in", cs_method_count_in, 1);
|
1355
|
+
rb_define_method(cs, "cover?", cs_method_cover_p, 1);
|
1356
|
+
rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
|
1357
|
+
rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
|
1358
|
+
rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
|
1359
|
+
rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
|
1360
|
+
rb_define_method(cs, "scan", cs_method_scan, 1);
|
1361
|
+
rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
|
1362
|
+
rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
|
814
1363
|
}
|