character_set 1.1.1-java → 1.4.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +11 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +47 -0
- data/README.md +38 -14
- data/Rakefile +60 -36
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -6
- data/ext/character_set/character_set.c +963 -414
- data/ext/character_set/unicode_casefold_table.h +10 -2
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +25 -27
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +666 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +151 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/ruby_fallback/character_set_methods.rb +53 -6
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/shared_methods.rb +60 -49
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +102 -22
- data/.travis.yml +0 -11
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum et dolorem'
|
4
|
+
tr = '^A-Za-z'
|
5
|
+
cs = CharacterSet.non_ascii_letter
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Counting non-letters',
|
9
|
+
cases: {
|
10
|
+
'String#count' => -> { str.count(tr) },
|
11
|
+
'CharacterSet#count_in' => -> { cs.count_in(str) },
|
12
|
+
}
|
13
|
+
)
|
data/benchmarks/delete_in.rb
CHANGED
@@ -14,7 +14,7 @@ benchmark(
|
|
14
14
|
|
15
15
|
str = 'Lörem ipsüm ⛷ et dölörem'
|
16
16
|
rx = /[\s\p{emoji}äüö]/
|
17
|
-
cs = CharacterSet.whitespace + CharacterSet.emoji +
|
17
|
+
cs = CharacterSet.whitespace + CharacterSet.emoji + CharacterSet['ä', 'ö', 'ü']
|
18
18
|
|
19
19
|
benchmark(
|
20
20
|
caption: 'Removing whitespace, emoji and umlauts',
|
data/benchmarks/scan.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
str = 'Lorem ipsum ⛷ et dolorem'
|
4
|
+
rx = /\p{emoji}/
|
5
|
+
cs = CharacterSet.emoji
|
6
|
+
|
7
|
+
benchmark(
|
8
|
+
caption: 'Extracting emoji to an Array',
|
9
|
+
cases: {
|
10
|
+
'String#scan' => -> { str.scan(rx) },
|
11
|
+
'CharacterSet#scan' => -> { cs.scan(str) },
|
12
|
+
}
|
13
|
+
)
|
data/benchmarks/shared.rb
CHANGED
@@ -3,6 +3,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
|
4
4
|
require 'benchmark/ips'
|
5
5
|
require 'character_set'
|
6
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
7
|
+
require 'sorted_set'
|
8
|
+
else
|
9
|
+
require 'set'
|
10
|
+
end
|
6
11
|
|
7
12
|
def benchmark(caption: nil, cases: {})
|
8
13
|
puts caption
|
data/benchmarks/z_add.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs = CharacterSet.new(0..0x10FFFF)
|
4
|
+
ss = SortedSet.new(0..0x10FFFF)
|
5
|
+
|
6
|
+
benchmark(
|
7
|
+
caption: 'Removing entries',
|
8
|
+
cases: {
|
9
|
+
'CharacterSet#delete' => -> { cs.delete(rand(0x10FFFF)) },
|
10
|
+
'SortedSet#delete' => -> { ss.delete(rand(0x10FFFF)) },
|
11
|
+
}
|
12
|
+
)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs1 = CharacterSet.new(0...0x88000)
|
4
|
+
cs2 = CharacterSet.new(0x88000..0x10FFFF)
|
5
|
+
|
6
|
+
ss1 = SortedSet.new(0...0x88000)
|
7
|
+
ss2 = SortedSet.new(0x88000..0x10FFFF)
|
8
|
+
|
9
|
+
benchmark(
|
10
|
+
caption: 'Merging entries',
|
11
|
+
cases: {
|
12
|
+
'CharacterSet#merge' => -> { cs1.merge(cs2) },
|
13
|
+
'SortedSet#merge' => -> { ss1.merge(ss2) },
|
14
|
+
}
|
15
|
+
)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require_relative './shared'
|
2
|
+
|
3
|
+
cs = CharacterSet.new(0..0xFFFF)
|
4
|
+
ss = SortedSet.new(0..0xFFFF)
|
5
|
+
|
6
|
+
benchmark(
|
7
|
+
caption: 'Getting the min and max',
|
8
|
+
cases: {
|
9
|
+
'CharacterSet#minmax' => -> { cs.minmax },
|
10
|
+
'SortedSet#minmax' => -> { ss.minmax },
|
11
|
+
}
|
12
|
+
)
|
data/bin/console
CHANGED
data/character_set.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.email = ['janosch84@gmail.com']
|
11
11
|
|
12
12
|
s.summary = 'Build, read, write and compare sets of Unicode codepoints.'
|
13
|
-
s.homepage = 'https://github.com/
|
13
|
+
s.homepage = 'https://github.com/jaynetics/character_set'
|
14
14
|
s.license = 'MIT'
|
15
15
|
|
16
16
|
s.files = `git ls-files -z`.split("\x0").reject do |f|
|
@@ -22,12 +22,23 @@ Gem::Specification.new do |s|
|
|
22
22
|
|
23
23
|
s.required_ruby_version = '>= 2.1.0'
|
24
24
|
|
25
|
+
# SortedSet, needed for RubyFallback, was moved to a gem in Ruby 3.
|
26
|
+
# This dependency is only used if the C extension is unavailable.
|
27
|
+
# JRuby has it in the stdlib.
|
28
|
+
if RUBY_VERSION.to_f >= 3.0 && !RUBY_PLATFORM[/java/i]
|
29
|
+
s.add_dependency 'sorted_set', '~> 1.0'
|
30
|
+
end
|
31
|
+
|
25
32
|
s.add_development_dependency 'benchmark-ips', '~> 2.7'
|
26
|
-
s.add_development_dependency '
|
27
|
-
s.add_development_dependency 'rake', '~>
|
28
|
-
s.add_development_dependency 'rake-compiler', '~> 1.
|
33
|
+
s.add_development_dependency 'get_process_mem', '~> 0.2.3'
|
34
|
+
s.add_development_dependency 'rake', '~> 13.0'
|
35
|
+
s.add_development_dependency 'rake-compiler', '~> 1.1'
|
29
36
|
s.add_development_dependency 'range_compressor', '~> 1.0'
|
30
|
-
s.add_development_dependency 'regexp_parser', '~> 1.
|
31
|
-
s.add_development_dependency 'regexp_property_values', '~> 0
|
37
|
+
s.add_development_dependency 'regexp_parser', '~> 1.6'
|
38
|
+
s.add_development_dependency 'regexp_property_values', '~> 1.0'
|
32
39
|
s.add_development_dependency 'rspec', '~> 3.8'
|
40
|
+
if RUBY_VERSION.to_f >= 2.7
|
41
|
+
s.add_development_dependency 'codecov', '~> 0.2.12'
|
42
|
+
s.add_development_dependency 'rubocop', '~> 1.8'
|
43
|
+
end
|
33
44
|
end
|
@@ -2,81 +2,180 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include "unicode_casefold_table.h"
|
4
4
|
|
5
|
-
#define
|
6
|
-
#define
|
7
|
-
#define
|
5
|
+
#define UNICODE_PLANE_SIZE 0x10000
|
6
|
+
#define UNICODE_PLANE_COUNT 17
|
7
|
+
#define UNICODE_CP_COUNT (UNICODE_PLANE_SIZE * UNICODE_PLANE_COUNT)
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
// start at ascii size
|
10
|
+
#define CS_DEFAULT_INITIAL_LEN 128
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
typedef char cs_ar;
|
13
|
+
typedef unsigned long cs_cp;
|
14
|
+
|
15
|
+
struct cs_data
|
16
|
+
{
|
17
|
+
cs_ar *cps;
|
18
|
+
cs_cp len;
|
19
|
+
};
|
20
|
+
|
21
|
+
#define CS_MSIZE(len) (sizeof(cs_ar) * (len / 8))
|
22
|
+
|
23
|
+
static inline void
|
24
|
+
add_memspace_for_another_plane(struct cs_data *data)
|
25
|
+
{
|
26
|
+
data->cps = ruby_xrealloc(data->cps, CS_MSIZE(data->len + UNICODE_PLANE_SIZE));
|
27
|
+
memset(data->cps + CS_MSIZE(data->len), 0, CS_MSIZE(UNICODE_PLANE_SIZE));
|
28
|
+
data->len += UNICODE_PLANE_SIZE;
|
29
|
+
}
|
30
|
+
|
31
|
+
static inline void
|
32
|
+
ensure_memsize_fits(struct cs_data *data, cs_cp target_cp)
|
33
|
+
{
|
34
|
+
while (target_cp >= data->len)
|
35
|
+
{
|
36
|
+
add_memspace_for_another_plane(data);
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
static inline void
|
41
|
+
set_cp(struct cs_data *data, cs_cp cp)
|
42
|
+
{
|
43
|
+
ensure_memsize_fits(data, cp);
|
44
|
+
data->cps[cp >> 3] |= (1 << (cp & 0x07));
|
45
|
+
}
|
46
|
+
|
47
|
+
static inline int
|
48
|
+
tst_cp(cs_ar *cps, cs_cp len, cs_cp cp)
|
49
|
+
{
|
50
|
+
return ((cp < len) && cps[cp >> 3] & (1 << (cp & 0x07)));
|
51
|
+
}
|
52
|
+
|
53
|
+
static inline void
|
54
|
+
clr_cp(cs_ar *cps, cs_cp len, cs_cp cp)
|
55
|
+
{
|
56
|
+
if (cp < len)
|
57
|
+
{
|
58
|
+
cps[cp >> 3] &= ~(1 << (cp & 0x07));
|
59
|
+
}
|
60
|
+
}
|
16
61
|
|
17
62
|
static void
|
18
|
-
|
19
|
-
|
63
|
+
cs_free(void *ptr)
|
64
|
+
{
|
65
|
+
struct cs_data *data = ptr;
|
66
|
+
ruby_xfree(data->cps);
|
67
|
+
ruby_xfree(data);
|
20
68
|
}
|
21
69
|
|
22
70
|
static size_t
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
.
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
71
|
+
cs_memsize(const void *ptr)
|
72
|
+
{
|
73
|
+
const struct cs_data *data = ptr;
|
74
|
+
return sizeof(*data) + CS_MSIZE(data->len);
|
75
|
+
}
|
76
|
+
|
77
|
+
static const rb_data_type_t cs_type = {
|
78
|
+
.wrap_struct_name = "character_set",
|
79
|
+
.function = {
|
80
|
+
.dmark = NULL,
|
81
|
+
.dfree = cs_free,
|
82
|
+
.dsize = cs_memsize,
|
83
|
+
},
|
84
|
+
.data = NULL,
|
85
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
37
86
|
};
|
38
87
|
|
39
|
-
|
40
|
-
|
88
|
+
static inline VALUE
|
89
|
+
cs_alloc_len(VALUE klass, struct cs_data **data_ptr, cs_cp len)
|
90
|
+
{
|
91
|
+
VALUE cs;
|
92
|
+
struct cs_data *data;
|
93
|
+
cs = TypedData_Make_Struct(klass, struct cs_data, &cs_type, data);
|
94
|
+
data->cps = ruby_xmalloc(CS_MSIZE(len));
|
95
|
+
memset(data->cps, 0, CS_MSIZE(len));
|
96
|
+
data->len = len;
|
97
|
+
|
98
|
+
if (data_ptr)
|
99
|
+
{
|
100
|
+
*data_ptr = data;
|
101
|
+
}
|
41
102
|
|
42
|
-
|
43
|
-
|
103
|
+
return cs;
|
104
|
+
}
|
44
105
|
|
45
|
-
static VALUE
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
return NEW_CHARACTER_SET(self, cp_arr);
|
106
|
+
static inline VALUE
|
107
|
+
cs_alloc(VALUE klass, struct cs_data **data_ptr)
|
108
|
+
{
|
109
|
+
return cs_alloc_len(klass, data_ptr, CS_DEFAULT_INITIAL_LEN);
|
50
110
|
}
|
51
111
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
112
|
+
static inline struct cs_data *
|
113
|
+
cs_fetch_data(VALUE cs)
|
114
|
+
{
|
115
|
+
struct cs_data *data;
|
116
|
+
TypedData_Get_Struct(cs, struct cs_data, &cs_type, data);
|
117
|
+
return data;
|
118
|
+
}
|
119
|
+
|
120
|
+
static inline cs_ar *
|
121
|
+
cs_fetch_cps(VALUE cs, cs_cp *len_ptr)
|
122
|
+
{
|
123
|
+
struct cs_data *data;
|
124
|
+
data = cs_fetch_data(cs);
|
125
|
+
*len_ptr = data->len;
|
126
|
+
return data->cps;
|
127
|
+
}
|
128
|
+
|
129
|
+
static VALUE
|
130
|
+
cs_method_allocate(VALUE self)
|
131
|
+
{
|
132
|
+
return cs_alloc(self, 0);
|
133
|
+
}
|
134
|
+
|
135
|
+
#define FOR_EACH_ACTIVE_CODEPOINT(action) \
|
136
|
+
do \
|
137
|
+
{ \
|
138
|
+
cs_cp cp, len; \
|
139
|
+
cs_ar *cps; \
|
140
|
+
cps = cs_fetch_cps(self, &len); \
|
141
|
+
for (cp = 0; cp < len; cp++) \
|
142
|
+
{ \
|
143
|
+
if (tst_cp(cps, len, cp)) \
|
144
|
+
{ \
|
145
|
+
action; \
|
146
|
+
} \
|
147
|
+
} \
|
148
|
+
} while (0)
|
59
149
|
|
60
150
|
// ***************************
|
61
151
|
// `Set` compatibility methods
|
62
152
|
// ***************************
|
63
153
|
|
64
|
-
static inline
|
65
|
-
|
66
|
-
|
154
|
+
static inline cs_cp
|
155
|
+
cs_active_cp_count(VALUE self)
|
156
|
+
{
|
157
|
+
cs_cp count;
|
67
158
|
count = 0;
|
68
159
|
FOR_EACH_ACTIVE_CODEPOINT(count++);
|
69
|
-
return
|
160
|
+
return count;
|
70
161
|
}
|
71
162
|
|
72
163
|
static VALUE
|
73
|
-
|
74
|
-
|
164
|
+
cs_method_length(VALUE self)
|
165
|
+
{
|
166
|
+
return LONG2FIX(cs_active_cp_count(self));
|
167
|
+
}
|
168
|
+
|
169
|
+
static inline VALUE
|
170
|
+
cs_enumerator_length(VALUE self, VALUE args, VALUE eobj)
|
171
|
+
{
|
172
|
+
return LONG2FIX(cs_active_cp_count(self));
|
75
173
|
}
|
76
174
|
|
77
175
|
static VALUE
|
78
|
-
|
79
|
-
|
176
|
+
cs_method_each(VALUE self)
|
177
|
+
{
|
178
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
80
179
|
FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
|
81
180
|
return self;
|
82
181
|
}
|
@@ -84,16 +183,19 @@ method_each(VALUE self) {
|
|
84
183
|
// returns an Array of codepoint Integers by default.
|
85
184
|
// returns an Array of Strings of length 1 if passed `true`.
|
86
185
|
static VALUE
|
87
|
-
|
186
|
+
cs_method_to_a(int argc, VALUE *argv, VALUE self)
|
187
|
+
{
|
88
188
|
VALUE arr;
|
89
189
|
rb_encoding *enc;
|
90
190
|
rb_check_arity(argc, 0, 1);
|
91
191
|
|
92
192
|
arr = rb_ary_new();
|
93
|
-
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
|
193
|
+
if (!argc || NIL_P(argv[0]) || argv[0] == Qfalse)
|
194
|
+
{
|
94
195
|
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
|
95
196
|
}
|
96
|
-
else
|
197
|
+
else
|
198
|
+
{
|
97
199
|
enc = rb_utf8_encoding();
|
98
200
|
FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
|
99
201
|
}
|
@@ -102,302 +204,473 @@ method_to_a(int argc, VALUE *argv, VALUE self) {
|
|
102
204
|
}
|
103
205
|
|
104
206
|
static VALUE
|
105
|
-
|
207
|
+
cs_method_empty_p(VALUE self)
|
208
|
+
{
|
106
209
|
FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
|
107
210
|
return Qtrue;
|
108
211
|
}
|
109
212
|
|
110
213
|
static VALUE
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
214
|
+
cs_method_hash(VALUE self)
|
215
|
+
{
|
216
|
+
cs_cp cp, len, hash, four_byte_value;
|
217
|
+
cs_ar *cps;
|
218
|
+
cps = cs_fetch_cps(self, &len);
|
219
|
+
four_byte_value = 0;
|
115
220
|
|
116
221
|
hash = 17;
|
117
|
-
for (cp = 0; cp <
|
118
|
-
|
119
|
-
|
222
|
+
for (cp = 0; cp < len; cp++)
|
223
|
+
{
|
224
|
+
if (cp % 32 == 0)
|
225
|
+
{
|
226
|
+
if (cp != 0)
|
227
|
+
{
|
228
|
+
hash = hash * 23 + four_byte_value;
|
229
|
+
}
|
120
230
|
four_byte_value = 0;
|
121
231
|
}
|
122
|
-
if (
|
232
|
+
if (tst_cp(cps, len, cp))
|
233
|
+
{
|
234
|
+
four_byte_value++;
|
235
|
+
}
|
123
236
|
}
|
124
237
|
|
125
238
|
return LONG2FIX(hash);
|
126
239
|
}
|
127
240
|
|
128
241
|
static inline VALUE
|
129
|
-
|
242
|
+
cs_delete_if_block_result(VALUE self, int truthy)
|
243
|
+
{
|
130
244
|
VALUE result;
|
131
245
|
rb_need_block();
|
132
246
|
rb_check_frozen(self);
|
133
247
|
FOR_EACH_ACTIVE_CODEPOINT(
|
134
|
-
|
135
|
-
|
136
|
-
);
|
248
|
+
result = rb_yield(LONG2FIX(cp));
|
249
|
+
if ((NIL_P(result) || result == Qfalse) != truthy) clr_cp(cps, len, cp););
|
137
250
|
return self;
|
138
251
|
}
|
139
252
|
|
140
253
|
static VALUE
|
141
|
-
|
142
|
-
|
143
|
-
|
254
|
+
cs_method_delete_if(VALUE self)
|
255
|
+
{
|
256
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
257
|
+
return cs_delete_if_block_result(self, 1);
|
144
258
|
}
|
145
259
|
|
146
260
|
static VALUE
|
147
|
-
|
148
|
-
|
149
|
-
|
261
|
+
cs_method_keep_if(VALUE self)
|
262
|
+
{
|
263
|
+
RETURN_SIZED_ENUMERATOR(self, 0, 0, cs_enumerator_length);
|
264
|
+
return cs_delete_if_block_result(self, 0);
|
150
265
|
}
|
151
266
|
|
152
267
|
static VALUE
|
153
|
-
|
154
|
-
|
155
|
-
|
268
|
+
cs_method_clear(VALUE self)
|
269
|
+
{
|
270
|
+
struct cs_data *data;
|
156
271
|
rb_check_frozen(self);
|
157
|
-
|
158
|
-
|
159
|
-
CLRBIT(cps, cp);
|
160
|
-
}
|
272
|
+
data = cs_fetch_data(self);
|
273
|
+
memset(data->cps, 0, CS_MSIZE(data->len));
|
161
274
|
return self;
|
162
275
|
}
|
163
276
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
277
|
+
static VALUE
|
278
|
+
cs_method_min(VALUE self)
|
279
|
+
{
|
280
|
+
FOR_EACH_ACTIVE_CODEPOINT(return LONG2FIX(cp));
|
281
|
+
return Qnil;
|
282
|
+
}
|
283
|
+
|
284
|
+
static VALUE
|
285
|
+
cs_method_max(VALUE self)
|
286
|
+
{
|
287
|
+
cs_cp len;
|
288
|
+
long reverse_idx;
|
289
|
+
cs_ar *cps;
|
290
|
+
cps = cs_fetch_cps(self, &len);
|
291
|
+
for (reverse_idx = len; reverse_idx >= 0; reverse_idx--)
|
292
|
+
{
|
293
|
+
if (tst_cp(cps, len, reverse_idx))
|
294
|
+
{
|
295
|
+
return LONG2FIX(reverse_idx);
|
296
|
+
}
|
297
|
+
}
|
298
|
+
return Qnil;
|
299
|
+
}
|
300
|
+
|
301
|
+
static VALUE
|
302
|
+
cs_method_minmax(VALUE self)
|
303
|
+
{
|
304
|
+
VALUE arr;
|
305
|
+
arr = rb_ary_new2(2);
|
306
|
+
rb_ary_push(arr, cs_method_min(self));
|
307
|
+
rb_ary_push(arr, cs_method_max(self));
|
308
|
+
return arr;
|
309
|
+
}
|
310
|
+
|
311
|
+
#define RETURN_COMBINED_CS(cs_a, cs_b, comp_op) \
|
312
|
+
do \
|
313
|
+
{ \
|
314
|
+
VALUE new_cs; \
|
315
|
+
cs_cp cp, alen, blen; \
|
316
|
+
cs_ar *acps, *bcps; \
|
317
|
+
struct cs_data *new_data; \
|
318
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data); \
|
319
|
+
acps = cs_fetch_cps(cs_a, &alen); \
|
320
|
+
bcps = cs_fetch_cps(cs_b, &blen); \
|
321
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) \
|
322
|
+
{ \
|
323
|
+
if (tst_cp(acps, alen, cp) comp_op tst_cp(bcps, blen, cp)) \
|
324
|
+
{ \
|
325
|
+
set_cp(new_data, cp); \
|
326
|
+
} \
|
327
|
+
} \
|
328
|
+
return new_cs; \
|
329
|
+
} while (0)
|
174
330
|
|
175
331
|
static VALUE
|
176
|
-
|
177
|
-
|
332
|
+
cs_method_intersection(VALUE self, VALUE other)
|
333
|
+
{
|
334
|
+
RETURN_COMBINED_CS(self, other, &&);
|
178
335
|
}
|
179
336
|
|
180
337
|
static VALUE
|
181
|
-
|
182
|
-
|
338
|
+
cs_method_exclusion(VALUE self, VALUE other)
|
339
|
+
{
|
340
|
+
RETURN_COMBINED_CS(self, other, ^);
|
183
341
|
}
|
184
342
|
|
185
343
|
static VALUE
|
186
|
-
|
187
|
-
|
344
|
+
cs_method_union(VALUE self, VALUE other)
|
345
|
+
{
|
346
|
+
RETURN_COMBINED_CS(self, other, ||);
|
188
347
|
}
|
189
348
|
|
190
349
|
static VALUE
|
191
|
-
|
192
|
-
|
350
|
+
cs_method_difference(VALUE self, VALUE other)
|
351
|
+
{
|
352
|
+
RETURN_COMBINED_CS(self, other, >);
|
193
353
|
}
|
194
354
|
|
195
355
|
static VALUE
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
356
|
+
cs_method_include_p(VALUE self, VALUE num)
|
357
|
+
{
|
358
|
+
cs_ar *cps;
|
359
|
+
cs_cp len;
|
360
|
+
cps = cs_fetch_cps(self, &len);
|
361
|
+
return (tst_cp(cps, len, FIX2ULONG(num)) ? Qtrue : Qfalse);
|
200
362
|
}
|
201
363
|
|
202
|
-
static inline
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
364
|
+
static inline VALUE
|
365
|
+
cs_toggle_codepoint(VALUE cs, VALUE cp_num, int on, int return_nil_if_noop)
|
366
|
+
{
|
367
|
+
cs_cp cp, len;
|
368
|
+
cs_ar *cps;
|
369
|
+
struct cs_data *data;
|
370
|
+
rb_check_frozen(cs);
|
371
|
+
data = cs_fetch_data(cs);
|
372
|
+
cps = data->cps;
|
373
|
+
len = data->len;
|
208
374
|
cp = FIX2ULONG(cp_num);
|
209
|
-
if (
|
210
|
-
|
375
|
+
if (return_nil_if_noop && (!tst_cp(cps, len, cp) == !on))
|
376
|
+
{
|
377
|
+
return Qnil;
|
211
378
|
}
|
212
|
-
else
|
213
|
-
|
214
|
-
|
215
|
-
|
379
|
+
else
|
380
|
+
{
|
381
|
+
if (on)
|
382
|
+
{
|
383
|
+
set_cp(data, cp);
|
384
|
+
}
|
385
|
+
else
|
386
|
+
{
|
387
|
+
clr_cp(cps, len, cp);
|
388
|
+
}
|
389
|
+
return cs;
|
216
390
|
}
|
217
391
|
}
|
218
392
|
|
219
393
|
static VALUE
|
220
|
-
|
221
|
-
|
394
|
+
cs_method_add(VALUE self, VALUE cp_num)
|
395
|
+
{
|
396
|
+
return cs_toggle_codepoint(self, cp_num, 1, 0);
|
222
397
|
}
|
223
398
|
|
224
399
|
static VALUE
|
225
|
-
|
226
|
-
|
400
|
+
cs_method_add_p(VALUE self, VALUE cp_num)
|
401
|
+
{
|
402
|
+
return cs_toggle_codepoint(self, cp_num, 1, 1);
|
227
403
|
}
|
228
404
|
|
229
405
|
static VALUE
|
230
|
-
|
231
|
-
|
406
|
+
cs_method_delete(VALUE self, VALUE cp_num)
|
407
|
+
{
|
408
|
+
return cs_toggle_codepoint(self, cp_num, 0, 0);
|
232
409
|
}
|
233
410
|
|
234
411
|
static VALUE
|
235
|
-
|
236
|
-
|
412
|
+
cs_method_delete_p(VALUE self, VALUE cp_num)
|
413
|
+
{
|
414
|
+
return cs_toggle_codepoint(self, cp_num, 0, 1);
|
237
415
|
}
|
238
416
|
|
239
|
-
#define COMPARE_SETS(action)\
|
240
|
-
cp_index cp;\
|
241
|
-
cp_byte *cps, *other_cps;\
|
242
|
-
FETCH_CODEPOINTS(self, cps);\
|
243
|
-
FETCH_CODEPOINTS(other, other_cps);\
|
244
|
-
for (cp = 0; cp < UNICODE_CP_COUNT; cp++) { action; }\
|
245
|
-
|
246
417
|
static VALUE
|
247
|
-
|
248
|
-
|
418
|
+
cs_method_intersect_p(VALUE self, VALUE other)
|
419
|
+
{
|
420
|
+
cs_cp cp, alen, blen;
|
421
|
+
cs_ar *acps, *bcps;
|
422
|
+
acps = cs_fetch_cps(self, &alen);
|
423
|
+
bcps = cs_fetch_cps(other, &blen);
|
424
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
425
|
+
{
|
426
|
+
if (tst_cp(acps, alen, cp) && tst_cp(bcps, blen, cp))
|
427
|
+
{
|
428
|
+
return Qtrue;
|
429
|
+
}
|
430
|
+
}
|
249
431
|
return Qfalse;
|
250
432
|
}
|
251
433
|
|
252
434
|
static VALUE
|
253
|
-
|
254
|
-
|
435
|
+
cs_method_disjoint_p(VALUE self, VALUE other)
|
436
|
+
{
|
437
|
+
return cs_method_intersect_p(self, other) ? Qfalse : Qtrue;
|
255
438
|
}
|
256
439
|
|
257
440
|
static inline int
|
258
|
-
|
259
|
-
|
441
|
+
cs_check_type(VALUE obj)
|
442
|
+
{
|
443
|
+
return rb_typeddata_is_kind_of(obj, &cs_type);
|
260
444
|
}
|
261
445
|
|
262
446
|
static VALUE
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
447
|
+
cs_cps_eql(VALUE cs_a, VALUE cs_b)
|
448
|
+
{
|
449
|
+
cs_cp cp, alen, blen;
|
450
|
+
cs_ar *acps, *bcps;
|
451
|
+
acps = cs_fetch_cps(cs_a, &alen);
|
452
|
+
bcps = cs_fetch_cps(cs_b, &blen);
|
453
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
454
|
+
{
|
455
|
+
if (tst_cp(acps, alen, cp) != tst_cp(bcps, blen, cp))
|
456
|
+
{
|
457
|
+
return Qfalse;
|
458
|
+
}
|
459
|
+
}
|
269
460
|
return Qtrue;
|
270
461
|
}
|
271
462
|
|
463
|
+
static VALUE
|
464
|
+
cs_method_eql_p(VALUE self, VALUE other)
|
465
|
+
{
|
466
|
+
if (!cs_check_type(other))
|
467
|
+
{
|
468
|
+
return Qfalse;
|
469
|
+
}
|
470
|
+
if (self == other) // same object_id
|
471
|
+
{
|
472
|
+
return Qtrue;
|
473
|
+
}
|
474
|
+
return cs_cps_eql(self, other);
|
475
|
+
}
|
476
|
+
|
272
477
|
static inline VALUE
|
273
|
-
|
274
|
-
|
275
|
-
|
478
|
+
cs_merge_cs(VALUE recipient, VALUE source)
|
479
|
+
{
|
480
|
+
cs_cp cp, source_len;
|
481
|
+
struct cs_data *data;
|
482
|
+
cs_ar *source_cps;
|
483
|
+
data = cs_fetch_data(recipient);
|
484
|
+
source_cps = cs_fetch_cps(source, &source_len);
|
485
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
486
|
+
{
|
487
|
+
if (tst_cp(source_cps, source_len, cp))
|
488
|
+
{
|
489
|
+
set_cp(data, cp);
|
490
|
+
}
|
491
|
+
}
|
492
|
+
return recipient;
|
276
493
|
}
|
277
494
|
|
278
|
-
static inline
|
279
|
-
|
280
|
-
|
495
|
+
static inline cs_cp
|
496
|
+
cs_checked_cp(VALUE object_id)
|
497
|
+
{
|
498
|
+
if (FIXNUM_P(object_id) && object_id > 0 && object_id < 0x220001)
|
499
|
+
{
|
500
|
+
return FIX2ULONG(object_id);
|
501
|
+
}
|
281
502
|
rb_raise(rb_eArgError, "CharacterSet members must be between 0 and 0x10FFFF");
|
282
503
|
}
|
283
504
|
|
284
505
|
static inline VALUE
|
285
|
-
|
506
|
+
cs_merge_rb_range(VALUE self, VALUE rb_range)
|
507
|
+
{
|
286
508
|
VALUE from_id, upto_id;
|
509
|
+
cs_cp from_cp, upto_cp, cont_len, rem;
|
287
510
|
int excl;
|
288
|
-
|
289
|
-
|
290
|
-
FETCH_CODEPOINTS(self, cps);
|
511
|
+
struct cs_data *data;
|
512
|
+
data = cs_fetch_data(self);
|
291
513
|
|
292
|
-
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
|
514
|
+
if (!RTEST(rb_range_values(rb_range, &from_id, &upto_id, &excl)))
|
515
|
+
{
|
293
516
|
rb_raise(rb_eArgError, "pass a Range");
|
294
517
|
}
|
295
|
-
if (excl)
|
518
|
+
if (excl)
|
519
|
+
{
|
520
|
+
upto_id -= 2;
|
521
|
+
}
|
522
|
+
|
523
|
+
from_cp = cs_checked_cp(from_id);
|
524
|
+
upto_cp = cs_checked_cp(upto_id);
|
296
525
|
|
297
|
-
|
298
|
-
|
526
|
+
if (upto_cp > from_cp && (upto_cp - from_cp > 6))
|
527
|
+
{
|
528
|
+
// set bits in preceding partially toggled bytes individually
|
529
|
+
for (/* */; (from_cp <= upto_cp) && (from_cp % 8); from_cp++)
|
530
|
+
{
|
531
|
+
set_cp(data, from_cp);
|
532
|
+
}
|
533
|
+
// memset contiguous bits directly
|
534
|
+
cont_len = upto_cp - from_cp + 1;
|
535
|
+
rem = cont_len % 8;
|
536
|
+
ensure_memsize_fits(data, upto_cp);
|
537
|
+
memset(data->cps + CS_MSIZE(from_cp), 0xFF, CS_MSIZE(cont_len - rem) / 8);
|
538
|
+
from_cp = upto_cp - rem + 1;
|
539
|
+
}
|
299
540
|
|
300
|
-
|
301
|
-
|
302
|
-
|
541
|
+
// set bits in partially toggled bytes individually
|
542
|
+
for (/* */; from_cp <= upto_cp; from_cp++)
|
543
|
+
{
|
544
|
+
set_cp(data, from_cp);
|
303
545
|
}
|
546
|
+
|
304
547
|
return self;
|
305
548
|
}
|
306
549
|
|
307
550
|
static inline VALUE
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
FETCH_CODEPOINTS(self, cps);
|
551
|
+
cs_merge_rb_array(VALUE self, VALUE rb_array)
|
552
|
+
{
|
553
|
+
VALUE el, array_length, i;
|
554
|
+
struct cs_data *data;
|
313
555
|
Check_Type(rb_array, T_ARRAY);
|
556
|
+
data = cs_fetch_data(self);
|
314
557
|
array_length = RARRAY_LEN(rb_array);
|
315
|
-
for (i = 0; i < array_length; i++)
|
558
|
+
for (i = 0; i < array_length; i++)
|
559
|
+
{
|
316
560
|
el = RARRAY_AREF(rb_array, i);
|
317
|
-
|
318
|
-
SETBIT(cps, FIX2ULONG(el));
|
561
|
+
set_cp(data, cs_checked_cp(el));
|
319
562
|
}
|
320
563
|
return self;
|
321
564
|
}
|
322
565
|
|
323
566
|
static VALUE
|
324
|
-
|
567
|
+
cs_method_merge(VALUE self, VALUE other)
|
568
|
+
{
|
325
569
|
rb_check_frozen(self);
|
326
|
-
if (
|
327
|
-
|
570
|
+
if (cs_check_type(other))
|
571
|
+
{
|
572
|
+
return cs_merge_cs(self, other);
|
328
573
|
}
|
329
|
-
else if (TYPE(other) == T_ARRAY)
|
330
|
-
|
574
|
+
else if (TYPE(other) == T_ARRAY)
|
575
|
+
{
|
576
|
+
return cs_merge_rb_array(self, other);
|
331
577
|
}
|
332
|
-
return
|
578
|
+
return cs_merge_rb_range(self, other);
|
333
579
|
}
|
334
580
|
|
335
581
|
static VALUE
|
336
|
-
|
337
|
-
|
338
|
-
|
582
|
+
cs_method_initialize_copy(VALUE self, VALUE orig)
|
583
|
+
{
|
584
|
+
cs_merge_cs(self, orig);
|
585
|
+
return self;
|
339
586
|
}
|
340
587
|
|
341
588
|
static VALUE
|
342
|
-
|
589
|
+
cs_method_subtract(VALUE self, VALUE other)
|
590
|
+
{
|
591
|
+
cs_cp cp, len, other_len;
|
592
|
+
cs_ar *cps, *other_cps;
|
343
593
|
rb_check_frozen(self);
|
344
|
-
|
594
|
+
cps = cs_fetch_cps(self, &len);
|
595
|
+
other_cps = cs_fetch_cps(other, &other_len);
|
596
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
597
|
+
{
|
598
|
+
if (tst_cp(other_cps, other_len, cp))
|
599
|
+
{
|
600
|
+
clr_cp(cps, len, cp);
|
601
|
+
}
|
602
|
+
}
|
345
603
|
return self;
|
346
604
|
}
|
347
605
|
|
348
606
|
static inline int
|
349
|
-
|
350
|
-
|
351
|
-
|
607
|
+
cs_a_subset_of_b(VALUE cs_a, VALUE cs_b, int *is_proper_ptr)
|
608
|
+
{
|
609
|
+
cs_ar *a, *b;
|
610
|
+
cs_cp cp, alen, blen, count_a, count_b;
|
352
611
|
|
353
|
-
if (!
|
612
|
+
if (!cs_check_type(cs_a) || !cs_check_type(cs_b))
|
613
|
+
{
|
354
614
|
rb_raise(rb_eArgError, "pass a CharacterSet");
|
355
615
|
}
|
356
616
|
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
if (
|
366
|
-
|
367
|
-
|
368
|
-
|
617
|
+
a = cs_fetch_cps(cs_a, &alen);
|
618
|
+
b = cs_fetch_cps(cs_b, &blen);
|
619
|
+
|
620
|
+
count_a = 0;
|
621
|
+
count_b = 0;
|
622
|
+
|
623
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
624
|
+
{
|
625
|
+
if (tst_cp(a, alen, cp))
|
626
|
+
{
|
627
|
+
if (!tst_cp(b, blen, cp))
|
628
|
+
{
|
629
|
+
return 0;
|
630
|
+
}
|
631
|
+
count_a++;
|
632
|
+
count_b++;
|
633
|
+
}
|
634
|
+
else if (tst_cp(b, blen, cp))
|
635
|
+
{
|
636
|
+
count_b++;
|
369
637
|
}
|
370
|
-
else if (TSTBIT(cps_b, cp)) size_b++;
|
371
638
|
}
|
372
639
|
|
373
|
-
if (
|
640
|
+
if (is_proper_ptr)
|
641
|
+
{
|
642
|
+
*is_proper_ptr = count_b > count_a;
|
643
|
+
}
|
644
|
+
|
374
645
|
return 1;
|
375
646
|
}
|
376
647
|
|
377
648
|
static VALUE
|
378
|
-
|
379
|
-
|
380
|
-
return
|
649
|
+
cs_method_subset_p(VALUE self, VALUE other)
|
650
|
+
{
|
651
|
+
return cs_a_subset_of_b(self, other, NULL) ? Qtrue : Qfalse;
|
381
652
|
}
|
382
653
|
|
383
654
|
static VALUE
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
655
|
+
cs_method_proper_subset_p(VALUE self, VALUE other)
|
656
|
+
{
|
657
|
+
int is_subset, is_proper;
|
658
|
+
is_subset = cs_a_subset_of_b(self, other, &is_proper);
|
659
|
+
return (is_subset && is_proper) ? Qtrue : Qfalse;
|
388
660
|
}
|
389
661
|
|
390
662
|
static VALUE
|
391
|
-
|
392
|
-
|
393
|
-
return
|
663
|
+
cs_method_superset_p(VALUE self, VALUE other)
|
664
|
+
{
|
665
|
+
return cs_a_subset_of_b(other, self, NULL) ? Qtrue : Qfalse;
|
394
666
|
}
|
395
667
|
|
396
668
|
static VALUE
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
669
|
+
cs_method_proper_superset_p(VALUE self, VALUE other)
|
670
|
+
{
|
671
|
+
int is_superset, is_proper;
|
672
|
+
is_superset = cs_a_subset_of_b(other, self, &is_proper);
|
673
|
+
return (is_superset && is_proper) ? Qtrue : Qfalse;
|
401
674
|
}
|
402
675
|
|
403
676
|
// *******************************
|
@@ -405,42 +678,43 @@ method_proper_superset_p(VALUE self, VALUE other) {
|
|
405
678
|
// *******************************
|
406
679
|
|
407
680
|
static VALUE
|
408
|
-
|
409
|
-
|
410
|
-
|
681
|
+
cs_class_method_from_ranges(VALUE self, VALUE ranges)
|
682
|
+
{
|
683
|
+
VALUE new_cs, range_count, i;
|
684
|
+
new_cs = rb_class_new_instance(0, 0, self);
|
411
685
|
range_count = RARRAY_LEN(ranges);
|
412
|
-
for (i = 0; i < range_count; i++)
|
413
|
-
|
686
|
+
for (i = 0; i < range_count; i++)
|
687
|
+
{
|
688
|
+
cs_merge_rb_range(new_cs, RARRAY_AREF(ranges, i));
|
414
689
|
}
|
415
|
-
return
|
690
|
+
return new_cs;
|
416
691
|
}
|
417
692
|
|
418
693
|
static VALUE
|
419
|
-
|
420
|
-
|
694
|
+
cs_method_ranges(VALUE self)
|
695
|
+
{
|
696
|
+
VALUE ranges, cp_num, previous_cp_num, current_start, current_end;
|
421
697
|
|
422
698
|
ranges = rb_ary_new();
|
423
|
-
|
699
|
+
previous_cp_num = 0;
|
424
700
|
current_start = 0;
|
425
701
|
current_end = 0;
|
426
702
|
|
427
703
|
FOR_EACH_ACTIVE_CODEPOINT(
|
428
|
-
|
704
|
+
cp_num = LONG2FIX(cp);
|
429
705
|
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
current_end = codepoint;
|
439
|
-
previous_codepoint = codepoint;
|
440
|
-
);
|
706
|
+
if (!previous_cp_num) {
|
707
|
+
current_start = cp_num;
|
708
|
+
} else if (previous_cp_num + 2 != cp_num) {
|
709
|
+
// gap found, finalize previous range
|
710
|
+
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
711
|
+
current_start = cp_num;
|
712
|
+
} current_end = cp_num;
|
713
|
+
previous_cp_num = cp_num;);
|
441
714
|
|
442
715
|
// add final range
|
443
|
-
if (current_start)
|
716
|
+
if (current_start)
|
717
|
+
{
|
444
718
|
rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
|
445
719
|
}
|
446
720
|
|
@@ -448,117 +722,233 @@ method_ranges(VALUE self) {
|
|
448
722
|
}
|
449
723
|
|
450
724
|
static VALUE
|
451
|
-
|
452
|
-
|
725
|
+
cs_method_sample(int argc, VALUE *argv, VALUE self)
|
726
|
+
{
|
727
|
+
VALUE array, to_a_args[1] = {Qtrue};
|
453
728
|
rb_check_arity(argc, 0, 1);
|
454
|
-
|
455
|
-
array = method_to_a(1, to_a_args, self);
|
729
|
+
array = cs_method_to_a(1, to_a_args, self);
|
456
730
|
return rb_funcall(array, rb_intern("sample"), argc, argc ? argv[0] : 0);
|
457
731
|
}
|
458
732
|
|
459
733
|
static inline VALUE
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
734
|
+
cs_from_section(VALUE set, cs_cp from, cs_cp upto)
|
735
|
+
{
|
736
|
+
VALUE new_cs;
|
737
|
+
cs_ar *cps;
|
738
|
+
cs_cp cp, len;
|
739
|
+
struct cs_data *new_data;
|
740
|
+
new_cs = cs_alloc(RBASIC(set)->klass, &new_data);
|
741
|
+
cps = cs_fetch_cps(set, &len);
|
742
|
+
for (cp = from; cp <= upto; cp++)
|
743
|
+
{
|
744
|
+
if (tst_cp(cps, len, cp))
|
745
|
+
{
|
746
|
+
set_cp(new_data, cp);
|
747
|
+
}
|
467
748
|
}
|
468
|
-
return
|
749
|
+
return new_cs;
|
469
750
|
}
|
470
751
|
|
471
752
|
static VALUE
|
472
|
-
|
473
|
-
|
753
|
+
cs_method_ext_section(VALUE self, VALUE from, VALUE upto)
|
754
|
+
{
|
755
|
+
return cs_from_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
756
|
+
}
|
757
|
+
|
758
|
+
static inline cs_cp
|
759
|
+
cs_active_cp_count_in_section(VALUE set, cs_cp from, cs_cp upto)
|
760
|
+
{
|
761
|
+
cs_ar *cps;
|
762
|
+
cs_cp cp, count, len;
|
763
|
+
cps = cs_fetch_cps(set, &len);
|
764
|
+
for (count = 0, cp = from; cp <= upto; cp++)
|
765
|
+
{
|
766
|
+
if (tst_cp(cps, len, cp))
|
767
|
+
{
|
768
|
+
count++;
|
769
|
+
}
|
770
|
+
}
|
771
|
+
return count;
|
474
772
|
}
|
475
773
|
|
476
774
|
static VALUE
|
477
|
-
|
478
|
-
|
775
|
+
cs_method_ext_count_in_section(VALUE self, VALUE from, VALUE upto)
|
776
|
+
{
|
777
|
+
cs_cp count;
|
778
|
+
count = cs_active_cp_count_in_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
779
|
+
return LONG2FIX(count);
|
479
780
|
}
|
480
781
|
|
481
782
|
static inline VALUE
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
783
|
+
cs_has_cp_in_section(cs_ar *cps, cs_cp len, cs_cp from, cs_cp upto)
|
784
|
+
{
|
785
|
+
cs_cp cp;
|
786
|
+
for (cp = from; cp <= upto; cp++)
|
787
|
+
{
|
788
|
+
if (tst_cp(cps, len, cp))
|
789
|
+
{
|
790
|
+
return Qtrue;
|
791
|
+
}
|
490
792
|
}
|
491
793
|
return Qfalse;
|
492
794
|
}
|
493
795
|
|
494
796
|
static VALUE
|
495
|
-
|
797
|
+
cs_method_ext_section_p(VALUE self, VALUE from, VALUE upto)
|
798
|
+
{
|
799
|
+
cs_ar *cps;
|
800
|
+
cs_cp len;
|
801
|
+
cps = cs_fetch_cps(self, &len);
|
802
|
+
return cs_has_cp_in_section(cps, len, FIX2ULONG(from), FIX2ULONG(upto));
|
803
|
+
}
|
804
|
+
|
805
|
+
static inline VALUE
|
806
|
+
cs_ratio_of_section(VALUE set, cs_cp from, cs_cp upto)
|
807
|
+
{
|
808
|
+
double section_count, total_count;
|
809
|
+
section_count = (double)cs_active_cp_count_in_section(set, from, upto);
|
810
|
+
total_count = (double)cs_active_cp_count(set);
|
811
|
+
return DBL2NUM(section_count / total_count);
|
812
|
+
}
|
813
|
+
|
814
|
+
static VALUE
|
815
|
+
cs_method_ext_section_ratio(VALUE self, VALUE from, VALUE upto)
|
816
|
+
{
|
817
|
+
return cs_ratio_of_section(self, FIX2ULONG(from), FIX2ULONG(upto));
|
818
|
+
}
|
819
|
+
|
820
|
+
// Codepoint boundaries used by the plane/section helpers.
#define MAX_CP 0x10FFFF       // highest Unicode codepoint
#define MAX_ASCII_CP 0x7F     // last codepoint of the ASCII range
#define MAX_BMP_CP 0xFFFF     // last codepoint of the Basic Multilingual Plane
#define MIN_ASTRAL_CP 0x10000 // first codepoint beyond the BMP
|
824
|
+
|
825
|
+
static inline VALUE
|
826
|
+
cs_has_cp_in_plane(cs_ar *cps, cs_cp len, unsigned int plane)
|
827
|
+
{
|
828
|
+
cs_cp plane_beg, plane_end;
|
829
|
+
plane_beg = plane * UNICODE_PLANE_SIZE;
|
830
|
+
plane_end = (plane + 1) * MAX_BMP_CP;
|
831
|
+
return cs_has_cp_in_section(cps, len, plane_beg, plane_end);
|
832
|
+
}
|
833
|
+
|
834
|
+
static VALUE
|
835
|
+
cs_method_planes(VALUE self)
|
836
|
+
{
|
837
|
+
cs_ar *cps;
|
838
|
+
cs_cp len;
|
496
839
|
unsigned int i;
|
497
840
|
VALUE planes;
|
841
|
+
cps = cs_fetch_cps(self, &len);
|
498
842
|
planes = rb_ary_new();
|
499
|
-
for (i = 0; i < UNICODE_PLANE_COUNT; i++)
|
500
|
-
|
843
|
+
for (i = 0; i < UNICODE_PLANE_COUNT; i++)
|
844
|
+
{
|
845
|
+
if (cs_has_cp_in_plane(cps, len, i))
|
846
|
+
{
|
847
|
+
rb_ary_push(planes, INT2FIX(i));
|
848
|
+
}
|
501
849
|
}
|
502
850
|
return planes;
|
503
851
|
}
|
504
852
|
|
505
|
-
static
|
506
|
-
|
853
|
+
static inline int
|
854
|
+
cs_valid_plane_num(VALUE num)
|
855
|
+
{
|
507
856
|
int plane;
|
508
|
-
Check_Type(
|
509
|
-
plane = FIX2INT(
|
510
|
-
if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
|
511
|
-
|
857
|
+
Check_Type(num, T_FIXNUM);
|
858
|
+
plane = FIX2INT(num);
|
859
|
+
if (plane < 0 || plane >= UNICODE_PLANE_COUNT)
|
860
|
+
{
|
861
|
+
rb_raise(rb_eArgError, "plane must be between 0 and %d", UNICODE_PLANE_COUNT - 1);
|
512
862
|
}
|
513
|
-
return
|
863
|
+
return plane;
|
864
|
+
}
|
865
|
+
|
866
|
+
static VALUE
|
867
|
+
cs_method_plane(VALUE self, VALUE plane_num)
|
868
|
+
{
|
869
|
+
cs_cp plane, plane_beg, plane_end;
|
870
|
+
plane = cs_valid_plane_num(plane_num);
|
871
|
+
plane_beg = plane * UNICODE_PLANE_SIZE;
|
872
|
+
plane_end = (plane + 1) * MAX_BMP_CP;
|
873
|
+
return cs_from_section(self, plane_beg, plane_end);
|
874
|
+
}
|
875
|
+
|
876
|
+
static VALUE
|
877
|
+
cs_method_member_in_plane_p(VALUE self, VALUE plane_num)
|
878
|
+
{
|
879
|
+
cs_ar *cps;
|
880
|
+
cs_cp len;
|
881
|
+
unsigned int plane;
|
882
|
+
plane = cs_valid_plane_num(plane_num);
|
883
|
+
cps = cs_fetch_cps(self, &len);
|
884
|
+
return cs_has_cp_in_plane(cps, len, plane);
|
514
885
|
}
|
515
886
|
|
516
887
|
// True when cp lies outside the surrogate range 0xD800..0xDFFF.
// The argument is parenthesized so that expressions with low-precedence
// operators (e.g. `a | b`) are compared as a whole. It is evaluated twice,
// so do not pass expressions with side effects.
#define NON_SURROGATE(cp) ((cp) > 0xDFFF || (cp) < 0xD800)
|
517
888
|
|
518
889
|
static VALUE
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
890
|
+
cs_method_ext_inversion(int argc, VALUE *argv, VALUE self)
|
891
|
+
{
|
892
|
+
int inc_surr;
|
893
|
+
cs_cp upto, cp, len;
|
894
|
+
cs_ar *cps;
|
895
|
+
VALUE new_cs;
|
896
|
+
struct cs_data *new_data;
|
897
|
+
|
524
898
|
rb_check_arity(argc, 0, 2);
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
899
|
+
|
900
|
+
cps = cs_fetch_cps(self, &len);
|
901
|
+
inc_surr = argc && argv[0] == Qtrue;
|
902
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
|
903
|
+
upto = argc > 1 && FIXNUM_P(argv[1]) ? FIX2ULONG(argv[1]) : UNICODE_CP_COUNT;
|
904
|
+
|
905
|
+
for (cp = 0; cp < UNICODE_CP_COUNT; cp++)
|
906
|
+
{
|
907
|
+
if (cp <= upto && !tst_cp(cps, len, cp) && (inc_surr || NON_SURROGATE(cp)))
|
908
|
+
{
|
909
|
+
set_cp(new_data, cp);
|
910
|
+
}
|
531
911
|
}
|
532
|
-
|
533
|
-
|
534
|
-
);
|
912
|
+
|
913
|
+
return new_cs;
|
535
914
|
}
|
536
915
|
|
537
|
-
typedef int(*str_cp_handler)(unsigned int,
|
916
|
+
// Per-codepoint callback used by the string iterators (each_sb_cp, each_mb_cp).
// It receives the current string codepoint, a codepoint array with its length,
// a set data struct and a caller-defined memo pointer. The iterators stop and
// return Qfalse as soon as a handler returns 0.
typedef int (*str_cp_handler)(unsigned int, cs_ar *, cs_cp len, struct cs_data *data, VALUE *memo);
|
538
917
|
|
539
918
|
static inline int
|
540
|
-
add_str_cp_to_arr(unsigned int str_cp,
|
541
|
-
|
919
|
+
add_str_cp_to_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
920
|
+
{
|
921
|
+
set_cp(data, str_cp);
|
542
922
|
return 1;
|
543
923
|
}
|
544
924
|
|
545
925
|
static VALUE
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
926
|
+
cs_method_case_insensitive(VALUE self)
|
927
|
+
{
|
928
|
+
cs_cp i, len;
|
929
|
+
cs_ar *cps;
|
930
|
+
VALUE new_cs;
|
931
|
+
struct cs_data *new_data;
|
551
932
|
|
552
|
-
|
933
|
+
cps = cs_fetch_cps(self, &len);
|
934
|
+
new_cs = cs_alloc(RBASIC(self)->klass, &new_data);
|
935
|
+
cs_merge_cs(new_cs, self);
|
553
936
|
|
554
|
-
for (i = 0; i < CASEFOLD_COUNT; i++)
|
937
|
+
for (i = 0; i < CASEFOLD_COUNT; i++)
|
938
|
+
{
|
555
939
|
casefold_mapping m = unicode_casefold_table[i];
|
556
940
|
|
557
|
-
if
|
558
|
-
|
941
|
+
if (tst_cp(cps, len, m.from))
|
942
|
+
{
|
943
|
+
set_cp(new_data, m.to);
|
944
|
+
}
|
945
|
+
else if (tst_cp(cps, len, m.to))
|
946
|
+
{
|
947
|
+
set_cp(new_data, m.from);
|
948
|
+
}
|
559
949
|
}
|
560
950
|
|
561
|
-
return
|
951
|
+
return new_cs;
|
562
952
|
|
563
953
|
// OnigCaseFoldType flags;
|
564
954
|
// rb_encoding *enc;
|
@@ -573,20 +963,27 @@ method_case_insensitive(VALUE self) {
|
|
573
963
|
}
|
574
964
|
|
575
965
|
static inline VALUE
|
576
|
-
each_sb_cp(VALUE str, str_cp_handler func,
|
577
|
-
|
966
|
+
each_sb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
967
|
+
{
|
968
|
+
long i, str_len;
|
578
969
|
unsigned int str_cp;
|
970
|
+
str_len = RSTRING_LEN(str);
|
579
971
|
|
580
|
-
for (i = 0; i <
|
972
|
+
for (i = 0; i < str_len; i++)
|
973
|
+
{
|
581
974
|
str_cp = (RSTRING_PTR(str)[i] & 0xff);
|
582
|
-
if (!(*func)(str_cp, cp_arr))
|
975
|
+
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
976
|
+
{
|
977
|
+
return Qfalse;
|
978
|
+
}
|
583
979
|
}
|
584
980
|
|
585
981
|
return Qtrue;
|
586
982
|
}
|
587
983
|
|
588
984
|
static inline VALUE
|
589
|
-
each_mb_cp(VALUE str, str_cp_handler func,
|
985
|
+
each_mb_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
986
|
+
{
|
590
987
|
int n;
|
591
988
|
unsigned int str_cp;
|
592
989
|
const char *ptr, *end;
|
@@ -597,9 +994,13 @@ each_mb_cp(VALUE str, str_cp_handler func, cp_byte *cp_arr) {
|
|
597
994
|
end = RSTRING_END(str);
|
598
995
|
enc = rb_enc_get(str);
|
599
996
|
|
600
|
-
while (ptr < end)
|
997
|
+
while (ptr < end)
|
998
|
+
{
|
601
999
|
str_cp = rb_enc_codepoint_len(ptr, end, &n, enc);
|
602
|
-
if (!(*func)(str_cp, cp_arr))
|
1000
|
+
if (!(*func)(str_cp, cp_arr, len, data, memo))
|
1001
|
+
{
|
1002
|
+
return Qfalse;
|
1003
|
+
}
|
603
1004
|
ptr += n;
|
604
1005
|
}
|
605
1006
|
|
@@ -611,105 +1012,236 @@ static inline int
|
|
611
1012
|
single_byte_optimizable(VALUE str)
|
612
1013
|
{
|
613
1014
|
rb_encoding *enc;
|
614
|
-
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
1015
|
+
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
|
1016
|
+
{
|
1017
|
+
return 1;
|
1018
|
+
}
|
615
1019
|
|
616
1020
|
enc = rb_enc_get(str);
|
617
|
-
if (rb_enc_mbmaxlen(enc) == 1)
|
1021
|
+
if (rb_enc_mbmaxlen(enc) == 1)
|
1022
|
+
{
|
1023
|
+
return 1;
|
1024
|
+
}
|
618
1025
|
|
619
1026
|
return 0;
|
620
1027
|
}
|
621
1028
|
|
622
1029
|
static inline VALUE
|
623
|
-
each_cp(VALUE str, str_cp_handler func,
|
624
|
-
|
625
|
-
|
1030
|
+
each_cp(VALUE str, str_cp_handler func, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1031
|
+
{
|
1032
|
+
if (single_byte_optimizable(str))
|
1033
|
+
{
|
1034
|
+
return each_sb_cp(str, func, cp_arr, len, data, memo);
|
626
1035
|
}
|
627
|
-
return each_mb_cp(str, func, cp_arr);
|
1036
|
+
return each_mb_cp(str, func, cp_arr, len, data, memo);
|
628
1037
|
}
|
629
1038
|
|
630
1039
|
static inline void
|
631
|
-
raise_arg_err_unless_string(VALUE val)
|
632
|
-
|
1040
|
+
raise_arg_err_unless_string(VALUE val)
|
1041
|
+
{
|
1042
|
+
if (!RB_TYPE_P(val, T_STRING))
|
1043
|
+
{
|
1044
|
+
rb_raise(rb_eArgError, "pass a String");
|
1045
|
+
}
|
633
1046
|
}
|
634
1047
|
|
635
1048
|
static VALUE
|
636
|
-
|
637
|
-
|
1049
|
+
cs_class_method_of(VALUE self, VALUE str)
|
1050
|
+
{
|
1051
|
+
VALUE new_cs;
|
1052
|
+
struct cs_data *new_data;
|
1053
|
+
new_cs = cs_alloc(self, &new_data);
|
638
1054
|
raise_arg_err_unless_string(str);
|
639
|
-
|
640
|
-
|
641
|
-
return NEW_CHARACTER_SET(self, cp_arr);
|
1055
|
+
each_cp(str, add_str_cp_to_arr, 0, 0, new_data, 0);
|
1056
|
+
return new_cs;
|
642
1057
|
}
|
643
1058
|
|
644
1059
|
static inline int
|
645
|
-
|
646
|
-
|
1060
|
+
count_str_cp(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1061
|
+
{
|
1062
|
+
if (tst_cp(cp_arr, len, str_cp))
|
1063
|
+
{
|
1064
|
+
*memo += 1;
|
1065
|
+
}
|
1066
|
+
return 1;
|
647
1067
|
}
|
648
1068
|
|
649
1069
|
static VALUE
|
650
|
-
|
651
|
-
|
652
|
-
VALUE
|
1070
|
+
cs_method_count_in(VALUE self, VALUE str)
|
1071
|
+
{
|
1072
|
+
VALUE count;
|
1073
|
+
struct cs_data *data;
|
653
1074
|
raise_arg_err_unless_string(str);
|
654
|
-
|
655
|
-
|
656
|
-
|
1075
|
+
data = cs_fetch_data(self);
|
1076
|
+
count = 0;
|
1077
|
+
each_cp(str, count_str_cp, data->cps, data->len, data, &count);
|
1078
|
+
return INT2NUM((int)count);
|
1079
|
+
}
|
1080
|
+
|
1081
|
+
static inline int
|
1082
|
+
str_cp_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1083
|
+
{
|
1084
|
+
return tst_cp(cp_arr, len, str_cp);
|
1085
|
+
}
|
1086
|
+
|
1087
|
+
static VALUE
|
1088
|
+
cs_method_cover_p(VALUE self, VALUE str)
|
1089
|
+
{
|
1090
|
+
struct cs_data *data;
|
1091
|
+
raise_arg_err_unless_string(str);
|
1092
|
+
data = cs_fetch_data(self);
|
1093
|
+
return each_cp(str, str_cp_in_arr, data->cps, data->len, data, 0);
|
1094
|
+
}
|
1095
|
+
|
1096
|
+
static inline int
|
1097
|
+
add_str_cp_to_str_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1098
|
+
{
|
1099
|
+
if (tst_cp(cp_arr, len, str_cp))
|
1100
|
+
{
|
1101
|
+
rb_ary_push(memo[0], rb_enc_uint_chr((int)str_cp, (rb_encoding *)memo[1]));
|
1102
|
+
}
|
1103
|
+
return 1;
|
1104
|
+
}
|
1105
|
+
|
1106
|
+
static VALUE
|
1107
|
+
cs_method_scan(VALUE self, VALUE str)
|
1108
|
+
{
|
1109
|
+
VALUE memo[2];
|
1110
|
+
struct cs_data *data;
|
1111
|
+
raise_arg_err_unless_string(str);
|
1112
|
+
data = cs_fetch_data(self);
|
1113
|
+
memo[0] = rb_ary_new();
|
1114
|
+
memo[1] = (VALUE)rb_enc_get(str);
|
1115
|
+
each_cp(str, add_str_cp_to_str_arr, data->cps, data->len, data, memo);
|
1116
|
+
return memo[0];
|
657
1117
|
}
|
658
1118
|
|
659
1119
|
static inline int
|
660
|
-
|
661
|
-
|
1120
|
+
str_cp_not_in_arr(unsigned int str_cp, cs_ar *cp_arr, cs_cp len, struct cs_data *data, VALUE *memo)
|
1121
|
+
{
|
1122
|
+
return !tst_cp(cp_arr, len, str_cp);
|
662
1123
|
}
|
663
1124
|
|
664
1125
|
static VALUE
|
665
|
-
|
666
|
-
|
1126
|
+
cs_method_used_by_p(VALUE self, VALUE str)
|
1127
|
+
{
|
1128
|
+
VALUE only_uses_other_cps;
|
1129
|
+
struct cs_data *data;
|
667
1130
|
raise_arg_err_unless_string(str);
|
668
|
-
|
669
|
-
|
1131
|
+
data = cs_fetch_data(self);
|
1132
|
+
only_uses_other_cps = each_cp(str, str_cp_not_in_arr, data->cps, data->len, data, 0);
|
1133
|
+
return only_uses_other_cps == Qfalse ? Qtrue : Qfalse;
|
1134
|
+
}
|
1135
|
+
|
1136
|
+
static void
|
1137
|
+
cs_str_buf_cat(VALUE str, const char *ptr, long len)
|
1138
|
+
{
|
1139
|
+
long total, olen;
|
1140
|
+
char *sptr;
|
1141
|
+
|
1142
|
+
RSTRING_GETMEM(str, sptr, olen);
|
1143
|
+
sptr = RSTRING(str)->as.heap.ptr;
|
1144
|
+
olen = RSTRING(str)->as.heap.len;
|
1145
|
+
total = olen + len;
|
1146
|
+
memcpy(sptr + olen, ptr, len);
|
1147
|
+
RSTRING(str)->as.heap.len = total;
|
1148
|
+
}
|
1149
|
+
|
1150
|
+
// Fallback definition of TERM_FILL, used only if no such macro is already
// defined. Writes `termlen` zero bytes at `ptr`: a single '\0' in the common
// case, memset for wider terminators.
#ifndef TERM_FILL
#define TERM_FILL(ptr, termlen)                     \
  do                                                \
  {                                                 \
    char *const term_fill_ptr = (ptr);              \
    const int term_fill_len = (termlen);            \
    *term_fill_ptr = '\0';                          \
    if (__builtin_expect(!!(term_fill_len > 1), 0)) \
      memset(term_fill_ptr, 0, term_fill_len);      \
  } while (0)
#endif
|
1161
|
+
|
1162
|
+
static void
|
1163
|
+
cs_str_buf_terminate(VALUE str, rb_encoding *enc)
|
1164
|
+
{
|
1165
|
+
char *ptr;
|
1166
|
+
long len;
|
1167
|
+
|
1168
|
+
ptr = RSTRING(str)->as.heap.ptr;
|
1169
|
+
len = RSTRING(str)->as.heap.len;
|
1170
|
+
TERM_FILL(ptr + len, rb_enc_mbminlen(enc));
|
670
1171
|
}
|
671
1172
|
|
672
1173
|
static inline VALUE
|
673
|
-
|
674
|
-
|
1174
|
+
cs_apply_to_str(VALUE set, VALUE str, int delete, int bang)
|
1175
|
+
{
|
1176
|
+
cs_ar *cps;
|
1177
|
+
cs_cp len;
|
675
1178
|
rb_encoding *str_enc;
|
676
|
-
VALUE orig_len,
|
677
|
-
int
|
1179
|
+
VALUE orig_len, new_str_buf;
|
1180
|
+
int cp_len;
|
678
1181
|
unsigned int str_cp;
|
679
1182
|
const char *ptr, *end;
|
680
1183
|
|
681
1184
|
raise_arg_err_unless_string(str);
|
682
1185
|
|
683
|
-
|
1186
|
+
cps = cs_fetch_cps(set, &len);
|
684
1187
|
|
685
1188
|
orig_len = RSTRING_LEN(str);
|
686
|
-
|
687
|
-
|
1189
|
+
if (orig_len < 1) // empty string, will never change
|
1190
|
+
{
|
1191
|
+
if (bang)
|
1192
|
+
{
|
1193
|
+
return Qnil;
|
1194
|
+
}
|
1195
|
+
return rb_str_dup(str);
|
1196
|
+
}
|
1197
|
+
|
1198
|
+
new_str_buf = rb_str_buf_new(orig_len + 30); // len + margin
|
688
1199
|
str_enc = rb_enc_get(str);
|
689
1200
|
rb_enc_associate(new_str_buf, str_enc);
|
690
|
-
|
691
|
-
|
1201
|
+
rb_str_modify(new_str_buf);
|
1202
|
+
ENC_CODERANGE_SET(new_str_buf, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
692
1203
|
|
693
1204
|
ptr = RSTRING_PTR(str);
|
694
1205
|
end = RSTRING_END(str);
|
695
1206
|
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
1207
|
+
if (single_byte_optimizable(str))
|
1208
|
+
{
|
1209
|
+
while (ptr < end)
|
1210
|
+
{
|
1211
|
+
str_cp = *ptr & 0xff;
|
1212
|
+
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1213
|
+
{
|
1214
|
+
cs_str_buf_cat(new_str_buf, ptr, 1);
|
1215
|
+
}
|
1216
|
+
ptr++;
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
else // likely to be multibyte string
|
1220
|
+
{
|
1221
|
+
while (ptr < end)
|
1222
|
+
{
|
1223
|
+
str_cp = rb_enc_codepoint_len(ptr, end, &cp_len, str_enc);
|
1224
|
+
if ((!tst_cp(cps, len, str_cp)) == delete)
|
1225
|
+
{
|
1226
|
+
cs_str_buf_cat(new_str_buf, ptr, cp_len);
|
1227
|
+
}
|
1228
|
+
ptr += cp_len;
|
701
1229
|
}
|
702
|
-
ptr += n;
|
703
1230
|
}
|
704
1231
|
|
705
|
-
|
706
|
-
|
1232
|
+
cs_str_buf_terminate(new_str_buf, str_enc);
|
1233
|
+
|
1234
|
+
if (bang)
|
1235
|
+
{
|
1236
|
+
if (RSTRING_LEN(new_str_buf) == (long)orig_len) // string unchanged
|
1237
|
+
{
|
1238
|
+
return Qnil;
|
1239
|
+
}
|
707
1240
|
rb_str_shared_replace(str, new_str_buf);
|
708
1241
|
}
|
709
|
-
else
|
1242
|
+
else
|
1243
|
+
{
|
710
1244
|
RB_OBJ_WRITE(new_str_buf, &(RBASIC(new_str_buf))->klass, rb_obj_class(str));
|
711
|
-
// slightly cumbersome approach needed for compatibility with Ruby < 2.3:
|
712
|
-
RBASIC(new_str_buf)->flags |= (RBASIC(str)->flags&(FL_TAINT));
|
713
1245
|
str = new_str_buf;
|
714
1246
|
}
|
715
1247
|
|
@@ -717,98 +1249,115 @@ apply_to_str(VALUE set, VALUE str, int delete, int bang) {
|
|
717
1249
|
}
|
718
1250
|
|
719
1251
|
static VALUE
|
720
|
-
|
721
|
-
|
1252
|
+
cs_method_delete_in(VALUE self, VALUE str)
|
1253
|
+
{
|
1254
|
+
return cs_apply_to_str(self, str, 1, 0);
|
1255
|
+
}
|
1256
|
+
|
1257
|
+
static VALUE
|
1258
|
+
cs_method_delete_in_bang(VALUE self, VALUE str)
|
1259
|
+
{
|
1260
|
+
return cs_apply_to_str(self, str, 1, 1);
|
722
1261
|
}
|
723
1262
|
|
724
1263
|
static VALUE
|
725
|
-
|
726
|
-
|
1264
|
+
cs_method_keep_in(VALUE self, VALUE str)
|
1265
|
+
{
|
1266
|
+
return cs_apply_to_str(self, str, 0, 0);
|
727
1267
|
}
|
728
1268
|
|
729
1269
|
static VALUE
|
730
|
-
|
731
|
-
|
1270
|
+
cs_method_keep_in_bang(VALUE self, VALUE str)
|
1271
|
+
{
|
1272
|
+
return cs_apply_to_str(self, str, 0, 1);
|
732
1273
|
}
|
733
1274
|
|
734
1275
|
static VALUE
|
735
|
-
|
736
|
-
|
1276
|
+
cs_method_allocated_length(VALUE self)
|
1277
|
+
{
|
1278
|
+
return LONG2FIX(cs_fetch_data(self)->len);
|
737
1279
|
}
|
738
1280
|
|
739
1281
|
// ****
|
740
1282
|
// init
|
741
1283
|
// ****
|
742
1284
|
|
743
|
-
void
|
744
|
-
Init_character_set()
|
1285
|
+
void Init_character_set()
|
745
1286
|
{
|
746
1287
|
VALUE cs = rb_define_class("CharacterSet", rb_cObject);
|
747
1288
|
|
748
|
-
rb_define_alloc_func(cs,
|
1289
|
+
rb_define_alloc_func(cs, cs_method_allocate);
|
749
1290
|
|
750
1291
|
// `Set` compatibility methods
|
751
1292
|
|
752
|
-
rb_define_method(cs, "each",
|
753
|
-
rb_define_method(cs, "to_a",
|
754
|
-
rb_define_method(cs, "length",
|
755
|
-
rb_define_method(cs, "size",
|
756
|
-
rb_define_method(cs, "
|
757
|
-
rb_define_method(cs, "
|
758
|
-
rb_define_method(cs, "
|
759
|
-
rb_define_method(cs, "
|
760
|
-
rb_define_method(cs, "
|
761
|
-
rb_define_method(cs, "
|
762
|
-
rb_define_method(cs, "
|
763
|
-
rb_define_method(cs, "
|
764
|
-
rb_define_method(cs, "
|
765
|
-
rb_define_method(cs, "
|
766
|
-
rb_define_method(cs, "
|
767
|
-
rb_define_method(cs, "
|
768
|
-
rb_define_method(cs, "
|
769
|
-
rb_define_method(cs, "
|
770
|
-
rb_define_method(cs, "
|
771
|
-
rb_define_method(cs, "
|
772
|
-
rb_define_method(cs, "
|
773
|
-
rb_define_method(cs, "
|
774
|
-
rb_define_method(cs, "
|
775
|
-
rb_define_method(cs, "add
|
776
|
-
rb_define_method(cs, "
|
777
|
-
rb_define_method(cs, "
|
778
|
-
rb_define_method(cs, "
|
779
|
-
rb_define_method(cs, "
|
780
|
-
rb_define_method(cs, "
|
781
|
-
rb_define_method(cs, "
|
782
|
-
rb_define_method(cs, "
|
783
|
-
rb_define_method(cs, "
|
784
|
-
rb_define_method(cs, "
|
785
|
-
rb_define_method(cs, "
|
786
|
-
rb_define_method(cs, "
|
787
|
-
rb_define_method(cs, "
|
788
|
-
rb_define_method(cs, "
|
789
|
-
rb_define_method(cs, "
|
790
|
-
rb_define_method(cs, "
|
791
|
-
rb_define_method(cs, "
|
792
|
-
rb_define_method(cs, "
|
793
|
-
rb_define_method(cs, "
|
1293
|
+
rb_define_method(cs, "each", cs_method_each, 0);
|
1294
|
+
rb_define_method(cs, "to_a", cs_method_to_a, -1);
|
1295
|
+
rb_define_method(cs, "length", cs_method_length, 0);
|
1296
|
+
rb_define_method(cs, "size", cs_method_length, 0);
|
1297
|
+
rb_define_method(cs, "empty?", cs_method_empty_p, 0);
|
1298
|
+
rb_define_method(cs, "hash", cs_method_hash, 0);
|
1299
|
+
rb_define_method(cs, "keep_if", cs_method_keep_if, 0);
|
1300
|
+
rb_define_method(cs, "delete_if", cs_method_delete_if, 0);
|
1301
|
+
rb_define_method(cs, "clear", cs_method_clear, 0);
|
1302
|
+
rb_define_method(cs, "min", cs_method_min, 0);
|
1303
|
+
rb_define_method(cs, "max", cs_method_max, 0);
|
1304
|
+
rb_define_method(cs, "minmax", cs_method_minmax, 0);
|
1305
|
+
rb_define_method(cs, "intersection", cs_method_intersection, 1);
|
1306
|
+
rb_define_method(cs, "&", cs_method_intersection, 1);
|
1307
|
+
rb_define_method(cs, "union", cs_method_union, 1);
|
1308
|
+
rb_define_method(cs, "+", cs_method_union, 1);
|
1309
|
+
rb_define_method(cs, "|", cs_method_union, 1);
|
1310
|
+
rb_define_method(cs, "difference", cs_method_difference, 1);
|
1311
|
+
rb_define_method(cs, "-", cs_method_difference, 1);
|
1312
|
+
rb_define_method(cs, "^", cs_method_exclusion, 1);
|
1313
|
+
rb_define_method(cs, "include?", cs_method_include_p, 1);
|
1314
|
+
rb_define_method(cs, "member?", cs_method_include_p, 1);
|
1315
|
+
rb_define_method(cs, "===", cs_method_include_p, 1);
|
1316
|
+
rb_define_method(cs, "add", cs_method_add, 1);
|
1317
|
+
rb_define_method(cs, "<<", cs_method_add, 1);
|
1318
|
+
rb_define_method(cs, "add?", cs_method_add_p, 1);
|
1319
|
+
rb_define_method(cs, "delete", cs_method_delete, 1);
|
1320
|
+
rb_define_method(cs, "delete?", cs_method_delete_p, 1);
|
1321
|
+
rb_define_method(cs, "intersect?", cs_method_intersect_p, 1);
|
1322
|
+
rb_define_method(cs, "disjoint?", cs_method_disjoint_p, 1);
|
1323
|
+
rb_define_method(cs, "eql?", cs_method_eql_p, 1);
|
1324
|
+
rb_define_method(cs, "==", cs_method_eql_p, 1);
|
1325
|
+
rb_define_method(cs, "merge", cs_method_merge, 1);
|
1326
|
+
rb_define_method(cs, "initialize_clone", cs_method_initialize_copy, 1);
|
1327
|
+
rb_define_method(cs, "initialize_dup", cs_method_initialize_copy, 1);
|
1328
|
+
rb_define_method(cs, "subtract", cs_method_subtract, 1);
|
1329
|
+
rb_define_method(cs, "subset?", cs_method_subset_p, 1);
|
1330
|
+
rb_define_method(cs, "<=", cs_method_subset_p, 1);
|
1331
|
+
rb_define_method(cs, "proper_subset?", cs_method_proper_subset_p, 1);
|
1332
|
+
rb_define_method(cs, "<", cs_method_proper_subset_p, 1);
|
1333
|
+
rb_define_method(cs, "superset?", cs_method_superset_p, 1);
|
1334
|
+
rb_define_method(cs, ">=", cs_method_superset_p, 1);
|
1335
|
+
rb_define_method(cs, "proper_superset?", cs_method_proper_superset_p, 1);
|
1336
|
+
rb_define_method(cs, ">", cs_method_proper_superset_p, 1);
|
794
1337
|
|
795
1338
|
// `CharacterSet`-specific methods
|
796
1339
|
|
797
|
-
rb_define_singleton_method(cs, "from_ranges",
|
798
|
-
rb_define_singleton_method(cs, "of",
|
799
|
-
|
800
|
-
rb_define_method(cs, "ranges",
|
801
|
-
rb_define_method(cs, "sample",
|
802
|
-
rb_define_method(cs, "
|
803
|
-
rb_define_method(cs, "
|
804
|
-
rb_define_method(cs, "
|
805
|
-
rb_define_method(cs, "
|
806
|
-
rb_define_method(cs, "
|
807
|
-
rb_define_method(cs, "
|
808
|
-
rb_define_method(cs, "
|
809
|
-
rb_define_method(cs, "
|
810
|
-
rb_define_method(cs, "
|
811
|
-
rb_define_method(cs, "
|
812
|
-
rb_define_method(cs, "
|
813
|
-
rb_define_method(cs, "
|
1340
|
+
rb_define_singleton_method(cs, "from_ranges", cs_class_method_from_ranges, -2);
|
1341
|
+
rb_define_singleton_method(cs, "of", cs_class_method_of, 1);
|
1342
|
+
|
1343
|
+
rb_define_method(cs, "ranges", cs_method_ranges, 0);
|
1344
|
+
rb_define_method(cs, "sample", cs_method_sample, -1);
|
1345
|
+
rb_define_method(cs, "ext_section", cs_method_ext_section, 2);
|
1346
|
+
rb_define_method(cs, "ext_count_in_section", cs_method_ext_count_in_section, 2);
|
1347
|
+
rb_define_method(cs, "ext_section?", cs_method_ext_section_p, 2);
|
1348
|
+
rb_define_method(cs, "ext_section_ratio", cs_method_ext_section_ratio, 2);
|
1349
|
+
rb_define_method(cs, "planes", cs_method_planes, 0);
|
1350
|
+
rb_define_method(cs, "plane", cs_method_plane, 1);
|
1351
|
+
rb_define_method(cs, "member_in_plane?", cs_method_member_in_plane_p, 1);
|
1352
|
+
rb_define_method(cs, "ext_inversion", cs_method_ext_inversion, -1);
|
1353
|
+
rb_define_method(cs, "case_insensitive", cs_method_case_insensitive, 0);
|
1354
|
+
rb_define_method(cs, "count_in", cs_method_count_in, 1);
|
1355
|
+
rb_define_method(cs, "cover?", cs_method_cover_p, 1);
|
1356
|
+
rb_define_method(cs, "delete_in", cs_method_delete_in, 1);
|
1357
|
+
rb_define_method(cs, "delete_in!", cs_method_delete_in_bang, 1);
|
1358
|
+
rb_define_method(cs, "keep_in", cs_method_keep_in, 1);
|
1359
|
+
rb_define_method(cs, "keep_in!", cs_method_keep_in_bang, 1);
|
1360
|
+
rb_define_method(cs, "scan", cs_method_scan, 1);
|
1361
|
+
rb_define_method(cs, "used_by?", cs_method_used_by_p, 1);
|
1362
|
+
rb_define_method(cs, "allocated_length", cs_method_allocated_length, 0);
|
814
1363
|
}
|