string_view 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +25 -4
- data/ext/string_view/extconf.rb +1 -1
- data/ext/string_view/string_view.c +878 -253
- data/ext/string_view/string_view.h +133 -0
- data/ext/string_view/string_view_core_ext.c +29 -0
- data/ext/string_view/string_view_pool.c +208 -0
- data/ext/string_view/string_view_strict.c +102 -0
- data/lib/string_view/core_ext.rb +5 -0
- data/lib/string_view/version.rb +1 -1
- metadata +7 -2
|
@@ -1,45 +1,25 @@
|
|
|
1
|
-
#include "
|
|
2
|
-
#include "ruby/encoding.h"
|
|
3
|
-
#include "ruby/re.h"
|
|
4
|
-
#include "simdutf_c.h"
|
|
5
|
-
|
|
6
|
-
#define SV_LIKELY(x) __builtin_expect(!!(x), 1)
|
|
7
|
-
#define SV_UNLIKELY(x) __builtin_expect(!!(x), 0)
|
|
8
|
-
|
|
9
|
-
#ifdef __GNUC__
|
|
10
|
-
#define SV_INLINE static inline __attribute__((always_inline))
|
|
11
|
-
#else
|
|
12
|
-
#define SV_INLINE static inline
|
|
13
|
-
#endif
|
|
1
|
+
#include "string_view.h"
|
|
14
2
|
|
|
15
3
|
/* ========================================================================= */
|
|
16
|
-
/*
|
|
4
|
+
/* Globals */
|
|
17
5
|
/* ========================================================================= */
|
|
18
6
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
*/
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
long offset; /* byte offset into backing */
|
|
36
|
-
long length; /* byte length of this view */
|
|
37
|
-
long charlen; /* cached character count; -1 = not yet computed */
|
|
38
|
-
int single_byte; /* cached: 1 if char==byte (ASCII/single-byte enc), 0 if multibyte, -1 unknown */
|
|
39
|
-
stride_index_t *stride_idx; /* lazily built stride index for multibyte, NULL if not built */
|
|
40
|
-
} string_view_t;
|
|
41
|
-
|
|
42
|
-
static VALUE cStringView;
|
|
7
|
+
VALUE cStringView;
|
|
8
|
+
VALUE cStringViewStrict;
|
|
9
|
+
VALUE eWouldAllocate;
|
|
10
|
+
|
|
11
|
+
/* Cached method IDs — initialized once in Init_string_view */
|
|
12
|
+
static ID id_index, id_rindex, id_byteindex, id_byterindex;
|
|
13
|
+
static ID id_match, id_match_p, id_match_op;
|
|
14
|
+
static ID id_begin, id_aref;
|
|
15
|
+
static ID id_upcase, id_downcase, id_capitalize, id_swapcase;
|
|
16
|
+
static ID id_strip, id_lstrip, id_rstrip;
|
|
17
|
+
static ID id_chomp, id_chop, id_reverse, id_squeeze;
|
|
18
|
+
static ID id_encode, id_gsub, id_sub, id_tr, id_tr_s;
|
|
19
|
+
static ID id_delete, id_count, id_scan, id_split;
|
|
20
|
+
static ID id_center, id_ljust, id_rjust;
|
|
21
|
+
static ID id_format_op, id_plus, id_multiply;
|
|
22
|
+
static ID id_unpack1, id_scrub, id_unicode_normalize;
|
|
43
23
|
|
|
44
24
|
/*
|
|
45
25
|
* GC callbacks.
|
|
@@ -70,10 +50,7 @@ static void sv_compact(void *ptr) {
|
|
|
70
50
|
|
|
71
51
|
static void sv_free(void *ptr) {
|
|
72
52
|
string_view_t *sv = (string_view_t *)ptr;
|
|
73
|
-
|
|
74
|
-
xfree(sv->stride_idx->offsets);
|
|
75
|
-
xfree(sv->stride_idx);
|
|
76
|
-
}
|
|
53
|
+
sv_clear_stride_index(sv);
|
|
77
54
|
}
|
|
78
55
|
|
|
79
56
|
static size_t sv_memsize(const void *ptr) {
|
|
@@ -85,14 +62,20 @@ static size_t sv_memsize(const void *ptr) {
|
|
|
85
62
|
return size;
|
|
86
63
|
}
|
|
87
64
|
|
|
88
|
-
|
|
65
|
+
const rb_data_type_t string_view_type = {
|
|
89
66
|
.wrap_struct_name = "StringView",
|
|
90
67
|
.function = { .dmark = sv_mark, .dfree = sv_free, .dsize = sv_memsize, .dcompact = sv_compact },
|
|
91
|
-
.flags = RUBY_TYPED_FREE_IMMEDIATELY |
|
|
68
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE,
|
|
92
69
|
};
|
|
93
70
|
|
|
94
|
-
/* Forward declarations */
|
|
95
|
-
static
|
|
71
|
+
/* Forward declarations for functions defined later in this file */
|
|
72
|
+
static long sv_char_count(string_view_t *sv);
|
|
73
|
+
static long sv_char_to_byte_offset(string_view_t *sv, long char_idx);
|
|
74
|
+
static long sv_char_count_partial(string_view_t *sv, const char *p, long len);
|
|
75
|
+
SV_INLINE int sv_single_byte_optimizable(string_view_t *sv);
|
|
76
|
+
SV_INLINE int sv_is_7bit(string_view_t *sv);
|
|
77
|
+
SV_INLINE int sv_is_utf8(string_view_t *sv);
|
|
78
|
+
static long sv_utf8_char_count(const char *p, long len);
|
|
96
79
|
|
|
97
80
|
/* ========================================================================= */
|
|
98
81
|
/* Internal helpers */
|
|
@@ -122,10 +105,12 @@ static VALUE sv_as_shared_str(string_view_t *sv) {
|
|
|
122
105
|
return shared;
|
|
123
106
|
}
|
|
124
107
|
|
|
125
|
-
/* Allocate a new StringView from a parent that already has cached base/enc
|
|
126
|
-
|
|
108
|
+
/* Allocate a new StringView from a parent that already has cached base/enc.
|
|
109
|
+
* Preserves the class of parent_obj (StringView or StringView::Strict). */
|
|
110
|
+
SV_INLINE VALUE sv_new_from_parent_obj(VALUE parent_obj, string_view_t *parent, long offset, long length) {
|
|
127
111
|
string_view_t *sv;
|
|
128
|
-
VALUE
|
|
112
|
+
VALUE klass = rb_obj_class(parent_obj);
|
|
113
|
+
VALUE obj = TypedData_Make_Struct(klass, string_view_t,
|
|
129
114
|
&string_view_type, sv);
|
|
130
115
|
RB_OBJ_WRITE(obj, &sv->backing, parent->backing);
|
|
131
116
|
sv->base = parent->base;
|
|
@@ -133,9 +118,11 @@ SV_INLINE VALUE sv_new_from_parent(string_view_t *parent, long offset, long leng
|
|
|
133
118
|
sv->offset = offset;
|
|
134
119
|
sv->length = length;
|
|
135
120
|
sv->single_byte = parent->single_byte;
|
|
121
|
+
sv->valid_encoding = sv->single_byte == 1 ? 1 : -1;
|
|
122
|
+
sv->pooled = 0;
|
|
136
123
|
sv->charlen = -1;
|
|
137
124
|
sv->stride_idx = NULL;
|
|
138
|
-
|
|
125
|
+
/* Not frozen — see sv_initialize comment for rationale */
|
|
139
126
|
return obj;
|
|
140
127
|
}
|
|
141
128
|
|
|
@@ -154,6 +141,8 @@ static VALUE sv_alloc(VALUE klass) {
|
|
|
154
141
|
sv->offset = 0;
|
|
155
142
|
sv->length = 0;
|
|
156
143
|
sv->single_byte = -1;
|
|
144
|
+
sv->valid_encoding = -1;
|
|
145
|
+
sv->pooled = 0;
|
|
157
146
|
sv->charlen = -1;
|
|
158
147
|
sv->stride_idx = NULL;
|
|
159
148
|
return obj;
|
|
@@ -169,13 +158,7 @@ static VALUE sv_initialize(int argc, VALUE *argv, VALUE self) {
|
|
|
169
158
|
|
|
170
159
|
rb_scan_args(argc, argv, "12", &str, &voffset, &vlength);
|
|
171
160
|
|
|
172
|
-
|
|
173
|
-
rb_raise(rb_eTypeError,
|
|
174
|
-
"no implicit conversion of %s into String",
|
|
175
|
-
rb_obj_classname(str));
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
rb_str_freeze(str);
|
|
161
|
+
sv_check_frozen_string(str);
|
|
179
162
|
|
|
180
163
|
long backing_len = RSTRING_LEN(str);
|
|
181
164
|
|
|
@@ -185,26 +168,20 @@ static VALUE sv_initialize(int argc, VALUE *argv, VALUE self) {
|
|
|
185
168
|
} else {
|
|
186
169
|
offset = NUM2LONG(voffset);
|
|
187
170
|
length = NUM2LONG(vlength);
|
|
188
|
-
|
|
189
|
-
if (offset < 0 || length < 0 || offset + length > backing_len) {
|
|
190
|
-
rb_raise(rb_eArgError,
|
|
191
|
-
"offset %ld, length %ld out of range for string of bytesize %ld",
|
|
192
|
-
offset, length, backing_len);
|
|
193
|
-
}
|
|
171
|
+
sv_check_bounds(offset, length, backing_len);
|
|
194
172
|
}
|
|
195
173
|
|
|
196
174
|
string_view_t *sv = sv_get_struct(self);
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
sv->base = RSTRING_PTR(str);
|
|
200
|
-
sv->enc = enc;
|
|
201
|
-
sv->offset = offset;
|
|
202
|
-
sv->length = length;
|
|
203
|
-
sv->single_byte = sv_compute_single_byte(str, enc);
|
|
204
|
-
sv->charlen = -1;
|
|
205
|
-
sv->stride_idx = NULL;
|
|
175
|
+
sv_init_fields(self, sv, str, RSTRING_PTR(str), rb_enc_get(str),
|
|
176
|
+
offset, length);
|
|
206
177
|
|
|
207
|
-
|
|
178
|
+
/*
|
|
179
|
+
* We intentionally do NOT freeze self. StringView blocks content
|
|
180
|
+
* mutation via the immutable frozen backing and explicit FrozenError
|
|
181
|
+
* on bang methods. Not freezing allows reset! to work without
|
|
182
|
+
* violating Ruby's frozen? contract — libraries and Ruby itself
|
|
183
|
+
* use frozen? to assume immutability for hash keys and Ractor sharing.
|
|
184
|
+
*/
|
|
208
185
|
|
|
209
186
|
return self;
|
|
210
187
|
}
|
|
@@ -236,43 +213,29 @@ static VALUE sv_inspect(VALUE self) {
|
|
|
236
213
|
(void *)self, content, sv->offset, sv->length);
|
|
237
214
|
}
|
|
238
215
|
|
|
239
|
-
static VALUE sv_frozen_p(VALUE self) {
|
|
240
|
-
return Qtrue;
|
|
241
|
-
}
|
|
242
|
-
|
|
243
216
|
/*
|
|
244
217
|
* reset!(new_backing, byte_offset, byte_length) -> self
|
|
245
218
|
*/
|
|
246
219
|
static VALUE sv_reset(VALUE self, VALUE new_backing, VALUE voffset, VALUE vlength) {
|
|
220
|
+
rb_check_frozen(self);
|
|
247
221
|
string_view_t *sv = sv_get_struct(self);
|
|
248
222
|
|
|
249
|
-
if (
|
|
250
|
-
rb_raise(
|
|
251
|
-
"
|
|
252
|
-
rb_obj_classname(new_backing));
|
|
223
|
+
if (SV_UNLIKELY(sv->pooled)) {
|
|
224
|
+
rb_raise(rb_eRuntimeError,
|
|
225
|
+
"can't reset a pooled StringView directly; call StringView::Pool#reset! instead");
|
|
253
226
|
}
|
|
254
227
|
|
|
255
|
-
|
|
228
|
+
sv_check_frozen_string(new_backing);
|
|
256
229
|
|
|
257
230
|
long off = NUM2LONG(voffset);
|
|
258
231
|
long len = NUM2LONG(vlength);
|
|
259
|
-
|
|
232
|
+
sv_check_bounds(off, len, RSTRING_LEN(new_backing));
|
|
260
233
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
"offset %ld, length %ld out of range for string of bytesize %ld",
|
|
264
|
-
off, len, backing_len);
|
|
265
|
-
}
|
|
234
|
+
/* Free old stride index before reinitializing */
|
|
235
|
+
sv_clear_stride_index(sv);
|
|
266
236
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
sv->base = RSTRING_PTR(new_backing);
|
|
270
|
-
sv->enc = enc;
|
|
271
|
-
sv->offset = off;
|
|
272
|
-
sv->length = len;
|
|
273
|
-
sv->single_byte = sv_compute_single_byte(new_backing, enc);
|
|
274
|
-
sv->charlen = -1;
|
|
275
|
-
sv->stride_idx = NULL;
|
|
237
|
+
sv_init_fields(self, sv, new_backing, RSTRING_PTR(new_backing),
|
|
238
|
+
rb_enc_get(new_backing), off, len);
|
|
276
239
|
|
|
277
240
|
return self;
|
|
278
241
|
}
|
|
@@ -306,6 +269,8 @@ static VALUE sv_encoding(VALUE self) {
|
|
|
306
269
|
|
|
307
270
|
static VALUE sv_ascii_only_p(VALUE self) {
|
|
308
271
|
string_view_t *sv = sv_get_struct(self);
|
|
272
|
+
if (sv_single_byte_optimizable(sv)) return Qtrue;
|
|
273
|
+
/* single_byte resolved to 0 (multibyte) — scan to confirm non-ASCII bytes */
|
|
309
274
|
const char *p = sv_ptr(sv);
|
|
310
275
|
long i;
|
|
311
276
|
for (i = 0; i < sv->length; i++) {
|
|
@@ -314,6 +279,94 @@ static VALUE sv_ascii_only_p(VALUE self) {
|
|
|
314
279
|
return Qtrue;
|
|
315
280
|
}
|
|
316
281
|
|
|
282
|
+
/*
 * Byte length of the character that starts at p, looking no further than e,
 * in encoding enc. When rb_enc_precise_mbclen does not report a found
 * character, 1 is returned so that scanning loops still move forward.
 */
SV_INLINE long sv_precise_char_len(const char *p, const char *e, rb_encoding *enc) {
    const int r = rb_enc_precise_mbclen(p, e, enc);
    return MBCLEN_CHARFOUND_P(r) ? MBCLEN_CHARFOUND_LEN(r) : 1;
}
|
|
287
|
+
|
|
288
|
+
/*
 * Decide whether the bytes covered by this view are valid in its encoding.
 * Returns 1 when valid, 0 otherwise.
 *
 *  - single-byte-optimizable views are reported valid without scanning
 *  - UTF-8 views are checked with simdutf_validate_utf8
 *  - any other encoding is walked one character at a time
 */
static int sv_compute_valid_encoding_slice(string_view_t *sv) {
    if (sv_single_byte_optimizable(sv)) {
        return 1;
    }

    if (SV_LIKELY(sv_is_utf8(sv))) {
        return simdutf_validate_utf8(sv_ptr(sv), (size_t)sv->length) ? 1 : 0;
    }

    rb_encoding *enc = sv_enc(sv);
    const char *cur = sv_ptr(sv);
    const char *const end = cur + sv->length;
    int valid = 1;

    /* Stop at the first position where no complete character is found. */
    while (valid && cur < end) {
        const int r = rb_enc_precise_mbclen(cur, end, enc);
        if (MBCLEN_CHARFOUND_P(r)) {
            cur += MBCLEN_CHARFOUND_LEN(r);
        } else {
            valid = 0;
        }
    }

    return valid;
}
|
|
307
|
+
|
|
308
|
+
/*
 * Memoized validity check: sv->valid_encoding holds 0 or 1 once computed and
 * a negative value while still unknown. The slice is validated only the first
 * time the answer is needed.
 */
SV_INLINE int sv_valid_encoding_cached(string_view_t *sv) {
    if (SV_UNLIKELY(sv->valid_encoding < 0)) {
        sv->valid_encoding = sv_compute_valid_encoding_slice(sv);
    }
    return sv->valid_encoding;
}
|
|
313
|
+
|
|
314
|
+
/*
 * Count characters in [p, e) for encoding enc. Each step advances by
 * sv_precise_char_len, so a position without a found character still
 * counts as one character of one byte.
 */
static long sv_tolerant_char_count(const char *p, const char *e, rb_encoding *enc) {
    long chars;

    for (chars = 0; p < e; chars++) {
        p += sv_precise_char_len(p, e, enc);
    }

    return chars;
}
|
|
324
|
+
|
|
325
|
+
/*
 * Translate a character index into a byte offset relative to the start of
 * the view, stepping with sv_precise_char_len. Returns -1 when the view ends
 * before char_idx characters have been walked.
 */
static long sv_tolerant_char_to_byte_offset(string_view_t *sv, long char_idx) {
    rb_encoding *enc = sv_enc(sv);
    const char *const begin = sv_ptr(sv);
    const char *const end = begin + sv->length;
    const char *cur = begin;
    long walked;

    for (walked = 0; walked < char_idx && cur < end; walked++) {
        cur += sv_precise_char_len(cur, end, enc);
    }

    return (walked < char_idx) ? -1 : (long)(cur - begin);
}
|
|
340
|
+
|
|
341
|
+
/*
 * Starting byte_off bytes into the view, measure how many bytes up to n
 * characters occupy. The walk stops early at the end of the view, so the
 * result may cover fewer than n characters.
 */
static long sv_tolerant_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
    rb_encoding *enc = sv_enc(sv);
    const char *const from = sv_ptr(sv) + byte_off;
    const char *const limit = sv_ptr(sv) + sv->length;
    const char *cur = from;
    long remaining;

    for (remaining = n; remaining > 0 && cur < limit; remaining--) {
        cur += sv_precise_char_len(cur, limit, enc);
    }

    return cur - from;
}
|
|
355
|
+
|
|
356
|
+
/*
 * Raise rb_eEncCompatError unless `other` may be compared byte-wise with
 * the view. Accepted cases:
 *   - both sides carry the same encoding, or
 *   - both encodings are ASCII-compatible and at least one side holds only
 *     7-bit content.
 */
SV_INLINE void sv_check_compatible_string(string_view_t *sv, VALUE other) {
    rb_encoding *other_enc = rb_enc_get(other);
    int compatible = (sv->enc == other_enc);

    if (!compatible &&
        rb_enc_asciicompat(sv->enc) && rb_enc_asciicompat(other_enc)) {
        compatible = sv_is_7bit(sv) || rb_enc_str_asciionly_p(other);
    }

    if (compatible) {
        return;
    }

    rb_raise(rb_eEncCompatError,
             "incompatible character encodings: %s and %s",
             rb_enc_name(sv->enc), rb_enc_name(other_enc));
}
|
|
369
|
+
|
|
317
370
|
/* ========================================================================= */
|
|
318
371
|
/* Tier 1: Searching */
|
|
319
372
|
/* ========================================================================= */
|
|
@@ -324,6 +377,7 @@ static VALUE sv_include_p(VALUE self, VALUE substr) {
|
|
|
324
377
|
const char *p = sv_ptr(sv);
|
|
325
378
|
long slen = RSTRING_LEN(substr);
|
|
326
379
|
if (slen == 0) return Qtrue;
|
|
380
|
+
sv_check_compatible_string(sv, substr);
|
|
327
381
|
if (slen > sv->length) return Qfalse;
|
|
328
382
|
|
|
329
383
|
long pos = rb_memsearch(RSTRING_PTR(substr), slen, p, sv->length, sv_enc(sv));
|
|
@@ -339,6 +393,8 @@ static VALUE sv_start_with_p(int argc, VALUE *argv, VALUE self) {
|
|
|
339
393
|
VALUE prefix = argv[i];
|
|
340
394
|
StringValue(prefix);
|
|
341
395
|
long plen = RSTRING_LEN(prefix);
|
|
396
|
+
if (plen == 0) return Qtrue;
|
|
397
|
+
sv_check_compatible_string(sv, prefix);
|
|
342
398
|
if (plen > sv->length) continue;
|
|
343
399
|
if (memcmp(p, RSTRING_PTR(prefix), plen) == 0) return Qtrue;
|
|
344
400
|
}
|
|
@@ -354,6 +410,8 @@ static VALUE sv_end_with_p(int argc, VALUE *argv, VALUE self) {
|
|
|
354
410
|
VALUE suffix = argv[i];
|
|
355
411
|
StringValue(suffix);
|
|
356
412
|
long slen = RSTRING_LEN(suffix);
|
|
413
|
+
if (slen == 0) return Qtrue;
|
|
414
|
+
sv_check_compatible_string(sv, suffix);
|
|
357
415
|
if (slen > sv->length) continue;
|
|
358
416
|
if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) == 0)
|
|
359
417
|
return Qtrue;
|
|
@@ -361,16 +419,127 @@ static VALUE sv_end_with_p(int argc, VALUE *argv, VALUE self) {
|
|
|
361
419
|
return Qfalse;
|
|
362
420
|
}
|
|
363
421
|
|
|
364
|
-
|
|
422
|
+
/*
|
|
423
|
+
* index(substring[, offset]) → Integer or nil
|
|
424
|
+
*
|
|
425
|
+
* For String arguments: native zero-alloc implementation using rb_memsearch.
|
|
426
|
+
* For Regexp arguments: delegates to String#index via shared string.
|
|
427
|
+
*/
|
|
428
|
+
VALUE sv_index(int argc, VALUE *argv, VALUE self) {
|
|
365
429
|
string_view_t *sv = sv_get_struct(self);
|
|
366
|
-
VALUE
|
|
367
|
-
|
|
430
|
+
VALUE pattern, voffset;
|
|
431
|
+
rb_scan_args(argc, argv, "11", &pattern, &voffset);
|
|
432
|
+
|
|
433
|
+
/* Regexp path: delegate via shared string */
|
|
434
|
+
if (rb_obj_is_kind_of(pattern, rb_cRegexp)) {
|
|
435
|
+
VALUE shared = sv_as_shared_str(sv);
|
|
436
|
+
return rb_funcallv(shared, id_index, argc, argv);
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
StringValue(pattern);
|
|
440
|
+
const char *p = sv_ptr(sv);
|
|
441
|
+
long plen = RSTRING_LEN(pattern);
|
|
442
|
+
|
|
443
|
+
/* Determine starting char offset */
|
|
444
|
+
long char_off = NIL_P(voffset) ? 0 : NUM2LONG(voffset);
|
|
445
|
+
long total_chars = sv_char_count(sv);
|
|
446
|
+
|
|
447
|
+
if (char_off < 0) char_off += total_chars;
|
|
448
|
+
if (char_off < 0 || char_off > total_chars) return Qnil;
|
|
449
|
+
if (plen == 0) return LONG2NUM(char_off);
|
|
450
|
+
sv_check_compatible_string(sv, pattern);
|
|
451
|
+
|
|
452
|
+
/* Convert char offset to byte offset */
|
|
453
|
+
long byte_off = sv_char_to_byte_offset(sv, char_off);
|
|
454
|
+
if (byte_off < 0) return Qnil;
|
|
455
|
+
|
|
456
|
+
if (plen > sv->length - byte_off) return Qnil;
|
|
457
|
+
|
|
458
|
+
long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
|
|
459
|
+
p + byte_off, sv->length - byte_off,
|
|
460
|
+
sv_enc(sv));
|
|
461
|
+
if (pos < 0 || pos > sv->length - byte_off - plen) return Qnil;
|
|
462
|
+
|
|
463
|
+
/* Convert byte position back to character position */
|
|
464
|
+
if (sv_single_byte_optimizable(sv)) {
|
|
465
|
+
return LONG2NUM(char_off + pos);
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
return LONG2NUM(char_off + sv_char_count_partial(sv, p + byte_off, pos));
|
|
368
469
|
}
|
|
369
470
|
|
|
370
|
-
|
|
471
|
+
/*
|
|
472
|
+
* rindex(substring[, offset]) → Integer or nil
|
|
473
|
+
*
|
|
474
|
+
* For String arguments: native zero-alloc reverse search.
|
|
475
|
+
* For Regexp arguments: delegates to String#rindex via shared string.
|
|
476
|
+
*/
|
|
477
|
+
VALUE sv_rindex(int argc, VALUE *argv, VALUE self) {
|
|
371
478
|
string_view_t *sv = sv_get_struct(self);
|
|
372
|
-
VALUE
|
|
373
|
-
|
|
479
|
+
VALUE pattern, voffset;
|
|
480
|
+
rb_scan_args(argc, argv, "11", &pattern, &voffset);
|
|
481
|
+
|
|
482
|
+
/* Regexp path: delegate */
|
|
483
|
+
if (rb_obj_is_kind_of(pattern, rb_cRegexp)) {
|
|
484
|
+
VALUE shared = sv_as_shared_str(sv);
|
|
485
|
+
return rb_funcallv(shared, id_rindex, argc, argv);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
StringValue(pattern);
|
|
489
|
+
const char *p = sv_ptr(sv);
|
|
490
|
+
long plen = RSTRING_LEN(pattern);
|
|
491
|
+
long total_chars = sv_char_count(sv);
|
|
492
|
+
|
|
493
|
+
/* Determine the maximum char position to search from */
|
|
494
|
+
long max_char;
|
|
495
|
+
if (NIL_P(voffset)) {
|
|
496
|
+
max_char = total_chars;
|
|
497
|
+
} else {
|
|
498
|
+
max_char = NUM2LONG(voffset);
|
|
499
|
+
if (max_char < 0) max_char += total_chars;
|
|
500
|
+
if (max_char < 0) return Qnil;
|
|
501
|
+
if (max_char > total_chars) max_char = total_chars;
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
if (plen == 0) {
|
|
505
|
+
return LONG2NUM(max_char > total_chars ? total_chars : max_char);
|
|
506
|
+
}
|
|
507
|
+
sv_check_compatible_string(sv, pattern);
|
|
508
|
+
if (plen > sv->length) return Qnil;
|
|
509
|
+
|
|
510
|
+
/* Convert max_char to a byte limit */
|
|
511
|
+
long max_byte = sv_char_to_byte_offset(sv, max_char);
|
|
512
|
+
if (max_byte < 0) max_byte = sv->length;
|
|
513
|
+
|
|
514
|
+
/* Ensure we don't search past the point where the pattern can't fit */
|
|
515
|
+
long search_end = max_byte;
|
|
516
|
+
if (search_end + plen > sv->length) {
|
|
517
|
+
search_end = sv->length - plen;
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
/* Reverse byte search */
|
|
521
|
+
const char *needle = RSTRING_PTR(pattern);
|
|
522
|
+
const char *s;
|
|
523
|
+
for (s = p + search_end; s >= p; ) {
|
|
524
|
+
if (memcmp(s, needle, plen) == 0) {
|
|
525
|
+
long byte_pos = s - p;
|
|
526
|
+
/* Convert byte position to char position */
|
|
527
|
+
if (sv_single_byte_optimizable(sv)) {
|
|
528
|
+
return LONG2NUM(byte_pos);
|
|
529
|
+
}
|
|
530
|
+
return LONG2NUM(sv_char_count_partial(sv, p, byte_pos));
|
|
531
|
+
}
|
|
532
|
+
/* Move back one character */
|
|
533
|
+
if (s == p) break;
|
|
534
|
+
if (sv_single_byte_optimizable(sv)) {
|
|
535
|
+
s--;
|
|
536
|
+
} else {
|
|
537
|
+
rb_encoding *enc = sv_enc(sv);
|
|
538
|
+
s = rb_enc_prev_char(p, s, p + sv->length, enc);
|
|
539
|
+
if (s == NULL) break;
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
return Qnil;
|
|
374
543
|
}
|
|
375
544
|
|
|
376
545
|
static VALUE sv_getbyte(VALUE self, VALUE vidx) {
|
|
@@ -381,16 +550,85 @@ static VALUE sv_getbyte(VALUE self, VALUE vidx) {
|
|
|
381
550
|
return INT2FIX((unsigned char)sv_ptr(sv)[idx]);
|
|
382
551
|
}
|
|
383
552
|
|
|
384
|
-
|
|
553
|
+
/*
|
|
554
|
+
* byteindex(substring[, offset]) → Integer or nil
|
|
555
|
+
*
|
|
556
|
+
* For String arguments: native zero-alloc byte-level search.
|
|
557
|
+
* For Regexp arguments: delegates to String#byteindex via shared string.
|
|
558
|
+
*/
|
|
559
|
+
VALUE sv_byteindex(int argc, VALUE *argv, VALUE self) {
|
|
385
560
|
string_view_t *sv = sv_get_struct(self);
|
|
386
|
-
VALUE
|
|
387
|
-
|
|
561
|
+
VALUE pattern, voffset;
|
|
562
|
+
rb_scan_args(argc, argv, "11", &pattern, &voffset);
|
|
563
|
+
|
|
564
|
+
if (rb_obj_is_kind_of(pattern, rb_cRegexp)) {
|
|
565
|
+
VALUE shared = sv_as_shared_str(sv);
|
|
566
|
+
return rb_funcallv(shared, id_byteindex, argc, argv);
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
StringValue(pattern);
|
|
570
|
+
const char *p = sv_ptr(sv);
|
|
571
|
+
long plen = RSTRING_LEN(pattern);
|
|
572
|
+
long byte_off = NIL_P(voffset) ? 0 : NUM2LONG(voffset);
|
|
573
|
+
|
|
574
|
+
if (byte_off < 0) byte_off += sv->length;
|
|
575
|
+
if (byte_off < 0 || byte_off > sv->length) return Qnil;
|
|
576
|
+
if (plen == 0) return LONG2NUM(byte_off);
|
|
577
|
+
sv_check_compatible_string(sv, pattern);
|
|
578
|
+
if (plen > sv->length - byte_off) return Qnil;
|
|
579
|
+
|
|
580
|
+
long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
|
|
581
|
+
p + byte_off, sv->length - byte_off,
|
|
582
|
+
sv_enc(sv));
|
|
583
|
+
if (pos < 0 || pos > sv->length - byte_off - plen) return Qnil;
|
|
584
|
+
return LONG2NUM(byte_off + pos);
|
|
388
585
|
}
|
|
389
586
|
|
|
390
|
-
|
|
587
|
+
/*
|
|
588
|
+
* byterindex(substring[, offset]) → Integer or nil
|
|
589
|
+
*
|
|
590
|
+
* For String arguments: native zero-alloc reverse byte-level search.
|
|
591
|
+
* For Regexp arguments: delegates to String#byterindex via shared string.
|
|
592
|
+
*/
|
|
593
|
+
VALUE sv_byterindex(int argc, VALUE *argv, VALUE self) {
|
|
391
594
|
string_view_t *sv = sv_get_struct(self);
|
|
392
|
-
VALUE
|
|
393
|
-
|
|
595
|
+
VALUE pattern, voffset;
|
|
596
|
+
rb_scan_args(argc, argv, "11", &pattern, &voffset);
|
|
597
|
+
|
|
598
|
+
if (rb_obj_is_kind_of(pattern, rb_cRegexp)) {
|
|
599
|
+
VALUE shared = sv_as_shared_str(sv);
|
|
600
|
+
return rb_funcallv(shared, id_byterindex, argc, argv);
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
StringValue(pattern);
|
|
604
|
+
const char *p = sv_ptr(sv);
|
|
605
|
+
long plen = RSTRING_LEN(pattern);
|
|
606
|
+
long max_byte;
|
|
607
|
+
|
|
608
|
+
if (NIL_P(voffset)) {
|
|
609
|
+
max_byte = sv->length;
|
|
610
|
+
} else {
|
|
611
|
+
max_byte = NUM2LONG(voffset);
|
|
612
|
+
if (max_byte < 0) max_byte += sv->length;
|
|
613
|
+
if (max_byte < 0) return Qnil;
|
|
614
|
+
if (max_byte > sv->length) max_byte = sv->length;
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
if (plen == 0) return LONG2NUM(max_byte > sv->length ? sv->length : max_byte);
|
|
618
|
+
sv_check_compatible_string(sv, pattern);
|
|
619
|
+
if (plen > sv->length) return Qnil;
|
|
620
|
+
|
|
621
|
+
long search_end = max_byte;
|
|
622
|
+
if (search_end + plen > sv->length) search_end = sv->length - plen;
|
|
623
|
+
|
|
624
|
+
const char *needle = RSTRING_PTR(pattern);
|
|
625
|
+
long i;
|
|
626
|
+
for (i = search_end; i >= 0; i--) {
|
|
627
|
+
if (memcmp(p + i, needle, plen) == 0) {
|
|
628
|
+
return LONG2NUM(i);
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
return Qnil;
|
|
394
632
|
}
|
|
395
633
|
|
|
396
634
|
/* ========================================================================= */
|
|
@@ -454,19 +692,19 @@ static VALUE sv_chars(VALUE self) {
|
|
|
454
692
|
static VALUE sv_match(int argc, VALUE *argv, VALUE self) {
|
|
455
693
|
string_view_t *sv = sv_get_struct(self);
|
|
456
694
|
VALUE shared = sv_as_shared_str(sv);
|
|
457
|
-
return rb_funcallv(shared,
|
|
695
|
+
return rb_funcallv(shared, id_match, argc, argv);
|
|
458
696
|
}
|
|
459
697
|
|
|
460
698
|
static VALUE sv_match_p(int argc, VALUE *argv, VALUE self) {
|
|
461
699
|
string_view_t *sv = sv_get_struct(self);
|
|
462
700
|
VALUE shared = sv_as_shared_str(sv);
|
|
463
|
-
return rb_funcallv(shared,
|
|
701
|
+
return rb_funcallv(shared, id_match_p, argc, argv);
|
|
464
702
|
}
|
|
465
703
|
|
|
466
704
|
static VALUE sv_match_operator(VALUE self, VALUE pattern) {
|
|
467
705
|
string_view_t *sv = sv_get_struct(self);
|
|
468
706
|
VALUE shared = sv_as_shared_str(sv);
|
|
469
|
-
return rb_funcall(shared,
|
|
707
|
+
return rb_funcall(shared, id_match_op, 1, pattern);
|
|
470
708
|
}
|
|
471
709
|
|
|
472
710
|
/* ========================================================================= */
|
|
@@ -485,6 +723,11 @@ typedef struct {
|
|
|
485
723
|
char *ptr;
|
|
486
724
|
} sv_cstr_t;
|
|
487
725
|
|
|
726
|
+
typedef struct {
|
|
727
|
+
sv_cstr_t *cs;
|
|
728
|
+
int base;
|
|
729
|
+
} sv_inum_args_t;
|
|
730
|
+
|
|
488
731
|
SV_INLINE void sv_cstr_init(sv_cstr_t *cs, string_view_t *sv) {
|
|
489
732
|
const char *p = sv_ptr(sv);
|
|
490
733
|
long len = sv->length;
|
|
@@ -505,6 +748,22 @@ SV_INLINE void sv_cstr_free(sv_cstr_t *cs) {
|
|
|
505
748
|
}
|
|
506
749
|
}
|
|
507
750
|
|
|
751
|
+
/*
 * Cleanup callback in the shape rb_ensure expects: arg is a pointer to an
 * sv_cstr_t smuggled through a VALUE; it is handed to sv_cstr_free.
 */
static VALUE sv_cstr_free_ensure(VALUE arg) {
    sv_cstr_t *buf = (sv_cstr_t *)arg;
    sv_cstr_free(buf);
    return Qnil;
}
|
|
755
|
+
|
|
756
|
+
/*
 * Protected body for the integer parsers: arg points at an sv_inum_args_t
 * carrying the C-string buffer and the numeric base. The text is parsed with
 * rb_cstr_to_inum, passing 0 as its third argument.
 */
static VALUE sv_to_i_body(VALUE arg) {
    const sv_inum_args_t *in = (const sv_inum_args_t *)arg;
    const char *text = in->cs->ptr;
    return rb_cstr_to_inum(text, in->base, 0);
}
|
|
760
|
+
|
|
761
|
+
/*
 * Protected body for to_f: arg points at the sv_cstr_t buffer. The text is
 * parsed with rb_cstr_to_dbl (second argument 0) and boxed as a Float.
 */
static VALUE sv_to_f_body(VALUE arg) {
    const sv_cstr_t *buf = (const sv_cstr_t *)arg;
    return DBL2NUM(rb_cstr_to_dbl(buf->ptr, 0));
}
|
|
766
|
+
|
|
508
767
|
/*
|
|
509
768
|
* to_i([base]) — parse integer directly from byte pointer, zero allocations.
|
|
510
769
|
* Uses rb_cstr_to_inum which parses from a NUL-terminated C string.
|
|
@@ -515,10 +774,12 @@ static VALUE sv_to_i(int argc, VALUE *argv, VALUE self) {
|
|
|
515
774
|
if (argc > 0) base = NUM2INT(argv[0]);
|
|
516
775
|
|
|
517
776
|
sv_cstr_t cs;
|
|
777
|
+
sv_inum_args_t args;
|
|
518
778
|
sv_cstr_init(&cs, sv);
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
return
|
|
779
|
+
args.cs = &cs;
|
|
780
|
+
args.base = base;
|
|
781
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
782
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
522
783
|
}
|
|
523
784
|
|
|
524
785
|
/*
|
|
@@ -528,9 +789,8 @@ static VALUE sv_to_f(VALUE self) {
|
|
|
528
789
|
string_view_t *sv = sv_get_struct(self);
|
|
529
790
|
sv_cstr_t cs;
|
|
530
791
|
sv_cstr_init(&cs, sv);
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
return DBL2NUM(d);
|
|
792
|
+
return rb_ensure(sv_to_f_body, (VALUE)&cs,
|
|
793
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
534
794
|
}
|
|
535
795
|
|
|
536
796
|
/*
|
|
@@ -539,10 +799,12 @@ static VALUE sv_to_f(VALUE self) {
|
|
|
539
799
|
static VALUE sv_hex(VALUE self) {
|
|
540
800
|
string_view_t *sv = sv_get_struct(self);
|
|
541
801
|
sv_cstr_t cs;
|
|
802
|
+
sv_inum_args_t args;
|
|
542
803
|
sv_cstr_init(&cs, sv);
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
return
|
|
804
|
+
args.cs = &cs;
|
|
805
|
+
args.base = 16;
|
|
806
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
807
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
546
808
|
}
|
|
547
809
|
|
|
548
810
|
/*
|
|
@@ -551,16 +813,52 @@ static VALUE sv_hex(VALUE self) {
|
|
|
551
813
|
static VALUE sv_oct(VALUE self) {
|
|
552
814
|
string_view_t *sv = sv_get_struct(self);
|
|
553
815
|
sv_cstr_t cs;
|
|
816
|
+
sv_inum_args_t args;
|
|
554
817
|
sv_cstr_init(&cs, sv);
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
return
|
|
818
|
+
args.cs = &cs;
|
|
819
|
+
args.base = 8;
|
|
820
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
821
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
558
822
|
}
|
|
559
823
|
|
|
560
824
|
/* ========================================================================= */
|
|
561
825
|
/* Tier 1: Comparison */
|
|
562
826
|
/* ========================================================================= */
|
|
563
827
|
|
|
828
|
+
/*
|
|
829
|
+
* Returns 1 if all bytes in the view are < 128 (7-bit ASCII).
|
|
830
|
+
* Uses the single_byte cache when available.
|
|
831
|
+
*/
|
|
832
|
+
SV_INLINE int sv_is_7bit(string_view_t *sv) {
|
|
833
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
834
|
+
long i;
|
|
835
|
+
for (i = 0; i < sv->length; i++) {
|
|
836
|
+
if (p[i] > 127) return 0;
|
|
837
|
+
}
|
|
838
|
+
return 1;
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
/*
|
|
842
|
+
* Check encoding compatibility for equality, mirroring Ruby's String#==.
|
|
843
|
+
* Two encodings are compatible for comparison if:
|
|
844
|
+
* - They are the same encoding, OR
|
|
845
|
+
* - Both are ASCII-compatible and at least one side is 7-bit
|
|
846
|
+
* (e.g. UTF-8 "hello" == US-ASCII "hello")
|
|
847
|
+
*/
|
|
848
|
+
SV_INLINE int sv_enc_compatible_for_eq(
|
|
849
|
+
rb_encoding *enc1, int is_7bit_1,
|
|
850
|
+
rb_encoding *enc2, int is_7bit_2)
|
|
851
|
+
{
|
|
852
|
+
if (enc1 == enc2) return 1;
|
|
853
|
+
if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) return 0;
|
|
854
|
+
return is_7bit_1 || is_7bit_2;
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
/*
 * True when obj's class, as reported by rb_obj_class, is exactly
 * cStringView or cStringViewStrict.
 */
SV_INLINE int sv_is_string_view(VALUE obj) {
    const VALUE klass = rb_obj_class(obj);

    if (klass == cStringView) {
        return 1;
    }
    return klass == cStringViewStrict;
}
|
|
861
|
+
|
|
564
862
|
static VALUE sv_eq(VALUE self, VALUE other) {
|
|
565
863
|
string_view_t *sv = sv_get_struct(self);
|
|
566
864
|
const char *p = sv_ptr(sv);
|
|
@@ -568,13 +866,26 @@ static VALUE sv_eq(VALUE self, VALUE other) {
|
|
|
568
866
|
/* Fast path: String is the most common comparison target */
|
|
569
867
|
if (SV_LIKELY(RB_TYPE_P(other, T_STRING))) {
|
|
570
868
|
if (sv->length != RSTRING_LEN(other)) return Qfalse;
|
|
869
|
+
rb_encoding *oenc = rb_enc_get(other);
|
|
870
|
+
if (sv->enc != oenc) {
|
|
871
|
+
int sv_7bit = sv_is_7bit(sv);
|
|
872
|
+
int o_7bit = rb_enc_str_asciionly_p(other);
|
|
873
|
+
if (!sv_enc_compatible_for_eq(sv->enc, sv_7bit, oenc, o_7bit))
|
|
874
|
+
return Qfalse;
|
|
875
|
+
}
|
|
571
876
|
return memcmp(p, RSTRING_PTR(other), sv->length) == 0 ? Qtrue : Qfalse;
|
|
572
877
|
}
|
|
573
878
|
|
|
574
|
-
/* Check for StringView
|
|
575
|
-
if (
|
|
879
|
+
/* Check for StringView or StringView::Strict */
|
|
880
|
+
if (sv_is_string_view(other)) {
|
|
576
881
|
string_view_t *o = sv_get_struct(other);
|
|
577
882
|
if (sv->length != o->length) return Qfalse;
|
|
883
|
+
if (sv->enc != o->enc) {
|
|
884
|
+
int sv_7bit = sv_is_7bit(sv);
|
|
885
|
+
int o_7bit = sv_is_7bit(o);
|
|
886
|
+
if (!sv_enc_compatible_for_eq(sv->enc, sv_7bit, o->enc, o_7bit))
|
|
887
|
+
return Qfalse;
|
|
888
|
+
}
|
|
578
889
|
return memcmp(p, sv_ptr(o), sv->length) == 0 ? Qtrue : Qfalse;
|
|
579
890
|
}
|
|
580
891
|
|
|
@@ -590,7 +901,7 @@ static VALUE sv_cmp(VALUE self, VALUE other) {
|
|
|
590
901
|
if (SV_LIKELY(RB_TYPE_P(other, T_STRING))) {
|
|
591
902
|
op = RSTRING_PTR(other);
|
|
592
903
|
olen = RSTRING_LEN(other);
|
|
593
|
-
} else if (
|
|
904
|
+
} else if (sv_is_string_view(other)) {
|
|
594
905
|
string_view_t *o = sv_get_struct(other);
|
|
595
906
|
op = sv_ptr(o);
|
|
596
907
|
olen = o->length;
|
|
@@ -610,15 +921,22 @@ static VALUE sv_cmp(VALUE self, VALUE other) {
|
|
|
610
921
|
}
|
|
611
922
|
|
|
612
923
|
/*
 * Strict equality: only another StringView or StringView::Strict can be
 * equal. Any other object (including a String) yields Qfalse without
 * looking at its content; otherwise the result is whatever sv_eq returns.
 */
static VALUE sv_eql_p(VALUE self, VALUE other) {
    return sv_is_string_view(other) ? sv_eq(self, other) : Qfalse;
}
|
|
616
927
|
|
|
617
928
|
/*
 * Hash of the view: rb_memhash over the viewed bytes, XOR-ed with the
 * encoding index.
 *
 * Mirrors CRuby's rb_str_hash: the encoding index is normalized to 0 for
 * 7-bit content so that e.g. UTF-8 "hello" and US-ASCII "hello" hash
 * identically (they compare equal via sv_eq).
 */
static VALUE sv_hash(VALUE self) {
    string_view_t *view = sv_get_struct(self);
    const char *bytes = sv_ptr(view);

    int enc_index = rb_enc_to_index(view->enc);
    if (enc_index != 0 && sv_is_7bit(view)) {
        enc_index = 0;
    }

    st_index_t digest = rb_memhash(bytes, view->length) ^ (st_index_t)enc_index;
    return ST2FIX(digest);
}
|
|
624
942
|
|
|
@@ -636,13 +954,17 @@ static VALUE sv_hash(VALUE self) {
|
|
|
636
954
|
* Compute single-byte flag from encoding + coderange.
|
|
637
955
|
* Called once at construction time and cached in sv->single_byte.
|
|
638
956
|
*/
|
|
639
|
-
|
|
957
|
+
int sv_compute_single_byte(VALUE backing, rb_encoding *enc) {
    /*
     * Definitely single-byte when the encoding never uses more than one
     * byte per character, or when the whole backing string is known to be
     * 7-bit.
     */
    int known_single_byte = rb_enc_mbmaxlen(enc) == 1
                         || ENC_CODERANGE(backing) == ENC_CODERANGE_7BIT;

    /*
     * Any other coderange (VALID, UNKNOWN, ...) describes the entire
     * backing string, not this slice: a view over an ASCII-only prefix of
     * a multibyte string must not be pinned to single_byte=0 here. Report
     * -1 (unknown) so sv_single_byte_optimizable can resolve it lazily by
     * scanning the actual slice bytes.
     */
    return known_single_byte ? 1 : -1;
}
|
|
648
970
|
|
|
@@ -801,22 +1123,11 @@ static long sv_char_to_byte_offset(string_view_t *sv, long char_idx) {
|
|
|
801
1123
|
return char_idx;
|
|
802
1124
|
}
|
|
803
1125
|
|
|
804
|
-
if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1126
|
+
if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
805
1127
|
return sv_utf8_char_to_byte_offset_indexed(sv, char_idx);
|
|
806
1128
|
}
|
|
807
1129
|
|
|
808
|
-
|
|
809
|
-
const char *p = sv_ptr(sv);
|
|
810
|
-
const char *e = p + sv->length;
|
|
811
|
-
const char *start = p;
|
|
812
|
-
long i;
|
|
813
|
-
|
|
814
|
-
for (i = 0; i < char_idx && p < e; i++) {
|
|
815
|
-
p += rb_enc_fast_mbclen(p, e, enc);
|
|
816
|
-
}
|
|
817
|
-
|
|
818
|
-
if (i < char_idx) return -1;
|
|
819
|
-
return p - start;
|
|
1130
|
+
return sv_tolerant_char_to_byte_offset(sv, char_idx);
|
|
820
1131
|
}
|
|
821
1132
|
|
|
822
1133
|
static long sv_char_count(string_view_t *sv) {
|
|
@@ -826,12 +1137,11 @@ static long sv_char_count(string_view_t *sv) {
|
|
|
826
1137
|
long count;
|
|
827
1138
|
if (sv_single_byte_optimizable(sv)) {
|
|
828
1139
|
count = sv->length;
|
|
829
|
-
} else if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1140
|
+
} else if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
830
1141
|
count = sv_utf8_char_count(sv_ptr(sv), sv->length);
|
|
831
1142
|
} else {
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
count = rb_enc_strlen(p, p + sv->length, enc);
|
|
1143
|
+
count = sv_tolerant_char_count(sv_ptr(sv), sv_ptr(sv) + sv->length,
|
|
1144
|
+
sv_enc(sv));
|
|
835
1145
|
}
|
|
836
1146
|
|
|
837
1147
|
sv->charlen = count;
|
|
@@ -844,20 +1154,20 @@ static long sv_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
|
|
|
844
1154
|
return n < remaining ? n : remaining;
|
|
845
1155
|
}
|
|
846
1156
|
|
|
847
|
-
if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1157
|
+
if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
848
1158
|
return sv_utf8_chars_to_bytes(sv_ptr(sv), sv->length, byte_off, n);
|
|
849
1159
|
}
|
|
850
1160
|
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
const char *e = sv_ptr(sv) + sv->length;
|
|
854
|
-
long i;
|
|
855
|
-
const char *start = p;
|
|
1161
|
+
return sv_tolerant_chars_to_bytes(sv, byte_off, n);
|
|
1162
|
+
}
|
|
856
1163
|
|
|
857
|
-
|
|
858
|
-
|
|
1164
|
+
/*
 * Count the characters in the byte range [p, p + len), interpreted with
 * the encoding of `sv`. Returns 0 for an empty or negative length.
 *
 * Single-byte-optimizable views count one character per byte. The UTF-8
 * counter is used only when the view is UTF-8 and its cached validity
 * check passes; everything else goes through the tolerant counter.
 */
static long sv_char_count_partial(string_view_t *sv, const char *p, long len) {
    if (len <= 0) {
        return 0;
    }
    if (sv_single_byte_optimizable(sv)) {
        return len;
    }

    int use_utf8_counter = SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv);
    if (!use_utf8_counter) {
        return sv_tolerant_char_count(p, p + len, sv_enc(sv));
    }
    return sv_utf8_char_count(p, len);
}
|
|
862
1172
|
|
|
863
1173
|
static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
@@ -879,8 +1189,8 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
879
1189
|
long total = sv->length;
|
|
880
1190
|
if (idx < 0) idx += total;
|
|
881
1191
|
if (SV_UNLIKELY(idx < 0 || idx > total || len < 0)) return Qnil;
|
|
882
|
-
if (
|
|
883
|
-
return
|
|
1192
|
+
if (len > total - idx) len = total - idx;
|
|
1193
|
+
return sv_new_from_parent_obj(self, sv,
|
|
884
1194
|
sv->offset + idx,
|
|
885
1195
|
len);
|
|
886
1196
|
}
|
|
@@ -901,12 +1211,12 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
901
1211
|
|
|
902
1212
|
/* Clamp len to remaining characters */
|
|
903
1213
|
long total_chars = sv_char_count(sv);
|
|
904
|
-
if (
|
|
1214
|
+
if (len > total_chars - idx) len = total_chars - idx;
|
|
905
1215
|
|
|
906
1216
|
long byte_end = sv_char_to_byte_offset(sv, idx + len);
|
|
907
1217
|
long byte_len = byte_end - byte_off;
|
|
908
1218
|
|
|
909
|
-
return
|
|
1219
|
+
return sv_new_from_parent_obj(self, sv,
|
|
910
1220
|
sv->offset + byte_off,
|
|
911
1221
|
byte_len);
|
|
912
1222
|
}
|
|
@@ -914,48 +1224,34 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
914
1224
|
if (rb_obj_is_kind_of(arg1, rb_cRange)) {
|
|
915
1225
|
long total_chars = sv_char_count(sv);
|
|
916
1226
|
long beg, len;
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
if (beg < 0) beg += total_chars;
|
|
924
|
-
if (beg < 0) return Qnil;
|
|
925
|
-
|
|
926
|
-
long e;
|
|
927
|
-
if (NIL_P(rb_end)) {
|
|
928
|
-
e = total_chars;
|
|
929
|
-
} else {
|
|
930
|
-
e = NUM2LONG(rb_end);
|
|
931
|
-
if (e < 0) e += total_chars;
|
|
932
|
-
if (!excl) e += 1;
|
|
1227
|
+
|
|
1228
|
+
/* rb_range_beg_len resolves negative indices and clamps to total,
|
|
1229
|
+
* replacing 3 Ruby method dispatches with a single C call. */
|
|
1230
|
+
switch (rb_range_beg_len(arg1, &beg, &len, total_chars, 1)) {
|
|
1231
|
+
case Qfalse: return Qnil;
|
|
1232
|
+
case Qnil: return Qnil;
|
|
933
1233
|
}
|
|
934
|
-
if (e < beg) e = beg;
|
|
935
|
-
len = e - beg;
|
|
936
|
-
if (beg > total_chars) return Qnil;
|
|
937
|
-
if (beg + len > total_chars) len = total_chars - beg;
|
|
938
1234
|
|
|
939
1235
|
long byte_off = sv_char_to_byte_offset(sv, beg);
|
|
940
1236
|
long byte_len = sv_chars_to_bytes(sv, byte_off, len);
|
|
941
1237
|
|
|
942
|
-
return
|
|
1238
|
+
return sv_new_from_parent_obj(self, sv,
|
|
943
1239
|
sv->offset + byte_off,
|
|
944
1240
|
byte_len);
|
|
945
1241
|
}
|
|
946
1242
|
|
|
947
1243
|
if (rb_obj_is_kind_of(arg1, rb_cRegexp)) {
|
|
948
1244
|
VALUE shared = sv_as_shared_str(sv);
|
|
949
|
-
VALUE m = rb_funcall(arg1,
|
|
1245
|
+
VALUE m = rb_funcall(arg1, id_match, 1, shared);
|
|
950
1246
|
if (NIL_P(m)) return Qnil;
|
|
951
1247
|
|
|
952
|
-
VALUE matched = rb_funcall(m,
|
|
953
|
-
long match_beg = NUM2LONG(rb_funcall(m,
|
|
1248
|
+
VALUE matched = rb_funcall(m, id_aref, 1, INT2FIX(0));
|
|
1249
|
+
long match_beg = NUM2LONG(rb_funcall(m, id_begin, 1, INT2FIX(0)));
|
|
954
1250
|
|
|
955
1251
|
long byte_off = sv_char_to_byte_offset(sv, match_beg);
|
|
956
1252
|
long byte_len = RSTRING_LEN(matched);
|
|
957
1253
|
|
|
958
|
-
return
|
|
1254
|
+
return sv_new_from_parent_obj(self, sv,
|
|
959
1255
|
sv->offset + byte_off,
|
|
960
1256
|
byte_len);
|
|
961
1257
|
}
|
|
@@ -964,14 +1260,15 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
964
1260
|
const char *p = sv_ptr(sv);
|
|
965
1261
|
long slen = RSTRING_LEN(arg1);
|
|
966
1262
|
if (slen == 0) {
|
|
967
|
-
return
|
|
1263
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, 0);
|
|
968
1264
|
}
|
|
1265
|
+
sv_check_compatible_string(sv, arg1);
|
|
969
1266
|
if (slen > sv->length) return Qnil;
|
|
970
1267
|
|
|
971
1268
|
long pos = rb_memsearch(RSTRING_PTR(arg1), slen, p, sv->length, sv_enc(sv));
|
|
972
1269
|
if (pos < 0 || pos > sv->length - slen) return Qnil;
|
|
973
1270
|
|
|
974
|
-
return
|
|
1271
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + pos, slen);
|
|
975
1272
|
}
|
|
976
1273
|
|
|
977
1274
|
if (RB_INTEGER_TYPE_P(arg1)) {
|
|
@@ -986,7 +1283,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
986
1283
|
|
|
987
1284
|
long byte_len = sv_chars_to_bytes(sv, byte_off, 1);
|
|
988
1285
|
|
|
989
|
-
return
|
|
1286
|
+
return sv_new_from_parent_obj(self, sv,
|
|
990
1287
|
sv->offset + byte_off,
|
|
991
1288
|
byte_len);
|
|
992
1289
|
}
|
|
@@ -1013,98 +1310,376 @@ static VALUE sv_byteslice(int argc, VALUE *argv, VALUE self) {
|
|
|
1013
1310
|
if (off < 0) off += sv->length;
|
|
1014
1311
|
if (off < 0 || off > sv->length) return Qnil;
|
|
1015
1312
|
if (len < 0) return Qnil;
|
|
1016
|
-
if (
|
|
1313
|
+
if (len > sv->length - off) len = sv->length - off;
|
|
1017
1314
|
|
|
1018
|
-
return
|
|
1315
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + off, len);
|
|
1019
1316
|
}
|
|
1020
1317
|
|
|
1021
1318
|
if (rb_obj_is_kind_of(arg1, rb_cRange)) {
|
|
1022
1319
|
long beg, len;
|
|
1023
|
-
VALUE rb_beg = rb_funcall(arg1, rb_intern("begin"), 0);
|
|
1024
|
-
VALUE rb_end = rb_funcall(arg1, rb_intern("end"), 0);
|
|
1025
|
-
int excl = RTEST(rb_funcall(arg1, rb_intern("exclude_end?"), 0));
|
|
1026
1320
|
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
long e;
|
|
1032
|
-
if (NIL_P(rb_end)) {
|
|
1033
|
-
e = sv->length;
|
|
1034
|
-
} else {
|
|
1035
|
-
e = NUM2LONG(rb_end);
|
|
1036
|
-
if (e < 0) e += sv->length;
|
|
1037
|
-
if (!excl) e += 1;
|
|
1321
|
+
switch (rb_range_beg_len(arg1, &beg, &len, sv->length, 1)) {
|
|
1322
|
+
case Qfalse: return Qnil;
|
|
1323
|
+
case Qnil: return Qnil;
|
|
1038
1324
|
}
|
|
1039
|
-
if (e < beg) e = beg;
|
|
1040
|
-
len = e - beg;
|
|
1041
|
-
if (beg > sv->length) return Qnil;
|
|
1042
|
-
if (beg + len > sv->length) len = sv->length - beg;
|
|
1043
1325
|
|
|
1044
|
-
return
|
|
1326
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + beg, len);
|
|
1045
1327
|
}
|
|
1046
1328
|
|
|
1047
1329
|
{
|
|
1048
1330
|
long idx = NUM2LONG(arg1);
|
|
1049
1331
|
if (idx < 0) idx += sv->length;
|
|
1050
1332
|
if (idx < 0 || idx >= sv->length) return Qnil;
|
|
1051
|
-
return
|
|
1333
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + idx, 1);
|
|
1052
1334
|
}
|
|
1053
1335
|
}
|
|
1054
1336
|
|
|
1337
|
+
/* ========================================================================= */
|
|
1338
|
+
/* Tier 1.5: Zero-copy transforms — returns StringView via offset adjustment */
|
|
1339
|
+
/* ========================================================================= */
|
|
1340
|
+
|
|
1341
|
+
/*
|
|
1342
|
+
* Helper: check if a byte is ASCII whitespace.
|
|
1343
|
+
* Matches Ruby's strip behavior for ASCII-compatible encodings:
|
|
1344
|
+
* space, tab, newline, vertical tab, form feed, carriage return, NUL.
|
|
1345
|
+
*/
|
|
1346
|
+
SV_INLINE int sv_is_ascii_whitespace(unsigned char c) {
|
|
1347
|
+
return c == ' ' || (c >= '\t' && c <= '\r') || c == '\0';
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
/*
|
|
1351
|
+
* strip → StringView
|
|
1352
|
+
* Returns a new StringView with leading and trailing ASCII whitespace removed.
|
|
1353
|
+
* Zero allocations for the byte content — only a new StringView struct.
|
|
1354
|
+
*/
|
|
1355
|
+
static VALUE sv_strip(int argc, VALUE *argv, VALUE self) {
|
|
1356
|
+
rb_check_arity(argc, 0, 0);
|
|
1357
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1358
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
1359
|
+
long len = sv->length;
|
|
1360
|
+
|
|
1361
|
+
/* Skip leading whitespace */
|
|
1362
|
+
long left = 0;
|
|
1363
|
+
while (left < len && sv_is_ascii_whitespace(p[left])) left++;
|
|
1364
|
+
|
|
1365
|
+
/* Skip trailing whitespace */
|
|
1366
|
+
long right = len;
|
|
1367
|
+
while (right > left && sv_is_ascii_whitespace(p[right - 1])) right--;
|
|
1368
|
+
|
|
1369
|
+
if (left == 0 && right == len) return self;
|
|
1370
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + left, right - left);
|
|
1371
|
+
}
|
|
1372
|
+
|
|
1373
|
+
/*
|
|
1374
|
+
* lstrip → StringView
|
|
1375
|
+
* Returns a new StringView with leading ASCII whitespace removed.
|
|
1376
|
+
*/
|
|
1377
|
+
static VALUE sv_lstrip(int argc, VALUE *argv, VALUE self) {
|
|
1378
|
+
rb_check_arity(argc, 0, 0);
|
|
1379
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1380
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
1381
|
+
long len = sv->length;
|
|
1382
|
+
|
|
1383
|
+
long left = 0;
|
|
1384
|
+
while (left < len && sv_is_ascii_whitespace(p[left])) left++;
|
|
1385
|
+
|
|
1386
|
+
if (left == 0) return self;
|
|
1387
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + left, len - left);
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
/*
|
|
1391
|
+
* rstrip → StringView
|
|
1392
|
+
* Returns a new StringView with trailing ASCII whitespace removed.
|
|
1393
|
+
*/
|
|
1394
|
+
static VALUE sv_rstrip(int argc, VALUE *argv, VALUE self) {
|
|
1395
|
+
rb_check_arity(argc, 0, 0);
|
|
1396
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1397
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
1398
|
+
long len = sv->length;
|
|
1399
|
+
|
|
1400
|
+
long right = len;
|
|
1401
|
+
while (right > 0 && sv_is_ascii_whitespace(p[right - 1])) right--;
|
|
1402
|
+
|
|
1403
|
+
if (right == len) return self;
|
|
1404
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, right);
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1407
|
+
/*
|
|
1408
|
+
* chomp([separator]) → StringView
|
|
1409
|
+
* Returns a new StringView with the trailing record separator removed.
|
|
1410
|
+
* Default separator is $/ (typically "\n").
|
|
1411
|
+
* Handles "\n", "\r\n", and "\r" when separator is "\n".
|
|
1412
|
+
*/
|
|
1413
|
+
static VALUE sv_chomp(int argc, VALUE *argv, VALUE self) {
|
|
1414
|
+
rb_check_arity(argc, 0, 1);
|
|
1415
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1416
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
1417
|
+
long len = sv->length;
|
|
1418
|
+
|
|
1419
|
+
if (len == 0) return self;
|
|
1420
|
+
|
|
1421
|
+
if (argc == 0 || NIL_P(argv[0])) {
|
|
1422
|
+
/* Default: remove trailing \n, \r\n, or \r */
|
|
1423
|
+
/* Use $/ (input record separator) when no arg given */
|
|
1424
|
+
VALUE rs;
|
|
1425
|
+
if (argc == 0) {
|
|
1426
|
+
rs = rb_rs; /* global $/ */
|
|
1427
|
+
if (NIL_P(rs)) return self; /* $/ is nil, no chomp */
|
|
1428
|
+
} else {
|
|
1429
|
+
return self; /* chomp(nil) returns self */
|
|
1430
|
+
}
|
|
1431
|
+
|
|
1432
|
+
/* Fast path for default $/ which is "\n" */
|
|
1433
|
+
if (RB_TYPE_P(rs, T_STRING) && RSTRING_LEN(rs) == 1 && RSTRING_PTR(rs)[0] == '\n') {
|
|
1434
|
+
if (p[len - 1] == '\n') {
|
|
1435
|
+
long newlen = len - 1;
|
|
1436
|
+
if (newlen > 0 && p[newlen - 1] == '\r') newlen--;
|
|
1437
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, newlen);
|
|
1438
|
+
} else if (p[len - 1] == '\r') {
|
|
1439
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - 1);
|
|
1440
|
+
}
|
|
1441
|
+
return self;
|
|
1442
|
+
}
|
|
1443
|
+
|
|
1444
|
+
/* Non-default $/ — use the separator */
|
|
1445
|
+
if (!RB_TYPE_P(rs, T_STRING)) return self;
|
|
1446
|
+
const char *sep = RSTRING_PTR(rs);
|
|
1447
|
+
long seplen = RSTRING_LEN(rs);
|
|
1448
|
+
if (seplen == 0) {
|
|
1449
|
+
/* Paragraph mode: remove trailing \n+ */
|
|
1450
|
+
long right = len;
|
|
1451
|
+
while (right > 0 && p[right - 1] == '\n') right--;
|
|
1452
|
+
if (right == len) return self;
|
|
1453
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, right);
|
|
1454
|
+
}
|
|
1455
|
+
if (seplen > len) return self;
|
|
1456
|
+
if (memcmp(p + len - seplen, sep, seplen) == 0) {
|
|
1457
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - seplen);
|
|
1458
|
+
}
|
|
1459
|
+
return self;
|
|
1460
|
+
}
|
|
1461
|
+
|
|
1462
|
+
/* Explicit separator argument */
|
|
1463
|
+
VALUE sep_val = argv[0];
|
|
1464
|
+
if (NIL_P(sep_val)) return self;
|
|
1465
|
+
StringValue(sep_val);
|
|
1466
|
+
const char *sep = RSTRING_PTR(sep_val);
|
|
1467
|
+
long seplen = RSTRING_LEN(sep_val);
|
|
1468
|
+
|
|
1469
|
+
if (seplen == 0) {
|
|
1470
|
+
/* Paragraph mode: remove all trailing newlines */
|
|
1471
|
+
long right = len;
|
|
1472
|
+
while (right > 0 && p[right - 1] == '\n') right--;
|
|
1473
|
+
if (right == len) return self;
|
|
1474
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, right);
|
|
1475
|
+
}
|
|
1476
|
+
|
|
1477
|
+
/* Special handling for "\n": also removes \r\n and \r */
|
|
1478
|
+
if (seplen == 1 && sep[0] == '\n') {
|
|
1479
|
+
if (p[len - 1] == '\n') {
|
|
1480
|
+
long newlen = len - 1;
|
|
1481
|
+
if (newlen > 0 && p[newlen - 1] == '\r') newlen--;
|
|
1482
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, newlen);
|
|
1483
|
+
} else if (p[len - 1] == '\r') {
|
|
1484
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - 1);
|
|
1485
|
+
}
|
|
1486
|
+
return self;
|
|
1487
|
+
}
|
|
1488
|
+
|
|
1489
|
+
if (seplen > len) return self;
|
|
1490
|
+
if (memcmp(p + len - seplen, sep, seplen) == 0) {
|
|
1491
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - seplen);
|
|
1492
|
+
}
|
|
1493
|
+
return self;
|
|
1494
|
+
}
|
|
1495
|
+
|
|
1496
|
+
/*
|
|
1497
|
+
* chop → StringView
|
|
1498
|
+
* Returns a new StringView with the last character removed.
|
|
1499
|
+
* If the string ends with \r\n, both characters are removed.
|
|
1500
|
+
*/
|
|
1501
|
+
static VALUE sv_chop(int argc, VALUE *argv, VALUE self) {
|
|
1502
|
+
rb_check_arity(argc, 0, 0);
|
|
1503
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1504
|
+
long len = sv->length;
|
|
1505
|
+
|
|
1506
|
+
if (len == 0) return self;
|
|
1507
|
+
|
|
1508
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
1509
|
+
|
|
1510
|
+
/* Check for \r\n at the end */
|
|
1511
|
+
if (len >= 2 && p[len - 1] == '\n' && p[len - 2] == '\r') {
|
|
1512
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - 2);
|
|
1513
|
+
}
|
|
1514
|
+
|
|
1515
|
+
/* Remove last character (respecting encoding) */
|
|
1516
|
+
if (sv_single_byte_optimizable(sv)) {
|
|
1517
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, len - 1);
|
|
1518
|
+
}
|
|
1519
|
+
|
|
1520
|
+
/* Multibyte: find start of last character */
|
|
1521
|
+
rb_encoding *enc = sv_enc(sv);
|
|
1522
|
+
const char *start = sv_ptr(sv);
|
|
1523
|
+
const char *end = start + len;
|
|
1524
|
+
const char *prev = rb_enc_prev_char(start, end, end, enc);
|
|
1525
|
+
if (prev == NULL) prev = start;
|
|
1526
|
+
long newlen = (long)(prev - start);
|
|
1527
|
+
|
|
1528
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, newlen);
|
|
1529
|
+
}
|
|
1530
|
+
|
|
1531
|
+
/*
|
|
1532
|
+
* delete_prefix(prefix) → StringView
|
|
1533
|
+
* Returns a new StringView with the given prefix removed, or self if
|
|
1534
|
+
* the string doesn't start with the prefix.
|
|
1535
|
+
*/
|
|
1536
|
+
static VALUE sv_delete_prefix(VALUE self, VALUE prefix) {
|
|
1537
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1538
|
+
StringValue(prefix);
|
|
1539
|
+
const char *p = sv_ptr(sv);
|
|
1540
|
+
long plen = RSTRING_LEN(prefix);
|
|
1541
|
+
|
|
1542
|
+
if (plen == 0) return self;
|
|
1543
|
+
sv_check_compatible_string(sv, prefix);
|
|
1544
|
+
if (plen > sv->length) return self;
|
|
1545
|
+
if (memcmp(p, RSTRING_PTR(prefix), plen) != 0) return self;
|
|
1546
|
+
|
|
1547
|
+
return sv_new_from_parent_obj(self, sv, sv->offset + plen, sv->length - plen);
|
|
1548
|
+
}
|
|
1549
|
+
|
|
1550
|
+
/*
|
|
1551
|
+
* delete_suffix(suffix) → StringView
|
|
1552
|
+
* Returns a new StringView with the given suffix removed, or self if
|
|
1553
|
+
* the string doesn't end with the suffix.
|
|
1554
|
+
*/
|
|
1555
|
+
static VALUE sv_delete_suffix(VALUE self, VALUE suffix) {
|
|
1556
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1557
|
+
StringValue(suffix);
|
|
1558
|
+
const char *p = sv_ptr(sv);
|
|
1559
|
+
long slen = RSTRING_LEN(suffix);
|
|
1560
|
+
|
|
1561
|
+
if (slen == 0) return self;
|
|
1562
|
+
sv_check_compatible_string(sv, suffix);
|
|
1563
|
+
if (slen > sv->length) return self;
|
|
1564
|
+
if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) != 0) return self;
|
|
1565
|
+
|
|
1566
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, sv->length - slen);
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1569
|
+
/*
|
|
1570
|
+
* chr → StringView
|
|
1571
|
+
* Returns the first character as a StringView.
|
|
1572
|
+
*/
|
|
1573
|
+
static VALUE sv_chr(VALUE self) {
|
|
1574
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1575
|
+
|
|
1576
|
+
if (sv->length == 0) return self;
|
|
1577
|
+
|
|
1578
|
+
if (sv_single_byte_optimizable(sv)) {
|
|
1579
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, 1);
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1582
|
+
rb_encoding *enc = sv_enc(sv);
|
|
1583
|
+
const char *p = sv_ptr(sv);
|
|
1584
|
+
const char *e = p + sv->length;
|
|
1585
|
+
int clen = rb_enc_fast_mbclen(p, e, enc);
|
|
1586
|
+
|
|
1587
|
+
return sv_new_from_parent_obj(self, sv, sv->offset, clen);
|
|
1588
|
+
}
|
|
1589
|
+
|
|
1590
|
+
/*
|
|
1591
|
+
* ord → Integer
|
|
1592
|
+
* Returns the codepoint of the first character.
|
|
1593
|
+
*/
|
|
1594
|
+
static VALUE sv_ord(VALUE self) {
|
|
1595
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1596
|
+
|
|
1597
|
+
if (sv->length == 0) {
|
|
1598
|
+
rb_raise(rb_eArgError, "empty string");
|
|
1599
|
+
}
|
|
1600
|
+
|
|
1601
|
+
rb_encoding *enc = sv_enc(sv);
|
|
1602
|
+
const char *p = sv_ptr(sv);
|
|
1603
|
+
const char *e = p + sv->length;
|
|
1604
|
+
unsigned int c = rb_enc_codepoint_len(p, e, NULL, enc);
|
|
1605
|
+
return UINT2NUM(c);
|
|
1606
|
+
}
|
|
1607
|
+
|
|
1608
|
+
/*
|
|
1609
|
+
* valid_encoding? → true/false
|
|
1610
|
+
* Returns whether the view's bytes are valid in its encoding.
|
|
1611
|
+
*/
|
|
1612
|
+
static VALUE sv_valid_encoding_p(VALUE self) {
|
|
1613
|
+
string_view_t *sv = sv_get_struct(self);
|
|
1614
|
+
return sv_valid_encoding_cached(sv) ? Qtrue : Qfalse;
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1617
|
+
/*
 * b → StringView  (design note; no implementation follows this comment
 * in this section of the file)
 *
 * Intended to return a StringView that references the same bytes but
 * reports ASCII-8BIT encoding. Reinterpreting the bytes as binary is
 * always valid, so no byte content needs to change.
 *
 * The backing string has its own encoding, which at first suggests that a
 * new binary-encoded backing string would be required. However, the
 * view's encoding is cached separately in sv->enc, so a lightweight view
 * with a different encoding can be created over the existing backing —
 * storing enc separately is what makes this possible without allocating
 * a new backing string.
 */
|
|
1630
|
+
|
|
1055
1631
|
/* ========================================================================= */
|
|
1056
1632
|
/* Tier 3: Transform delegation */
|
|
1057
1633
|
/* ========================================================================= */
|
|
1058
1634
|
|
|
1059
|
-
#define SV_DELEGATE_FUNCALL(cname,
|
|
1635
|
+
#define SV_DELEGATE_FUNCALL(cname, cached_id) \
|
|
1060
1636
|
static VALUE sv_##cname(int argc, VALUE *argv, VALUE self) { \
|
|
1061
1637
|
string_view_t *sv = sv_get_struct(self); \
|
|
1062
1638
|
VALUE shared = sv_as_shared_str(sv); \
|
|
1063
1639
|
if (rb_block_given_p()) { \
|
|
1064
|
-
return rb_funcall_with_block(shared,
|
|
1640
|
+
return rb_funcall_with_block(shared, cached_id, \
|
|
1065
1641
|
argc, argv, rb_block_proc()); \
|
|
1066
1642
|
} \
|
|
1067
|
-
return rb_funcallv(shared,
|
|
1068
|
-
}
|
|
1069
|
-
|
|
1070
|
-
SV_DELEGATE_FUNCALL(upcase,
|
|
1071
|
-
SV_DELEGATE_FUNCALL(downcase,
|
|
1072
|
-
SV_DELEGATE_FUNCALL(capitalize,
|
|
1073
|
-
SV_DELEGATE_FUNCALL(swapcase,
|
|
1074
|
-
SV_DELEGATE_FUNCALL(
|
|
1075
|
-
SV_DELEGATE_FUNCALL(
|
|
1076
|
-
SV_DELEGATE_FUNCALL(
|
|
1077
|
-
SV_DELEGATE_FUNCALL(
|
|
1078
|
-
SV_DELEGATE_FUNCALL(
|
|
1079
|
-
SV_DELEGATE_FUNCALL(
|
|
1080
|
-
SV_DELEGATE_FUNCALL(
|
|
1081
|
-
SV_DELEGATE_FUNCALL(
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
SV_DELEGATE_FUNCALL(count,
|
|
1088
|
-
SV_DELEGATE_FUNCALL(scan,
|
|
1089
|
-
SV_DELEGATE_FUNCALL(split,
|
|
1090
|
-
SV_DELEGATE_FUNCALL(center,
|
|
1091
|
-
SV_DELEGATE_FUNCALL(ljust,
|
|
1092
|
-
SV_DELEGATE_FUNCALL(rjust,
|
|
1093
|
-
SV_DELEGATE_FUNCALL(format_op,
|
|
1094
|
-
SV_DELEGATE_FUNCALL(plus,
|
|
1095
|
-
SV_DELEGATE_FUNCALL(multiply,
|
|
1096
|
-
SV_DELEGATE_FUNCALL(unpack1,
|
|
1097
|
-
SV_DELEGATE_FUNCALL(scrub,
|
|
1098
|
-
SV_DELEGATE_FUNCALL(unicode_normalize,
|
|
1643
|
+
return rb_funcallv(shared, cached_id, argc, argv); \
|
|
1644
|
+
}
|
|
1645
|
+
|
|
1646
|
+
SV_DELEGATE_FUNCALL(upcase, id_upcase)
|
|
1647
|
+
SV_DELEGATE_FUNCALL(downcase, id_downcase)
|
|
1648
|
+
SV_DELEGATE_FUNCALL(capitalize,id_capitalize)
|
|
1649
|
+
SV_DELEGATE_FUNCALL(swapcase, id_swapcase)
|
|
1650
|
+
SV_DELEGATE_FUNCALL(reverse, id_reverse)
|
|
1651
|
+
SV_DELEGATE_FUNCALL(squeeze, id_squeeze)
|
|
1652
|
+
SV_DELEGATE_FUNCALL(encode, id_encode)
|
|
1653
|
+
SV_DELEGATE_FUNCALL(gsub, id_gsub)
|
|
1654
|
+
SV_DELEGATE_FUNCALL(sub, id_sub)
|
|
1655
|
+
SV_DELEGATE_FUNCALL(tr, id_tr)
|
|
1656
|
+
SV_DELEGATE_FUNCALL(tr_s, id_tr_s)
|
|
1657
|
+
SV_DELEGATE_FUNCALL(delete_str,id_delete)
|
|
1658
|
+
/*
|
|
1659
|
+
* count(set, ...) → Integer
|
|
1660
|
+
* Delegates to String#count via shared string.
|
|
1661
|
+
* (Character set parsing is complex — reuse Ruby's implementation.)
|
|
1662
|
+
*/
|
|
1663
|
+
SV_DELEGATE_FUNCALL(count, id_count)
|
|
1664
|
+
SV_DELEGATE_FUNCALL(scan, id_scan)
|
|
1665
|
+
SV_DELEGATE_FUNCALL(split, id_split)
|
|
1666
|
+
SV_DELEGATE_FUNCALL(center, id_center)
|
|
1667
|
+
SV_DELEGATE_FUNCALL(ljust, id_ljust)
|
|
1668
|
+
SV_DELEGATE_FUNCALL(rjust, id_rjust)
|
|
1669
|
+
SV_DELEGATE_FUNCALL(format_op, id_format_op)
|
|
1670
|
+
SV_DELEGATE_FUNCALL(plus, id_plus)
|
|
1671
|
+
SV_DELEGATE_FUNCALL(multiply, id_multiply)
|
|
1672
|
+
SV_DELEGATE_FUNCALL(unpack1, id_unpack1)
|
|
1673
|
+
SV_DELEGATE_FUNCALL(scrub, id_scrub)
|
|
1674
|
+
SV_DELEGATE_FUNCALL(unicode_normalize, id_unicode_normalize)
|
|
1099
1675
|
|
|
1100
1676
|
/* ========================================================================= */
|
|
1101
1677
|
/* Bang methods — always raise FrozenError */
|
|
1102
1678
|
/* ========================================================================= */
|
|
1103
1679
|
|
|
1104
1680
|
/*
 * Shared body for every mutating ("bang") method: ignores its arguments
 * and always raises FrozenError.
 */
static VALUE sv_frozen_error(int argc, VALUE *argv, VALUE self) {
    /* Arguments are accepted only so any arity can be registered. */
    (void)argc;
    (void)argv;

    rb_raise(rb_eFrozenError, "can't modify frozen StringView");

    /* Not reached after rb_raise; satisfies the VALUE return type. */
    return Qnil;
}
|
|
1110
1685
|
|
|
@@ -1115,6 +1690,46 @@ static VALUE sv_frozen_error(int argc, VALUE *argv, VALUE self) {
|
|
|
1115
1690
|
void Init_string_view(void) {
|
|
1116
1691
|
enc_utf8 = rb_utf8_encoding();
|
|
1117
1692
|
|
|
1693
|
+
/* Cache method IDs — avoids rb_intern hash lookup on every call */
|
|
1694
|
+
id_index = rb_intern("index");
|
|
1695
|
+
id_rindex = rb_intern("rindex");
|
|
1696
|
+
id_byteindex = rb_intern("byteindex");
|
|
1697
|
+
id_byterindex = rb_intern("byterindex");
|
|
1698
|
+
id_match = rb_intern("match");
|
|
1699
|
+
id_match_p = rb_intern("match?");
|
|
1700
|
+
id_match_op = rb_intern("=~");
|
|
1701
|
+
id_begin = rb_intern("begin");
|
|
1702
|
+
id_aref = rb_intern("[]");
|
|
1703
|
+
id_upcase = rb_intern("upcase");
|
|
1704
|
+
id_downcase = rb_intern("downcase");
|
|
1705
|
+
id_capitalize = rb_intern("capitalize");
|
|
1706
|
+
id_swapcase = rb_intern("swapcase");
|
|
1707
|
+
id_strip = rb_intern("strip");
|
|
1708
|
+
id_lstrip = rb_intern("lstrip");
|
|
1709
|
+
id_rstrip = rb_intern("rstrip");
|
|
1710
|
+
id_chomp = rb_intern("chomp");
|
|
1711
|
+
id_chop = rb_intern("chop");
|
|
1712
|
+
id_reverse = rb_intern("reverse");
|
|
1713
|
+
id_squeeze = rb_intern("squeeze");
|
|
1714
|
+
id_encode = rb_intern("encode");
|
|
1715
|
+
id_gsub = rb_intern("gsub");
|
|
1716
|
+
id_sub = rb_intern("sub");
|
|
1717
|
+
id_tr = rb_intern("tr");
|
|
1718
|
+
id_tr_s = rb_intern("tr_s");
|
|
1719
|
+
id_delete = rb_intern("delete");
|
|
1720
|
+
id_count = rb_intern("count");
|
|
1721
|
+
id_scan = rb_intern("scan");
|
|
1722
|
+
id_split = rb_intern("split");
|
|
1723
|
+
id_center = rb_intern("center");
|
|
1724
|
+
id_ljust = rb_intern("ljust");
|
|
1725
|
+
id_rjust = rb_intern("rjust");
|
|
1726
|
+
id_format_op = rb_intern("%");
|
|
1727
|
+
id_plus = rb_intern("+");
|
|
1728
|
+
id_multiply = rb_intern("*");
|
|
1729
|
+
id_unpack1 = rb_intern("unpack1");
|
|
1730
|
+
id_scrub = rb_intern("scrub");
|
|
1731
|
+
id_unicode_normalize = rb_intern("unicode_normalize");
|
|
1732
|
+
|
|
1118
1733
|
cStringView = rb_define_class("StringView", rb_cObject);
|
|
1119
1734
|
rb_include_module(cStringView, rb_mComparable);
|
|
1120
1735
|
|
|
@@ -1122,11 +1737,10 @@ void Init_string_view(void) {
|
|
|
1122
1737
|
rb_define_method(cStringView, "initialize", sv_initialize, -1);
|
|
1123
1738
|
|
|
1124
1739
|
rb_define_method(cStringView, "to_s", sv_to_s, 0);
|
|
1740
|
+
rb_define_method(cStringView, "materialize", sv_to_s, 0);
|
|
1125
1741
|
rb_define_private_method(cStringView, "to_str", sv_to_str, 0);
|
|
1126
1742
|
rb_define_method(cStringView, "inspect", sv_inspect, 0);
|
|
1127
|
-
rb_define_method(cStringView, "frozen?", sv_frozen_p, 0);
|
|
1128
1743
|
rb_define_method(cStringView, "reset!", sv_reset, 3);
|
|
1129
|
-
rb_define_alias(cStringView, "materialize", "to_s");
|
|
1130
1744
|
|
|
1131
1745
|
rb_define_method(cStringView, "bytesize", sv_bytesize, 0);
|
|
1132
1746
|
rb_define_method(cStringView, "length", sv_length, 0);
|
|
@@ -1176,6 +1790,11 @@ void Init_string_view(void) {
|
|
|
1176
1790
|
rb_define_method(cStringView, "rstrip", sv_rstrip, -1);
|
|
1177
1791
|
rb_define_method(cStringView, "chomp", sv_chomp, -1);
|
|
1178
1792
|
rb_define_method(cStringView, "chop", sv_chop, -1);
|
|
1793
|
+
rb_define_method(cStringView, "delete_prefix", sv_delete_prefix, 1);
|
|
1794
|
+
rb_define_method(cStringView, "delete_suffix", sv_delete_suffix, 1);
|
|
1795
|
+
rb_define_method(cStringView, "chr", sv_chr, 0);
|
|
1796
|
+
rb_define_method(cStringView, "ord", sv_ord, 0);
|
|
1797
|
+
rb_define_method(cStringView, "valid_encoding?", sv_valid_encoding_p, 0);
|
|
1179
1798
|
rb_define_method(cStringView, "reverse", sv_reverse, -1);
|
|
1180
1799
|
rb_define_method(cStringView, "squeeze", sv_squeeze, -1);
|
|
1181
1800
|
rb_define_method(cStringView, "encode", sv_encode, -1);
|
|
@@ -1183,7 +1802,7 @@ void Init_string_view(void) {
|
|
|
1183
1802
|
rb_define_method(cStringView, "sub", sv_sub, -1);
|
|
1184
1803
|
rb_define_method(cStringView, "tr", sv_tr, -1);
|
|
1185
1804
|
rb_define_method(cStringView, "tr_s", sv_tr_s, -1);
|
|
1186
|
-
rb_define_method(cStringView, "delete",
|
|
1805
|
+
rb_define_method(cStringView, "delete", sv_delete_str, -1);
|
|
1187
1806
|
rb_define_method(cStringView, "count", sv_count, -1);
|
|
1188
1807
|
rb_define_method(cStringView, "scan", sv_scan, -1);
|
|
1189
1808
|
rb_define_method(cStringView, "split", sv_split, -1);
|
|
@@ -1214,4 +1833,10 @@ void Init_string_view(void) {
|
|
|
1214
1833
|
rb_define_method(cStringView, "gsub!", sv_frozen_error, -1);
|
|
1215
1834
|
rb_define_method(cStringView, "sub!", sv_frozen_error, -1);
|
|
1216
1835
|
rb_define_method(cStringView, "slice!", sv_frozen_error, -1);
|
|
1836
|
+
rb_define_method(cStringView, "delete_prefix!", sv_frozen_error, -1);
|
|
1837
|
+
rb_define_method(cStringView, "delete_suffix!", sv_frozen_error, -1);
|
|
1838
|
+
|
|
1839
|
+
Init_string_view_strict();
|
|
1840
|
+
Init_string_view_pool();
|
|
1841
|
+
Init_string_view_core_ext();
|
|
1217
1842
|
}
|