string_view 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/string_view/string_view.c +177 -80
- data/ext/string_view/string_view.h +12 -0
- data/ext/string_view/string_view_core_ext.c +9 -24
- data/ext/string_view/string_view_pool.c +9 -5
- data/lib/string_view/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '0203685aa4c5ed043ac5056a55ff02ef2a66b1c1e1dea855863f76c5630f2729'
|
|
4
|
+
data.tar.gz: 3fb4104605525004fb295f301ccd082250cd2a5c6adf6b121250ebbf02d8931e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a8fe9e3f44a6bf76fe291dbec179fc6cd4bccd9cda4d494f2b064e7e6285568544ced4f8be2d31f54d51bcf0428059b17cfdd169738297d1b96a16f88647c3c8
|
|
7
|
+
data.tar.gz: aba06aefc6e28749d5693167005f1271993067fd1bee28ffcff7a7953c7d9c9dd4cf431e09e996a62fb8d7e622cedb605a4019012f6a20e639557fcb906117e7
|
data/ext/string_view/string_view.c
CHANGED
|
@@ -50,10 +50,7 @@ static void sv_compact(void *ptr) {
|
|
|
50
50
|
|
|
51
51
|
static void sv_free(void *ptr) {
|
|
52
52
|
string_view_t *sv = (string_view_t *)ptr;
|
|
53
|
-
|
|
54
|
-
xfree(sv->stride_idx->offsets);
|
|
55
|
-
xfree(sv->stride_idx);
|
|
56
|
-
}
|
|
53
|
+
sv_clear_stride_index(sv);
|
|
57
54
|
}
|
|
58
55
|
|
|
59
56
|
static size_t sv_memsize(const void *ptr) {
|
|
@@ -74,7 +71,9 @@ const rb_data_type_t string_view_type = {
|
|
|
74
71
|
/* Forward declarations for functions defined later in this file */
|
|
75
72
|
static long sv_char_count(string_view_t *sv);
|
|
76
73
|
static long sv_char_to_byte_offset(string_view_t *sv, long char_idx);
|
|
74
|
+
static long sv_char_count_partial(string_view_t *sv, const char *p, long len);
|
|
77
75
|
SV_INLINE int sv_single_byte_optimizable(string_view_t *sv);
|
|
76
|
+
SV_INLINE int sv_is_7bit(string_view_t *sv);
|
|
78
77
|
SV_INLINE int sv_is_utf8(string_view_t *sv);
|
|
79
78
|
static long sv_utf8_char_count(const char *p, long len);
|
|
80
79
|
|
|
@@ -119,6 +118,8 @@ SV_INLINE VALUE sv_new_from_parent_obj(VALUE parent_obj, string_view_t *parent,
|
|
|
119
118
|
sv->offset = offset;
|
|
120
119
|
sv->length = length;
|
|
121
120
|
sv->single_byte = parent->single_byte;
|
|
121
|
+
sv->valid_encoding = sv->single_byte == 1 ? 1 : -1;
|
|
122
|
+
sv->pooled = 0;
|
|
122
123
|
sv->charlen = -1;
|
|
123
124
|
sv->stride_idx = NULL;
|
|
124
125
|
/* Not frozen — see sv_initialize comment for rationale */
|
|
@@ -140,6 +141,8 @@ static VALUE sv_alloc(VALUE klass) {
|
|
|
140
141
|
sv->offset = 0;
|
|
141
142
|
sv->length = 0;
|
|
142
143
|
sv->single_byte = -1;
|
|
144
|
+
sv->valid_encoding = -1;
|
|
145
|
+
sv->pooled = 0;
|
|
143
146
|
sv->charlen = -1;
|
|
144
147
|
sv->stride_idx = NULL;
|
|
145
148
|
return obj;
|
|
@@ -217,6 +220,11 @@ static VALUE sv_reset(VALUE self, VALUE new_backing, VALUE voffset, VALUE vlengt
|
|
|
217
220
|
rb_check_frozen(self);
|
|
218
221
|
string_view_t *sv = sv_get_struct(self);
|
|
219
222
|
|
|
223
|
+
if (SV_UNLIKELY(sv->pooled)) {
|
|
224
|
+
rb_raise(rb_eRuntimeError,
|
|
225
|
+
"can't reset a pooled StringView directly; call StringView::Pool#reset! instead");
|
|
226
|
+
}
|
|
227
|
+
|
|
220
228
|
sv_check_frozen_string(new_backing);
|
|
221
229
|
|
|
222
230
|
long off = NUM2LONG(voffset);
|
|
@@ -224,10 +232,7 @@ static VALUE sv_reset(VALUE self, VALUE new_backing, VALUE voffset, VALUE vlengt
|
|
|
224
232
|
sv_check_bounds(off, len, RSTRING_LEN(new_backing));
|
|
225
233
|
|
|
226
234
|
/* Free old stride index before reinitializing */
|
|
227
|
-
|
|
228
|
-
xfree(sv->stride_idx->offsets);
|
|
229
|
-
xfree(sv->stride_idx);
|
|
230
|
-
}
|
|
235
|
+
sv_clear_stride_index(sv);
|
|
231
236
|
|
|
232
237
|
sv_init_fields(self, sv, new_backing, RSTRING_PTR(new_backing),
|
|
233
238
|
rb_enc_get(new_backing), off, len);
|
|
@@ -274,6 +279,94 @@ static VALUE sv_ascii_only_p(VALUE self) {
|
|
|
274
279
|
return Qtrue;
|
|
275
280
|
}
|
|
276
281
|
|
|
282
|
+
SV_INLINE long sv_precise_char_len(const char *p, const char *e, rb_encoding *enc) {
|
|
283
|
+
int len = rb_enc_precise_mbclen(p, e, enc);
|
|
284
|
+
if (MBCLEN_CHARFOUND_P(len)) return MBCLEN_CHARFOUND_LEN(len);
|
|
285
|
+
return 1;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
static int sv_compute_valid_encoding_slice(string_view_t *sv) {
|
|
289
|
+
if (sv_single_byte_optimizable(sv)) return 1;
|
|
290
|
+
|
|
291
|
+
if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
292
|
+
return simdutf_validate_utf8(sv_ptr(sv), (size_t)sv->length) ? 1 : 0;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
rb_encoding *enc = sv_enc(sv);
|
|
296
|
+
const char *p = sv_ptr(sv);
|
|
297
|
+
const char *e = p + sv->length;
|
|
298
|
+
|
|
299
|
+
while (p < e) {
|
|
300
|
+
int len = rb_enc_precise_mbclen(p, e, enc);
|
|
301
|
+
if (!MBCLEN_CHARFOUND_P(len)) return 0;
|
|
302
|
+
p += MBCLEN_CHARFOUND_LEN(len);
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
return 1;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
SV_INLINE int sv_valid_encoding_cached(string_view_t *sv) {
|
|
309
|
+
if (SV_LIKELY(sv->valid_encoding >= 0)) return sv->valid_encoding;
|
|
310
|
+
sv->valid_encoding = sv_compute_valid_encoding_slice(sv);
|
|
311
|
+
return sv->valid_encoding;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
static long sv_tolerant_char_count(const char *p, const char *e, rb_encoding *enc) {
|
|
315
|
+
long count = 0;
|
|
316
|
+
|
|
317
|
+
while (p < e) {
|
|
318
|
+
p += sv_precise_char_len(p, e, enc);
|
|
319
|
+
count++;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
return count;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
static long sv_tolerant_char_to_byte_offset(string_view_t *sv, long char_idx) {
|
|
326
|
+
rb_encoding *enc = sv_enc(sv);
|
|
327
|
+
const char *p = sv_ptr(sv);
|
|
328
|
+
const char *e = p + sv->length;
|
|
329
|
+
const char *start = p;
|
|
330
|
+
long i = 0;
|
|
331
|
+
|
|
332
|
+
while (i < char_idx && p < e) {
|
|
333
|
+
p += sv_precise_char_len(p, e, enc);
|
|
334
|
+
i++;
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
if (i < char_idx) return -1;
|
|
338
|
+
return p - start;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
static long sv_tolerant_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
|
|
342
|
+
rb_encoding *enc = sv_enc(sv);
|
|
343
|
+
const char *start = sv_ptr(sv) + byte_off;
|
|
344
|
+
const char *p = start;
|
|
345
|
+
const char *e = sv_ptr(sv) + sv->length;
|
|
346
|
+
long i = 0;
|
|
347
|
+
|
|
348
|
+
while (i < n && p < e) {
|
|
349
|
+
p += sv_precise_char_len(p, e, enc);
|
|
350
|
+
i++;
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
return p - start;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
SV_INLINE void sv_check_compatible_string(string_view_t *sv, VALUE other) {
|
|
357
|
+
rb_encoding *oenc = rb_enc_get(other);
|
|
358
|
+
|
|
359
|
+
if (sv->enc == oenc) return;
|
|
360
|
+
if (rb_enc_asciicompat(sv->enc) && rb_enc_asciicompat(oenc) &&
|
|
361
|
+
(sv_is_7bit(sv) || rb_enc_str_asciionly_p(other))) {
|
|
362
|
+
return;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
rb_raise(rb_eEncCompatError,
|
|
366
|
+
"incompatible character encodings: %s and %s",
|
|
367
|
+
rb_enc_name(sv->enc), rb_enc_name(oenc));
|
|
368
|
+
}
|
|
369
|
+
|
|
277
370
|
/* ========================================================================= */
|
|
278
371
|
/* Tier 1: Searching */
|
|
279
372
|
/* ========================================================================= */
|
|
@@ -284,6 +377,7 @@ static VALUE sv_include_p(VALUE self, VALUE substr) {
|
|
|
284
377
|
const char *p = sv_ptr(sv);
|
|
285
378
|
long slen = RSTRING_LEN(substr);
|
|
286
379
|
if (slen == 0) return Qtrue;
|
|
380
|
+
sv_check_compatible_string(sv, substr);
|
|
287
381
|
if (slen > sv->length) return Qfalse;
|
|
288
382
|
|
|
289
383
|
long pos = rb_memsearch(RSTRING_PTR(substr), slen, p, sv->length, sv_enc(sv));
|
|
@@ -299,6 +393,8 @@ static VALUE sv_start_with_p(int argc, VALUE *argv, VALUE self) {
|
|
|
299
393
|
VALUE prefix = argv[i];
|
|
300
394
|
StringValue(prefix);
|
|
301
395
|
long plen = RSTRING_LEN(prefix);
|
|
396
|
+
if (plen == 0) return Qtrue;
|
|
397
|
+
sv_check_compatible_string(sv, prefix);
|
|
302
398
|
if (plen > sv->length) continue;
|
|
303
399
|
if (memcmp(p, RSTRING_PTR(prefix), plen) == 0) return Qtrue;
|
|
304
400
|
}
|
|
@@ -314,6 +410,8 @@ static VALUE sv_end_with_p(int argc, VALUE *argv, VALUE self) {
|
|
|
314
410
|
VALUE suffix = argv[i];
|
|
315
411
|
StringValue(suffix);
|
|
316
412
|
long slen = RSTRING_LEN(suffix);
|
|
413
|
+
if (slen == 0) return Qtrue;
|
|
414
|
+
sv_check_compatible_string(sv, suffix);
|
|
317
415
|
if (slen > sv->length) continue;
|
|
318
416
|
if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) == 0)
|
|
319
417
|
return Qtrue;
|
|
@@ -348,12 +446,13 @@ VALUE sv_index(int argc, VALUE *argv, VALUE self) {
|
|
|
348
446
|
|
|
349
447
|
if (char_off < 0) char_off += total_chars;
|
|
350
448
|
if (char_off < 0 || char_off > total_chars) return Qnil;
|
|
449
|
+
if (plen == 0) return LONG2NUM(char_off);
|
|
450
|
+
sv_check_compatible_string(sv, pattern);
|
|
351
451
|
|
|
352
452
|
/* Convert char offset to byte offset */
|
|
353
453
|
long byte_off = sv_char_to_byte_offset(sv, char_off);
|
|
354
454
|
if (byte_off < 0) return Qnil;
|
|
355
455
|
|
|
356
|
-
if (plen == 0) return LONG2NUM(char_off);
|
|
357
456
|
if (plen > sv->length - byte_off) return Qnil;
|
|
358
457
|
|
|
359
458
|
long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
|
|
@@ -365,16 +464,8 @@ VALUE sv_index(int argc, VALUE *argv, VALUE self) {
|
|
|
365
464
|
if (sv_single_byte_optimizable(sv)) {
|
|
366
465
|
return LONG2NUM(char_off + pos);
|
|
367
466
|
}
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
long chars = sv_utf8_char_count(p + byte_off, pos);
|
|
371
|
-
return LONG2NUM(char_off + chars);
|
|
372
|
-
}
|
|
373
|
-
rb_encoding *enc = sv_enc(sv);
|
|
374
|
-
const char *s = p + byte_off;
|
|
375
|
-
const char *e = s + pos;
|
|
376
|
-
long chars = rb_enc_strlen(s, e, enc);
|
|
377
|
-
return LONG2NUM(char_off + chars);
|
|
467
|
+
|
|
468
|
+
return LONG2NUM(char_off + sv_char_count_partial(sv, p + byte_off, pos));
|
|
378
469
|
}
|
|
379
470
|
|
|
380
471
|
/*
|
|
@@ -413,6 +504,7 @@ VALUE sv_rindex(int argc, VALUE *argv, VALUE self) {
|
|
|
413
504
|
if (plen == 0) {
|
|
414
505
|
return LONG2NUM(max_char > total_chars ? total_chars : max_char);
|
|
415
506
|
}
|
|
507
|
+
sv_check_compatible_string(sv, pattern);
|
|
416
508
|
if (plen > sv->length) return Qnil;
|
|
417
509
|
|
|
418
510
|
/* Convert max_char to a byte limit */
|
|
@@ -435,11 +527,7 @@ VALUE sv_rindex(int argc, VALUE *argv, VALUE self) {
|
|
|
435
527
|
if (sv_single_byte_optimizable(sv)) {
|
|
436
528
|
return LONG2NUM(byte_pos);
|
|
437
529
|
}
|
|
438
|
-
|
|
439
|
-
return LONG2NUM(sv_utf8_char_count(p, byte_pos));
|
|
440
|
-
}
|
|
441
|
-
rb_encoding *enc = sv_enc(sv);
|
|
442
|
-
return LONG2NUM(rb_enc_strlen(p, s, enc));
|
|
530
|
+
return LONG2NUM(sv_char_count_partial(sv, p, byte_pos));
|
|
443
531
|
}
|
|
444
532
|
/* Move back one character */
|
|
445
533
|
if (s == p) break;
|
|
@@ -486,6 +574,7 @@ VALUE sv_byteindex(int argc, VALUE *argv, VALUE self) {
|
|
|
486
574
|
if (byte_off < 0) byte_off += sv->length;
|
|
487
575
|
if (byte_off < 0 || byte_off > sv->length) return Qnil;
|
|
488
576
|
if (plen == 0) return LONG2NUM(byte_off);
|
|
577
|
+
sv_check_compatible_string(sv, pattern);
|
|
489
578
|
if (plen > sv->length - byte_off) return Qnil;
|
|
490
579
|
|
|
491
580
|
long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
|
|
@@ -526,6 +615,7 @@ VALUE sv_byterindex(int argc, VALUE *argv, VALUE self) {
|
|
|
526
615
|
}
|
|
527
616
|
|
|
528
617
|
if (plen == 0) return LONG2NUM(max_byte > sv->length ? sv->length : max_byte);
|
|
618
|
+
sv_check_compatible_string(sv, pattern);
|
|
529
619
|
if (plen > sv->length) return Qnil;
|
|
530
620
|
|
|
531
621
|
long search_end = max_byte;
|
|
@@ -633,6 +723,11 @@ typedef struct {
|
|
|
633
723
|
char *ptr;
|
|
634
724
|
} sv_cstr_t;
|
|
635
725
|
|
|
726
|
+
typedef struct {
|
|
727
|
+
sv_cstr_t *cs;
|
|
728
|
+
int base;
|
|
729
|
+
} sv_inum_args_t;
|
|
730
|
+
|
|
636
731
|
SV_INLINE void sv_cstr_init(sv_cstr_t *cs, string_view_t *sv) {
|
|
637
732
|
const char *p = sv_ptr(sv);
|
|
638
733
|
long len = sv->length;
|
|
@@ -653,6 +748,22 @@ SV_INLINE void sv_cstr_free(sv_cstr_t *cs) {
|
|
|
653
748
|
}
|
|
654
749
|
}
|
|
655
750
|
|
|
751
|
+
static VALUE sv_cstr_free_ensure(VALUE arg) {
|
|
752
|
+
sv_cstr_free((sv_cstr_t *)arg);
|
|
753
|
+
return Qnil;
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
static VALUE sv_to_i_body(VALUE arg) {
|
|
757
|
+
sv_inum_args_t *args = (sv_inum_args_t *)arg;
|
|
758
|
+
return rb_cstr_to_inum(args->cs->ptr, args->base, 0);
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
static VALUE sv_to_f_body(VALUE arg) {
|
|
762
|
+
sv_cstr_t *cs = (sv_cstr_t *)arg;
|
|
763
|
+
double d = rb_cstr_to_dbl(cs->ptr, 0);
|
|
764
|
+
return DBL2NUM(d);
|
|
765
|
+
}
|
|
766
|
+
|
|
656
767
|
/*
|
|
657
768
|
* to_i([base]) — parse integer directly from byte pointer, zero allocations.
|
|
658
769
|
* Uses rb_cstr_to_inum which parses from a NUL-terminated C string.
|
|
@@ -663,10 +774,12 @@ static VALUE sv_to_i(int argc, VALUE *argv, VALUE self) {
|
|
|
663
774
|
if (argc > 0) base = NUM2INT(argv[0]);
|
|
664
775
|
|
|
665
776
|
sv_cstr_t cs;
|
|
777
|
+
sv_inum_args_t args;
|
|
666
778
|
sv_cstr_init(&cs, sv);
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
return
|
|
779
|
+
args.cs = &cs;
|
|
780
|
+
args.base = base;
|
|
781
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
782
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
670
783
|
}
|
|
671
784
|
|
|
672
785
|
/*
|
|
@@ -676,9 +789,8 @@ static VALUE sv_to_f(VALUE self) {
|
|
|
676
789
|
string_view_t *sv = sv_get_struct(self);
|
|
677
790
|
sv_cstr_t cs;
|
|
678
791
|
sv_cstr_init(&cs, sv);
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
return DBL2NUM(d);
|
|
792
|
+
return rb_ensure(sv_to_f_body, (VALUE)&cs,
|
|
793
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
682
794
|
}
|
|
683
795
|
|
|
684
796
|
/*
|
|
@@ -687,10 +799,12 @@ static VALUE sv_to_f(VALUE self) {
|
|
|
687
799
|
static VALUE sv_hex(VALUE self) {
|
|
688
800
|
string_view_t *sv = sv_get_struct(self);
|
|
689
801
|
sv_cstr_t cs;
|
|
802
|
+
sv_inum_args_t args;
|
|
690
803
|
sv_cstr_init(&cs, sv);
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
return
|
|
804
|
+
args.cs = &cs;
|
|
805
|
+
args.base = 16;
|
|
806
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
807
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
694
808
|
}
|
|
695
809
|
|
|
696
810
|
/*
|
|
@@ -699,10 +813,12 @@ static VALUE sv_hex(VALUE self) {
|
|
|
699
813
|
static VALUE sv_oct(VALUE self) {
|
|
700
814
|
string_view_t *sv = sv_get_struct(self);
|
|
701
815
|
sv_cstr_t cs;
|
|
816
|
+
sv_inum_args_t args;
|
|
702
817
|
sv_cstr_init(&cs, sv);
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
return
|
|
818
|
+
args.cs = &cs;
|
|
819
|
+
args.base = 8;
|
|
820
|
+
return rb_ensure(sv_to_i_body, (VALUE)&args,
|
|
821
|
+
sv_cstr_free_ensure, (VALUE)&cs);
|
|
706
822
|
}
|
|
707
823
|
|
|
708
824
|
/* ========================================================================= */
|
|
@@ -714,11 +830,10 @@ static VALUE sv_oct(VALUE self) {
|
|
|
714
830
|
* Uses the single_byte cache when available.
|
|
715
831
|
*/
|
|
716
832
|
SV_INLINE int sv_is_7bit(string_view_t *sv) {
|
|
717
|
-
|
|
718
|
-
const char *p = sv_ptr(sv);
|
|
833
|
+
const unsigned char *p = (const unsigned char *)sv_ptr(sv);
|
|
719
834
|
long i;
|
|
720
835
|
for (i = 0; i < sv->length; i++) {
|
|
721
|
-
if (
|
|
836
|
+
if (p[i] > 127) return 0;
|
|
722
837
|
}
|
|
723
838
|
return 1;
|
|
724
839
|
}
|
|
@@ -1008,22 +1123,11 @@ static long sv_char_to_byte_offset(string_view_t *sv, long char_idx) {
|
|
|
1008
1123
|
return char_idx;
|
|
1009
1124
|
}
|
|
1010
1125
|
|
|
1011
|
-
if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1126
|
+
if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
1012
1127
|
return sv_utf8_char_to_byte_offset_indexed(sv, char_idx);
|
|
1013
1128
|
}
|
|
1014
1129
|
|
|
1015
|
-
|
|
1016
|
-
const char *p = sv_ptr(sv);
|
|
1017
|
-
const char *e = p + sv->length;
|
|
1018
|
-
const char *start = p;
|
|
1019
|
-
long i;
|
|
1020
|
-
|
|
1021
|
-
for (i = 0; i < char_idx && p < e; i++) {
|
|
1022
|
-
p += rb_enc_fast_mbclen(p, e, enc);
|
|
1023
|
-
}
|
|
1024
|
-
|
|
1025
|
-
if (i < char_idx) return -1;
|
|
1026
|
-
return p - start;
|
|
1130
|
+
return sv_tolerant_char_to_byte_offset(sv, char_idx);
|
|
1027
1131
|
}
|
|
1028
1132
|
|
|
1029
1133
|
static long sv_char_count(string_view_t *sv) {
|
|
@@ -1033,12 +1137,11 @@ static long sv_char_count(string_view_t *sv) {
|
|
|
1033
1137
|
long count;
|
|
1034
1138
|
if (sv_single_byte_optimizable(sv)) {
|
|
1035
1139
|
count = sv->length;
|
|
1036
|
-
} else if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1140
|
+
} else if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
1037
1141
|
count = sv_utf8_char_count(sv_ptr(sv), sv->length);
|
|
1038
1142
|
} else {
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
count = rb_enc_strlen(p, p + sv->length, enc);
|
|
1143
|
+
count = sv_tolerant_char_count(sv_ptr(sv), sv_ptr(sv) + sv->length,
|
|
1144
|
+
sv_enc(sv));
|
|
1042
1145
|
}
|
|
1043
1146
|
|
|
1044
1147
|
sv->charlen = count;
|
|
@@ -1051,20 +1154,20 @@ static long sv_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
|
|
|
1051
1154
|
return n < remaining ? n : remaining;
|
|
1052
1155
|
}
|
|
1053
1156
|
|
|
1054
|
-
if (SV_LIKELY(sv_is_utf8(sv))) {
|
|
1157
|
+
if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
1055
1158
|
return sv_utf8_chars_to_bytes(sv_ptr(sv), sv->length, byte_off, n);
|
|
1056
1159
|
}
|
|
1057
1160
|
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
const char *e = sv_ptr(sv) + sv->length;
|
|
1061
|
-
long i;
|
|
1062
|
-
const char *start = p;
|
|
1161
|
+
return sv_tolerant_chars_to_bytes(sv, byte_off, n);
|
|
1162
|
+
}
|
|
1063
1163
|
|
|
1064
|
-
|
|
1065
|
-
|
|
1164
|
+
static long sv_char_count_partial(string_view_t *sv, const char *p, long len) {
|
|
1165
|
+
if (len <= 0) return 0;
|
|
1166
|
+
if (sv_single_byte_optimizable(sv)) return len;
|
|
1167
|
+
if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
|
|
1168
|
+
return sv_utf8_char_count(p, len);
|
|
1066
1169
|
}
|
|
1067
|
-
return p
|
|
1170
|
+
return sv_tolerant_char_count(p, p + len, sv_enc(sv));
|
|
1068
1171
|
}
|
|
1069
1172
|
|
|
1070
1173
|
static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
@@ -1086,7 +1189,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
1086
1189
|
long total = sv->length;
|
|
1087
1190
|
if (idx < 0) idx += total;
|
|
1088
1191
|
if (SV_UNLIKELY(idx < 0 || idx > total || len < 0)) return Qnil;
|
|
1089
|
-
if (
|
|
1192
|
+
if (len > total - idx) len = total - idx;
|
|
1090
1193
|
return sv_new_from_parent_obj(self, sv,
|
|
1091
1194
|
sv->offset + idx,
|
|
1092
1195
|
len);
|
|
@@ -1108,7 +1211,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
1108
1211
|
|
|
1109
1212
|
/* Clamp len to remaining characters */
|
|
1110
1213
|
long total_chars = sv_char_count(sv);
|
|
1111
|
-
if (
|
|
1214
|
+
if (len > total_chars - idx) len = total_chars - idx;
|
|
1112
1215
|
|
|
1113
1216
|
long byte_end = sv_char_to_byte_offset(sv, idx + len);
|
|
1114
1217
|
long byte_len = byte_end - byte_off;
|
|
@@ -1159,6 +1262,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
|
|
|
1159
1262
|
if (slen == 0) {
|
|
1160
1263
|
return sv_new_from_parent_obj(self, sv, sv->offset, 0);
|
|
1161
1264
|
}
|
|
1265
|
+
sv_check_compatible_string(sv, arg1);
|
|
1162
1266
|
if (slen > sv->length) return Qnil;
|
|
1163
1267
|
|
|
1164
1268
|
long pos = rb_memsearch(RSTRING_PTR(arg1), slen, p, sv->length, sv_enc(sv));
|
|
@@ -1206,7 +1310,7 @@ static VALUE sv_byteslice(int argc, VALUE *argv, VALUE self) {
|
|
|
1206
1310
|
if (off < 0) off += sv->length;
|
|
1207
1311
|
if (off < 0 || off > sv->length) return Qnil;
|
|
1208
1312
|
if (len < 0) return Qnil;
|
|
1209
|
-
if (
|
|
1313
|
+
if (len > sv->length - off) len = sv->length - off;
|
|
1210
1314
|
|
|
1211
1315
|
return sv_new_from_parent_obj(self, sv, sv->offset + off, len);
|
|
1212
1316
|
}
|
|
@@ -1435,8 +1539,9 @@ static VALUE sv_delete_prefix(VALUE self, VALUE prefix) {
|
|
|
1435
1539
|
const char *p = sv_ptr(sv);
|
|
1436
1540
|
long plen = RSTRING_LEN(prefix);
|
|
1437
1541
|
|
|
1438
|
-
if (plen > sv->length) return self;
|
|
1439
1542
|
if (plen == 0) return self;
|
|
1543
|
+
sv_check_compatible_string(sv, prefix);
|
|
1544
|
+
if (plen > sv->length) return self;
|
|
1440
1545
|
if (memcmp(p, RSTRING_PTR(prefix), plen) != 0) return self;
|
|
1441
1546
|
|
|
1442
1547
|
return sv_new_from_parent_obj(self, sv, sv->offset + plen, sv->length - plen);
|
|
@@ -1453,8 +1558,9 @@ static VALUE sv_delete_suffix(VALUE self, VALUE suffix) {
|
|
|
1453
1558
|
const char *p = sv_ptr(sv);
|
|
1454
1559
|
long slen = RSTRING_LEN(suffix);
|
|
1455
1560
|
|
|
1456
|
-
if (slen > sv->length) return self;
|
|
1457
1561
|
if (slen == 0) return self;
|
|
1562
|
+
sv_check_compatible_string(sv, suffix);
|
|
1563
|
+
if (slen > sv->length) return self;
|
|
1458
1564
|
if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) != 0) return self;
|
|
1459
1565
|
|
|
1460
1566
|
return sv_new_from_parent_obj(self, sv, sv->offset, sv->length - slen);
|
|
@@ -1505,16 +1611,7 @@ static VALUE sv_ord(VALUE self) {
|
|
|
1505
1611
|
*/
|
|
1506
1612
|
static VALUE sv_valid_encoding_p(VALUE self) {
|
|
1507
1613
|
string_view_t *sv = sv_get_struct(self);
|
|
1508
|
-
|
|
1509
|
-
const char *p = sv_ptr(sv);
|
|
1510
|
-
const char *e = p + sv->length;
|
|
1511
|
-
|
|
1512
|
-
while (p < e) {
|
|
1513
|
-
int len = rb_enc_precise_mbclen(p, e, enc);
|
|
1514
|
-
if (!MBCLEN_CHARFOUND_P(len)) return Qfalse;
|
|
1515
|
-
p += MBCLEN_CHARFOUND_LEN(len);
|
|
1516
|
-
}
|
|
1517
|
-
return Qtrue;
|
|
1614
|
+
return sv_valid_encoding_cached(sv) ? Qtrue : Qfalse;
|
|
1518
1615
|
}
|
|
1519
1616
|
|
|
1520
1617
|
/*
|
data/ext/string_view/string_view.h
CHANGED
|
@@ -39,6 +39,8 @@ typedef struct {
|
|
|
39
39
|
long length; /* byte length of this view */
|
|
40
40
|
long charlen; /* cached character count; -1 = not yet computed */
|
|
41
41
|
int single_byte; /* cached: 1 if char==byte (ASCII/single-byte enc), 0 if multibyte, -1 unknown */
|
|
42
|
+
int valid_encoding; /* cached slice validity: 1 valid, 0 invalid, -1 unknown */
|
|
43
|
+
int pooled; /* 1 if owned by StringView::Pool and subject to reuse */
|
|
42
44
|
stride_index_t *stride_idx; /* lazily built stride index for multibyte, NULL if not built */
|
|
43
45
|
} string_view_t;
|
|
44
46
|
|
|
@@ -83,6 +85,14 @@ SV_INLINE void sv_check_bounds(long off, long len, long backing_len) {
|
|
|
83
85
|
}
|
|
84
86
|
}
|
|
85
87
|
|
|
88
|
+
SV_INLINE void sv_clear_stride_index(string_view_t *sv) {
|
|
89
|
+
if (sv->stride_idx) {
|
|
90
|
+
xfree(sv->stride_idx->offsets);
|
|
91
|
+
xfree(sv->stride_idx);
|
|
92
|
+
sv->stride_idx = NULL;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
86
96
|
/*
|
|
87
97
|
* Initialize (or reinitialize) a string_view_t's fields from a frozen backing
|
|
88
98
|
* string. Caller is responsible for freeing any prior stride_idx.
|
|
@@ -96,6 +106,8 @@ SV_INLINE void sv_init_fields(VALUE obj, string_view_t *sv, VALUE backing,
|
|
|
96
106
|
sv->offset = offset;
|
|
97
107
|
sv->length = length;
|
|
98
108
|
sv->single_byte = sv_compute_single_byte(backing, enc);
|
|
109
|
+
sv->valid_encoding = sv->single_byte == 1 ? 1 : -1;
|
|
110
|
+
sv->pooled = 0;
|
|
99
111
|
sv->charlen = -1;
|
|
100
112
|
sv->stride_idx = NULL;
|
|
101
113
|
}
|
data/ext/string_view/string_view_core_ext.c
CHANGED
|
@@ -4,41 +4,26 @@
|
|
|
4
4
|
/* StringView::CoreExt — module with String#view, included on demand */
|
|
5
5
|
/* ========================================================================= */
|
|
6
6
|
|
|
7
|
-
/* ObjectSpace::WeakKeyMap caching String → Pool.
|
|
8
|
-
* Keys (strings) are held weakly — when a string is GC'd, its entry
|
|
9
|
-
* is automatically removed. Values (pools) are held strongly. */
|
|
10
|
-
static VALUE pool_cache;
|
|
11
|
-
static ID id_aref;
|
|
12
|
-
static ID id_aset;
|
|
13
|
-
|
|
14
7
|
/*
|
|
15
8
|
* view(byte_offset, byte_length) → StringView
|
|
16
9
|
*
|
|
17
10
|
* Returns a StringView into this string at the given byte range.
|
|
18
|
-
*
|
|
19
|
-
*
|
|
11
|
+
* The backing string is frozen in place so StringView can safely reference it.
|
|
12
|
+
* Each call returns a fresh StringView; callers that want explicit object reuse
|
|
13
|
+
* should opt into StringView::Pool directly.
|
|
20
14
|
*/
|
|
21
15
|
static VALUE string_view_method(VALUE self, VALUE voffset, VALUE vlength) {
|
|
16
|
+
VALUE args[3];
|
|
17
|
+
|
|
22
18
|
rb_str_freeze(self);
|
|
23
19
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
}
|
|
29
|
-
return pool_view(pool, voffset, vlength);
|
|
20
|
+
args[0] = self;
|
|
21
|
+
args[1] = voffset;
|
|
22
|
+
args[2] = vlength;
|
|
23
|
+
return rb_class_new_instance(3, args, cStringView);
|
|
30
24
|
}
|
|
31
25
|
|
|
32
26
|
void Init_string_view_core_ext(void) {
|
|
33
|
-
id_aref = rb_intern("[]");
|
|
34
|
-
id_aset = rb_intern("[]=");
|
|
35
|
-
|
|
36
|
-
VALUE cWeakKeyMap = rb_const_get(
|
|
37
|
-
rb_const_get(rb_cObject, rb_intern("ObjectSpace")),
|
|
38
|
-
rb_intern("WeakKeyMap"));
|
|
39
|
-
pool_cache = rb_class_new_instance(0, NULL, cWeakKeyMap);
|
|
40
|
-
rb_gc_register_mark_object(pool_cache);
|
|
41
|
-
|
|
42
27
|
VALUE mCoreExt = rb_define_module_under(cStringView, "CoreExt");
|
|
43
28
|
rb_define_method(mCoreExt, "view", string_view_method, 2);
|
|
44
29
|
}
|
data/ext/string_view/string_view_pool.c
CHANGED
|
@@ -73,6 +73,7 @@ static void pool_grow(sv_pool_t *pool, VALUE pool_obj) {
|
|
|
73
73
|
VALUE obj = TypedData_Make_Struct(cStringView, string_view_t,
|
|
74
74
|
&string_view_type, sv);
|
|
75
75
|
sv_init_fields(obj, sv, pool->backing, pool->base, pool->enc, 0, 0);
|
|
76
|
+
sv->pooled = 1;
|
|
76
77
|
rb_ary_push(pool->views, obj);
|
|
77
78
|
}
|
|
78
79
|
|
|
@@ -145,12 +146,15 @@ VALUE pool_view(VALUE self, VALUE voffset, VALUE vlength) {
|
|
|
145
146
|
VALUE view = RARRAY_AREF(pool->views, pool->next_idx);
|
|
146
147
|
pool->next_idx++;
|
|
147
148
|
|
|
149
|
+
if (SV_UNLIKELY(OBJ_FROZEN(view))) {
|
|
150
|
+
rb_raise(rb_eFrozenError,
|
|
151
|
+
"can't reuse a frozen pooled StringView; materialize it before freezing");
|
|
152
|
+
}
|
|
153
|
+
|
|
148
154
|
string_view_t *sv = (string_view_t *)RTYPEDDATA_GET_DATA(view);
|
|
149
|
-
sv
|
|
150
|
-
sv->
|
|
151
|
-
sv->
|
|
152
|
-
sv->charlen = -1; /* invalidate cached char count */
|
|
153
|
-
sv->stride_idx = NULL; /* invalidate stride index */
|
|
155
|
+
sv_clear_stride_index(sv);
|
|
156
|
+
sv_init_fields(view, sv, pool->backing, pool->base, pool->enc, off, len);
|
|
157
|
+
sv->pooled = 1;
|
|
154
158
|
|
|
155
159
|
return view;
|
|
156
160
|
}
|
data/lib/string_view/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: string_view
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Shopify
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: StringView provides a read-only, zero-copy view into a frozen Ruby String,
|
|
14
14
|
avoiding intermediate allocations for slicing, searching, and delegation of transform
|