string_view 0.2.0-arm64-darwin → 0.2.1-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22447417d5acbfed6e078cb4044ccc068dc3fe9e29755b4905dd5dfe1e9c7fdb
4
- data.tar.gz: b0c01043028e21047c2213a04b8b470e1a72b583c33201f2c193fe7ee9d55b4a
3
+ metadata.gz: c065311c686e919fd36b47072054046043f90429ce5270c1a4eb8ce765b6bb19
4
+ data.tar.gz: 60af853bedd5f8d390f175ad62451d30bbfdf361b8dd1ac4a7816f60c4c4206e
5
5
  SHA512:
6
- metadata.gz: 505edc71b583f32a264dd87dcad3996431a444738a7c615d58cc9b48e6d388315ad9d53e0d4aa557caf6da75ba29e9f139a448b5ee37f46a11b67013761072d7
7
- data.tar.gz: dff667871cf974bcda93d2e3071e27de7b9fd6d1853fe64e61778a9eae8f1ddab58a567667b5d5e756b26864ee4885f4821299c0c9e04833c258d6c133cc1306
6
+ metadata.gz: 04fd7cdbdd45671c04b99b2893e22cccb74adea90f442714b2a9069ffe2f578a6343827db24f7368802e6550cec284cf722f29a533ed3aaef6952aa4108a9190
7
+ data.tar.gz: f8c0af288757b5775872c3bfc26fcedf53818c998dce35b052b48fa5ac7cd7b9cc2579e0fe7f0c1a18ffb4533c2b529b3c34161a31d97ca496d414ad1f12475a
@@ -50,10 +50,7 @@ static void sv_compact(void *ptr) {
50
50
 
51
51
  static void sv_free(void *ptr) {
52
52
  string_view_t *sv = (string_view_t *)ptr;
53
- if (sv->stride_idx) {
54
- xfree(sv->stride_idx->offsets);
55
- xfree(sv->stride_idx);
56
- }
53
+ sv_clear_stride_index(sv);
57
54
  }
58
55
 
59
56
  static size_t sv_memsize(const void *ptr) {
@@ -74,7 +71,9 @@ const rb_data_type_t string_view_type = {
74
71
  /* Forward declarations for functions defined later in this file */
75
72
  static long sv_char_count(string_view_t *sv);
76
73
  static long sv_char_to_byte_offset(string_view_t *sv, long char_idx);
74
+ static long sv_char_count_partial(string_view_t *sv, const char *p, long len);
77
75
  SV_INLINE int sv_single_byte_optimizable(string_view_t *sv);
76
+ SV_INLINE int sv_is_7bit(string_view_t *sv);
78
77
  SV_INLINE int sv_is_utf8(string_view_t *sv);
79
78
  static long sv_utf8_char_count(const char *p, long len);
80
79
 
@@ -119,6 +118,8 @@ SV_INLINE VALUE sv_new_from_parent_obj(VALUE parent_obj, string_view_t *parent,
119
118
  sv->offset = offset;
120
119
  sv->length = length;
121
120
  sv->single_byte = parent->single_byte;
121
+ sv->valid_encoding = sv->single_byte == 1 ? 1 : -1;
122
+ sv->pooled = 0;
122
123
  sv->charlen = -1;
123
124
  sv->stride_idx = NULL;
124
125
  /* Not frozen — see sv_initialize comment for rationale */
@@ -140,6 +141,8 @@ static VALUE sv_alloc(VALUE klass) {
140
141
  sv->offset = 0;
141
142
  sv->length = 0;
142
143
  sv->single_byte = -1;
144
+ sv->valid_encoding = -1;
145
+ sv->pooled = 0;
143
146
  sv->charlen = -1;
144
147
  sv->stride_idx = NULL;
145
148
  return obj;
@@ -217,6 +220,11 @@ static VALUE sv_reset(VALUE self, VALUE new_backing, VALUE voffset, VALUE vlengt
217
220
  rb_check_frozen(self);
218
221
  string_view_t *sv = sv_get_struct(self);
219
222
 
223
+ if (SV_UNLIKELY(sv->pooled)) {
224
+ rb_raise(rb_eRuntimeError,
225
+ "can't reset a pooled StringView directly; call StringView::Pool#reset! instead");
226
+ }
227
+
220
228
  sv_check_frozen_string(new_backing);
221
229
 
222
230
  long off = NUM2LONG(voffset);
@@ -224,10 +232,7 @@ static VALUE sv_reset(VALUE self, VALUE new_backing, VALUE voffset, VALUE vlengt
224
232
  sv_check_bounds(off, len, RSTRING_LEN(new_backing));
225
233
 
226
234
  /* Free old stride index before reinitializing */
227
- if (sv->stride_idx) {
228
- xfree(sv->stride_idx->offsets);
229
- xfree(sv->stride_idx);
230
- }
235
+ sv_clear_stride_index(sv);
231
236
 
232
237
  sv_init_fields(self, sv, new_backing, RSTRING_PTR(new_backing),
233
238
  rb_enc_get(new_backing), off, len);
@@ -274,6 +279,94 @@ static VALUE sv_ascii_only_p(VALUE self) {
274
279
  return Qtrue;
275
280
  }
276
281
 
282
+ SV_INLINE long sv_precise_char_len(const char *p, const char *e, rb_encoding *enc) {
283
+ int len = rb_enc_precise_mbclen(p, e, enc);
284
+ if (MBCLEN_CHARFOUND_P(len)) return MBCLEN_CHARFOUND_LEN(len);
285
+ return 1;
286
+ }
287
+
288
+ static int sv_compute_valid_encoding_slice(string_view_t *sv) {
289
+ if (sv_single_byte_optimizable(sv)) return 1;
290
+
291
+ if (SV_LIKELY(sv_is_utf8(sv))) {
292
+ return simdutf_validate_utf8(sv_ptr(sv), (size_t)sv->length) ? 1 : 0;
293
+ }
294
+
295
+ rb_encoding *enc = sv_enc(sv);
296
+ const char *p = sv_ptr(sv);
297
+ const char *e = p + sv->length;
298
+
299
+ while (p < e) {
300
+ int len = rb_enc_precise_mbclen(p, e, enc);
301
+ if (!MBCLEN_CHARFOUND_P(len)) return 0;
302
+ p += MBCLEN_CHARFOUND_LEN(len);
303
+ }
304
+
305
+ return 1;
306
+ }
307
+
308
+ SV_INLINE int sv_valid_encoding_cached(string_view_t *sv) {
309
+ if (SV_LIKELY(sv->valid_encoding >= 0)) return sv->valid_encoding;
310
+ sv->valid_encoding = sv_compute_valid_encoding_slice(sv);
311
+ return sv->valid_encoding;
312
+ }
313
+
314
+ static long sv_tolerant_char_count(const char *p, const char *e, rb_encoding *enc) {
315
+ long count = 0;
316
+
317
+ while (p < e) {
318
+ p += sv_precise_char_len(p, e, enc);
319
+ count++;
320
+ }
321
+
322
+ return count;
323
+ }
324
+
325
+ static long sv_tolerant_char_to_byte_offset(string_view_t *sv, long char_idx) {
326
+ rb_encoding *enc = sv_enc(sv);
327
+ const char *p = sv_ptr(sv);
328
+ const char *e = p + sv->length;
329
+ const char *start = p;
330
+ long i = 0;
331
+
332
+ while (i < char_idx && p < e) {
333
+ p += sv_precise_char_len(p, e, enc);
334
+ i++;
335
+ }
336
+
337
+ if (i < char_idx) return -1;
338
+ return p - start;
339
+ }
340
+
341
+ static long sv_tolerant_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
342
+ rb_encoding *enc = sv_enc(sv);
343
+ const char *start = sv_ptr(sv) + byte_off;
344
+ const char *p = start;
345
+ const char *e = sv_ptr(sv) + sv->length;
346
+ long i = 0;
347
+
348
+ while (i < n && p < e) {
349
+ p += sv_precise_char_len(p, e, enc);
350
+ i++;
351
+ }
352
+
353
+ return p - start;
354
+ }
355
+
356
+ SV_INLINE void sv_check_compatible_string(string_view_t *sv, VALUE other) {
357
+ rb_encoding *oenc = rb_enc_get(other);
358
+
359
+ if (sv->enc == oenc) return;
360
+ if (rb_enc_asciicompat(sv->enc) && rb_enc_asciicompat(oenc) &&
361
+ (sv_is_7bit(sv) || rb_enc_str_asciionly_p(other))) {
362
+ return;
363
+ }
364
+
365
+ rb_raise(rb_eEncCompatError,
366
+ "incompatible character encodings: %s and %s",
367
+ rb_enc_name(sv->enc), rb_enc_name(oenc));
368
+ }
369
+
277
370
  /* ========================================================================= */
278
371
  /* Tier 1: Searching */
279
372
  /* ========================================================================= */
@@ -284,6 +377,7 @@ static VALUE sv_include_p(VALUE self, VALUE substr) {
284
377
  const char *p = sv_ptr(sv);
285
378
  long slen = RSTRING_LEN(substr);
286
379
  if (slen == 0) return Qtrue;
380
+ sv_check_compatible_string(sv, substr);
287
381
  if (slen > sv->length) return Qfalse;
288
382
 
289
383
  long pos = rb_memsearch(RSTRING_PTR(substr), slen, p, sv->length, sv_enc(sv));
@@ -299,6 +393,8 @@ static VALUE sv_start_with_p(int argc, VALUE *argv, VALUE self) {
299
393
  VALUE prefix = argv[i];
300
394
  StringValue(prefix);
301
395
  long plen = RSTRING_LEN(prefix);
396
+ if (plen == 0) return Qtrue;
397
+ sv_check_compatible_string(sv, prefix);
302
398
  if (plen > sv->length) continue;
303
399
  if (memcmp(p, RSTRING_PTR(prefix), plen) == 0) return Qtrue;
304
400
  }
@@ -314,6 +410,8 @@ static VALUE sv_end_with_p(int argc, VALUE *argv, VALUE self) {
314
410
  VALUE suffix = argv[i];
315
411
  StringValue(suffix);
316
412
  long slen = RSTRING_LEN(suffix);
413
+ if (slen == 0) return Qtrue;
414
+ sv_check_compatible_string(sv, suffix);
317
415
  if (slen > sv->length) continue;
318
416
  if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) == 0)
319
417
  return Qtrue;
@@ -348,12 +446,13 @@ VALUE sv_index(int argc, VALUE *argv, VALUE self) {
348
446
 
349
447
  if (char_off < 0) char_off += total_chars;
350
448
  if (char_off < 0 || char_off > total_chars) return Qnil;
449
+ if (plen == 0) return LONG2NUM(char_off);
450
+ sv_check_compatible_string(sv, pattern);
351
451
 
352
452
  /* Convert char offset to byte offset */
353
453
  long byte_off = sv_char_to_byte_offset(sv, char_off);
354
454
  if (byte_off < 0) return Qnil;
355
455
 
356
- if (plen == 0) return LONG2NUM(char_off);
357
456
  if (plen > sv->length - byte_off) return Qnil;
358
457
 
359
458
  long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
@@ -365,16 +464,8 @@ VALUE sv_index(int argc, VALUE *argv, VALUE self) {
365
464
  if (sv_single_byte_optimizable(sv)) {
366
465
  return LONG2NUM(char_off + pos);
367
466
  }
368
- /* Count chars from byte_off to byte_off+pos */
369
- if (sv_is_utf8(sv)) {
370
- long chars = sv_utf8_char_count(p + byte_off, pos);
371
- return LONG2NUM(char_off + chars);
372
- }
373
- rb_encoding *enc = sv_enc(sv);
374
- const char *s = p + byte_off;
375
- const char *e = s + pos;
376
- long chars = rb_enc_strlen(s, e, enc);
377
- return LONG2NUM(char_off + chars);
467
+
468
+ return LONG2NUM(char_off + sv_char_count_partial(sv, p + byte_off, pos));
378
469
  }
379
470
 
380
471
  /*
@@ -413,6 +504,7 @@ VALUE sv_rindex(int argc, VALUE *argv, VALUE self) {
413
504
  if (plen == 0) {
414
505
  return LONG2NUM(max_char > total_chars ? total_chars : max_char);
415
506
  }
507
+ sv_check_compatible_string(sv, pattern);
416
508
  if (plen > sv->length) return Qnil;
417
509
 
418
510
  /* Convert max_char to a byte limit */
@@ -435,11 +527,7 @@ VALUE sv_rindex(int argc, VALUE *argv, VALUE self) {
435
527
  if (sv_single_byte_optimizable(sv)) {
436
528
  return LONG2NUM(byte_pos);
437
529
  }
438
- if (sv_is_utf8(sv)) {
439
- return LONG2NUM(sv_utf8_char_count(p, byte_pos));
440
- }
441
- rb_encoding *enc = sv_enc(sv);
442
- return LONG2NUM(rb_enc_strlen(p, s, enc));
530
+ return LONG2NUM(sv_char_count_partial(sv, p, byte_pos));
443
531
  }
444
532
  /* Move back one character */
445
533
  if (s == p) break;
@@ -486,6 +574,7 @@ VALUE sv_byteindex(int argc, VALUE *argv, VALUE self) {
486
574
  if (byte_off < 0) byte_off += sv->length;
487
575
  if (byte_off < 0 || byte_off > sv->length) return Qnil;
488
576
  if (plen == 0) return LONG2NUM(byte_off);
577
+ sv_check_compatible_string(sv, pattern);
489
578
  if (plen > sv->length - byte_off) return Qnil;
490
579
 
491
580
  long pos = rb_memsearch(RSTRING_PTR(pattern), plen,
@@ -526,6 +615,7 @@ VALUE sv_byterindex(int argc, VALUE *argv, VALUE self) {
526
615
  }
527
616
 
528
617
  if (plen == 0) return LONG2NUM(max_byte > sv->length ? sv->length : max_byte);
618
+ sv_check_compatible_string(sv, pattern);
529
619
  if (plen > sv->length) return Qnil;
530
620
 
531
621
  long search_end = max_byte;
@@ -633,6 +723,11 @@ typedef struct {
633
723
  char *ptr;
634
724
  } sv_cstr_t;
635
725
 
726
+ typedef struct {
727
+ sv_cstr_t *cs;
728
+ int base;
729
+ } sv_inum_args_t;
730
+
636
731
  SV_INLINE void sv_cstr_init(sv_cstr_t *cs, string_view_t *sv) {
637
732
  const char *p = sv_ptr(sv);
638
733
  long len = sv->length;
@@ -653,6 +748,22 @@ SV_INLINE void sv_cstr_free(sv_cstr_t *cs) {
653
748
  }
654
749
  }
655
750
 
751
+ static VALUE sv_cstr_free_ensure(VALUE arg) {
752
+ sv_cstr_free((sv_cstr_t *)arg);
753
+ return Qnil;
754
+ }
755
+
756
+ static VALUE sv_to_i_body(VALUE arg) {
757
+ sv_inum_args_t *args = (sv_inum_args_t *)arg;
758
+ return rb_cstr_to_inum(args->cs->ptr, args->base, 0);
759
+ }
760
+
761
+ static VALUE sv_to_f_body(VALUE arg) {
762
+ sv_cstr_t *cs = (sv_cstr_t *)arg;
763
+ double d = rb_cstr_to_dbl(cs->ptr, 0);
764
+ return DBL2NUM(d);
765
+ }
766
+
656
767
  /*
657
768
  * to_i([base]) — parse integer directly from byte pointer, zero allocations.
658
769
  * Uses rb_cstr_to_inum which parses from a NUL-terminated C string.
@@ -663,10 +774,12 @@ static VALUE sv_to_i(int argc, VALUE *argv, VALUE self) {
663
774
  if (argc > 0) base = NUM2INT(argv[0]);
664
775
 
665
776
  sv_cstr_t cs;
777
+ sv_inum_args_t args;
666
778
  sv_cstr_init(&cs, sv);
667
- VALUE result = rb_cstr_to_inum(cs.ptr, base, 0);
668
- sv_cstr_free(&cs);
669
- return result;
779
+ args.cs = &cs;
780
+ args.base = base;
781
+ return rb_ensure(sv_to_i_body, (VALUE)&args,
782
+ sv_cstr_free_ensure, (VALUE)&cs);
670
783
  }
671
784
 
672
785
  /*
@@ -676,9 +789,8 @@ static VALUE sv_to_f(VALUE self) {
676
789
  string_view_t *sv = sv_get_struct(self);
677
790
  sv_cstr_t cs;
678
791
  sv_cstr_init(&cs, sv);
679
- double d = rb_cstr_to_dbl(cs.ptr, 0);
680
- sv_cstr_free(&cs);
681
- return DBL2NUM(d);
792
+ return rb_ensure(sv_to_f_body, (VALUE)&cs,
793
+ sv_cstr_free_ensure, (VALUE)&cs);
682
794
  }
683
795
 
684
796
  /*
@@ -687,10 +799,12 @@ static VALUE sv_to_f(VALUE self) {
687
799
  static VALUE sv_hex(VALUE self) {
688
800
  string_view_t *sv = sv_get_struct(self);
689
801
  sv_cstr_t cs;
802
+ sv_inum_args_t args;
690
803
  sv_cstr_init(&cs, sv);
691
- VALUE result = rb_cstr_to_inum(cs.ptr, 16, 0);
692
- sv_cstr_free(&cs);
693
- return result;
804
+ args.cs = &cs;
805
+ args.base = 16;
806
+ return rb_ensure(sv_to_i_body, (VALUE)&args,
807
+ sv_cstr_free_ensure, (VALUE)&cs);
694
808
  }
695
809
 
696
810
  /*
@@ -699,10 +813,12 @@ static VALUE sv_hex(VALUE self) {
699
813
  static VALUE sv_oct(VALUE self) {
700
814
  string_view_t *sv = sv_get_struct(self);
701
815
  sv_cstr_t cs;
816
+ sv_inum_args_t args;
702
817
  sv_cstr_init(&cs, sv);
703
- VALUE result = rb_cstr_to_inum(cs.ptr, 8, 0);
704
- sv_cstr_free(&cs);
705
- return result;
818
+ args.cs = &cs;
819
+ args.base = 8;
820
+ return rb_ensure(sv_to_i_body, (VALUE)&args,
821
+ sv_cstr_free_ensure, (VALUE)&cs);
706
822
  }
707
823
 
708
824
  /* ========================================================================= */
@@ -714,11 +830,10 @@ static VALUE sv_oct(VALUE self) {
714
830
  * Uses the single_byte cache when available.
715
831
  */
716
832
  SV_INLINE int sv_is_7bit(string_view_t *sv) {
717
- if (sv_single_byte_optimizable(sv)) return 1;
718
- const char *p = sv_ptr(sv);
833
+ const unsigned char *p = (const unsigned char *)sv_ptr(sv);
719
834
  long i;
720
835
  for (i = 0; i < sv->length; i++) {
721
- if ((unsigned char)p[i] > 127) return 0;
836
+ if (p[i] > 127) return 0;
722
837
  }
723
838
  return 1;
724
839
  }
@@ -1008,22 +1123,11 @@ static long sv_char_to_byte_offset(string_view_t *sv, long char_idx) {
1008
1123
  return char_idx;
1009
1124
  }
1010
1125
 
1011
- if (SV_LIKELY(sv_is_utf8(sv))) {
1126
+ if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
1012
1127
  return sv_utf8_char_to_byte_offset_indexed(sv, char_idx);
1013
1128
  }
1014
1129
 
1015
- rb_encoding *enc = sv_enc(sv);
1016
- const char *p = sv_ptr(sv);
1017
- const char *e = p + sv->length;
1018
- const char *start = p;
1019
- long i;
1020
-
1021
- for (i = 0; i < char_idx && p < e; i++) {
1022
- p += rb_enc_fast_mbclen(p, e, enc);
1023
- }
1024
-
1025
- if (i < char_idx) return -1;
1026
- return p - start;
1130
+ return sv_tolerant_char_to_byte_offset(sv, char_idx);
1027
1131
  }
1028
1132
 
1029
1133
  static long sv_char_count(string_view_t *sv) {
@@ -1033,12 +1137,11 @@ static long sv_char_count(string_view_t *sv) {
1033
1137
  long count;
1034
1138
  if (sv_single_byte_optimizable(sv)) {
1035
1139
  count = sv->length;
1036
- } else if (SV_LIKELY(sv_is_utf8(sv))) {
1140
+ } else if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
1037
1141
  count = sv_utf8_char_count(sv_ptr(sv), sv->length);
1038
1142
  } else {
1039
- rb_encoding *enc = sv_enc(sv);
1040
- const char *p = sv_ptr(sv);
1041
- count = rb_enc_strlen(p, p + sv->length, enc);
1143
+ count = sv_tolerant_char_count(sv_ptr(sv), sv_ptr(sv) + sv->length,
1144
+ sv_enc(sv));
1042
1145
  }
1043
1146
 
1044
1147
  sv->charlen = count;
@@ -1051,20 +1154,20 @@ static long sv_chars_to_bytes(string_view_t *sv, long byte_off, long n) {
1051
1154
  return n < remaining ? n : remaining;
1052
1155
  }
1053
1156
 
1054
- if (SV_LIKELY(sv_is_utf8(sv))) {
1157
+ if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
1055
1158
  return sv_utf8_chars_to_bytes(sv_ptr(sv), sv->length, byte_off, n);
1056
1159
  }
1057
1160
 
1058
- rb_encoding *enc = sv_enc(sv);
1059
- const char *p = sv_ptr(sv) + byte_off;
1060
- const char *e = sv_ptr(sv) + sv->length;
1061
- long i;
1062
- const char *start = p;
1161
+ return sv_tolerant_chars_to_bytes(sv, byte_off, n);
1162
+ }
1063
1163
 
1064
- for (i = 0; i < n && p < e; i++) {
1065
- p += rb_enc_fast_mbclen(p, e, enc);
1164
+ static long sv_char_count_partial(string_view_t *sv, const char *p, long len) {
1165
+ if (len <= 0) return 0;
1166
+ if (sv_single_byte_optimizable(sv)) return len;
1167
+ if (SV_LIKELY(sv_is_utf8(sv)) && sv_valid_encoding_cached(sv)) {
1168
+ return sv_utf8_char_count(p, len);
1066
1169
  }
1067
- return p - start;
1170
+ return sv_tolerant_char_count(p, p + len, sv_enc(sv));
1068
1171
  }
1069
1172
 
1070
1173
  static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
@@ -1086,7 +1189,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
1086
1189
  long total = sv->length;
1087
1190
  if (idx < 0) idx += total;
1088
1191
  if (SV_UNLIKELY(idx < 0 || idx > total || len < 0)) return Qnil;
1089
- if (idx + len > total) len = total - idx;
1192
+ if (len > total - idx) len = total - idx;
1090
1193
  return sv_new_from_parent_obj(self, sv,
1091
1194
  sv->offset + idx,
1092
1195
  len);
@@ -1108,7 +1211,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
1108
1211
 
1109
1212
  /* Clamp len to remaining characters */
1110
1213
  long total_chars = sv_char_count(sv);
1111
- if (idx + len > total_chars) len = total_chars - idx;
1214
+ if (len > total_chars - idx) len = total_chars - idx;
1112
1215
 
1113
1216
  long byte_end = sv_char_to_byte_offset(sv, idx + len);
1114
1217
  long byte_len = byte_end - byte_off;
@@ -1159,6 +1262,7 @@ static VALUE sv_aref(int argc, VALUE *argv, VALUE self) {
1159
1262
  if (slen == 0) {
1160
1263
  return sv_new_from_parent_obj(self, sv, sv->offset, 0);
1161
1264
  }
1265
+ sv_check_compatible_string(sv, arg1);
1162
1266
  if (slen > sv->length) return Qnil;
1163
1267
 
1164
1268
  long pos = rb_memsearch(RSTRING_PTR(arg1), slen, p, sv->length, sv_enc(sv));
@@ -1206,7 +1310,7 @@ static VALUE sv_byteslice(int argc, VALUE *argv, VALUE self) {
1206
1310
  if (off < 0) off += sv->length;
1207
1311
  if (off < 0 || off > sv->length) return Qnil;
1208
1312
  if (len < 0) return Qnil;
1209
- if (off + len > sv->length) len = sv->length - off;
1313
+ if (len > sv->length - off) len = sv->length - off;
1210
1314
 
1211
1315
  return sv_new_from_parent_obj(self, sv, sv->offset + off, len);
1212
1316
  }
@@ -1435,8 +1539,9 @@ static VALUE sv_delete_prefix(VALUE self, VALUE prefix) {
1435
1539
  const char *p = sv_ptr(sv);
1436
1540
  long plen = RSTRING_LEN(prefix);
1437
1541
 
1438
- if (plen > sv->length) return self;
1439
1542
  if (plen == 0) return self;
1543
+ sv_check_compatible_string(sv, prefix);
1544
+ if (plen > sv->length) return self;
1440
1545
  if (memcmp(p, RSTRING_PTR(prefix), plen) != 0) return self;
1441
1546
 
1442
1547
  return sv_new_from_parent_obj(self, sv, sv->offset + plen, sv->length - plen);
@@ -1453,8 +1558,9 @@ static VALUE sv_delete_suffix(VALUE self, VALUE suffix) {
1453
1558
  const char *p = sv_ptr(sv);
1454
1559
  long slen = RSTRING_LEN(suffix);
1455
1560
 
1456
- if (slen > sv->length) return self;
1457
1561
  if (slen == 0) return self;
1562
+ sv_check_compatible_string(sv, suffix);
1563
+ if (slen > sv->length) return self;
1458
1564
  if (memcmp(p + sv->length - slen, RSTRING_PTR(suffix), slen) != 0) return self;
1459
1565
 
1460
1566
  return sv_new_from_parent_obj(self, sv, sv->offset, sv->length - slen);
@@ -1505,16 +1611,7 @@ static VALUE sv_ord(VALUE self) {
1505
1611
  */
1506
1612
  static VALUE sv_valid_encoding_p(VALUE self) {
1507
1613
  string_view_t *sv = sv_get_struct(self);
1508
- rb_encoding *enc = sv_enc(sv);
1509
- const char *p = sv_ptr(sv);
1510
- const char *e = p + sv->length;
1511
-
1512
- while (p < e) {
1513
- int len = rb_enc_precise_mbclen(p, e, enc);
1514
- if (!MBCLEN_CHARFOUND_P(len)) return Qfalse;
1515
- p += MBCLEN_CHARFOUND_LEN(len);
1516
- }
1517
- return Qtrue;
1614
+ return sv_valid_encoding_cached(sv) ? Qtrue : Qfalse;
1518
1615
  }
1519
1616
 
1520
1617
  /*
@@ -39,6 +39,8 @@ typedef struct {
39
39
  long length; /* byte length of this view */
40
40
  long charlen; /* cached character count; -1 = not yet computed */
41
41
  int single_byte; /* cached: 1 if char==byte (ASCII/single-byte enc), 0 if multibyte, -1 unknown */
42
+ int valid_encoding; /* cached slice validity: 1 valid, 0 invalid, -1 unknown */
43
+ int pooled; /* 1 if owned by StringView::Pool and subject to reuse */
42
44
  stride_index_t *stride_idx; /* lazily built stride index for multibyte, NULL if not built */
43
45
  } string_view_t;
44
46
 
@@ -83,6 +85,14 @@ SV_INLINE void sv_check_bounds(long off, long len, long backing_len) {
83
85
  }
84
86
  }
85
87
 
88
+ SV_INLINE void sv_clear_stride_index(string_view_t *sv) {
89
+ if (sv->stride_idx) {
90
+ xfree(sv->stride_idx->offsets);
91
+ xfree(sv->stride_idx);
92
+ sv->stride_idx = NULL;
93
+ }
94
+ }
95
+
86
96
  /*
87
97
  * Initialize (or reinitialize) a string_view_t's fields from a frozen backing
88
98
  * string. Caller is responsible for freeing any prior stride_idx.
@@ -96,6 +106,8 @@ SV_INLINE void sv_init_fields(VALUE obj, string_view_t *sv, VALUE backing,
96
106
  sv->offset = offset;
97
107
  sv->length = length;
98
108
  sv->single_byte = sv_compute_single_byte(backing, enc);
109
+ sv->valid_encoding = sv->single_byte == 1 ? 1 : -1;
110
+ sv->pooled = 0;
99
111
  sv->charlen = -1;
100
112
  sv->stride_idx = NULL;
101
113
  }
@@ -4,41 +4,26 @@
4
4
  /* StringView::CoreExt — module with String#view, included on demand */
5
5
  /* ========================================================================= */
6
6
 
7
- /* ObjectSpace::WeakKeyMap caching String → Pool.
8
- * Keys (strings) are held weakly — when a string is GC'd, its entry
9
- * is automatically removed. Values (pools) are held strongly. */
10
- static VALUE pool_cache;
11
- static ID id_aref;
12
- static ID id_aset;
13
-
14
7
  /*
15
8
  * view(byte_offset, byte_length) → StringView
16
9
  *
17
10
  * Returns a StringView into this string at the given byte range.
18
- * Lazily creates a StringView::Pool and caches it in a global
19
- * WeakKeyMap for automatic cleanup when the string is GC'd.
11
+ * The backing string is frozen in place so StringView can safely reference it.
12
+ * Each call returns a fresh StringView; callers that want explicit object reuse
13
+ * should opt into StringView::Pool directly.
20
14
  */
21
15
  static VALUE string_view_method(VALUE self, VALUE voffset, VALUE vlength) {
16
+ VALUE args[3];
17
+
22
18
  rb_str_freeze(self);
23
19
 
24
- VALUE pool = rb_funcall(pool_cache, id_aref, 1, self);
25
- if (NIL_P(pool)) {
26
- pool = rb_class_new_instance(1, &self, cStringViewPool);
27
- rb_funcall(pool_cache, id_aset, 2, self, pool);
28
- }
29
- return pool_view(pool, voffset, vlength);
20
+ args[0] = self;
21
+ args[1] = voffset;
22
+ args[2] = vlength;
23
+ return rb_class_new_instance(3, args, cStringView);
30
24
  }
31
25
 
32
26
  void Init_string_view_core_ext(void) {
33
- id_aref = rb_intern("[]");
34
- id_aset = rb_intern("[]=");
35
-
36
- VALUE cWeakKeyMap = rb_const_get(
37
- rb_const_get(rb_cObject, rb_intern("ObjectSpace")),
38
- rb_intern("WeakKeyMap"));
39
- pool_cache = rb_class_new_instance(0, NULL, cWeakKeyMap);
40
- rb_gc_register_mark_object(pool_cache);
41
-
42
27
  VALUE mCoreExt = rb_define_module_under(cStringView, "CoreExt");
43
28
  rb_define_method(mCoreExt, "view", string_view_method, 2);
44
29
  }
@@ -73,6 +73,7 @@ static void pool_grow(sv_pool_t *pool, VALUE pool_obj) {
73
73
  VALUE obj = TypedData_Make_Struct(cStringView, string_view_t,
74
74
  &string_view_type, sv);
75
75
  sv_init_fields(obj, sv, pool->backing, pool->base, pool->enc, 0, 0);
76
+ sv->pooled = 1;
76
77
  rb_ary_push(pool->views, obj);
77
78
  }
78
79
 
@@ -145,12 +146,15 @@ VALUE pool_view(VALUE self, VALUE voffset, VALUE vlength) {
145
146
  VALUE view = RARRAY_AREF(pool->views, pool->next_idx);
146
147
  pool->next_idx++;
147
148
 
149
+ if (SV_UNLIKELY(OBJ_FROZEN(view))) {
150
+ rb_raise(rb_eFrozenError,
151
+ "can't reuse a frozen pooled StringView; materialize it before freezing");
152
+ }
153
+
148
154
  string_view_t *sv = (string_view_t *)RTYPEDDATA_GET_DATA(view);
149
- sv->base = pool->base; /* refresh in case backing was mutated */
150
- sv->offset = off;
151
- sv->length = len;
152
- sv->charlen = -1; /* invalidate cached char count */
153
- sv->stride_idx = NULL; /* invalidate stride index */
155
+ sv_clear_stride_index(sv);
156
+ sv_init_fields(view, sv, pool->backing, pool->base, pool->enc, off, len);
157
+ sv->pooled = 1;
154
158
 
155
159
  return view;
156
160
  }
Binary file
Binary file
Binary file
@@ -4,5 +4,5 @@
4
4
  # without requiring the C extension.
5
5
  # The C extension defines StringView as a class, so we use `class` here.
6
6
  class StringView
7
- VERSION = "0.2.0"
7
+ VERSION = "0.2.1"
8
8
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_view
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Shopify
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-03-19 00:00:00.000000000 Z
11
+ date: 2026-04-13 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: StringView provides a read-only, zero-copy view into a frozen Ruby String,
14
14
  avoiding intermediate allocations for slicing, searching, and delegation of transform