table_string_replacer 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cb11a5cfd1ec0a7dec1a8131c2b5324552fceaf90df427b973793740fc85d385
4
- data.tar.gz: bbf93bcb1e70775d92d24fa95a7d7f6427a86deb16c06ce1fcd57ac5f62e4890
3
+ metadata.gz: 8d215391ea5818f845c778f73cc12cffad6ebba54d23a47fd6c465d13111ff61
4
+ data.tar.gz: deab5736893ab931455a4b18e8534af239ae7ee300fc2454dd6ad014f0dd56af
5
5
  SHA512:
6
- metadata.gz: c88257041447a1c505760949f57163c12819097280aef4cf319dcb1288fe38613a6832531b1ff55370ca989e8fdbad610939871f6164bca5a0ddce0000615ddf
7
- data.tar.gz: 19d525e54a7c6baa90c82fba83a72eedbbcce4b3cd92918010040e2ebb0d950c0cb97bf89ad1f38862d38fde96d2506f3404239d3699bb3051120428bcadd26b
6
+ metadata.gz: 1fb3e48330c7f87f78acb625e7fe0f29b8c8a64c7da79970fc16c16c4c75123352a9091de8604c2a796780d6e6c08e98e5e3ac278c992b54c99d326002c10ec0
7
+ data.tar.gz: 6d7a709a91cbeacce28c23ba30301c9d580f92bd0014189d0626a62eb3bd49eca205b8f99857f7f1a1350610587668b02b31158d62071ea789830cd0ba7e08d1
data/README.md CHANGED
@@ -60,9 +60,24 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
60
60
 
61
61
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
62
62
 
63
+ ## Updating the Gem
64
+
65
+ 1. Make your code changes and ensure all tests pass with `rake test`
66
+ 2. Update the version number in `lib/table_string_replacer/version.rb` following [Semantic Versioning](https://semver.org/) principles
67
+ 3. Update the CHANGELOG.md file with your changes
68
+ 4. Commit your changes to git
69
+ 5. Run `bundle exec rake release`, which will:
70
+ - Create a git tag for the version
71
+ - Push git commits and tags
72
+ - Build the gem and push it to RubyGems.org
73
+
74
+ Alternatively, you can build and publish the gem manually (this does not create or push the git tag):
75
+ 1. Build the gem: `gem build table_string_replacer.gemspec`
76
+ 2. Push to RubyGems: `gem push table_string_replacer-x.x.x.gem` (where x.x.x is the version number)
77
+
63
78
  ## Contributing
64
79
 
65
- Bug reports and pull requests are welcome on GitHub at https://github.com/yourusername/table_string_replacer.
80
+ Bug reports and pull requests are welcome on GitHub at https://github.com/bv-ankit/table_string_replacer.
66
81
 
67
82
  ## License
68
83
 
@@ -37,7 +37,56 @@ char *strcasestr(const char *haystack, const char *needle) {
37
37
  }
38
38
  #endif
39
39
 
40
- // Fast serialized PHP string replacement with improved memory handling
40
// Binary-safe, case-insensitive substring search.
//
// Both buffers are described by explicit byte counts, so embedded NUL bytes
// are compared like any other byte instead of terminating the search. Bytes
// are folded with toupper() after conversion to unsigned char, which keeps
// the <ctype.h> call well-defined for bytes >= 0x80.
//
// haystack / haystack_len  buffer to search and its length in bytes
// needle / needle_len      pattern to look for and its length in bytes
//
// Returns a pointer to the first match inside haystack, haystack itself when
// needle_len is 0, or NULL when there is no match or a length is negative.
static char *binary_strcasestr(const char *haystack, long haystack_len, const char *needle, long needle_len) {
    if (needle_len == 0) {
        return (char *)haystack;
    }
    // Reject negative lengths up front: they can never match, and letting
    // them reach the subtraction below risks signed overflow.
    if (haystack_len < 0 || needle_len < 0 || haystack_len < needle_len) {
        return NULL;
    }

    // The needle's first byte is loop-invariant; fold it once and use it as
    // a cheap filter before comparing the rest of the pattern.
    const int first_folded = toupper((unsigned char)needle[0]);
    const long last_start = haystack_len - needle_len;

    for (long start = 0; start <= last_start; start++) {
        if (toupper((unsigned char)haystack[start]) != first_folded) {
            continue;
        }

        long matched = 1;
        while (matched < needle_len &&
               toupper((unsigned char)haystack[start + matched]) ==
                   toupper((unsigned char)needle[matched])) {
            matched++;
        }

        if (matched == needle_len) {
            return (char *)(haystack + start);
        }
    }

    return NULL;
}
59
+
60
// Parse the decimal length field of a PHP serialized string header "s:N:".
//
// str        buffer holding the serialized data (may contain arbitrary bytes)
// max_len    number of valid bytes in str
// start_pos  index at which the length digits must begin (just past "s:")
// endptr     out-parameter; on success it points at the ':' that terminates
//            the digits, on any failure it is set to NULL
//
// Returns the parsed length on success. Returns 0 with *endptr == NULL when
// fewer than two bytes remain at start_pos, when the byte at start_pos is not
// a digit, when the digits are not followed by ':' inside the buffer, or when
// the value does not fit in a long. Callers must test *endptr, because 0 is
// also a valid parsed length.
static long extract_serialized_len(const char *str, long max_len, long start_pos, char **endptr) {
    *endptr = NULL;

    // Need at least one digit plus the terminating ':' within the buffer.
    if (start_pos < 0 || max_len < 2 || start_pos > max_len - 2) {
        return 0;
    }

    // The digits must start immediately. Scanning ahead for "the next digit"
    // would pick up a number that belongs to a different token on malformed
    // input and would let a single call walk the whole buffer.
    if (!isdigit((unsigned char)str[start_pos])) {
        return 0;
    }

    // Largest value representable in a long, derived without <limits.h>
    // (assumes long has no padding bits, as on all mainstream ABIs).
    const unsigned long max_long = ~0UL >> 1;

    unsigned long acc = 0;
    long pos = start_pos;
    while (pos < max_len && isdigit((unsigned char)str[pos])) {
        unsigned long digit = (unsigned long)(str[pos] - '0');
        // Refuse lengths that would overflow rather than wrapping silently.
        if (acc > (max_long - digit) / 10) {
            return 0;
        }
        acc = acc * 10 + digit;
        pos++;
    }

    // A valid header ends its length field with ':'.
    if (pos >= max_len || str[pos] != ':') {
        return 0;
    }

    *endptr = (char *)(str + pos);
    return (long)acc;
}
88
+
89
+ // Fast serialized PHP string replacement with improved memory handling and binary string support
41
90
  static VALUE rb_serialized_str_replace(VALUE self, VALUE orig_str, VALUE old_str, VALUE new_str) {
42
91
  // Ensure strings are properly initialized
43
92
  Check_Type(orig_str, T_STRING);
@@ -52,97 +101,208 @@ static VALUE rb_serialized_str_replace(VALUE self, VALUE orig_str, VALUE old_str
52
101
  long old_len = RSTRING_LEN(old_str);
53
102
  long new_len = RSTRING_LEN(new_str);
54
103
 
104
+ // Early optimization: if old string is empty or original is empty, return original
105
+ if (old_len == 0 || orig_len == 0) {
106
+ return rb_str_dup(orig_str);
107
+ }
108
+
55
109
  // Early optimization: if old and new are identical, return original
56
110
  if (old_len == new_len && memcmp(old, new, old_len) == 0) {
57
111
  return rb_str_dup(orig_str);
58
112
  }
59
113
 
60
- // Estimate result size more accurately to avoid reallocations
61
- long max_replacements = orig_len / (old_len > 0 ? old_len : 1);
114
+ // Pre-compute a more accurate size estimate by counting potential matches first
115
+ long count = 0;
116
+ long i = 0;
117
+
118
+ // First-pass to count potential replacements in serialized strings
119
+ while (i < orig_len) {
120
+ // Look for serialized string marker pattern 's:'
121
+ if (i + 2 < orig_len && orig[i] == 's' && orig[i+1] == ':') {
122
+ char *endptr;
123
+ long len_val = extract_serialized_len(orig, orig_len, i+2, &endptr);
124
+
125
+ // Valid PHP serialized string format: s:N:"content";
126
+ if (endptr && (endptr+1) < orig + orig_len && *(endptr+1) == '"') {
127
+ long content_start = (endptr + 2) - orig;
128
+
129
+ if (content_start < orig_len) {
130
+ // Only search within the actual serialized string content
131
+ long search_limit = content_start + len_val;
132
+ if (search_limit > orig_len) search_limit = orig_len;
133
+
134
+ // Count occurrences within this serialized string
135
+ char *pos = orig + content_start;
136
+ char *end = orig + search_limit;
137
+ long remaining = search_limit - content_start;
138
+
139
+ while (remaining >= old_len) {
140
+ char *found = binary_strcasestr(pos, remaining, old, old_len);
141
+ if (found && found < end) {
142
+ count++;
143
+ long advance = found - pos + old_len;
144
+ pos += advance;
145
+ remaining -= advance;
146
+ } else {
147
+ break;
148
+ }
149
+ }
150
+ }
151
+
152
+ // Skip past this serialized string entirely for the counting phase
153
+ i = content_start + len_val;
154
+ // Skip past closing quote and semicolon if present
155
+ if (i < orig_len && orig[i] == '"') i++;
156
+ if (i < orig_len && orig[i] == ';') i++;
157
+ continue;
158
+ }
159
+ }
160
+ i++;
161
+ }
162
+
163
+ // Optimize allocation with more precise size calculation
62
164
  long size_diff = new_len - old_len;
63
- long estimated_result_len = orig_len + (size_diff > 0 ? size_diff * max_replacements : 0) + 128;
165
+ long estimated_result_len = orig_len + (size_diff > 0 ? size_diff * count : 0) + 256;
64
166
 
65
- // Pre-allocate result buffer
167
+ // Pre-allocate result buffer - slightly oversized to minimize reallocations
66
168
  VALUE result = rb_str_new(NULL, estimated_result_len);
67
169
  char *res_ptr = RSTRING_PTR(result);
68
170
  long res_len = 0;
69
171
 
70
- long i = 0;
172
+ // Cache the first character of the search string for faster initial checks
173
+ unsigned char first_char = (unsigned char)old[0];
174
+ unsigned char first_char_upper = toupper(first_char);
175
+ unsigned char first_char_lower = tolower(first_char);
176
+
177
+ // Second pass to perform the actual replacements
178
+ i = 0;
71
179
  while (i < orig_len) {
72
- // Check for serialized string marker
180
+ // Look for serialized string marker pattern 's:'
73
181
  if (i + 2 < orig_len && orig[i] == 's' && orig[i+1] == ':') {
74
182
  char *endptr;
75
- long len_pos = i + 2;
183
+ long len_val = extract_serialized_len(orig, orig_len, i+2, &endptr);
76
184
 
77
- // Extract length value more safely
78
- if (len_pos < orig_len) {
79
- long len_val = strtol(orig + len_pos, &endptr, 10);
185
+ // Valid PHP serialized string format: s:N:"content";
186
+ if (endptr && (endptr+1) < orig + orig_len && *(endptr+1) == '"') {
187
+ long content_start = (endptr + 2) - orig;
80
188
 
81
- // Verify we found a valid PHP serialized string
82
- if (endptr && *endptr == ':' && (endptr+1) < orig + orig_len && *(endptr+1) == '"') {
83
- long content_start = (endptr + 2) - orig;
189
+ if (content_start < orig_len) {
190
+ // Only search within the actual serialized string content
191
+ long search_limit = content_start + len_val;
192
+ if (search_limit > orig_len) search_limit = orig_len;
84
193
 
85
- // Make sure content_start is within bounds
86
- if (content_start < orig_len) {
87
- // Only search within the actual serialized string content
88
- long search_limit = content_start + len_val;
89
- if (search_limit > orig_len) search_limit = orig_len;
194
+ // Fast path: check if this serialized string might contain our pattern
195
+ // by checking for the first character before doing a full search
196
+ int potential_match = 0;
197
+ for (long scan_pos = content_start; scan_pos < search_limit; scan_pos++) {
198
+ unsigned char c = (unsigned char)orig[scan_pos];
199
+ if (c == first_char || toupper(c) == first_char_upper ||
200
+ tolower(c) == first_char_lower) {
201
+ potential_match = 1;
202
+ break;
203
+ }
204
+ }
205
+
206
+ // Only do full search if potential match found
207
+ if (potential_match) {
208
+ char *pos = orig + content_start;
209
+ char *end = orig + search_limit;
210
+ long remaining = search_limit - content_start;
90
211
 
91
- char *found = strcasestr(orig + content_start, old);
212
+ char *found = binary_strcasestr(pos, remaining, old, old_len);
92
213
 
93
214
  // Found match within the serialized string content
94
- if (found && found < orig + search_limit) {
95
- // Verify we have enough space in result buffer (resize if needed)
96
- long needed_len = res_len + (found - (orig + i)) + new_len + 100;
97
- if (needed_len > estimated_result_len) {
98
- rb_str_resize(result, needed_len * 2);
99
- res_ptr = RSTRING_PTR(result);
100
- estimated_result_len = needed_len * 2;
101
- }
102
-
215
+ if (found && found < end) {
216
+ // Before applying replacement, ensure we have enough space
217
+ long prefix_len = found - pos;
218
+
103
219
  // Calculate new serialized string length
104
- long new_len_val = len_val - old_len + new_len;
220
+ long modified_len = len_val + (new_len - old_len);
105
221
 
106
- // Update the serialized string length indicator
222
+ // Build the new serialized string header
107
223
  char len_buf[32];
108
- int len_digits = snprintf(len_buf, sizeof(len_buf), "s:%ld:", new_len_val);
224
+ int len_digits = snprintf(len_buf, sizeof(len_buf), "s:%ld:", modified_len);
109
225
 
110
- // Copy prefix up to the 's:' marker
111
- memcpy(res_ptr + res_len, orig + i, 2);
112
- res_len += 2;
226
+ // Ensure we have space in result buffer
227
+ long needed_space = res_len + len_digits + prefix_len + new_len +
228
+ (search_limit - (found + old_len)) + 16;
229
+ if (needed_space > estimated_result_len) {
230
+ estimated_result_len = (estimated_result_len * 3) / 2;
231
+ if (estimated_result_len < needed_space)
232
+ estimated_result_len = needed_space + 256;
233
+ rb_str_resize(result, estimated_result_len);
234
+ res_ptr = RSTRING_PTR(result);
235
+ }
113
236
 
114
- // Copy new length
115
- memcpy(res_ptr + res_len, len_buf + 2, len_digits - 2);
116
- res_len += len_digits - 2;
237
+ // Copy the serialized string header
238
+ memcpy(res_ptr + res_len, len_buf, len_digits);
239
+ res_len += len_digits;
117
240
 
118
- // Copy from length end to the found match position
119
- long pre_len = found - (orig + content_start);
120
- memcpy(res_ptr + res_len, endptr, pre_len + 2);
121
- res_len += pre_len + 2;
241
+ // Copy quote and prefix content
242
+ res_ptr[res_len++] = '"';
243
+ memcpy(res_ptr + res_len, pos, prefix_len);
244
+ res_len += prefix_len;
122
245
 
123
- // Copy the new replacement string
246
+ // Copy the replacement string
124
247
  memcpy(res_ptr + res_len, new, new_len);
125
248
  res_len += new_len;
126
249
 
127
- // Skip to after the replacement point
128
- i = found - orig + old_len;
250
+ // Copy remainder of serialized string
251
+ long suffix_len = search_limit - (found + old_len);
252
+ if (suffix_len > 0) {
253
+ memcpy(res_ptr + res_len, found + old_len, suffix_len);
254
+ res_len += suffix_len;
255
+ }
256
+
257
+ // Add closing quote and semicolon
258
+ res_ptr[res_len++] = '"';
259
+ res_ptr[res_len++] = ';';
260
+
261
+ // Skip past the entire processed serialized string
262
+ i = search_limit;
263
+ // Skip past closing quote and semicolon if present
264
+ if (i < orig_len && orig[i] == '"') i++;
265
+ if (i < orig_len && orig[i] == ';') i++;
129
266
  continue;
130
267
  }
131
268
  }
269
+
270
+ // No match found, copy serialized string as is
271
+ long str_total_len;
272
+ long end_pos = search_limit;
273
+
274
+ // Find the end of the serialized string (should be quote+semicolon)
275
+ if (end_pos < orig_len && orig[end_pos] == '"') end_pos++;
276
+ if (end_pos < orig_len && orig[end_pos] == ';') end_pos++;
277
+
278
+ str_total_len = end_pos - i;
279
+
280
+ if (res_len + str_total_len > estimated_result_len) {
281
+ estimated_result_len = (estimated_result_len * 3) / 2;
282
+ if (estimated_result_len < res_len + str_total_len)
283
+ estimated_result_len = res_len + str_total_len + 256;
284
+ rb_str_resize(result, estimated_result_len);
285
+ res_ptr = RSTRING_PTR(result);
286
+ }
287
+
288
+ memcpy(res_ptr + res_len, orig + i, str_total_len);
289
+ res_len += str_total_len;
290
+ i += str_total_len;
291
+ continue;
132
292
  }
133
293
  }
134
294
  }
135
295
 
136
296
  // If we didn't perform a replacement, copy the current character
137
297
  if (res_len >= estimated_result_len) {
138
- rb_str_resize(result, estimated_result_len * 2);
298
+ estimated_result_len = (estimated_result_len * 3) / 2;
299
+ rb_str_resize(result, estimated_result_len);
139
300
  res_ptr = RSTRING_PTR(result);
140
- estimated_result_len *= 2;
141
301
  }
142
302
  res_ptr[res_len++] = orig[i++];
143
303
  }
144
304
 
145
- // Set final string length and terminate
305
+ // Set final string length
146
306
  rb_str_resize(result, res_len);
147
307
 
148
308
  return result;
@@ -315,4 +475,4 @@ void Init_table_string_replacer() {
315
475
  // Constants for thread safety documentation
316
476
  rb_define_const(mStringReplacer, "THREAD_SAFE", Qtrue);
317
477
  rb_define_const(mStringReplacer, "VERSION", rb_str_new_cstr(RSTRING_PTR(rb_const_get(mStringReplacer, rb_intern("VERSION")))));
318
- }
478
+ }
@@ -1,3 +1,3 @@
1
1
  module TableStringReplacer
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: table_string_replacer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ANKIT KHANDELWAL