multi_string_replace 2.0.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +28 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile.lock +4 -3
- data/README.md +51 -4
- data/Rakefile +4 -1
- data/bin/benchmark.rb +17 -4
- data/ext/multi_string_replace/aho_trie.c +1 -1
- data/ext/multi_string_replace/ahocorasick.c +57 -41
- data/ext/multi_string_replace/extconf.rb +12 -2
- data/ext/multi_string_replace/multi_string_replace.c +150 -19
- data/lib/multi_string_replace/version.rb +1 -1
- data/multi_string_replace.gemspec +1 -1
- metadata +19 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92a92ffb1350b891f31634d8b7c3ca4d0a6e1582c91b35f548c61f819cbf5ffe
|
4
|
+
data.tar.gz: 86c5acf5d85393669c05054ce79a1fe37a6fdf98d2e62b17803b94cf46670c1d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 465229a147b4287e725669b7ca10bca5e3ea613cda6d22e69f01ba8f9b9808b1018dd0344caeffc941aee2ea6220392d8d959527b2266c8ab84d8ffdacd0ebc4
|
7
|
+
data.tar.gz: 2148c1d0386ef4efd0ae13dd90466af4b7262fd8d01200ea70ba74f152329abb20655db826a98d12a8604651eb9cd8406ca9498f26681a7b0e7f597d17899a79
|
@@ -0,0 +1,28 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ master ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ master ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
test:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
strategy:
|
13
|
+
fail-fast: false
|
14
|
+
matrix:
|
15
|
+
ruby: [ '3.0', '3.1', '3.2', '3.3', '3.4' ]
|
16
|
+
steps:
|
17
|
+
- uses: actions/checkout@v4
|
18
|
+
- uses: ruby/setup-ruby@v1
|
19
|
+
with:
|
20
|
+
ruby-version: ${{ matrix.ruby }}
|
21
|
+
bundler-cache: true
|
22
|
+
- name: Install dependencies
|
23
|
+
run: bundle install --jobs 4 --retry 3
|
24
|
+
- name: Compile native extension
|
25
|
+
run: rake compile
|
26
|
+
- name: Run specs
|
27
|
+
run: rake spec
|
28
|
+
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# 2.0.2 (August 10, 2023)
|
2
|
+
# 2.0.2 (August 10, 2023)
|
3
|
+
|
4
|
+
# 3.0.0 (September 11, 2025)
|
5
|
+
|
6
|
+
## Features
|
7
|
+
|
8
|
+
- New `MultiStringReplace::Automaton` API to compile patterns once and reuse for faster repeated match/replace.
|
9
|
+
|
10
|
+
## Performance
|
11
|
+
|
12
|
+
- Faster replace path: cached Ruby method IDs, fewer API calls, preallocated buffers, and deferred string allocation for no-match fast path.
|
13
|
+
- Optimized trie setup/clear and memory copies.
|
14
|
+
|
15
|
+
## Stability and build
|
16
|
+
|
17
|
+
- Fixed Bundler conflicts on Ruby 3.4+; Rakefile avoids dual-Bundler loading.
|
18
|
+
- Added GC marking to prevent crashes with the new Automaton.
|
19
|
+
- Safer behavior when replacement is missing; falls back to original substring.
|
20
|
+
|
21
|
+
## Tooling
|
22
|
+
|
23
|
+
- Benchmark script now supports `-f/--file`, `-n/--iters`, and `-A/--automaton` flags.
|
24
|
+
- Added GitHub Actions CI across Ruby 3.0–3.4.
|
25
|
+
|
26
|
+
## Bug fixes
|
27
|
+
|
28
|
+
- Prevent segfault with some binary characters. [#10](https://github.com/jedld/multi_string_replace/pull/10)
|
29
|
+
|
data/Gemfile.lock
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
multi_string_replace (
|
4
|
+
multi_string_replace (3.0.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
+
benchmark (0.4.1)
|
9
10
|
diff-lcs (1.3)
|
10
11
|
rake (13.0.1)
|
11
12
|
rake-compiler (1.0.5)
|
@@ -28,11 +29,11 @@ PLATFORMS
|
|
28
29
|
ruby
|
29
30
|
|
30
31
|
DEPENDENCIES
|
31
|
-
|
32
|
+
benchmark
|
32
33
|
multi_string_replace!
|
33
34
|
rake
|
34
35
|
rake-compiler
|
35
36
|
rspec (~> 3.0)
|
36
37
|
|
37
38
|
BUNDLED WITH
|
38
|
-
2.
|
39
|
+
2.6.9
|
data/README.md
CHANGED
@@ -58,6 +58,18 @@ Also adds a mreplace method to String which does the same thing:
|
|
58
58
|
"The quick brown fox jumps over the lazy dog brown".mreplace({'brown' => 'black', 'fox' => ->(_, _) { "cat" }})
|
59
59
|
```
|
60
60
|
|
61
|
+
### Reuse a compiled automaton (faster for repeated calls)
|
62
|
+
|
63
|
+
When running many matches/replacements with the same set of keys, build the automaton once and reuse it:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
ac = MultiStringReplace::Automaton.new(['brown', 'fox'])
|
67
|
+
ac.match("The quick brown fox")
|
68
|
+
ac.replace("The quick brown fox", { 'brown' => 'black', 'fox' => 'wolf' })
|
69
|
+
```
|
70
|
+
|
71
|
+
This avoids rebuilding the trie/failure links on every call and can improve throughput significantly for repeated workloads.
|
72
|
+
|
61
73
|
## Performance
|
62
74
|
|
63
75
|
Performing token replacement on a 200K text file repeated 100 times
|
@@ -69,17 +81,52 @@ MultiStringReplace 0.196823 0.007979 0.204802 ( 0.207219)
|
|
69
81
|
mreplace 0.200593 0.004031 0.204624 ( 0.205379)
|
70
82
|
```
|
71
83
|
|
72
|
-
Benchmark
|
84
|
+
Benchmark source: <https://github.com/jedld/multi_string_replace/blob/master/bin/benchmark.rb>
|
85
|
+
|
86
|
+
### Run the benchmark locally
|
87
|
+
|
88
|
+
1) Install dependencies
|
89
|
+
|
90
|
+
```bash
|
91
|
+
bundle install
|
92
|
+
```
|
93
|
+
|
94
|
+
2) Compile the native extension (recommended for accurate numbers)
|
95
|
+
|
96
|
+
```bash
|
97
|
+
bundle exec rake compile
|
98
|
+
```
|
99
|
+
|
100
|
+
3) Run the benchmark script
|
101
|
+
|
102
|
+
```bash
|
103
|
+
bundle exec ruby bin/benchmark.rb
|
104
|
+
```
|
105
|
+
|
106
|
+
Notes:
|
107
|
+
- By default, the script reads the sample text from `spec/fixtures/test.txt` and writes results to `replaced.txt` and `replaced2.txt` in the repo root.
|
108
|
+
- To change the number of iterations or the input text, pass `-n/--iters N` and `-f/--file PATH` (defaults: 100 iterations and `spec/fixtures/test.txt`); there is no need to edit `bin/benchmark.rb`.
|
109
|
+
- For repeated runs with the same keys, pass `-A/--automaton` so the benchmark builds a `MultiStringReplace::Automaton` once and reuses it, showcasing the speedup for batched workloads.
|
110
|
+
|
111
|
+
Advanced benchmark flags:
|
112
|
+
|
113
|
+
```bash
|
114
|
+
# choose input and iterations
|
115
|
+
bundle exec ruby bin/benchmark.rb -f spec/fixtures/test.txt -n 200
|
116
|
+
|
117
|
+
# enable Automaton (reuse compiled trie)
|
118
|
+
bundle exec ruby bin/benchmark.rb -A
|
119
|
+
```
|
73
120
|
|
74
121
|
## Development
|
75
122
|
|
76
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
123
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake compile` followed by `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
77
124
|
|
78
125
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
79
126
|
|
80
127
|
## Contributing
|
81
128
|
|
82
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
129
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/jedld/multi_string_replace. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
83
130
|
|
84
131
|
## License
|
85
132
|
|
@@ -87,4 +134,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
87
134
|
|
88
135
|
## Code of Conduct
|
89
136
|
|
90
|
-
Everyone interacting in the MultiStringReplace project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/
|
137
|
+
Everyone interacting in the MultiStringReplace project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/jedld/multi_string_replace/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
|
1
|
+
# Load Bundler gem tasks only when needed to avoid conflicts between system Bundler and gem-installed Bundler
|
2
|
+
if (ARGV & %w[build release install release:guard_clean release:rubygem_push]).any?
|
3
|
+
require "bundler/gem_tasks"
|
4
|
+
end
|
2
5
|
require "rspec/core/rake_task"
|
3
6
|
require "rake/extensiontask"
|
4
7
|
|
data/bin/benchmark.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "bundler/setup"
|
2
2
|
require 'multi_string_replace'
|
3
3
|
require 'benchmark'
|
4
|
+
require 'optparse'
|
4
5
|
|
5
6
|
class String
|
6
7
|
def mgsub(key_value_pairs=[].freeze)
|
@@ -12,7 +13,14 @@ Regexp.union(*regexp_fragments)) do |match|
|
|
12
13
|
end
|
13
14
|
end
|
14
15
|
|
15
|
-
|
16
|
+
options = { file: File.join('spec', 'fixtures', 'test.txt'), iters: 100, use_automaton: false }
|
17
|
+
OptionParser.new do |opts|
|
18
|
+
opts.on('-f', '--file PATH', 'Input file path') { |v| options[:file] = v }
|
19
|
+
opts.on('-n', '--iters N', Integer, 'Iterations') { |v| options[:iters] = v }
|
20
|
+
opts.on('-A', '--automaton', 'Use Automaton for Msr run') { options[:use_automaton] = true }
|
21
|
+
end.parse!(ARGV)
|
22
|
+
|
23
|
+
body = File.read(options[:file])
|
16
24
|
|
17
25
|
replace = {
|
18
26
|
'Lorem' => 'XXXX',
|
@@ -33,7 +41,12 @@ File.write('replaced.txt', body.gsub(/(#{replace.keys.join('|')})/, replace))
|
|
33
41
|
File.write('replaced2.txt', MultiStringReplace.replace(body, replace))
|
34
42
|
|
35
43
|
Benchmark.bmbm do |x|
|
36
|
-
x.report "multi gsub" do
|
37
|
-
|
38
|
-
|
44
|
+
x.report "multi gsub" do options[:iters].times { body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) } end
|
45
|
+
if options[:use_automaton]
|
46
|
+
ac = MultiStringReplace::Automaton.new(replace.keys)
|
47
|
+
x.report "MultiStringReplace (Automaton)" do options[:iters].times { ac.replace(body, replace) } end
|
48
|
+
else
|
49
|
+
x.report "MultiStringReplace" do options[:iters].times { MultiStringReplace.replace(body, replace) } end
|
50
|
+
end
|
51
|
+
x.report "mreplace" do options[:iters].times { body.mreplace(replace) } end
|
39
52
|
end
|
@@ -29,7 +29,7 @@ bool aho_add_trie_node(struct aho_trie * restrict t, struct aho_text_t * restric
|
|
29
29
|
|
30
30
|
for (int text_idx = 0; text_idx < text->len; text_idx++)
|
31
31
|
{
|
32
|
-
unsigned
|
32
|
+
unsigned char node_text = text->text[text_idx];
|
33
33
|
bool find_node = false;
|
34
34
|
int child_idx = 0;
|
35
35
|
|
@@ -49,7 +49,9 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
|
|
49
49
|
|
50
50
|
a_text->id = aho->accumulate_text_id++;
|
51
51
|
|
52
|
-
|
52
|
+
// Copy exactly len bytes and add terminator once
|
53
|
+
memcpy(a_text->text, text, len);
|
54
|
+
a_text->text[len] = '\0';
|
53
55
|
|
54
56
|
a_text->len = len;
|
55
57
|
a_text->prev = NULL;
|
@@ -113,13 +115,18 @@ bool aho_del_match_text(struct ahocorasick * restrict aho, const int id)
|
|
113
115
|
|
114
116
|
void aho_clear_match_text(struct ahocorasick * restrict aho)
|
115
117
|
{
|
116
|
-
|
117
|
-
{
|
118
|
-
|
118
|
+
struct aho_text_t* iter = aho->text_list_head;
|
119
|
+
while (iter) {
|
120
|
+
struct aho_text_t* nxt = iter->next;
|
121
|
+
free(iter->text);
|
122
|
+
free(iter);
|
123
|
+
iter = nxt;
|
119
124
|
}
|
120
|
-
|
121
|
-
// reset id
|
125
|
+
// reset
|
122
126
|
aho->accumulate_text_id = 0;
|
127
|
+
aho->text_list_head = NULL;
|
128
|
+
aho->text_list_tail = NULL;
|
129
|
+
aho->text_list_len = 0;
|
123
130
|
}
|
124
131
|
|
125
132
|
|
@@ -148,7 +155,7 @@ void aho_clear_trie(struct ahocorasick * restrict aho)
|
|
148
155
|
|
149
156
|
unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, unsigned long long data_len)
|
150
157
|
{
|
151
|
-
|
158
|
+
unsigned long long i = 0;
|
152
159
|
int match_count = 0;
|
153
160
|
struct aho_trie_node* travasal_node = NULL;
|
154
161
|
|
@@ -168,11 +175,7 @@ unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, u
|
|
168
175
|
match.id = result->id;
|
169
176
|
match.len = result->len;
|
170
177
|
|
171
|
-
|
172
|
-
if (result->len == 1)
|
173
|
-
{
|
174
|
-
match.pos = i;
|
175
|
-
}
|
178
|
+
match.pos = i - result->len + 1;
|
176
179
|
|
177
180
|
match_count++;
|
178
181
|
if (aho->callback_match)
|
@@ -187,18 +190,16 @@ unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, u
|
|
187
190
|
VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
188
191
|
unsigned long long data_len, char *values[], long value_sizes[], VALUE ruby_values[])
|
189
192
|
{
|
190
|
-
|
191
|
-
int match_count = 0;
|
193
|
+
unsigned long long i = 0;
|
192
194
|
struct aho_trie_node* travasal_node = NULL;
|
193
195
|
|
194
196
|
travasal_node = &(aho->trie.root);
|
195
|
-
|
196
|
-
|
197
|
-
long last_concat_pos = 0;
|
197
|
+
// Defer allocation until first match; handle no-match fast path
|
198
|
+
VALUE main_result = Qnil;
|
199
|
+
unsigned long long last_concat_pos = 0;
|
198
200
|
|
199
201
|
for (i = 0; i < data_len; i++)
|
200
202
|
{
|
201
|
-
struct aho_match_t match;
|
202
203
|
struct aho_text_t* result;
|
203
204
|
|
204
205
|
result = aho_find_trie_node(&travasal_node, data[i]);
|
@@ -207,43 +208,58 @@ VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
|
207
208
|
continue;
|
208
209
|
}
|
209
210
|
|
210
|
-
|
211
|
-
|
212
|
-
{
|
213
|
-
pos = i;
|
214
|
-
}
|
211
|
+
const int rlen = result->len;
|
212
|
+
unsigned long long pos = i - rlen + 1;
|
215
213
|
|
216
|
-
//
|
217
|
-
if (
|
214
|
+
// On first match, allocate result and copy prefix if any
|
215
|
+
if (NIL_P(main_result)) {
|
216
|
+
main_result = rb_str_buf_new((long)data_len);
|
217
|
+
if (pos > 0) rb_str_cat(main_result, &data[0], pos);
|
218
|
+
} else if (pos > last_concat_pos) {
|
218
219
|
rb_str_cat(main_result, &data[last_concat_pos], pos - last_concat_pos);
|
219
220
|
}
|
220
221
|
|
221
222
|
// concatenate replace
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
223
|
+
const int rid = result->id;
|
224
|
+
if (values[rid] == NULL) {
|
225
|
+
// NULL value indicates either: (1) no replacement supplied, or (2) a proc to compute it
|
226
|
+
if (!NIL_P(ruby_values[rid])) {
|
227
|
+
VALUE proc_result = rb_funcall(ruby_values[rid], rb_intern("call"), 2, LONG2NUM((long)pos), LONG2NUM((long)(pos + rlen)));
|
228
|
+
if (RB_TYPE_P(proc_result, T_NIL)) {
|
229
|
+
rb_str_cat(main_result, &data[pos], rlen);
|
230
|
+
last_concat_pos = i + 1;
|
231
|
+
continue;
|
232
|
+
} else if (RB_TYPE_P(proc_result, T_STRING)) {
|
233
|
+
rb_str_cat(main_result, StringValuePtr(proc_result), RSTRING_LEN(proc_result));
|
234
|
+
last_concat_pos = i + 1;
|
235
|
+
continue;
|
236
|
+
} else {
|
237
|
+
VALUE string_result = rb_funcall(proc_result, rb_intern("to_s"), 0);
|
238
|
+
rb_str_cat(main_result, StringValuePtr(string_result), RSTRING_LEN(string_result));
|
239
|
+
last_concat_pos = i + 1;
|
240
|
+
continue;
|
241
|
+
}
|
242
|
+
} else {
|
243
|
+
// no replacement provided; keep original slice
|
244
|
+
rb_str_cat(main_result, &data[pos], rlen);
|
226
245
|
last_concat_pos = i + 1;
|
227
246
|
continue;
|
228
|
-
} else if (RB_TYPE_P(proc_result, T_STRING)) {
|
229
|
-
value_sizes[result->id] = RSTRING_LEN(proc_result);
|
230
|
-
values[result->id] = StringValuePtr(proc_result);
|
231
|
-
} else {
|
232
|
-
VALUE string_result = rb_funcall(proc_result, rb_intern("to_s"), 0);
|
233
|
-
value_sizes[result->id] = RSTRING_LEN(string_result);
|
234
|
-
values[result->id] = StringValuePtr(string_result);
|
235
247
|
}
|
236
248
|
}
|
237
249
|
|
238
|
-
rb_str_cat(main_result, values[
|
250
|
+
rb_str_cat(main_result, values[rid], value_sizes[rid]);
|
239
251
|
last_concat_pos = i + 1;
|
240
252
|
}
|
241
253
|
|
242
|
-
if (
|
243
|
-
|
254
|
+
if (NIL_P(main_result)) {
|
255
|
+
// No matches; return a copy of input (preserves previous API behavior of returning a new String)
|
256
|
+
return rb_str_new(data, (long)data_len);
|
257
|
+
} else {
|
258
|
+
if (last_concat_pos < data_len) {
|
259
|
+
rb_str_cat(main_result, &data[last_concat_pos], (long)(data_len - last_concat_pos));
|
260
|
+
}
|
261
|
+
return main_result;
|
244
262
|
}
|
245
|
-
|
246
|
-
return main_result;
|
247
263
|
}
|
248
264
|
|
249
265
|
inline void aho_register_match_callback(VALUE rb_result_container, struct ahocorasick * restrict aho,
|
@@ -1,3 +1,13 @@
|
|
1
1
|
require 'mkmf'
|
2
|
-
|
3
|
-
|
2
|
+
|
3
|
+
# Optional: allow users to tweak optimization flags
|
4
|
+
optflags = ENV['MSR_OPTFLAGS'] || '-O3 -fno-strict-aliasing'
|
5
|
+
warnflags = ENV['MSR_WARNFLAGS']
|
6
|
+
|
7
|
+
with_cflags(optflags) do
|
8
|
+
# avoid treating warnings as errors across diverse compilers
|
9
|
+
with_werror(false) do
|
10
|
+
create_header
|
11
|
+
create_makefile 'multi_string_replace/multi_string_replace'
|
12
|
+
end
|
13
|
+
end
|
@@ -14,14 +14,132 @@
|
|
14
14
|
#include "extconf.h"
|
15
15
|
#include <stdio.h>
|
16
16
|
#include <string.h>
|
17
|
-
#include <pthread.h>
|
18
17
|
#include "ahocorasick.h"
|
19
18
|
|
19
|
+
// Cached Ruby IDs for method lookups
|
20
|
+
static ID id_keys;
|
21
|
+
static ID id_values;
|
22
|
+
static ID id_to_s;
|
23
|
+
static ID id_call;
|
24
|
+
|
25
|
+
// Forward declaration used by Automaton methods
|
26
|
+
void callback_match_pos(VALUE rb_result_container, void *arg, struct aho_match_t* m);
|
27
|
+
|
28
|
+
// Automaton wrapper for reuse across calls
|
29
|
+
typedef struct {
|
30
|
+
struct ahocorasick aho;
|
31
|
+
VALUE keys_ary; // Ruby array of keys in insertion order (kept for GC safety and lookups)
|
32
|
+
int built;
|
33
|
+
} automaton_t;
|
34
|
+
|
35
|
+
static void automaton_free(void *ptr) {
|
36
|
+
if (!ptr) return;
|
37
|
+
automaton_t *au = (automaton_t*)ptr;
|
38
|
+
aho_destroy(&au->aho);
|
39
|
+
xfree(au);
|
40
|
+
}
|
41
|
+
|
42
|
+
static size_t automaton_size(const void *ptr) {
|
43
|
+
return ptr ? sizeof(automaton_t) : 0;
|
44
|
+
}
|
45
|
+
|
46
|
+
static void automaton_mark(void *ptr) {
|
47
|
+
if (!ptr) return;
|
48
|
+
automaton_t *au = (automaton_t*)ptr;
|
49
|
+
if (!NIL_P(au->keys_ary)) rb_gc_mark(au->keys_ary);
|
50
|
+
}
|
51
|
+
|
52
|
+
static const rb_data_type_t automaton_type = {
|
53
|
+
"MultiStringReplace::Automaton",
|
54
|
+
{ automaton_mark, automaton_free, automaton_size, },
|
55
|
+
0, 0, RUBY_TYPED_FREE_IMMEDIATELY
|
56
|
+
};
|
57
|
+
|
58
|
+
static VALUE automaton_alloc(VALUE klass) {
|
59
|
+
automaton_t *au;
|
60
|
+
VALUE obj = TypedData_Make_Struct(klass, automaton_t, &automaton_type, au);
|
61
|
+
memset(au, 0, sizeof(*au));
|
62
|
+
aho_init(&au->aho);
|
63
|
+
au->built = 0;
|
64
|
+
au->keys_ary = Qnil;
|
65
|
+
return obj;
|
66
|
+
}
|
67
|
+
|
68
|
+
static VALUE automaton_initialize(VALUE self, VALUE keys) {
|
69
|
+
automaton_t *au;
|
70
|
+
TypedData_Get_Struct(self, automaton_t, &automaton_type, au);
|
71
|
+
Check_Type(keys, T_ARRAY);
|
72
|
+
long size = RARRAY_LEN(keys);
|
73
|
+
au->keys_ary = rb_ary_new_capa(size);
|
74
|
+
|
75
|
+
for (long idx = 0; idx < size; idx++) {
|
76
|
+
VALUE entry = rb_ary_entry(keys, idx);
|
77
|
+
if (!RB_TYPE_P(entry, T_STRING)) {
|
78
|
+
entry = rb_funcall(entry, id_to_s, 0);
|
79
|
+
}
|
80
|
+
rb_ary_push(au->keys_ary, entry); // keep a reference
|
81
|
+
aho_add_match_text(&au->aho, StringValuePtr(entry), RSTRING_LEN(entry));
|
82
|
+
}
|
83
|
+
aho_create_trie(&au->aho);
|
84
|
+
au->built = 1;
|
85
|
+
return self;
|
86
|
+
}
|
87
|
+
|
88
|
+
static VALUE automaton_match(VALUE self, VALUE body) {
|
89
|
+
automaton_t *au;
|
90
|
+
TypedData_Get_Struct(self, automaton_t, &automaton_type, au);
|
91
|
+
if (!au->built) return rb_hash_new();
|
92
|
+
VALUE result = rb_hash_new();
|
93
|
+
char *target = StringValuePtr(body);
|
94
|
+
aho_register_match_callback(result, &au->aho, callback_match_pos, (void*)target);
|
95
|
+
aho_findtext(&au->aho, target, RSTRING_LEN(body));
|
96
|
+
return result;
|
97
|
+
}
|
98
|
+
|
99
|
+
static VALUE automaton_replace(VALUE self, VALUE body, VALUE replace) {
|
100
|
+
automaton_t *au;
|
101
|
+
TypedData_Get_Struct(self, automaton_t, &automaton_type, au);
|
102
|
+
if (!au->built) return body;
|
103
|
+
Check_Type(replace, T_HASH);
|
104
|
+
|
105
|
+
long size = RARRAY_LEN(au->keys_ary);
|
106
|
+
char **values = ALLOCA_N(char*, size);
|
107
|
+
long *value_sizes = ALLOCA_N(long, size);
|
108
|
+
VALUE *ruby_val = ALLOCA_N(VALUE, size);
|
109
|
+
for (long i = 0; i < size; i++) { values[i] = NULL; value_sizes[i] = 0; ruby_val[i] = Qnil; }
|
110
|
+
|
111
|
+
// Build arrays aligned with internal ids: id == index of key in keys_ary
|
112
|
+
for (long idx = 0; idx < size; idx++) {
|
113
|
+
VALUE key = rb_ary_entry(au->keys_ary, idx);
|
114
|
+
VALUE value = rb_hash_aref(replace, key);
|
115
|
+
if (NIL_P(value)) {
|
116
|
+
// no replacement provided => keep original text when encountered
|
117
|
+
values[idx] = NULL; // signals "copy original"
|
118
|
+
ruby_val[idx] = Qnil;
|
119
|
+
continue;
|
120
|
+
}
|
121
|
+
if (RB_TYPE_P(value, T_STRING)) {
|
122
|
+
values[idx] = StringValuePtr(value);
|
123
|
+
value_sizes[idx] = RSTRING_LEN(value);
|
124
|
+
ruby_val[idx] = value; // GC keep
|
125
|
+
} else if (rb_respond_to(value, id_call)) {
|
126
|
+
values[idx] = NULL;
|
127
|
+
ruby_val[idx] = value;
|
128
|
+
} else {
|
129
|
+
VALUE s = rb_funcall(value, id_to_s, 0);
|
130
|
+
values[idx] = StringValuePtr(s);
|
131
|
+
value_sizes[idx] = RSTRING_LEN(s);
|
132
|
+
ruby_val[idx] = s;
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
char *target = StringValuePtr(body);
|
137
|
+
return aho_replace_text(&au->aho, target, RSTRING_LEN(body), values, value_sizes, ruby_val);
|
138
|
+
}
|
139
|
+
|
20
140
|
void callback_match_pos(VALUE rb_result_container, void *arg, struct aho_match_t* m)
|
21
141
|
{
|
22
|
-
|
23
|
-
unsigned int i = m->pos, idx = 0;
|
24
|
-
|
142
|
+
// Append match position to array keyed by token id
|
25
143
|
VALUE key = LONG2NUM(m->id);
|
26
144
|
VALUE hash_value = rb_hash_aref(rb_result_container, key);
|
27
145
|
|
@@ -35,7 +153,6 @@ void callback_match_pos(VALUE rb_result_container, void *arg, struct aho_match_t
|
|
35
153
|
VALUE multi_string_match(VALUE self, VALUE body, VALUE keys)
|
36
154
|
{
|
37
155
|
Check_Type(keys, T_ARRAY);
|
38
|
-
int state;
|
39
156
|
VALUE result = rb_hash_new();
|
40
157
|
struct ahocorasick aho;
|
41
158
|
aho_init(&aho);
|
@@ -44,10 +161,8 @@ VALUE multi_string_match(VALUE self, VALUE body, VALUE keys)
|
|
44
161
|
|
45
162
|
for(long idx = 0; idx < size; idx++) {
|
46
163
|
VALUE entry = rb_ary_entry(keys, idx);
|
47
|
-
char *key_text;
|
48
|
-
|
49
164
|
if (!RB_TYPE_P(entry, T_STRING)) {
|
50
|
-
|
165
|
+
entry = rb_funcall(entry, id_to_s, 0);
|
51
166
|
}
|
52
167
|
|
53
168
|
aho_add_match_text(&aho, StringValuePtr(entry), RSTRING_LEN(entry));
|
@@ -64,18 +179,22 @@ VALUE multi_string_match(VALUE self, VALUE body, VALUE keys)
|
|
64
179
|
VALUE multi_string_replace(VALUE self, VALUE body, VALUE replace)
|
65
180
|
{
|
66
181
|
Check_Type(replace, T_HASH);
|
67
|
-
int state;
|
68
|
-
|
69
182
|
struct ahocorasick aho;
|
70
183
|
|
71
184
|
char *target = StringValuePtr(body);
|
72
|
-
VALUE keys = rb_funcall(replace,
|
73
|
-
VALUE replace_values = rb_funcall(replace,
|
185
|
+
VALUE keys = rb_funcall(replace, id_keys, 0);
|
186
|
+
VALUE replace_values = rb_funcall(replace, id_values, 0);
|
74
187
|
long size = RARRAY_LEN(keys);
|
75
188
|
|
189
|
+
if (size == 0) {
|
190
|
+
// Nothing to replace; return a copy of input (preserves API semantics)
|
191
|
+
return rb_str_dup(body);
|
192
|
+
}
|
193
|
+
|
76
194
|
long value_sizes[size];
|
77
195
|
char *values[size];
|
78
196
|
VALUE ruby_val[size];
|
197
|
+
for (long i = 0; i < size; i++) ruby_val[i] = Qnil; // GC guard and init
|
79
198
|
|
80
199
|
aho_init(&aho);
|
81
200
|
|
@@ -83,25 +202,23 @@ VALUE multi_string_replace(VALUE self, VALUE body, VALUE replace)
|
|
83
202
|
VALUE entry = rb_ary_entry(keys, idx);
|
84
203
|
VALUE value = rb_ary_entry(replace_values, idx);
|
85
204
|
|
86
|
-
char *key_text;
|
87
|
-
|
88
205
|
if (!RB_TYPE_P(entry, T_STRING)) {
|
89
|
-
entry = rb_funcall(entry,
|
206
|
+
entry = rb_funcall(entry, id_to_s, 0);
|
90
207
|
}
|
91
208
|
|
92
209
|
if (RB_TYPE_P(value, T_STRING)) {
|
93
210
|
values[idx] = StringValuePtr(value);
|
94
211
|
value_sizes[idx] = RSTRING_LEN(value);
|
212
|
+
ruby_val[idx] = value; // keep VALUE alive
|
95
213
|
} else {
|
96
|
-
|
97
|
-
VALUE responds_value = rb_funcall(value, rb_intern("respond_to?"), 1, rb_str_new_cstr("call"));
|
98
|
-
if (RB_TYPE_P(responds_value, T_TRUE)) {
|
214
|
+
if (rb_respond_to(value, id_call)) {
|
99
215
|
values[idx] = NULL;
|
100
216
|
ruby_val[idx] = value;
|
101
217
|
} else {
|
102
|
-
VALUE value_as_string = rb_funcall(value,
|
218
|
+
VALUE value_as_string = rb_funcall(value, id_to_s, 0);
|
103
219
|
values[idx] = StringValuePtr(value_as_string);
|
104
220
|
value_sizes[idx] = RSTRING_LEN(value_as_string);
|
221
|
+
ruby_val[idx] = value_as_string; // keep VALUE alive
|
105
222
|
}
|
106
223
|
}
|
107
224
|
|
@@ -116,6 +233,11 @@ VALUE multi_string_replace(VALUE self, VALUE body, VALUE replace)
|
|
116
233
|
void Init_multi_string_replace()
|
117
234
|
{
|
118
235
|
int state;
|
236
|
+
// Cache frequently used IDs
|
237
|
+
id_keys = rb_intern("keys");
|
238
|
+
id_values = rb_intern("values");
|
239
|
+
id_to_s = rb_intern("to_s");
|
240
|
+
id_call = rb_intern("call");
|
119
241
|
VALUE mod = rb_eval_string_protect("MultiStringReplace", &state);
|
120
242
|
|
121
243
|
if (state)
|
@@ -124,6 +246,15 @@ void Init_multi_string_replace()
|
|
124
246
|
} else {
|
125
247
|
rb_define_singleton_method(mod, "match", multi_string_match, 2);
|
126
248
|
rb_define_singleton_method(mod, "replace", multi_string_replace, 2);
|
249
|
+
|
250
|
+
// Define Automaton class under module
|
251
|
+
VALUE cAutomaton = rb_define_class_under(mod, "Automaton", rb_cObject);
|
252
|
+
rb_define_alloc_func(cAutomaton, automaton_alloc);
|
253
|
+
rb_define_method(cAutomaton, "initialize", automaton_initialize, 1);
|
254
|
+
rb_define_method(cAutomaton, "match", automaton_match, 1);
|
255
|
+
rb_define_method(cAutomaton, "replace", automaton_replace, 2);
|
256
|
+
// Convenience builder: Automaton.build(keys)
|
257
|
+
rb_define_singleton_method(cAutomaton, "build", RUBY_METHOD_FUNC(rb_class_new_instance), -1);
|
127
258
|
}
|
128
259
|
|
129
260
|
}
|
@@ -33,8 +33,8 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.require_paths = ["lib"]
|
34
34
|
spec.extensions = ["ext/multi_string_replace/extconf.rb"]
|
35
35
|
|
36
|
-
spec.add_development_dependency "bundler", "~> 2.3.26"
|
37
36
|
spec.add_development_dependency "rake"
|
38
37
|
spec.add_development_dependency "rspec", "~> 3.0"
|
39
38
|
spec.add_development_dependency "rake-compiler"
|
39
|
+
spec.add_development_dependency "benchmark"
|
40
40
|
end
|
metadata
CHANGED
@@ -1,29 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: multi_string_replace
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joseph Dayo
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: bundler
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 2.3.26
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 2.3.26
|
27
12
|
- !ruby/object:Gem::Dependency
|
28
13
|
name: rake
|
29
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,6 +51,20 @@ dependencies:
|
|
66
51
|
- - ">="
|
67
52
|
- !ruby/object:Gem::Version
|
68
53
|
version: '0'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: benchmark
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
type: :development
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
69
68
|
description: A fast multiple string replace library for ruby. Uses a C implementation
|
70
69
|
of the Aho–Corasick Algorithm
|
71
70
|
email:
|
@@ -75,9 +74,11 @@ extensions:
|
|
75
74
|
- ext/multi_string_replace/extconf.rb
|
76
75
|
extra_rdoc_files: []
|
77
76
|
files:
|
77
|
+
- ".github/workflows/ci.yml"
|
78
78
|
- ".gitignore"
|
79
79
|
- ".rspec"
|
80
80
|
- ".travis.yml"
|
81
|
+
- CHANGELOG.md
|
81
82
|
- CODE_OF_CONDUCT.md
|
82
83
|
- Gemfile
|
83
84
|
- Gemfile.lock
|
@@ -107,7 +108,6 @@ licenses:
|
|
107
108
|
- MIT
|
108
109
|
metadata:
|
109
110
|
allowed_push_host: https://rubygems.org
|
110
|
-
post_install_message:
|
111
111
|
rdoc_options: []
|
112
112
|
require_paths:
|
113
113
|
- lib
|
@@ -122,8 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
122
|
- !ruby/object:Gem::Version
|
123
123
|
version: '0'
|
124
124
|
requirements: []
|
125
|
-
rubygems_version: 3.
|
126
|
-
signing_key:
|
125
|
+
rubygems_version: 3.6.9
|
127
126
|
specification_version: 4
|
128
127
|
summary: A fast multiple string replace library for ruby. Uses a C implementation
|
129
128
|
of the Aho–Corasick Algorithm
|