multi_string_replace 0.1.0 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +4 -14
- data/LICENSE.txt +1 -1
- data/README.md +40 -3
- data/bin/benchmark.rb +4 -3
- data/ext/multi_string_replace/aho_trie.c +58 -73
- data/ext/multi_string_replace/aho_trie.h +3 -0
- data/ext/multi_string_replace/ahocorasick.c +28 -4
- data/ext/multi_string_replace/ahocorasick.h +15 -1
- data/ext/multi_string_replace/multi_string_replace.c +37 -9
- data/lib/multi_string_replace.rb +11 -1
- data/lib/multi_string_replace/version.rb +1 -1
- data/multi_string_replace.gemspec +4 -5
- data/replaced.txt +651 -0
- data/replaced2.txt +651 -0
- metadata +10 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '03680f4522d98a7a162df95c1151feedabe821febc15a03c7a98a6996dd43cc1'
|
4
|
+
data.tar.gz: 80222e5675b3310fd079e6099026f8fa584d77542a0a34a30a3c6c3c67643cdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bcda8e829e5fd9e747c567a44c09363f304bbc8162d661418fb491f6626019c158d8a60345bb814cab42598c3830d884e6b8b70b20215d9614cb36d6913928e
|
7
|
+
data.tar.gz: 3f2895d64ba1d7560e866104f499a592767fa1dd161c1ff3d975126091d8330f06e85e25f37182de46f05c5dcb51a3f8ed4e38b352b78c8a1955dde1fdca5396
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,22 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
multi_string_replace (
|
4
|
+
multi_string_replace (1.0.4)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
byebug (10.0.2)
|
10
|
-
coderay (1.1.2)
|
11
9
|
diff-lcs (1.3)
|
12
|
-
|
13
|
-
pry (0.11.3)
|
14
|
-
coderay (~> 1.1.0)
|
15
|
-
method_source (~> 0.9.0)
|
16
|
-
pry-byebug (3.6.0)
|
17
|
-
byebug (~> 10.0)
|
18
|
-
pry (~> 0.10)
|
19
|
-
rake (10.5.0)
|
10
|
+
rake (13.0.1)
|
20
11
|
rake-compiler (1.0.5)
|
21
12
|
rake
|
22
13
|
rspec (3.8.0)
|
@@ -39,10 +30,9 @@ PLATFORMS
|
|
39
30
|
DEPENDENCIES
|
40
31
|
bundler (~> 1.16)
|
41
32
|
multi_string_replace!
|
42
|
-
|
43
|
-
rake (~> 10.0)
|
33
|
+
rake
|
44
34
|
rake-compiler
|
45
35
|
rspec (~> 3.0)
|
46
36
|
|
47
37
|
BUNDLED WITH
|
48
|
-
1.16.
|
38
|
+
1.16.5
|
data/LICENSE.txt
CHANGED
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
-
THE SOFTWARE.
|
21
|
+
THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,8 +1,15 @@
|
|
1
|
+
|
2
|
+
[](https://rubygems.org/gems/multi_string_replace)
|
3
|
+
|
4
|
+
|
1
5
|
# MultiStringReplace
|
2
6
|
|
3
|
-
|
7
|
+
A fast multiple string replace library for ruby. Uses a C implementation of the Aho–Corasick Algorithm based
|
8
|
+
on https://github.com/morenice/ahocorasick while adding support for a few performance enhancements and on the
|
9
|
+
fly multiple string replacement.
|
4
10
|
|
5
|
-
|
11
|
+
If regular expressions are not needed, this library offers significant performance advantages over String#gsub for large strings
|
12
|
+
and with a large number of tokens.
|
6
13
|
|
7
14
|
## Installation
|
8
15
|
|
@@ -22,7 +29,37 @@ Or install it yourself as:
|
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
25
|
-
|
32
|
+
```ruby
|
33
|
+
MultiStringReplace.match("The quick brown fox jumps over the lazy dog brown", ['brown', 'fox'])
|
34
|
+
# { 0 => [10, 44], 1 => [16] }
|
35
|
+
MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => 'wolf'})
|
36
|
+
# The quick black wolf jumps over the lazy dog black
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also pass a Proc as a replacement value; it is only evaluated when its token is encountered.
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => ->() { "cat" }})
|
43
|
+
```
|
44
|
+
|
45
|
+
The gem also adds an `mreplace` method to String, which does the same thing:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
"The quick brown fox jumps over the lazy dog brown".mreplace({'brown' => 'black', 'fox' => ->() { "cat" }})
|
49
|
+
```
|
50
|
+
|
51
|
+
## Performance
|
52
|
+
|
53
|
+
Performing token replacement on a 200K text file, repeated 100 times:
|
54
|
+
|
55
|
+
```
|
56
|
+
user system total real
|
57
|
+
multi gsub 1.322510 0.000000 1.322510 ( 1.344405)
|
58
|
+
MultiStringReplace 0.196823 0.007979 0.204802 ( 0.207219)
|
59
|
+
mreplace 0.200593 0.004031 0.204624 ( 0.205379)
|
60
|
+
```
|
61
|
+
|
62
|
+
Benchmark sources can be found here: <https://github.com/jedld/multi_word_replace/blob/master/bin/benchmark.rb>
|
26
63
|
|
27
64
|
## Development
|
28
65
|
|
data/bin/benchmark.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require "bundler/setup"
|
2
2
|
require 'multi_string_replace'
|
3
3
|
require 'benchmark'
|
4
|
-
require 'pry-byebug'
|
5
4
|
|
6
5
|
class String
|
7
6
|
def mgsub(key_value_pairs=[].freeze)
|
@@ -27,12 +26,14 @@ replace = {
|
|
27
26
|
'Cras' => 'uuuuuuuu',
|
28
27
|
'nunc' => 'eeeeeee',
|
29
28
|
'cursus' => '乧乨乩乪乫乬乭乮乯买乱乲乳乴乵乶乷乸乹乺乻乼乽乾乿',
|
29
|
+
'Vivamus' => '㐀㐁㐂㐃㐄㐅㐆㐇㐈㐉㐊㐋'
|
30
30
|
}
|
31
31
|
|
32
32
|
File.write('replaced.txt', body.gsub(/(#{replace.keys.join('|')})/, replace))
|
33
33
|
File.write('replaced2.txt', MultiStringReplace.replace(body, replace))
|
34
34
|
|
35
35
|
Benchmark.bmbm do |x|
|
36
|
-
x.report "multi gsub" do body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) end
|
37
|
-
x.report "MultiStringReplace" do MultiStringReplace.replace(body, replace) end
|
36
|
+
x.report "multi gsub" do 100.times { body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) } end
|
37
|
+
x.report "MultiStringReplace" do 100.times { MultiStringReplace.replace(body, replace) } end
|
38
|
+
x.report "mreplace" do 100.times { body.mreplace(replace) } end
|
38
39
|
end
|
@@ -29,60 +29,48 @@ bool aho_add_trie_node(struct aho_trie * restrict t, struct aho_text_t * restric
|
|
29
29
|
|
30
30
|
for (int text_idx = 0; text_idx < text->len; text_idx++)
|
31
31
|
{
|
32
|
-
unsigned
|
32
|
+
unsigned int node_text = text->text[text_idx];
|
33
33
|
bool find_node = false;
|
34
34
|
int child_idx = 0;
|
35
35
|
|
36
36
|
if (travasal_node->child_count == 0)
|
37
37
|
{
|
38
38
|
/* insert first node to child_list */
|
39
|
-
|
40
|
-
(struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
41
|
-
travasal_node->child_count++;
|
39
|
+
struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
42
40
|
|
43
|
-
|
44
|
-
travasal_node->
|
45
|
-
travasal_node->
|
41
|
+
travasal_node->child_list[node_text] = child;
|
42
|
+
travasal_node->first_child = child;
|
43
|
+
travasal_node->last_child = child;
|
44
|
+
travasal_node->child_count++;
|
46
45
|
|
47
|
-
|
46
|
+
__aho_trie_node_init(child);
|
47
|
+
child->text = node_text;
|
48
|
+
child->parent = travasal_node;
|
49
|
+
child->failure_link = &(t->root);
|
50
|
+
travasal_node = child;
|
48
51
|
continue;
|
49
52
|
}
|
50
53
|
|
51
|
-
if (travasal_node->
|
52
|
-
{
|
53
|
-
return false;
|
54
|
-
}
|
55
|
-
|
56
|
-
for (child_idx=0; child_idx < travasal_node->child_count; child_idx++)
|
57
|
-
{
|
58
|
-
if (travasal_node->child_list[child_idx]->text == node_text )
|
59
|
-
{
|
60
|
-
find_node = true;
|
61
|
-
break;
|
62
|
-
}
|
63
|
-
}
|
64
|
-
|
65
|
-
if (find_node == true)
|
54
|
+
if ( travasal_node->child_list[node_text] != NULL)
|
66
55
|
{
|
67
|
-
travasal_node->child_list[
|
68
|
-
travasal_node = travasal_node->child_list[
|
56
|
+
travasal_node->child_list[node_text]->ref_count++;
|
57
|
+
travasal_node = travasal_node->child_list[node_text];
|
69
58
|
}
|
70
59
|
else
|
71
60
|
{
|
72
61
|
/* push_back to child_list */
|
73
|
-
struct aho_trie_node*
|
74
|
-
|
75
|
-
travasal_node->child_list[travasal_node->child_count] =
|
76
|
-
(struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
62
|
+
struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
77
63
|
|
78
|
-
|
64
|
+
travasal_node->child_list[node_text] = child;
|
79
65
|
travasal_node->child_count++;
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
66
|
+
travasal_node->last_child->next = child;
|
67
|
+
travasal_node->last_child = child;
|
68
|
+
|
69
|
+
__aho_trie_node_init(child);
|
70
|
+
child->text = node_text;
|
71
|
+
child->parent = travasal_node;
|
72
|
+
child->failure_link = &(t->root);
|
73
|
+
travasal_node = child;
|
86
74
|
}
|
87
75
|
}
|
88
76
|
|
@@ -99,7 +87,6 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
|
|
99
87
|
{
|
100
88
|
struct aho_trie_node *pf = NULL;
|
101
89
|
int i = 0;
|
102
|
-
|
103
90
|
/* is root node */
|
104
91
|
if (p->parent == NULL)
|
105
92
|
{
|
@@ -108,26 +95,26 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
|
|
108
95
|
}
|
109
96
|
|
110
97
|
pf = p->failure_link;
|
111
|
-
|
98
|
+
|
99
|
+
/* check child node of failure link(p) */
|
100
|
+
if (pf->child_list[q->text] != NULL)
|
112
101
|
{
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
/* connect failure link */
|
117
|
-
q->failure_link = pf->child_list[i];
|
102
|
+
struct aho_trie_node *node = pf->child_list[q->text];
|
103
|
+
/* connect failure link */
|
104
|
+
q->failure_link =node;
|
118
105
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
}
|
124
|
-
else
|
125
|
-
{
|
126
|
-
q->output_link = pf->child_list[i]->output_link;
|
127
|
-
}
|
128
|
-
return true;
|
106
|
+
/* connect output link */
|
107
|
+
if (node->text_end)
|
108
|
+
{
|
109
|
+
q->output_link = node;
|
129
110
|
}
|
111
|
+
else
|
112
|
+
{
|
113
|
+
q->output_link = node->output_link;
|
114
|
+
}
|
115
|
+
return true;
|
130
116
|
}
|
117
|
+
|
131
118
|
return false;
|
132
119
|
}
|
133
120
|
|
@@ -158,20 +145,20 @@ void aho_connect_link(struct aho_trie * restrict t)
|
|
158
145
|
free(queue_node);
|
159
146
|
|
160
147
|
/* get child node list of p */
|
161
|
-
|
148
|
+
struct aho_trie_node *child_ptr = p->first_child;
|
149
|
+
while (child_ptr != NULL)
|
162
150
|
{
|
163
151
|
struct aho_trie_node *pf = p;
|
164
|
-
|
165
|
-
|
166
|
-
q = p->child_list[i];
|
152
|
+
aho_queue_enqueue(&queue, child_ptr);
|
153
|
+
q = child_ptr;
|
167
154
|
|
168
155
|
while (__aho_connect_link(pf, q) == false)
|
169
156
|
{
|
170
157
|
pf = pf->failure_link;
|
171
158
|
}
|
159
|
+
child_ptr = child_ptr->next;
|
172
160
|
}
|
173
161
|
}
|
174
|
-
|
175
162
|
aho_queue_destroy(&queue);
|
176
163
|
}
|
177
164
|
|
@@ -180,7 +167,6 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
|
|
180
167
|
struct aho_queue queue;
|
181
168
|
aho_queue_init(&queue);
|
182
169
|
aho_queue_enqueue(&queue, &(t->root));
|
183
|
-
|
184
170
|
/* BFS */
|
185
171
|
while (true)
|
186
172
|
{
|
@@ -197,9 +183,11 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
|
|
197
183
|
remove_node = queue_node->data;
|
198
184
|
free(queue_node);
|
199
185
|
|
200
|
-
|
186
|
+
struct aho_trie_node *child_ptr = remove_node->first_child;
|
187
|
+
while (child_ptr != NULL)
|
201
188
|
{
|
202
|
-
aho_queue_enqueue(&queue,
|
189
|
+
aho_queue_enqueue(&queue, child_ptr);
|
190
|
+
child_ptr = child_ptr->next;
|
203
191
|
}
|
204
192
|
|
205
193
|
/* is root node */
|
@@ -218,14 +206,12 @@ bool __aho_find_trie_node(struct aho_trie_node** restrict start, const unsigned
|
|
218
206
|
int i = 0;
|
219
207
|
|
220
208
|
search_node = *start;
|
221
|
-
|
209
|
+
|
210
|
+
if (search_node->child_list[(unsigned int)text] != NULL)
|
222
211
|
{
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
*start = search_node->child_list[i];
|
227
|
-
return true;
|
228
|
-
}
|
212
|
+
/* find it! move to find child node! */
|
213
|
+
*start = search_node->child_list[(unsigned int)text];
|
214
|
+
return true;
|
229
215
|
}
|
230
216
|
|
231
217
|
/* not found */
|
@@ -243,7 +229,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
|
|
243
229
|
{
|
244
230
|
return NULL;
|
245
231
|
}
|
246
|
-
|
247
232
|
/* retry find. move failure link. */
|
248
233
|
*start = (*start)->failure_link;
|
249
234
|
}
|
@@ -260,7 +245,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
|
|
260
245
|
{
|
261
246
|
return (*start)->output_link->output_text;
|
262
247
|
}
|
263
|
-
|
264
248
|
/* keep going */
|
265
249
|
return NULL;
|
266
250
|
}
|
@@ -286,10 +270,11 @@ void aho_print_trie(struct aho_trie * restrict t)
|
|
286
270
|
|
287
271
|
travasal_node = queue_node->data;
|
288
272
|
free(queue_node);
|
289
|
-
|
290
|
-
|
273
|
+
struct aho_trie_node *child_ptr = travasal_node->first_child;
|
274
|
+
while (child_ptr != NULL)
|
291
275
|
{
|
292
|
-
aho_queue_enqueue(&queue,
|
276
|
+
aho_queue_enqueue(&queue, child_ptr);
|
277
|
+
child_ptr = child_ptr->next;
|
293
278
|
}
|
294
279
|
|
295
280
|
/* is root node */
|
@@ -11,6 +11,9 @@ struct aho_trie_node
|
|
11
11
|
|
12
12
|
struct aho_trie_node* parent;
|
13
13
|
struct aho_trie_node* child_list[MAX_AHO_CHILD_NODE];
|
14
|
+
struct aho_trie_node* first_child;
|
15
|
+
struct aho_trie_node* last_child;
|
16
|
+
struct aho_trie_node* next;
|
14
17
|
unsigned int child_count;
|
15
18
|
|
16
19
|
bool text_end;
|
@@ -1,3 +1,15 @@
|
|
1
|
+
|
2
|
+
// MIT License
|
3
|
+
|
4
|
+
// Copyright (c) 2017 morenice
|
5
|
+
|
6
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
// of this software and associated documentation files (the "Software"), to deal
|
8
|
+
// in the Software without restriction, including without limitation the rights
|
9
|
+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
// copies of the Software, and to permit persons to whom the Software is
|
11
|
+
// furnished to do so, subject to the following conditions:
|
12
|
+
|
1
13
|
#include <limits.h>
|
2
14
|
#include <string.h>
|
3
15
|
#include <stdlib.h>
|
@@ -20,6 +32,7 @@ void aho_destroy(struct ahocorasick * restrict aho)
|
|
20
32
|
int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsigned int len)
|
21
33
|
{
|
22
34
|
struct aho_text_t* a_text = NULL;
|
35
|
+
|
23
36
|
if (aho->accumulate_text_id == AHO_MAX_TEXT_ID)
|
24
37
|
{
|
25
38
|
return -1;
|
@@ -30,11 +43,14 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
|
|
30
43
|
goto lack_free_mem;
|
31
44
|
|
32
45
|
a_text->text = (char*) malloc(sizeof(char)*len);
|
46
|
+
|
33
47
|
if (!a_text->text)
|
34
48
|
goto lack_free_mem;
|
35
49
|
|
36
50
|
a_text->id = aho->accumulate_text_id++;
|
51
|
+
|
37
52
|
memcpy(a_text->text, text, len);
|
53
|
+
|
38
54
|
a_text->len = len;
|
39
55
|
a_text->prev = NULL;
|
40
56
|
a_text->next = NULL;
|
@@ -54,6 +70,7 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
|
|
54
70
|
return a_text->id;
|
55
71
|
|
56
72
|
lack_free_mem:
|
73
|
+
|
57
74
|
return -1;
|
58
75
|
}
|
59
76
|
|
@@ -110,7 +127,9 @@ void aho_create_trie(struct ahocorasick * restrict aho)
|
|
110
127
|
|
111
128
|
for (iter = aho->text_list_head; iter != NULL; iter = iter->next)
|
112
129
|
{
|
130
|
+
|
113
131
|
aho_add_trie_node(&(aho->trie), iter);
|
132
|
+
|
114
133
|
}
|
115
134
|
|
116
135
|
aho_connect_link(&(aho->trie));
|
@@ -162,7 +181,8 @@ unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, u
|
|
162
181
|
return match_count;
|
163
182
|
}
|
164
183
|
|
165
|
-
VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
184
|
+
VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
185
|
+
unsigned long long data_len, char *values[], long value_sizes[], VALUE ruby_values[])
|
166
186
|
{
|
167
187
|
int i = 0;
|
168
188
|
int match_count = 0;
|
@@ -191,14 +211,18 @@ VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data, unsi
|
|
191
211
|
}
|
192
212
|
|
193
213
|
// concatenate from last_concat_pos
|
194
|
-
|
214
|
+
if (pos > last_concat_pos) {
|
215
|
+
rb_str_cat(main_result, &data[last_concat_pos], pos - last_concat_pos);
|
216
|
+
}
|
217
|
+
|
195
218
|
// concatenate replace
|
196
219
|
if (values[result->id] == NULL) {
|
197
220
|
VALUE proc_result = rb_funcall(ruby_values[result->id], rb_intern("call"), 0);
|
198
|
-
|
221
|
+
value_sizes[result->id] = RSTRING_LEN(proc_result);
|
222
|
+
values[result->id] = StringValuePtr(proc_result);
|
199
223
|
}
|
200
224
|
|
201
|
-
|
225
|
+
rb_str_cat(main_result, values[result->id], value_sizes[result->id]);
|
202
226
|
last_concat_pos = i + 1;
|
203
227
|
}
|
204
228
|
|