multi_string_replace 0.1.0 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +4 -14
- data/LICENSE.txt +1 -1
- data/README.md +40 -3
- data/bin/benchmark.rb +4 -3
- data/ext/multi_string_replace/aho_trie.c +58 -73
- data/ext/multi_string_replace/aho_trie.h +3 -0
- data/ext/multi_string_replace/ahocorasick.c +28 -4
- data/ext/multi_string_replace/ahocorasick.h +15 -1
- data/ext/multi_string_replace/multi_string_replace.c +37 -9
- data/lib/multi_string_replace.rb +11 -1
- data/lib/multi_string_replace/version.rb +1 -1
- data/multi_string_replace.gemspec +4 -5
- data/replaced.txt +651 -0
- data/replaced2.txt +651 -0
- metadata +10 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '03680f4522d98a7a162df95c1151feedabe821febc15a03c7a98a6996dd43cc1'
|
4
|
+
data.tar.gz: 80222e5675b3310fd079e6099026f8fa584d77542a0a34a30a3c6c3c67643cdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bcda8e829e5fd9e747c567a44c09363f304bbc8162d661418fb491f6626019c158d8a60345bb814cab42598c3830d884e6b8b70b20215d9614cb36d6913928e
|
7
|
+
data.tar.gz: 3f2895d64ba1d7560e866104f499a592767fa1dd161c1ff3d975126091d8330f06e85e25f37182de46f05c5dcb51a3f8ed4e38b352b78c8a1955dde1fdca5396
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,22 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
multi_string_replace (
|
4
|
+
multi_string_replace (1.0.4)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
byebug (10.0.2)
|
10
|
-
coderay (1.1.2)
|
11
9
|
diff-lcs (1.3)
|
12
|
-
|
13
|
-
pry (0.11.3)
|
14
|
-
coderay (~> 1.1.0)
|
15
|
-
method_source (~> 0.9.0)
|
16
|
-
pry-byebug (3.6.0)
|
17
|
-
byebug (~> 10.0)
|
18
|
-
pry (~> 0.10)
|
19
|
-
rake (10.5.0)
|
10
|
+
rake (13.0.1)
|
20
11
|
rake-compiler (1.0.5)
|
21
12
|
rake
|
22
13
|
rspec (3.8.0)
|
@@ -39,10 +30,9 @@ PLATFORMS
|
|
39
30
|
DEPENDENCIES
|
40
31
|
bundler (~> 1.16)
|
41
32
|
multi_string_replace!
|
42
|
-
|
43
|
-
rake (~> 10.0)
|
33
|
+
rake
|
44
34
|
rake-compiler
|
45
35
|
rspec (~> 3.0)
|
46
36
|
|
47
37
|
BUNDLED WITH
|
48
|
-
1.16.
|
38
|
+
1.16.5
|
data/LICENSE.txt
CHANGED
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
-
THE SOFTWARE.
|
21
|
+
THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,8 +1,15 @@
|
|
1
|
+
|
2
|
+
[![Gem](https://img.shields.io/gem/v/multi_string_replace.svg)](https://rubygems.org/gems/multi_string_replace)
|
3
|
+
|
4
|
+
|
1
5
|
# MultiStringReplace
|
2
6
|
|
3
|
-
|
7
|
+
A fast multiple-string replacement library for Ruby. Uses a C implementation of the Aho–Corasick algorithm based
|
8
|
+
on https://github.com/morenice/ahocorasick while adding support for a few performance enhancements and on the
|
9
|
+
fly multiple string replacement.
|
4
10
|
|
5
|
-
|
11
|
+
If Regex is not needed, this library offers significant performance advantages over String.gsub() for large strings
|
12
|
+
and with a large number of tokens.
|
6
13
|
|
7
14
|
## Installation
|
8
15
|
|
@@ -22,7 +29,37 @@ Or install it yourself as:
|
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
25
|
-
|
32
|
+
```ruby
|
33
|
+
MultiStringReplace.match("The quick brown fox jumps over the lazy dog brown", ['brown', 'fox'])
|
34
|
+
# { 0 => [10, 44], 1 => [16] }
|
35
|
+
MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => 'wolf'})
|
36
|
+
# The quick black wolf jumps over the lazy dog black
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also pass in a Proc as a replacement value; it will only be evaluated when its token is encountered.
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => ->() { "cat" }})
|
43
|
+
```
|
44
|
+
|
45
|
+
The gem also adds an `mreplace` method to `String`, which does the same thing:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
"The quick brown fox jumps over the lazy dog brown".mreplace({'brown' => 'black', 'fox' => ->() { "cat" }})
|
49
|
+
```
|
50
|
+
|
51
|
+
## Performance
|
52
|
+
|
53
|
+
Performing token replacement on a 200K text file repeated 100 times
|
54
|
+
|
55
|
+
```
|
56
|
+
user system total real
|
57
|
+
multi gsub 1.322510 0.000000 1.322510 ( 1.344405)
|
58
|
+
MultiStringReplace 0.196823 0.007979 0.204802 ( 0.207219)
|
59
|
+
mreplace 0.200593 0.004031 0.204624 ( 0.205379)
|
60
|
+
```
|
61
|
+
|
62
|
+
Benchmark sources can be found here: <https://github.com/jedld/multi_word_replace/blob/master/bin/benchmark.rb>
|
26
63
|
|
27
64
|
## Development
|
28
65
|
|
data/bin/benchmark.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require "bundler/setup"
|
2
2
|
require 'multi_string_replace'
|
3
3
|
require 'benchmark'
|
4
|
-
require 'pry-byebug'
|
5
4
|
|
6
5
|
class String
|
7
6
|
def mgsub(key_value_pairs=[].freeze)
|
@@ -27,12 +26,14 @@ replace = {
|
|
27
26
|
'Cras' => 'uuuuuuuu',
|
28
27
|
'nunc' => 'eeeeeee',
|
29
28
|
'cursus' => '乧乨乩乪乫乬乭乮乯买乱乲乳乴乵乶乷乸乹乺乻乼乽乾乿',
|
29
|
+
'Vivamus' => '㐀㐁㐂㐃㐄㐅㐆㐇㐈㐉㐊㐋'
|
30
30
|
}
|
31
31
|
|
32
32
|
File.write('replaced.txt', body.gsub(/(#{replace.keys.join('|')})/, replace))
|
33
33
|
File.write('replaced2.txt', MultiStringReplace.replace(body, replace))
|
34
34
|
|
35
35
|
Benchmark.bmbm do |x|
|
36
|
-
x.report "multi gsub" do body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) end
|
37
|
-
x.report "MultiStringReplace" do MultiStringReplace.replace(body, replace) end
|
36
|
+
x.report "multi gsub" do 100.times { body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) } end
|
37
|
+
x.report "MultiStringReplace" do 100.times { MultiStringReplace.replace(body, replace) } end
|
38
|
+
x.report "mreplace" do 100.times { body.mreplace(replace) } end
|
38
39
|
end
|
@@ -29,60 +29,48 @@ bool aho_add_trie_node(struct aho_trie * restrict t, struct aho_text_t * restric
|
|
29
29
|
|
30
30
|
for (int text_idx = 0; text_idx < text->len; text_idx++)
|
31
31
|
{
|
32
|
-
unsigned
|
32
|
+
unsigned int node_text = text->text[text_idx];
|
33
33
|
bool find_node = false;
|
34
34
|
int child_idx = 0;
|
35
35
|
|
36
36
|
if (travasal_node->child_count == 0)
|
37
37
|
{
|
38
38
|
/* insert first node to child_list */
|
39
|
-
|
40
|
-
(struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
41
|
-
travasal_node->child_count++;
|
39
|
+
struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
42
40
|
|
43
|
-
|
44
|
-
travasal_node->
|
45
|
-
travasal_node->
|
41
|
+
travasal_node->child_list[node_text] = child;
|
42
|
+
travasal_node->first_child = child;
|
43
|
+
travasal_node->last_child = child;
|
44
|
+
travasal_node->child_count++;
|
46
45
|
|
47
|
-
|
46
|
+
__aho_trie_node_init(child);
|
47
|
+
child->text = node_text;
|
48
|
+
child->parent = travasal_node;
|
49
|
+
child->failure_link = &(t->root);
|
50
|
+
travasal_node = child;
|
48
51
|
continue;
|
49
52
|
}
|
50
53
|
|
51
|
-
if (travasal_node->
|
52
|
-
{
|
53
|
-
return false;
|
54
|
-
}
|
55
|
-
|
56
|
-
for (child_idx=0; child_idx < travasal_node->child_count; child_idx++)
|
57
|
-
{
|
58
|
-
if (travasal_node->child_list[child_idx]->text == node_text )
|
59
|
-
{
|
60
|
-
find_node = true;
|
61
|
-
break;
|
62
|
-
}
|
63
|
-
}
|
64
|
-
|
65
|
-
if (find_node == true)
|
54
|
+
if ( travasal_node->child_list[node_text] != NULL)
|
66
55
|
{
|
67
|
-
travasal_node->child_list[
|
68
|
-
travasal_node = travasal_node->child_list[
|
56
|
+
travasal_node->child_list[node_text]->ref_count++;
|
57
|
+
travasal_node = travasal_node->child_list[node_text];
|
69
58
|
}
|
70
59
|
else
|
71
60
|
{
|
72
61
|
/* push_back to child_list */
|
73
|
-
struct aho_trie_node*
|
74
|
-
|
75
|
-
travasal_node->child_list[travasal_node->child_count] =
|
76
|
-
(struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
62
|
+
struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
|
77
63
|
|
78
|
-
|
64
|
+
travasal_node->child_list[node_text] = child;
|
79
65
|
travasal_node->child_count++;
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
66
|
+
travasal_node->last_child->next = child;
|
67
|
+
travasal_node->last_child = child;
|
68
|
+
|
69
|
+
__aho_trie_node_init(child);
|
70
|
+
child->text = node_text;
|
71
|
+
child->parent = travasal_node;
|
72
|
+
child->failure_link = &(t->root);
|
73
|
+
travasal_node = child;
|
86
74
|
}
|
87
75
|
}
|
88
76
|
|
@@ -99,7 +87,6 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
|
|
99
87
|
{
|
100
88
|
struct aho_trie_node *pf = NULL;
|
101
89
|
int i = 0;
|
102
|
-
|
103
90
|
/* is root node */
|
104
91
|
if (p->parent == NULL)
|
105
92
|
{
|
@@ -108,26 +95,26 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
|
|
108
95
|
}
|
109
96
|
|
110
97
|
pf = p->failure_link;
|
111
|
-
|
98
|
+
|
99
|
+
/* check child node of failure link(p) */
|
100
|
+
if (pf->child_list[q->text] != NULL)
|
112
101
|
{
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
/* connect failure link */
|
117
|
-
q->failure_link = pf->child_list[i];
|
102
|
+
struct aho_trie_node *node = pf->child_list[q->text];
|
103
|
+
/* connect failure link */
|
104
|
+
q->failure_link =node;
|
118
105
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
}
|
124
|
-
else
|
125
|
-
{
|
126
|
-
q->output_link = pf->child_list[i]->output_link;
|
127
|
-
}
|
128
|
-
return true;
|
106
|
+
/* connect output link */
|
107
|
+
if (node->text_end)
|
108
|
+
{
|
109
|
+
q->output_link = node;
|
129
110
|
}
|
111
|
+
else
|
112
|
+
{
|
113
|
+
q->output_link = node->output_link;
|
114
|
+
}
|
115
|
+
return true;
|
130
116
|
}
|
117
|
+
|
131
118
|
return false;
|
132
119
|
}
|
133
120
|
|
@@ -158,20 +145,20 @@ void aho_connect_link(struct aho_trie * restrict t)
|
|
158
145
|
free(queue_node);
|
159
146
|
|
160
147
|
/* get child node list of p */
|
161
|
-
|
148
|
+
struct aho_trie_node *child_ptr = p->first_child;
|
149
|
+
while (child_ptr != NULL)
|
162
150
|
{
|
163
151
|
struct aho_trie_node *pf = p;
|
164
|
-
|
165
|
-
|
166
|
-
q = p->child_list[i];
|
152
|
+
aho_queue_enqueue(&queue, child_ptr);
|
153
|
+
q = child_ptr;
|
167
154
|
|
168
155
|
while (__aho_connect_link(pf, q) == false)
|
169
156
|
{
|
170
157
|
pf = pf->failure_link;
|
171
158
|
}
|
159
|
+
child_ptr = child_ptr->next;
|
172
160
|
}
|
173
161
|
}
|
174
|
-
|
175
162
|
aho_queue_destroy(&queue);
|
176
163
|
}
|
177
164
|
|
@@ -180,7 +167,6 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
|
|
180
167
|
struct aho_queue queue;
|
181
168
|
aho_queue_init(&queue);
|
182
169
|
aho_queue_enqueue(&queue, &(t->root));
|
183
|
-
|
184
170
|
/* BFS */
|
185
171
|
while (true)
|
186
172
|
{
|
@@ -197,9 +183,11 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
|
|
197
183
|
remove_node = queue_node->data;
|
198
184
|
free(queue_node);
|
199
185
|
|
200
|
-
|
186
|
+
struct aho_trie_node *child_ptr = remove_node->first_child;
|
187
|
+
while (child_ptr != NULL)
|
201
188
|
{
|
202
|
-
aho_queue_enqueue(&queue,
|
189
|
+
aho_queue_enqueue(&queue, child_ptr);
|
190
|
+
child_ptr = child_ptr->next;
|
203
191
|
}
|
204
192
|
|
205
193
|
/* is root node */
|
@@ -218,14 +206,12 @@ bool __aho_find_trie_node(struct aho_trie_node** restrict start, const unsigned
|
|
218
206
|
int i = 0;
|
219
207
|
|
220
208
|
search_node = *start;
|
221
|
-
|
209
|
+
|
210
|
+
if (search_node->child_list[(unsigned int)text] != NULL)
|
222
211
|
{
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
*start = search_node->child_list[i];
|
227
|
-
return true;
|
228
|
-
}
|
212
|
+
/* find it! move to find child node! */
|
213
|
+
*start = search_node->child_list[(unsigned int)text];
|
214
|
+
return true;
|
229
215
|
}
|
230
216
|
|
231
217
|
/* not found */
|
@@ -243,7 +229,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
|
|
243
229
|
{
|
244
230
|
return NULL;
|
245
231
|
}
|
246
|
-
|
247
232
|
/* retry find. move failure link. */
|
248
233
|
*start = (*start)->failure_link;
|
249
234
|
}
|
@@ -260,7 +245,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
|
|
260
245
|
{
|
261
246
|
return (*start)->output_link->output_text;
|
262
247
|
}
|
263
|
-
|
264
248
|
/* keep going */
|
265
249
|
return NULL;
|
266
250
|
}
|
@@ -286,10 +270,11 @@ void aho_print_trie(struct aho_trie * restrict t)
|
|
286
270
|
|
287
271
|
travasal_node = queue_node->data;
|
288
272
|
free(queue_node);
|
289
|
-
|
290
|
-
|
273
|
+
struct aho_trie_node *child_ptr = travasal_node->first_child;
|
274
|
+
while (child_ptr != NULL)
|
291
275
|
{
|
292
|
-
aho_queue_enqueue(&queue,
|
276
|
+
aho_queue_enqueue(&queue, child_ptr);
|
277
|
+
child_ptr = child_ptr->next;
|
293
278
|
}
|
294
279
|
|
295
280
|
/* is root node */
|
@@ -11,6 +11,9 @@ struct aho_trie_node
|
|
11
11
|
|
12
12
|
struct aho_trie_node* parent;
|
13
13
|
struct aho_trie_node* child_list[MAX_AHO_CHILD_NODE];
|
14
|
+
struct aho_trie_node* first_child;
|
15
|
+
struct aho_trie_node* last_child;
|
16
|
+
struct aho_trie_node* next;
|
14
17
|
unsigned int child_count;
|
15
18
|
|
16
19
|
bool text_end;
|
@@ -1,3 +1,15 @@
|
|
1
|
+
|
2
|
+
// MIT License
|
3
|
+
|
4
|
+
// Copyright (c) 2017 morenice
|
5
|
+
|
6
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
// of this software and associated documentation files (the "Software"), to deal
|
8
|
+
// in the Software without restriction, including without limitation the rights
|
9
|
+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
// copies of the Software, and to permit persons to whom the Software is
|
11
|
+
// furnished to do so, subject to the following conditions:
|
12
|
+
|
1
13
|
#include <limits.h>
|
2
14
|
#include <string.h>
|
3
15
|
#include <stdlib.h>
|
@@ -20,6 +32,7 @@ void aho_destroy(struct ahocorasick * restrict aho)
|
|
20
32
|
int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsigned int len)
|
21
33
|
{
|
22
34
|
struct aho_text_t* a_text = NULL;
|
35
|
+
|
23
36
|
if (aho->accumulate_text_id == AHO_MAX_TEXT_ID)
|
24
37
|
{
|
25
38
|
return -1;
|
@@ -30,11 +43,14 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
|
|
30
43
|
goto lack_free_mem;
|
31
44
|
|
32
45
|
a_text->text = (char*) malloc(sizeof(char)*len);
|
46
|
+
|
33
47
|
if (!a_text->text)
|
34
48
|
goto lack_free_mem;
|
35
49
|
|
36
50
|
a_text->id = aho->accumulate_text_id++;
|
51
|
+
|
37
52
|
memcpy(a_text->text, text, len);
|
53
|
+
|
38
54
|
a_text->len = len;
|
39
55
|
a_text->prev = NULL;
|
40
56
|
a_text->next = NULL;
|
@@ -54,6 +70,7 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
|
|
54
70
|
return a_text->id;
|
55
71
|
|
56
72
|
lack_free_mem:
|
73
|
+
|
57
74
|
return -1;
|
58
75
|
}
|
59
76
|
|
@@ -110,7 +127,9 @@ void aho_create_trie(struct ahocorasick * restrict aho)
|
|
110
127
|
|
111
128
|
for (iter = aho->text_list_head; iter != NULL; iter = iter->next)
|
112
129
|
{
|
130
|
+
|
113
131
|
aho_add_trie_node(&(aho->trie), iter);
|
132
|
+
|
114
133
|
}
|
115
134
|
|
116
135
|
aho_connect_link(&(aho->trie));
|
@@ -162,7 +181,8 @@ unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, u
|
|
162
181
|
return match_count;
|
163
182
|
}
|
164
183
|
|
165
|
-
VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
184
|
+
VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
|
185
|
+
unsigned long long data_len, char *values[], long value_sizes[], VALUE ruby_values[])
|
166
186
|
{
|
167
187
|
int i = 0;
|
168
188
|
int match_count = 0;
|
@@ -191,14 +211,18 @@ VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data, unsi
|
|
191
211
|
}
|
192
212
|
|
193
213
|
// concatenate from last_concat_pos
|
194
|
-
|
214
|
+
if (pos > last_concat_pos) {
|
215
|
+
rb_str_cat(main_result, &data[last_concat_pos], pos - last_concat_pos);
|
216
|
+
}
|
217
|
+
|
195
218
|
// concatenate replace
|
196
219
|
if (values[result->id] == NULL) {
|
197
220
|
VALUE proc_result = rb_funcall(ruby_values[result->id], rb_intern("call"), 0);
|
198
|
-
|
221
|
+
value_sizes[result->id] = RSTRING_LEN(proc_result);
|
222
|
+
values[result->id] = StringValuePtr(proc_result);
|
199
223
|
}
|
200
224
|
|
201
|
-
|
225
|
+
rb_str_cat(main_result, values[result->id], value_sizes[result->id]);
|
202
226
|
last_concat_pos = i + 1;
|
203
227
|
}
|
204
228
|
|