multi_string_replace 0.1.0 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 212107102a63c46a717026fbc8c853da01204b73cfda68fa7ff58909d72cade7
4
- data.tar.gz: 59cafa3a2e9c934e4824a61f87372524b765ba52c4ddadda1283b42f52d4860a
3
+ metadata.gz: '03680f4522d98a7a162df95c1151feedabe821febc15a03c7a98a6996dd43cc1'
4
+ data.tar.gz: 80222e5675b3310fd079e6099026f8fa584d77542a0a34a30a3c6c3c67643cdb
5
5
  SHA512:
6
- metadata.gz: '073753252667508ac0006fe6b6b9d88d5890aaea4a102a94622dcc1e8d12dee9f3760fa1c98ca8ee343b289b474e6e0a89e8cd81379aa618b11d9096154d50e5'
7
- data.tar.gz: c18f648b5a010e926f8d79b219fc67ccd96dd76fe213003988e265c16279e24d0117678f12526be5c54021065da175101546d5fccc88e3f84a3a2dbeee6f7d15
6
+ metadata.gz: 6bcda8e829e5fd9e747c567a44c09363f304bbc8162d661418fb491f6626019c158d8a60345bb814cab42598c3830d884e6b8b70b20215d9614cb36d6913928e
7
+ data.tar.gz: 3f2895d64ba1d7560e866104f499a592767fa1dd161c1ff3d975126091d8330f06e85e25f37182de46f05c5dcb51a3f8ed4e38b352b78c8a1955dde1fdca5396
data/.gitignore CHANGED
@@ -10,6 +10,7 @@
10
10
  *.o
11
11
  *.log
12
12
  *.so
13
+ *.gem
13
14
 
14
15
  extconf.h
15
16
  Makefile
@@ -1,22 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- multi_string_replace (0.1.0)
4
+ multi_string_replace (1.0.4)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- byebug (10.0.2)
10
- coderay (1.1.2)
11
9
  diff-lcs (1.3)
12
- method_source (0.9.0)
13
- pry (0.11.3)
14
- coderay (~> 1.1.0)
15
- method_source (~> 0.9.0)
16
- pry-byebug (3.6.0)
17
- byebug (~> 10.0)
18
- pry (~> 0.10)
19
- rake (10.5.0)
10
+ rake (13.0.1)
20
11
  rake-compiler (1.0.5)
21
12
  rake
22
13
  rspec (3.8.0)
@@ -39,10 +30,9 @@ PLATFORMS
39
30
  DEPENDENCIES
40
31
  bundler (~> 1.16)
41
32
  multi_string_replace!
42
- pry-byebug
43
- rake (~> 10.0)
33
+ rake
44
34
  rake-compiler
45
35
  rspec (~> 3.0)
46
36
 
47
37
  BUNDLED WITH
48
- 1.16.2
38
+ 1.16.5
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
- THE SOFTWARE.
21
+ THE SOFTWARE.
data/README.md CHANGED
@@ -1,8 +1,15 @@
1
+
2
+ [![Gem](https://img.shields.io/gem/v/multi_string_replace.svg)](https://rubygems.org/gems/multi_string_replace)
3
+
4
+
1
5
  # MultiStringReplace
2
6
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/multi_string_replace`. To experiment with that code, run `bin/console` for an interactive prompt.
7
+ A fast multiple string replacement library for Ruby. It uses a C implementation of the Aho–Corasick algorithm based
8
+ on https://github.com/morenice/ahocorasick, adding a few performance enhancements and on-the-fly
9
+ multiple string replacement.
4
10
 
5
- TODO: Delete this and the text above, and describe your gem
11
+ If regular expressions are not needed, this library offers significant performance advantages over String#gsub for large strings
12
+ and large numbers of tokens.
6
13
 
7
14
  ## Installation
8
15
 
@@ -22,7 +29,37 @@ Or install it yourself as:
22
29
 
23
30
  ## Usage
24
31
 
25
- TODO: Write usage instructions here
32
+ ```ruby
33
+ MultiStringReplace.match("The quick brown fox jumps over the lazy dog brown", ['brown', 'fox'])
34
+ # { 0 => [10, 44], 1 => [16] }
35
+ MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => 'wolf'})
36
+ # The quick black wolf jumps over the lazy dog black
37
+ ```
38
+
39
+ You can also pass in a Proc; it is evaluated only when its token is encountered.
40
+
41
+ ```ruby
42
+ MultiStringReplace.replace("The quick brown fox jumps over the lazy dog brown", {'brown' => 'black', 'fox' => ->() { "cat" }})
43
+ ```
44
+
45
+ The gem also adds an `mreplace` method to String, which does the same thing:
46
+
47
+ ```ruby
48
+ "The quick brown fox jumps over the lazy dog brown".mreplace({'brown' => 'black', 'fox' => ->() { "cat" }})
49
+ ```
50
+
51
+ ## Performance
52
+
53
+ Performing token replacement on a 200K text file, repeated 100 times:
54
+
55
+ ```
56
+ user system total real
57
+ multi gsub 1.322510 0.000000 1.322510 ( 1.344405)
58
+ MultiStringReplace 0.196823 0.007979 0.204802 ( 0.207219)
59
+ mreplace 0.200593 0.004031 0.204624 ( 0.205379)
60
+ ```
61
+
62
+ Benchmark sources can be found here: <https://github.com/jedld/multi_word_replace/blob/master/bin/benchmark.rb>
26
63
 
27
64
  ## Development
28
65
 
@@ -1,7 +1,6 @@
1
1
  require "bundler/setup"
2
2
  require 'multi_string_replace'
3
3
  require 'benchmark'
4
- require 'pry-byebug'
5
4
 
6
5
  class String
7
6
  def mgsub(key_value_pairs=[].freeze)
@@ -27,12 +26,14 @@ replace = {
27
26
  'Cras' => 'uuuuuuuu',
28
27
  'nunc' => 'eeeeeee',
29
28
  'cursus' => '乧乨乩乪乫乬乭乮乯买乱乲乳乴乵乶乷乸乹乺乻乼乽乾乿',
29
+ 'Vivamus' => '㐀㐁㐂㐃㐄㐅㐆㐇㐈㐉㐊㐋'
30
30
  }
31
31
 
32
32
  File.write('replaced.txt', body.gsub(/(#{replace.keys.join('|')})/, replace))
33
33
  File.write('replaced2.txt', MultiStringReplace.replace(body, replace))
34
34
 
35
35
  Benchmark.bmbm do |x|
36
- x.report "multi gsub" do body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) end
37
- x.report "MultiStringReplace" do MultiStringReplace.replace(body, replace) end
36
+ x.report "multi gsub" do 100.times { body.mgsub(replace.map { |k, v| [/#{k}/, v] } ) } end
37
+ x.report "MultiStringReplace" do 100.times { MultiStringReplace.replace(body, replace) } end
38
+ x.report "mreplace" do 100.times { body.mreplace(replace) } end
38
39
  end
@@ -29,60 +29,48 @@ bool aho_add_trie_node(struct aho_trie * restrict t, struct aho_text_t * restric
29
29
 
30
30
  for (int text_idx = 0; text_idx < text->len; text_idx++)
31
31
  {
32
- unsigned char node_text = text->text[text_idx];
32
+ unsigned int node_text = text->text[text_idx];
33
33
  bool find_node = false;
34
34
  int child_idx = 0;
35
35
 
36
36
  if (travasal_node->child_count == 0)
37
37
  {
38
38
  /* insert first node to child_list */
39
- travasal_node->child_list[0] =
40
- (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
41
- travasal_node->child_count++;
39
+ struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
42
40
 
43
- __aho_trie_node_init(travasal_node->child_list[0]);
44
- travasal_node->child_list[0]->text = node_text;
45
- travasal_node->child_list[0]->parent = travasal_node;
41
+ travasal_node->child_list[node_text] = child;
42
+ travasal_node->first_child = child;
43
+ travasal_node->last_child = child;
44
+ travasal_node->child_count++;
46
45
 
47
- travasal_node = travasal_node->child_list[0];
46
+ __aho_trie_node_init(child);
47
+ child->text = node_text;
48
+ child->parent = travasal_node;
49
+ child->failure_link = &(t->root);
50
+ travasal_node = child;
48
51
  continue;
49
52
  }
50
53
 
51
- if (travasal_node->child_count == MAX_AHO_CHILD_NODE)
52
- {
53
- return false;
54
- }
55
-
56
- for (child_idx=0; child_idx < travasal_node->child_count; child_idx++)
57
- {
58
- if (travasal_node->child_list[child_idx]->text == node_text )
59
- {
60
- find_node = true;
61
- break;
62
- }
63
- }
64
-
65
- if (find_node == true)
54
+ if ( travasal_node->child_list[node_text] != NULL)
66
55
  {
67
- travasal_node->child_list[child_idx]->ref_count++;
68
- travasal_node = travasal_node->child_list[child_idx];
56
+ travasal_node->child_list[node_text]->ref_count++;
57
+ travasal_node = travasal_node->child_list[node_text];
69
58
  }
70
59
  else
71
60
  {
72
61
  /* push_back to child_list */
73
- struct aho_trie_node* child_node = NULL;
74
-
75
- travasal_node->child_list[travasal_node->child_count] =
76
- (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
62
+ struct aho_trie_node* child = (struct aho_trie_node*) malloc(sizeof(struct aho_trie_node));
77
63
 
78
- child_node = travasal_node->child_list[travasal_node->child_count];
64
+ travasal_node->child_list[node_text] = child;
79
65
  travasal_node->child_count++;
80
-
81
- __aho_trie_node_init(child_node);
82
- child_node->text = node_text;
83
- child_node->parent = travasal_node;
84
-
85
- travasal_node = child_node;
66
+ travasal_node->last_child->next = child;
67
+ travasal_node->last_child = child;
68
+
69
+ __aho_trie_node_init(child);
70
+ child->text = node_text;
71
+ child->parent = travasal_node;
72
+ child->failure_link = &(t->root);
73
+ travasal_node = child;
86
74
  }
87
75
  }
88
76
 
@@ -99,7 +87,6 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
99
87
  {
100
88
  struct aho_trie_node *pf = NULL;
101
89
  int i = 0;
102
-
103
90
  /* is root node */
104
91
  if (p->parent == NULL)
105
92
  {
@@ -108,26 +95,26 @@ bool __aho_connect_link(struct aho_trie_node* p, struct aho_trie_node* q)
108
95
  }
109
96
 
110
97
  pf = p->failure_link;
111
- for (i=0; i < pf->child_count; i++)
98
+
99
+ /* check child node of failure link(p) */
100
+ if (pf->child_list[q->text] != NULL)
112
101
  {
113
- /* check child node of failure link(p) */
114
- if (pf->child_list[i]->text == q->text )
115
- {
116
- /* connect failure link */
117
- q->failure_link = pf->child_list[i];
102
+ struct aho_trie_node *node = pf->child_list[q->text];
103
+ /* connect failure link */
104
+ q->failure_link =node;
118
105
 
119
- /* connect output link */
120
- if (pf->child_list[i]->text_end)
121
- {
122
- q->output_link = pf->child_list[i];
123
- }
124
- else
125
- {
126
- q->output_link = pf->child_list[i]->output_link;
127
- }
128
- return true;
106
+ /* connect output link */
107
+ if (node->text_end)
108
+ {
109
+ q->output_link = node;
129
110
  }
111
+ else
112
+ {
113
+ q->output_link = node->output_link;
114
+ }
115
+ return true;
130
116
  }
117
+
131
118
  return false;
132
119
  }
133
120
 
@@ -158,20 +145,20 @@ void aho_connect_link(struct aho_trie * restrict t)
158
145
  free(queue_node);
159
146
 
160
147
  /* get child node list of p */
161
- for (i=0; i < p->child_count; i++)
148
+ struct aho_trie_node *child_ptr = p->first_child;
149
+ while (child_ptr != NULL)
162
150
  {
163
151
  struct aho_trie_node *pf = p;
164
-
165
- aho_queue_enqueue(&queue, p->child_list[i]);
166
- q = p->child_list[i];
152
+ aho_queue_enqueue(&queue, child_ptr);
153
+ q = child_ptr;
167
154
 
168
155
  while (__aho_connect_link(pf, q) == false)
169
156
  {
170
157
  pf = pf->failure_link;
171
158
  }
159
+ child_ptr = child_ptr->next;
172
160
  }
173
161
  }
174
-
175
162
  aho_queue_destroy(&queue);
176
163
  }
177
164
 
@@ -180,7 +167,6 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
180
167
  struct aho_queue queue;
181
168
  aho_queue_init(&queue);
182
169
  aho_queue_enqueue(&queue, &(t->root));
183
-
184
170
  /* BFS */
185
171
  while (true)
186
172
  {
@@ -197,9 +183,11 @@ void aho_clean_trie_node(struct aho_trie * restrict t)
197
183
  remove_node = queue_node->data;
198
184
  free(queue_node);
199
185
 
200
- for (i=0; i < remove_node->child_count; i++)
186
+ struct aho_trie_node *child_ptr = remove_node->first_child;
187
+ while (child_ptr != NULL)
201
188
  {
202
- aho_queue_enqueue(&queue, remove_node->child_list[i]);
189
+ aho_queue_enqueue(&queue, child_ptr);
190
+ child_ptr = child_ptr->next;
203
191
  }
204
192
 
205
193
  /* is root node */
@@ -218,14 +206,12 @@ bool __aho_find_trie_node(struct aho_trie_node** restrict start, const unsigned
218
206
  int i = 0;
219
207
 
220
208
  search_node = *start;
221
- for (i = 0; i < search_node->child_count; i++)
209
+
210
+ if (search_node->child_list[(unsigned int)text] != NULL)
222
211
  {
223
- if (search_node->child_list[i]->text == text)
224
- {
225
- /* find it! move to find child node! */
226
- *start = search_node->child_list[i];
227
- return true;
228
- }
212
+ /* find it! move to find child node! */
213
+ *start = search_node->child_list[(unsigned int)text];
214
+ return true;
229
215
  }
230
216
 
231
217
  /* not found */
@@ -243,7 +229,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
243
229
  {
244
230
  return NULL;
245
231
  }
246
-
247
232
  /* retry find. move failure link. */
248
233
  *start = (*start)->failure_link;
249
234
  }
@@ -260,7 +245,6 @@ struct aho_text_t* aho_find_trie_node(struct aho_trie_node** restrict start, con
260
245
  {
261
246
  return (*start)->output_link->output_text;
262
247
  }
263
-
264
248
  /* keep going */
265
249
  return NULL;
266
250
  }
@@ -286,10 +270,11 @@ void aho_print_trie(struct aho_trie * restrict t)
286
270
 
287
271
  travasal_node = queue_node->data;
288
272
  free(queue_node);
289
-
290
- for (i=0; i < travasal_node->child_count; i++)
273
+ struct aho_trie_node *child_ptr = travasal_node->first_child;
274
+ while (child_ptr != NULL)
291
275
  {
292
- aho_queue_enqueue(&queue, travasal_node->child_list[i]);
276
+ aho_queue_enqueue(&queue, child_ptr);
277
+ child_ptr = child_ptr->next;
293
278
  }
294
279
 
295
280
  /* is root node */
@@ -11,6 +11,9 @@ struct aho_trie_node
11
11
 
12
12
  struct aho_trie_node* parent;
13
13
  struct aho_trie_node* child_list[MAX_AHO_CHILD_NODE];
14
+ struct aho_trie_node* first_child;
15
+ struct aho_trie_node* last_child;
16
+ struct aho_trie_node* next;
14
17
  unsigned int child_count;
15
18
 
16
19
  bool text_end;
@@ -1,3 +1,15 @@
1
+
2
+ // MIT License
3
+
4
+ // Copyright (c) 2017 morenice
5
+
6
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ // of this software and associated documentation files (the "Software"), to deal
8
+ // in the Software without restriction, including without limitation the rights
9
+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ // copies of the Software, and to permit persons to whom the Software is
11
+ // furnished to do so, subject to the following conditions:
12
+
1
13
  #include <limits.h>
2
14
  #include <string.h>
3
15
  #include <stdlib.h>
@@ -20,6 +32,7 @@ void aho_destroy(struct ahocorasick * restrict aho)
20
32
  int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsigned int len)
21
33
  {
22
34
  struct aho_text_t* a_text = NULL;
35
+
23
36
  if (aho->accumulate_text_id == AHO_MAX_TEXT_ID)
24
37
  {
25
38
  return -1;
@@ -30,11 +43,14 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
30
43
  goto lack_free_mem;
31
44
 
32
45
  a_text->text = (char*) malloc(sizeof(char)*len);
46
+
33
47
  if (!a_text->text)
34
48
  goto lack_free_mem;
35
49
 
36
50
  a_text->id = aho->accumulate_text_id++;
51
+
37
52
  memcpy(a_text->text, text, len);
53
+
38
54
  a_text->len = len;
39
55
  a_text->prev = NULL;
40
56
  a_text->next = NULL;
@@ -54,6 +70,7 @@ int aho_add_match_text(struct ahocorasick * restrict aho, const char* text, unsi
54
70
  return a_text->id;
55
71
 
56
72
  lack_free_mem:
73
+
57
74
  return -1;
58
75
  }
59
76
 
@@ -110,7 +127,9 @@ void aho_create_trie(struct ahocorasick * restrict aho)
110
127
 
111
128
  for (iter = aho->text_list_head; iter != NULL; iter = iter->next)
112
129
  {
130
+
113
131
  aho_add_trie_node(&(aho->trie), iter);
132
+
114
133
  }
115
134
 
116
135
  aho_connect_link(&(aho->trie));
@@ -162,7 +181,8 @@ unsigned int aho_findtext(struct ahocorasick * restrict aho, const char* data, u
162
181
  return match_count;
163
182
  }
164
183
 
165
- VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data, unsigned long long data_len, char *values[], VALUE ruby_values[])
184
+ VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data,
185
+ unsigned long long data_len, char *values[], long value_sizes[], VALUE ruby_values[])
166
186
  {
167
187
  int i = 0;
168
188
  int match_count = 0;
@@ -191,14 +211,18 @@ VALUE aho_replace_text(struct ahocorasick * restrict aho, const char* data, unsi
191
211
  }
192
212
 
193
213
  // concatenate from last_concat_pos
194
- rb_str_cat(main_result, &data[last_concat_pos], pos - last_concat_pos);
214
+ if (pos > last_concat_pos) {
215
+ rb_str_cat(main_result, &data[last_concat_pos], pos - last_concat_pos);
216
+ }
217
+
195
218
  // concatenate replace
196
219
  if (values[result->id] == NULL) {
197
220
  VALUE proc_result = rb_funcall(ruby_values[result->id], rb_intern("call"), 0);
198
- values[result->id] = StringValueCStr(proc_result);
221
+ value_sizes[result->id] = RSTRING_LEN(proc_result);
222
+ values[result->id] = StringValuePtr(proc_result);
199
223
  }
200
224
 
201
- rb_str_cat2(main_result, values[result->id]);
225
+ rb_str_cat(main_result, values[result->id], value_sizes[result->id]);
202
226
  last_concat_pos = i + 1;
203
227
  }
204
228