string_metric 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +0 -2
- data/Rakefile +26 -0
- data/benchmarks/dictionary.rb +44 -0
- data/ext/string_metric/levenshtein/extconf.rb +3 -0
- data/ext/string_metric/levenshtein/trie_radix_tree_ext.c +112 -0
- data/lib/string_metric/levenshtein/trie_node.rb +23 -0
- data/lib/string_metric/levenshtein/trie_radix_tree.rb +49 -0
- data/lib/string_metric/levenshtein/trie_radix_tree_ext.rb +20 -0
- data/lib/string_metric/levenshtein.rb +3 -0
- data/lib/string_metric/version.rb +1 -1
- data/spec/fixtures/dictionary_input.txt +10 -0
- data/string_metric.gemspec +3 -0
- metadata +26 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
|
4
|
+
data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
|
7
|
+
data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/Rakefile
CHANGED
@@ -1 +1,27 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
5
|
+
|
6
|
+
# The native extension is only built on MRI (RUBY_ENGINE == "ruby"); on any
# other engine the default task just runs the specs.
if RUBY_ENGINE == "ruby"
  require "rake/extensiontask"

  gem_name = "string_metric"
  dir = "#{gem_name}/levenshtein"
  spec = Gem::Specification.load("#{gem_name}.gemspec")

  # Sets up the build of ext/string_metric/levenshtein into
  # lib/string_metric/levenshtein (this is what provides the `compile` task).
  Rake::ExtensionTask.new do |ext|
    ext.name = "trie_radix_tree_ext"
    ext.ext_dir = "ext/#{dir}"
    ext.lib_dir = "lib/#{dir}"
    ext.gem_spec = spec
  end

  # The library requires the compiled extension at load time, so the specs
  # need it built first even when `rake spec` is run on its own.
  task spec: :compile

  # Declare the order as prerequisites instead of invoking tasks by hand.
  task default: [:compile, :spec]
else
  task default: :spec
end
|
27
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'string_metric'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'pp'
|
4
|
+
|
5
|
+
# Benchmarks three ways of collecting every dictionary word that lies within
# `max_distance` edits of each input word:
#   * a brute-force scan calling IterativeWithTwoMatrixRowsOptimized per pair,
#   * the pure-Ruby trie search (TrieRadixTree),
#   * the C-extension trie search (TrieRadixTreeExt).
Benchmark.bmbm(7) do |x|
  options = {}
  max_distance = 2

  # Load the system dictionary once, both as a flat list (for the brute-force
  # scan) and as a trie (for the two trie searches). File.foreach closes the
  # file when iteration ends, unlike a bare File.open(...).each_line.
  dict = []
  trie = StringMetric::Levenshtein::TrieNode.new
  File.foreach('/usr/share/dict/words') do |line|
    word = line.chomp
    trie.insert(word)
    dict << word
  end

  random_words = File.foreach('spec/fixtures/dictionary_input.txt').map(&:chomp)

  # bmbm runs every report block twice (rehearsal, then the measured pass), so
  # each block resets its accumulator to keep the two passes identical.
  matrix_results = []
  x.report("two_matrix_rows_v2 implementation") do
    matrix_results.clear
    random_words.each do |from|
      dict.each do |to|
        distance = StringMetric::Levenshtein::IterativeWithTwoMatrixRowsOptimized.distance(from, to, options)
        matrix_results << to if distance <= max_distance
      end
    end
  end

  trie_results = []
  x.report("trie_radix_tree implementation") do
    trie_results.clear
    random_words.each do |from|
      trie_results << StringMetric::Levenshtein::TrieRadixTree.distance(from, trie, max_distance: max_distance)
    end
  end

  trie_results_ext = []
  x.report("trie_radix_tree_ext implementation") do
    trie_results_ext.clear
    random_words.each do |from|
      trie_results_ext << StringMetric::Levenshtein::TrieRadixTreeExt.distance(from, trie, max_distance: max_distance)
    end
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
void Init_trie_radix_tree_ext(void);
|
4
|
+
void search_recursive(VALUE node, int letter, int *current_row, VALUE results);
|
5
|
+
VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
|
6
|
+
VALUE _max_distance, VALUE _insertion_cost,
|
7
|
+
VALUE _deletion_cost, VALUE _substitution_cost);
|
8
|
+
|
9
|
+
#define MIN2(a, b) (((a) < (b)) ? (a) : (b))
|
10
|
+
#define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))
|
11
|
+
|
12
|
+
#define MALLOC_W(ptr, size) do { \
|
13
|
+
ptr = malloc(size); \
|
14
|
+
if (!ptr) \
|
15
|
+
rb_memerror(); \
|
16
|
+
} while (0)
|
17
|
+
|
18
|
+
// Declare the variables that don't change as global, so that we don't have to pass them around
|
19
|
+
// in our recursive function and increase the stack frame unnecessarily
|
20
|
+
int *from, from_len;
|
21
|
+
int max_distance, insertion_cost, deletion_cost, substitution_cost;
|
22
|
+
|
23
|
+
void Init_trie_radix_tree_ext(void) {
|
24
|
+
|
25
|
+
VALUE StringMetric = rb_define_module("StringMetric");
|
26
|
+
VALUE Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
|
27
|
+
VALUE TrieRadixTreeExt = rb_define_class_under(Levenshtein, "TrieRadixTreeExt", rb_cObject);
|
28
|
+
|
29
|
+
rb_define_singleton_method(TrieRadixTreeExt, "trie_ext", search_ext, 7);
|
30
|
+
}
|
31
|
+
|
32
|
+
// Native implementation of TrieRadixTreeExt.trie_ext.
//
//   _from              Array of Integer codepoints of the word being looked up
//   _from_len          number of codepoints of _from to use
//   trie_node          root node; must respond to `children` with a Hash that
//                      maps codepoint => child node
//   _max_distance      largest distance that is still reported
//   _insertion_cost, _deletion_cost, _substitution_cost
//                      Integer edit costs
//
// Returns a Hash mapping each matching word to its distance.
//
// Raises TypeError / RangeError when an argument is not an Integer that fits
// in a C int, and ArgumentError when _from_len is negative or larger than the
// codepoint array.
VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
                 VALUE _max_distance, VALUE _insertion_cost,
                 VALUE _deletion_cost, VALUE _substitution_cost)
{
    long i;
    int *first_row;
    VALUE from_buf, row_buf; // owners of the temporary buffers (see ALLOCV_N)
    VALUE results, letter, children, children_keys;

    Check_Type(_from, T_ARRAY);

    // NUM2INT type- and range-checks its argument; FIX2INT would silently
    // reinterpret nil, Floats or any other object as a number.
    max_distance = NUM2INT(_max_distance);
    insertion_cost = NUM2INT(_insertion_cost);
    deletion_cost = NUM2INT(_deletion_cost);
    substitution_cost = NUM2INT(_substitution_cost);
    from_len = NUM2INT(_from_len);

    if (from_len < 0 || from_len > RARRAY_LEN(_from))
        rb_raise(rb_eArgError,
                 "from_len (%d) must be between 0 and the codepoint array length (%ld)",
                 from_len, RARRAY_LEN(_from));

    // ALLOCV_N buffers are released automatically if any of the Ruby calls
    // below raises, so an exception no longer leaks memory the way a plain
    // malloc/free pair does.
    from = ALLOCV_N(int, from_buf, from_len);
    for (i = 0; i < from_len; i++)
        from[i] = NUM2INT(rb_ary_entry(_from, i));

    // Create a hash to store the results and return it to ruby when we are done
    results = rb_hash_new();

    // Row for the empty trie prefix. Each step to the right is priced with
    // insertion_cost in search_recursive, so the first row is priced the same
    // way (identical to 0..from_len for the default cost of 1).
    first_row = ALLOCV_N(int, row_buf, from_len + 1);
    for (i = 0; i <= from_len; i++)
        first_row[i] = (int)i * insertion_cost;

    // Extract the hash from trie_node object and walk every first-level child
    children = rb_funcall(trie_node, rb_intern("children"), 0);
    Check_Type(children, T_HASH);
    children_keys = rb_funcall(children, rb_intern("keys"), 0);

    for (i = 0; i < RARRAY_LEN(children_keys); i++) {
        letter = rb_ary_entry(children_keys, i);
        search_recursive(rb_hash_aref(children, letter), NUM2INT(letter), first_row, results);
    }

    ALLOCV_END(row_buf);
    ALLOCV_END(from_buf);
    from = NULL; // do not leave the global pointing at a released buffer
    return results;
}
|
71
|
+
|
72
|
+
// Visits one trie node: computes the distance row for the prefix ending in
// `letter` from the parent's row, records the node's word when it is within
// max_distance, and descends into the children while some cell of the row is
// still within max_distance.
//
//   node          trie node being visited (responds to `word` and `children`)
//   letter        codepoint on the edge leading to `node`
//   previous_row  row of the parent node, from_len + 1 entries
//   results       Hash that receives word => distance
//
// Reads the file-level globals from, from_len, max_distance and the three
// cost values set up by search_ext.
void search_recursive(VALUE node, int letter, int *previous_row, VALUE results) {

    long k;
    int col, columns, distance, row_min, *current_row;
    int cost, insert_cost, delete_cost, replace_cost;
    VALUE row_buf; // owner of current_row (see ALLOCV_N)
    VALUE word, codepoint, children, children_keys;

    columns = from_len + 1;
    // ALLOCV_N memory is reclaimed automatically if a Ruby call below raises;
    // a plain malloc would leak one row per active recursion level.
    current_row = ALLOCV_N(int, row_buf, columns);

    // Moving one row down is priced with deletion_cost for every other column
    // (see delete_cost below), so column 0 must use the same price instead of
    // a hard-coded 1.
    current_row[0] = previous_row[0] + deletion_cost;
    row_min = current_row[0];

    for (col = 1; col < columns; col++) {
        cost = (from[col - 1] == letter) ? 0 : substitution_cost;
        insert_cost = current_row[col - 1] + insertion_cost;
        delete_cost = previous_row[col] + deletion_cost;
        replace_cost = previous_row[col - 1] + cost;

        current_row[col] = MIN3(insert_cost, delete_cost, replace_cost);
        if (current_row[col] < row_min)
            row_min = current_row[col];
    }

    // The last cell is the distance between `from` and the prefix spelled by
    // the path to this node; it only counts if a word ends here.
    distance = current_row[columns - 1];
    word = rb_funcall(node, rb_intern("word"), 0);

    if (distance <= max_distance && !NIL_P(word))
        rb_hash_aset(results, word, INT2FIX(distance));

    // Descend only while some cell of this row is still within max_distance.
    if (row_min <= max_distance) {
        children = rb_funcall(node, rb_intern("children"), 0);
        Check_Type(children, T_HASH);
        children_keys = rb_funcall(children, rb_intern("keys"), 0);

        for (k = 0; k < RARRAY_LEN(children_keys); k++) {
            codepoint = rb_ary_entry(children_keys, k);
            // NUM2INT raises TypeError for a non-Integer key instead of
            // reinterpreting the object bits like FIX2INT.
            search_recursive(rb_hash_aref(children, codepoint), NUM2INT(codepoint), current_row, results);
        }
    }
    ALLOCV_END(row_buf);
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # One node of a trie whose edges are Unicode codepoints.
    #
    # `children` maps a codepoint to the child TrieNode reached through it.
    # `word` holds the complete word on the node where an inserted word ends
    # and is nil on every other node.
    class TrieNode
      attr_accessor :word, :children

      def initialize
        @children = {}
        @word = nil
      end

      # Adds +word+ below this node, creating any missing nodes along its
      # codepoints, and stores the word on the final node.
      #
      # Returns the inserted word.
      def insert(word)
        last_node = word.codepoints.reduce(self) do |current, codepoint|
          current.children[codepoint] ||= TrieNode.new
        end
        last_node.word = word
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
  module Levenshtein
    # Finds every word stored in a trie that lies within a maximum Levenshtein
    # distance of a given word. The trie is walked depth-first; each node gets
    # one row of the distance matrix, computed from its parent's row.
    #
    # NOTE: the options of the running search are kept in class-level instance
    # variables, so concurrent calls to .distance would overwrite each other's
    # settings.
    class TrieRadixTree
      # Searches the trie rooted at +node+ for words close to +from+.
      #
      # @param from [String] the word to look up
      # @param node [#children] trie root; `children` maps codepoint => node
      #   and every node also responds to `word`
      # @param options [Hash] :max_distance (default 0), :insertion_cost,
      #   :deletion_cost, :substitution_cost (default 1 each)
      # @return [Array<Array(String, Integer)>] [word, distance] pairs in the
      #   order the trie walk finds them
      def self.distance(from, node, options = {})
        @max_distance = options[:max_distance] || 0
        @insertion_cost = options[:insertion_cost] || 1
        @deletion_cost = options[:deletion_cost] || 1
        @substitution_cost = options[:substitution_cost] || 1

        results = []
        word = from.codepoints.to_a

        # Row for the empty trie prefix. A step to the right is priced with
        # @insertion_cost in searchRecursive, so the first row uses the same
        # price (this equals 0..word.length for the default cost of 1).
        first_row = (0..word.length).map { |column| column * @insertion_cost }

        node.children.each do |letter, child|
          searchRecursive(child, letter, word, first_row, results)
        end

        results
      end

      # Computes the matrix row for +node+ (reached through +letter+) from
      # +previousRow+, appends [word, distance] to +results+ when a word ends
      # on this node within the limit, and recurses into the children while
      # some cell of the row is still within the limit.
      #
      # Relies on the cost and limit settings stored by .distance.
      def self.searchRecursive(node, letter, word, previousRow, results)
        # A step down is priced with @deletion_cost in every other column, so
        # the first column must use it too instead of a hard-coded 1.
        current_row = [previousRow[0] + @deletion_cost]

        (1..word.length).each do |column|
          insert_cost = current_row[column - 1] + @insertion_cost
          delete_cost = previousRow[column] + @deletion_cost
          cost = word[column - 1] == letter ? 0 : @substitution_cost
          replace_cost = previousRow[column - 1] + cost

          current_row << [insert_cost, delete_cost, replace_cost].min
        end

        # The last cell is the distance between the whole input word and the
        # prefix spelled by the path to this node.
        if current_row.last <= @max_distance && !node.word.nil?
          results << [node.word, current_row.last]
        end

        # Skip the subtree once no cell of this row is within the limit.
        return unless current_row.min <= @max_distance

        node.children.each do |child_letter, child|
          searchRecursive(child, child_letter, word, current_row, results)
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require_relative 'trie_radix_tree_ext.so'
|
4
|
+
|
5
|
+
module StringMetric
  module Levenshtein
    # Ruby front end for the native trie search: fills in default options and
    # forwards the call to the `trie_ext` singleton method defined by the C
    # extension.
    class TrieRadixTreeExt
      # @param from [String] the word to look up
      # @param trieNode [Object] trie root handed to the extension unchanged
      # @param options [Hash] :max_distance (default 0), :insertion_cost,
      #   :deletion_cost, :substitution_cost (default 1 each)
      # @return whatever `trie_ext` returns
      def self.distance(from, trieNode, options = {})
        fallbacks = [
          [:max_distance, 0],
          [:insertion_cost, 1],
          [:deletion_cost, 1],
          [:substitution_cost, 1]
        ]
        # A missing, nil or false option falls back to its default.
        settings = fallbacks.map { |key, fallback| options[key] || fallback }

        trie_ext(from.codepoints, from.length, trieNode, *settings)
      end
    end
  end
end
|
@@ -5,6 +5,9 @@ require_relative "levenshtein/iterative_with_two_matrix_rows"
|
|
5
5
|
require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
|
6
6
|
require_relative "levenshtein/iterative_with_full_matrix"
|
7
7
|
require_relative "levenshtein/recursive"
|
8
|
+
require_relative "levenshtein/trie_node"
|
9
|
+
require_relative "levenshtein/trie_radix_tree"
|
10
|
+
require_relative "levenshtein/trie_radix_tree_ext"
|
8
11
|
|
9
12
|
module StringMetric
|
10
13
|
# Levenshtein Distance implementation
|
data/string_metric.gemspec
CHANGED
@@ -24,6 +24,9 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency "text", "~> 1.2.3"
|
25
25
|
|
26
26
|
if RUBY_ENGINE == "ruby"
|
27
|
+
spec.add_development_dependency "rake-compiler", "~> 0.9.2"
|
28
|
+
spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
|
29
|
+
|
27
30
|
if RUBY_VERSION > "1.9.3"
|
28
31
|
spec.add_development_dependency "pry-byebug", "~> 1.2.1"
|
29
32
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_metric
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Giorgos Tsiftsis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 1.2.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake-compiler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.9.2
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.9.2
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: pry-byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,7 +98,8 @@ description: A simple library with String Metric algorithms
|
|
84
98
|
email:
|
85
99
|
- giorgos.tsiftsis@skroutz.gr
|
86
100
|
executables: []
|
87
|
-
extensions:
|
101
|
+
extensions:
|
102
|
+
- ext/string_metric/levenshtein/extconf.rb
|
88
103
|
extra_rdoc_files: []
|
89
104
|
files:
|
90
105
|
- ".gitignore"
|
@@ -94,7 +109,10 @@ files:
|
|
94
109
|
- LICENSE.txt
|
95
110
|
- README.md
|
96
111
|
- Rakefile
|
112
|
+
- benchmarks/dictionary.rb
|
97
113
|
- benchmarks/levenshtein.rb
|
114
|
+
- ext/string_metric/levenshtein/extconf.rb
|
115
|
+
- ext/string_metric/levenshtein/trie_radix_tree_ext.c
|
98
116
|
- lib/string_metric.rb
|
99
117
|
- lib/string_metric/levenshtein.rb
|
100
118
|
- lib/string_metric/levenshtein/experiment.rb
|
@@ -102,7 +120,11 @@ files:
|
|
102
120
|
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
|
103
121
|
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
|
104
122
|
- lib/string_metric/levenshtein/recursive.rb
|
123
|
+
- lib/string_metric/levenshtein/trie_node.rb
|
124
|
+
- lib/string_metric/levenshtein/trie_radix_tree.rb
|
125
|
+
- lib/string_metric/levenshtein/trie_radix_tree_ext.rb
|
105
126
|
- lib/string_metric/version.rb
|
127
|
+
- spec/fixtures/dictionary_input.txt
|
106
128
|
- spec/fixtures/levenshtein.csv
|
107
129
|
- spec/lib/levenshtein/experiment_spec.rb
|
108
130
|
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
@@ -138,6 +160,7 @@ signing_key:
|
|
138
160
|
specification_version: 4
|
139
161
|
summary: A simple library with String Metric algorithms
|
140
162
|
test_files:
|
163
|
+
- spec/fixtures/dictionary_input.txt
|
141
164
|
- spec/fixtures/levenshtein.csv
|
142
165
|
- spec/lib/levenshtein/experiment_spec.rb
|
143
166
|
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|