string_metric 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +0 -2
- data/Rakefile +26 -0
- data/benchmarks/dictionary.rb +44 -0
- data/ext/string_metric/levenshtein/extconf.rb +3 -0
- data/ext/string_metric/levenshtein/trie_radix_tree_ext.c +112 -0
- data/lib/string_metric/levenshtein/trie_node.rb +23 -0
- data/lib/string_metric/levenshtein/trie_radix_tree.rb +49 -0
- data/lib/string_metric/levenshtein/trie_radix_tree_ext.rb +20 -0
- data/lib/string_metric/levenshtein.rb +3 -0
- data/lib/string_metric/version.rb +1 -1
- data/spec/fixtures/dictionary_input.txt +10 -0
- data/string_metric.gemspec +3 -0
- metadata +26 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
|
4
|
+
data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
|
7
|
+
data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/Rakefile
CHANGED
@@ -1 +1,27 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
5
|
+
|
6
|
+
if RUBY_ENGINE == "ruby"
|
7
|
+
require "rake/extensiontask"
|
8
|
+
|
9
|
+
gem_name = "string_metric"
|
10
|
+
dir = "#{gem_name}/levenshtein"
|
11
|
+
spec = Gem::Specification.load("#{gem_name}.gemspec")
|
12
|
+
|
13
|
+
Rake::ExtensionTask.new do |ext|
|
14
|
+
ext.name = "trie_radix_tree_ext"
|
15
|
+
ext.ext_dir = "ext/#{dir}"
|
16
|
+
ext.lib_dir = "lib/#{dir}"
|
17
|
+
ext.gem_spec = spec
|
18
|
+
end
|
19
|
+
|
20
|
+
task :default do
|
21
|
+
Rake::Task["compile"].invoke
|
22
|
+
Rake::Task["spec"].invoke
|
23
|
+
end
|
24
|
+
else
|
25
|
+
task default: :spec
|
26
|
+
end
|
27
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'string_metric'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'pp'
|
4
|
+
|
5
|
+
Benchmark.bmbm(7) do |x|
|
6
|
+
options = {}
|
7
|
+
max_distance = 2
|
8
|
+
|
9
|
+
dict = []
|
10
|
+
trie = StringMetric::Levenshtein::TrieNode.new
|
11
|
+
File.open('/usr/share/dict/words', 'r').each_line do |line|
|
12
|
+
word = line.chomp
|
13
|
+
trie.insert(word)
|
14
|
+
dict << word
|
15
|
+
end
|
16
|
+
|
17
|
+
randomWords = []
|
18
|
+
File.open('spec/fixtures/dictionary_input.txt', 'r').each_line do |word|
|
19
|
+
randomWords << word.chomp
|
20
|
+
end
|
21
|
+
|
22
|
+
matrixResults = []
|
23
|
+
x.report("two_matrix_rows_v2 implementation") do
|
24
|
+
randomWords.each do |from|
|
25
|
+
dict.each do |to|
|
26
|
+
matrixResults << to if StringMetric::Levenshtein::IterativeWithTwoMatrixRowsOptimized.distance(from, to, options) <= max_distance
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
trieResults = []
|
32
|
+
x.report("trie_radix_tree implementation") do
|
33
|
+
randomWords.each do |from|
|
34
|
+
trieResults << StringMetric::Levenshtein::TrieRadixTree.distance(from, trie, max_distance: max_distance)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
trieResultsExt = []
|
39
|
+
x.report("trie_radix_tree_ext implementation") do
|
40
|
+
randomWords.each do |from|
|
41
|
+
trieResultsExt << StringMetric::Levenshtein::TrieRadixTreeExt.distance(from, trie, max_distance: max_distance)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
void Init_trie_radix_tree_ext(void);
|
4
|
+
void search_recursive(VALUE node, int letter, int *current_row, VALUE results);
|
5
|
+
VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
|
6
|
+
VALUE _max_distance, VALUE _insertion_cost,
|
7
|
+
VALUE _deletion_cost, VALUE _substitution_cost);
|
8
|
+
|
9
|
+
#define MIN2(a, b) (((a) < (b)) ? (a) : (b))
|
10
|
+
#define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))
|
11
|
+
|
12
|
+
#define MALLOC_W(ptr, size) do { \
|
13
|
+
ptr = malloc(size); \
|
14
|
+
if (!ptr) \
|
15
|
+
rb_memerror(); \
|
16
|
+
} while (0)
|
17
|
+
|
18
|
+
// Declare the variables that don't change as global, so that we don't have to pass them around
|
19
|
+
// in our recursive function and increase the stack frame unnecessarily
|
20
|
+
int *from, from_len;
|
21
|
+
int max_distance, insertion_cost, deletion_cost, substitution_cost;
|
22
|
+
|
23
|
+
void Init_trie_radix_tree_ext(void) {
|
24
|
+
|
25
|
+
VALUE StringMetric = rb_define_module("StringMetric");
|
26
|
+
VALUE Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
|
27
|
+
VALUE TrieRadixTreeExt = rb_define_class_under(Levenshtein, "TrieRadixTreeExt", rb_cObject);
|
28
|
+
|
29
|
+
rb_define_singleton_method(TrieRadixTreeExt, "trie_ext", search_ext, 7);
|
30
|
+
}
|
31
|
+
|
32
|
+
VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
|
33
|
+
VALUE _max_distance, VALUE _insertion_cost,
|
34
|
+
VALUE _deletion_cost, VALUE _substitution_cost)
|
35
|
+
{
|
36
|
+
int i, *current_row;
|
37
|
+
VALUE results, letter, node, children, children_keys;
|
38
|
+
|
39
|
+
// Convert from ruby types
|
40
|
+
max_distance = FIX2INT(_max_distance);
|
41
|
+
insertion_cost = FIX2INT(_insertion_cost);
|
42
|
+
deletion_cost = FIX2INT(_deletion_cost);
|
43
|
+
substitution_cost = FIX2INT(_substitution_cost);
|
44
|
+
from_len = FIX2INT(_from_len);
|
45
|
+
|
46
|
+
// The '_from' word is passed as an array of codepoints. Allocate memory and populate the C array
|
47
|
+
MALLOC_W(from, from_len * sizeof(int));
|
48
|
+
for (i = 0; i < from_len; i++)
|
49
|
+
from[i] = FIX2INT(rb_ary_entry(_from, i));
|
50
|
+
|
51
|
+
// Create a hash to store the results and return it to ruby when we are done
|
52
|
+
results = rb_hash_new();
|
53
|
+
|
54
|
+
MALLOC_W(current_row, (from_len + 1) * sizeof(int));
|
55
|
+
for (i = 0; i <= from_len; i++)
|
56
|
+
current_row[i] = i;
|
57
|
+
|
58
|
+
// Extract the hash from trie_node object and get an array of keys
|
59
|
+
children = rb_funcall(trie_node, rb_intern("children"), 0);
|
60
|
+
children_keys = rb_funcall(children, rb_intern("keys"), 0);
|
61
|
+
|
62
|
+
for (i = 0; i < RARRAY_LEN(children_keys); i++) {
|
63
|
+
letter = rb_ary_entry(children_keys, i);
|
64
|
+
node = rb_hash_aref(children, letter);
|
65
|
+
search_recursive(node, FIX2INT(letter), current_row, results);
|
66
|
+
}
|
67
|
+
free(from);
|
68
|
+
free(current_row);
|
69
|
+
return results;
|
70
|
+
}
|
71
|
+
|
72
|
+
void search_recursive(VALUE node, int letter, int *previous_row, VALUE results) {
|
73
|
+
|
74
|
+
int i, min, columns, distance, *current_row;
|
75
|
+
int cost, insert_cost, delete_cost, replace_cost;
|
76
|
+
VALUE word, codepoint, children, children_keys;
|
77
|
+
|
78
|
+
columns = from_len + 1;
|
79
|
+
MALLOC_W(current_row, columns * sizeof(int));
|
80
|
+
current_row[0] = previous_row[0] + 1;
|
81
|
+
|
82
|
+
for (i = 1; i < columns; i++) {
|
83
|
+
cost = (from[i - 1] == letter) ? 0 : substitution_cost;
|
84
|
+
insert_cost = current_row[i - 1] + insertion_cost;
|
85
|
+
delete_cost = previous_row[i] + deletion_cost;
|
86
|
+
replace_cost = previous_row[i - 1] + cost;
|
87
|
+
|
88
|
+
current_row[i] = MIN3(insert_cost, delete_cost, replace_cost);
|
89
|
+
}
|
90
|
+
distance = current_row[columns - 1];
|
91
|
+
word = rb_funcall(node, rb_intern("word"), 0);
|
92
|
+
|
93
|
+
if (distance <= max_distance && word != Qnil)
|
94
|
+
rb_hash_aset(results, word, INT2FIX(distance));
|
95
|
+
|
96
|
+
min = current_row[0];
|
97
|
+
for (i = 1; i < columns; i++)
|
98
|
+
if (current_row[i] < min)
|
99
|
+
min = current_row[i];
|
100
|
+
|
101
|
+
if (min <= max_distance) {
|
102
|
+
children = rb_funcall(node, rb_intern("children"), 0);
|
103
|
+
children_keys = rb_funcall(children, rb_intern("keys"), 0);
|
104
|
+
|
105
|
+
for (i = 0; i < RARRAY_LEN(children_keys); i++) {
|
106
|
+
codepoint = rb_ary_entry(children_keys, i);
|
107
|
+
node = rb_hash_aref(children, codepoint);
|
108
|
+
search_recursive(node, FIX2INT(codepoint), current_row, results);
|
109
|
+
}
|
110
|
+
}
|
111
|
+
free(current_row);
|
112
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
|
4
|
+
module Levenshtein
|
5
|
+
class TrieNode
|
6
|
+
attr_accessor :word, :children
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@word = nil
|
10
|
+
@children = {}
|
11
|
+
end
|
12
|
+
|
13
|
+
def insert(word)
|
14
|
+
node = self
|
15
|
+
word.codepoints.each do |char|
|
16
|
+
node.children[char] = TrieNode.new unless node.children.key?(char)
|
17
|
+
node = node.children[char]
|
18
|
+
end
|
19
|
+
node.word = word
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module StringMetric
|
4
|
+
module Levenshtein
|
5
|
+
class TrieRadixTree
|
6
|
+
def self.distance(from, node, options = {})
|
7
|
+
|
8
|
+
@max_distance = options[:max_distance] || 0
|
9
|
+
@insertion_cost = options[:insertion_cost] || 1
|
10
|
+
@deletion_cost = options[:deletion_cost] || 1
|
11
|
+
@substitution_cost = options[:substitution_cost] || 1
|
12
|
+
|
13
|
+
results = []
|
14
|
+
word = from.codepoints
|
15
|
+
currentRow = (0..word.length).to_a
|
16
|
+
|
17
|
+
node.children.keys.each do |letter|
|
18
|
+
searchRecursive(node.children[letter], letter, word, currentRow, results)
|
19
|
+
end
|
20
|
+
|
21
|
+
results
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.searchRecursive(node, letter, word, previousRow, results)
|
25
|
+
columns = word.length + 1
|
26
|
+
currentRow = [previousRow[0] + 1]
|
27
|
+
|
28
|
+
(1...columns).each do |column|
|
29
|
+
insertCost = currentRow[column - 1] + @insertion_cost
|
30
|
+
deleteCost = previousRow[column] + @deletion_cost
|
31
|
+
cost = (word[column - 1] == letter) ? 0 : @substitution_cost
|
32
|
+
replaceCost = previousRow[column - 1] + cost
|
33
|
+
|
34
|
+
currentRow << [insertCost, deleteCost, replaceCost].min
|
35
|
+
end
|
36
|
+
|
37
|
+
if currentRow.last <= @max_distance && !node.word.nil?
|
38
|
+
results << [node.word, currentRow.last]
|
39
|
+
end
|
40
|
+
|
41
|
+
if currentRow.min <= @max_distance
|
42
|
+
node.children.keys.each do |letter|
|
43
|
+
searchRecursive(node.children[letter], letter, word, currentRow, results)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require_relative 'trie_radix_tree_ext.so'
|
4
|
+
|
5
|
+
module StringMetric
|
6
|
+
module Levenshtein
|
7
|
+
class TrieRadixTreeExt
|
8
|
+
def self.distance(from, trieNode, options = {})
|
9
|
+
|
10
|
+
max_distance = options[:max_distance] || 0
|
11
|
+
insertion_cost = options[:insertion_cost] || 1
|
12
|
+
deletion_cost = options[:deletion_cost] || 1
|
13
|
+
substitution_cost = options[:substitution_cost] || 1
|
14
|
+
|
15
|
+
trie_ext(from.codepoints, from.length, trieNode, max_distance,
|
16
|
+
insertion_cost, deletion_cost, substitution_cost)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -5,6 +5,9 @@ require_relative "levenshtein/iterative_with_two_matrix_rows"
|
|
5
5
|
require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
|
6
6
|
require_relative "levenshtein/iterative_with_full_matrix"
|
7
7
|
require_relative "levenshtein/recursive"
|
8
|
+
require_relative "levenshtein/trie_node"
|
9
|
+
require_relative "levenshtein/trie_radix_tree"
|
10
|
+
require_relative "levenshtein/trie_radix_tree_ext"
|
8
11
|
|
9
12
|
module StringMetric
|
10
13
|
# Levenshtein Distance implementation
|
data/string_metric.gemspec
CHANGED
@@ -24,6 +24,9 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency "text", "~> 1.2.3"
|
25
25
|
|
26
26
|
if RUBY_ENGINE == "ruby"
|
27
|
+
spec.add_development_dependency "rake-compiler", "~> 0.9.2"
|
28
|
+
spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
|
29
|
+
|
27
30
|
if RUBY_VERSION > "1.9.3"
|
28
31
|
spec.add_development_dependency "pry-byebug", "~> 1.2.1"
|
29
32
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_metric
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Giorgos Tsiftsis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 1.2.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake-compiler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.9.2
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.9.2
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: pry-byebug
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,7 +98,8 @@ description: A simple library with String Metric algorithms
|
|
84
98
|
email:
|
85
99
|
- giorgos.tsiftsis@skroutz.gr
|
86
100
|
executables: []
|
87
|
-
extensions:
|
101
|
+
extensions:
|
102
|
+
- ext/string_metric/levenshtein/extconf.rb
|
88
103
|
extra_rdoc_files: []
|
89
104
|
files:
|
90
105
|
- ".gitignore"
|
@@ -94,7 +109,10 @@ files:
|
|
94
109
|
- LICENSE.txt
|
95
110
|
- README.md
|
96
111
|
- Rakefile
|
112
|
+
- benchmarks/dictionary.rb
|
97
113
|
- benchmarks/levenshtein.rb
|
114
|
+
- ext/string_metric/levenshtein/extconf.rb
|
115
|
+
- ext/string_metric/levenshtein/trie_radix_tree_ext.c
|
98
116
|
- lib/string_metric.rb
|
99
117
|
- lib/string_metric/levenshtein.rb
|
100
118
|
- lib/string_metric/levenshtein/experiment.rb
|
@@ -102,7 +120,11 @@ files:
|
|
102
120
|
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
|
103
121
|
- lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
|
104
122
|
- lib/string_metric/levenshtein/recursive.rb
|
123
|
+
- lib/string_metric/levenshtein/trie_node.rb
|
124
|
+
- lib/string_metric/levenshtein/trie_radix_tree.rb
|
125
|
+
- lib/string_metric/levenshtein/trie_radix_tree_ext.rb
|
105
126
|
- lib/string_metric/version.rb
|
127
|
+
- spec/fixtures/dictionary_input.txt
|
106
128
|
- spec/fixtures/levenshtein.csv
|
107
129
|
- spec/lib/levenshtein/experiment_spec.rb
|
108
130
|
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|
@@ -138,6 +160,7 @@ signing_key:
|
|
138
160
|
specification_version: 4
|
139
161
|
summary: A simple library with String Metric algorithms
|
140
162
|
test_files:
|
163
|
+
- spec/fixtures/dictionary_input.txt
|
141
164
|
- spec/fixtures/levenshtein.csv
|
142
165
|
- spec/lib/levenshtein/experiment_spec.rb
|
143
166
|
- spec/lib/levenshtein/iterative_with_full_matric_spec.rb
|