string_metric 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 08ed461c4edc7dfd5e32fb34e93b13b4442c5167
4
- data.tar.gz: 69d6cd7273238442f184d6a63fec949ffcc6f6db
3
+ metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
4
+ data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
5
5
  SHA512:
6
- metadata.gz: f5d7e8664514c5f31075755c3b09b2a7c4a0cd963601aefd26f1b928426dea2051a12b8abd039dddb0d8986fe25c6ce860d8c4a867757a4bf5b5a1883c6ace1e
7
- data.tar.gz: d9b631aad62da262d7a9afec9c7c1295e8b238cbfe4ef432354e0a2c42d9bf3b777758cf00abe2b14b702069eeb335df505ba9db37faa165335d748f4d748f2e
6
+ metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
7
+ data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
data/.gitignore CHANGED
@@ -1,5 +1,7 @@
1
1
  *.gem
2
2
  *.rbc
3
+ *.o
4
+ *.so
3
5
  .bundle
4
6
  .config
5
7
  .ruby-version
data/.travis.yml CHANGED
@@ -4,5 +4,3 @@ rvm:
4
4
  - "2.0.0"
5
5
  - "2.1.0"
6
6
  - "jruby-19mode"
7
-
8
- script: bundle exec rspec spec
data/Rakefile CHANGED
@@ -1 +1,27 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# `rake spec` runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

if RUBY_ENGINE == "ruby"
  # On MRI the C extension has to be compiled before the specs can run.
  require "rake/extensiontask"

  gem_name = "string_metric"
  extension_subdir = "#{gem_name}/levenshtein"
  gemspec = Gem::Specification.load("#{gem_name}.gemspec")

  Rake::ExtensionTask.new do |extension|
    extension.name = "trie_radix_tree_ext"
    extension.ext_dir = "ext/#{extension_subdir}"
    extension.lib_dir = "lib/#{extension_subdir}"
    extension.gem_spec = gemspec
  end

  # Default task: compile the extension first, then run the specs.
  task :default do
    %w[compile spec].each { |task_name| Rake::Task[task_name].invoke }
  end
else
  # Other engines skip the native build and only run the specs.
  task default: :spec
end
27
+
@@ -0,0 +1,44 @@
1
+ require 'string_metric'
2
+ require 'benchmark'
3
+ require 'pp'
4
+
5
# Compares three ways of finding every dictionary word within `max_distance`
# edits of a handful of input words:
#   * brute force over the whole dictionary with the two-matrix-rows algorithm
#   * the pure-Ruby trie search
#   * the C-extension trie search
Benchmark.bmbm(7) do |x|
  options = {}
  max_distance = 2

  # File.foreach closes the file once it has been read; the previous
  # File.open(...).each_line calls never closed their handles.
  dict = File.foreach('/usr/share/dict/words').map(&:chomp)
  trie = StringMetric::Levenshtein::TrieNode.new
  dict.each { |word| trie.insert(word) }

  random_words = File.foreach('spec/fixtures/dictionary_input.txt').map(&:chomp)

  # bmbm runs every report twice (rehearsal + measured run), so each report
  # clears its result array first; otherwise the results would be duplicated.
  matrix_results = []
  x.report("two_matrix_rows_v2 implementation") do
    matrix_results.clear
    random_words.each do |from|
      dict.each do |to|
        distance = StringMetric::Levenshtein::IterativeWithTwoMatrixRowsOptimized.distance(from, to, options)
        matrix_results << to if distance <= max_distance
      end
    end
  end

  trie_results = []
  x.report("trie_radix_tree implementation") do
    trie_results.clear
    random_words.each do |from|
      trie_results << StringMetric::Levenshtein::TrieRadixTree.distance(from, trie, max_distance: max_distance)
    end
  end

  trie_results_ext = []
  x.report("trie_radix_tree_ext implementation") do
    trie_results_ext.clear
    random_words.each do |from|
      trie_results_ext << StringMetric::Levenshtein::TrieRadixTreeExt.distance(from, trie, max_distance: max_distance)
    end
  end
end
@@ -0,0 +1,3 @@
1
require "mkmf"

# Generate the Makefile for the native trie search. The target path decides
# where the compiled library is installed relative to lib/.
extension_target = "string_metric/levenshtein/trie_radix_tree_ext"
create_makefile(extension_target)
@@ -0,0 +1,112 @@
1
+ #include <ruby.h>
2
+
3
/* Forward declarations of the functions defined in this file. */
void Init_trie_radix_tree_ext(void);
void search_recursive(VALUE node, int letter, int *current_row, VALUE results);
VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
                 VALUE _max_distance, VALUE _insertion_cost,
                 VALUE _deletion_cost, VALUE _substitution_cost);

/*
 * Smallest of two / three values. These are macros, so every argument is
 * evaluated more than once: only pass expressions without side effects.
 */
#define MIN2(a, b) (((a) < (b)) ? (a) : (b))
#define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))

/*
 * Assign malloc(size) to ptr and call rb_memerror() when malloc returns NULL.
 * Wrapped in do { } while (0) so it can be used like a single statement.
 */
#define MALLOC_W(ptr, size) do { \
    ptr = malloc(size); \
    if (!ptr) \
        rb_memerror(); \
} while (0)

// Declare the variables that don't change as global, so that we don't have to pass them around
// in our recursive function and increase the stack frame unnecessarily
// (search_ext fills them in on every call; search_recursive only reads them)
int *from, from_len;
int max_distance, insertion_cost, deletion_cost, substitution_cost;
22
+
23
+ void Init_trie_radix_tree_ext(void) {
24
+
25
+ VALUE StringMetric = rb_define_module("StringMetric");
26
+ VALUE Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
27
+ VALUE TrieRadixTreeExt = rb_define_class_under(Levenshtein, "TrieRadixTreeExt", rb_cObject);
28
+
29
+ rb_define_singleton_method(TrieRadixTreeExt, "trie_ext", search_ext, 7);
30
+ }
31
+
32
+ VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
33
+ VALUE _max_distance, VALUE _insertion_cost,
34
+ VALUE _deletion_cost, VALUE _substitution_cost)
35
+ {
36
+ int i, *current_row;
37
+ VALUE results, letter, node, children, children_keys;
38
+
39
+ // Convert from ruby types
40
+ max_distance = FIX2INT(_max_distance);
41
+ insertion_cost = FIX2INT(_insertion_cost);
42
+ deletion_cost = FIX2INT(_deletion_cost);
43
+ substitution_cost = FIX2INT(_substitution_cost);
44
+ from_len = FIX2INT(_from_len);
45
+
46
+ // The '_from' word is passed as an array of codepoints. Allocate memory and populate the C array
47
+ MALLOC_W(from, from_len * sizeof(int));
48
+ for (i = 0; i < from_len; i++)
49
+ from[i] = FIX2INT(rb_ary_entry(_from, i));
50
+
51
+ // Create a hash to store the results and return it to ruby when we are done
52
+ results = rb_hash_new();
53
+
54
+ MALLOC_W(current_row, (from_len + 1) * sizeof(int));
55
+ for (i = 0; i <= from_len; i++)
56
+ current_row[i] = i;
57
+
58
+ // Extract the hash from trie_node object and get an array of keys
59
+ children = rb_funcall(trie_node, rb_intern("children"), 0);
60
+ children_keys = rb_funcall(children, rb_intern("keys"), 0);
61
+
62
+ for (i = 0; i < RARRAY_LEN(children_keys); i++) {
63
+ letter = rb_ary_entry(children_keys, i);
64
+ node = rb_hash_aref(children, letter);
65
+ search_recursive(node, FIX2INT(letter), current_row, results);
66
+ }
67
+ free(from);
68
+ free(current_row);
69
+ return results;
70
+ }
71
+
72
+ void search_recursive(VALUE node, int letter, int *previous_row, VALUE results) {
73
+
74
+ int i, min, columns, distance, *current_row;
75
+ int cost, insert_cost, delete_cost, replace_cost;
76
+ VALUE word, codepoint, children, children_keys;
77
+
78
+ columns = from_len + 1;
79
+ MALLOC_W(current_row, columns * sizeof(int));
80
+ current_row[0] = previous_row[0] + 1;
81
+
82
+ for (i = 1; i < columns; i++) {
83
+ cost = (from[i - 1] == letter) ? 0 : substitution_cost;
84
+ insert_cost = current_row[i - 1] + insertion_cost;
85
+ delete_cost = previous_row[i] + deletion_cost;
86
+ replace_cost = previous_row[i - 1] + cost;
87
+
88
+ current_row[i] = MIN3(insert_cost, delete_cost, replace_cost);
89
+ }
90
+ distance = current_row[columns - 1];
91
+ word = rb_funcall(node, rb_intern("word"), 0);
92
+
93
+ if (distance <= max_distance && word != Qnil)
94
+ rb_hash_aset(results, word, INT2FIX(distance));
95
+
96
+ min = current_row[0];
97
+ for (i = 1; i < columns; i++)
98
+ if (current_row[i] < min)
99
+ min = current_row[i];
100
+
101
+ if (min <= max_distance) {
102
+ children = rb_funcall(node, rb_intern("children"), 0);
103
+ children_keys = rb_funcall(children, rb_intern("keys"), 0);
104
+
105
+ for (i = 0; i < RARRAY_LEN(children_keys); i++) {
106
+ codepoint = rb_ary_entry(children_keys, i);
107
+ node = rb_hash_aref(children, codepoint);
108
+ search_recursive(node, FIX2INT(codepoint), current_row, results);
109
+ }
110
+ }
111
+ free(current_row);
112
+ }
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+
3
module StringMetric
  module Levenshtein
    # A node of a trie keyed by Unicode codepoints.
    #
    # `children` maps a codepoint to the next node; `word` holds the complete
    # string on the node where an inserted word ends and is nil elsewhere.
    class TrieNode
      attr_accessor :word, :children

      def initialize
        @children = {}
        @word = nil
      end

      # Adds +word+ to the trie rooted at this node, creating any missing
      # nodes along its codepoints, and marks the final node with the word.
      #
      # Returns the inserted word.
      def insert(word)
        leaf = word.codepoints.inject(self) do |current, codepoint|
          current.children.fetch(codepoint) do
            current.children[codepoint] = TrieNode.new
          end
        end
        leaf.word = word
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # coding: utf-8
2
+
3
module StringMetric
  module Levenshtein
    # Pure-Ruby search of a trie for words close to a given string.
    class TrieRadixTree
      # Collects every word stored under +node+ whose distance to +from+ is at
      # most options[:max_distance] (default 0).
      #
      # from    - the String to look up
      # node    - trie root; nodes respond to `children` (Hash keyed by
      #           codepoint) and `word`
      # options - :max_distance, :insertion_cost, :deletion_cost,
      #           :substitution_cost (costs default to 1)
      #
      # Returns an Array of [word, distance] pairs in trie traversal order.
      # The settings are kept in class-level instance variables for the
      # duration of the search.
      def self.distance(from, node, options = {})
        @max_distance      = options[:max_distance] || 0
        @insertion_cost    = options[:insertion_cost] || 1
        @deletion_cost     = options[:deletion_cost] || 1
        @substitution_cost = options[:substitution_cost] || 1

        codepoints = from.codepoints
        first_row = (0..codepoints.length).to_a
        matches = []

        node.children.each do |letter, child|
          searchRecursive(child, letter, codepoints, first_row, matches)
        end

        matches
      end

      # Computes the matrix row for +letter+ from +previousRow+, records the
      # node's word when the last cell is within range, and descends into the
      # children while any cell of the row is still within range.
      #
      # NOTE(review): the first cell grows by 1 per level regardless of the
      # configured costs. Confirm this is intended for non-unit costs.
      def self.searchRecursive(node, letter, word, previousRow, results)
        row = [previousRow.first + 1]

        word.each_with_index do |codepoint, index|
          substitution = codepoint == letter ? 0 : @substitution_cost
          candidates = [
            row[index] + @insertion_cost,
            previousRow[index + 1] + @deletion_cost,
            previousRow[index] + substitution
          ]
          row << candidates.min
        end

        within_range = row.last <= @max_distance
        results << [node.word, row.last] if within_range && !node.word.nil?

        return if row.min > @max_distance

        node.children.keys.each do |next_letter|
          searchRecursive(node.children[next_letter], next_letter, word, row, results)
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # coding: utf-8
2
+
3
# The native library is only present where the C extension has been compiled.
# Loading it unconditionally makes this file (and everything that requires it)
# fail with LoadError when it is missing, e.g. on JRuby. The failure is
# deferred to TrieRadixTreeExt.distance, which reports it explicitly.
begin
  require_relative 'trie_radix_tree_ext.so'
rescue LoadError
  # Reported by TrieRadixTreeExt.distance when the search is actually used.
end

module StringMetric
  module Levenshtein
    # Ruby front end of the C implementation of the trie search; the native
    # library defines the `trie_ext` singleton method on this class.
    class TrieRadixTreeExt
      # Finds the words stored under +trieNode+ that are within
      # options[:max_distance] (default 0) of +from+.
      #
      # from     - the String to look up
      # trieNode - trie root handed to the native search
      # options  - :max_distance, :insertion_cost, :deletion_cost,
      #            :substitution_cost (costs default to 1)
      #
      # Returns whatever the native `trie_ext` returns.
      # Raises NotImplementedError when the native library is not loaded.
      def self.distance(from, trieNode, options = {})
        unless respond_to?(:trie_ext)
          raise NotImplementedError,
                "StringMetric::Levenshtein::TrieRadixTreeExt requires the compiled " \
                "trie_radix_tree_ext extension, which could not be loaded"
        end

        max_distance = options[:max_distance] || 0
        insertion_cost = options[:insertion_cost] || 1
        deletion_cost = options[:deletion_cost] || 1
        substitution_cost = options[:substitution_cost] || 1

        # Take the length from the array that is actually passed, so that the
        # two arguments can never disagree.
        codepoints = from.codepoints
        trie_ext(codepoints, codepoints.length, trieNode, max_distance,
                 insertion_cost, deletion_cost, substitution_cost)
      end
    end
  end
end
@@ -5,6 +5,9 @@ require_relative "levenshtein/iterative_with_two_matrix_rows"
5
5
  require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
6
6
  require_relative "levenshtein/iterative_with_full_matrix"
7
7
  require_relative "levenshtein/recursive"
8
+ require_relative "levenshtein/trie_node"
9
+ require_relative "levenshtein/trie_radix_tree"
10
+ require_relative "levenshtein/trie_radix_tree_ext"
8
11
 
9
12
  module StringMetric
10
13
  # Levenshtein Distance implementation
@@ -1,3 +1,3 @@
1
1
  module StringMetric
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -0,0 +1,10 @@
1
+ aliency
2
+ prostatauxe
3
+ Herbartian
4
+ womanize
5
+ unviolent
6
+ disguised
7
+ preanimism
8
+ birdling
9
+ geognosy
10
+ daut
@@ -24,6 +24,9 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "text", "~> 1.2.3"
25
25
 
26
26
  if RUBY_ENGINE == "ruby"
27
+ spec.add_development_dependency "rake-compiler", "~> 0.9.2"
28
+ spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
29
+
27
30
  if RUBY_VERSION > "1.9.3"
28
31
  spec.add_development_dependency "pry-byebug", "~> 1.2.1"
29
32
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_metric
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giorgos Tsiftsis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-13 00:00:00.000000000 Z
11
+ date: 2015-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.2.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake-compiler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.9.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.9.2
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: pry-byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -84,7 +98,8 @@ description: A simple library with String Metric algorithms
84
98
  email:
85
99
  - giorgos.tsiftsis@skroutz.gr
86
100
  executables: []
87
- extensions: []
101
+ extensions:
102
+ - ext/string_metric/levenshtein/extconf.rb
88
103
  extra_rdoc_files: []
89
104
  files:
90
105
  - ".gitignore"
@@ -94,7 +109,10 @@ files:
94
109
  - LICENSE.txt
95
110
  - README.md
96
111
  - Rakefile
112
+ - benchmarks/dictionary.rb
97
113
  - benchmarks/levenshtein.rb
114
+ - ext/string_metric/levenshtein/extconf.rb
115
+ - ext/string_metric/levenshtein/trie_radix_tree_ext.c
98
116
  - lib/string_metric.rb
99
117
  - lib/string_metric/levenshtein.rb
100
118
  - lib/string_metric/levenshtein/experiment.rb
@@ -102,7 +120,11 @@ files:
102
120
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
103
121
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
104
122
  - lib/string_metric/levenshtein/recursive.rb
123
+ - lib/string_metric/levenshtein/trie_node.rb
124
+ - lib/string_metric/levenshtein/trie_radix_tree.rb
125
+ - lib/string_metric/levenshtein/trie_radix_tree_ext.rb
105
126
  - lib/string_metric/version.rb
127
+ - spec/fixtures/dictionary_input.txt
106
128
  - spec/fixtures/levenshtein.csv
107
129
  - spec/lib/levenshtein/experiment_spec.rb
108
130
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
@@ -138,6 +160,7 @@ signing_key:
138
160
  specification_version: 4
139
161
  summary: A simple library with String Metric algorithms
140
162
  test_files:
163
+ - spec/fixtures/dictionary_input.txt
141
164
  - spec/fixtures/levenshtein.csv
142
165
  - spec/lib/levenshtein/experiment_spec.rb
143
166
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb