string_metric 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 08ed461c4edc7dfd5e32fb34e93b13b4442c5167
4
- data.tar.gz: 69d6cd7273238442f184d6a63fec949ffcc6f6db
3
+ metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
4
+ data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
5
5
  SHA512:
6
- metadata.gz: f5d7e8664514c5f31075755c3b09b2a7c4a0cd963601aefd26f1b928426dea2051a12b8abd039dddb0d8986fe25c6ce860d8c4a867757a4bf5b5a1883c6ace1e
7
- data.tar.gz: d9b631aad62da262d7a9afec9c7c1295e8b238cbfe4ef432354e0a2c42d9bf3b777758cf00abe2b14b702069eeb335df505ba9db37faa165335d748f4d748f2e
6
+ metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
7
+ data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
data/.gitignore CHANGED
@@ -1,5 +1,7 @@
1
1
  *.gem
2
2
  *.rbc
3
+ *.o
4
+ *.so
3
5
  .bundle
4
6
  .config
5
7
  .ruby-version
data/.travis.yml CHANGED
@@ -4,5 +4,3 @@ rvm:
4
4
  - "2.0.0"
5
5
  - "2.1.0"
6
6
  - "jruby-19mode"
7
-
8
- script: bundle exec rspec spec
data/Rakefile CHANGED
@@ -1 +1,27 @@
1
1
  require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ if RUBY_ENGINE == "ruby"
7
+ require "rake/extensiontask"
8
+
9
+ gem_name = "string_metric"
10
+ dir = "#{gem_name}/levenshtein"
11
+ spec = Gem::Specification.load("#{gem_name}.gemspec")
12
+
13
+ Rake::ExtensionTask.new do |ext|
14
+ ext.name = "trie_radix_tree_ext"
15
+ ext.ext_dir = "ext/#{dir}"
16
+ ext.lib_dir = "lib/#{dir}"
17
+ ext.gem_spec = spec
18
+ end
19
+
20
+ task :default do
21
+ Rake::Task["compile"].invoke
22
+ Rake::Task["spec"].invoke
23
+ end
24
+ else
25
+ task default: :spec
26
+ end
27
+
@@ -0,0 +1,44 @@
1
+ require 'string_metric'
2
+ require 'benchmark'
3
+ require 'pp'
4
+
5
+ Benchmark.bmbm(7) do |x|
6
+ options = {}
7
+ max_distance = 2
8
+
9
+ dict = []
10
+ trie = StringMetric::Levenshtein::TrieNode.new
11
+ File.open('/usr/share/dict/words', 'r').each_line do |line|
12
+ word = line.chomp
13
+ trie.insert(word)
14
+ dict << word
15
+ end
16
+
17
+ randomWords = []
18
+ File.open('spec/fixtures/dictionary_input.txt', 'r').each_line do |word|
19
+ randomWords << word.chomp
20
+ end
21
+
22
+ matrixResults = []
23
+ x.report("two_matrix_rows_v2 implementation") do
24
+ randomWords.each do |from|
25
+ dict.each do |to|
26
+ matrixResults << to if StringMetric::Levenshtein::IterativeWithTwoMatrixRowsOptimized.distance(from, to, options) <= max_distance
27
+ end
28
+ end
29
+ end
30
+
31
+ trieResults = []
32
+ x.report("trie_radix_tree implementation") do
33
+ randomWords.each do |from|
34
+ trieResults << StringMetric::Levenshtein::TrieRadixTree.distance(from, trie, max_distance: max_distance)
35
+ end
36
+ end
37
+
38
+ trieResultsExt = []
39
+ x.report("trie_radix_tree_ext implementation") do
40
+ randomWords.each do |from|
41
+ trieResultsExt << StringMetric::Levenshtein::TrieRadixTreeExt.distance(from, trie, max_distance: max_distance)
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('string_metric/levenshtein/trie_radix_tree_ext')
@@ -0,0 +1,112 @@
1
+ #include <ruby.h>
2
+
3
+ void Init_trie_radix_tree_ext(void);
4
+ void search_recursive(VALUE node, int letter, int *current_row, VALUE results);
5
+ VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
6
+ VALUE _max_distance, VALUE _insertion_cost,
7
+ VALUE _deletion_cost, VALUE _substitution_cost);
8
+
9
+ #define MIN2(a, b) (((a) < (b)) ? (a) : (b))
10
+ #define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))
11
+
12
+ #define MALLOC_W(ptr, size) do { \
13
+ ptr = malloc(size); \
14
+ if (!ptr) \
15
+ rb_memerror(); \
16
+ } while (0)
17
+
18
+ // Keep the values that stay constant during a search in globals, so that we don't have to pass
19
+ // them through every recursive call and needlessly enlarge each stack frame
20
+ int *from, from_len;
21
+ int max_distance, insertion_cost, deletion_cost, substitution_cost;
22
+
23
+ void Init_trie_radix_tree_ext(void) {
24
+
25
+ VALUE StringMetric = rb_define_module("StringMetric");
26
+ VALUE Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
27
+ VALUE TrieRadixTreeExt = rb_define_class_under(Levenshtein, "TrieRadixTreeExt", rb_cObject);
28
+
29
+ rb_define_singleton_method(TrieRadixTreeExt, "trie_ext", search_ext, 7);
30
+ }
31
+
32
+ VALUE search_ext(VALUE self, VALUE _from, VALUE _from_len, VALUE trie_node,
33
+ VALUE _max_distance, VALUE _insertion_cost,
34
+ VALUE _deletion_cost, VALUE _substitution_cost)
35
+ {
36
+ int i, *current_row;
37
+ VALUE results, letter, node, children, children_keys;
38
+
39
+ // Convert from ruby types
40
+ max_distance = FIX2INT(_max_distance);
41
+ insertion_cost = FIX2INT(_insertion_cost);
42
+ deletion_cost = FIX2INT(_deletion_cost);
43
+ substitution_cost = FIX2INT(_substitution_cost);
44
+ from_len = FIX2INT(_from_len);
45
+
46
+ // The '_from' word is passed as an array of codepoints. Allocate memory and populate the C array
47
+ MALLOC_W(from, from_len * sizeof(int));
48
+ for (i = 0; i < from_len; i++)
49
+ from[i] = FIX2INT(rb_ary_entry(_from, i));
50
+
51
+ // Create a hash to store the results and return it to ruby when we are done
52
+ results = rb_hash_new();
53
+
54
+ MALLOC_W(current_row, (from_len + 1) * sizeof(int));
55
+ for (i = 0; i <= from_len; i++)
56
+ current_row[i] = i;
57
+
58
+ // Extract the hash from trie_node object and get an array of keys
59
+ children = rb_funcall(trie_node, rb_intern("children"), 0);
60
+ children_keys = rb_funcall(children, rb_intern("keys"), 0);
61
+
62
+ for (i = 0; i < RARRAY_LEN(children_keys); i++) {
63
+ letter = rb_ary_entry(children_keys, i);
64
+ node = rb_hash_aref(children, letter);
65
+ search_recursive(node, FIX2INT(letter), current_row, results);
66
+ }
67
+ free(from);
68
+ free(current_row);
69
+ return results;
70
+ }
71
+
72
+ void search_recursive(VALUE node, int letter, int *previous_row, VALUE results) {
73
+
74
+ int i, min, columns, distance, *current_row;
75
+ int cost, insert_cost, delete_cost, replace_cost;
76
+ VALUE word, codepoint, children, children_keys;
77
+
78
+ columns = from_len + 1;
79
+ MALLOC_W(current_row, columns * sizeof(int));
80
+ current_row[0] = previous_row[0] + 1;
81
+
82
+ for (i = 1; i < columns; i++) {
83
+ cost = (from[i - 1] == letter) ? 0 : substitution_cost;
84
+ insert_cost = current_row[i - 1] + insertion_cost;
85
+ delete_cost = previous_row[i] + deletion_cost;
86
+ replace_cost = previous_row[i - 1] + cost;
87
+
88
+ current_row[i] = MIN3(insert_cost, delete_cost, replace_cost);
89
+ }
90
+ distance = current_row[columns - 1];
91
+ word = rb_funcall(node, rb_intern("word"), 0);
92
+
93
+ if (distance <= max_distance && word != Qnil)
94
+ rb_hash_aset(results, word, INT2FIX(distance));
95
+
96
+ min = current_row[0];
97
+ for (i = 1; i < columns; i++)
98
+ if (current_row[i] < min)
99
+ min = current_row[i];
100
+
101
+ if (min <= max_distance) {
102
+ children = rb_funcall(node, rb_intern("children"), 0);
103
+ children_keys = rb_funcall(children, rb_intern("keys"), 0);
104
+
105
+ for (i = 0; i < RARRAY_LEN(children_keys); i++) {
106
+ codepoint = rb_ary_entry(children_keys, i);
107
+ node = rb_hash_aref(children, codepoint);
108
+ search_recursive(node, FIX2INT(codepoint), current_row, results);
109
+ }
110
+ }
111
+ free(current_row);
112
+ }
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+
3
+ module StringMetric
4
+ module Levenshtein
5
+ class TrieNode
6
+ attr_accessor :word, :children
7
+
8
+ def initialize
9
+ @word = nil
10
+ @children = {}
11
+ end
12
+
13
+ def insert(word)
14
+ node = self
15
+ word.codepoints.each do |char|
16
+ node.children[char] = TrieNode.new unless node.children.key?(char)
17
+ node = node.children[char]
18
+ end
19
+ node.word = word
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,49 @@
1
+ # coding: utf-8
2
+
3
+ module StringMetric
4
+ module Levenshtein
5
+ class TrieRadixTree
6
+ def self.distance(from, node, options = {})
7
+
8
+ @max_distance = options[:max_distance] || 0
9
+ @insertion_cost = options[:insertion_cost] || 1
10
+ @deletion_cost = options[:deletion_cost] || 1
11
+ @substitution_cost = options[:substitution_cost] || 1
12
+
13
+ results = []
14
+ word = from.codepoints
15
+ currentRow = (0..word.length).to_a
16
+
17
+ node.children.keys.each do |letter|
18
+ searchRecursive(node.children[letter], letter, word, currentRow, results)
19
+ end
20
+
21
+ results
22
+ end
23
+
24
+ def self.searchRecursive(node, letter, word, previousRow, results)
25
+ columns = word.length + 1
26
+ currentRow = [previousRow[0] + 1]
27
+
28
+ (1...columns).each do |column|
29
+ insertCost = currentRow[column - 1] + @insertion_cost
30
+ deleteCost = previousRow[column] + @deletion_cost
31
+ cost = (word[column - 1] == letter) ? 0 : @substitution_cost
32
+ replaceCost = previousRow[column - 1] + cost
33
+
34
+ currentRow << [insertCost, deleteCost, replaceCost].min
35
+ end
36
+
37
+ if currentRow.last <= @max_distance && !node.word.nil?
38
+ results << [node.word, currentRow.last]
39
+ end
40
+
41
+ if currentRow.min <= @max_distance
42
+ node.children.keys.each do |letter|
43
+ searchRecursive(node.children[letter], letter, word, currentRow, results)
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,20 @@
1
+ # coding: utf-8
2
+
3
+ require_relative 'trie_radix_tree_ext.so'
4
+
5
+ module StringMetric
6
+ module Levenshtein
7
+ class TrieRadixTreeExt
8
+ def self.distance(from, trieNode, options = {})
9
+
10
+ max_distance = options[:max_distance] || 0
11
+ insertion_cost = options[:insertion_cost] || 1
12
+ deletion_cost = options[:deletion_cost] || 1
13
+ substitution_cost = options[:substitution_cost] || 1
14
+
15
+ trie_ext(from.codepoints, from.length, trieNode, max_distance,
16
+ insertion_cost, deletion_cost, substitution_cost)
17
+ end
18
+ end
19
+ end
20
+ end
@@ -5,6 +5,9 @@ require_relative "levenshtein/iterative_with_two_matrix_rows"
5
5
  require_relative "levenshtein/iterative_with_two_matrix_rows_optimized"
6
6
  require_relative "levenshtein/iterative_with_full_matrix"
7
7
  require_relative "levenshtein/recursive"
8
+ require_relative "levenshtein/trie_node"
9
+ require_relative "levenshtein/trie_radix_tree"
10
+ require_relative "levenshtein/trie_radix_tree_ext"
8
11
 
9
12
  module StringMetric
10
13
  # Levenshtein Distance implementation
@@ -1,3 +1,3 @@
1
1
  module StringMetric
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -0,0 +1,10 @@
1
+ aliency
2
+ prostatauxe
3
+ Herbartian
4
+ womanize
5
+ unviolent
6
+ disguised
7
+ preanimism
8
+ birdling
9
+ geognosy
10
+ daut
@@ -24,6 +24,9 @@ Gem::Specification.new do |spec|
24
24
  spec.add_development_dependency "text", "~> 1.2.3"
25
25
 
26
26
  if RUBY_ENGINE == "ruby"
27
+ spec.add_development_dependency "rake-compiler", "~> 0.9.2"
28
+ spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
29
+
27
30
  if RUBY_VERSION > "1.9.3"
28
31
  spec.add_development_dependency "pry-byebug", "~> 1.2.1"
29
32
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_metric
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giorgos Tsiftsis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-13 00:00:00.000000000 Z
11
+ date: 2015-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.2.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake-compiler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.9.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.9.2
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: pry-byebug
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -84,7 +98,8 @@ description: A simple library with String Metric algorithms
84
98
  email:
85
99
  - giorgos.tsiftsis@skroutz.gr
86
100
  executables: []
87
- extensions: []
101
+ extensions:
102
+ - ext/string_metric/levenshtein/extconf.rb
88
103
  extra_rdoc_files: []
89
104
  files:
90
105
  - ".gitignore"
@@ -94,7 +109,10 @@ files:
94
109
  - LICENSE.txt
95
110
  - README.md
96
111
  - Rakefile
112
+ - benchmarks/dictionary.rb
97
113
  - benchmarks/levenshtein.rb
114
+ - ext/string_metric/levenshtein/extconf.rb
115
+ - ext/string_metric/levenshtein/trie_radix_tree_ext.c
98
116
  - lib/string_metric.rb
99
117
  - lib/string_metric/levenshtein.rb
100
118
  - lib/string_metric/levenshtein/experiment.rb
@@ -102,7 +120,11 @@ files:
102
120
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
103
121
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
104
122
  - lib/string_metric/levenshtein/recursive.rb
123
+ - lib/string_metric/levenshtein/trie_node.rb
124
+ - lib/string_metric/levenshtein/trie_radix_tree.rb
125
+ - lib/string_metric/levenshtein/trie_radix_tree_ext.rb
105
126
  - lib/string_metric/version.rb
127
+ - spec/fixtures/dictionary_input.txt
106
128
  - spec/fixtures/levenshtein.csv
107
129
  - spec/lib/levenshtein/experiment_spec.rb
108
130
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
@@ -138,6 +160,7 @@ signing_key:
138
160
  specification_version: 4
139
161
  summary: A simple library with String Metric algorithms
140
162
  test_files:
163
+ - spec/fixtures/dictionary_input.txt
141
164
  - spec/fixtures/levenshtein.csv
142
165
  - spec/lib/levenshtein/experiment_spec.rb
143
166
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb