string_metric 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
4
- data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
3
+ metadata.gz: 821e68238da86612c3b0f498d66186195ccf2c19
4
+ data.tar.gz: 895062d2498b783460c56efea880f4349cfd8a82
5
5
  SHA512:
6
- metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
7
- data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
6
+ metadata.gz: d3ea26bc39ffe8311523d43fb07cb86f713374ad152cd170b7257edecebff740bf5e1756f0af56d1bc313aaa30cf8cd17725aadb6e5316be35d06c83731840da
7
+ data.tar.gz: 4e9d498807aac322012033b4782b2401e24399c7ba78e52b10cfdee4461fa9ba64b3e44aea38118f340a1f4f7af834f080c3cdbbfc5c86a297fcf8de9afac0b3
data/README.md CHANGED
@@ -48,7 +48,7 @@ __Options__
48
48
  penalty. Can be `Fixnum` or `Float`.
49
49
 
50
50
  * `:strategy`: The desired strategy for Levenshtein distance. Supported
51
- strategies are `:recursive`, `:two_matrix_rows`, `:two_matrix_rows_v2`,
51
+ strategies are `:recursive`, `:two_matrix_rows`, `:two_matrix_rows_v2`, `:two_matrix_rows_ext`,
52
52
  `:full_matrix` and `:experiment`. The default strategy is
53
53
  `:two_matrix_rows_ext` for MRI and `:two_matrix_rows` for other platforms.
54
54
  One should not depend on `:experiment` strategy.
@@ -101,11 +101,12 @@ __Levenshtein__
101
101
 
102
102
  Implementation | User | Real
103
103
  -------------------------------------------------|-----------|-----------
104
- Levenshtein::IterativeWithFullMatrix | 2.260000 | 2.265873
105
- Levenshtein::IterativeWithTwoMatrixRows | 1.970000 | 1.971205
106
- Levenshtein::Experiment | 1.680000 | 1.684419
107
- Levenshtein::IterativeWithTwoMatrixRowsOptimized | 1.270000 | 1.269643
108
- Text::Levenshtein (from gem text) | 2.180000 | 2.186258
104
+ Levenshtein::IterativeWithFullMatrix | 2.320000 | 2.343141
105
+ Levenshtein::IterativeWithTwoMatrixRows | 2.020000 | 2.044638
106
+ Levenshtein::Experiment | 1.750000 | 1.779868
107
+ Levenshtein::IterativeWithTwoMatrixRowsOptimized | 1.320000 | 1.343095
108
+ Levenshtein::IterativeWithTwoMatrixRowsExt | 0.220000 | 0.228965
109
+ Text::Levenshtein (from gem text) | 2.240000 | 2.308803
109
110
 
110
111
  _Currently the set of fixtures is very small - ruby 2.1.0 is used_
111
112
 
data/Rakefile CHANGED
@@ -12,7 +12,14 @@ if RUBY_ENGINE == "ruby"
12
12
 
13
13
  Rake::ExtensionTask.new do |ext|
14
14
  ext.name = "trie_radix_tree_ext"
15
- ext.ext_dir = "ext/#{dir}"
15
+ ext.ext_dir = "ext/#{dir}/trie_radix_tree"
16
+ ext.lib_dir = "lib/#{dir}"
17
+ ext.gem_spec = spec
18
+ end
19
+
20
+ Rake::ExtensionTask.new do |ext|
21
+ ext.name = "iterative_with_two_matrix_rows_ext"
22
+ ext.ext_dir = "ext/#{dir}/iterative_with_two_matrix_rows"
16
23
  ext.lib_dir = "lib/#{dir}"
17
24
  ext.gem_spec = spec
18
25
  end
@@ -24,4 +31,3 @@ if RUBY_ENGINE == "ruby"
24
31
  else
25
32
  task default: :spec
26
33
  end
27
-
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile("string_metric/levenshtein/iterative_with_two_matrix_rows_ext")
@@ -0,0 +1,110 @@
1
+ #include <ruby.h>
2
+
3
+ #define MIN2(a, b) (((a) < (b)) ? (a) : (b))
4
+ #define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))
5
+
6
+ #define MALLOC_W(ptr, size) do { \
7
+ ptr = malloc(size); \
8
+ if (!ptr) \
9
+ rb_memerror(); \
10
+ } while (0)
11
+
12
+ static long min(long *array, long len)
13
+ {
14
+ long i, min = array[0];
15
+
16
+ for (i = 1; i < len; i++)
17
+ if (array[i] < min)
18
+ min = array[i];
19
+
20
+ return min;
21
+ }
22
+
23
+ VALUE distance_ext(VALUE self, VALUE v_from, VALUE v_to, VALUE v_from_len, VALUE v_to_len,
24
+ VALUE v_max_distance, VALUE v_insertion_cost, VALUE v_deletion_cost,
25
+ VALUE v_substitution_cost);
26
+
27
+ void
28
+ Init_iterative_with_two_matrix_rows_ext(void)
29
+ {
30
+ VALUE StringMetric;
31
+ VALUE Levenshtein;
32
+ VALUE IterativeWithTwoMatrixRowsExt;
33
+
34
+ StringMetric = rb_define_module("StringMetric");
35
+ Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
36
+
37
+ IterativeWithTwoMatrixRowsExt =
38
+ rb_define_class_under(Levenshtein, "IterativeWithTwoMatrixRowsExt", rb_cObject);
39
+
40
+ rb_define_singleton_method(IterativeWithTwoMatrixRowsExt, "distance_ext", distance_ext, 8);
41
+ }
42
+
43
+ VALUE
44
+ distance_ext(VALUE self, VALUE v_from, VALUE v_to, VALUE v_from_len, VALUE v_to_len,
45
+ VALUE v_max_distance, VALUE v_insertion_cost, VALUE v_deletion_cost,
46
+ VALUE v_substitution_cost)
47
+ {
48
+ /* Ruby to C datatype conversions & variables declaration-initialization */
49
+ long max_distance = NUM2LONG(v_max_distance);
50
+ long insertion_cost = NUM2LONG(v_insertion_cost);
51
+ long deletion_cost = NUM2LONG(v_deletion_cost);
52
+ long substitution_cost = NUM2LONG(v_substitution_cost);
53
+
54
+ long from_len = NUM2LONG(v_from_len);
55
+ long to_len = NUM2LONG(v_to_len);
56
+
57
+ long v0_len = (from_len + 1) * sizeof(long);
58
+
59
+ int *from, *to;
60
+ long *v0, current, i, j, sub_cell, ins_cell, cost, x = 0;
61
+
62
+ /* Use malloc for these arrays in order to avoid stack overflow from big input strings */
63
+ MALLOC_W(from, from_len * sizeof(int));
64
+ MALLOC_W(to, to_len * sizeof(int));
65
+ MALLOC_W(v0, v0_len);
66
+
67
+ /* Fill ´from´ & ´to´ C arrays with values from the corresponding ruby arrays */
68
+ for (i = 0; i < from_len; i++)
69
+ from[i] = NUM2INT(rb_ary_entry(v_from, i));
70
+
71
+ for (i = 0; i < to_len; i++)
72
+ to[i] = NUM2INT(rb_ary_entry(v_to, i));
73
+
74
+ /* Beginning of the algorithm */
75
+ for (i = 0; i <= from_len; i++)
76
+ v0[i] = i;
77
+
78
+ for (i = 0; i < to_len; i++)
79
+ {
80
+ current = x = i + 1;
81
+ sub_cell = v0[0];
82
+
83
+ for (j = 0; j < from_len; j++)
84
+ {
85
+ cost = (from[j] == to[i]) ? 0 : substitution_cost;
86
+
87
+ ins_cell = v0[j + 1];
88
+
89
+ x = MIN3(current + deletion_cost, ins_cell + insertion_cost, sub_cell + cost);
90
+
91
+ v0[j] = current;
92
+ current = x;
93
+ sub_cell = ins_cell;
94
+ }
95
+
96
+ v0[from_len] = x;
97
+ if (max_distance && min(v0, v0_len) > max_distance)
98
+ break;
99
+ }
100
+
101
+ /* Clean up allocated memory */
102
+ free(from);
103
+ free(to);
104
+ free(v0);
105
+
106
+ if (max_distance && x > max_distance)
107
+ return LONG2NUM(max_distance);
108
+ else
109
+ return LONG2NUM(x);
110
+ }
@@ -8,6 +8,7 @@ require_relative "levenshtein/recursive"
8
8
  require_relative "levenshtein/trie_node"
9
9
  require_relative "levenshtein/trie_radix_tree"
10
10
  require_relative "levenshtein/trie_radix_tree_ext"
11
+ require_relative "levenshtein/iterative_with_two_matrix_rows_ext" if RUBY_ENGINE == "ruby"
11
12
 
12
13
  module StringMetric
13
14
  # Levenshtein Distance implementation
@@ -23,6 +24,10 @@ module StringMetric
23
24
  two_matrix_rows_v2: IterativeWithTwoMatrixRowsOptimized
24
25
  }
25
26
 
27
+ if RUBY_ENGINE == "ruby"
28
+ STRATEGIES[:two_matrix_rows_ext] = IterativeWithTwoMatrixRowsExt
29
+ end
30
+
26
31
  # Levenshtein Distance of two strings
27
32
  #
28
33
  # @param from [String] the first string
@@ -53,7 +58,7 @@ module StringMetric
53
58
  # The default strategy is :two_matrix_rows_ext on MRI (RUBY_ENGINE == "ruby") and :two_matrix_rows on other platforms
54
59
  def default_strategy
55
60
  if RUBY_ENGINE == "ruby"
56
- pick_strategy(:two_matrix_rows_v2)
61
+ pick_strategy(:two_matrix_rows_ext)
57
62
  else
58
63
  pick_strategy(:two_matrix_rows)
59
64
  end
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+
3
+ require_relative "iterative_with_two_matrix_rows_ext.so"
4
+
5
+ module StringMetric
6
+ module Levenshtein
7
+ class IterativeWithTwoMatrixRowsExt
8
+ def self.distance(from, to, options = {})
9
+ max_distance = options[:max_distance]
10
+ insertion_cost = options[:insertion_cost] || 1
11
+ deletion_cost = options[:deletion_cost] || 1
12
+ substitution_cost = options[:substitution_cost] || 1
13
+
14
+ from_len = from.length
15
+ to_len = to.length
16
+
17
+ if max_distance && (to_len - from_len).abs >= max_distance
18
+ return max_distance
19
+ end
20
+
21
+ return 0 if from == to
22
+ return to_len if from_len.zero?
23
+ return from_len if to_len.zero?
24
+
25
+ from = from.codepoints.to_a
26
+ to = to.codepoints.to_a
27
+
28
+ distance_ext(from, to, from_len, to_len, max_distance || 0, insertion_cost,
29
+ deletion_cost, substitution_cost)
30
+ end
31
+ end
32
+ end
33
+ end
@@ -10,7 +10,7 @@ module StringMetric
10
10
  @deletion_cost = options[:deletion_cost] || 1
11
11
  @substitution_cost = options[:substitution_cost] || 1
12
12
 
13
- results = []
13
+ results = {}
14
14
  word = from.codepoints
15
15
  currentRow = (0..word.length).to_a
16
16
 
@@ -35,7 +35,7 @@ module StringMetric
35
35
  end
36
36
 
37
37
  if currentRow.last <= @max_distance && !node.word.nil?
38
- results << [node.word, currentRow.last]
38
+ results[node.word] = currentRow.last
39
39
  end
40
40
 
41
41
  if currentRow.min <= @max_distance
@@ -1,3 +1,3 @@
1
1
  module StringMetric
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,9 @@
1
+ # coding: utf-8
2
+
3
+ if RUBY_ENGINE == "ruby"
4
+ require "spec_helper"
5
+
6
+ describe StringMetric::Levenshtein::IterativeWithTwoMatrixRowsExt do
7
+ it_behaves_like "Levenshtein Distance", { strategy: :two_matrix_rows_ext }
8
+ end
9
+ end
@@ -34,6 +34,7 @@ shared_examples "Levenshtein Distance" do |options|
34
34
  expect(described_class.distance("gambol", "gumbo", max_distance: 1)).to eq 1
35
35
  expect(described_class.distance("kitten", "", max_distance: 2)).to eq 2
36
36
  expect(described_class.distance("", "kitten", max_distance: 3)).to eq 3
37
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 5)).to eq 5
37
38
  end
38
39
  end
39
40
  context "and normal distance is less than max_distance" do
@@ -51,6 +52,7 @@ shared_examples "Levenshtein Distance" do |options|
51
52
  expect(described_class.distance("", "cat", max_distance: 4)).to eq 3
52
53
  expect(described_class.distance("cat", "", max_distance: 5)).to eq 3
53
54
  expect(described_class.distance("", "", max_distance: 2)).to eq 0
55
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 25)).to eq 20
54
56
  end
55
57
  end
56
58
  context "and normal distance is same as max_distance" do
@@ -66,6 +68,7 @@ shared_examples "Levenshtein Distance" do |options|
66
68
  expect(described_class.distance("", "cat", max_distance: 3)).to eq 3
67
69
  expect(described_class.distance("cat", "", max_distance: 3)).to eq 3
68
70
  expect(described_class.distance("", "", max_distance: 0)).to eq 0
71
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 20)).to eq 20
69
72
  end
70
73
  end
71
74
  end
@@ -25,7 +25,8 @@ Gem::Specification.new do |spec|
25
25
 
26
26
  if RUBY_ENGINE == "ruby"
27
27
  spec.add_development_dependency "rake-compiler", "~> 0.9.2"
28
- spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
28
+ spec.extensions << "ext/#{spec.name}/levenshtein/trie_radix_tree/extconf.rb"
29
+ spec.extensions << "ext/#{spec.name}/levenshtein/iterative_with_two_matrix_rows/extconf.rb"
29
30
 
30
31
  if RUBY_VERSION > "1.9.3"
31
32
  spec.add_development_dependency "pry-byebug", "~> 1.2.1"
metadata CHANGED
@@ -1,97 +1,97 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_metric
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giorgos Tsiftsis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-23 00:00:00.000000000 Z
11
+ date: 2015-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.5'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.5'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: 10.1.1
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: 10.1.1
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ~>
46
46
  - !ruby/object:Gem::Version
47
47
  version: 2.14.1
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: 2.14.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: text
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ~>
60
60
  - !ruby/object:Gem::Version
61
61
  version: 1.2.3
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ~>
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.2.3
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake-compiler
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - "~>"
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
75
  version: 0.9.2
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - "~>"
80
+ - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.9.2
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: pry-byebug
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: 1.2.1
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: 1.2.1
97
97
  description: A simple library with String Metric algorithms
@@ -99,25 +99,29 @@ email:
99
99
  - giorgos.tsiftsis@skroutz.gr
100
100
  executables: []
101
101
  extensions:
102
- - ext/string_metric/levenshtein/extconf.rb
102
+ - ext/string_metric/levenshtein/trie_radix_tree/extconf.rb
103
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/extconf.rb
103
104
  extra_rdoc_files: []
104
105
  files:
105
- - ".gitignore"
106
- - ".rspec"
107
- - ".travis.yml"
106
+ - .gitignore
107
+ - .rspec
108
+ - .travis.yml
108
109
  - Gemfile
109
110
  - LICENSE.txt
110
111
  - README.md
111
112
  - Rakefile
112
113
  - benchmarks/dictionary.rb
113
114
  - benchmarks/levenshtein.rb
114
- - ext/string_metric/levenshtein/extconf.rb
115
- - ext/string_metric/levenshtein/trie_radix_tree_ext.c
115
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/extconf.rb
116
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/iterative_with_two_matrix_rows_ext.c
117
+ - ext/string_metric/levenshtein/trie_radix_tree/extconf.rb
118
+ - ext/string_metric/levenshtein/trie_radix_tree/trie_radix_tree_ext.c
116
119
  - lib/string_metric.rb
117
120
  - lib/string_metric/levenshtein.rb
118
121
  - lib/string_metric/levenshtein/experiment.rb
119
122
  - lib/string_metric/levenshtein/iterative_with_full_matrix.rb
120
123
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
124
+ - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_ext.rb
121
125
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
122
126
  - lib/string_metric/levenshtein/recursive.rb
123
127
  - lib/string_metric/levenshtein/trie_node.rb
@@ -128,6 +132,7 @@ files:
128
132
  - spec/fixtures/levenshtein.csv
129
133
  - spec/lib/levenshtein/experiment_spec.rb
130
134
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
135
+ - spec/lib/levenshtein/iterative_with_two_matrix_rows_ext_spec.rb
131
136
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
132
137
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
133
138
  - spec/lib/levenshtein/recursive_spec.rb
@@ -145,17 +150,17 @@ require_paths:
145
150
  - lib
146
151
  required_ruby_version: !ruby/object:Gem::Requirement
147
152
  requirements:
148
- - - ">="
153
+ - - '>='
149
154
  - !ruby/object:Gem::Version
150
155
  version: '0'
151
156
  required_rubygems_version: !ruby/object:Gem::Requirement
152
157
  requirements:
153
- - - ">="
158
+ - - '>='
154
159
  - !ruby/object:Gem::Version
155
160
  version: '0'
156
161
  requirements: []
157
162
  rubyforge_project:
158
- rubygems_version: 2.4.6
163
+ rubygems_version: 2.1.11
159
164
  signing_key:
160
165
  specification_version: 4
161
166
  summary: A simple library with String Metric algorithms
@@ -164,6 +169,7 @@ test_files:
164
169
  - spec/fixtures/levenshtein.csv
165
170
  - spec/lib/levenshtein/experiment_spec.rb
166
171
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
172
+ - spec/lib/levenshtein/iterative_with_two_matrix_rows_ext_spec.rb
167
173
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
168
174
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
169
175
  - spec/lib/levenshtein/recursive_spec.rb