string_metric 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b2a2bb553d840612ee0f9bb1eb36f561e55e5d50
4
- data.tar.gz: be1ba43ed05865948bf84e51b6eb9b85fabc170f
3
+ metadata.gz: 821e68238da86612c3b0f498d66186195ccf2c19
4
+ data.tar.gz: 895062d2498b783460c56efea880f4349cfd8a82
5
5
  SHA512:
6
- metadata.gz: f449e8a3cd1303c9ee92bd71fe57bde264b03c201c85bfbe538353da26fbc3c91ad2d5394691535e6d2e59a025f391a5408b479182ba8fca2683aeada3aa2ff5
7
- data.tar.gz: 52da6e7fd9076dd34c6b2b6325fdbea4e61a0dc37c38594ac76d3700adb61f10de960648a9b23dc8a9d730f5ea6bbcbab244a260371b3202be7fde2e680b4149
6
+ metadata.gz: d3ea26bc39ffe8311523d43fb07cb86f713374ad152cd170b7257edecebff740bf5e1756f0af56d1bc313aaa30cf8cd17725aadb6e5316be35d06c83731840da
7
+ data.tar.gz: 4e9d498807aac322012033b4782b2401e24399c7ba78e52b10cfdee4461fa9ba64b3e44aea38118f340a1f4f7af834f080c3cdbbfc5c86a297fcf8de9afac0b3
data/README.md CHANGED
@@ -48,7 +48,7 @@ __Options__
48
48
  penalty. Can be `Fixnum` or `Float`.
49
49
 
50
50
  * `:strategy`: The desired strategy for Levenshtein distance. Supported
51
- strategies are `:recursive`, `:two_matrix_rows`, `:two_matrix_rows_v2`,
51
+ strategies are `:recursive`, `:two_matrix_rows`, `:two_matrix_rows_v2`, `:two_matrix_rows_ext`,
52
52
  `:full_matrix` and `:experiment`. The default strategy is
53
53
  `:two_matrix_rows_v2` for MRI and `:two_matrix_rows` for other platforms.
54
54
  One should not depend on the `:experiment` strategy.
@@ -101,11 +101,12 @@ __Levenshtein__
101
101
 
102
102
  Implementation | User | Real
103
103
  -------------------------------------------------|-----------|-----------
104
- Levenshtein::IterativeWithFullMatrix | 2.260000 | 2.265873
105
- Levenshtein::IterativeWithTwoMatrixRows | 1.970000 | 1.971205
106
- Levenshtein::Experiment | 1.680000 | 1.684419
107
- Levenshtein::IterativeWithTwoMatrixRowsOptimized | 1.270000 | 1.269643
108
- Text::Levenshtein (from gem text) | 2.180000 | 2.186258
104
+ Levenshtein::IterativeWithFullMatrix | 2.320000 | 2.343141
105
+ Levenshtein::IterativeWithTwoMatrixRows | 2.020000 | 2.044638
106
+ Levenshtein::Experiment | 1.750000 | 1.779868
107
+ Levenshtein::IterativeWithTwoMatrixRowsOptimized | 1.320000 | 1.343095
108
+ Levenshtein::IterativeWithTwoMatrixRowsExt | 0.220000 | 0.228965
109
+ Text::Levenshtein (from gem text) | 2.240000 | 2.308803
109
110
 
110
111
  _Currently the set of fixtures is very small - ruby 2.1.0 is used_
111
112
 
data/Rakefile CHANGED
@@ -12,7 +12,14 @@ if RUBY_ENGINE == "ruby"
12
12
 
13
13
  Rake::ExtensionTask.new do |ext|
14
14
  ext.name = "trie_radix_tree_ext"
15
- ext.ext_dir = "ext/#{dir}"
15
+ ext.ext_dir = "ext/#{dir}/trie_radix_tree"
16
+ ext.lib_dir = "lib/#{dir}"
17
+ ext.gem_spec = spec
18
+ end
19
+
20
+ Rake::ExtensionTask.new do |ext|
21
+ ext.name = "iterative_with_two_matrix_rows_ext"
22
+ ext.ext_dir = "ext/#{dir}/iterative_with_two_matrix_rows"
16
23
  ext.lib_dir = "lib/#{dir}"
17
24
  ext.gem_spec = spec
18
25
  end
@@ -24,4 +31,3 @@ if RUBY_ENGINE == "ruby"
24
31
  else
25
32
  task default: :spec
26
33
  end
27
-
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile("string_metric/levenshtein/iterative_with_two_matrix_rows_ext")
@@ -0,0 +1,110 @@
1
+ #include <ruby.h>
2
+
3
+ #define MIN2(a, b) (((a) < (b)) ? (a) : (b))
4
+ #define MIN3(a, b, c) (MIN2(MIN2((a), (b)), (c)))
5
+
6
+ #define MALLOC_W(ptr, size) do { \
7
+ ptr = malloc(size); \
8
+ if (!ptr) \
9
+ rb_memerror(); \
10
+ } while (0)
11
+
12
/*
 * Returns the smallest value among the first `len` elements of `array`.
 *
 * `len` is an element count, not a byte size. The first element is read
 * unconditionally, so `array` must hold at least one element.
 */
static long min(const long *array, long len)
{
  long i;
  long smallest = array[0];

  for (i = 1; i < len; i++) {
    if (array[i] < smallest)
      smallest = array[i];
  }

  return smallest;
}
22
+
23
+ VALUE distance_ext(VALUE self, VALUE v_from, VALUE v_to, VALUE v_from_len, VALUE v_to_len,
24
+ VALUE v_max_distance, VALUE v_insertion_cost, VALUE v_deletion_cost,
25
+ VALUE v_substitution_cost);
26
+
27
+ void
28
+ Init_iterative_with_two_matrix_rows_ext(void)
29
+ {
30
+ VALUE StringMetric;
31
+ VALUE Levenshtein;
32
+ VALUE IterativeWithTwoMatrixRowsExt;
33
+
34
+ StringMetric = rb_define_module("StringMetric");
35
+ Levenshtein = rb_define_module_under(StringMetric, "Levenshtein");
36
+
37
+ IterativeWithTwoMatrixRowsExt =
38
+ rb_define_class_under(Levenshtein, "IterativeWithTwoMatrixRowsExt", rb_cObject);
39
+
40
+ rb_define_singleton_method(IterativeWithTwoMatrixRowsExt, "distance_ext", distance_ext, 8);
41
+ }
42
+
43
+ VALUE
44
+ distance_ext(VALUE self, VALUE v_from, VALUE v_to, VALUE v_from_len, VALUE v_to_len,
45
+ VALUE v_max_distance, VALUE v_insertion_cost, VALUE v_deletion_cost,
46
+ VALUE v_substitution_cost)
47
+ {
48
+ /* Ruby to C datatype conversions & variables declaration-initialization */
49
+ long max_distance = NUM2LONG(v_max_distance);
50
+ long insertion_cost = NUM2LONG(v_insertion_cost);
51
+ long deletion_cost = NUM2LONG(v_deletion_cost);
52
+ long substitution_cost = NUM2LONG(v_substitution_cost);
53
+
54
+ long from_len = NUM2LONG(v_from_len);
55
+ long to_len = NUM2LONG(v_to_len);
56
+
57
+ long v0_len = (from_len + 1) * sizeof(long);
58
+
59
+ int *from, *to;
60
+ long *v0, current, i, j, sub_cell, ins_cell, cost, x = 0;
61
+
62
+ /* Use malloc for these arrays in order to avoid stack overflow from big input strings */
63
+ MALLOC_W(from, from_len * sizeof(int));
64
+ MALLOC_W(to, to_len * sizeof(int));
65
+ MALLOC_W(v0, v0_len);
66
+
67
+ /* Fill ´from´ & ´to´ C arrays with values from the corresponding ruby arrays */
68
+ for (i = 0; i < from_len; i++)
69
+ from[i] = NUM2INT(rb_ary_entry(v_from, i));
70
+
71
+ for (i = 0; i < to_len; i++)
72
+ to[i] = NUM2INT(rb_ary_entry(v_to, i));
73
+
74
+ /* Beggining of the algorithm */
75
+ for (i = 0; i <= from_len; i++)
76
+ v0[i] = i;
77
+
78
+ for (i = 0; i < to_len; i++)
79
+ {
80
+ current = x = i + 1;
81
+ sub_cell = v0[0];
82
+
83
+ for (j = 0; j < from_len; j++)
84
+ {
85
+ cost = (from[j] == to[i]) ? 0 : substitution_cost;
86
+
87
+ ins_cell = v0[j + 1];
88
+
89
+ x = MIN3(current + deletion_cost, ins_cell + insertion_cost, sub_cell + cost);
90
+
91
+ v0[j] = current;
92
+ current = x;
93
+ sub_cell = ins_cell;
94
+ }
95
+
96
+ v0[from_len] = x;
97
+ if (max_distance && min(v0, v0_len) > max_distance)
98
+ break;
99
+ }
100
+
101
+ /* Clean up allocated memory */
102
+ free(from);
103
+ free(to);
104
+ free(v0);
105
+
106
+ if (max_distance && x > max_distance)
107
+ return LONG2NUM(max_distance);
108
+ else
109
+ return LONG2NUM(x);
110
+ }
@@ -8,6 +8,7 @@ require_relative "levenshtein/recursive"
8
8
  require_relative "levenshtein/trie_node"
9
9
  require_relative "levenshtein/trie_radix_tree"
10
10
  require_relative "levenshtein/trie_radix_tree_ext"
11
+ require_relative "levenshtein/iterative_with_two_matrix_rows_ext" if RUBY_ENGINE == "ruby"
11
12
 
12
13
  module StringMetric
13
14
  # Levenshtein Distance implementation
@@ -23,6 +24,10 @@ module StringMetric
23
24
  two_matrix_rows_v2: IterativeWithTwoMatrixRowsOptimized
24
25
  }
25
26
 
27
+ if RUBY_ENGINE == "ruby"
28
+ STRATEGIES[:two_matrix_rows_ext] = IterativeWithTwoMatrixRowsExt
29
+ end
30
+
26
31
  # Levenshtein Distance of two strings
27
32
  #
28
33
  # @param from [String] the first string
@@ -53,7 +58,7 @@ module StringMetric
53
58
  # The default strategy is IterativeWithTwoMatrixRowsExt when RUBY_ENGINE is "ruby" (MRI) and IterativeWithTwoMatrixRows otherwise
54
59
  def default_strategy
55
60
  if RUBY_ENGINE == "ruby"
56
- pick_strategy(:two_matrix_rows_v2)
61
+ pick_strategy(:two_matrix_rows_ext)
57
62
  else
58
63
  pick_strategy(:two_matrix_rows)
59
64
  end
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+
3
+ require_relative "iterative_with_two_matrix_rows_ext.so"
4
+
5
module StringMetric
  module Levenshtein
    # Ruby-side front end for the native `distance_ext` singleton method.
    class IterativeWithTwoMatrixRowsExt
      # Levenshtein distance between two strings.
      #
      # @param from [String] the first string
      # @param to [String] the second string
      # @param options [Hash] optional :max_distance, :insertion_cost,
      #   :deletion_cost and :substitution_cost (each cost defaults to 1)
      # @return the distance; when :max_distance is given and the lengths
      #   already differ by at least that much, :max_distance itself
      def self.distance(from, to, options = {})
        max_distance      = options[:max_distance]
        insertion_cost    = options[:insertion_cost] || 1
        deletion_cost     = options[:deletion_cost] || 1
        substitution_cost = options[:substitution_cost] || 1

        from_len = from.length
        to_len   = to.length

        # The length difference is checked against the bound before any
        # other work is done.
        return max_distance if max_distance && (to_len - from_len).abs >= max_distance

        # Trivial cases that never reach the native code.
        return 0 if from == to
        return to_len if from_len.zero?
        return from_len if to_len.zero?

        # The native method works on codepoint arrays; 0 stands for
        # "no maximum distance".
        distance_ext(from.codepoints.to_a, to.codepoints.to_a, from_len, to_len,
                     max_distance || 0, insertion_cost, deletion_cost,
                     substitution_cost)
      end
    end
  end
end
@@ -10,7 +10,7 @@ module StringMetric
10
10
  @deletion_cost = options[:deletion_cost] || 1
11
11
  @substitution_cost = options[:substitution_cost] || 1
12
12
 
13
- results = []
13
+ results = {}
14
14
  word = from.codepoints
15
15
  currentRow = (0..word.length).to_a
16
16
 
@@ -35,7 +35,7 @@ module StringMetric
35
35
  end
36
36
 
37
37
  if currentRow.last <= @max_distance && !node.word.nil?
38
- results << [node.word, currentRow.last]
38
+ results[node.word] = currentRow.last
39
39
  end
40
40
 
41
41
  if currentRow.min <= @max_distance
@@ -1,3 +1,3 @@
1
1
  module StringMetric
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,9 @@
1
+ # coding: utf-8
2
+
3
+ if RUBY_ENGINE == "ruby"
4
+ require "spec_helper"
5
+
6
+ describe StringMetric::Levenshtein::IterativeWithTwoMatrixRowsExt do
7
+ it_behaves_like "Levenshtein Distance", { strategy: :two_matrix_rows_ext }
8
+ end
9
+ end
@@ -34,6 +34,7 @@ shared_examples "Levenshtein Distance" do |options|
34
34
  expect(described_class.distance("gambol", "gumbo", max_distance: 1)).to eq 1
35
35
  expect(described_class.distance("kitten", "", max_distance: 2)).to eq 2
36
36
  expect(described_class.distance("", "kitten", max_distance: 3)).to eq 3
37
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 5)).to eq 5
37
38
  end
38
39
  end
39
40
  context "and normal distance is less than max_distance" do
@@ -51,6 +52,7 @@ shared_examples "Levenshtein Distance" do |options|
51
52
  expect(described_class.distance("", "cat", max_distance: 4)).to eq 3
52
53
  expect(described_class.distance("cat", "", max_distance: 5)).to eq 3
53
54
  expect(described_class.distance("", "", max_distance: 2)).to eq 0
55
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 25)).to eq 20
54
56
  end
55
57
  end
56
58
  context "and normal distance is same as max_distance" do
@@ -66,6 +68,7 @@ shared_examples "Levenshtein Distance" do |options|
66
68
  expect(described_class.distance("", "cat", max_distance: 3)).to eq 3
67
69
  expect(described_class.distance("cat", "", max_distance: 3)).to eq 3
68
70
  expect(described_class.distance("", "", max_distance: 0)).to eq 0
71
+ expect(described_class.distance("a", "reallylongwordfortest", max_distance: 20)).to eq 20
69
72
  end
70
73
  end
71
74
  end
@@ -25,7 +25,8 @@ Gem::Specification.new do |spec|
25
25
 
26
26
  if RUBY_ENGINE == "ruby"
27
27
  spec.add_development_dependency "rake-compiler", "~> 0.9.2"
28
- spec.extensions << "ext/#{spec.name}/levenshtein/extconf.rb"
28
+ spec.extensions << "ext/#{spec.name}/levenshtein/trie_radix_tree/extconf.rb"
29
+ spec.extensions << "ext/#{spec.name}/levenshtein/iterative_with_two_matrix_rows/extconf.rb"
29
30
 
30
31
  if RUBY_VERSION > "1.9.3"
31
32
  spec.add_development_dependency "pry-byebug", "~> 1.2.1"
metadata CHANGED
@@ -1,97 +1,97 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_metric
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giorgos Tsiftsis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-23 00:00:00.000000000 Z
11
+ date: 2015-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.5'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.5'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: 10.1.1
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: 10.1.1
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ~>
46
46
  - !ruby/object:Gem::Version
47
47
  version: 2.14.1
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: 2.14.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: text
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ~>
60
60
  - !ruby/object:Gem::Version
61
61
  version: 1.2.3
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ~>
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.2.3
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake-compiler
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - "~>"
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
75
  version: 0.9.2
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - "~>"
80
+ - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: 0.9.2
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: pry-byebug
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: 1.2.1
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: 1.2.1
97
97
  description: A simple library with String Metric algorithms
@@ -99,25 +99,29 @@ email:
99
99
  - giorgos.tsiftsis@skroutz.gr
100
100
  executables: []
101
101
  extensions:
102
- - ext/string_metric/levenshtein/extconf.rb
102
+ - ext/string_metric/levenshtein/trie_radix_tree/extconf.rb
103
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/extconf.rb
103
104
  extra_rdoc_files: []
104
105
  files:
105
- - ".gitignore"
106
- - ".rspec"
107
- - ".travis.yml"
106
+ - .gitignore
107
+ - .rspec
108
+ - .travis.yml
108
109
  - Gemfile
109
110
  - LICENSE.txt
110
111
  - README.md
111
112
  - Rakefile
112
113
  - benchmarks/dictionary.rb
113
114
  - benchmarks/levenshtein.rb
114
- - ext/string_metric/levenshtein/extconf.rb
115
- - ext/string_metric/levenshtein/trie_radix_tree_ext.c
115
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/extconf.rb
116
+ - ext/string_metric/levenshtein/iterative_with_two_matrix_rows/iterative_with_two_matrix_rows_ext.c
117
+ - ext/string_metric/levenshtein/trie_radix_tree/extconf.rb
118
+ - ext/string_metric/levenshtein/trie_radix_tree/trie_radix_tree_ext.c
116
119
  - lib/string_metric.rb
117
120
  - lib/string_metric/levenshtein.rb
118
121
  - lib/string_metric/levenshtein/experiment.rb
119
122
  - lib/string_metric/levenshtein/iterative_with_full_matrix.rb
120
123
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows.rb
124
+ - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_ext.rb
121
125
  - lib/string_metric/levenshtein/iterative_with_two_matrix_rows_optimized.rb
122
126
  - lib/string_metric/levenshtein/recursive.rb
123
127
  - lib/string_metric/levenshtein/trie_node.rb
@@ -128,6 +132,7 @@ files:
128
132
  - spec/fixtures/levenshtein.csv
129
133
  - spec/lib/levenshtein/experiment_spec.rb
130
134
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
135
+ - spec/lib/levenshtein/iterative_with_two_matrix_rows_ext_spec.rb
131
136
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
132
137
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
133
138
  - spec/lib/levenshtein/recursive_spec.rb
@@ -145,17 +150,17 @@ require_paths:
145
150
  - lib
146
151
  required_ruby_version: !ruby/object:Gem::Requirement
147
152
  requirements:
148
- - - ">="
153
+ - - '>='
149
154
  - !ruby/object:Gem::Version
150
155
  version: '0'
151
156
  required_rubygems_version: !ruby/object:Gem::Requirement
152
157
  requirements:
153
- - - ">="
158
+ - - '>='
154
159
  - !ruby/object:Gem::Version
155
160
  version: '0'
156
161
  requirements: []
157
162
  rubyforge_project:
158
- rubygems_version: 2.4.6
163
+ rubygems_version: 2.1.11
159
164
  signing_key:
160
165
  specification_version: 4
161
166
  summary: A simple library with String Metric algorithms
@@ -164,6 +169,7 @@ test_files:
164
169
  - spec/fixtures/levenshtein.csv
165
170
  - spec/lib/levenshtein/experiment_spec.rb
166
171
  - spec/lib/levenshtein/iterative_with_full_matric_spec.rb
172
+ - spec/lib/levenshtein/iterative_with_two_matrix_rows_ext_spec.rb
167
173
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_optimized_spec.rb
168
174
  - spec/lib/levenshtein/iterative_with_two_matrix_rows_spec.rb
169
175
  - spec/lib/levenshtein/recursive_spec.rb