amatch 0.1.5 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,5 +1,8 @@
1
- 2005-01-20 (0.1.5)
2
- * Bugfix for insertion/deletion weights. Sorry.
1
+ 2005-06-01 (0.2.0)
2
+ * Major changes in API and implementation:
3
+ Now the Levenshtein edit distance, Sellers edit distance, the Hamming
4
+ distance, the longest common subsequence length, the longest common
5
+ substring length, and the pair distance metric can be computed.
3
6
  2005-01-20 (0.1.4)
4
7
  * Better argument handling in initialization method
5
8
  * Minor changes in Rakefile and README.en
@@ -0,0 +1,5 @@
1
+ /usr/local/stow/ruby/bin/
2
+ /usr/local/stow/ruby/bin/CVS
3
+ /usr/local/stow/ruby/bin/CVS
4
+ /usr/local/stow/ruby/bin/CVS
5
+ /usr/local/stow/ruby/lib/ruby/site_ruby/1.8/i686-linux/.
data/README.en CHANGED
@@ -27,5 +27,5 @@ Florian Frank <flori@ping.de>
27
27
  License
28
28
  =======
29
29
 
30
- GNU General Public License (GPL)
30
+ GNU General Public License, Version 2 (GPLv2)
31
31
 
data/Rakefile CHANGED
@@ -3,101 +3,110 @@
3
3
  require 'rake/clean'
4
4
  require 'rake/testtask'
5
5
  require 'rake/gempackagetask'
6
+ require 'rake/rdoctask'
6
7
  require 'rbconfig'
7
8
 
8
9
  include Config
9
10
 
10
- PKG_NAME = 'amatch'
11
11
  PKG_VERSION = File.read('VERSION').chomp
12
- PKG_FILES = Dir.glob("**/*").delete_if {|item|
13
- item.include?("CVS") or item.include?("pkg")
14
- }
12
+ PKG_FILES = FileList['**/*']
13
+ PKG_FILES.exclude(/CVS/)
14
+ PKG_FILES.exclude(/^pkg/)
15
+ PKG_FILES.exclude(/^doc/)
15
16
 
16
- task :default => [:test]
17
+ task :default => :test
17
18
 
18
19
  desc "Run unit tests"
19
- task(:test => [ :clean, :compile ]) do
20
- ruby %{-Iext tests/test.rb}
20
+ task(:test => [:compile]) do
21
+ cd 'tests' do
22
+ ruby %{-I../ext runner.rb}
23
+ end
21
24
  end
22
25
 
23
26
  desc "Compiling library"
24
27
  task :compile do
25
- cd 'ext' do
26
- ruby %{extconf.rb}
27
- sh "make"
28
- end
28
+ cd 'ext' do
29
+ ruby %{extconf.rb}
30
+ sh "make"
31
+ end
29
32
  end
30
33
 
31
34
  desc "Installing library"
32
- task(:install => [:test]) do
33
- src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
34
- filename = File.basename(src)
35
- dst = File.join(CONFIG["sitelibdir"], filename)
36
- install(src, dst, :verbose => true)
35
+ task :install => :test do
36
+ src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
37
+ filename = File.basename(src)
38
+ dst = File.join(CONFIG["sitelibdir"], filename)
39
+ install(src, dst, :verbose => true)
37
40
  end
38
41
 
42
+ desc "Removing generated files"
39
43
  task :clean do
40
- cd 'ext' do
41
- ruby %{extconf.rb}
42
- sh "make distclean" if File.exist?('Makefile')
43
- end
44
+ cd 'ext' do
45
+ ruby 'extconf.rb'
46
+ sh "make distclean" if File.exist?('Makefile')
47
+ end
44
48
  end
45
49
 
46
- spec = Gem::Specification.new do |s|
47
-
48
- #### Basic information.
50
+ Rake::RDocTask.new do |rd|
51
+ rd.main = 'Amatch'
52
+ rd.rdoc_files.include("ext/amatch.c")
53
+ rd.rdoc_dir = 'doc'
54
+ end
49
55
 
50
- s.name = 'amatch'
51
- s.version = PKG_VERSION
52
- s.summary = "Approximate String Matching library"
53
- s.description = <<EOF
54
- Amatch is a library for approximate string matching and searching using
55
- a dynamic programming algorithm to compute the Levenstein distance
56
- between strings.
56
+ spec = Gem::Specification.new do |s|
57
+ #### Basic information.
58
+
59
+ s.name = 'amatch'
60
+ s.version = PKG_VERSION
61
+ s.summary = "Approximate String Matching library"
62
+ s.description = <<EOF
63
+ Amatch is a library for approximate string matching and searching in strings.
64
+ Several algorithms can be used to do this, and it's also possible to compute a
65
+ similarity metric number between 0.0 and 1.0 for two given strings.
57
66
  EOF
58
67
 
59
- #### Dependencies and requirements.
68
+ #### Dependencies and requirements.
60
69
 
61
- #s.add_dependency('log4r', '> 1.0.4')
62
- #s.requirements << ""
70
+ #s.add_dependency('log4r', '> 1.0.4')
71
+ #s.requirements << ""
63
72
 
64
- s.files = PKG_FILES
73
+ s.files = PKG_FILES
65
74
 
66
- #### C code extensions.
75
+ #### C code extensions.
67
76
 
68
- s.extensions << "ext/extconf.rb"
77
+ s.extensions << "ext/extconf.rb"
69
78
 
70
- #### Load-time details: library and application (you will need one or both).
79
+ #### Load-time details: library and application (you will need one or both).
71
80
 
72
- s.require_path = 'lib' # Use these for libraries.
73
- s.autorequire = 'amatch'
81
+ s.require_path = 'ext' # Use these for libraries.
82
+ s.autorequire = 'amatch'
74
83
 
75
- s.bindir = "bin" # Use these for applications.
76
- s.executables = ["agrep.rb"]
77
- s.default_executable = "agrep.rb"
84
+ s.bindir = "bin" # Use these for applications.
85
+ s.executables = ["agrep.rb"]
86
+ s.default_executable = "agrep.rb"
78
87
 
79
- #### Documentation and testing.
88
+ #### Documentation and testing.
80
89
 
81
- s.has_rdoc = true
82
- #s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
83
- #s.rdoc_options <<
84
- # '--title' << 'Rake -- Ruby Make' <<
85
- # '--main' << 'README' <<
86
- # '--line-numbers'
87
- s.test_files << 'tests/test.rb'
90
+ s.has_rdoc = true
91
+ #s.extra_rdoc_files = FileList['ext/amatch.c']
92
+ s.rdoc_options <<
93
+ '--title' << 'Amatch -- Approximate Matching' <<
94
+ '--main' << 'Amatch' <<
95
+ '--line-numbers'
96
+ s.test_files << 'tests/runner.rb'
88
97
 
89
- #### Author and project details.
98
+ #### Author and project details.
90
99
 
91
- s.author = "Florian Frank"
92
- s.email = "flori@ping.de"
93
- s.homepage = "http://amatch.rubyforge.org"
94
- s.rubyforge_project = "amatch"
100
+ s.author = "Florian Frank"
101
+ s.email = "flori@ping.de"
102
+ s.homepage = "http://amatch.rubyforge.org"
103
+ s.rubyforge_project = "amatch"
95
104
  end
96
105
 
97
106
  Rake::GemPackageTask.new(spec) do |pkg|
98
- pkg.need_tar = true
99
- pkg.package_files += PKG_FILES
107
+ pkg.need_tar = true
108
+ pkg.package_files += PKG_FILES
100
109
  end
101
110
 
102
111
  task :release => [ :clean, :package ]
103
- # vim: set et sw=4 ts=4:
112
+ # vim: set et sw=2 ts=2:
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.2.0
@@ -1,74 +1,87 @@
1
1
  #! /usr/bin/env ruby
2
2
  #
3
- ## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
3
+ ## $Id: agrep.rb,v 1.3 2005/04/24 21:11:06 flori Exp $
4
4
  #
5
5
 
6
6
  require 'amatch'
7
7
  require 'getoptlong'
8
8
 
9
9
  def usage(msg, options)
10
- print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
11
- options.each { |o|
12
- print " " + o[1] + ", " + o[0] + " " +
13
- (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '') + "\n"
14
- }
15
- print "\nReport bugs to <flori@ping.de>.\n"
16
- exit 0
10
+ print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
11
+ options.each do |o|
12
+ puts " " + o[1] + ", " + o[0] + " " +
13
+ (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
14
+ end
15
+ puts "\nReport bugs to <flori@ping.de>."
16
+ exit 0
17
+ end
18
+
19
+ class MyAmatch < Amatch
20
+ def l_search_relative(strings)
21
+ if strings.is_a? Array
22
+ l_search(strings).map { |x| x / pattern.size }
23
+ else
24
+ l_search(strings) / pattern.size
25
+ end
26
+ end
17
27
  end
18
28
 
19
29
  $distance = 1
30
+ $mode = :l_search
20
31
  begin
21
- parser = GetoptLong.new
22
- options = [
23
- [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
24
- [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
25
- [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
26
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
27
- ]
28
- parser.set_options(*options)
29
- parser.each_option { |name, arg|
30
- name = name.sub(/^--/, '')
31
- case name
32
- when 'distance'
33
- $distance = arg.to_f
34
- when 'relative'
35
- $relative = 1
36
- when 'verbose'
37
- $verbose = 1
38
- when 'help'
39
- usage('You\'ve asked for it!', options)
40
- end
41
- }
32
+ parser = GetoptLong.new
33
+ options = [
34
+ [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
35
+ [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
36
+ [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
37
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
38
+ ]
39
+ parser.set_options(*options)
40
+ parser.each_option do |name, arg|
41
+ name = name.sub(/^--/, '')
42
+ case name
43
+ when 'distance'
44
+ $distance = arg.to_f
45
+ when 'relative'
46
+ $mode = :l_search_relative
47
+ when 'verbose'
48
+ $verbose = 1
49
+ when 'help'
50
+ usage('You\'ve asked for it!', options)
51
+ end
52
+ end
42
53
  rescue
43
- exit 1
54
+ exit 1
44
55
  end
45
- $pattern = ARGV.shift or usage('Pattern needed!', options)
56
+ pattern = ARGV.shift or usage('Pattern needed!', options)
46
57
 
47
- matcher = Amatch.new($pattern)
58
+ matcher = MyAmatch.new(pattern)
48
59
  size = 0
49
60
  start = Time.new
50
61
  if ARGV.size > 0 then
51
- ARGV.each { |filename|
52
- File.stat(filename).file? or next
53
- size += File.size(filename)
54
- begin
55
- File.open(filename, 'r').each_line { |line|
56
- print "#{filename}:#{line}" if
57
- ($relative ? matcher.searchr(line) :
58
- matcher.search(line)) <= $distance
59
- }
60
- rescue
61
- $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
62
- end
63
- }
62
+ ARGV.each do |filename|
63
+ File.stat(filename).file? or next
64
+ size += File.size(filename)
65
+ begin
66
+ File.open(filename, 'r').each_line do |line|
67
+ if matcher.__send__($mode, line) < $distance
68
+ puts "#{filename}:#{line}"
69
+ end
70
+ end
71
+ rescue
72
+ STDERR.print "Failure at #{filename}: #{$!} => Skipping!\n"
73
+ end
74
+ end
64
75
  else
65
- $stdin.each_line { |line|
66
- size += line.size
67
- print line if ($relative ? matcher.searchr(line) :
68
- matcher.search(line)) <= $distance
69
- }
76
+ STDIN.each_line do |line|
77
+ size += line.size
78
+ if matcher.__send__($mode, line) <= $distance
79
+ puts line
80
+ end
81
+ end
70
82
  end
71
83
  time = Time.new - start
72
- $verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
73
- time, size / time / 1024
84
+ $verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
85
+ time, size / time / 1024
74
86
  exit 0
87
+ # vim: set et sw=2 ts=2:
@@ -0,0 +1,12 @@
1
+ bin-dir=$prefix/bin
2
+ site-ruby=$prefix/lib/ruby/site_ruby/1.8
3
+ prefix=/usr/local/stow/ruby
4
+ ruby-path=/usr/local/stow/ruby/bin/ruby
5
+ make-prog=make
6
+ rb-dir=$site-ruby
7
+ without-ext=no
8
+ ruby-prog=/usr/local/stow/ruby/bin/ruby
9
+ site-ruby-common=$prefix/lib/ruby/site_ruby
10
+ std-ruby=$prefix/lib/ruby/1.8
11
+ data-dir=$prefix/share
12
+ so-dir=$prefix/lib/ruby/site_ruby/1.8/i686-linux
Binary file
@@ -1,312 +1,1388 @@
1
1
  #include "ruby.h"
2
+ #include "pair.h"
2
3
 
3
- static VALUE cAmatch;
4
+ /*
5
+ * Document-method: pattern
6
+ *
7
+ * call-seq: pattern -> pattern string
8
+ *
9
+ * Returns the current pattern string of this instance.
10
+ */
4
11
 
5
12
  /*
6
- * Vector stuff
13
+ * Document-method: pattern=
14
+ *
15
+ * call-seq: pattern=(pattern)
16
+ *
17
+ * Sets the current pattern string of this instance to <code>pattern</code>.
7
18
  */
8
19
 
9
- typedef struct {
10
- int *ptr;
11
- int len;
12
- } vector;
13
20
 
14
- static vector *
15
- vector_new(len)
16
- int len;
17
- {
18
- vector *v;
19
- v = ALLOC(vector);
20
- if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
21
- v->ptr = ALLOC_N(int, len + 1);
22
- if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
23
- v->len = len;
24
- return v;
25
- }
26
-
27
- static void
28
- vector_print(v)
29
- vector *v;
21
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
22
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
23
+
24
+ static ID id_split, id_to_f;
25
+
26
+ #define GET_STRUCT(klass) \
27
+ klass *amatch; \
28
+ Data_Get_Struct(self, klass, amatch);
29
+
30
+ #define DEF_ALLOCATOR(type) \
31
+ static type *type##_allocate() \
32
+ { \
33
+ type *obj = ALLOC(type); \
34
+ MEMZERO(obj, type, 1); \
35
+ return obj; \
36
+ }
37
+
38
+ #define DEF_CONSTRUCTOR(klass, type) \
39
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
40
+ { \
41
+ type *amatch = type##_allocate(); \
42
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
43
+ } \
44
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
45
+ { \
46
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
47
+ rb_##klass##_initialize(obj, pattern); \
48
+ return obj; \
49
+ }
50
+
51
+ #define DEF_RB_FREE(klass, type) \
52
+ static void rb_##klass##_free(type *amatch) \
53
+ { \
54
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
55
+ free(amatch->pattern); \
56
+ MEMZERO(amatch, type, 1); \
57
+ free(amatch); \
58
+ }
59
+
60
+ #define DEF_PATTERN_ACCESSOR(type) \
61
+ static void type##_pattern_set(type *amatch, VALUE pattern) \
62
+ { \
63
+ Check_Type(pattern, T_STRING); \
64
+ free(amatch->pattern); \
65
+ amatch->pattern_len = RSTRING(pattern)->len; \
66
+ amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
67
+ MEMCPY(amatch->pattern, RSTRING(pattern)->ptr, char, \
68
+ RSTRING(pattern)->len); \
69
+ } \
70
+ static VALUE rb_##type##_pattern(VALUE self) \
71
+ { \
72
+ GET_STRUCT(type) \
73
+ return rb_str_new(amatch->pattern, amatch->pattern_len); \
74
+ } \
75
+ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
76
+ { \
77
+ GET_STRUCT(type) \
78
+ type##_pattern_set(amatch, pattern); \
79
+ return Qnil; \
80
+ }
81
+
82
+ #define DEF_ITERATE_STRINGS(type) \
83
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
84
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
85
+ { \
86
+ if (TYPE(strings) == T_STRING) { \
87
+ return match_function(amatch, strings); \
88
+ } else { \
89
+ Check_Type(strings, T_ARRAY); \
90
+ int i; \
91
+ VALUE result = rb_ary_new2(RARRAY(strings)->len); \
92
+ for (i = 0; i < RARRAY(strings)->len; i++) { \
93
+ VALUE string = rb_ary_entry(strings, i); \
94
+ if (TYPE(string) != T_STRING) { \
95
+ rb_raise(rb_eTypeError, \
96
+ "array has to contain only strings (%s given)", \
97
+ NIL_P(string) ? \
98
+ "NilClass" : \
99
+ rb_class2name(CLASS_OF(string))); \
100
+ } \
101
+ rb_ary_push(result, match_function(amatch, string)); \
102
+ } \
103
+ return result; \
104
+ } \
105
+ }
106
+
107
+ #define DEF_RB_READER(type, function, name, converter) \
108
+ VALUE function(VALUE self) \
109
+ { \
110
+ GET_STRUCT(type) \
111
+ return converter(amatch->name); \
112
+ }
113
+
114
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
115
+ VALUE function(VALUE self, VALUE value) \
116
+ { \
117
+ vtype value_ ## vtype; \
118
+ GET_STRUCT(type) \
119
+ caster(value); \
120
+ value_ ## vtype = converter(value); \
121
+ if (!(value_ ## vtype check)) \
122
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
123
+ amatch->name = value_ ## vtype; \
124
+ return Qnil; \
125
+ }
126
+
127
+
128
+ #define CAST2FLOAT(obj) \
129
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
130
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
131
+ else \
132
+ Check_Type(obj, T_FLOAT)
133
+ #define FLOAT2C(obj) RFLOAT(obj)->value
134
+
135
+ #define OPTIMIZE_TIME \
136
+ if (amatch->pattern_len < RSTRING(string)->len) { \
137
+ a_ptr = amatch->pattern; \
138
+ a_len = amatch->pattern_len; \
139
+ b_ptr = RSTRING(string)->ptr; \
140
+ b_len = RSTRING(string)->len; \
141
+ } else { \
142
+ a_ptr = RSTRING(string)->ptr; \
143
+ a_len = RSTRING(string)->len; \
144
+ b_ptr = amatch->pattern; \
145
+ b_len = amatch->pattern_len; \
146
+ }
147
+
148
+ #define DONT_OPTIMIZE \
149
+ a_ptr = amatch->pattern; \
150
+ a_len = amatch->pattern_len; \
151
+ b_ptr = RSTRING(string)->ptr; \
152
+ b_len = RSTRING(string)->len; \
153
+
154
+ /*
155
+ * C structures of the Amatch classes
156
+ */
157
+
158
+ typedef struct GeneralStruct {
159
+ char *pattern;
160
+ char pattern_len;
161
+ } General;
162
+
163
+ DEF_ALLOCATOR(General)
164
+ DEF_PATTERN_ACCESSOR(General)
165
+ DEF_ITERATE_STRINGS(General)
166
+
167
+ typedef struct SellersStruct {
168
+ char *pattern;
169
+ char pattern_len;
170
+ double substitution;
171
+ double deletion;
172
+ double insertion;
173
+ } Sellers;
174
+
175
+ DEF_ALLOCATOR(Sellers)
176
+ DEF_PATTERN_ACCESSOR(Sellers)
177
+ DEF_ITERATE_STRINGS(Sellers)
178
+
179
+ static void Sellers_reset_weights(Sellers *self)
30
180
  {
31
- int i;
32
- for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
33
- puts("");
181
+ self->substitution = 1.0;
182
+ self->deletion = 1.0;
183
+ self->insertion = 1.0;
34
184
  }
35
185
 
36
- static void
37
- vector_destroy(v)
38
- vector *v;
186
+ typedef struct PairDistanceStruct {
187
+ char *pattern;
188
+ char pattern_len;
189
+ PairArray *pattern_pair_array;
190
+ } PairDistance;
191
+
192
+ DEF_ALLOCATOR(PairDistance)
193
+ DEF_PATTERN_ACCESSOR(PairDistance)
194
+
195
+ /*
196
+ * Levenshtein edit distances are computed here:
197
+ */
198
+
199
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
200
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
201
+ c = i % 2; /* current row */ \
202
+ p = (i + 1) % 2; /* previous row */ \
203
+ v[c][0] = i; /* first column */ \
204
+ for (j = 1; j <= b_len; j++) { \
205
+ /* Bellman's principle of optimality: */ \
206
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
207
+ if (weight > v[p][j] + 1) { \
208
+ weight = v[p][j] + 1; \
209
+ } \
210
+ if (weight > v[c][j - 1] + 1) { \
211
+ weight = v[c][j - 1] + 1; \
212
+ } \
213
+ v[c][j] = weight; \
214
+ } \
215
+ p = c; \
216
+ c = (c + 1) % 2; \
217
+ }
218
+
219
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
39
220
  {
40
- xfree(v->ptr);
41
- xfree(v);
221
+ VALUE result;
222
+ char *a_ptr, *b_ptr;
223
+ int a_len, b_len;
224
+ int *v[2], weight;
225
+ int i, j, c, p;
226
+
227
+ Check_Type(string, T_STRING);
228
+ DONT_OPTIMIZE
229
+
230
+ v[0] = ALLOC_N(int, b_len + 1);
231
+ v[1] = ALLOC_N(int, b_len + 1);
232
+ for (i = 0; i <= b_len; i++) {
233
+ v[0][i] = i;
234
+ v[1][i] = i;
235
+ }
236
+
237
+ COMPUTE_LEVENSHTEIN_DISTANCE
238
+
239
+ result = INT2FIX(v[p][b_len]);
240
+
241
+ free(v[0]);
242
+ free(v[1]);
243
+
244
+ return result;
42
245
  }
43
246
 
44
- static int
45
- vector_minimum(v)
46
- vector *v;
247
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
47
248
  {
48
- int i;
49
- int min;
249
+ VALUE result;
250
+ char *a_ptr, *b_ptr;
251
+ int a_len, b_len;
252
+ int *v[2], weight;
253
+ int i, j, c, p;
50
254
 
51
- if (v->len == 0) return -1;
52
- min = v->ptr[0];
53
- for (i = 1; i <= v->len; i++) {
54
- if (min > v->ptr[i]) min = v->ptr[i];
255
+ Check_Type(string, T_STRING);
256
+ DONT_OPTIMIZE
257
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
258
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
259
+ v[0] = ALLOC_N(int, b_len + 1);
260
+ v[1] = ALLOC_N(int, b_len + 1);
261
+ for (i = 0; i <= b_len; i++) {
262
+ v[0][i] = i;
263
+ v[1][i] = i;
264
+ }
265
+
266
+ COMPUTE_LEVENSHTEIN_DISTANCE
267
+
268
+ if (b_len > a_len) {
269
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
270
+ } else {
271
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
55
272
  }
56
- return min;
273
+ free(v[0]);
274
+ free(v[1]);
275
+ return result;
57
276
  }
58
277
 
59
- static int
60
- vector_last(v)
61
- vector *v;
278
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
62
279
  {
63
- return v->ptr[v->len];
280
+ VALUE result;
281
+ char *a_ptr, *b_ptr;
282
+ int a_len, b_len;
283
+ int *v[2], weight, min;
284
+ int i, j, c, p;
285
+
286
+ Check_Type(string, T_STRING);
287
+ DONT_OPTIMIZE
288
+
289
+ v[0] = ALLOC_N(int, b_len + 1);
290
+ v[1] = ALLOC_N(int, b_len + 1);
291
+ MEMZERO(v[0], int, b_len + 1);
292
+ MEMZERO(v[1], int, b_len + 1);
293
+
294
+ COMPUTE_LEVENSHTEIN_DISTANCE
295
+
296
+ for (i = 0, min = a_len; i <= b_len; i++) {
297
+ if (v[p][i] < min) min = v[p][i];
298
+ }
299
+
300
+ result = INT2FIX(min);
301
+
302
+ free(v[0]);
303
+ free(v[1]);
304
+
305
+ return result;
64
306
  }
65
307
 
308
+
66
309
  /*
67
- * Edit distances are calculated here
310
+ * Sellers edit distances are computed here:
68
311
  */
69
312
 
70
- enum { MATCH = 1, MATCHR, SEARCH, SEARCHR, COMPARE, COMPARER };
313
+ #define COMPUTE_SELLERS_DISTANCE \
314
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
315
+ c = i % 2; /* current row */ \
316
+ p = (i + 1) % 2; /* previous row */ \
317
+ v[c][0] = i * amatch->deletion; /* first column */ \
318
+ for (j = 1; j <= b_len; j++) { \
319
+ /* Bellman's principle of optimality: */ \
320
+ weight = v[p][j - 1] + \
321
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
322
+ if (weight > v[p][j] + amatch->insertion) { \
323
+ weight = v[p][j] + amatch->insertion; \
324
+ } \
325
+ if (weight > v[c][j - 1] + amatch->deletion) { \
326
+ weight = v[c][j - 1] + amatch->deletion; \
327
+ } \
328
+ v[c][j] = weight; \
329
+ } \
330
+ p = c; \
331
+ c = (c + 1) % 2; \
332
+ }
71
333
 
72
- static int weight2int(weight, name)
73
- VALUE weight;
74
- char *name;
334
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
75
335
  {
76
- if (TYPE(weight) != T_FIXNUM) {
77
- rb_raise(rb_eTypeError,
78
- "value of weight %s has to be of type Fixnum (%s given)",
79
- "subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
336
+ VALUE result;
337
+ char *a_ptr, *b_ptr;
338
+ int a_len, b_len;
339
+ double *v[2], weight;
340
+ int i, j, c, p;
341
+
342
+ Check_Type(string, T_STRING);
343
+ DONT_OPTIMIZE
344
+
345
+ v[0] = ALLOC_N(double, b_len + 1);
346
+ v[1] = ALLOC_N(double, b_len + 1);
347
+ for (i = 0; i <= b_len; i++) {
348
+ v[0][i] = i * amatch->deletion;
349
+ v[1][i] = i * amatch->deletion;
80
350
  }
81
- return FIX2INT(weight);
351
+
352
+ COMPUTE_SELLERS_DISTANCE
353
+
354
+ result = rb_float_new(v[p][b_len]);
355
+ free(v[0]);
356
+ free(v[1]);
357
+ return result;
82
358
  }
83
359
 
84
- static VALUE
85
- calculate_distance (self, string, mode)
86
- VALUE self;
87
- VALUE string;
88
- char mode;
360
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
89
361
  {
90
- VALUE pattern, tmp;
91
- static VALUE result;
92
- int pattern_len, string_len;
93
- char *pattern_ptr, *string_ptr;
94
- vector *v[2];
95
- int weight, sw, dw, iw, i, j, tmpi;
96
- int c = 0, p = 1;
362
+ VALUE result;
363
+ char *a_ptr, *b_ptr;
364
+ int a_len, b_len;
365
+ double *v[2], weight, max_weight;
366
+ int i, j, c, p;
97
367
 
368
+ if (amatch->insertion >= amatch->deletion) {
369
+ if (amatch->substitution >= amatch->insertion) {
370
+ max_weight = amatch->substitution;
371
+ } else {
372
+ max_weight = amatch->insertion;
373
+ }
374
+ } else {
375
+ if (amatch->substitution >= amatch->deletion) {
376
+ max_weight = amatch->substitution;
377
+ } else {
378
+ max_weight = amatch->deletion;
379
+ }
380
+ }
381
+
98
382
  Check_Type(string, T_STRING);
99
- string_ptr = RSTRING(string)->ptr;
100
- string_len = RSTRING(string)->len;
383
+ DONT_OPTIMIZE
384
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
385
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
386
+ v[0] = ALLOC_N(double, b_len + 1);
387
+ v[1] = ALLOC_N(double, b_len + 1);
388
+ for (i = 0; i <= b_len; i++) {
389
+ v[0][i] = i * amatch->deletion;
390
+ v[1][i] = i * amatch->deletion;
391
+ }
101
392
 
102
- pattern = rb_iv_get(self, "@pattern");
103
- Check_Type(pattern, T_STRING);
104
- pattern_ptr = RSTRING(pattern)->ptr;
105
- pattern_len = RSTRING(pattern)->len;
393
+ COMPUTE_SELLERS_DISTANCE
106
394
 
107
- sw = weight2int(rb_iv_get(self, "@subw"), "subw");
108
- dw = weight2int(rb_iv_get(self, "@delw"), "delw");
109
- iw = weight2int(rb_iv_get(self, "@insw"), "insw");
110
-
111
- v[0] = vector_new(string_len);
112
- switch (mode) {
113
- case MATCH:
114
- case MATCHR:
115
- case COMPARE:
116
- case COMPARER:
117
- for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
118
- break;
119
- case SEARCH:
120
- case SEARCHR:
121
- for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
122
- break;
123
- default:
124
- rb_raise(rb_eFatal, "unknown mode in calculate_distance");
395
+ if (b_len > a_len) {
396
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
397
+ } else {
398
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
125
399
  }
400
+ free(v[0]);
401
+ free(v[1]);
402
+ return result;
403
+ }
404
+
405
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
406
+ {
407
+ VALUE result;
408
+ char *a_ptr, *b_ptr;
409
+ int a_len, b_len;
410
+ double *v[2], weight, min;
411
+ int i, j, c, p;
412
+
413
+ Check_Type(string, T_STRING);
414
+ DONT_OPTIMIZE
415
+
416
+ v[0] = ALLOC_N(double, b_len + 1);
417
+ v[1] = ALLOC_N(double, b_len + 1);
418
+ MEMZERO(v[0], double, b_len + 1);
419
+ MEMZERO(v[1], double, b_len + 1);
420
+
421
+ COMPUTE_SELLERS_DISTANCE
422
+
423
+ for (i = 0, min = a_len; i <= b_len; i++) {
424
+ if (v[p][i] < min) min = v[p][i];
425
+ }
426
+ result = rb_float_new(min);
427
+ free(v[0]);
428
+ free(v[1]);
429
+
430
+ return result;
431
+ }
432
+
433
+ /*
434
+ * Pair distances are computed here:
435
+ */
126
436
 
127
- v[1] = vector_new(string_len);
128
- for (i = 1; i <= pattern_len; i++) {
129
- c = i % 2; /* current row */
130
- p = (i - 1) % 2; /* previous row */
131
- v[c]->ptr[0] = i * dw; /* first column */
132
- for (j = 1; j <= string_len; j++) {
133
- /* Bellman's principle of optimality: */
134
- weight = v[p]->ptr[j - 1] +
135
- (pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
136
- if (weight > v[p]->ptr[j] + iw) weight = v[p]->ptr[j] + iw;
137
- if (weight > v[c]->ptr[j - 1] + dw) weight = v[c]->ptr[j - 1] + dw;
138
- v[c]->ptr[j] = weight;
437
+ static VALUE PairDistance_match(
438
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
439
+ {
440
+ double result;
441
+ VALUE tokens;
442
+ PairArray *pair_array;
443
+
444
+ Check_Type(string, T_STRING);
445
+ if (!NIL_P(regexp) || use_regexp) {
446
+ tokens = rb_funcall(
447
+ rb_str_new(amatch->pattern, amatch->pattern_len),
448
+ id_split, 1, regexp
449
+ );
450
+ if (!amatch->pattern_pair_array) {
451
+ amatch->pattern_pair_array = PairArray_new(tokens);
452
+ } else {
453
+ pair_array_reactivate(amatch->pattern_pair_array);
454
+ }
455
+ tokens = rb_funcall(string, id_split, 1, regexp);
456
+ pair_array = PairArray_new(tokens);
457
+ } else {
458
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
459
+ tokens = rb_ary_new4(1, &tmp);
460
+ if (!amatch->pattern_pair_array) {
461
+ amatch->pattern_pair_array = PairArray_new(tokens);
462
+ } else {
463
+ pair_array_reactivate(amatch->pattern_pair_array);
139
464
  }
465
+ tokens = rb_ary_new4(1, &string);
466
+ pair_array = PairArray_new(tokens);
140
467
  }
141
- switch (mode) {
142
- case MATCH:
143
- result = INT2FIX(vector_last(v[c]));
144
- break;
145
- case MATCHR:
146
- result = rb_float_new((double) vector_last(v[c]) / pattern_len);
147
- break;
148
- case SEARCH:
149
- tmpi = vector_minimum(v[c]);
150
- result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
151
- break;
152
- case SEARCHR:
153
- tmpi = vector_minimum(v[c]);
154
- result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
155
- break;
156
- case COMPARE:
157
- result = INT2FIX((string_len < pattern_len ? -1 : 1) *
158
- vector_last(v[c]));
159
- break;
160
- case COMPARER:
161
- result = rb_float_new((double)
162
- (string_len < pattern_len ? -1 : 1) *
163
- vector_last(v[c]) / pattern_len);
164
- break;
165
- default:
166
- rb_raise(rb_eFatal, "unknown mode in calculate_distance");
468
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
469
+ pair_array_destroy(pair_array);
470
+ return rb_float_new(result);
471
+ }
472
+
473
+ /*
474
+ * Hamming distances are computed here:
475
+ */
476
+
477
+ #define COMPUTE_HAMMING_DISTANCE \
478
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
479
+ if (i >= b_len) { \
480
+ result += a_len - b_len; \
481
+ break; \
482
+ } \
483
+ if (b_ptr[i] != a_ptr[i]) result++; \
167
484
  }
168
- vector_destroy(v[0]);
169
- vector_destroy(v[1]);
170
- return result;
485
+
486
+ static VALUE Hamming_match(General *amatch, VALUE string)
487
+ {
488
+ char *a_ptr, *b_ptr;
489
+ int a_len, b_len;
490
+ int i, result;
491
+
492
+ Check_Type(string, T_STRING);
493
+ OPTIMIZE_TIME
494
+ COMPUTE_HAMMING_DISTANCE
495
+ return INT2FIX(result);
496
+ }
497
+
498
+ static VALUE Hamming_similar(General *amatch, VALUE string)
499
+ {
500
+ char *a_ptr, *b_ptr;
501
+ int a_len, b_len;
502
+ int i, result;
503
+
504
+ Check_Type(string, T_STRING);
505
+ OPTIMIZE_TIME
506
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
507
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
508
+ COMPUTE_HAMMING_DISTANCE
509
+ return rb_float_new(1.0 - ((double) result) / b_len);
510
+ }
511
+
512
+ /*
513
+ * Longest Common Subsequence computation
514
+ */
515
+
516
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
517
+ l[0] = ALLOC_N(int, b_len + 1); \
518
+ l[1] = ALLOC_N(int, b_len + 1); \
519
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
520
+ for (j = b_len; j >= 0; j--) { \
521
+ if (i == a_len || j == b_len) { \
522
+ l[c][j] = 0; \
523
+ } else if (a_ptr[i] == b_ptr[j]) { \
524
+ l[c][j] = 1 + l[p][j + 1]; \
525
+ } else { \
526
+ int x = l[p][j], y = l[c][j + 1]; \
527
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
528
+ } \
529
+ } \
530
+ p = c; \
531
+ c = (c + 1) % 2; \
532
+ } \
533
+ result = l[p][0]; \
534
+ free(l[0]); \
535
+ free(l[1]);
536
+
537
+
538
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
539
+ {
540
+ char *a_ptr, *b_ptr;
541
+ int a_len, b_len;
542
+ int result, c, p, i, j, *l[2];
543
+
544
+ Check_Type(string, T_STRING);
545
+ OPTIMIZE_TIME
546
+
547
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
548
+ COMPUTE_LONGEST_SUBSEQUENCE
549
+ return INT2FIX(result);
550
+ }
551
+
552
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
553
+ {
554
+ char *a_ptr, *b_ptr;
555
+ int a_len, b_len;
556
+ int result, c, p, i, j, *l[2];
557
+
558
+ Check_Type(string, T_STRING);
559
+ OPTIMIZE_TIME
560
+
561
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
562
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
563
+ COMPUTE_LONGEST_SUBSEQUENCE
564
+ return rb_float_new(((double) result) / b_len);
565
+ }
566
+
567
+ /*
568
+ * Longest Common Substring computation
569
+ */
570
+
571
+ #define COMPUTE_LONGEST_SUBSTRING \
572
+ l[0] = ALLOC_N(int, b_len); \
573
+ MEMZERO(l[0], int, b_len); \
574
+ l[1] = ALLOC_N(int, b_len); \
575
+ MEMZERO(l[1], int, b_len); \
576
+ result = 0; \
577
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
578
+ for (j = 0; j < b_len; j++) { \
579
+ if (a_ptr[i] == b_ptr[j]) { \
580
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
581
+ if (l[c][j] > result) result = l[c][j]; \
582
+ } else { \
583
+ l[c][j] = 0; \
584
+ } \
585
+ } \
586
+ p = c; \
587
+ c = (c + 1) % 2; \
588
+ } \
589
+ free(l[0]); \
590
+ free(l[1]);
591
+
592
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
593
+ {
594
+ char *a_ptr, *b_ptr;
595
+ int a_len, b_len;
596
+ int result, c, p, i, j, *l[2];
597
+
598
+ Check_Type(string, T_STRING);
599
+ OPTIMIZE_TIME
600
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
601
+ COMPUTE_LONGEST_SUBSTRING
602
+ return INT2FIX(result);
603
+ }
604
+
605
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
606
+ {
607
+ char *a_ptr, *b_ptr;
608
+ int a_len, b_len;
609
+ int result, c, p, i, j, *l[2];
610
+
611
+ Check_Type(string, T_STRING);
612
+ OPTIMIZE_TIME
613
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
614
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
615
+ COMPUTE_LONGEST_SUBSTRING
616
+ return rb_float_new(((double) result) / b_len);
617
+ }
618
+
619
+ /*
620
+ * Ruby API
621
+ */
622
+
623
+ /*
624
+ * Document-class: Amatch::Levenshtein
625
+ *
626
+ * The Levenshtein edit distance is defined as the minimal costs involved to
627
+ * transform one string into another by using three elementary operations:
628
+ * deletion, insertion and substitution of a character. To transform "water"
629
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
630
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
631
+ * and "wine" is 3, because you have to apply three operations. The edit
632
+ * distance between "wine" and "wine" is 0 of course: no operation is
633
+ * necessary for the transformation -- they're already the same string. It's
634
+ * easy to see that more similar strings have smaller edit distances than
635
+ * strings that differ a lot.
636
+ */
637
+
638
+ DEF_RB_FREE(Levenshtein, General)
639
+
640
+ /*
641
+ * call-seq: new(pattern)
642
+ *
643
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
644
+ */
645
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
646
+ {
647
+ GET_STRUCT(General)
648
+ General_pattern_set(amatch, pattern);
649
+ return self;
650
+ }
651
+
652
+ DEF_CONSTRUCTOR(Levenshtein, General)
653
+
654
+ /*
655
+ * call-seq: match(strings) -> results
656
+ *
657
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
658
+ * against <code>strings</code>. It returns the number of operations, the
659
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
660
+ * Array of Strings. The returned <code>results</code> are either a Fixnum or
661
+ * an Array of Fixnums respectively.
662
+ */
663
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
664
+ {
665
+ GET_STRUCT(General)
666
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
667
+ }
668
+
669
+ /*
670
+ * call-seq: similar(strings) -> results
671
+ *
672
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
673
+ * against <code>strings</code>, and compute a Levenshtein distance metric
674
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
675
+ * <code>strings</code> has to be either a String or an Array of Strings. The
676
+ * returned <code>results</code> are either a Float or an Array of Floats
677
+ * respectively.
678
+ */
679
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
680
+ {
681
+ GET_STRUCT(General)
682
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
683
+ }
684
+
685
+ /*
686
+ * call-seq: levenshtein_similar(strings) -> results
687
+ *
688
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
689
+ * to match against <code>strings</code>. It returns a Levenshtein distance
690
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
691
+ * match. <code>strings</code> has to be either a String or an Array of
692
+ * Strings. The returned <code>results</code> are either a Float or an Array of
693
+ * Floats respectively.
694
+ */
695
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
696
+ {
697
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self); /* temporary matcher with self as pattern; wrap it in its own class, not Sellers */
698
+ return rb_Levenshtein_similar(amatch, strings);
699
+ }
700
+
701
+ /*
702
+ * call-seq: search(strings) -> results
703
+ *
704
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
705
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
706
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
707
+ * to be either a String or an Array of Strings. The returned
708
+ * <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
709
+ */
710
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
711
+ {
712
+ GET_STRUCT(General)
713
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
714
+ }
715
+
716
+ /*
717
+ * Document-class: Amatch::Sellers
718
+ *
719
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
720
+ * The difference is, that you can also specify different weights for every
721
+ * operation to prefer special operations over others. This extension of the
722
+ * Levenshtein edit distance is also known under the name Needleman-Wunsch
723
+ * distance.
724
+ */
725
+
726
+ DEF_RB_FREE(Sellers, Sellers)
727
+
728
+ /*
729
+ * Document-method: substitution
730
+ *
731
+ * call-seq: substitution -> weight
732
+ *
733
+ * Returns the weight of the substitution operation, that is used to compute
734
+ * the Sellers distance.
735
+ */
736
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
737
+ rb_float_new)
738
+
739
+ /*
740
+ * Document-method: deletion
741
+ *
742
+ * call-seq: deletion -> weight
743
+ *
744
+ * Returns the weight of the deletion operation, that is used to compute
745
+ * the Sellers distance.
746
+ */
747
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
748
+ rb_float_new)
749
+
750
+ /*
751
+ * Document-method: insertion
752
+ *
753
+ * call-seq: insertion -> weight
754
+ *
755
+ * Returns the weight of the insertion operation, that is used to compute
756
+ * the Sellers distance.
757
+ */
758
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
759
+ rb_float_new)
760
+
761
+ /*
762
+ * Document-method: substitution=
763
+ *
764
+ * call-seq: substitution=(weight)
765
+ *
766
+ * Sets the weight of the substitution operation, that is used to compute
767
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
768
+ * should be a Float value >= 0.0.
769
+ */
770
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
771
+ double, CAST2FLOAT, FLOAT2C, >= 0)
772
+
773
+ /*
774
+ * Document-method: deletion=
775
+ *
776
+ * call-seq: deletion=(weight)
777
+ *
778
+ * Sets the weight of the deletion operation, that is used to compute
779
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
780
+ * should be a Float value >= 0.0.
781
+ */
782
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
783
+ double, CAST2FLOAT, FLOAT2C, >= 0)
784
+
785
+ /*
786
+ * Document-method: insertion=
787
+ *
788
+ * call-seq: insertion=(weight)
789
+ *
790
+ * Sets the weight of the insertion operation, that is used to compute
791
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
792
+ * should be a Float value >= 0.0.
793
+ */
794
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
795
+ double, CAST2FLOAT, FLOAT2C, >= 0)
796
+
797
+ /*
798
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
799
+ */
800
+ static VALUE rb_Sellers_reset_weights(VALUE self)
801
+ {
802
+ GET_STRUCT(Sellers)
803
+ Sellers_reset_weights(amatch);
804
+ return self;
171
805
  }
172
806
 
173
- static VALUE
174
- handle_strings(self, strings, mode)
175
- VALUE self;
176
- VALUE strings;
177
- char mode;
807
+ /*
808
+ * call-seq: new(pattern)
809
+ *
810
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
811
+ * with all weights initially set to 1.0.
812
+ */
813
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
178
814
  {
179
- if (TYPE(strings) == T_ARRAY) {
815
+ GET_STRUCT(Sellers)
816
+ Sellers_pattern_set(amatch, pattern);
817
+ Sellers_reset_weights(amatch);
818
+ return self;
819
+ }
820
+
821
+ DEF_CONSTRUCTOR(Sellers, Sellers)
822
+
823
+ /*
824
+ * Document-method: pattern
825
+ *
826
+ * call-seq: pattern -> pattern string
827
+ *
828
+ * Returns the current pattern string of this Amatch::Sellers instance.
829
+ */
830
+
831
+ /*
832
+ * Document-method: pattern=
833
+ *
834
+ * call-seq: pattern=(pattern)
835
+ *
836
+ * Sets the current pattern string of this Amatch::Sellers instance to
837
+ * <code>pattern</code>.
838
+ */
839
+
840
+ /*
841
+ * call-seq: match(strings) -> results
842
+ *
843
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
844
+ * <code>strings</code>, while taking into account the given weights. It
845
+ * returns the number of weighted character operations, the Sellers distance.
846
+ * <code>strings</code> has to be either a String or an Array of Strings. The
847
+ * returned <code>results</code> are either a Float or an Array of Floats
848
+ * respectively.
849
+ */
850
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
851
+ {
852
+ GET_STRUCT(Sellers)
853
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
854
+ }
855
+
856
+ /*
857
+ * call-seq: similar(strings) -> results
858
+ *
859
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
860
+ * against <code>strings</code> (taking into account the given weights), and
861
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
862
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
863
+ * String or an Array of Strings. The returned <code>results</code> are either
864
+ * a Float or an Array of Floats
865
+ * respectively.
866
+ */
867
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
868
+ {
869
+ GET_STRUCT(Sellers)
870
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
871
+ }
872
+
873
+ /*
874
+ * call-seq: search(strings) -> results
875
+ *
876
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
877
+ * distance (the sum of weighted character operations) as a Float value, by
878
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
879
+ * to be either a String or an Array of Strings. The returned
880
+ * <code>results</code> are either a Float or an Array of Floats respectively.
881
+ */
882
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
883
+ {
884
+ GET_STRUCT(Sellers)
885
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
886
+ }
887
+
888
+ /*
889
+ * Document-class: Amatch::PairDistance
890
+ *
891
+ * The pair distance between two strings is based on the number of adjacent
892
+ * character pairs, that are contained in both strings. The similarity
893
+ * metric of two strings s1 and s2 is
894
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
895
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
896
+ * are more dissimilar. The advantage of considering adjacent characters, is to
897
+ * take account not only of the characters, but also of the character ordering
898
+ * in the original strings.
899
+ *
900
+ * This metric is well suited to finding similarities in natural languages.
901
+ * It is explained in more detail in Simon White's article "How to Strike a
902
+ * Match", located at this url:
903
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
904
+ * It is also very similar (a special case) to the method described under
905
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
906
+ * for Approximate String Processing."
907
+ */
908
+ DEF_RB_FREE(PairDistance, PairDistance)
909
+
910
+ /*
911
+ * call-seq: new(pattern)
912
+ *
913
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
914
+ */
915
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
916
+ {
917
+ GET_STRUCT(PairDistance)
918
+ PairDistance_pattern_set(amatch, pattern);
919
+ return self;
920
+ }
921
+
922
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
923
+
924
+ /*
925
+ * call-seq: match(strings, regexp = /\s+/) -> results
926
+ *
927
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
928
+ * <code>strings</code>. It returns the pair distance measure, that is a
929
+ * returned value of 1.0 is an exact match, partial matches are lower
930
+ * values, while 0.0 means no match at all.
931
+ *
932
+ * <code>strings</code> has to be either a String or an
933
+ * Array of Strings. The argument <code>regexp</code> is used to split the
934
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
935
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
936
+ * explicitly.
937
+ *
938
+ * The returned <code>results</code> are either a Float or an
939
+ * Array of Floats respectively.
940
+ */
941
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
942
+ {
943
+ VALUE result, strings, regexp = Qnil;
944
+ int use_regexp;
945
+ GET_STRUCT(PairDistance)
946
+
947
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
948
+ use_regexp = NIL_P(regexp) && argc != 2;
949
+ if (TYPE(strings) == T_STRING) {
950
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
951
+ } else {
952
+ Check_Type(strings, T_ARRAY);
180
953
  int i;
181
- VALUE result = rb_ary_new2(RARRAY(strings)->len);
954
+ result = rb_ary_new2(RARRAY(strings)->len);
182
955
  for (i = 0; i < RARRAY(strings)->len; i++) {
183
956
  VALUE string = rb_ary_entry(strings, i);
184
957
  if (TYPE(string) != T_STRING) {
185
958
  rb_raise(rb_eTypeError,
186
959
  "array has to contain only strings (%s given)",
187
- NIL_P(string) ? "NilClass" :
188
- rb_class2name(CLASS_OF(string)));
960
+ NIL_P(string) ?
961
+ "NilClass" :
962
+ rb_class2name(CLASS_OF(string)));
189
963
  }
190
- rb_ary_push(result, calculate_distance(self, string, mode));
964
+ rb_ary_push(result,
965
+ PairDistance_match(amatch, string, regexp, use_regexp));
191
966
  }
192
- return result;
193
- } else if (TYPE(strings) == T_STRING) {
194
- return calculate_distance(self, strings, mode);
195
- } else {
196
- rb_raise(rb_eTypeError,
197
- "value of strings needs to be string or array (%s given)",
198
- NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
199
967
  }
968
+ pair_array_destroy(amatch->pattern_pair_array);
969
+ amatch->pattern_pair_array = NULL;
970
+ return result;
200
971
  }
201
972
 
202
973
  /*
203
- * Ruby API
974
+ * call-seq: pair_distance_similar(strings) -> results
975
+ *
976
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
977
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
978
+ * expression. It returns a pair distance metric number between 0.0 for very
979
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
980
+ * either a String or an Array of Strings. The returned <code>results</code>
981
+ * are either a Float or an Array of Floats respectively.
204
982
  */
205
-
206
- static VALUE
207
- rb_amatch_resetw(self)
208
- VALUE self;
983
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
209
984
  {
210
- rb_iv_set(self, "@subw", INT2FIX(1));
211
- rb_iv_set(self, "@delw", INT2FIX(1));
212
- rb_iv_set(self, "@insw", INT2FIX(1));
213
-
214
- return Qtrue;
985
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self); /* temporary matcher with self as pattern; wrap it in its own class, not Sellers */
986
+ return rb_PairDistance_match(1, &strings, amatch); /* argc == 1: no regexp argument is passed on */
215
987
  }
216
988
 
217
- static VALUE
218
- rb_amatch_initialize(self, pattern)
219
- VALUE self;
220
- VALUE pattern;
221
- {
989
+ /*
990
+ * Document-class: Amatch::Hamming
991
+ *
992
+ * This class computes the Hamming distance between two strings.
993
+ *
994
+ * The Hamming distance between two strings is the number of characters, that
995
+ * are different. Thus a hamming distance of 0 means an exact
996
+ * match, a hamming distance of 1 means one character is different, and so on.
997
+ * If one string is longer than the other string, the missing characters are
998
+ * counted as different characters.
999
+ */
1000
+
1001
+ DEF_RB_FREE(Hamming, General)
222
1002
 
223
- Check_Type(pattern, T_STRING);
224
- rb_iv_set(self, "@pattern", pattern);
225
- rb_amatch_resetw(self);
1003
+ /*
1004
+ * call-seq: new(pattern)
1005
+ *
1006
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1007
+ */
1008
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1009
+ {
1010
+ GET_STRUCT(General)
1011
+ General_pattern_set(amatch, pattern);
226
1012
  return self;
227
1013
  }
228
1014
 
229
- static VALUE
230
- rb_amatch_pattern_is(self, pattern)
231
- VALUE self;
232
- VALUE pattern;
233
- {
234
- Check_Type(pattern, T_STRING);
235
- rb_iv_set(self, "@pattern", pattern);
1015
+ DEF_CONSTRUCTOR(Hamming, General)
236
1016
 
237
- return pattern;
1017
+ /*
1018
+ * call-seq: match(strings) -> results
1019
+ *
1020
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1021
+ * <code>strings</code>, that is compute the hamming distance between
1022
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1023
+ * be either a String or an Array of Strings. The returned <code>results</code>
1024
+ * are either a Fixnum or an Array of Fixnums respectively.
1025
+ */
1026
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1027
+ {
1028
+ GET_STRUCT(General)
1029
+ return General_iterate_strings(amatch, strings, Hamming_match);
238
1030
  }
239
1031
 
1032
+ /*
1033
+ * call-seq: similar(strings) -> results
1034
+ *
1035
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1036
+ * <code>strings</code>, and compute a Hamming distance metric number between
1037
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1038
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1039
+ * returned <code>results</code> are either a Float or an Array of Floats
1040
+ * respectively.
1041
+ */
1042
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1043
+ {
1044
+ GET_STRUCT(General)
1045
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1046
+ }
240
1047
 
241
- static VALUE
242
- rb_amatch_match(self, strings)
243
- VALUE self;
244
- VALUE strings;
1048
+ /*
1049
+ * call-seq: hamming_similar(strings) -> results
1050
+ *
1051
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1052
+ * match against <code>strings</code>. It returns a Hamming distance metric
1053
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1054
+ * <code>strings</code>
1055
+ * has to be either a String or an Array of Strings. The returned
1056
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1057
+ */
1058
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
245
1059
  {
246
- return handle_strings(self, strings, MATCH);
1060
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1061
+ return rb_Hamming_similar(amatch, strings);
247
1062
  }
248
1063
 
249
- static VALUE
250
- rb_amatch_matchr(self, strings)
251
- VALUE self;
252
- VALUE strings;
1064
+
1065
+ /*
1066
+ * Document-class: Amatch::LongestSubsequence
1067
+ *
1068
+ * This class computes the length of the longest subsequence common to two
1069
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1070
+ * subsequence is, the more similar the two strings will be.
1071
+ *
1072
+ * The longest common subsequence between "test" and "test" is of length 4,
1073
+ * because "test" itself is this subsequence. The longest common subsequence
1074
+ * between "test" and "east" is "e", "s", "t" and the length of the
1075
+ * sequence is 3.
1076
+ */
1077
+ DEF_RB_FREE(LongestSubsequence, General)
1078
+
1079
+ /*
1080
+ * call-seq: new(pattern)
1081
+ *
1082
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1083
+ */
1084
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
253
1085
  {
254
- return handle_strings(self, strings, MATCHR);
1086
+ GET_STRUCT(General)
1087
+ General_pattern_set(amatch, pattern);
1088
+ return self;
255
1089
  }
256
1090
 
257
- static VALUE
258
- rb_amatch_compare(self, strings)
259
- VALUE self;
260
- VALUE strings;
261
- {
262
- return handle_strings(self, strings, COMPARE);
1091
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1092
+
1093
+ /*
1094
+ * call-seq: match(strings) -> results
1095
+ *
1096
+ * Uses this Amatch::LongestSubsequence instance to match
1097
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1098
+ * length of the longest common subsequence. <code>strings</code> has to be
1099
+ * either a String or an Array of Strings. The returned <code>results</code>
1100
+ * are either a Fixnum or an Array of Fixnums respectively.
1101
+ */
1102
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1103
+ {
1104
+ GET_STRUCT(General)
1105
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
263
1106
  }
264
1107
 
265
- static VALUE
266
- rb_amatch_comparer(self, strings)
267
- VALUE self;
268
- VALUE strings;
1108
+ /*
1109
+ * call-seq: similar(strings) -> results
1110
+ *
1111
+ * Uses this Amatch::LongestSubsequence instance to match
1112
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1113
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1114
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1115
+ * String or an Array of Strings. The returned <code>results</code> are either
1116
+ * a Float or an Array of Floats respectively.
1117
+ */
1118
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1119
+ {
1120
+ GET_STRUCT(General)
1121
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: longest_subsequence_similar(strings) -> results
1126
+ *
1127
+ * If called on a String, this string is used as a
1128
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1129
+ * returns a longest subsequence distance metric number between 0.0 for very
1130
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1131
+ * either a String or an Array of Strings. The returned <code>results</code>
1132
+ * are either a Float or an Array of Floats respectively.
1133
+ */
1134
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1135
+ {
1136
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self); /* temporary matcher with self as pattern; wrap it in its own class, not Sellers */
1137
+ return rb_LongestSubsequence_similar(amatch, strings);
1138
+ }
1139
+
1140
+ /*
1141
+ * Document-class: Amatch::LongestSubstring
1142
+ *
1143
+ * The longest common substring is the longest substring, that is part of
1144
+ * two strings. A substring is contiguous, while a subsequence need not
1145
+ * be. The longer the common substring is, the more similar the two strings
1146
+ * will be.
1147
+ *
1148
+ * The longest common substring between 'string' and 'string' is 'string'
1149
+ * again, thus the longest common substring length is 6. The longest common
1150
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1151
+ * substring length is 4.
1152
+ */
1153
+
1154
+ DEF_RB_FREE(LongestSubstring, General)
1155
+
1156
+ /*
1157
+ * call-seq: new(pattern)
1158
+ *
1159
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1160
+ */
1161
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
269
1162
  {
270
- return handle_strings(self, strings, COMPARER);
1163
+ GET_STRUCT(General)
1164
+ General_pattern_set(amatch, pattern);
1165
+ return self;
271
1166
  }
272
1167
 
1168
+ DEF_CONSTRUCTOR(LongestSubstring, General)
273
1169
 
274
- static VALUE
275
- rb_amatch_search(self, strings)
276
- VALUE self;
277
- VALUE strings;
1170
+ /*
1171
+ * call-seq: match(strings) -> results
1172
+ *
1173
+ * Uses this Amatch::LongestSubstring instance to match
1174
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1175
+ * length of the longest common substring. <code>strings</code> has to be
1176
+ * either a String or an Array of Strings. The returned <code>results</code>
1177
+ * are either a Fixnum or an Array of Fixnums respectively.
1178
+ */
1179
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
278
1180
  {
279
- return handle_strings(self, strings, SEARCH);
1181
+ GET_STRUCT(General)
1182
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
280
1183
  }
281
1184
 
282
- static VALUE
283
- rb_amatch_searchr(self, strings)
284
- VALUE self;
285
- VALUE strings;
1185
+ /*
1186
+ * call-seq: similar(strings) -> results
1187
+ *
1188
+ * Uses this Amatch::LongestSubstring instance to match
1189
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1190
+ * longest substring distance metric number between 0.0 for very unsimilar
1191
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1192
+ * String or an Array of Strings. The returned <code>results</code> are either
1193
+ * a Float or an Array of Floats
1194
+ * respectively.
1195
+ */
1196
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
286
1197
  {
287
- return handle_strings(self, strings, SEARCHR);
1198
+ GET_STRUCT(General)
1199
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
288
1200
  }
289
1201
 
290
- void
291
- Init_amatch()
1202
+ /*
1203
+ * call-seq: longest_substring_similar(strings) -> results
1204
+ *
1205
+ * If called on a String, this string is used as a
1206
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1207
+ * returns a longest substring distance metric number between 0.0 for very
1208
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1209
+ * either a String or an Array of Strings. The returned <code>results</code>
1210
+ * are either a Float or an Array of Floats respectively.
1211
+ */
1212
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1213
+ {
1214
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self); /* temporary matcher with self as pattern; use the LongestSubstring constructor and class */
1215
+ return rb_LongestSubstring_similar(amatch, strings);
1216
+ }
1217
+
1218
+ /*
1219
+ * = amatch - Approximate Matching Extension for Ruby
1220
+ *
1221
+ * == Description
1222
+ *
1223
+ * This is a collection of classes that can be used for Approximate
1224
+ * matching, searching, and comparing of Strings. They implement algorithms
1225
+ * that compute the Levenshtein edit distance, Sellers edit distance, the
1226
+ * Hamming distance, the longest common subsequence length, the longest common
1227
+ * substring length, and the pair distance metric.
1228
+ *
1229
+ * == Author
1230
+ *
1231
+ * Florian Frank mailto:flori@ping.de
1232
+ *
1233
+ * == License
1234
+ *
1235
+ * This is free software; you can redistribute it and/or modify it under
1236
+ * the terms of the GNU General Public License Version 2 as published by
1237
+ * the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
1238
+ *
1239
+ * == Download
1240
+ *
1241
+ * The latest version of <b>amatch</b> can be found at
1242
+ *
1243
+ * * http://rubyforge.org/frs/?group_id=390
1244
+ *
1245
+ * Online Documentation should be located at
1246
+ *
1247
+ * * http://amatch.rubyforge.org
1248
+ *
1249
+ * == Examples
1250
+ * require 'amatch'
1251
+ * # => true
1252
+ * include Amatch
1253
+ * # => Object
1254
+ *
1255
+ * m = Sellers.new("pattern")
1256
+ * # => #<Amatch::Sellers:0x40366324>
1257
+ * m.match("pattren")
1258
+ * # => 2.0
1259
+ * m.substitution = m.insertion = 3
1260
+ * # => 3
1261
+ * m.match("pattren")
1262
+ * # => 4.0
1263
+ * m.reset_weights
1264
+ * # => #<Amatch::Sellers:0x40366324>
1265
+ * m.match(["pattren","parent"])
1266
+ * # => [2.0, 4.0]
1267
+ * m.search("abcpattrendef")
1268
+ * # => 2.0
1269
+ *
1270
+ * m = Levenshtein.new("pattern")
1271
+ * # => #<Amatch::Levenshtein:0x4035919c>
1272
+ * m.match("pattren")
1273
+ * # => 2
1274
+ * m.search("abcpattrendef")
1275
+ * # => 2
1276
+ * "pattern language".levenshtein_similar("language of patterns")
1277
+ * # => 0.2
1278
+ *
1279
+ * m = Hamming.new("pattern")
1280
+ * # => #<Amatch::Hamming:0x40350858>
1281
+ * m.match("pattren")
1282
+ * # => 2
1283
+ * "pattern language".hamming_similar("language of patterns")
1284
+ * # => 0.1
1285
+ *
1286
+ * m = PairDistance.new("pattern")
1287
+ * # => #<Amatch::PairDistance:0x40349be8>
1288
+ * m.match("pattr en")
1289
+ * # => 0.545454545454545
1290
+ * m.match("pattr en", nil)
1291
+ * # => 0.461538461538462
1292
+ * m.match("pattr en", /t+/)
1293
+ * # => 0.285714285714286
1294
+ * "pattern language".pair_distance_similar("language of patterns")
1295
+ * # => 0.928571428571429
1296
+ *
1297
+ * m = LongestSubsequence.new("pattern")
1298
+ * # => #<Amatch::LongestSubsequence:0x4033e900>
1299
+ * m.match("pattren")
1300
+ * # => 6
1301
+ * "pattern language".longest_subsequence_similar("language of patterns")
1302
+ * # => 0.4
1303
+ *
1304
+ * m = LongestSubstring.new("pattern")
1305
+ * # => #<Amatch::LongestSubstring:0x403378d0>
1306
+ * m.match("pattren")
1307
+ * # => 4
1308
+ * "pattern language".longest_substring_similar("language of patterns")
1309
+ * # => 0.4
1310
+ *
1311
+ */
1312
+
1313
+ void Init_amatch()
292
1314
  {
293
- cAmatch = rb_define_class("Amatch", rb_cObject);
294
- rb_define_method(cAmatch, "initialize", rb_amatch_initialize, 1);
295
-
296
- rb_define_attr(cAmatch, "debug", 1, 1);
297
- rb_define_attr(cAmatch, "subw", 1, 1);
298
- rb_define_attr(cAmatch, "delw", 1, 1);
299
- rb_define_attr(cAmatch, "insw", 1, 1);
300
- rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
301
-
302
- rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
303
- rb_define_attr(cAmatch, "pattern", 1, 0);
304
-
305
- rb_define_method(cAmatch, "match", rb_amatch_match, 1);
306
- rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
307
- rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
308
- rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
309
- rb_define_method(cAmatch, "search", rb_amatch_search, 1);
310
- rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
1315
+ rb_mAmatch = rb_define_module("Amatch");
1316
+
1317
+ /* Levenshtein */
1318
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1319
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1320
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1321
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1322
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1323
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1324
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1325
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1326
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1327
+
1328
+ /* Sellers */
1329
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1330
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1331
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1332
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1333
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1334
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1335
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1336
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1337
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1338
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1339
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1340
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1341
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1342
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1343
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1344
+
1345
+ /* Hamming */
1346
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1347
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1348
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1349
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1350
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1351
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1352
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1353
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1354
+
1355
+ /* Pair Distance Metric */
1356
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1357
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1358
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1359
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1360
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1361
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1362
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1363
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1364
+
1365
+ /* Longest Common Subsequence */
1366
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1367
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1368
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1369
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1370
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1371
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1372
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1373
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1374
+
1375
+ /* Longest Common Substring */
1376
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1377
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1378
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1379
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1380
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1381
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1382
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1383
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1384
+
1385
+ id_split = rb_intern("split");
1386
+ id_to_f = rb_intern("to_f");
311
1387
  }
312
1388
  /* vim: set et cin sw=4 ts=4: */