amatch 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,5 +1,8 @@
1
- 2005-01-20 (0.1.5)
2
- * Bugfix for insertion/deletion weights. Sorry.
1
+ 2005-06-01 (0.2.0)
2
+ * Major changes in API and implementation:
3
+ Now the Levenshtein edit distance, Sellers edit distance, the Hamming
4
+ distance, the longest common subsequence length, the longest common
5
+ substring length, and the pair distance metric can be computed.
3
6
  2005-01-20 (0.1.4)
4
7
  * Better argument handling in initialization method
5
8
  * Minor changes in Rakefile and README.en
@@ -0,0 +1,5 @@
1
+ /usr/local/stow/ruby/bin/
2
+ /usr/local/stow/ruby/bin/CVS
3
+ /usr/local/stow/ruby/bin/CVS
4
+ /usr/local/stow/ruby/bin/CVS
5
+ /usr/local/stow/ruby/lib/ruby/site_ruby/1.8/i686-linux/.
data/README.en CHANGED
@@ -27,5 +27,5 @@ Florian Frank <flori@ping.de>
27
27
  License
28
28
  =======
29
29
 
30
- GNU General Public License (GPL)
30
+ GNU General Public License, Version 2 (GPLv2)
31
31
 
data/Rakefile CHANGED
@@ -3,101 +3,110 @@
3
3
  require 'rake/clean'
4
4
  require 'rake/testtask'
5
5
  require 'rake/gempackagetask'
6
+ require 'rake/rdoctask'
6
7
  require 'rbconfig'
7
8
 
8
9
  include Config
9
10
 
10
- PKG_NAME = 'amatch'
11
11
  PKG_VERSION = File.read('VERSION').chomp
12
- PKG_FILES = Dir.glob("**/*").delete_if {|item|
13
- item.include?("CVS") or item.include?("pkg")
14
- }
12
+ PKG_FILES = FileList['**/*']
13
+ PKG_FILES.exclude(/CVS/)
14
+ PKG_FILES.exclude(/^pkg/)
15
+ PKG_FILES.exclude(/^doc/)
15
16
 
16
- task :default => [:test]
17
+ task :default => :test
17
18
 
18
19
  desc "Run unit tests"
19
- task(:test => [ :clean, :compile ]) do
20
- ruby %{-Iext tests/test.rb}
20
+ task(:test => [:compile]) do
21
+ cd 'tests' do
22
+ ruby %{-I../ext runner.rb}
23
+ end
21
24
  end
22
25
 
23
26
  desc "Compiling library"
24
27
  task :compile do
25
- cd 'ext' do
26
- ruby %{extconf.rb}
27
- sh "make"
28
- end
28
+ cd 'ext' do
29
+ ruby %{extconf.rb}
30
+ sh "make"
31
+ end
29
32
  end
30
33
 
31
34
  desc "Installing library"
32
- task(:install => [:test]) do
33
- src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
34
- filename = File.basename(src)
35
- dst = File.join(CONFIG["sitelibdir"], filename)
36
- install(src, dst, :verbose => true)
35
+ task :install => :test do
36
+ src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
37
+ filename = File.basename(src)
38
+ dst = File.join(CONFIG["sitelibdir"], filename)
39
+ install(src, dst, :verbose => true)
37
40
  end
38
41
 
42
+ desc "Removing generated files"
39
43
  task :clean do
40
- cd 'ext' do
41
- ruby %{extconf.rb}
42
- sh "make distclean" if File.exist?('Makefile')
43
- end
44
+ cd 'ext' do
45
+ ruby 'extconf.rb'
46
+ sh "make distclean" if File.exist?('Makefile')
47
+ end
44
48
  end
45
49
 
46
- spec = Gem::Specification.new do |s|
47
-
48
- #### Basic information.
50
+ Rake::RDocTask.new do |rd|
51
+ rd.main = 'Amatch'
52
+ rd.rdoc_files.include("ext/amatch.c")
53
+ rd.rdoc_dir = 'doc'
54
+ end
49
55
 
50
- s.name = 'amatch'
51
- s.version = PKG_VERSION
52
- s.summary = "Approximate String Matching library"
53
- s.description = <<EOF
54
- Amatch is a library for approximate string matching and searching using
55
- a dynamic programming algorithm to compute the Levenstein distance
56
- between strings.
56
+ spec = Gem::Specification.new do |s|
57
+ #### Basic information.
58
+
59
+ s.name = 'amatch'
60
+ s.version = PKG_VERSION
61
+ s.summary = "Approximate String Matching library"
62
+ s.description = <<EOF
63
+ Amatch is a library for approximate string matching and searching in strings.
64
+ Several algorithms can be used to do this, and it's also possible to compute a
65
+ similarity metric number between 0.0 and 1.0 for two given strings.
57
66
  EOF
58
67
 
59
- #### Dependencies and requirements.
68
+ #### Dependencies and requirements.
60
69
 
61
- #s.add_dependency('log4r', '> 1.0.4')
62
- #s.requirements << ""
70
+ #s.add_dependency('log4r', '> 1.0.4')
71
+ #s.requirements << ""
63
72
 
64
- s.files = PKG_FILES
73
+ s.files = PKG_FILES
65
74
 
66
- #### C code extensions.
75
+ #### C code extensions.
67
76
 
68
- s.extensions << "ext/extconf.rb"
77
+ s.extensions << "ext/extconf.rb"
69
78
 
70
- #### Load-time details: library and application (you will need one or both).
79
+ #### Load-time details: library and application (you will need one or both).
71
80
 
72
- s.require_path = 'lib' # Use these for libraries.
73
- s.autorequire = 'amatch'
81
+ s.require_path = 'ext' # Use these for libraries.
82
+ s.autorequire = 'amatch'
74
83
 
75
- s.bindir = "bin" # Use these for applications.
76
- s.executables = ["agrep.rb"]
77
- s.default_executable = "agrep.rb"
84
+ s.bindir = "bin" # Use these for applications.
85
+ s.executables = ["agrep.rb"]
86
+ s.default_executable = "agrep.rb"
78
87
 
79
- #### Documentation and testing.
88
+ #### Documentation and testing.
80
89
 
81
- s.has_rdoc = true
82
- #s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
83
- #s.rdoc_options <<
84
- # '--title' << 'Rake -- Ruby Make' <<
85
- # '--main' << 'README' <<
86
- # '--line-numbers'
87
- s.test_files << 'tests/test.rb'
90
+ s.has_rdoc = true
91
+ #s.extra_rdoc_files = FileList['ext/amatch.c']
92
+ s.rdoc_options <<
93
+ '--title' << 'Amatch -- Approximate Matching' <<
94
+ '--main' << 'Amatch' <<
95
+ '--line-numbers'
96
+ s.test_files << 'tests/runner.rb'
88
97
 
89
- #### Author and project details.
98
+ #### Author and project details.
90
99
 
91
- s.author = "Florian Frank"
92
- s.email = "flori@ping.de"
93
- s.homepage = "http://amatch.rubyforge.org"
94
- s.rubyforge_project = "amatch"
100
+ s.author = "Florian Frank"
101
+ s.email = "flori@ping.de"
102
+ s.homepage = "http://amatch.rubyforge.org"
103
+ s.rubyforge_project = "amatch"
95
104
  end
96
105
 
97
106
  Rake::GemPackageTask.new(spec) do |pkg|
98
- pkg.need_tar = true
99
- pkg.package_files += PKG_FILES
107
+ pkg.need_tar = true
108
+ pkg.package_files += PKG_FILES
100
109
  end
101
110
 
102
111
  task :release => [ :clean, :package ]
103
- # vim: set et sw=4 ts=4:
112
+ # vim: set et sw=2 ts=2:
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.2.0
@@ -1,74 +1,87 @@
1
1
  #! /usr/bin/env ruby
2
2
  #
3
- ## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
3
+ ## $Id: agrep.rb,v 1.3 2005/04/24 21:11:06 flori Exp $
4
4
  #
5
5
 
6
6
  require 'amatch'
7
7
  require 'getoptlong'
8
8
 
9
9
  def usage(msg, options)
10
- print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
11
- options.each { |o|
12
- print " " + o[1] + ", " + o[0] + " " +
13
- (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '') + "\n"
14
- }
15
- print "\nReport bugs to <flori@ping.de>.\n"
16
- exit 0
10
+ print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
11
+ options.each do |o|
12
+ puts " " + o[1] + ", " + o[0] + " " +
13
+ (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
14
+ end
15
+ puts "\nReport bugs to <flori@ping.de>."
16
+ exit 0
17
+ end
18
+
19
+ class MyAmatch < Amatch
20
+ def l_search_relative(strings)
21
+ if strings.is_a? Array
22
+ l_search(strings).map { |x| x / pattern.size }
23
+ else
24
+ l_search(strings) / pattern.size
25
+ end
26
+ end
17
27
  end
18
28
 
19
29
  $distance = 1
30
+ $mode = :l_search
20
31
  begin
21
- parser = GetoptLong.new
22
- options = [
23
- [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
24
- [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
25
- [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
26
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
27
- ]
28
- parser.set_options(*options)
29
- parser.each_option { |name, arg|
30
- name = name.sub(/^--/, '')
31
- case name
32
- when 'distance'
33
- $distance = arg.to_f
34
- when 'relative'
35
- $relative = 1
36
- when 'verbose'
37
- $verbose = 1
38
- when 'help'
39
- usage('You\'ve asked for it!', options)
40
- end
41
- }
32
+ parser = GetoptLong.new
33
+ options = [
34
+ [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
35
+ [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
36
+ [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
37
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
38
+ ]
39
+ parser.set_options(*options)
40
+ parser.each_option do |name, arg|
41
+ name = name.sub(/^--/, '')
42
+ case name
43
+ when 'distance'
44
+ $distance = arg.to_f
45
+ when 'relative'
46
+ $mode = :l_search_relative
47
+ when 'verbose'
48
+ $verbose = 1
49
+ when 'help'
50
+ usage('You\'ve asked for it!', options)
51
+ end
52
+ end
42
53
  rescue
43
- exit 1
54
+ exit 1
44
55
  end
45
- $pattern = ARGV.shift or usage('Pattern needed!', options)
56
+ pattern = ARGV.shift or usage('Pattern needed!', options)
46
57
 
47
- matcher = Amatch.new($pattern)
58
+ matcher = MyAmatch.new(pattern)
48
59
  size = 0
49
60
  start = Time.new
50
61
  if ARGV.size > 0 then
51
- ARGV.each { |filename|
52
- File.stat(filename).file? or next
53
- size += File.size(filename)
54
- begin
55
- File.open(filename, 'r').each_line { |line|
56
- print "#{filename}:#{line}" if
57
- ($relative ? matcher.searchr(line) :
58
- matcher.search(line)) <= $distance
59
- }
60
- rescue
61
- $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
62
- end
63
- }
62
+ ARGV.each do |filename|
63
+ File.stat(filename).file? or next
64
+ size += File.size(filename)
65
+ begin
66
+ File.open(filename, 'r').each_line do |line|
67
+ if matcher.__send__($mode, line) < $distance
68
+ puts "#{filename}:#{line}"
69
+ end
70
+ end
71
+ rescue
72
+ STDERR.print "Failure at #{filename}: #{$!} => Skipping!\n"
73
+ end
74
+ end
64
75
  else
65
- $stdin.each_line { |line|
66
- size += line.size
67
- print line if ($relative ? matcher.searchr(line) :
68
- matcher.search(line)) <= $distance
69
- }
76
+ STDIN.each_line do |line|
77
+ size += line.size
78
+ if matcher.__send__($mode, line) <= $distance
79
+ puts line
80
+ end
81
+ end
70
82
  end
71
83
  time = Time.new - start
72
- $verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
73
- time, size / time / 1024
84
+ $verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
85
+ time, size / time / 1024
74
86
  exit 0
87
+ # vim: set et sw=2 ts=2:
@@ -0,0 +1,12 @@
1
+ bin-dir=$prefix/bin
2
+ site-ruby=$prefix/lib/ruby/site_ruby/1.8
3
+ prefix=/usr/local/stow/ruby
4
+ ruby-path=/usr/local/stow/ruby/bin/ruby
5
+ make-prog=make
6
+ rb-dir=$site-ruby
7
+ without-ext=no
8
+ ruby-prog=/usr/local/stow/ruby/bin/ruby
9
+ site-ruby-common=$prefix/lib/ruby/site_ruby
10
+ std-ruby=$prefix/lib/ruby/1.8
11
+ data-dir=$prefix/share
12
+ so-dir=$prefix/lib/ruby/site_ruby/1.8/i686-linux
Binary file
@@ -1,312 +1,1388 @@
1
1
  #include "ruby.h"
2
+ #include "pair.h"
2
3
 
3
- static VALUE cAmatch;
4
+ /*
5
+ * Document-method: pattern
6
+ *
7
+ * call-seq: pattern -> pattern string
8
+ *
9
+ * Returns the current pattern string of this instance.
10
+ */
4
11
 
5
12
  /*
6
- * Vector stuff
13
+ * Document-method: pattern=
14
+ *
15
+ * call-seq: pattern=(pattern)
16
+ *
17
+ * Sets the current pattern string of this instance to <code>pattern</code>.
7
18
  */
8
19
 
9
- typedef struct {
10
- int *ptr;
11
- int len;
12
- } vector;
13
20
 
14
- static vector *
15
- vector_new(len)
16
- int len;
17
- {
18
- vector *v;
19
- v = ALLOC(vector);
20
- if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
21
- v->ptr = ALLOC_N(int, len + 1);
22
- if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
23
- v->len = len;
24
- return v;
25
- }
26
-
27
- static void
28
- vector_print(v)
29
- vector *v;
21
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
22
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
23
+
24
+ static ID id_split, id_to_f;
25
+
26
+ #define GET_STRUCT(klass) \
27
+ klass *amatch; \
28
+ Data_Get_Struct(self, klass, amatch);
29
+
30
+ #define DEF_ALLOCATOR(type) \
31
+ static type *type##_allocate() \
32
+ { \
33
+ type *obj = ALLOC(type); \
34
+ MEMZERO(obj, type, 1); \
35
+ return obj; \
36
+ }
37
+
38
+ #define DEF_CONSTRUCTOR(klass, type) \
39
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
40
+ { \
41
+ type *amatch = type##_allocate(); \
42
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
43
+ } \
44
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
45
+ { \
46
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
47
+ rb_##klass##_initialize(obj, pattern); \
48
+ return obj; \
49
+ }
50
+
51
+ #define DEF_RB_FREE(klass, type) \
52
+ static void rb_##klass##_free(type *amatch) \
53
+ { \
54
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
55
+ free(amatch->pattern); \
56
+ MEMZERO(amatch, type, 1); \
57
+ free(amatch); \
58
+ }
59
+
60
+ #define DEF_PATTERN_ACCESSOR(type) \
61
+ static void type##_pattern_set(type *amatch, VALUE pattern) \
62
+ { \
63
+ Check_Type(pattern, T_STRING); \
64
+ free(amatch->pattern); \
65
+ amatch->pattern_len = RSTRING(pattern)->len; \
66
+ amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
67
+ MEMCPY(amatch->pattern, RSTRING(pattern)->ptr, char, \
68
+ RSTRING(pattern)->len); \
69
+ } \
70
+ static VALUE rb_##type##_pattern(VALUE self) \
71
+ { \
72
+ GET_STRUCT(type) \
73
+ return rb_str_new(amatch->pattern, amatch->pattern_len); \
74
+ } \
75
+ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
76
+ { \
77
+ GET_STRUCT(type) \
78
+ type##_pattern_set(amatch, pattern); \
79
+ return Qnil; \
80
+ }
81
+
82
+ #define DEF_ITERATE_STRINGS(type) \
83
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
84
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
85
+ { \
86
+ if (TYPE(strings) == T_STRING) { \
87
+ return match_function(amatch, strings); \
88
+ } else { \
89
+ Check_Type(strings, T_ARRAY); \
90
+ int i; \
91
+ VALUE result = rb_ary_new2(RARRAY(strings)->len); \
92
+ for (i = 0; i < RARRAY(strings)->len; i++) { \
93
+ VALUE string = rb_ary_entry(strings, i); \
94
+ if (TYPE(string) != T_STRING) { \
95
+ rb_raise(rb_eTypeError, \
96
+ "array has to contain only strings (%s given)", \
97
+ NIL_P(string) ? \
98
+ "NilClass" : \
99
+ rb_class2name(CLASS_OF(string))); \
100
+ } \
101
+ rb_ary_push(result, match_function(amatch, string)); \
102
+ } \
103
+ return result; \
104
+ } \
105
+ }
106
+
107
+ #define DEF_RB_READER(type, function, name, converter) \
108
+ VALUE function(VALUE self) \
109
+ { \
110
+ GET_STRUCT(type) \
111
+ return converter(amatch->name); \
112
+ }
113
+
114
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
115
+ VALUE function(VALUE self, VALUE value) \
116
+ { \
117
+ vtype value_ ## vtype; \
118
+ GET_STRUCT(type) \
119
+ caster(value); \
120
+ value_ ## vtype = converter(value); \
121
+ if (!(value_ ## vtype check)) \
122
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
123
+ amatch->name = value_ ## vtype; \
124
+ return Qnil; \
125
+ }
126
+
127
+
128
+ #define CAST2FLOAT(obj) \
129
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
130
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
131
+ else \
132
+ Check_Type(obj, T_FLOAT)
133
+ #define FLOAT2C(obj) RFLOAT(obj)->value
134
+
135
+ #define OPTIMIZE_TIME \
136
+ if (amatch->pattern_len < RSTRING(string)->len) { \
137
+ a_ptr = amatch->pattern; \
138
+ a_len = amatch->pattern_len; \
139
+ b_ptr = RSTRING(string)->ptr; \
140
+ b_len = RSTRING(string)->len; \
141
+ } else { \
142
+ a_ptr = RSTRING(string)->ptr; \
143
+ a_len = RSTRING(string)->len; \
144
+ b_ptr = amatch->pattern; \
145
+ b_len = amatch->pattern_len; \
146
+ }
147
+
148
+ #define DONT_OPTIMIZE \
149
+ a_ptr = amatch->pattern; \
150
+ a_len = amatch->pattern_len; \
151
+ b_ptr = RSTRING(string)->ptr; \
152
+ b_len = RSTRING(string)->len; \
153
+
154
+ /*
155
+ * C structures of the Amatch classes
156
+ */
157
+
158
+ typedef struct GeneralStruct {
159
+ char *pattern;
160
+ char pattern_len;
161
+ } General;
162
+
163
+ DEF_ALLOCATOR(General)
164
+ DEF_PATTERN_ACCESSOR(General)
165
+ DEF_ITERATE_STRINGS(General)
166
+
167
+ typedef struct SellersStruct {
168
+ char *pattern;
169
+ char pattern_len;
170
+ double substitution;
171
+ double deletion;
172
+ double insertion;
173
+ } Sellers;
174
+
175
+ DEF_ALLOCATOR(Sellers)
176
+ DEF_PATTERN_ACCESSOR(Sellers)
177
+ DEF_ITERATE_STRINGS(Sellers)
178
+
179
+ static void Sellers_reset_weights(Sellers *self)
30
180
  {
31
- int i;
32
- for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
33
- puts("");
181
+ self->substitution = 1.0;
182
+ self->deletion = 1.0;
183
+ self->insertion = 1.0;
34
184
  }
35
185
 
36
- static void
37
- vector_destroy(v)
38
- vector *v;
186
+ typedef struct PairDistanceStruct {
187
+ char *pattern;
188
+ char pattern_len;
189
+ PairArray *pattern_pair_array;
190
+ } PairDistance;
191
+
192
+ DEF_ALLOCATOR(PairDistance)
193
+ DEF_PATTERN_ACCESSOR(PairDistance)
194
+
195
+ /*
196
+ * Levenshtein edit distances are computed here:
197
+ */
198
+
199
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
200
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
201
+ c = i % 2; /* current row */ \
202
+ p = (i + 1) % 2; /* previous row */ \
203
+ v[c][0] = i; /* first column */ \
204
+ for (j = 1; j <= b_len; j++) { \
205
+ /* Bellman's principle of optimality: */ \
206
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
207
+ if (weight > v[p][j] + 1) { \
208
+ weight = v[p][j] + 1; \
209
+ } \
210
+ if (weight > v[c][j - 1] + 1) { \
211
+ weight = v[c][j - 1] + 1; \
212
+ } \
213
+ v[c][j] = weight; \
214
+ } \
215
+ p = c; \
216
+ c = (c + 1) % 2; \
217
+ }
218
+
219
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
39
220
  {
40
- xfree(v->ptr);
41
- xfree(v);
221
+ VALUE result;
222
+ char *a_ptr, *b_ptr;
223
+ int a_len, b_len;
224
+ int *v[2], weight;
225
+ int i, j, c, p;
226
+
227
+ Check_Type(string, T_STRING);
228
+ DONT_OPTIMIZE
229
+
230
+ v[0] = ALLOC_N(int, b_len + 1);
231
+ v[1] = ALLOC_N(int, b_len + 1);
232
+ for (i = 0; i <= b_len; i++) {
233
+ v[0][i] = i;
234
+ v[1][i] = i;
235
+ }
236
+
237
+ COMPUTE_LEVENSHTEIN_DISTANCE
238
+
239
+ result = INT2FIX(v[p][b_len]);
240
+
241
+ free(v[0]);
242
+ free(v[1]);
243
+
244
+ return result;
42
245
  }
43
246
 
44
- static int
45
- vector_minimum(v)
46
- vector *v;
247
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
47
248
  {
48
- int i;
49
- int min;
249
+ VALUE result;
250
+ char *a_ptr, *b_ptr;
251
+ int a_len, b_len;
252
+ int *v[2], weight;
253
+ int i, j, c, p;
50
254
 
51
- if (v->len == 0) return -1;
52
- min = v->ptr[0];
53
- for (i = 1; i <= v->len; i++) {
54
- if (min > v->ptr[i]) min = v->ptr[i];
255
+ Check_Type(string, T_STRING);
256
+ DONT_OPTIMIZE
257
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
258
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
259
+ v[0] = ALLOC_N(int, b_len + 1);
260
+ v[1] = ALLOC_N(int, b_len + 1);
261
+ for (i = 0; i <= b_len; i++) {
262
+ v[0][i] = i;
263
+ v[1][i] = i;
264
+ }
265
+
266
+ COMPUTE_LEVENSHTEIN_DISTANCE
267
+
268
+ if (b_len > a_len) {
269
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
270
+ } else {
271
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
55
272
  }
56
- return min;
273
+ free(v[0]);
274
+ free(v[1]);
275
+ return result;
57
276
  }
58
277
 
59
- static int
60
- vector_last(v)
61
- vector *v;
278
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
62
279
  {
63
- return v->ptr[v->len];
280
+ VALUE result;
281
+ char *a_ptr, *b_ptr;
282
+ int a_len, b_len;
283
+ int *v[2], weight, min;
284
+ int i, j, c, p;
285
+
286
+ Check_Type(string, T_STRING);
287
+ DONT_OPTIMIZE
288
+
289
+ v[0] = ALLOC_N(int, b_len + 1);
290
+ v[1] = ALLOC_N(int, b_len + 1);
291
+ MEMZERO(v[0], int, b_len + 1);
292
+ MEMZERO(v[1], int, b_len + 1);
293
+
294
+ COMPUTE_LEVENSHTEIN_DISTANCE
295
+
296
+ for (i = 0, min = a_len; i <= b_len; i++) {
297
+ if (v[p][i] < min) min = v[p][i];
298
+ }
299
+
300
+ result = INT2FIX(min);
301
+
302
+ free(v[0]);
303
+ free(v[1]);
304
+
305
+ return result;
64
306
  }
65
307
 
308
+
66
309
  /*
67
- * Edit distances are calculated here
310
+ * Sellers edit distances are computed here:
68
311
  */
69
312
 
70
- enum { MATCH = 1, MATCHR, SEARCH, SEARCHR, COMPARE, COMPARER };
313
+ #define COMPUTE_SELLERS_DISTANCE \
314
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
315
+ c = i % 2; /* current row */ \
316
+ p = (i + 1) % 2; /* previous row */ \
317
+ v[c][0] = i * amatch->deletion; /* first column */ \
318
+ for (j = 1; j <= b_len; j++) { \
319
+ /* Bellman's principle of optimality: */ \
320
+ weight = v[p][j - 1] + \
321
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
322
+ if (weight > v[p][j] + amatch->insertion) { \
323
+ weight = v[p][j] + amatch->insertion; \
324
+ } \
325
+ if (weight > v[c][j - 1] + amatch->deletion) { \
326
+ weight = v[c][j - 1] + amatch->deletion; \
327
+ } \
328
+ v[c][j] = weight; \
329
+ } \
330
+ p = c; \
331
+ c = (c + 1) % 2; \
332
+ }
71
333
 
72
- static int weight2int(weight, name)
73
- VALUE weight;
74
- char *name;
334
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
75
335
  {
76
- if (TYPE(weight) != T_FIXNUM) {
77
- rb_raise(rb_eTypeError,
78
- "value of weight %s has to be of type Fixnum (%s given)",
79
- "subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
336
+ VALUE result;
337
+ char *a_ptr, *b_ptr;
338
+ int a_len, b_len;
339
+ double *v[2], weight;
340
+ int i, j, c, p;
341
+
342
+ Check_Type(string, T_STRING);
343
+ DONT_OPTIMIZE
344
+
345
+ v[0] = ALLOC_N(double, b_len + 1);
346
+ v[1] = ALLOC_N(double, b_len + 1);
347
+ for (i = 0; i <= b_len; i++) {
348
+ v[0][i] = i * amatch->deletion;
349
+ v[1][i] = i * amatch->deletion;
80
350
  }
81
- return FIX2INT(weight);
351
+
352
+ COMPUTE_SELLERS_DISTANCE
353
+
354
+ result = rb_float_new(v[p][b_len]);
355
+ free(v[0]);
356
+ free(v[1]);
357
+ return result;
82
358
  }
83
359
 
84
- static VALUE
85
- calculate_distance (self, string, mode)
86
- VALUE self;
87
- VALUE string;
88
- char mode;
360
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
89
361
  {
90
- VALUE pattern, tmp;
91
- static VALUE result;
92
- int pattern_len, string_len;
93
- char *pattern_ptr, *string_ptr;
94
- vector *v[2];
95
- int weight, sw, dw, iw, i, j, tmpi;
96
- int c = 0, p = 1;
362
+ VALUE result;
363
+ char *a_ptr, *b_ptr;
364
+ int a_len, b_len;
365
+ double *v[2], weight, max_weight;
366
+ int i, j, c, p;
97
367
 
368
+ if (amatch->insertion >= amatch->deletion) {
369
+ if (amatch->substitution >= amatch->insertion) {
370
+ max_weight = amatch->substitution;
371
+ } else {
372
+ max_weight = amatch->insertion;
373
+ }
374
+ } else {
375
+ if (amatch->substitution >= amatch->deletion) {
376
+ max_weight = amatch->substitution;
377
+ } else {
378
+ max_weight = amatch->deletion;
379
+ }
380
+ }
381
+
98
382
  Check_Type(string, T_STRING);
99
- string_ptr = RSTRING(string)->ptr;
100
- string_len = RSTRING(string)->len;
383
+ DONT_OPTIMIZE
384
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
385
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
386
+ v[0] = ALLOC_N(double, b_len + 1);
387
+ v[1] = ALLOC_N(double, b_len + 1);
388
+ for (i = 0; i <= b_len; i++) {
389
+ v[0][i] = i * amatch->deletion;
390
+ v[1][i] = i * amatch->deletion;
391
+ }
101
392
 
102
- pattern = rb_iv_get(self, "@pattern");
103
- Check_Type(pattern, T_STRING);
104
- pattern_ptr = RSTRING(pattern)->ptr;
105
- pattern_len = RSTRING(pattern)->len;
393
+ COMPUTE_SELLERS_DISTANCE
106
394
 
107
- sw = weight2int(rb_iv_get(self, "@subw"), "subw");
108
- dw = weight2int(rb_iv_get(self, "@delw"), "delw");
109
- iw = weight2int(rb_iv_get(self, "@insw"), "insw");
110
-
111
- v[0] = vector_new(string_len);
112
- switch (mode) {
113
- case MATCH:
114
- case MATCHR:
115
- case COMPARE:
116
- case COMPARER:
117
- for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
118
- break;
119
- case SEARCH:
120
- case SEARCHR:
121
- for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
122
- break;
123
- default:
124
- rb_raise(rb_eFatal, "unknown mode in calculate_distance");
395
+ if (b_len > a_len) {
396
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
397
+ } else {
398
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
125
399
  }
400
+ free(v[0]);
401
+ free(v[1]);
402
+ return result;
403
+ }
404
+
405
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
406
+ {
407
+ VALUE result;
408
+ char *a_ptr, *b_ptr;
409
+ int a_len, b_len;
410
+ double *v[2], weight, min;
411
+ int i, j, c, p;
412
+
413
+ Check_Type(string, T_STRING);
414
+ DONT_OPTIMIZE
415
+
416
+ v[0] = ALLOC_N(double, b_len + 1);
417
+ v[1] = ALLOC_N(double, b_len + 1);
418
+ MEMZERO(v[0], double, b_len + 1);
419
+ MEMZERO(v[1], double, b_len + 1);
420
+
421
+ COMPUTE_SELLERS_DISTANCE
422
+
423
+ for (i = 0, min = a_len; i <= b_len; i++) {
424
+ if (v[p][i] < min) min = v[p][i];
425
+ }
426
+ result = rb_float_new(min);
427
+ free(v[0]);
428
+ free(v[1]);
429
+
430
+ return result;
431
+ }
432
+
433
+ /*
434
+ * Pair distances are computed here:
435
+ */
126
436
 
127
- v[1] = vector_new(string_len);
128
- for (i = 1; i <= pattern_len; i++) {
129
- c = i % 2; /* current row */
130
- p = (i - 1) % 2; /* previous row */
131
- v[c]->ptr[0] = i * dw; /* first column */
132
- for (j = 1; j <= string_len; j++) {
133
- /* Bellman's principle of optimality: */
134
- weight = v[p]->ptr[j - 1] +
135
- (pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
136
- if (weight > v[p]->ptr[j] + iw) weight = v[p]->ptr[j] + iw;
137
- if (weight > v[c]->ptr[j - 1] + dw) weight = v[c]->ptr[j - 1] + dw;
138
- v[c]->ptr[j] = weight;
437
+ static VALUE PairDistance_match(
438
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
439
+ {
440
+ double result;
441
+ VALUE tokens;
442
+ PairArray *pair_array;
443
+
444
+ Check_Type(string, T_STRING);
445
+ if (!NIL_P(regexp) || use_regexp) {
446
+ tokens = rb_funcall(
447
+ rb_str_new(amatch->pattern, amatch->pattern_len),
448
+ id_split, 1, regexp
449
+ );
450
+ if (!amatch->pattern_pair_array) {
451
+ amatch->pattern_pair_array = PairArray_new(tokens);
452
+ } else {
453
+ pair_array_reactivate(amatch->pattern_pair_array);
454
+ }
455
+ tokens = rb_funcall(string, id_split, 1, regexp);
456
+ pair_array = PairArray_new(tokens);
457
+ } else {
458
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
459
+ tokens = rb_ary_new4(1, &tmp);
460
+ if (!amatch->pattern_pair_array) {
461
+ amatch->pattern_pair_array = PairArray_new(tokens);
462
+ } else {
463
+ pair_array_reactivate(amatch->pattern_pair_array);
139
464
  }
465
+ tokens = rb_ary_new4(1, &string);
466
+ pair_array = PairArray_new(tokens);
140
467
  }
141
- switch (mode) {
142
- case MATCH:
143
- result = INT2FIX(vector_last(v[c]));
144
- break;
145
- case MATCHR:
146
- result = rb_float_new((double) vector_last(v[c]) / pattern_len);
147
- break;
148
- case SEARCH:
149
- tmpi = vector_minimum(v[c]);
150
- result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
151
- break;
152
- case SEARCHR:
153
- tmpi = vector_minimum(v[c]);
154
- result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
155
- break;
156
- case COMPARE:
157
- result = INT2FIX((string_len < pattern_len ? -1 : 1) *
158
- vector_last(v[c]));
159
- break;
160
- case COMPARER:
161
- result = rb_float_new((double)
162
- (string_len < pattern_len ? -1 : 1) *
163
- vector_last(v[c]) / pattern_len);
164
- break;
165
- default:
166
- rb_raise(rb_eFatal, "unknown mode in calculate_distance");
468
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
469
+ pair_array_destroy(pair_array);
470
+ return rb_float_new(result);
471
+ }
472
+
473
+ /*
474
+ * Hamming distances are computed here:
475
+ */
476
+
477
+ #define COMPUTE_HAMMING_DISTANCE \
478
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
479
+ if (i >= b_len) { \
480
+ result += a_len - b_len; \
481
+ break; \
482
+ } \
483
+ if (b_ptr[i] != a_ptr[i]) result++; \
167
484
  }
168
- vector_destroy(v[0]);
169
- vector_destroy(v[1]);
170
- return result;
485
+
486
+ static VALUE Hamming_match(General *amatch, VALUE string)
487
+ {
488
+ char *a_ptr, *b_ptr;
489
+ int a_len, b_len;
490
+ int i, result;
491
+
492
+ Check_Type(string, T_STRING);
493
+ OPTIMIZE_TIME
494
+ COMPUTE_HAMMING_DISTANCE
495
+ return INT2FIX(result);
496
+ }
497
+
498
+ static VALUE Hamming_similar(General *amatch, VALUE string)
499
+ {
500
+ char *a_ptr, *b_ptr;
501
+ int a_len, b_len;
502
+ int i, result;
503
+
504
+ Check_Type(string, T_STRING);
505
+ OPTIMIZE_TIME
506
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
507
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
508
+ COMPUTE_HAMMING_DISTANCE
509
+ return rb_float_new(1.0 - ((double) result) / b_len);
510
+ }
511
+
512
+ /*
513
+ * Longest Common Subsequence computation
514
+ */
515
+
516
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
517
+ l[0] = ALLOC_N(int, b_len + 1); \
518
+ l[1] = ALLOC_N(int, b_len + 1); \
519
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
520
+ for (j = b_len; j >= 0; j--) { \
521
+ if (i == a_len || j == b_len) { \
522
+ l[c][j] = 0; \
523
+ } else if (a_ptr[i] == b_ptr[j]) { \
524
+ l[c][j] = 1 + l[p][j + 1]; \
525
+ } else { \
526
+ int x = l[p][j], y = l[c][j + 1]; \
527
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
528
+ } \
529
+ } \
530
+ p = c; \
531
+ c = (c + 1) % 2; \
532
+ } \
533
+ result = l[p][0]; \
534
+ free(l[0]); \
535
+ free(l[1]);
536
+
537
+
538
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
539
+ {
540
+ char *a_ptr, *b_ptr;
541
+ int a_len, b_len;
542
+ int result, c, p, i, j, *l[2];
543
+
544
+ Check_Type(string, T_STRING);
545
+ OPTIMIZE_TIME
546
+
547
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
548
+ COMPUTE_LONGEST_SUBSEQUENCE
549
+ return INT2FIX(result);
550
+ }
551
+
552
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
553
+ {
554
+ char *a_ptr, *b_ptr;
555
+ int a_len, b_len;
556
+ int result, c, p, i, j, *l[2];
557
+
558
+ Check_Type(string, T_STRING);
559
+ OPTIMIZE_TIME
560
+
561
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
562
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
563
+ COMPUTE_LONGEST_SUBSEQUENCE
564
+ return rb_float_new(((double) result) / b_len);
565
+ }
566
+
567
+ /*
568
+ * Longest Common Substring computation
569
+ */
570
+
571
+ #define COMPUTE_LONGEST_SUBSTRING \
572
+ l[0] = ALLOC_N(int, b_len); \
573
+ MEMZERO(l[0], int, b_len); \
574
+ l[1] = ALLOC_N(int, b_len); \
575
+ MEMZERO(l[1], int, b_len); \
576
+ result = 0; \
577
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
578
+ for (j = 0; j < b_len; j++) { \
579
+ if (a_ptr[i] == b_ptr[j]) { \
580
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
581
+ if (l[c][j] > result) result = l[c][j]; \
582
+ } else { \
583
+ l[c][j] = 0; \
584
+ } \
585
+ } \
586
+ p = c; \
587
+ c = (c + 1) % 2; \
588
+ } \
589
+ free(l[0]); \
590
+ free(l[1]);
591
+
592
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
593
+ {
594
+ char *a_ptr, *b_ptr;
595
+ int a_len, b_len;
596
+ int result, c, p, i, j, *l[2];
597
+
598
+ Check_Type(string, T_STRING);
599
+ OPTIMIZE_TIME
600
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
601
+ COMPUTE_LONGEST_SUBSTRING
602
+ return INT2FIX(result);
603
+ }
604
+
605
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
606
+ {
607
+ char *a_ptr, *b_ptr;
608
+ int a_len, b_len;
609
+ int result, c, p, i, j, *l[2];
610
+
611
+ Check_Type(string, T_STRING);
612
+ OPTIMIZE_TIME
613
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
614
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
615
+ COMPUTE_LONGEST_SUBSTRING
616
+ return rb_float_new(((double) result) / b_len);
617
+ }
618
+
619
+ /*
620
+ * Ruby API
621
+ */
622
+
623
+ /*
624
+ * Document-class: Amatch::Levenshtein
625
+ *
626
+ * The Levenshtein edit distance is defined as the minimal costs involved to
627
+ * transform one string into another by using three elementary operations:
628
+ * deletion, insertion and substitution of a character. To transform "water"
629
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
630
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
631
+ * and "wine" is 3, because you have to apply three operations. The edit
632
+ * distance between "wine" and "wine" is 0 of course: no operation is
633
+ * necessary for the transformation -- they're already the same string. It's
634
+ * easy to see that more similar strings have smaller edit distances than
635
+ * strings that differ a lot.
636
+ */
637
+
638
+ DEF_RB_FREE(Levenshtein, General)
639
+
640
+ /*
641
+ * call-seq: new(pattern)
642
+ *
643
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
644
+ */
645
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
646
+ {
647
+ GET_STRUCT(General)
648
+ General_pattern_set(amatch, pattern);
649
+ return self;
650
+ }
651
+
652
+ DEF_CONSTRUCTOR(Levenshtein, General)
653
+
654
+ /*
655
+ * call-seq: match(strings) -> results
656
+ *
657
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
658
+ * against <code>strings</code>. It returns the number of operations, the
659
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
660
+ * Array of Strings. The returned <code>results</code> are either a Fixnum or an
661
+ * Array of Fixnums respectively.
662
+ */
663
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
664
+ {
665
+ GET_STRUCT(General)
666
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
667
+ }
668
+
669
+ /*
670
+ * call-seq: similar(strings) -> results
671
+ *
672
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
673
+ * against <code>strings</code>, and compute a Levenshtein distance metric
674
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
675
+ * <code>strings</code> has to be either a String or an Array of Strings. The
676
+ * returned <code>results</code> are either a Float or an Array of Floats
677
+ * respectively.
678
+ */
679
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
680
+ {
681
+ GET_STRUCT(General)
682
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
683
+ }
684
+
685
+ /*
686
+ * call-seq: levenshtein_similar(strings) -> results
687
+ *
688
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
689
+ * to match against <code>strings</code>. It returns a Levenshtein distance
690
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
691
+ * match. <code>strings</code> has to be either a String or an Array of
692
+ * Strings. The returned <code>results</code> are either a Float or an Array of
693
+ * Floats respectively.
694
+ */
695
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
696
+ {
697
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self); /* temporary matcher; its class must be Levenshtein, not Sellers */
698
+ return rb_Levenshtein_similar(amatch, strings);
699
+ }
700
+
701
+ /*
702
+ * call-seq: search(strings) -> results
703
+ *
704
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
705
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
706
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
707
+ * to be either a String or an Array of Strings. The returned
708
+ * <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
709
+ */
710
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
711
+ {
712
+ GET_STRUCT(General)
713
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
714
+ }
715
+
716
+ /*
717
+ * Document-class: Amatch::Sellers
718
+ *
719
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
720
+ * The difference is, that you can also specify different weights for every
721
+ * operation to prefer special operations over others. This extension of the
722
+ * Levenshtein edit distance is also known under the name Needleman-Wunsch
723
+ * distance.
724
+ */
725
+
726
+ DEF_RB_FREE(Sellers, Sellers)
727
+
728
+ /*
729
+ * Document-method: substitution
730
+ *
731
+ * call-seq: substitution -> weight
732
+ *
733
+ * Returns the weight of the substitution operation, that is used to compute
734
+ * the Sellers distance.
735
+ */
736
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
737
+ rb_float_new)
738
+
739
+ /*
740
+ * Document-method: deletion
741
+ *
742
+ * call-seq: deletion -> weight
743
+ *
744
+ * Returns the weight of the deletion operation, that is used to compute
745
+ * the Sellers distance.
746
+ */
747
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
748
+ rb_float_new)
749
+
750
+ /*
751
+ * Document-method: insertion
752
+ *
753
+ * call-seq: insertion -> weight
754
+ *
755
+ * Returns the weight of the insertion operation, that is used to compute
756
+ * the Sellers distance.
757
+ */
758
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
759
+ rb_float_new)
760
+
761
+ /*
762
+ * Document-method: substitution=
763
+ *
764
+ * call-seq: substitution=(weight)
765
+ *
766
+ * Sets the weight of the substitution operation, that is used to compute
767
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
768
+ * should be a Float value >= 0.0.
769
+ */
770
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
771
+ double, CAST2FLOAT, FLOAT2C, >= 0)
772
+
773
+ /*
774
+ * Document-method: deletion=
775
+ *
776
+ * call-seq: deletion=(weight)
777
+ *
778
+ * Sets the weight of the deletion operation, that is used to compute
779
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
780
+ * should be a Float value >= 0.0.
781
+ */
782
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
783
+ double, CAST2FLOAT, FLOAT2C, >= 0)
784
+
785
+ /*
786
+ * Document-method: insertion=
787
+ *
788
+ * call-seq: insertion=(weight)
789
+ *
790
+ * Sets the weight of the insertion operation, that is used to compute
791
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
792
+ * should be a Float value >= 0.0.
793
+ */
794
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
795
+ double, CAST2FLOAT, FLOAT2C, >= 0)
796
+
797
+ /*
798
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
799
+ */
800
+ static VALUE rb_Sellers_reset_weights(VALUE self)
801
+ {
802
+ GET_STRUCT(Sellers)
803
+ Sellers_reset_weights(amatch);
804
+ return self;
171
805
  }
172
806
 
173
- static VALUE
174
- handle_strings(self, strings, mode)
175
- VALUE self;
176
- VALUE strings;
177
- char mode;
807
+ /*
808
+ * call-seq: new(pattern)
809
+ *
810
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
811
+ * with all weights initially set to 1.0.
812
+ */
813
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
178
814
  {
179
- if (TYPE(strings) == T_ARRAY) {
815
+ GET_STRUCT(Sellers)
816
+ Sellers_pattern_set(amatch, pattern);
817
+ Sellers_reset_weights(amatch);
818
+ return self;
819
+ }
820
+
821
+ DEF_CONSTRUCTOR(Sellers, Sellers)
822
+
823
+ /*
824
+ * Document-method: pattern
825
+ *
826
+ * call-seq: pattern -> pattern string
827
+ *
828
+ * Returns the current pattern string of this Amatch::Sellers instance.
829
+ */
830
+
831
+ /*
832
+ * Document-method: pattern=
833
+ *
834
+ * call-seq: pattern=(pattern)
835
+ *
836
+ * Sets the current pattern string of this Amatch::Sellers instance to
837
+ * <code>pattern</code>.
838
+ */
839
+
840
+ /*
841
+ * call-seq: match(strings) -> results
842
+ *
843
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
844
+ * <code>strings</code>, while taking into account the given weights. It
845
+ * returns the number of weighted character operations, the Sellers distance.
846
+ * <code>strings</code> has to be either a String or an Array of Strings. The
847
+ * returned <code>results</code> are either a Float or an Array of Floats
848
+ * respectively.
849
+ */
850
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
851
+ {
852
+ GET_STRUCT(Sellers)
853
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
854
+ }
855
+
856
+ /*
857
+ * call-seq: similar(strings) -> results
858
+ *
859
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
860
+ * against <code>strings</code> (taking into account the given weights), and
861
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
862
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
863
+ * String or an Array of Strings. The returned <code>results</code> are either
864
+ * a Float or an Array of Floats
865
+ * respectively.
866
+ */
867
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
868
+ {
869
+ GET_STRUCT(Sellers)
870
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
871
+ }
872
+
873
+ /*
874
+ * call-seq: search(strings) -> results
875
+ *
876
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
877
+ * distance (the sum of weighted character operations) as a Float value, by
878
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
879
+ * to be either a String or an Array of Strings. The returned
880
+ * <code>results</code> are either a Float or an Array of Floats respectively.
881
+ */
882
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
883
+ {
884
+ GET_STRUCT(Sellers)
885
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
886
+ }
887
+
888
+ /*
889
+ * Document-class: Amatch::PairDistance
890
+ *
891
+ * The pair distance between two strings is based on the number of adjacent
892
+ * character pairs, that are contained in both strings. The similarity
893
+ * metric of two strings s1 and s2 is
894
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
895
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
896
+ * are more dissimilar. The advantage of considering adjacent characters, is to
897
+ * take account not only of the characters, but also of the character ordering
898
+ * in the original strings.
899
+ *
900
+ * This metric is very capable to find similarities in natural languages.
901
+ * It is explained in more detail in Simon White's article "How to Strike a
902
+ * Match", located at this url:
903
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
904
+ * It is also very similar (a special case) to the method described under
905
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
906
+ * for Approximate String Processing."
907
+ */
908
+ DEF_RB_FREE(PairDistance, PairDistance)
909
+
910
+ /*
911
+ * call-seq: new(pattern)
912
+ *
913
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
914
+ */
915
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
916
+ {
917
+ GET_STRUCT(PairDistance)
918
+ PairDistance_pattern_set(amatch, pattern);
919
+ return self;
920
+ }
921
+
922
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
923
+
924
+ /*
925
+ * call-seq: match(strings, regexp = /\s+/) -> results
926
+ *
927
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
928
+ * <code>strings</code>. It returns the pair distance measure, that is a
929
+ * returned value of 1.0 is an exact match, partial matches are lower
930
+ * values, while 0.0 means no match at all.
931
+ *
932
+ * <code>strings</code> has to be either a String or an
933
+ * Array of Strings. The argument <code>regexp</code> is used to split the
934
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
935
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
936
+ * explicitly.
937
+ *
938
+ * The returned <code>results</code> are either a Float or an
939
+ * Array of Floats respectively.
940
+ */
941
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
942
+ {
943
+ VALUE result, strings, regexp = Qnil;
944
+ int use_regexp;
945
+ GET_STRUCT(PairDistance)
946
+
947
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
948
+ use_regexp = NIL_P(regexp) && argc != 2;
949
+ if (TYPE(strings) == T_STRING) {
950
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
951
+ } else {
952
+ Check_Type(strings, T_ARRAY);
180
953
  int i;
181
- VALUE result = rb_ary_new2(RARRAY(strings)->len);
954
+ result = rb_ary_new2(RARRAY(strings)->len);
182
955
  for (i = 0; i < RARRAY(strings)->len; i++) {
183
956
  VALUE string = rb_ary_entry(strings, i);
184
957
  if (TYPE(string) != T_STRING) {
185
958
  rb_raise(rb_eTypeError,
186
959
  "array has to contain only strings (%s given)",
187
- NIL_P(string) ? "NilClass" :
188
- rb_class2name(CLASS_OF(string)));
960
+ NIL_P(string) ?
961
+ "NilClass" :
962
+ rb_class2name(CLASS_OF(string)));
189
963
  }
190
- rb_ary_push(result, calculate_distance(self, string, mode));
964
+ rb_ary_push(result,
965
+ PairDistance_match(amatch, string, regexp, use_regexp));
191
966
  }
192
- return result;
193
- } else if (TYPE(strings) == T_STRING) {
194
- return calculate_distance(self, strings, mode);
195
- } else {
196
- rb_raise(rb_eTypeError,
197
- "value of strings needs to be string or array (%s given)",
198
- NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
199
967
  }
968
+ pair_array_destroy(amatch->pattern_pair_array);
969
+ amatch->pattern_pair_array = NULL;
970
+ return result;
200
971
  }
201
972
 
202
973
  /*
203
- * Ruby API
974
+ * call-seq: pair_distance_similar(strings) -> results
975
+ *
976
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
977
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
978
+ * expression. It returns a pair distance metric number between 0.0 for very
979
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
980
+ * either a String or an Array of Strings. The returned <code>results</code>
981
+ * are either a Float or an Array of Floats respectively.
204
982
  */
205
-
206
- static VALUE
207
- rb_amatch_resetw(self)
208
- VALUE self;
983
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
209
984
  {
210
- rb_iv_set(self, "@subw", INT2FIX(1));
211
- rb_iv_set(self, "@delw", INT2FIX(1));
212
- rb_iv_set(self, "@insw", INT2FIX(1));
213
-
214
- return Qtrue;
985
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self); /* temporary matcher; its class must be PairDistance, not Sellers */
986
+ return rb_PairDistance_match(1, &strings, amatch);
215
987
  }
216
988
 
217
- static VALUE
218
- rb_amatch_initialize(self, pattern)
219
- VALUE self;
220
- VALUE pattern;
221
- {
989
+ /*
990
+ * Document-class: Amatch::Hamming
991
+ *
992
+ * This class computes the Hamming distance between two strings.
993
+ *
994
+ * The Hamming distance between two strings is the number of characters, that
995
+ * are different. Thus a hamming distance of 0 means an exact
996
+ * match, a hamming distance of 1 means one character is different, and so on.
997
+ * If one string is longer than the other string, the missing characters are
998
+ * counted as different characters.
999
+ */
1000
+
1001
+ DEF_RB_FREE(Hamming, General)
222
1002
 
223
- Check_Type(pattern, T_STRING);
224
- rb_iv_set(self, "@pattern", pattern);
225
- rb_amatch_resetw(self);
1003
+ /*
1004
+ * call-seq: new(pattern)
1005
+ *
1006
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1007
+ */
1008
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1009
+ {
1010
+ GET_STRUCT(General)
1011
+ General_pattern_set(amatch, pattern);
226
1012
  return self;
227
1013
  }
228
1014
 
229
- static VALUE
230
- rb_amatch_pattern_is(self, pattern)
231
- VALUE self;
232
- VALUE pattern;
233
- {
234
- Check_Type(pattern, T_STRING);
235
- rb_iv_set(self, "@pattern", pattern);
1015
+ DEF_CONSTRUCTOR(Hamming, General)
236
1016
 
237
- return pattern;
1017
+ /*
1018
+ * call-seq: match(strings) -> results
1019
+ *
1020
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1021
+ * <code>strings</code>, that is compute the hamming distance between
1022
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1023
+ * be either a String or an Array of Strings. The returned <code>results</code>
1024
+ * are either a Fixnum or an Array of Fixnums respectively.
1025
+ */
1026
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1027
+ {
1028
+ GET_STRUCT(General)
1029
+ return General_iterate_strings(amatch, strings, Hamming_match);
238
1030
  }
239
1031
 
1032
+ /*
1033
+ * call-seq: similar(strings) -> results
1034
+ *
1035
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1036
+ * <code>strings</code>, and compute a Hamming distance metric number between
1037
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1038
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1039
+ * returned <code>results</code> are either a Float or an Array of Floats
1040
+ * respectively.
1041
+ */
1042
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1043
+ {
1044
+ GET_STRUCT(General)
1045
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1046
+ }
240
1047
 
241
- static VALUE
242
- rb_amatch_match(self, strings)
243
- VALUE self;
244
- VALUE strings;
1048
+ /*
1049
+ * call-seq: hamming_similar(strings) -> results
1050
+ *
1051
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1052
+ * match against <code>strings</code>. It returns a Hamming distance metric
1053
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1054
+ * <code>strings</code>
1055
+ * has to be either a String or an Array of Strings. The returned
1056
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1057
+ */
1058
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
245
1059
  {
246
- return handle_strings(self, strings, MATCH);
1060
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1061
+ return rb_Hamming_similar(amatch, strings);
247
1062
  }
248
1063
 
249
- static VALUE
250
- rb_amatch_matchr(self, strings)
251
- VALUE self;
252
- VALUE strings;
1064
+
1065
+ /*
1066
+ * Document-class: Amatch::LongestSubsequence
1067
+ *
1068
+ * This class computes the length of the longest subsequence common to two
1069
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1070
+ * subsequence is, the more similar the two strings will be.
1071
+ *
1072
+ * The longest common subsequence between "test" and "test" is of length 4,
1073
+ * because "test" itself is this subsequence. The longest common subsequence
1074
+ * between "test" and "east" is "e", "s", "t" and the length of the
1075
+ * sequence is 3.
1076
+ */
1077
+ DEF_RB_FREE(LongestSubsequence, General)
1078
+
1079
+ /*
1080
+ * call-seq: new(pattern)
1081
+ *
1082
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1083
+ */
1084
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
253
1085
  {
254
- return handle_strings(self, strings, MATCHR);
1086
+ GET_STRUCT(General)
1087
+ General_pattern_set(amatch, pattern);
1088
+ return self;
255
1089
  }
256
1090
 
257
- static VALUE
258
- rb_amatch_compare(self, strings)
259
- VALUE self;
260
- VALUE strings;
261
- {
262
- return handle_strings(self, strings, COMPARE);
1091
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1092
+
1093
+ /*
1094
+ * call-seq: match(strings) -> results
1095
+ *
1096
+ * Uses this Amatch::LongestSubsequence instance to match
1097
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1098
+ * length of the longest common subsequence. <code>strings</code> has to be
1099
+ * either a String or an Array of Strings. The returned <code>results</code>
1100
+ * are either a Fixnum or an Array of Fixnums respectively.
1101
+ */
1102
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1103
+ {
1104
+ GET_STRUCT(General)
1105
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
263
1106
  }
264
1107
 
265
- static VALUE
266
- rb_amatch_comparer(self, strings)
267
- VALUE self;
268
- VALUE strings;
1108
+ /*
1109
+ * call-seq: similar(strings) -> results
1110
+ *
1111
+ * Uses this Amatch::LongestSubsequence instance to match
1112
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1113
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1114
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1115
+ * String or an Array of Strings. The returned <code>results</code> are either
1116
+ * a Float or an Array of Floats respectively.
1117
+ */
1118
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1119
+ {
1120
+ GET_STRUCT(General)
1121
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: longest_subsequence_similar(strings) -> results
1126
+ *
1127
+ * If called on a String, this string is used as a
1128
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1129
+ * returns a longest subsequence distance metric number between 0.0 for very
1130
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1131
+ * either a String or an Array of Strings. The returned <code>results</code>
1132
+ * are either a Float or an Array of Floats respectively.
1133
+ */
1134
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1135
+ {
1136
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self); /* temporary matcher; its class must be LongestSubsequence, not Sellers */
1137
+ return rb_LongestSubsequence_similar(amatch, strings);
1138
+ }
1139
+
1140
+ /*
1141
+ * Document-class: Amatch::LongestSubstring
1142
+ *
1143
+ * The longest common substring is the longest substring, that is part of
1144
+ * two strings. A substring is contiguous, while a subsequence need not
1145
+ * be. The longer the common substring is, the more similar the two strings
1146
+ * will be.
1147
+ *
1148
+ * The longest common substring between 'string' and 'string' is 'string'
1149
+ * again, thus the longest common substring length is 6. The longest common
1150
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1151
+ * substring length is 4.
1152
+ */
1153
+
1154
+ DEF_RB_FREE(LongestSubstring, General)
1155
+
1156
+ /*
1157
+ * call-seq: new(pattern)
1158
+ *
1159
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1160
+ */
1161
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
269
1162
  {
270
- return handle_strings(self, strings, COMPARER);
1163
+ GET_STRUCT(General)
1164
+ General_pattern_set(amatch, pattern);
1165
+ return self;
271
1166
  }
272
1167
 
1168
+ DEF_CONSTRUCTOR(LongestSubstring, General)
273
1169
 
274
- static VALUE
275
- rb_amatch_search(self, strings)
276
- VALUE self;
277
- VALUE strings;
1170
+ /*
1171
+ * call-seq: match(strings) -> results
1172
+ *
1173
+ * Uses this Amatch::LongestSubstring instance to match
1174
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1175
+ * length of the longest common substring. <code>strings</code> has to be
1176
+ * either a String or an Array of Strings. The returned <code>results</code>
1177
+ * are either a Fixnum or an Array of Fixnums respectively.
1178
+ */
1179
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
278
1180
  {
279
- return handle_strings(self, strings, SEARCH);
1181
+ GET_STRUCT(General)
1182
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
280
1183
  }
281
1184
 
282
- static VALUE
283
- rb_amatch_searchr(self, strings)
284
- VALUE self;
285
- VALUE strings;
1185
+ /*
1186
+ * call-seq: similar(strings) -> results
1187
+ *
1188
+ * Uses this Amatch::LongestSubstring instance to match
1189
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1190
+ * longest substring distance metric number between 0.0 for very unsimilar
1191
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1192
+ * String or an Array of Strings. The returned <code>results</code> are either
1193
+ * a Float or an Array of Floats
1194
+ * respectively.
1195
+ */
1196
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
286
1197
  {
287
- return handle_strings(self, strings, SEARCHR);
1198
+ GET_STRUCT(General)
1199
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
288
1200
  }
289
1201
 
290
- void
291
- Init_amatch()
1202
+ /*
1203
+ * call-seq: longest_substring_similar(strings) -> results
1204
+ *
1205
+ * If called on a String, this string is used as a
1206
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1207
+ * returns a longest substring distance metric number between 0.0 for very
1208
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1209
+ * either a String or an Array of Strings. The returned <code>results</code>
1210
+ * are either a Float or an Array of Floats respectively.
1211
+ */
1212
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1213
+ {
1214
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self); /* temporary matcher; must be a LongestSubstring, not a LongestSubsequence under Sellers */
1215
+ return rb_LongestSubstring_similar(amatch, strings);
1216
+ }
1217
+
1218
+ /*
1219
+ * = amatch - Approximate Matching Extension for Ruby
1220
+ *
1221
+ * == Description
1222
+ *
1223
+ * This is a collection of classes that can be used for Approximate
1224
+ * matching, searching, and comparing of Strings. They implement algorithms
1225
+ * that compute the Levenshtein edit distance, Sellers edit distance, the
1226
+ * Hamming distance, the longest common subsequence length, the longest common
1227
+ * substring length, and the pair distance metric.
1228
+ *
1229
+ * == Author
1230
+ *
1231
+ * Florian Frank mailto:flori@ping.de
1232
+ *
1233
+ * == License
1234
+ *
1235
+ * This is free software; you can redistribute it and/or modify it under
1236
+ * the terms of the GNU General Public License Version 2 as published by
1237
+ * the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
1238
+ *
1239
+ * == Download
1240
+ *
1241
+ * The latest version of <b>amatch</b> can be found at
1242
+ *
1243
+ * * http://rubyforge.org/frs/?group_id=390
1244
+ *
1245
+ * Online Documentation should be located at
1246
+ *
1247
+ * * http://amatch.rubyforge.org
1248
+ *
1249
+ * == Examples
1250
+ * require 'amatch'
1251
+ * # => true
1252
+ * include Amatch
1253
+ * # => Object
1254
+ *
1255
+ * m = Sellers.new("pattern")
1256
+ * # => #<Amatch::Sellers:0x40366324>
1257
+ * m.match("pattren")
1258
+ * # => 2.0
1259
+ * m.substitution = m.insertion = 3
1260
+ * # => 3
1261
+ * m.match("pattren")
1262
+ * # => 4.0
1263
+ * m.reset_weights
1264
+ * # => #<Amatch::Sellers:0x40366324>
1265
+ * m.match(["pattren","parent"])
1266
+ * # => [2.0, 4.0]
1267
+ * m.search("abcpattrendef")
1268
+ * # => 2.0
1269
+ *
1270
+ * m = Levenshtein.new("pattern")
1271
+ * # => #<Amatch::Levenshtein:0x4035919c>
1272
+ * m.match("pattren")
1273
+ * # => 2
1274
+ * m.search("abcpattrendef")
1275
+ * # => 2
1276
+ * "pattern language".levenshtein_similar("language of patterns")
1277
+ * # => 0.2
1278
+ *
1279
+ * m = Hamming.new("pattern")
1280
+ * # => #<Amatch::Hamming:0x40350858>
1281
+ * m.match("pattren")
1282
+ * # => 2
1283
+ * "pattern language".hamming_similar("language of patterns")
1284
+ * # => 0.1
1285
+ *
1286
+ * m = PairDistance.new("pattern")
1287
+ * # => #<Amatch::PairDistance:0x40349be8>
1288
+ * m.match("pattr en")
1289
+ * # => 0.545454545454545
1290
+ * m.match("pattr en", nil)
1291
+ * # => 0.461538461538462
1292
+ * m.match("pattr en", /t+/)
1293
+ * # => 0.285714285714286
1294
+ * "pattern language".pair_distance_similar("language of patterns")
1295
+ * # => 0.928571428571429
1296
+ *
1297
+ * m = LongestSubsequence.new("pattern")
1298
+ * # => #<Amatch::LongestSubsequence:0x4033e900>
1299
+ * m.match("pattren")
1300
+ * # => 6
1301
+ * "pattern language".longest_subsequence_similar("language of patterns")
1302
+ * # => 0.4
1303
+ *
1304
+ * m = LongestSubstring.new("pattern")
1305
+ * # => #<Amatch::LongestSubstring:0x403378d0>
1306
+ * m.match("pattren")
1307
+ * # => 4
1308
+ * "pattern language".longest_substring_similar("language of patterns")
1309
+ * # => 0.4
1310
+ *
1311
+ */
1312
+
1313
+ void Init_amatch()
292
1314
  {
293
- cAmatch = rb_define_class("Amatch", rb_cObject);
294
- rb_define_method(cAmatch, "initialize", rb_amatch_initialize, 1);
295
-
296
- rb_define_attr(cAmatch, "debug", 1, 1);
297
- rb_define_attr(cAmatch, "subw", 1, 1);
298
- rb_define_attr(cAmatch, "delw", 1, 1);
299
- rb_define_attr(cAmatch, "insw", 1, 1);
300
- rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
301
-
302
- rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
303
- rb_define_attr(cAmatch, "pattern", 1, 0);
304
-
305
- rb_define_method(cAmatch, "match", rb_amatch_match, 1);
306
- rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
307
- rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
308
- rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
309
- rb_define_method(cAmatch, "search", rb_amatch_search, 1);
310
- rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
1315
+ rb_mAmatch = rb_define_module("Amatch");
1316
+
1317
+ /* Levenshtein */
1318
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1319
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1320
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1321
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1322
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1323
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1324
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1325
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1326
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1327
+
1328
+ /* Sellers */
1329
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1330
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1331
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1332
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1333
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1334
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1335
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1336
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1337
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1338
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1339
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1340
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1341
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1342
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1343
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1344
+
1345
+ /* Hamming */
1346
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1347
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1348
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1349
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1350
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1351
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1352
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1353
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1354
+
1355
+ /* Pair Distance Metric */
1356
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1357
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1358
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1359
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1360
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1361
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1362
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1363
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1364
+
1365
+ /* Longest Common Subsequence */
1366
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1367
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1368
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1369
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1370
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1371
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1372
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1373
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1374
+
1375
+ /* Longest Common Substring */
1376
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1377
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1378
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1379
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1380
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1381
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1382
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1383
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1384
+
1385
+ id_split = rb_intern("split");
1386
+ id_to_f = rb_intern("to_f");
311
1387
  }
312
1388
  /* vim: set et cin sw=4 ts=4: */