amatch 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +5 -2
- data/InstalledFiles +5 -0
- data/README.en +1 -1
- data/Rakefile +67 -58
- data/VERSION +1 -1
- data/bin/agrep.rb +65 -52
- data/config.save +12 -0
- data/ext/amatch.bundle +0 -0
- data/ext/amatch.c +1301 -225
- data/ext/extconf.rb +6 -1
- data/ext/pair.c +78 -0
- data/ext/pair.h +29 -0
- data/ext/tags +24 -0
- data/tests/runner.rb +26 -0
- data/tests/test_hamming.rb +54 -0
- data/tests/test_levenshtein.rb +74 -0
- data/tests/test_longest_subsequence.rb +57 -0
- data/tests/test_longest_substring.rb +57 -0
- data/tests/test_pair_distance.rb +81 -0
- data/tests/test_sellers.rb +94 -0
- metadata +26 -8
- data/amatch.txt.en +0 -117
- data/tests/test.rb +0 -94
data/CHANGES
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
-
2005-01
|
2
|
-
*
|
1
|
+
2005-06-01 (0.2.0)
|
2
|
+
* Major changes in API and implementation:
|
3
|
+
Now the Levenshtein edit distance, Sellers edit distance, the Hamming
|
4
|
+
distance, the longest common subsequence length, the longest common
|
5
|
+
substring length, and the pair distance metric can be computed.
|
3
6
|
2005-01-20 (0.1.4)
|
4
7
|
* Better argument handling in initialization method
|
5
8
|
* Minor changes in Rakefile and README.en
|
data/InstalledFiles
ADDED
data/README.en
CHANGED
data/Rakefile
CHANGED
@@ -3,101 +3,110 @@
|
|
3
3
|
require 'rake/clean'
|
4
4
|
require 'rake/testtask'
|
5
5
|
require 'rake/gempackagetask'
|
6
|
+
require 'rake/rdoctask'
|
6
7
|
require 'rbconfig'
|
7
8
|
|
8
9
|
include Config
|
9
10
|
|
10
|
-
PKG_NAME = 'amatch'
|
11
11
|
PKG_VERSION = File.read('VERSION').chomp
|
12
|
-
PKG_FILES
|
13
|
-
|
14
|
-
|
12
|
+
PKG_FILES = FileList['**/*']
|
13
|
+
PKG_FILES.exclude(/CVS/)
|
14
|
+
PKG_FILES.exclude(/^pkg/)
|
15
|
+
PKG_FILES.exclude(/^doc/)
|
15
16
|
|
16
|
-
task :default =>
|
17
|
+
task :default => :test
|
17
18
|
|
18
19
|
desc "Run unit tests"
|
19
|
-
task(:test => [
|
20
|
-
|
20
|
+
task(:test => [:compile]) do
|
21
|
+
cd 'tests' do
|
22
|
+
ruby %{-I../ext runner.rb}
|
23
|
+
end
|
21
24
|
end
|
22
25
|
|
23
26
|
desc "Compiling library"
|
24
27
|
task :compile do
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
cd 'ext' do
|
29
|
+
ruby %{extconf.rb}
|
30
|
+
sh "make"
|
31
|
+
end
|
29
32
|
end
|
30
33
|
|
31
34
|
desc "Installing library"
|
32
|
-
task
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
35
|
+
task :install => :test do
|
36
|
+
src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
|
37
|
+
filename = File.basename(src)
|
38
|
+
dst = File.join(CONFIG["sitelibdir"], filename)
|
39
|
+
install(src, dst, :verbose => true)
|
37
40
|
end
|
38
41
|
|
42
|
+
desc "Removing generated files"
|
39
43
|
task :clean do
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
+
cd 'ext' do
|
45
|
+
ruby 'extconf.rb'
|
46
|
+
sh "make distclean" if File.exist?('Makefile')
|
47
|
+
end
|
44
48
|
end
|
45
49
|
|
46
|
-
|
47
|
-
|
48
|
-
|
50
|
+
Rake::RDocTask.new do |rd|
|
51
|
+
rd.main = 'Amatch'
|
52
|
+
rd.rdoc_files.include("ext/amatch.c")
|
53
|
+
rd.rdoc_dir = 'doc'
|
54
|
+
end
|
49
55
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
56
|
+
spec = Gem::Specification.new do |s|
|
57
|
+
#### Basic information.
|
58
|
+
|
59
|
+
s.name = 'amatch'
|
60
|
+
s.version = PKG_VERSION
|
61
|
+
s.summary = "Approximate String Matching library"
|
62
|
+
s.description = <<EOF
|
63
|
+
Amatch is a library for approximate string matching and searching in strings.
|
64
|
+
Several algorithms can be used to do this, and it's also possible to compute a
|
65
|
+
similarity metric number between 0.0 and 1.0 for two given strings.
|
57
66
|
EOF
|
58
67
|
|
59
|
-
|
68
|
+
#### Dependencies and requirements.
|
60
69
|
|
61
|
-
|
62
|
-
|
70
|
+
#s.add_dependency('log4r', '> 1.0.4')
|
71
|
+
#s.requirements << ""
|
63
72
|
|
64
|
-
|
73
|
+
s.files = PKG_FILES
|
65
74
|
|
66
|
-
|
75
|
+
#### C code extensions.
|
67
76
|
|
68
|
-
|
77
|
+
s.extensions << "ext/extconf.rb"
|
69
78
|
|
70
|
-
|
79
|
+
#### Load-time details: library and application (you will need one or both).
|
71
80
|
|
72
|
-
|
73
|
-
|
81
|
+
s.require_path = 'ext' # Use these for libraries.
|
82
|
+
s.autorequire = 'amatch'
|
74
83
|
|
75
|
-
|
76
|
-
|
77
|
-
|
84
|
+
s.bindir = "bin" # Use these for applications.
|
85
|
+
s.executables = ["agrep.rb"]
|
86
|
+
s.default_executable = "agrep.rb"
|
78
87
|
|
79
|
-
|
88
|
+
#### Documentation and testing.
|
80
89
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
90
|
+
s.has_rdoc = true
|
91
|
+
#s.extra_rdoc_files = FileList['ext/amatch.c']
|
92
|
+
s.rdoc_options <<
|
93
|
+
'--title' << 'Amatch -- Approximate Matching' <<
|
94
|
+
'--main' << 'Amatch' <<
|
95
|
+
'--line-numbers'
|
96
|
+
s.test_files << 'tests/runner.rb'
|
88
97
|
|
89
|
-
|
98
|
+
#### Author and project details.
|
90
99
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
100
|
+
s.author = "Florian Frank"
|
101
|
+
s.email = "flori@ping.de"
|
102
|
+
s.homepage = "http://amatch.rubyforge.org"
|
103
|
+
s.rubyforge_project = "amatch"
|
95
104
|
end
|
96
105
|
|
97
106
|
Rake::GemPackageTask.new(spec) do |pkg|
|
98
|
-
|
99
|
-
|
107
|
+
pkg.need_tar = true
|
108
|
+
pkg.package_files += PKG_FILES
|
100
109
|
end
|
101
110
|
|
102
111
|
task :release => [ :clean, :package ]
|
103
|
-
|
112
|
+
# vim: set et sw=2 ts=2:
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/bin/agrep.rb
CHANGED
@@ -1,74 +1,87 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
#
|
3
|
-
## $Id: agrep.rb,v 1.
|
3
|
+
## $Id: agrep.rb,v 1.3 2005/04/24 21:11:06 flori Exp $
|
4
4
|
#
|
5
5
|
|
6
6
|
require 'amatch'
|
7
7
|
require 'getoptlong'
|
8
8
|
|
9
9
|
def usage(msg, options)
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
10
|
+
print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
|
11
|
+
options.each do |o|
|
12
|
+
puts " " + o[1] + ", " + o[0] + " " +
|
13
|
+
(o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
|
14
|
+
end
|
15
|
+
puts "\nReport bugs to <flori@ping.de>."
|
16
|
+
exit 0
|
17
|
+
end
|
18
|
+
|
19
|
+
class MyAmatch < Amatch
|
20
|
+
def l_search_relative(strings)
|
21
|
+
if strings.is_a? Array
|
22
|
+
l_search(strings).map { |x| x / pattern.size }
|
23
|
+
else
|
24
|
+
l_search(strings) / pattern.size
|
25
|
+
end
|
26
|
+
end
|
17
27
|
end
|
18
28
|
|
19
29
|
$distance = 1
|
30
|
+
$mode = :l_search
|
20
31
|
begin
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
parser = GetoptLong.new
|
33
|
+
options = [
|
34
|
+
[ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
|
35
|
+
[ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
|
36
|
+
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
|
37
|
+
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
38
|
+
]
|
39
|
+
parser.set_options(*options)
|
40
|
+
parser.each_option do |name, arg|
|
41
|
+
name = name.sub(/^--/, '')
|
42
|
+
case name
|
43
|
+
when 'distance'
|
44
|
+
$distance = arg.to_f
|
45
|
+
when 'relative'
|
46
|
+
$mode = :l_search_relative
|
47
|
+
when 'verbose'
|
48
|
+
$verbose = 1
|
49
|
+
when 'help'
|
50
|
+
usage('You\'ve asked for it!', options)
|
51
|
+
end
|
52
|
+
end
|
42
53
|
rescue
|
43
|
-
|
54
|
+
exit 1
|
44
55
|
end
|
45
|
-
|
56
|
+
pattern = ARGV.shift or usage('Pattern needed!', options)
|
46
57
|
|
47
|
-
matcher =
|
58
|
+
matcher = MyAmatch.new(pattern)
|
48
59
|
size = 0
|
49
60
|
start = Time.new
|
50
61
|
if ARGV.size > 0 then
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
ARGV.each do |filename|
|
63
|
+
File.stat(filename).file? or next
|
64
|
+
size += File.size(filename)
|
65
|
+
begin
|
66
|
+
File.open(filename, 'r').each_line do |line|
|
67
|
+
if matcher.__send__($mode, line) < $distance
|
68
|
+
puts "#{filename}:#{line}"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
rescue
|
72
|
+
STDERR.print "Failure at #{filename}: #{$!} => Skipping!\n"
|
73
|
+
end
|
74
|
+
end
|
64
75
|
else
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
76
|
+
STDIN.each_line do |line|
|
77
|
+
size += line.size
|
78
|
+
if matcher.__send__($mode, line) <= $distance
|
79
|
+
puts line
|
80
|
+
end
|
81
|
+
end
|
70
82
|
end
|
71
83
|
time = Time.new - start
|
72
|
-
$verbose and
|
73
|
-
|
84
|
+
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
|
85
|
+
time, size / time / 1024
|
74
86
|
exit 0
|
87
|
+
# vim: set et sw=2 ts=2:
|
data/config.save
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
bin-dir=$prefix/bin
|
2
|
+
site-ruby=$prefix/lib/ruby/site_ruby/1.8
|
3
|
+
prefix=/usr/local/stow/ruby
|
4
|
+
ruby-path=/usr/local/stow/ruby/bin/ruby
|
5
|
+
make-prog=make
|
6
|
+
rb-dir=$site-ruby
|
7
|
+
without-ext=no
|
8
|
+
ruby-prog=/usr/local/stow/ruby/bin/ruby
|
9
|
+
site-ruby-common=$prefix/lib/ruby/site_ruby
|
10
|
+
std-ruby=$prefix/lib/ruby/1.8
|
11
|
+
data-dir=$prefix/share
|
12
|
+
so-dir=$prefix/lib/ruby/site_ruby/1.8/i686-linux
|
data/ext/amatch.bundle
ADDED
Binary file
|
data/ext/amatch.c
CHANGED
@@ -1,312 +1,1388 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "pair.h"
|
2
3
|
|
3
|
-
|
4
|
+
/*
|
5
|
+
* Document-method: pattern
|
6
|
+
*
|
7
|
+
* call-seq: pattern -> pattern string
|
8
|
+
*
|
9
|
+
* Returns the current pattern string of this instance.
|
10
|
+
*/
|
4
11
|
|
5
12
|
/*
|
6
|
-
*
|
13
|
+
* Document-method: pattern=
|
14
|
+
*
|
15
|
+
* call-seq: pattern=(pattern)
|
16
|
+
*
|
17
|
+
* Sets the current pattern string of this instance to <code>pattern</code>.
|
7
18
|
*/
|
8
19
|
|
9
|
-
typedef struct {
|
10
|
-
int *ptr;
|
11
|
-
int len;
|
12
|
-
} vector;
|
13
20
|
|
14
|
-
static
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
21
|
+
static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
22
|
+
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
|
23
|
+
|
24
|
+
static ID id_split, id_to_f;
|
25
|
+
|
26
|
+
#define GET_STRUCT(klass) \
|
27
|
+
klass *amatch; \
|
28
|
+
Data_Get_Struct(self, klass, amatch);
|
29
|
+
|
30
|
+
#define DEF_ALLOCATOR(type) \
|
31
|
+
static type *type##_allocate() \
|
32
|
+
{ \
|
33
|
+
type *obj = ALLOC(type); \
|
34
|
+
MEMZERO(obj, type, 1); \
|
35
|
+
return obj; \
|
36
|
+
}
|
37
|
+
|
38
|
+
#define DEF_CONSTRUCTOR(klass, type) \
|
39
|
+
static VALUE rb_##klass##_s_allocate(VALUE klass2) \
|
40
|
+
{ \
|
41
|
+
type *amatch = type##_allocate(); \
|
42
|
+
return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
|
43
|
+
} \
|
44
|
+
VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
|
45
|
+
{ \
|
46
|
+
VALUE obj = rb_##klass##_s_allocate(klass2); \
|
47
|
+
rb_##klass##_initialize(obj, pattern); \
|
48
|
+
return obj; \
|
49
|
+
}
|
50
|
+
|
51
|
+
#define DEF_RB_FREE(klass, type) \
|
52
|
+
static void rb_##klass##_free(type *amatch) \
|
53
|
+
{ \
|
54
|
+
MEMZERO(amatch->pattern, char, amatch->pattern_len); \
|
55
|
+
free(amatch->pattern); \
|
56
|
+
MEMZERO(amatch, type, 1); \
|
57
|
+
free(amatch); \
|
58
|
+
}
|
59
|
+
|
60
|
+
#define DEF_PATTERN_ACCESSOR(type) \
|
61
|
+
static void type##_pattern_set(type *amatch, VALUE pattern) \
|
62
|
+
{ \
|
63
|
+
Check_Type(pattern, T_STRING); \
|
64
|
+
free(amatch->pattern); \
|
65
|
+
amatch->pattern_len = RSTRING(pattern)->len; \
|
66
|
+
amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
|
67
|
+
MEMCPY(amatch->pattern, RSTRING(pattern)->ptr, char, \
|
68
|
+
RSTRING(pattern)->len); \
|
69
|
+
} \
|
70
|
+
static VALUE rb_##type##_pattern(VALUE self) \
|
71
|
+
{ \
|
72
|
+
GET_STRUCT(type) \
|
73
|
+
return rb_str_new(amatch->pattern, amatch->pattern_len); \
|
74
|
+
} \
|
75
|
+
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
|
76
|
+
{ \
|
77
|
+
GET_STRUCT(type) \
|
78
|
+
type##_pattern_set(amatch, pattern); \
|
79
|
+
return Qnil; \
|
80
|
+
}
|
81
|
+
|
82
|
+
#define DEF_ITERATE_STRINGS(type) \
|
83
|
+
static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
|
84
|
+
VALUE (*match_function) (type *amatch, VALUE strings)) \
|
85
|
+
{ \
|
86
|
+
if (TYPE(strings) == T_STRING) { \
|
87
|
+
return match_function(amatch, strings); \
|
88
|
+
} else { \
|
89
|
+
Check_Type(strings, T_ARRAY); \
|
90
|
+
int i; \
|
91
|
+
VALUE result = rb_ary_new2(RARRAY(strings)->len); \
|
92
|
+
for (i = 0; i < RARRAY(strings)->len; i++) { \
|
93
|
+
VALUE string = rb_ary_entry(strings, i); \
|
94
|
+
if (TYPE(string) != T_STRING) { \
|
95
|
+
rb_raise(rb_eTypeError, \
|
96
|
+
"array has to contain only strings (%s given)", \
|
97
|
+
NIL_P(string) ? \
|
98
|
+
"NilClass" : \
|
99
|
+
rb_class2name(CLASS_OF(string))); \
|
100
|
+
} \
|
101
|
+
rb_ary_push(result, match_function(amatch, string)); \
|
102
|
+
} \
|
103
|
+
return result; \
|
104
|
+
} \
|
105
|
+
}
|
106
|
+
|
107
|
+
#define DEF_RB_READER(type, function, name, converter) \
|
108
|
+
VALUE function(VALUE self) \
|
109
|
+
{ \
|
110
|
+
GET_STRUCT(type) \
|
111
|
+
return converter(amatch->name); \
|
112
|
+
}
|
113
|
+
|
114
|
+
#define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
|
115
|
+
VALUE function(VALUE self, VALUE value) \
|
116
|
+
{ \
|
117
|
+
vtype value_ ## vtype; \
|
118
|
+
GET_STRUCT(type) \
|
119
|
+
caster(value); \
|
120
|
+
value_ ## vtype = converter(value); \
|
121
|
+
if (!(value_ ## vtype check)) \
|
122
|
+
rb_raise(rb_eTypeError, "check of value " #check " failed"); \
|
123
|
+
amatch->name = value_ ## vtype; \
|
124
|
+
return Qnil; \
|
125
|
+
}
|
126
|
+
|
127
|
+
|
128
|
+
#define CAST2FLOAT(obj) \
|
129
|
+
if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
|
130
|
+
obj = rb_funcall(obj, id_to_f, 0, 0); \
|
131
|
+
else \
|
132
|
+
Check_Type(obj, T_FLOAT)
|
133
|
+
#define FLOAT2C(obj) RFLOAT(obj)->value
|
134
|
+
|
135
|
+
#define OPTIMIZE_TIME \
|
136
|
+
if (amatch->pattern_len < RSTRING(string)->len) { \
|
137
|
+
a_ptr = amatch->pattern; \
|
138
|
+
a_len = amatch->pattern_len; \
|
139
|
+
b_ptr = RSTRING(string)->ptr; \
|
140
|
+
b_len = RSTRING(string)->len; \
|
141
|
+
} else { \
|
142
|
+
a_ptr = RSTRING(string)->ptr; \
|
143
|
+
a_len = RSTRING(string)->len; \
|
144
|
+
b_ptr = amatch->pattern; \
|
145
|
+
b_len = amatch->pattern_len; \
|
146
|
+
}
|
147
|
+
|
148
|
+
#define DONT_OPTIMIZE \
|
149
|
+
a_ptr = amatch->pattern; \
|
150
|
+
a_len = amatch->pattern_len; \
|
151
|
+
b_ptr = RSTRING(string)->ptr; \
|
152
|
+
b_len = RSTRING(string)->len; \
|
153
|
+
|
154
|
+
/*
|
155
|
+
* C structures of the Amatch classes
|
156
|
+
*/
|
157
|
+
|
158
|
+
typedef struct GeneralStruct {
|
159
|
+
char *pattern;
|
160
|
+
char pattern_len;
|
161
|
+
} General;
|
162
|
+
|
163
|
+
DEF_ALLOCATOR(General)
|
164
|
+
DEF_PATTERN_ACCESSOR(General)
|
165
|
+
DEF_ITERATE_STRINGS(General)
|
166
|
+
|
167
|
+
typedef struct SellersStruct {
|
168
|
+
char *pattern;
|
169
|
+
char pattern_len;
|
170
|
+
double substitution;
|
171
|
+
double deletion;
|
172
|
+
double insertion;
|
173
|
+
} Sellers;
|
174
|
+
|
175
|
+
DEF_ALLOCATOR(Sellers)
|
176
|
+
DEF_PATTERN_ACCESSOR(Sellers)
|
177
|
+
DEF_ITERATE_STRINGS(Sellers)
|
178
|
+
|
179
|
+
static void Sellers_reset_weights(Sellers *self)
|
30
180
|
{
|
31
|
-
|
32
|
-
|
33
|
-
|
181
|
+
self->substitution = 1.0;
|
182
|
+
self->deletion = 1.0;
|
183
|
+
self->insertion = 1.0;
|
34
184
|
}
|
35
185
|
|
36
|
-
|
37
|
-
|
38
|
-
|
186
|
+
typedef struct PairDistanceStruct {
|
187
|
+
char *pattern;
|
188
|
+
char pattern_len;
|
189
|
+
PairArray *pattern_pair_array;
|
190
|
+
} PairDistance;
|
191
|
+
|
192
|
+
DEF_ALLOCATOR(PairDistance)
|
193
|
+
DEF_PATTERN_ACCESSOR(PairDistance)
|
194
|
+
|
195
|
+
/*
|
196
|
+
* Levenshtein edit distances are computed here:
|
197
|
+
*/
|
198
|
+
|
199
|
+
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
200
|
+
for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
|
201
|
+
c = i % 2; /* current row */ \
|
202
|
+
p = (i + 1) % 2; /* previous row */ \
|
203
|
+
v[c][0] = i; /* first column */ \
|
204
|
+
for (j = 1; j <= b_len; j++) { \
|
205
|
+
/* Bellman's principle of optimality: */ \
|
206
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
207
|
+
if (weight > v[p][j] + 1) { \
|
208
|
+
weight = v[p][j] + 1; \
|
209
|
+
} \
|
210
|
+
if (weight > v[c][j - 1] + 1) { \
|
211
|
+
weight = v[c][j - 1] + 1; \
|
212
|
+
} \
|
213
|
+
v[c][j] = weight; \
|
214
|
+
} \
|
215
|
+
p = c; \
|
216
|
+
c = (c + 1) % 2; \
|
217
|
+
}
|
218
|
+
|
219
|
+
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
39
220
|
{
|
40
|
-
|
41
|
-
|
221
|
+
VALUE result;
|
222
|
+
char *a_ptr, *b_ptr;
|
223
|
+
int a_len, b_len;
|
224
|
+
int *v[2], weight;
|
225
|
+
int i, j, c, p;
|
226
|
+
|
227
|
+
Check_Type(string, T_STRING);
|
228
|
+
DONT_OPTIMIZE
|
229
|
+
|
230
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
231
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
232
|
+
for (i = 0; i <= b_len; i++) {
|
233
|
+
v[0][i] = i;
|
234
|
+
v[1][i] = i;
|
235
|
+
}
|
236
|
+
|
237
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
238
|
+
|
239
|
+
result = INT2FIX(v[p][b_len]);
|
240
|
+
|
241
|
+
free(v[0]);
|
242
|
+
free(v[1]);
|
243
|
+
|
244
|
+
return result;
|
42
245
|
}
|
43
246
|
|
44
|
-
static
|
45
|
-
vector_minimum(v)
|
46
|
-
vector *v;
|
247
|
+
static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
47
248
|
{
|
48
|
-
|
49
|
-
|
249
|
+
VALUE result;
|
250
|
+
char *a_ptr, *b_ptr;
|
251
|
+
int a_len, b_len;
|
252
|
+
int *v[2], weight;
|
253
|
+
int i, j, c, p;
|
50
254
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
255
|
+
Check_Type(string, T_STRING);
|
256
|
+
DONT_OPTIMIZE
|
257
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
258
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
259
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
260
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
261
|
+
for (i = 0; i <= b_len; i++) {
|
262
|
+
v[0][i] = i;
|
263
|
+
v[1][i] = i;
|
264
|
+
}
|
265
|
+
|
266
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
267
|
+
|
268
|
+
if (b_len > a_len) {
|
269
|
+
result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
270
|
+
} else {
|
271
|
+
result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
55
272
|
}
|
56
|
-
|
273
|
+
free(v[0]);
|
274
|
+
free(v[1]);
|
275
|
+
return result;
|
57
276
|
}
|
58
277
|
|
59
|
-
static
|
60
|
-
vector_last(v)
|
61
|
-
vector *v;
|
278
|
+
static VALUE Levenshtein_search(General *amatch, VALUE string)
|
62
279
|
{
|
63
|
-
|
280
|
+
VALUE result;
|
281
|
+
char *a_ptr, *b_ptr;
|
282
|
+
int a_len, b_len;
|
283
|
+
int *v[2], weight, min;
|
284
|
+
int i, j, c, p;
|
285
|
+
|
286
|
+
Check_Type(string, T_STRING);
|
287
|
+
DONT_OPTIMIZE
|
288
|
+
|
289
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
290
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
291
|
+
MEMZERO(v[0], int, b_len + 1);
|
292
|
+
MEMZERO(v[1], int, b_len + 1);
|
293
|
+
|
294
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
295
|
+
|
296
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
297
|
+
if (v[p][i] < min) min = v[p][i];
|
298
|
+
}
|
299
|
+
|
300
|
+
result = INT2FIX(min);
|
301
|
+
|
302
|
+
free(v[0]);
|
303
|
+
free(v[1]);
|
304
|
+
|
305
|
+
return result;
|
64
306
|
}
|
65
307
|
|
308
|
+
|
66
309
|
/*
|
67
|
-
*
|
310
|
+
* Sellers edit distances are computed here:
|
68
311
|
*/
|
69
312
|
|
70
|
-
|
313
|
+
#define COMPUTE_SELLERS_DISTANCE \
|
314
|
+
for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
|
315
|
+
c = i % 2; /* current row */ \
|
316
|
+
p = (i + 1) % 2; /* previous row */ \
|
317
|
+
v[c][0] = i * amatch->deletion; /* first column */ \
|
318
|
+
for (j = 1; j <= b_len; j++) { \
|
319
|
+
/* Bellman's principle of optimality: */ \
|
320
|
+
weight = v[p][j - 1] + \
|
321
|
+
(a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
|
322
|
+
if (weight > v[p][j] + amatch->insertion) { \
|
323
|
+
weight = v[p][j] + amatch->insertion; \
|
324
|
+
} \
|
325
|
+
if (weight > v[c][j - 1] + amatch->deletion) { \
|
326
|
+
weight = v[c][j - 1] + amatch->deletion; \
|
327
|
+
} \
|
328
|
+
v[c][j] = weight; \
|
329
|
+
} \
|
330
|
+
p = c; \
|
331
|
+
c = (c + 1) % 2; \
|
332
|
+
}
|
71
333
|
|
72
|
-
static
|
73
|
-
VALUE weight;
|
74
|
-
char *name;
|
334
|
+
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
75
335
|
{
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
336
|
+
VALUE result;
|
337
|
+
char *a_ptr, *b_ptr;
|
338
|
+
int a_len, b_len;
|
339
|
+
double *v[2], weight;
|
340
|
+
int i, j, c, p;
|
341
|
+
|
342
|
+
Check_Type(string, T_STRING);
|
343
|
+
DONT_OPTIMIZE
|
344
|
+
|
345
|
+
v[0] = ALLOC_N(double, b_len + 1);
|
346
|
+
v[1] = ALLOC_N(double, b_len + 1);
|
347
|
+
for (i = 0; i <= b_len; i++) {
|
348
|
+
v[0][i] = i * amatch->deletion;
|
349
|
+
v[1][i] = i * amatch->deletion;
|
80
350
|
}
|
81
|
-
|
351
|
+
|
352
|
+
COMPUTE_SELLERS_DISTANCE
|
353
|
+
|
354
|
+
result = rb_float_new(v[p][b_len]);
|
355
|
+
free(v[0]);
|
356
|
+
free(v[1]);
|
357
|
+
return result;
|
82
358
|
}
|
83
359
|
|
84
|
-
static VALUE
|
85
|
-
calculate_distance (self, string, mode)
|
86
|
-
VALUE self;
|
87
|
-
VALUE string;
|
88
|
-
char mode;
|
360
|
+
static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
89
361
|
{
|
90
|
-
VALUE
|
91
|
-
|
92
|
-
int
|
93
|
-
|
94
|
-
|
95
|
-
int weight, sw, dw, iw, i, j, tmpi;
|
96
|
-
int c = 0, p = 1;
|
362
|
+
VALUE result;
|
363
|
+
char *a_ptr, *b_ptr;
|
364
|
+
int a_len, b_len;
|
365
|
+
double *v[2], weight, max_weight;
|
366
|
+
int i, j, c, p;
|
97
367
|
|
368
|
+
if (amatch->insertion >= amatch->deletion) {
|
369
|
+
if (amatch->substitution >= amatch->insertion) {
|
370
|
+
max_weight = amatch->substitution;
|
371
|
+
} else {
|
372
|
+
max_weight = amatch->insertion;
|
373
|
+
}
|
374
|
+
} else {
|
375
|
+
if (amatch->substitution >= amatch->deletion) {
|
376
|
+
max_weight = amatch->substitution;
|
377
|
+
} else {
|
378
|
+
max_weight = amatch->deletion;
|
379
|
+
}
|
380
|
+
}
|
381
|
+
|
98
382
|
Check_Type(string, T_STRING);
|
99
|
-
|
100
|
-
|
383
|
+
DONT_OPTIMIZE
|
384
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
385
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
386
|
+
v[0] = ALLOC_N(double, b_len + 1);
|
387
|
+
v[1] = ALLOC_N(double, b_len + 1);
|
388
|
+
for (i = 0; i <= b_len; i++) {
|
389
|
+
v[0][i] = i * amatch->deletion;
|
390
|
+
v[1][i] = i * amatch->deletion;
|
391
|
+
}
|
101
392
|
|
102
|
-
|
103
|
-
Check_Type(pattern, T_STRING);
|
104
|
-
pattern_ptr = RSTRING(pattern)->ptr;
|
105
|
-
pattern_len = RSTRING(pattern)->len;
|
393
|
+
COMPUTE_SELLERS_DISTANCE
|
106
394
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
v[0] = vector_new(string_len);
|
112
|
-
switch (mode) {
|
113
|
-
case MATCH:
|
114
|
-
case MATCHR:
|
115
|
-
case COMPARE:
|
116
|
-
case COMPARER:
|
117
|
-
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
|
118
|
-
break;
|
119
|
-
case SEARCH:
|
120
|
-
case SEARCHR:
|
121
|
-
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
|
122
|
-
break;
|
123
|
-
default:
|
124
|
-
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
395
|
+
if (b_len > a_len) {
|
396
|
+
result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
|
397
|
+
} else {
|
398
|
+
result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
|
125
399
|
}
|
400
|
+
free(v[0]);
|
401
|
+
free(v[1]);
|
402
|
+
return result;
|
403
|
+
}
|
404
|
+
|
405
|
+
static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
406
|
+
{
|
407
|
+
VALUE result;
|
408
|
+
char *a_ptr, *b_ptr;
|
409
|
+
int a_len, b_len;
|
410
|
+
double *v[2], weight, min;
|
411
|
+
int i, j, c, p;
|
412
|
+
|
413
|
+
Check_Type(string, T_STRING);
|
414
|
+
DONT_OPTIMIZE
|
415
|
+
|
416
|
+
v[0] = ALLOC_N(double, b_len + 1);
|
417
|
+
v[1] = ALLOC_N(double, b_len + 1);
|
418
|
+
MEMZERO(v[0], double, b_len + 1);
|
419
|
+
MEMZERO(v[1], double, b_len + 1);
|
420
|
+
|
421
|
+
COMPUTE_SELLERS_DISTANCE
|
422
|
+
|
423
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
424
|
+
if (v[p][i] < min) min = v[p][i];
|
425
|
+
}
|
426
|
+
result = rb_float_new(min);
|
427
|
+
free(v[0]);
|
428
|
+
free(v[1]);
|
429
|
+
|
430
|
+
return result;
|
431
|
+
}
|
432
|
+
|
433
|
+
/*
|
434
|
+
* Pair distances are computed here:
|
435
|
+
*/
|
126
436
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
437
|
+
static VALUE PairDistance_match(
|
438
|
+
PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
439
|
+
{
|
440
|
+
double result;
|
441
|
+
VALUE tokens;
|
442
|
+
PairArray *pair_array;
|
443
|
+
|
444
|
+
Check_Type(string, T_STRING);
|
445
|
+
if (!NIL_P(regexp) || use_regexp) {
|
446
|
+
tokens = rb_funcall(
|
447
|
+
rb_str_new(amatch->pattern, amatch->pattern_len),
|
448
|
+
id_split, 1, regexp
|
449
|
+
);
|
450
|
+
if (!amatch->pattern_pair_array) {
|
451
|
+
amatch->pattern_pair_array = PairArray_new(tokens);
|
452
|
+
} else {
|
453
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
454
|
+
}
|
455
|
+
tokens = rb_funcall(string, id_split, 1, regexp);
|
456
|
+
pair_array = PairArray_new(tokens);
|
457
|
+
} else {
|
458
|
+
VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
|
459
|
+
tokens = rb_ary_new4(1, &tmp);
|
460
|
+
if (!amatch->pattern_pair_array) {
|
461
|
+
amatch->pattern_pair_array = PairArray_new(tokens);
|
462
|
+
} else {
|
463
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
139
464
|
}
|
465
|
+
tokens = rb_ary_new4(1, &string);
|
466
|
+
pair_array = PairArray_new(tokens);
|
140
467
|
}
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
result = INT2FIX((string_len < pattern_len ? -1 : 1) *
|
158
|
-
vector_last(v[c]));
|
159
|
-
break;
|
160
|
-
case COMPARER:
|
161
|
-
result = rb_float_new((double)
|
162
|
-
(string_len < pattern_len ? -1 : 1) *
|
163
|
-
vector_last(v[c]) / pattern_len);
|
164
|
-
break;
|
165
|
-
default:
|
166
|
-
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
468
|
+
result = pair_array_match(amatch->pattern_pair_array, pair_array);
|
469
|
+
pair_array_destroy(pair_array);
|
470
|
+
return rb_float_new(result);
|
471
|
+
}
|
472
|
+
|
473
|
+
/*
|
474
|
+
* Hamming distances are computed here:
|
475
|
+
*/
|
476
|
+
|
477
|
+
#define COMPUTE_HAMMING_DISTANCE \
|
478
|
+
for (i = 0, result = b_len - a_len; i < a_len; i++) { \
|
479
|
+
if (i >= b_len) { \
|
480
|
+
result += a_len - b_len; \
|
481
|
+
break; \
|
482
|
+
} \
|
483
|
+
if (b_ptr[i] != a_ptr[i]) result++; \
|
167
484
|
}
|
168
|
-
|
169
|
-
|
170
|
-
|
485
|
+
|
486
|
+
static VALUE Hamming_match(General *amatch, VALUE string)
|
487
|
+
{
|
488
|
+
char *a_ptr, *b_ptr;
|
489
|
+
int a_len, b_len;
|
490
|
+
int i, result;
|
491
|
+
|
492
|
+
Check_Type(string, T_STRING);
|
493
|
+
OPTIMIZE_TIME
|
494
|
+
COMPUTE_HAMMING_DISTANCE
|
495
|
+
return INT2FIX(result);
|
496
|
+
}
|
497
|
+
|
498
|
+
static VALUE Hamming_similar(General *amatch, VALUE string)
|
499
|
+
{
|
500
|
+
char *a_ptr, *b_ptr;
|
501
|
+
int a_len, b_len;
|
502
|
+
int i, result;
|
503
|
+
|
504
|
+
Check_Type(string, T_STRING);
|
505
|
+
OPTIMIZE_TIME
|
506
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
507
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
508
|
+
COMPUTE_HAMMING_DISTANCE
|
509
|
+
return rb_float_new(1.0 - ((double) result) / b_len);
|
510
|
+
}
|
511
|
+
|
512
|
+
/*
|
513
|
+
* Longest Common Subsequence computation
|
514
|
+
*/
|
515
|
+
|
516
|
+
#define COMPUTE_LONGEST_SUBSEQUENCE \
|
517
|
+
l[0] = ALLOC_N(int, b_len + 1); \
|
518
|
+
l[1] = ALLOC_N(int, b_len + 1); \
|
519
|
+
for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
|
520
|
+
for (j = b_len; j >= 0; j--) { \
|
521
|
+
if (i == a_len || j == b_len) { \
|
522
|
+
l[c][j] = 0; \
|
523
|
+
} else if (a_ptr[i] == b_ptr[j]) { \
|
524
|
+
l[c][j] = 1 + l[p][j + 1]; \
|
525
|
+
} else { \
|
526
|
+
int x = l[p][j], y = l[c][j + 1]; \
|
527
|
+
if (x > y) l[c][j] = x; else l[c][j] = y; \
|
528
|
+
} \
|
529
|
+
} \
|
530
|
+
p = c; \
|
531
|
+
c = (c + 1) % 2; \
|
532
|
+
} \
|
533
|
+
result = l[p][0]; \
|
534
|
+
free(l[0]); \
|
535
|
+
free(l[1]);
|
536
|
+
|
537
|
+
|
538
|
+
static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
539
|
+
{
|
540
|
+
char *a_ptr, *b_ptr;
|
541
|
+
int a_len, b_len;
|
542
|
+
int result, c, p, i, j, *l[2];
|
543
|
+
|
544
|
+
Check_Type(string, T_STRING);
|
545
|
+
OPTIMIZE_TIME
|
546
|
+
|
547
|
+
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
548
|
+
COMPUTE_LONGEST_SUBSEQUENCE
|
549
|
+
return INT2FIX(result);
|
550
|
+
}
|
551
|
+
|
552
|
+
static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
553
|
+
{
|
554
|
+
char *a_ptr, *b_ptr;
|
555
|
+
int a_len, b_len;
|
556
|
+
int result, c, p, i, j, *l[2];
|
557
|
+
|
558
|
+
Check_Type(string, T_STRING);
|
559
|
+
OPTIMIZE_TIME
|
560
|
+
|
561
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
562
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
563
|
+
COMPUTE_LONGEST_SUBSEQUENCE
|
564
|
+
return rb_float_new(((double) result) / b_len);
|
565
|
+
}
|
566
|
+
|
567
|
+
/*
|
568
|
+
* Longest Common Substring computation
|
569
|
+
*/
|
570
|
+
|
571
|
+
#define COMPUTE_LONGEST_SUBSTRING \
|
572
|
+
l[0] = ALLOC_N(int, b_len); \
|
573
|
+
MEMZERO(l[0], int, b_len); \
|
574
|
+
l[1] = ALLOC_N(int, b_len); \
|
575
|
+
MEMZERO(l[1], int, b_len); \
|
576
|
+
result = 0; \
|
577
|
+
for (i = 0, c = 0, p = 1; i < a_len; i++) { \
|
578
|
+
for (j = 0; j < b_len; j++) { \
|
579
|
+
if (a_ptr[i] == b_ptr[j]) { \
|
580
|
+
l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
|
581
|
+
if (l[c][j] > result) result = l[c][j]; \
|
582
|
+
} else { \
|
583
|
+
l[c][j] = 0; \
|
584
|
+
} \
|
585
|
+
} \
|
586
|
+
p = c; \
|
587
|
+
c = (c + 1) % 2; \
|
588
|
+
} \
|
589
|
+
free(l[0]); \
|
590
|
+
free(l[1]);
|
591
|
+
|
592
|
+
static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
593
|
+
{
|
594
|
+
char *a_ptr, *b_ptr;
|
595
|
+
int a_len, b_len;
|
596
|
+
int result, c, p, i, j, *l[2];
|
597
|
+
|
598
|
+
Check_Type(string, T_STRING);
|
599
|
+
OPTIMIZE_TIME
|
600
|
+
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
601
|
+
COMPUTE_LONGEST_SUBSTRING
|
602
|
+
return INT2FIX(result);
|
603
|
+
}
|
604
|
+
|
605
|
+
static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
606
|
+
{
|
607
|
+
char *a_ptr, *b_ptr;
|
608
|
+
int a_len, b_len;
|
609
|
+
int result, c, p, i, j, *l[2];
|
610
|
+
|
611
|
+
Check_Type(string, T_STRING);
|
612
|
+
OPTIMIZE_TIME
|
613
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
614
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
615
|
+
COMPUTE_LONGEST_SUBSTRING
|
616
|
+
return rb_float_new(((double) result) / b_len);
|
617
|
+
}
|
618
|
+
|
619
|
+
/*
|
620
|
+
* Ruby API
|
621
|
+
*/
|
622
|
+
|
623
|
+
/*
|
624
|
+
* Document-class: Amatch::Levenshtein
|
625
|
+
*
|
626
|
+
* The Levenshtein edit distance is defined as the minimal cost required to
|
627
|
+
* transform one string into another by using three elementary operations:
|
628
|
+
* deletion, insertion and substitution of a character. To transform "water"
|
629
|
+
* into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
|
630
|
+
* -> "n": "winer" and delete "r": "wine". The edit distance between "water"
|
631
|
+
* and "wine" is 3, because you have to apply three operations. The edit
|
632
|
+
* distance between "wine" and "wine" is 0 of course: no operation is
|
633
|
+
* necessary for the transformation -- they're already the same string. It's
|
634
|
+
* easy to see that more similar strings have smaller edit distances than
|
635
|
+
* strings that differ a lot.
|
636
|
+
*/
|
637
|
+
|
638
|
+
DEF_RB_FREE(Levenshtein, General)
|
639
|
+
|
640
|
+
/*
|
641
|
+
* call-seq: new(pattern)
|
642
|
+
*
|
643
|
+
* Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
|
644
|
+
*/
|
645
|
+
static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
|
646
|
+
{
|
647
|
+
GET_STRUCT(General)
|
648
|
+
General_pattern_set(amatch, pattern);
|
649
|
+
return self;
|
650
|
+
}
|
651
|
+
|
652
|
+
DEF_CONSTRUCTOR(Levenshtein, General)
|
653
|
+
|
654
|
+
/*
|
655
|
+
* call-seq: match(strings) -> results
|
656
|
+
*
|
657
|
+
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
658
|
+
* against <code>strings</code>. It returns the number of operations, the Levenshtein
|
659
|
+
* distance. <code>strings</code> has to be either a String or an Array of
|
660
|
+
* Strings. The returned <code>results</code> are either a Fixnum or an Array of
|
661
|
+
* Fixnums respectively.
|
662
|
+
*/
|
663
|
+
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
664
|
+
{
|
665
|
+
GET_STRUCT(General)
|
666
|
+
return General_iterate_strings(amatch, strings, Levenshtein_match);
|
667
|
+
}
|
668
|
+
|
669
|
+
/*
|
670
|
+
* call-seq: similar(strings) -> results
|
671
|
+
*
|
672
|
+
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
673
|
+
* against <code>strings</code>, and compute a Levenshtein distance metric
|
674
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
675
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
676
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
677
|
+
* respectively.
|
678
|
+
*/
|
679
|
+
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
680
|
+
{
|
681
|
+
GET_STRUCT(General)
|
682
|
+
return General_iterate_strings(amatch, strings, Levenshtein_similar);
|
683
|
+
}
|
684
|
+
|
685
|
+
/*
|
686
|
+
* call-seq: levenshtein_similar(strings) -> results
|
687
|
+
*
|
688
|
+
* If called on a String, this string is used as a Amatch::Levenshtein#pattern
|
689
|
+
* to match against <code>strings</code>. It returns a Levenshtein distance
|
690
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
691
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
692
|
+
* Strings. The returned <code>results</code> are either a Float or an Array of
|
693
|
+
* Floats respectively.
|
694
|
+
*/
|
695
|
+
static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
696
|
+
{
|
697
|
+
VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self); /* temporary Amatch::Levenshtein matcher with self as its pattern */
|
698
|
+
return rb_Levenshtein_similar(amatch, strings);
|
699
|
+
}
|
700
|
+
|
701
|
+
/*
|
702
|
+
* call-seq: search(strings) -> results
|
703
|
+
*
|
704
|
+
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
705
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
706
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
707
|
+
* to be either a String or an Array of Strings. The returned
|
708
|
+
* <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
|
709
|
+
*/
|
710
|
+
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
711
|
+
{
|
712
|
+
GET_STRUCT(General)
|
713
|
+
return General_iterate_strings(amatch, strings, Levenshtein_search);
|
714
|
+
}
|
715
|
+
|
716
|
+
/*
|
717
|
+
* Document-class: Amatch::Sellers
|
718
|
+
*
|
719
|
+
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
720
|
+
* The difference is, that you can also specify different weights for every
|
721
|
+
* operation to prefer special operations over others. This extension of the
|
722
|
+
* Levenshtein edit distance is also known under the name Needleman-Wunsch
|
723
|
+
* distance.
|
724
|
+
*/
|
725
|
+
|
726
|
+
DEF_RB_FREE(Sellers, Sellers)
|
727
|
+
|
728
|
+
/*
|
729
|
+
* Document-method: substitution
|
730
|
+
*
|
731
|
+
* call-seq: substitution -> weight
|
732
|
+
*
|
733
|
+
* Returns the weight of the substitution operation, that is used to compute
|
734
|
+
* the Sellers distance.
|
735
|
+
*/
|
736
|
+
DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
|
737
|
+
rb_float_new)
|
738
|
+
|
739
|
+
/*
|
740
|
+
* Document-method: deletion
|
741
|
+
*
|
742
|
+
* call-seq: deletion -> weight
|
743
|
+
*
|
744
|
+
* Returns the weight of the deletion operation, that is used to compute
|
745
|
+
* the Sellers distance.
|
746
|
+
*/
|
747
|
+
DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
|
748
|
+
rb_float_new)
|
749
|
+
|
750
|
+
/*
|
751
|
+
* Document-method: insertion
|
752
|
+
*
|
753
|
+
* call-seq: insertion -> weight
|
754
|
+
*
|
755
|
+
* Returns the weight of the insertion operation, that is used to compute
|
756
|
+
* the Sellers distance.
|
757
|
+
*/
|
758
|
+
DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
|
759
|
+
rb_float_new)
|
760
|
+
|
761
|
+
/*
|
762
|
+
* Document-method: substitution=
|
763
|
+
*
|
764
|
+
* call-seq: substitution=(weight)
|
765
|
+
*
|
766
|
+
* Sets the weight of the substitution operation, that is used to compute
|
767
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
768
|
+
* should be a Float value >= 0.0.
|
769
|
+
*/
|
770
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
|
771
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
772
|
+
|
773
|
+
/*
|
774
|
+
* Document-method: deletion=
|
775
|
+
*
|
776
|
+
* call-seq: deletion=(weight)
|
777
|
+
*
|
778
|
+
* Sets the weight of the deletion operation, that is used to compute
|
779
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
780
|
+
* should be a Float value >= 0.0.
|
781
|
+
*/
|
782
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
|
783
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
784
|
+
|
785
|
+
/*
|
786
|
+
* Document-method: insertion=
|
787
|
+
*
|
788
|
+
* call-seq: insertion=(weight)
|
789
|
+
*
|
790
|
+
* Sets the weight of the insertion operation, that is used to compute
|
791
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
792
|
+
* should be a Float value >= 0.0.
|
793
|
+
*/
|
794
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
|
795
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
796
|
+
|
797
|
+
/*
|
798
|
+
* Resets all weights (substitution, deletion, and insertion) to 1.0.
|
799
|
+
*/
|
800
|
+
static VALUE rb_Sellers_reset_weights(VALUE self)
|
801
|
+
{
|
802
|
+
GET_STRUCT(Sellers)
|
803
|
+
Sellers_reset_weights(amatch);
|
804
|
+
return self;
|
171
805
|
}
|
172
806
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
807
|
+
/*
|
808
|
+
* call-seq: new(pattern)
|
809
|
+
*
|
810
|
+
* Creates a new Amatch::Sellers instance from <code>pattern</code>,
|
811
|
+
* with all weights initially set to 1.0.
|
812
|
+
*/
|
813
|
+
static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
|
178
814
|
{
|
179
|
-
|
815
|
+
GET_STRUCT(Sellers)
|
816
|
+
Sellers_pattern_set(amatch, pattern);
|
817
|
+
Sellers_reset_weights(amatch);
|
818
|
+
return self;
|
819
|
+
}
|
820
|
+
|
821
|
+
DEF_CONSTRUCTOR(Sellers, Sellers)
|
822
|
+
|
823
|
+
/*
|
824
|
+
* Document-method: pattern
|
825
|
+
*
|
826
|
+
* call-seq: pattern -> pattern string
|
827
|
+
*
|
828
|
+
* Returns the current pattern string of this Amatch::Sellers instance.
|
829
|
+
*/
|
830
|
+
|
831
|
+
/*
|
832
|
+
* Document-method: pattern=
|
833
|
+
*
|
834
|
+
* call-seq: pattern=(pattern)
|
835
|
+
*
|
836
|
+
* Sets the current pattern string of this Amatch::Sellers instance to
|
837
|
+
* <code>pattern</code>.
|
838
|
+
*/
|
839
|
+
|
840
|
+
/*
|
841
|
+
* call-seq: match(strings) -> results
|
842
|
+
*
|
843
|
+
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
844
|
+
* <code>strings</code>, while taking into account the given weights. It
|
845
|
+
* returns the number of weighted character operations, the Sellers distance.
|
846
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
847
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
848
|
+
* respectively.
|
849
|
+
*/
|
850
|
+
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
851
|
+
{
|
852
|
+
GET_STRUCT(Sellers)
|
853
|
+
return Sellers_iterate_strings(amatch, strings, Sellers_match);
|
854
|
+
}
|
855
|
+
|
856
|
+
/*
|
857
|
+
* call-seq: similar(strings) -> results
|
858
|
+
*
|
859
|
+
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
860
|
+
* against <code>strings</code> (taking into account the given weights), and
|
861
|
+
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
862
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
863
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
864
|
+
* a Float or an Array of Floats
|
865
|
+
* respectively.
|
866
|
+
*/
|
867
|
+
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
868
|
+
{
|
869
|
+
GET_STRUCT(Sellers)
|
870
|
+
return Sellers_iterate_strings(amatch, strings, Sellers_similar);
|
871
|
+
}
|
872
|
+
|
873
|
+
/*
|
874
|
+
* call-seq: search(strings) -> results
|
875
|
+
*
|
876
|
+
* searches Sellers#pattern in <code>strings</code> and returns the edit
|
877
|
+
* distance (the sum of weighted character operations) as a Float value, by
|
878
|
+
* greedy trimming prefixes or postfixes of the match. <code>strings</code> has
|
879
|
+
* to be either a String or an Array of Strings. The returned
|
880
|
+
* <code>results</code> are either a Float or an Array of Floats respectively.
|
881
|
+
*/
|
882
|
+
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
883
|
+
{
|
884
|
+
GET_STRUCT(Sellers)
|
885
|
+
return Sellers_iterate_strings(amatch, strings, Sellers_search);
|
886
|
+
}
|
887
|
+
|
888
|
+
/*
|
889
|
+
* Document-class: Amatch::PairDistance
|
890
|
+
*
|
891
|
+
* The pair distance between two strings is based on the number of adjacent
|
892
|
+
* character pairs, that are contained in both strings. The similarity
|
893
|
+
* metric of two strings s1 and s2 is
|
894
|
+
* 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
|
895
|
+
* If it is 1.0 the two strings are an exact match, if less than 1.0 they
|
896
|
+
* are more dissimilar. The advantage of considering adjacent characters, is to
|
897
|
+
* take account not only of the characters, but also of the character ordering
|
898
|
+
* in the original strings.
|
899
|
+
*
|
900
|
+
* This metric is very capable to find similarities in natural languages.
|
901
|
+
* It is explained in more detail in Simon White's article "How to Strike a
|
902
|
+
* Match", located at this url:
|
903
|
+
* http://www.catalysoft.com/articles/StrikeAMatch.html
|
904
|
+
* It is also very similar (a special case) to the method described under
|
905
|
+
* http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
|
906
|
+
* for Approximate String Processing."
|
907
|
+
*/
|
908
|
+
DEF_RB_FREE(PairDistance, PairDistance)
|
909
|
+
|
910
|
+
/*
|
911
|
+
* call-seq: new(pattern)
|
912
|
+
*
|
913
|
+
* Creates a new Amatch::PairDistance instance from <code>pattern</code>.
|
914
|
+
*/
|
915
|
+
static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
|
916
|
+
{
|
917
|
+
GET_STRUCT(PairDistance)
|
918
|
+
PairDistance_pattern_set(amatch, pattern);
|
919
|
+
return self;
|
920
|
+
}
|
921
|
+
|
922
|
+
DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
923
|
+
|
924
|
+
/*
|
925
|
+
* call-seq: match(strings, regexp = /\s+/) -> results
|
926
|
+
*
|
927
|
+
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
928
|
+
* <code>strings</code>. It returns the pair distance measure, that is a
|
929
|
+
* returned value of 1.0 is an exact match, partial matches are lower
|
930
|
+
* values, while 0.0 means no match at all.
|
931
|
+
*
|
932
|
+
* <code>strings</code> has to be either a String or an
|
933
|
+
* Array of Strings. The argument <code>regexp</code> is used to split the
|
934
|
+
* pattern and strings into tokens first. It defaults to /\s+/. If the
|
935
|
+
* splitting should be omitted, call the method with nil as <code>regexp</code>
|
936
|
+
* explicitly.
|
937
|
+
*
|
938
|
+
* The returned <code>results</code> are either a Float or an
|
939
|
+
* Array of Floats respectively.
|
940
|
+
*/
|
941
|
+
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
942
|
+
{
|
943
|
+
VALUE result, strings, regexp = Qnil;
|
944
|
+
int use_regexp;
|
945
|
+
GET_STRUCT(PairDistance)
|
946
|
+
|
947
|
+
rb_scan_args(argc, argv, "11", &strings, &regexp);
|
948
|
+
use_regexp = NIL_P(regexp) && argc != 2;
|
949
|
+
if (TYPE(strings) == T_STRING) {
|
950
|
+
result = PairDistance_match(amatch, strings, regexp, use_regexp);
|
951
|
+
} else {
|
952
|
+
Check_Type(strings, T_ARRAY);
|
180
953
|
int i;
|
181
|
-
|
954
|
+
result = rb_ary_new2(RARRAY(strings)->len);
|
182
955
|
for (i = 0; i < RARRAY(strings)->len; i++) {
|
183
956
|
VALUE string = rb_ary_entry(strings, i);
|
184
957
|
if (TYPE(string) != T_STRING) {
|
185
958
|
rb_raise(rb_eTypeError,
|
186
959
|
"array has to contain only strings (%s given)",
|
187
|
-
NIL_P(string) ?
|
188
|
-
|
960
|
+
NIL_P(string) ?
|
961
|
+
"NilClass" :
|
962
|
+
rb_class2name(CLASS_OF(string)));
|
189
963
|
}
|
190
|
-
rb_ary_push(result,
|
964
|
+
rb_ary_push(result,
|
965
|
+
PairDistance_match(amatch, string, regexp, use_regexp));
|
191
966
|
}
|
192
|
-
return result;
|
193
|
-
} else if (TYPE(strings) == T_STRING) {
|
194
|
-
return calculate_distance(self, strings, mode);
|
195
|
-
} else {
|
196
|
-
rb_raise(rb_eTypeError,
|
197
|
-
"value of strings needs to be string or array (%s given)",
|
198
|
-
NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
|
199
967
|
}
|
968
|
+
pair_array_destroy(amatch->pattern_pair_array);
|
969
|
+
amatch->pattern_pair_array = NULL;
|
970
|
+
return result;
|
200
971
|
}
|
201
972
|
|
202
973
|
/*
|
203
|
-
*
|
974
|
+
* call-seq: pair_distance_similar(strings) -> results
|
975
|
+
*
|
976
|
+
* If called on a String, this string is used as a Amatch::PairDistance#pattern
|
977
|
+
* to match against <code>strings</code> using /\s+/ as the tokenizing regular
|
978
|
+
* expression. It returns a pair distance metric number between 0.0 for very
|
979
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
980
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
981
|
+
* are either a Float or an Array of Floats respectively.
|
204
982
|
*/
|
205
|
-
|
206
|
-
static VALUE
|
207
|
-
rb_amatch_resetw(self)
|
208
|
-
VALUE self;
|
983
|
+
static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
|
209
984
|
{
|
210
|
-
|
211
|
-
|
212
|
-
rb_iv_set(self, "@insw", INT2FIX(1));
|
213
|
-
|
214
|
-
return Qtrue;
|
985
|
+
VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self); /* temporary Amatch::PairDistance matcher with self as its pattern */
|
986
|
+
return rb_PairDistance_match(1, &strings, amatch);
|
215
987
|
}
|
216
988
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
989
|
+
/*
|
990
|
+
* Document-class: Amatch::Hamming
|
991
|
+
*
|
992
|
+
* This class computes the Hamming distance between two strings.
|
993
|
+
*
|
994
|
+
* The Hamming distance between two strings is the number of characters, that
|
995
|
+
* are different. Thus a hamming distance of 0 means an exact
|
996
|
+
* match, a hamming distance of 1 means one character is different, and so on.
|
997
|
+
* If one string is longer than the other string, the missing characters are
|
998
|
+
* counted as different characters.
|
999
|
+
*/
|
1000
|
+
|
1001
|
+
DEF_RB_FREE(Hamming, General)
|
222
1002
|
|
223
|
-
|
224
|
-
|
225
|
-
|
1003
|
+
/*
|
1004
|
+
* call-seq: new(pattern)
|
1005
|
+
*
|
1006
|
+
* Creates a new Amatch::Hamming instance from <code>pattern</code>.
|
1007
|
+
*/
|
1008
|
+
static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
|
1009
|
+
{
|
1010
|
+
GET_STRUCT(General)
|
1011
|
+
General_pattern_set(amatch, pattern);
|
226
1012
|
return self;
|
227
1013
|
}
|
228
1014
|
|
229
|
-
|
230
|
-
rb_amatch_pattern_is(self, pattern)
|
231
|
-
VALUE self;
|
232
|
-
VALUE pattern;
|
233
|
-
{
|
234
|
-
Check_Type(pattern, T_STRING);
|
235
|
-
rb_iv_set(self, "@pattern", pattern);
|
1015
|
+
DEF_CONSTRUCTOR(Hamming, General)
|
236
1016
|
|
237
|
-
|
1017
|
+
/*
|
1018
|
+
* call-seq: match(strings) -> results
|
1019
|
+
*
|
1020
|
+
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1021
|
+
* <code>strings</code>, that is compute the hamming distance between
|
1022
|
+
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
1023
|
+
* be either a String or an Array of Strings. The returned <code>results</code>
|
1024
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1025
|
+
*/
|
1026
|
+
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1027
|
+
{
|
1028
|
+
GET_STRUCT(General)
|
1029
|
+
return General_iterate_strings(amatch, strings, Hamming_match);
|
238
1030
|
}
|
239
1031
|
|
1032
|
+
/*
|
1033
|
+
* call-seq: similar(strings) -> results
|
1034
|
+
*
|
1035
|
+
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1036
|
+
* <code>strings</code>, and compute a Hamming distance metric number between
|
1037
|
+
* 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1038
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1039
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
1040
|
+
* respectively.
|
1041
|
+
*/
|
1042
|
+
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
1043
|
+
{
|
1044
|
+
GET_STRUCT(General)
|
1045
|
+
return General_iterate_strings(amatch, strings, Hamming_similar);
|
1046
|
+
}
|
240
1047
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
1048
|
+
/*
|
1049
|
+
* call-seq: hamming_similar(strings) -> results
|
1050
|
+
*
|
1051
|
+
* If called on a String, this string is used as a Amatch::Hamming#pattern to
|
1052
|
+
* match against <code>strings</code>. It returns a Hamming distance metric
|
1053
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1054
|
+
* <code>strings</code>
|
1055
|
+
* has to be either a String or an Array of Strings. The returned
|
1056
|
+
* <code>results</code> are either a Float or an Array of Floats respectively.
|
1057
|
+
*/
|
1058
|
+
static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
245
1059
|
{
|
246
|
-
|
1060
|
+
VALUE amatch = rb_Hamming_new(rb_cHamming, self);
|
1061
|
+
return rb_Hamming_similar(amatch, strings);
|
247
1062
|
}
|
248
1063
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
1064
|
+
|
1065
|
+
/*
|
1066
|
+
* Document-class: Amatch::LongestSubsequence
|
1067
|
+
*
|
1068
|
+
* This class computes the length of the longest subsequence common to two
|
1069
|
+
* strings. A subsequence doesn't have to be contiguous. The longer the common
|
1070
|
+
* subsequence is, the more similar the two strings will be.
|
1071
|
+
*
|
1072
|
+
* The longest common subsequence between "test" and "test" is of length 4,
|
1073
|
+
* because "test" itself is this subsequence. The longest common subsequence
|
1074
|
+
* between "test" and "east" is "e", "s", "t" and the length of the
|
1075
|
+
* sequence is 3.
|
1076
|
+
*/
|
1077
|
+
DEF_RB_FREE(LongestSubsequence, General)
|
1078
|
+
|
1079
|
+
/*
|
1080
|
+
* call-seq: new(pattern)
|
1081
|
+
*
|
1082
|
+
* Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
|
1083
|
+
*/
|
1084
|
+
static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
|
253
1085
|
{
|
254
|
-
|
1086
|
+
GET_STRUCT(General)
|
1087
|
+
General_pattern_set(amatch, pattern);
|
1088
|
+
return self;
|
255
1089
|
}
|
256
1090
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
1091
|
+
DEF_CONSTRUCTOR(LongestSubsequence, General)
|
1092
|
+
|
1093
|
+
/*
|
1094
|
+
* call-seq: match(strings) -> results
|
1095
|
+
*
|
1096
|
+
* Uses this Amatch::LongestSubsequence instance to match
|
1097
|
+
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1098
|
+
* length of the longest common subsequence. <code>strings</code> has to be
|
1099
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1100
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1101
|
+
*/
|
1102
|
+
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1103
|
+
{
|
1104
|
+
GET_STRUCT(General)
|
1105
|
+
return General_iterate_strings(amatch, strings, LongestSubsequence_match);
|
263
1106
|
}
|
264
1107
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
1108
|
+
/*
|
1109
|
+
* call-seq: similar(strings) -> results
|
1110
|
+
*
|
1111
|
+
* Uses this Amatch::LongestSubsequence instance to match
|
1112
|
+
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1113
|
+
* a longest subsequence distance metric number between 0.0 for very unsimilar
|
1114
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1115
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
1116
|
+
* a Float or an Array of Floats respectively.
|
1117
|
+
*/
|
1118
|
+
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
1119
|
+
{
|
1120
|
+
GET_STRUCT(General)
|
1121
|
+
return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
|
1122
|
+
}
|
1123
|
+
|
1124
|
+
/*
|
1125
|
+
* call-seq: longest_subsequence_similar(strings) -> results
|
1126
|
+
*
|
1127
|
+
* If called on a String, this string is used as a
|
1128
|
+
* Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
|
1129
|
+
* returns a longest subsequence distance metric number between 0.0 for very
|
1130
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1131
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1132
|
+
* are either a Float or an Array of Floats respectively.
|
1133
|
+
*/
|
1134
|
+
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1135
|
+
{
|
1136
|
+
VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self); /* temporary Amatch::LongestSubsequence matcher with self as its pattern */
|
1137
|
+
return rb_LongestSubsequence_similar(amatch, strings);
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
/*
|
1141
|
+
* Document-class: Amatch::LongestSubstring
|
1142
|
+
*
|
1143
|
+
* The longest common substring is the longest substring, that is part of
|
1144
|
+
* two strings. A substring is contiguous, while a subsequence need not
|
1145
|
+
* be. The longer the common substring is, the more similar the two strings
|
1146
|
+
* will be.
|
1147
|
+
*
|
1148
|
+
* The longest common substring between 'string' and 'string' is 'string'
|
1149
|
+
* again, thus the longest common substring length is 6. The longest common
|
1150
|
+
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1151
|
+
* substring length is 4.
|
1152
|
+
*/
|
1153
|
+
|
1154
|
+
DEF_RB_FREE(LongestSubstring, General)
|
1155
|
+
|
1156
|
+
/*
|
1157
|
+
* call-seq: new(pattern)
|
1158
|
+
*
|
1159
|
+
* Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
|
1160
|
+
*/
|
1161
|
+
static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
|
269
1162
|
{
|
270
|
-
|
1163
|
+
GET_STRUCT(General)
|
1164
|
+
General_pattern_set(amatch, pattern);
|
1165
|
+
return self;
|
271
1166
|
}
|
272
1167
|
|
1168
|
+
DEF_CONSTRUCTOR(LongestSubstring, General)
|
273
1169
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
1170
|
+
/*
|
1171
|
+
* call-seq: match(strings) -> results
|
1172
|
+
*
|
1173
|
+
* Uses this Amatch::LongestSubstring instance to match
|
1174
|
+
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1175
|
+
* length of the longest common substring. <code>strings</code> has to be
|
1176
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1177
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1178
|
+
*/
|
1179
|
+
static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
278
1180
|
{
|
279
|
-
|
1181
|
+
GET_STRUCT(General)
|
1182
|
+
return General_iterate_strings(amatch, strings, LongestSubstring_match);
|
280
1183
|
}
|
281
1184
|
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
1185
|
+
/*
|
1186
|
+
* call-seq: similar(strings) -> results
|
1187
|
+
*
|
1188
|
+
* Uses this Amatch::LongestSubstring instance to match
|
1189
|
+
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1190
|
+
* longest substring distance metric number between 0.0 for very unsimilar
|
1191
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1192
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
1193
|
+
* a Float or an Array of Floats
|
1194
|
+
* respectively.
|
1195
|
+
*/
|
1196
|
+
static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
286
1197
|
{
|
287
|
-
|
1198
|
+
GET_STRUCT(General)
|
1199
|
+
return General_iterate_strings(amatch, strings, LongestSubstring_similar);
|
288
1200
|
}
|
289
1201
|
|
290
|
-
|
291
|
-
|
1202
|
+
/*
|
1203
|
+
* call-seq: longest_substring_similar(strings) -> results
|
1204
|
+
*
|
1205
|
+
* If called on a String, this string is used as a
|
1206
|
+
* Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
|
1207
|
+
* returns a longest substring distance metric number between 0.0 for very
|
1208
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1209
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1210
|
+
* are either a Float or an Array of Floats respectively.
|
1211
|
+
*/
|
1212
|
+
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1213
|
+
{
|
1214
|
+
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self); /* temporary Amatch::LongestSubstring matcher with self as its pattern */
|
1215
|
+
return rb_LongestSubstring_similar(amatch, strings);
|
1216
|
+
}
|
1217
|
+
|
1218
|
+
/*
|
1219
|
+
* = amatch - Approximate Matching Extension for Ruby
|
1220
|
+
*
|
1221
|
+
* == Description
|
1222
|
+
*
|
1223
|
+
* This is a collection of classes that can be used for Approximate
|
1224
|
+
* matching, searching, and comparing of Strings. They implement algorithms
|
1225
|
+
* that compute the Levenshtein edit distance, Sellers edit distance, the
|
1226
|
+
* Hamming distance, the longest common subsequence length, the longest common
|
1227
|
+
* substring length, and the pair distance metric.
|
1228
|
+
*
|
1229
|
+
* == Author
|
1230
|
+
*
|
1231
|
+
* Florian Frank mailto:flori@ping.de
|
1232
|
+
*
|
1233
|
+
* == License
|
1234
|
+
*
|
1235
|
+
* This is free software; you can redistribute it and/or modify it under
|
1236
|
+
* the terms of the GNU General Public License Version 2 as published by
|
1237
|
+
* the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
1238
|
+
*
|
1239
|
+
* == Download
|
1240
|
+
*
|
1241
|
+
* The latest version of <b>amatch</b> can be found at
|
1242
|
+
*
|
1243
|
+
* * http://rubyforge.org/frs/?group_id=390
|
1244
|
+
*
|
1245
|
+
* Online Documentation should be located at
|
1246
|
+
*
|
1247
|
+
* * http://amatch.rubyforge.org
|
1248
|
+
*
|
1249
|
+
* == Examples
|
1250
|
+
* require 'amatch'
|
1251
|
+
* # => true
|
1252
|
+
* include Amatch
|
1253
|
+
* # => Object
|
1254
|
+
*
|
1255
|
+
* m = Sellers.new("pattern")
|
1256
|
+
* # => #<Amatch::Sellers:0x40366324>
|
1257
|
+
* m.match("pattren")
|
1258
|
+
* # => 2.0
|
1259
|
+
* m.substitution = m.insertion = 3
|
1260
|
+
* # => 3
|
1261
|
+
* m.match("pattren")
|
1262
|
+
* # => 4.0
|
1263
|
+
* m.reset_weights
|
1264
|
+
* # => #<Amatch::Sellers:0x40366324>
|
1265
|
+
* m.match(["pattren","parent"])
|
1266
|
+
* # => [2.0, 4.0]
|
1267
|
+
* m.search("abcpattrendef")
|
1268
|
+
* # => 2.0
|
1269
|
+
*
|
1270
|
+
* m = Levenshtein.new("pattern")
|
1271
|
+
* # => #<Amatch::Levenshtein:0x4035919c>
|
1272
|
+
* m.match("pattren")
|
1273
|
+
* # => 2
|
1274
|
+
* m.search("abcpattrendef")
|
1275
|
+
* # => 2
|
1276
|
+
* "pattern language".levenshtein_similar("language of patterns")
|
1277
|
+
* # => 0.2
|
1278
|
+
*
|
1279
|
+
* m = Hamming.new("pattern")
|
1280
|
+
* # => #<Amatch::Hamming:0x40350858>
|
1281
|
+
* m.match("pattren")
|
1282
|
+
* # => 2
|
1283
|
+
* "pattern language".hamming_similar("language of patterns")
|
1284
|
+
* # => 0.1
|
1285
|
+
*
|
1286
|
+
* m = PairDistance.new("pattern")
|
1287
|
+
* # => #<Amatch::PairDistance:0x40349be8>
|
1288
|
+
* m.match("pattr en")
|
1289
|
+
* # => 0.545454545454545
|
1290
|
+
* m.match("pattr en", nil)
|
1291
|
+
* # => 0.461538461538462
|
1292
|
+
* m.match("pattr en", /t+/)
|
1293
|
+
* # => 0.285714285714286
|
1294
|
+
* "pattern language".pair_distance_similar("language of patterns")
|
1295
|
+
* # => 0.928571428571429
|
1296
|
+
*
|
1297
|
+
* m = LongestSubsequence.new("pattern")
|
1298
|
+
* # => #<Amatch::LongestSubsequence:0x4033e900>
|
1299
|
+
* m.match("pattren")
|
1300
|
+
* # => 6
|
1301
|
+
* "pattern language".longest_subsequence_similar("language of patterns")
|
1302
|
+
* # => 0.4
|
1303
|
+
*
|
1304
|
+
* m = LongestSubstring.new("pattern")
|
1305
|
+
* # => #<Amatch::LongestSubstring:0x403378d0>
|
1306
|
+
* m.match("pattren")
|
1307
|
+
* # => 4
|
1308
|
+
* "pattern language".longest_substring_similar("language of patterns")
|
1309
|
+
* # => 0.4
|
1310
|
+
*
|
1311
|
+
*/
|
1312
|
+
|
1313
|
+
void Init_amatch()
|
292
1314
|
{
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
rb_define_method(
|
301
|
-
|
302
|
-
rb_define_method(
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
rb_define_method(
|
310
|
-
rb_define_method(
|
1315
|
+
rb_mAmatch = rb_define_module("Amatch");
|
1316
|
+
|
1317
|
+
/* Levenshtein */
|
1318
|
+
rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
|
1319
|
+
rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
|
1320
|
+
rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
|
1321
|
+
rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
|
1322
|
+
rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1323
|
+
rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
|
1324
|
+
rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
|
1325
|
+
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1326
|
+
rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1327
|
+
|
1328
|
+
/* Sellers */
|
1329
|
+
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1330
|
+
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|
1331
|
+
rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
|
1332
|
+
rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
|
1333
|
+
rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
|
1334
|
+
rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
|
1335
|
+
rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
|
1336
|
+
rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
|
1337
|
+
rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
|
1338
|
+
rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
|
1339
|
+
rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
|
1340
|
+
rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
|
1341
|
+
rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
|
1342
|
+
rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
|
1343
|
+
rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
|
1344
|
+
|
1345
|
+
/* Hamming */
|
1346
|
+
rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
|
1347
|
+
rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
|
1348
|
+
rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
|
1349
|
+
rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
|
1350
|
+
rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
|
1351
|
+
rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
|
1352
|
+
rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
|
1353
|
+
rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
|
1354
|
+
|
1355
|
+
/* Pair Distance Metric */
|
1356
|
+
rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
|
1357
|
+
rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
|
1358
|
+
rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
|
1359
|
+
rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
|
1360
|
+
rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
|
1361
|
+
rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
|
1362
|
+
rb_define_alias(rb_cPairDistance, "similar", "match");
|
1363
|
+
rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
|
1364
|
+
|
1365
|
+
/* Longest Common Subsequence */
|
1366
|
+
rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
|
1367
|
+
rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
|
1368
|
+
rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
|
1369
|
+
rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
|
1370
|
+
rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
|
1371
|
+
rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
|
1372
|
+
rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
|
1373
|
+
rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
|
1374
|
+
|
1375
|
+
/* Longest Common Substring */
|
1376
|
+
rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
|
1377
|
+
rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
|
1378
|
+
rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
|
1379
|
+
rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
|
1380
|
+
rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
|
1381
|
+
rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
|
1382
|
+
rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
|
1383
|
+
rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
|
1384
|
+
|
1385
|
+
id_split = rb_intern("split");
|
1386
|
+
id_to_f = rb_intern("to_f");
|
311
1387
|
}
|
312
1388
|
/* vim: set et cin sw=4 ts=4: */
|