patmcnally-amatch 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +28 -0
- data/COPYING +340 -0
- data/README +25 -0
- data/Rakefile +122 -0
- data/VERSION +1 -0
- data/amatch.gemspec +31 -0
- data/bin/agrep.rb +79 -0
- data/doc-main.txt +115 -0
- data/ext/amatch.c +1641 -0
- data/ext/common.h +25 -0
- data/ext/extconf.rb +6 -0
- data/ext/pair.c +77 -0
- data/ext/pair.h +29 -0
- data/install.rb +28 -0
- data/lib/amatch/version.rb +8 -0
- data/tests/test_hamming.rb +58 -0
- data/tests/test_jaro.rb +29 -0
- data/tests/test_jaro_winkler.rb +38 -0
- data/tests/test_levenshtein.rb +83 -0
- data/tests/test_longest_subsequence.rb +61 -0
- data/tests/test_longest_substring.rb +61 -0
- data/tests/test_pair_distance.rb +86 -0
- data/tests/test_sellers.rb +96 -0
- metadata +85 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.4
|
data/amatch.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Gem specification for amatch: a C extension (built through ext/extconf.rb)
# plus the agrep.rb command line tool.
spec = Gem::Specification.new do |s|
  s.name = 'amatch'
  s.version = "0.2.4"
  s.summary = "Approximate String Matching library"
  s.description = <<EOF
Amatch is a library for approximate string matching and searching in strings.
Several algorithms can be used to do this, and it's also possible to compute a
similarity metric number between 0.0 and 1.0 for two given strings.
EOF

  s.files = ["amatch.gemspec", "bin", "bin/agrep.rb", "CHANGES", "COPYING", "ext", "ext/amatch.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib", "lib/amatch", "lib/amatch/version.rb", "Rakefile", "README", "tests", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb", "VERSION"]

  s.extensions << "ext/extconf.rb"

  s.require_path = 'ext'

  s.bindir = "bin"
  s.executables = ["agrep.rb"]

  # default_executable, has_rdoc and rubyforge_project are legacy attributes
  # that RubyGems has deprecated; newer releases may no longer define the
  # setters at all. Guarding each call keeps the specification loadable there
  # while still setting the values wherever the attribute exists.
  s.default_executable = "agrep.rb" if s.respond_to?(:default_executable=)
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)

  s.extra_rdoc_files.concat ["ext/amatch.c", "lib/amatch/version.rb", "doc-main.txt"]
  s.rdoc_options << '--main' << 'doc-main.txt' <<
    '--title' << "amatch - Approximate Matching"
  s.test_files.concat Dir['tests/test_*.rb']

  s.author = "Florian Frank"
  s.email = "flori@ping.de"
  s.homepage = "http://amatch.rubyforge.org"
  s.rubyforge_project = "amatch" if s.respond_to?(:rubyforge_project=)
end
|
data/bin/agrep.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'amatch'
|
4
|
+
require 'getoptlong'
|
5
|
+
|
6
|
+
# Prints +msg+, a usage synopsis and one line per entry of +options+ (arrays
# of long name, short name and GetoptLong argument flag), followed by the bug
# report address. Always terminates the process with exit status 0.
def usage(msg, options)
  puts msg, "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]", ""
  options.each do |long_name, short_name, argument_flag|
    # Only options that require a value get the ARGUMENT placeholder.
    placeholder = argument_flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    puts " " + short_name + ", " + long_name + " " + placeholder
  end
  puts "\nReport bugs to <flori@ping.de>."
  exit 0
end
|
15
|
+
|
16
|
+
class Amatch::Levenshtein
  # Returns the result of #search for +strings+, converted to a Float and
  # divided by the size of the current pattern.
  def search_relative(strings)
    absolute = search(strings).to_f
    absolute / pattern.size
  end
end
|
21
|
+
|
22
|
+
# Defaults: match lines whose absolute search distance is at most 1.
$distance = 1
$mode = :search
begin
  options = [
    [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  ]
  # Passing the option table to the constructor registers it exactly like a
  # separate set_options call would.
  parser = GetoptLong.new(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance' then $distance = arg.to_f
    when 'relative' then $mode = :search_relative
    when 'verbose'  then $verbose = 1
    when 'help'     then usage('You\'ve asked for it!', options)
    end
  end
rescue
  # Any error raised while parsing the command line ends the program.
  exit 1
end
|
49
|
+
# Main program: the first remaining argument is the pattern, all further
# arguments are files to scan; without files the lines are read from STDIN.
# Every line whose $mode result is at most $distance is printed.
pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch::Levenshtein.new(pattern)
size = 0
start = Time.new
if ARGV.size > 0 then
  ARGV.each do |filename|
    begin
      # The stat call lives inside the begin block so that a missing or
      # unreadable file is reported and skipped like any other per-file
      # failure instead of aborting the whole run.
      File.stat(filename).file? or next
      size += File.size(filename)
      # File.foreach closes the file once iteration ends, so no handle is
      # leaked per scanned file.
      File.foreach(filename) do |line|
        if matcher.__send__($mode, line) <= $distance
          puts "#{filename}:#{line}"
        end
      end
    rescue
      STDERR.puts "Failure at #{filename}: #{$!} => Skipping!"
    end
  end
else
  STDIN.each_line do |line|
    size += line.size
    if matcher.__send__($mode, line) <= $distance
      puts line
    end
  end
end
time = Time.new - start
# Throughput statistics are only printed when --verbose was given.
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
|
data/doc-main.txt
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
== amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
=== Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, and the Jaro and Jaro-Winkler metrics.
|
10
|
+
|
11
|
+
=== Author
|
12
|
+
|
13
|
+
Florian Frank mailto:flori@ping.de
|
14
|
+
|
15
|
+
=== License
|
16
|
+
|
17
|
+
This is free software; you can redistribute it and/or modify it under
|
18
|
+
the terms of the GNU General Public License Version 2 as published by
|
19
|
+
the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
20
|
+
|
21
|
+
=== Download
|
22
|
+
|
23
|
+
The latest version of <b>amatch</b> can be found at
|
24
|
+
|
25
|
+
* http://rubyforge.org/frs/?group_id=390
|
26
|
+
|
27
|
+
Online Documentation should be located at
|
28
|
+
|
29
|
+
* http://amatch.rubyforge.org
|
30
|
+
|
31
|
+
=== Examples
|
32
|
+
require 'amatch'
|
33
|
+
# => true
|
34
|
+
include Amatch
|
35
|
+
# => Object
|
36
|
+
|
37
|
+
m = Sellers.new("pattern")
|
38
|
+
# => #<Amatch::Sellers:0x40366324>
|
39
|
+
m.match("pattren")
|
40
|
+
# => 2.0
|
41
|
+
m.substitution = m.insertion = 3
|
42
|
+
# => 3
|
43
|
+
m.match("pattren")
|
44
|
+
# => 4.0
|
45
|
+
m.reset_weights
|
46
|
+
# => #<Amatch::Sellers:0x40366324>
|
47
|
+
m.match(["pattren","parent"])
|
48
|
+
# => [2.0, 4.0]
|
49
|
+
m.search("abcpattrendef")
|
50
|
+
# => 2.0
|
51
|
+
|
52
|
+
m = Levenshtein.new("pattern")
|
53
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
54
|
+
m.match("pattren")
|
55
|
+
# => 2
|
56
|
+
m.search("abcpattrendef")
|
57
|
+
# => 2
|
58
|
+
"pattern language".levenshtein_similar("language of patterns")
|
59
|
+
# => 0.2
|
60
|
+
|
61
|
+
m = Hamming.new("pattern")
|
62
|
+
# => #<Amatch::Hamming:0x40350858>
|
63
|
+
m.match("pattren")
|
64
|
+
# => 2
|
65
|
+
"pattern language".hamming_similar("language of patterns")
|
66
|
+
# => 0.1
|
67
|
+
|
68
|
+
m = PairDistance.new("pattern")
|
69
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
70
|
+
m.match("pattr en")
|
71
|
+
# => 0.545454545454545
|
72
|
+
m.match("pattr en", nil)
|
73
|
+
# => 0.461538461538462
|
74
|
+
m.match("pattr en", /t+/)
|
75
|
+
# => 0.285714285714286
|
76
|
+
"pattern language".pair_distance_similar("language of patterns")
|
77
|
+
# => 0.928571428571429
|
78
|
+
|
79
|
+
m = LongestSubsequence.new("pattern")
|
80
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
81
|
+
m.match("pattren")
|
82
|
+
# => 6
|
83
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
84
|
+
# => 0.4
|
85
|
+
|
86
|
+
m = LongestSubstring.new("pattern")
|
87
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
88
|
+
m.match("pattren")
|
89
|
+
# => 4
|
90
|
+
"pattern language".longest_substring_similar("language of patterns")
|
91
|
+
# => 0.4
|
92
|
+
|
93
|
+
m = Jaro.new("pattern")
|
94
|
+
# => #<Amatch::Jaro:0x363b70>
|
95
|
+
m.match("paTTren")
|
96
|
+
# => 0.952380952380952
|
97
|
+
m.ignore_case = false
|
98
|
+
m.match("paTTren")
|
99
|
+
# => 0.742857142857143
|
100
|
+
"pattern language".jaro_similar("language of patterns")
|
101
|
+
# => 0.672222222222222
|
102
|
+
|
103
|
+
m = JaroWinkler.new("pattern")
|
104
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
105
|
+
m.match("paTTren")
|
106
|
+
# => 0.971428571712403
|
107
|
+
m.ignore_case = false
|
108
|
+
m.match("paTTren")
|
109
|
+
# => 0.79428571505206
|
110
|
+
m.scaling_factor = 0.05
|
111
|
+
m.match("pattren")
|
112
|
+
# => 0.961904762046678
|
113
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
114
|
+
# => 0.672222222222222
|
115
|
+
|
data/ext/amatch.c
ADDED
@@ -0,0 +1,1641 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "pair.h"
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "common.h"
|
5
|
+
|
6
|
+
/*
|
7
|
+
* Document-method: pattern
|
8
|
+
*
|
9
|
+
* call-seq: pattern -> pattern string
|
10
|
+
*
|
11
|
+
* Returns the current pattern string of this instance.
|
12
|
+
*/
|
13
|
+
|
14
|
+
/*
|
15
|
+
* Document-method: pattern=
|
16
|
+
*
|
17
|
+
* call-seq: pattern=(pattern)
|
18
|
+
*
|
19
|
+
* Sets the current pattern string of this instance to <code>pattern</code>.
|
20
|
+
*/
|
21
|
+
|
22
|
+
|
23
|
+
static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
24
|
+
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
|
+
rb_cJaro, rb_cJaroWinkler;
|
26
|
+
|
27
|
+
static ID id_split, id_to_f;
|
28
|
+
|
29
|
+
#define GET_STRUCT(klass) \
|
30
|
+
klass *amatch; \
|
31
|
+
Data_Get_Struct(self, klass, amatch);
|
32
|
+
|
33
|
+
#define DEF_ALLOCATOR(type) \
|
34
|
+
static type *type##_allocate() \
|
35
|
+
{ \
|
36
|
+
type *obj = ALLOC(type); \
|
37
|
+
MEMZERO(obj, type, 1); \
|
38
|
+
return obj; \
|
39
|
+
}
|
40
|
+
|
41
|
+
#define DEF_CONSTRUCTOR(klass, type) \
|
42
|
+
static VALUE rb_##klass##_s_allocate(VALUE klass2) \
|
43
|
+
{ \
|
44
|
+
type *amatch = type##_allocate(); \
|
45
|
+
return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
|
46
|
+
} \
|
47
|
+
VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
|
48
|
+
{ \
|
49
|
+
VALUE obj = rb_##klass##_s_allocate(klass2); \
|
50
|
+
rb_##klass##_initialize(obj, pattern); \
|
51
|
+
return obj; \
|
52
|
+
}
|
53
|
+
|
54
|
+
#define DEF_RB_FREE(klass, type) \
|
55
|
+
static void rb_##klass##_free(type *amatch) \
|
56
|
+
{ \
|
57
|
+
MEMZERO(amatch->pattern, char, amatch->pattern_len); \
|
58
|
+
free(amatch->pattern); \
|
59
|
+
MEMZERO(amatch, type, 1); \
|
60
|
+
free(amatch); \
|
61
|
+
}
|
62
|
+
|
63
|
+
#define DEF_PATTERN_ACCESSOR(type) \
|
64
|
+
static void type##_pattern_set(type *amatch, VALUE pattern) \
|
65
|
+
{ \
|
66
|
+
Check_Type(pattern, T_STRING); \
|
67
|
+
free(amatch->pattern); \
|
68
|
+
amatch->pattern_len = RSTRING_LEN(pattern); \
|
69
|
+
amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
|
70
|
+
MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
|
71
|
+
RSTRING_LEN(pattern)); \
|
72
|
+
} \
|
73
|
+
static VALUE rb_##type##_pattern(VALUE self) \
|
74
|
+
{ \
|
75
|
+
GET_STRUCT(type) \
|
76
|
+
return rb_str_new(amatch->pattern, amatch->pattern_len); \
|
77
|
+
} \
|
78
|
+
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
|
79
|
+
{ \
|
80
|
+
GET_STRUCT(type) \
|
81
|
+
type##_pattern_set(amatch, pattern); \
|
82
|
+
return Qnil; \
|
83
|
+
}
|
84
|
+
|
85
|
+
#define DEF_ITERATE_STRINGS(type) \
|
86
|
+
static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
|
87
|
+
VALUE (*match_function) (type *amatch, VALUE strings)) \
|
88
|
+
{ \
|
89
|
+
if (TYPE(strings) == T_STRING) { \
|
90
|
+
return match_function(amatch, strings); \
|
91
|
+
} else { \
|
92
|
+
Check_Type(strings, T_ARRAY); \
|
93
|
+
int i; \
|
94
|
+
VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
|
95
|
+
for (i = 0; i < RARRAY_LEN(strings); i++) { \
|
96
|
+
VALUE string = rb_ary_entry(strings, i); \
|
97
|
+
if (TYPE(string) != T_STRING) { \
|
98
|
+
rb_raise(rb_eTypeError, \
|
99
|
+
"array has to contain only strings (%s given)", \
|
100
|
+
NIL_P(string) ? \
|
101
|
+
"NilClass" : \
|
102
|
+
rb_class2name(CLASS_OF(string))); \
|
103
|
+
} \
|
104
|
+
rb_ary_push(result, match_function(amatch, string)); \
|
105
|
+
} \
|
106
|
+
return result; \
|
107
|
+
} \
|
108
|
+
}
|
109
|
+
|
110
|
+
#define DEF_RB_READER(type, function, name, converter) \
|
111
|
+
VALUE function(VALUE self) \
|
112
|
+
{ \
|
113
|
+
GET_STRUCT(type) \
|
114
|
+
return converter(amatch->name); \
|
115
|
+
}
|
116
|
+
|
117
|
+
#define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
|
118
|
+
VALUE function(VALUE self, VALUE value) \
|
119
|
+
{ \
|
120
|
+
vtype value_ ## vtype; \
|
121
|
+
GET_STRUCT(type) \
|
122
|
+
caster(value); \
|
123
|
+
value_ ## vtype = converter(value); \
|
124
|
+
if (!(value_ ## vtype check)) \
|
125
|
+
rb_raise(rb_eTypeError, "check of value " #check " failed"); \
|
126
|
+
amatch->name = value_ ## vtype; \
|
127
|
+
return Qnil; \
|
128
|
+
}
|
129
|
+
|
130
|
+
|
131
|
+
#define CAST2FLOAT(obj) \
|
132
|
+
if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
|
133
|
+
obj = rb_funcall(obj, id_to_f, 0, 0); \
|
134
|
+
else \
|
135
|
+
Check_Type(obj, T_FLOAT)
|
136
|
+
#define FLOAT2C(obj) (RFLOAT_VALUE(obj))
|
137
|
+
|
138
|
+
#define CAST2BOOL(obj) \
|
139
|
+
if (obj == Qfalse || obj == Qnil) \
|
140
|
+
obj = Qfalse; \
|
141
|
+
else \
|
142
|
+
obj = Qtrue;
|
143
|
+
#define BOOL2C(obj) (obj == Qtrue)
|
144
|
+
#define C2BOOL(obj) (obj ? Qtrue : Qfalse)
|
145
|
+
|
146
|
+
#define OPTIMIZE_TIME \
|
147
|
+
if (amatch->pattern_len < RSTRING_LEN(string)) { \
|
148
|
+
a_ptr = amatch->pattern; \
|
149
|
+
a_len = amatch->pattern_len; \
|
150
|
+
b_ptr = RSTRING_PTR(string); \
|
151
|
+
b_len = RSTRING_LEN(string); \
|
152
|
+
} else { \
|
153
|
+
a_ptr = RSTRING_PTR(string); \
|
154
|
+
a_len = RSTRING_LEN(string); \
|
155
|
+
b_ptr = amatch->pattern; \
|
156
|
+
b_len = amatch->pattern_len; \
|
157
|
+
}
|
158
|
+
|
159
|
+
#define DONT_OPTIMIZE \
|
160
|
+
a_ptr = amatch->pattern; \
|
161
|
+
a_len = amatch->pattern_len; \
|
162
|
+
b_ptr = RSTRING_PTR(string); \
|
163
|
+
b_len = RSTRING_LEN(string); \
|
164
|
+
|
165
|
+
/*
|
166
|
+
* C structures of the Amatch classes
|
167
|
+
*/
|
168
|
+
|
169
|
+
typedef struct GeneralStruct {
|
170
|
+
char *pattern;
|
171
|
+
int pattern_len;
|
172
|
+
} General;
|
173
|
+
|
174
|
+
DEF_ALLOCATOR(General)
|
175
|
+
DEF_PATTERN_ACCESSOR(General)
|
176
|
+
DEF_ITERATE_STRINGS(General)
|
177
|
+
|
178
|
+
typedef struct SellersStruct {
|
179
|
+
char *pattern;
|
180
|
+
int pattern_len;
|
181
|
+
double substitution;
|
182
|
+
double deletion;
|
183
|
+
double insertion;
|
184
|
+
} Sellers;
|
185
|
+
|
186
|
+
DEF_ALLOCATOR(Sellers)
|
187
|
+
DEF_PATTERN_ACCESSOR(Sellers)
|
188
|
+
DEF_ITERATE_STRINGS(Sellers)
|
189
|
+
|
190
|
+
static void Sellers_reset_weights(Sellers *self)
|
191
|
+
{
|
192
|
+
self->substitution = 1.0;
|
193
|
+
self->deletion = 1.0;
|
194
|
+
self->insertion = 1.0;
|
195
|
+
}
|
196
|
+
|
197
|
+
typedef struct PairDistanceStruct {
|
198
|
+
char *pattern;
|
199
|
+
int pattern_len;
|
200
|
+
PairArray *pattern_pair_array;
|
201
|
+
} PairDistance;
|
202
|
+
|
203
|
+
DEF_ALLOCATOR(PairDistance)
|
204
|
+
DEF_PATTERN_ACCESSOR(PairDistance)
|
205
|
+
|
206
|
+
typedef struct JaroStruct {
|
207
|
+
char *pattern;
|
208
|
+
int pattern_len;
|
209
|
+
int ignore_case;
|
210
|
+
} Jaro;
|
211
|
+
|
212
|
+
DEF_ALLOCATOR(Jaro)
|
213
|
+
DEF_PATTERN_ACCESSOR(Jaro)
|
214
|
+
DEF_ITERATE_STRINGS(Jaro)
|
215
|
+
|
216
|
+
typedef struct JaroWinklerStruct {
|
217
|
+
char *pattern;
|
218
|
+
int pattern_len;
|
219
|
+
int ignore_case;
|
220
|
+
float scaling_factor;
|
221
|
+
} JaroWinkler;
|
222
|
+
|
223
|
+
DEF_ALLOCATOR(JaroWinkler)
|
224
|
+
DEF_PATTERN_ACCESSOR(JaroWinkler)
|
225
|
+
DEF_ITERATE_STRINGS(JaroWinkler)
|
226
|
+
|
227
|
+
/*
|
228
|
+
* Levenshtein edit distances are computed here:
|
229
|
+
*/
|
230
|
+
|
231
|
+
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
232
|
+
for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
|
233
|
+
c = i % 2; /* current row */ \
|
234
|
+
p = (i + 1) % 2; /* previous row */ \
|
235
|
+
v[c][0] = i; /* first column */ \
|
236
|
+
for (j = 1; j <= b_len; j++) { \
|
237
|
+
/* Bellman's principle of optimality: */ \
|
238
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
239
|
+
if (weight > v[p][j] + 1) { \
|
240
|
+
weight = v[p][j] + 1; \
|
241
|
+
} \
|
242
|
+
if (weight > v[c][j - 1] + 1) { \
|
243
|
+
weight = v[c][j - 1] + 1; \
|
244
|
+
} \
|
245
|
+
v[c][j] = weight; \
|
246
|
+
} \
|
247
|
+
p = c; \
|
248
|
+
c = (c + 1) % 2; \
|
249
|
+
}
|
250
|
+
|
251
|
+
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
252
|
+
{
|
253
|
+
VALUE result;
|
254
|
+
char *a_ptr, *b_ptr;
|
255
|
+
int a_len, b_len;
|
256
|
+
int *v[2], weight;
|
257
|
+
int i, j, c, p;
|
258
|
+
|
259
|
+
Check_Type(string, T_STRING);
|
260
|
+
DONT_OPTIMIZE
|
261
|
+
|
262
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
263
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
264
|
+
for (i = 0; i <= b_len; i++) {
|
265
|
+
v[0][i] = i;
|
266
|
+
v[1][i] = i;
|
267
|
+
}
|
268
|
+
|
269
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
270
|
+
|
271
|
+
result = INT2FIX(v[p][b_len]);
|
272
|
+
|
273
|
+
free(v[0]);
|
274
|
+
free(v[1]);
|
275
|
+
|
276
|
+
return result;
|
277
|
+
}
|
278
|
+
|
279
|
+
static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
280
|
+
{
|
281
|
+
VALUE result;
|
282
|
+
char *a_ptr, *b_ptr;
|
283
|
+
int a_len, b_len;
|
284
|
+
int *v[2], weight;
|
285
|
+
int i, j, c, p;
|
286
|
+
|
287
|
+
Check_Type(string, T_STRING);
|
288
|
+
DONT_OPTIMIZE
|
289
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
290
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
291
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
292
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
293
|
+
for (i = 0; i <= b_len; i++) {
|
294
|
+
v[0][i] = i;
|
295
|
+
v[1][i] = i;
|
296
|
+
}
|
297
|
+
|
298
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
299
|
+
|
300
|
+
if (b_len > a_len) {
|
301
|
+
result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
|
302
|
+
} else {
|
303
|
+
result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
304
|
+
}
|
305
|
+
free(v[0]);
|
306
|
+
free(v[1]);
|
307
|
+
return result;
|
308
|
+
}
|
309
|
+
|
310
|
+
static VALUE Levenshtein_search(General *amatch, VALUE string)
|
311
|
+
{
|
312
|
+
VALUE result;
|
313
|
+
char *a_ptr, *b_ptr;
|
314
|
+
int a_len, b_len;
|
315
|
+
int *v[2], weight, min;
|
316
|
+
int i, j, c, p;
|
317
|
+
|
318
|
+
Check_Type(string, T_STRING);
|
319
|
+
DONT_OPTIMIZE
|
320
|
+
|
321
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
322
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
323
|
+
MEMZERO(v[0], int, b_len + 1);
|
324
|
+
MEMZERO(v[1], int, b_len + 1);
|
325
|
+
|
326
|
+
COMPUTE_LEVENSHTEIN_DISTANCE
|
327
|
+
|
328
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
329
|
+
if (v[p][i] < min) min = v[p][i];
|
330
|
+
}
|
331
|
+
|
332
|
+
result = INT2FIX(min);
|
333
|
+
|
334
|
+
free(v[0]);
|
335
|
+
free(v[1]);
|
336
|
+
|
337
|
+
return result;
|
338
|
+
}
|
339
|
+
|
340
|
+
|
341
|
+
/*
|
342
|
+
* Sellers edit distances are computed here:
|
343
|
+
*/
|
344
|
+
|
345
|
+
#define COMPUTE_SELLERS_DISTANCE \
|
346
|
+
for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
|
347
|
+
c = i % 2; /* current row */ \
|
348
|
+
p = (i + 1) % 2; /* previous row */ \
|
349
|
+
v[c][0] = i * amatch->deletion; /* first column */ \
|
350
|
+
for (j = 1; j <= b_len; j++) { \
|
351
|
+
/* Bellman's principle of optimality: */ \
|
352
|
+
weight = v[p][j - 1] + \
|
353
|
+
(a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
|
354
|
+
if (weight > v[p][j] + amatch->insertion) { \
|
355
|
+
weight = v[p][j] + amatch->insertion; \
|
356
|
+
} \
|
357
|
+
if (weight > v[c][j - 1] + amatch->deletion) { \
|
358
|
+
weight = v[c][j - 1] + amatch->deletion; \
|
359
|
+
} \
|
360
|
+
v[c][j] = weight; \
|
361
|
+
} \
|
362
|
+
p = c; \
|
363
|
+
c = (c + 1) % 2; \
|
364
|
+
}
|
365
|
+
|
366
|
+
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
367
|
+
{
|
368
|
+
VALUE result;
|
369
|
+
char *a_ptr, *b_ptr;
|
370
|
+
int a_len, b_len;
|
371
|
+
double *v[2], weight;
|
372
|
+
int i, j, c, p;
|
373
|
+
|
374
|
+
Check_Type(string, T_STRING);
|
375
|
+
DONT_OPTIMIZE
|
376
|
+
|
377
|
+
v[0] = ALLOC_N(double, b_len + 1);
|
378
|
+
v[1] = ALLOC_N(double, b_len + 1);
|
379
|
+
for (i = 0; i <= b_len; i++) {
|
380
|
+
v[0][i] = i * amatch->deletion;
|
381
|
+
v[1][i] = i * amatch->deletion;
|
382
|
+
}
|
383
|
+
|
384
|
+
COMPUTE_SELLERS_DISTANCE
|
385
|
+
|
386
|
+
result = rb_float_new(v[p][b_len]);
|
387
|
+
free(v[0]);
|
388
|
+
free(v[1]);
|
389
|
+
return result;
|
390
|
+
}
|
391
|
+
|
392
|
+
static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
393
|
+
{
|
394
|
+
VALUE result;
|
395
|
+
char *a_ptr, *b_ptr;
|
396
|
+
int a_len, b_len;
|
397
|
+
double *v[2], weight, max_weight;
|
398
|
+
int i, j, c, p;
|
399
|
+
|
400
|
+
if (amatch->insertion >= amatch->deletion) {
|
401
|
+
if (amatch->substitution >= amatch->insertion) {
|
402
|
+
max_weight = amatch->substitution;
|
403
|
+
} else {
|
404
|
+
max_weight = amatch->insertion;
|
405
|
+
}
|
406
|
+
} else {
|
407
|
+
if (amatch->substitution >= amatch->deletion) {
|
408
|
+
max_weight = amatch->substitution;
|
409
|
+
} else {
|
410
|
+
max_weight = amatch->deletion;
|
411
|
+
}
|
412
|
+
}
|
413
|
+
|
414
|
+
Check_Type(string, T_STRING);
|
415
|
+
DONT_OPTIMIZE
|
416
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
417
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
418
|
+
v[0] = ALLOC_N(double, b_len + 1);
|
419
|
+
v[1] = ALLOC_N(double, b_len + 1);
|
420
|
+
for (i = 0; i <= b_len; i++) {
|
421
|
+
v[0][i] = i * amatch->deletion;
|
422
|
+
v[1][i] = i * amatch->deletion;
|
423
|
+
}
|
424
|
+
|
425
|
+
COMPUTE_SELLERS_DISTANCE
|
426
|
+
|
427
|
+
if (b_len > a_len) {
|
428
|
+
result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
|
429
|
+
} else {
|
430
|
+
result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
|
431
|
+
}
|
432
|
+
free(v[0]);
|
433
|
+
free(v[1]);
|
434
|
+
return result;
|
435
|
+
}
|
436
|
+
|
437
|
+
/*
 * Computes the weighted (Sellers) edit distance table of the pattern against
 * string with both rows initially zeroed (Sellers_match, by contrast, seeds
 * them with i * deletion) and returns the smallest value found in the final
 * row as a Float.
 *
 * Raises a TypeError (via Check_Type) if string is not a String.
 */
static VALUE Sellers_search(Sellers *amatch, VALUE string)
{
    VALUE result;
    char *a_ptr, *b_ptr;
    int a_len, b_len;
    double *v[2], weight, min;
    int i, j, c, p;

    Check_Type(string, T_STRING);
    DONT_OPTIMIZE

    v[0] = ALLOC_N(double, b_len + 1);
    v[1] = ALLOC_N(double, b_len + 1);
    MEMZERO(v[0], double, b_len + 1);
    MEMZERO(v[1], double, b_len + 1);

    COMPUTE_SELLERS_DISTANCE

    /*
     * Seed the minimum from the final row itself. Seeding it with a_len
     * would cap the result at the pattern length, which is below every
     * achievable cost once the weights are larger than 1.
     */
    for (i = 1, min = v[p][0]; i <= b_len; i++) {
        if (v[p][i] < min) min = v[p][i];
    }
    result = rb_float_new(min);
    free(v[0]);
    free(v[1]);

    return result;
}
|
464
|
+
|
465
|
+
/*
|
466
|
+
* Pair distances are computed here:
|
467
|
+
*/
|
468
|
+
|
469
|
+
static VALUE PairDistance_match(
|
470
|
+
PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
471
|
+
{
|
472
|
+
double result;
|
473
|
+
VALUE tokens;
|
474
|
+
PairArray *pair_array;
|
475
|
+
|
476
|
+
Check_Type(string, T_STRING);
|
477
|
+
if (!NIL_P(regexp) || use_regexp) {
|
478
|
+
tokens = rb_funcall(
|
479
|
+
rb_str_new(amatch->pattern, amatch->pattern_len),
|
480
|
+
id_split, 1, regexp
|
481
|
+
);
|
482
|
+
if (!amatch->pattern_pair_array) {
|
483
|
+
amatch->pattern_pair_array = PairArray_new(tokens);
|
484
|
+
} else {
|
485
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
486
|
+
}
|
487
|
+
tokens = rb_funcall(string, id_split, 1, regexp);
|
488
|
+
pair_array = PairArray_new(tokens);
|
489
|
+
} else {
|
490
|
+
VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
|
491
|
+
tokens = rb_ary_new4(1, &tmp);
|
492
|
+
if (!amatch->pattern_pair_array) {
|
493
|
+
amatch->pattern_pair_array = PairArray_new(tokens);
|
494
|
+
} else {
|
495
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
496
|
+
}
|
497
|
+
tokens = rb_ary_new4(1, &string);
|
498
|
+
pair_array = PairArray_new(tokens);
|
499
|
+
}
|
500
|
+
result = pair_array_match(amatch->pattern_pair_array, pair_array);
|
501
|
+
pair_array_destroy(pair_array);
|
502
|
+
return rb_float_new(result);
|
503
|
+
}
|
504
|
+
|
505
|
+
/*
|
506
|
+
* Hamming distances are computed here:
|
507
|
+
*/
|
508
|
+
|
509
|
+
#define COMPUTE_HAMMING_DISTANCE \
|
510
|
+
for (i = 0, result = b_len - a_len; i < a_len; i++) { \
|
511
|
+
if (i >= b_len) { \
|
512
|
+
result += a_len - b_len; \
|
513
|
+
break; \
|
514
|
+
} \
|
515
|
+
if (b_ptr[i] != a_ptr[i]) result++; \
|
516
|
+
}
|
517
|
+
|
518
|
+
static VALUE Hamming_match(General *amatch, VALUE string)
|
519
|
+
{
|
520
|
+
char *a_ptr, *b_ptr;
|
521
|
+
int a_len, b_len;
|
522
|
+
int i, result;
|
523
|
+
|
524
|
+
Check_Type(string, T_STRING);
|
525
|
+
OPTIMIZE_TIME
|
526
|
+
COMPUTE_HAMMING_DISTANCE
|
527
|
+
return INT2FIX(result);
|
528
|
+
}
|
529
|
+
|
530
|
+
static VALUE Hamming_similar(General *amatch, VALUE string)
|
531
|
+
{
|
532
|
+
char *a_ptr, *b_ptr;
|
533
|
+
int a_len, b_len;
|
534
|
+
int i, result;
|
535
|
+
|
536
|
+
Check_Type(string, T_STRING);
|
537
|
+
OPTIMIZE_TIME
|
538
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
539
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
540
|
+
COMPUTE_HAMMING_DISTANCE
|
541
|
+
return rb_float_new(1.0 - ((double) result) / b_len);
|
542
|
+
}
|
543
|
+
|
544
|
+
/*
|
545
|
+
* Longest Common Subsequence computation
|
546
|
+
*/
|
547
|
+
|
548
|
+
#define COMPUTE_LONGEST_SUBSEQUENCE \
|
549
|
+
l[0] = ALLOC_N(int, b_len + 1); \
|
550
|
+
l[1] = ALLOC_N(int, b_len + 1); \
|
551
|
+
for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
|
552
|
+
for (j = b_len; j >= 0; j--) { \
|
553
|
+
if (i == a_len || j == b_len) { \
|
554
|
+
l[c][j] = 0; \
|
555
|
+
} else if (a_ptr[i] == b_ptr[j]) { \
|
556
|
+
l[c][j] = 1 + l[p][j + 1]; \
|
557
|
+
} else { \
|
558
|
+
int x = l[p][j], y = l[c][j + 1]; \
|
559
|
+
if (x > y) l[c][j] = x; else l[c][j] = y; \
|
560
|
+
} \
|
561
|
+
} \
|
562
|
+
p = c; \
|
563
|
+
c = (c + 1) % 2; \
|
564
|
+
} \
|
565
|
+
result = l[p][0]; \
|
566
|
+
free(l[0]); \
|
567
|
+
free(l[1]);
|
568
|
+
|
569
|
+
|
570
|
+
static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
571
|
+
{
|
572
|
+
char *a_ptr, *b_ptr;
|
573
|
+
int a_len, b_len;
|
574
|
+
int result, c, p, i, j, *l[2];
|
575
|
+
|
576
|
+
Check_Type(string, T_STRING);
|
577
|
+
OPTIMIZE_TIME
|
578
|
+
|
579
|
+
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
580
|
+
COMPUTE_LONGEST_SUBSEQUENCE
|
581
|
+
return INT2FIX(result);
|
582
|
+
}
|
583
|
+
|
584
|
+
static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
585
|
+
{
|
586
|
+
char *a_ptr, *b_ptr;
|
587
|
+
int a_len, b_len;
|
588
|
+
int result, c, p, i, j, *l[2];
|
589
|
+
|
590
|
+
Check_Type(string, T_STRING);
|
591
|
+
OPTIMIZE_TIME
|
592
|
+
|
593
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
594
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
595
|
+
COMPUTE_LONGEST_SUBSEQUENCE
|
596
|
+
return rb_float_new(((double) result) / b_len);
|
597
|
+
}
|
598
|
+
|
599
|
+
/*
|
600
|
+
* Longest Common Substring computation
|
601
|
+
*/
|
602
|
+
|
603
|
+
#define COMPUTE_LONGEST_SUBSTRING \
|
604
|
+
l[0] = ALLOC_N(int, b_len); \
|
605
|
+
MEMZERO(l[0], int, b_len); \
|
606
|
+
l[1] = ALLOC_N(int, b_len); \
|
607
|
+
MEMZERO(l[1], int, b_len); \
|
608
|
+
result = 0; \
|
609
|
+
for (i = 0, c = 0, p = 1; i < a_len; i++) { \
|
610
|
+
for (j = 0; j < b_len; j++) { \
|
611
|
+
if (a_ptr[i] == b_ptr[j]) { \
|
612
|
+
l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
|
613
|
+
if (l[c][j] > result) result = l[c][j]; \
|
614
|
+
} else { \
|
615
|
+
l[c][j] = 0; \
|
616
|
+
} \
|
617
|
+
} \
|
618
|
+
p = c; \
|
619
|
+
c = (c + 1) % 2; \
|
620
|
+
} \
|
621
|
+
free(l[0]); \
|
622
|
+
free(l[1]);
|
623
|
+
|
624
|
+
static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
625
|
+
{
|
626
|
+
char *a_ptr, *b_ptr;
|
627
|
+
int a_len, b_len;
|
628
|
+
int result, c, p, i, j, *l[2];
|
629
|
+
|
630
|
+
Check_Type(string, T_STRING);
|
631
|
+
OPTIMIZE_TIME
|
632
|
+
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
633
|
+
COMPUTE_LONGEST_SUBSTRING
|
634
|
+
return INT2FIX(result);
|
635
|
+
}
|
636
|
+
|
637
|
+
static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
638
|
+
{
|
639
|
+
char *a_ptr, *b_ptr;
|
640
|
+
int a_len, b_len;
|
641
|
+
int result, c, p, i, j, *l[2];
|
642
|
+
|
643
|
+
Check_Type(string, T_STRING);
|
644
|
+
OPTIMIZE_TIME
|
645
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
646
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
647
|
+
COMPUTE_LONGEST_SUBSTRING
|
648
|
+
return rb_float_new(((double) result) / b_len);
|
649
|
+
}
|
650
|
+
|
651
|
+
/*
|
652
|
+
* Jaro computation
|
653
|
+
*/
|
654
|
+
|
655
|
+
/*
 * Computes the Jaro metric of a_ptr/a_len and b_ptr/b_len and stores it in
 * result. The expanding function has to declare a_ptr, b_ptr, a_len, b_len,
 * max_dist, m, t, i, j, k, low, high, l (int *[2]) and result (double).
 *
 * l[0] and l[1] flag the characters of a and b that have been matched inside
 * the window of max_dist positions; m counts the matches and t ends up as
 * half the number of matched characters that are out of order. The two flag
 * arrays are allocated here and released again before the macro ends.
 */
#define COMPUTE_JARO                                                                \
    l[0] = ALLOC_N(int, a_len);                                                     \
    MEMZERO(l[0], int, a_len);                                                      \
    l[1] = ALLOC_N(int, b_len);                                                     \
    MEMZERO(l[1], int, b_len);                                                      \
    max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1;                           \
    m = 0;                                                                          \
    for (i = 0; i < a_len; i++) {                                                   \
        low = (i > max_dist ? i - max_dist : 0);                                    \
        /* clamp to the last valid index so l[1] and b_ptr are not overrun */       \
        high = (i + max_dist < b_len ? i + max_dist : b_len - 1);                   \
        for (j = low; j <= high; j++) {                                             \
            if (!l[1][j] && a_ptr[i] == b_ptr[j]) {                                 \
                l[0][i] = 1;                                                        \
                l[1][j] = 1;                                                        \
                m++;                                                                \
                break;                                                              \
            }                                                                       \
        }                                                                           \
    }                                                                               \
    if (m == 0) {                                                                   \
        result = 0.0;                                                               \
    } else {                                                                        \
        k = t = 0;                                                                  \
        for (i = 0; i < a_len; i++) {                                               \
            if (l[0][i]) {                                                          \
                /* advance to the next matched character of b */                    \
                for (j = k; j < b_len; j++) {                                       \
                    if (l[1][j]) {                                                  \
                        k = j + 1;                                                  \
                        break;                                                      \
                    }                                                               \
                }                                                                   \
                if (a_ptr[i] != b_ptr[j]) {                                         \
                    t++;                                                            \
                }                                                                   \
            }                                                                       \
        }                                                                           \
        t = t / 2;                                                                  \
        result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0;   \
    }                                                                               \
    /* the match flags are scratch space only; release them on every path */        \
    xfree(l[0]);                                                                    \
    xfree(l[1]);
|
694
|
+
|
695
|
+
/*
 * Replaces a_ptr and b_ptr with freshly allocated copies of their a_len and
 * b_len bytes and folds the case of both copies so they can be compared
 * case-insensitively. Despite the macro's name the copies are converted to
 * UPPER case. The copies are owned by the expanding function, which has to
 * release them again (see FREE_STRINGS); i must be declared by the caller.
 *
 * Every byte is cast to unsigned char before it is handed to islower() and
 * toupper(): passing a negative plain char (any non-ASCII byte on platforms
 * where char is signed) to the <ctype.h> functions is undefined behaviour.
 */
#define LOWERCASE_STRINGS                                                       \
    char *ying = ALLOC_N(char, a_len);                                          \
    MEMCPY(ying, a_ptr, char, a_len);                                           \
    a_ptr = ying;                                                               \
    char *yang = ALLOC_N(char, b_len);                                          \
    MEMCPY(yang, b_ptr, char, b_len);                                           \
    b_ptr = yang;                                                               \
    for (i = 0; i < a_len; i++) {                                               \
        if (islower((unsigned char) a_ptr[i]))                                  \
            a_ptr[i] = toupper((unsigned char) a_ptr[i]);                       \
    }                                                                           \
    for (i = 0; i < b_len; i++) {                                               \
        if (islower((unsigned char) b_ptr[i]))                                  \
            b_ptr[i] = toupper((unsigned char) b_ptr[i]);                       \
    }
|
708
|
+
|
709
|
+
#define FREE_STRINGS \
|
710
|
+
xfree(a_ptr); \
|
711
|
+
xfree(b_ptr);
|
712
|
+
|
713
|
+
/*
 * Computes the Jaro metric between the pattern of amatch and string and
 * returns it as a Ruby Float. Two empty strings score 1.0, exactly one empty
 * string scores 0.0. Raises a TypeError (via Check_Type) if string is not a
 * String.
 */
static VALUE Jaro_match(Jaro *amatch, VALUE string)
{
    /* All of these locals are referenced by name from the macros below. */
    char *a_ptr, *b_ptr;
    int a_len, b_len, max_dist, m, t, i, j, k, low, high;
    int *l[2];
    double result;

    Check_Type(string, T_STRING);
    OPTIMIZE_TIME /* defined elsewhere; sets a_ptr/a_len and b_ptr/b_len */

    if (a_len == 0 || b_len == 0) {
        /* Both empty: exact match. Only one empty: nothing in common. */
        return rb_float_new(a_len == b_len ? 1.0 : 0.0);
    }

    if (amatch->ignore_case) {
        /* Work on case-folded private copies of both strings. */
        LOWERCASE_STRINGS
    }

    COMPUTE_JARO

    if (amatch->ignore_case) {
        /* Release the copies made above. */
        FREE_STRINGS
    }

    return rb_float_new(result);
}
|
733
|
+
|
734
|
+
/*
|
735
|
+
* Jaro-Winkler computation
|
736
|
+
*/
|
737
|
+
|
738
|
+
/*
 * Computes the Jaro-Winkler metric between the pattern of amatch and string
 * and returns it as a Ruby Float: the Jaro metric is raised by
 * n * scaling_factor * (1 - jaro), where n is the length of the common
 * prefix of both strings, limited to 4 characters. Two empty strings score
 * 1.0, exactly one empty string scores 0.0. Raises a TypeError (via
 * Check_Type) if string is not a String.
 */
static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
{
    /* Most of these locals are referenced by name from the macros below. */
    char *a_ptr, *b_ptr;
    int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
    int prefix_max;
    int *l[2];
    double result;

    Check_Type(string, T_STRING);
    OPTIMIZE_TIME /* defined elsewhere; sets a_ptr/a_len and b_ptr/b_len */
    if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
    if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
    if (amatch->ignore_case) {
        LOWERCASE_STRINGS
    }
    COMPUTE_JARO

    /*
     * Length of the common prefix, at most 4. The scan is bounded by BOTH
     * lengths, so b_ptr is never read past b_len regardless of which of the
     * two strings OPTIMIZE_TIME assigned to a_ptr. This matters in particular
     * for the ignore_case copies, which carry no terminating NUL.
     */
    prefix_max = (a_len < b_len ? a_len : b_len);
    if (prefix_max > 4) prefix_max = 4;
    for (n = 0; n < prefix_max && a_ptr[n] == b_ptr[n]; n++)
        ;

    result = result + n*amatch->scaling_factor*(1-result);
    if (amatch->ignore_case) {
        FREE_STRINGS
    }
    return rb_float_new(result);
}
|
767
|
+
|
768
|
+
/*
|
769
|
+
* Ruby API
|
770
|
+
*/
|
771
|
+
|
772
|
+
/*
|
773
|
+
* Document-class: Amatch::Levenshtein
|
774
|
+
*
|
775
|
+
* The Levenshtein edit distance is defined as the minimal costs involved to
|
776
|
+
* transform one string into another by using three elementary operations:
|
777
|
+
* deletion, insertion and substitution of a character. To transform "water"
|
778
|
+
* into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
|
779
|
+
* -> "n": "winer" and delete "r": "wine". The edit distance between "water"
|
780
|
+
* and "wine" is 3, because you have to apply three operations. The edit
|
781
|
+
* distance between "wine" and "wine" is 0 of course: no operation is
|
782
|
+
* necessary for the transformation -- they're already the same string. It's
|
783
|
+
* easy to see that more similar strings have smaller edit distances than
|
784
|
+
* strings that differ a lot.
|
785
|
+
*/
|
786
|
+
|
787
|
+
DEF_RB_FREE(Levenshtein, General)
|
788
|
+
|
789
|
+
/*
|
790
|
+
* call-seq: new(pattern)
|
791
|
+
*
|
792
|
+
* Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
|
793
|
+
*/
|
794
|
+
static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Store the pattern; initialize returns the receiver itself. */
    General_pattern_set(amatch, pattern);
    return self;
}
|
800
|
+
|
801
|
+
DEF_CONSTRUCTOR(Levenshtein, General)
|
802
|
+
|
803
|
+
/*
|
804
|
+
* call-seq: match(strings) -> results
|
805
|
+
*
|
806
|
+
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
807
|
+
* against <code>strings</code>. It returns the number of operations, the Levenshtein
|
808
|
+
* distance. <code>strings</code> has to be either a String or an Array of
|
809
|
+
* Strings. The returned <code>results</code> are either a Float or an Array of
|
810
|
+
* Floats respectively.
|
811
|
+
*/
|
812
|
+
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Levenshtein_match is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, Levenshtein_match);
}
|
817
|
+
|
818
|
+
/*
|
819
|
+
* call-seq: similar(strings) -> results
|
820
|
+
*
|
821
|
+
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
822
|
+
* against <code>strings</code>, and compute a Levenshtein distance metric
|
823
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
824
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
825
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
826
|
+
* respectively.
|
827
|
+
*/
|
828
|
+
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Levenshtein_similar is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, Levenshtein_similar);
}
|
833
|
+
|
834
|
+
/*
|
835
|
+
* call-seq: levenshtein_similar(strings) -> results
|
836
|
+
*
|
837
|
+
* If called on a String, this string is used as a Amatch::Levenshtein#pattern
|
838
|
+
* to match against <code>strings</code>. It returns a Levenshtein distance
|
839
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
840
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
841
|
+
* Strings. The returned <code>results</code> are either a Float or an Array of
|
842
|
+
* Floats respectively.
|
843
|
+
*/
|
844
|
+
static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
{
    /*
     * Temporary matcher with the receiver string as pattern. It is kept in a
     * local variable while rb_Levenshtein_similar works with it.
     */
    VALUE matcher = rb_Levenshtein_new(rb_cLevenshtein, self);

    return rb_Levenshtein_similar(matcher, strings);
}
|
849
|
+
|
850
|
+
/*
|
851
|
+
* call-seq: search(strings) -> results
|
852
|
+
*
|
853
|
+
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
854
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
855
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
856
|
+
* to be either a String or an Array of Strings. The returned
|
857
|
+
* <code>results</code> are either a Float or an Array of Floats respectively.
|
858
|
+
*/
|
859
|
+
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Levenshtein_search is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, Levenshtein_search);
}
|
864
|
+
|
865
|
+
/*
|
866
|
+
* Document-class: Amatch::Sellers
|
867
|
+
*
|
868
|
+
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
869
|
+
* The difference is, that you can also specify different weights for every
|
870
|
+
* operation to prefer special operations over others. This extension of the
|
871
|
+
* Sellers edit distance is also known as the Needleman-Wunsch
|
872
|
+
* distance.
|
873
|
+
*/
|
874
|
+
|
875
|
+
DEF_RB_FREE(Sellers, Sellers)
|
876
|
+
|
877
|
+
/*
|
878
|
+
* Document-method: substitution
|
879
|
+
*
|
880
|
+
* call-seq: substitution -> weight
|
881
|
+
*
|
882
|
+
* Returns the weight of the substitution operation, that is used to compute
|
883
|
+
* the Sellers distance.
|
884
|
+
*/
|
885
|
+
DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
|
886
|
+
rb_float_new)
|
887
|
+
|
888
|
+
/*
|
889
|
+
* Document-method: deletion
|
890
|
+
*
|
891
|
+
* call-seq: deletion -> weight
|
892
|
+
*
|
893
|
+
* Returns the weight of the deletion operation, that is used to compute
|
894
|
+
* the Sellers distance.
|
895
|
+
*/
|
896
|
+
DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
|
897
|
+
rb_float_new)
|
898
|
+
|
899
|
+
/*
|
900
|
+
* Document-method: insertion
|
901
|
+
*
|
902
|
+
* call-seq: insertion -> weight
|
903
|
+
*
|
904
|
+
* Returns the weight of the insertion operation, that is used to compute
|
905
|
+
* the Sellers distance.
|
906
|
+
*/
|
907
|
+
DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
|
908
|
+
rb_float_new)
|
909
|
+
|
910
|
+
/*
|
911
|
+
* Document-method: substitution=
|
912
|
+
*
|
913
|
+
* call-seq: substitution=(weight)
|
914
|
+
*
|
915
|
+
* Sets the weight of the substitution operation, that is used to compute
|
916
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
917
|
+
* should be a Float value >= 0.0.
|
918
|
+
*/
|
919
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
|
920
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
921
|
+
|
922
|
+
/*
|
923
|
+
* Document-method: deletion=
|
924
|
+
*
|
925
|
+
* call-seq: deletion=(weight)
|
926
|
+
*
|
927
|
+
* Sets the weight of the deletion operation, that is used to compute
|
928
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
929
|
+
* should be a Float value >= 0.0.
|
930
|
+
*/
|
931
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
|
932
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
933
|
+
|
934
|
+
/*
|
935
|
+
* Document-method: insertion=
|
936
|
+
*
|
937
|
+
* call-seq: insertion=(weight)
|
938
|
+
*
|
939
|
+
* Sets the weight of the insertion operation, that is used to compute
|
940
|
+
* the Sellers distance, to <code>weight</code>. The <code>weight</code>
|
941
|
+
* should be a Float value >= 0.0.
|
942
|
+
*/
|
943
|
+
DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
|
944
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
945
|
+
|
946
|
+
/*
|
947
|
+
* Resets all weights (substitution, deletion, and insertion) to 1.0.
|
948
|
+
*/
|
949
|
+
static VALUE rb_Sellers_reset_weights(VALUE self)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Sellers)

    /* Returns the receiver after the weights have been reset. */
    Sellers_reset_weights(amatch);
    return self;
}
|
955
|
+
|
956
|
+
/*
|
957
|
+
* call-seq: new(pattern)
|
958
|
+
*
|
959
|
+
* Creates a new Amatch::Sellers instance from <code>pattern</code>,
|
960
|
+
* with all weights initially set to 1.0.
|
961
|
+
*/
|
962
|
+
static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Sellers)

    /* Store the pattern first, then bring the weights to their defaults. */
    Sellers_pattern_set(amatch, pattern);
    Sellers_reset_weights(amatch);
    return self;
}
|
969
|
+
|
970
|
+
DEF_CONSTRUCTOR(Sellers, Sellers)
|
971
|
+
|
972
|
+
/*
|
973
|
+
* Document-method: pattern
|
974
|
+
*
|
975
|
+
* call-seq: pattern -> pattern string
|
976
|
+
*
|
977
|
+
* Returns the current pattern string of this Amatch::Sellers instance.
|
978
|
+
*/
|
979
|
+
|
980
|
+
/*
|
981
|
+
* Document-method: pattern=
|
982
|
+
*
|
983
|
+
* call-seq: pattern=(pattern)
|
984
|
+
*
|
985
|
+
* Sets the current pattern string of this Amatch::Sellers instance to
|
986
|
+
* <code>pattern</code>.
|
987
|
+
*/
|
988
|
+
|
989
|
+
/*
|
990
|
+
* call-seq: match(strings) -> results
|
991
|
+
*
|
992
|
+
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
993
|
+
* <code>strings</code>, while taking into account the given weights. It
|
994
|
+
* returns the number of weighted character operations, the Sellers distance.
|
995
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
996
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
997
|
+
* respectively.
|
998
|
+
*/
|
999
|
+
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Sellers)

    /* Sellers_match is handed over as the per-string matcher. */
    return Sellers_iterate_strings(amatch, strings, Sellers_match);
}
|
1004
|
+
|
1005
|
+
/*
|
1006
|
+
* call-seq: similar(strings) -> results
|
1007
|
+
*
|
1008
|
+
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
1009
|
+
* against <code>strings</code> (taking into account the given weights), and
|
1010
|
+
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
1011
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1012
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
1013
|
+
* a Float or an Array of Floats
|
1014
|
+
* respectively.
|
1015
|
+
*/
|
1016
|
+
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Sellers)

    /* Sellers_similar is handed over as the per-string matcher. */
    return Sellers_iterate_strings(amatch, strings, Sellers_similar);
}
|
1021
|
+
|
1022
|
+
/*
|
1023
|
+
* call-seq: search(strings) -> results
|
1024
|
+
*
|
1025
|
+
* searches Sellers#pattern in <code>strings</code> and returns the edit
|
1026
|
+
* distance (the sum of weighted character operations) as a Float value, by
|
1027
|
+
* greedy trimming prefixes or postfixes of the match. <code>strings</code> has
|
1028
|
+
* to be either a String or an Array of Strings. The returned
|
1029
|
+
* <code>results</code> are either a Float or an Array of Floats respectively.
|
1030
|
+
*/
|
1031
|
+
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Sellers)

    /* Sellers_search is handed over as the per-string matcher. */
    return Sellers_iterate_strings(amatch, strings, Sellers_search);
}
|
1036
|
+
|
1037
|
+
/*
|
1038
|
+
* Document-class: Amatch::PairDistance
|
1039
|
+
*
|
1040
|
+
* The pair distance between two strings is based on the number of adjacent
|
1041
|
+
* character pairs, that are contained in both strings. The similarity
|
1042
|
+
* metric of two strings s1 and s2 is
|
1043
|
+
* 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
|
1044
|
+
* If it is 1.0 the two strings are an exact match, if less than 1.0 they
|
1045
|
+
* are more dissimilar. The advantage of considering adjacent characters, is to
|
1046
|
+
* take account not only of the characters, but also of the character ordering
|
1047
|
+
* in the original strings.
|
1048
|
+
*
|
1049
|
+
* This metric is very capable to find similarities in natural languages.
|
1050
|
+
* It is explained in more detail in Simon White's article "How to Strike a
|
1051
|
+
* Match", located at this url:
|
1052
|
+
* http://www.catalysoft.com/articles/StrikeAMatch.html
|
1053
|
+
* It is also very similar (a special case) to the method described under
|
1054
|
+
* http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
|
1055
|
+
* for Approximate String Processing."
|
1056
|
+
*/
|
1057
|
+
DEF_RB_FREE(PairDistance, PairDistance)
|
1058
|
+
|
1059
|
+
/*
|
1060
|
+
* call-seq: new(pattern)
|
1061
|
+
*
|
1062
|
+
* Creates a new Amatch::PairDistance instance from <code>pattern</code>.
|
1063
|
+
*/
|
1064
|
+
static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(PairDistance)

    /* Store the pattern; initialize returns the receiver itself. */
    PairDistance_pattern_set(amatch, pattern);
    return self;
}
|
1070
|
+
|
1071
|
+
DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
1072
|
+
|
1073
|
+
/*
|
1074
|
+
* call-seq: match(strings, regexp = /\s+/) -> results
|
1075
|
+
*
|
1076
|
+
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
1077
|
+
* <code>strings</code>. It returns the pair distance measure, that is a
|
1078
|
+
* returned value of 1.0 is an exact match, partial matches are lower
|
1079
|
+
* values, while 0.0 means no match at all.
|
1080
|
+
*
|
1081
|
+
* <code>strings</code> has to be either a String or an
|
1082
|
+
* Array of Strings. The argument <code>regexp</code> is used to split the
|
1083
|
+
* pattern and strings into tokens first. It defaults to /\s+/. If the
|
1084
|
+
* splitting should be omitted, call the method with nil as <code>regexp</code>
|
1085
|
+
* explicitly.
|
1086
|
+
*
|
1087
|
+
* The returned <code>results</code> are either a Float or an
|
1088
|
+
* Array of Floats respectively.
|
1089
|
+
*/
|
1090
|
+
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
{
    VALUE result, strings, regexp = Qnil;
    int use_regexp;
    long i; /* RARRAY_LEN() yields a long */
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(PairDistance)

    /* One mandatory argument (strings), one optional argument (regexp). */
    rb_scan_args(argc, argv, "11", &strings, &regexp);

    /*
     * Set only if the regexp argument was left out entirely; an explicitly
     * passed nil leaves it unset. Presumably PairDistance_match falls back to
     * the default tokenizer when it is set -- confirm there.
     */
    use_regexp = NIL_P(regexp) && argc != 2;

    if (TYPE(strings) == T_STRING) {
        result = PairDistance_match(amatch, strings, regexp, use_regexp);
    } else {
        /* Everything else has to be an Array consisting of Strings only. */
        Check_Type(strings, T_ARRAY);
        result = rb_ary_new2(RARRAY_LEN(strings));
        for (i = 0; i < RARRAY_LEN(strings); i++) {
            VALUE string = rb_ary_entry(strings, i);
            if (TYPE(string) != T_STRING) {
                rb_raise(rb_eTypeError,
                        "array has to contain only strings (%s given)",
                        NIL_P(string) ?
                        "NilClass" :
                        rb_class2name(CLASS_OF(string)));
            }
            rb_ary_push(result,
                    PairDistance_match(amatch, string, regexp, use_regexp));
        }
    }

    /* Drop the pattern's pair array once this call is done with it. */
    pair_array_destroy(amatch->pattern_pair_array);
    amatch->pattern_pair_array = NULL;
    return result;
}
|
1121
|
+
|
1122
|
+
/*
|
1123
|
+
* call-seq: pair_distance_similar(strings) -> results
|
1124
|
+
*
|
1125
|
+
* If called on a String, this string is used as a Amatch::PairDistance#pattern
|
1126
|
+
* to match against <code>strings</code> using /\s+/ as the tokenizing regular
|
1127
|
+
* expression. It returns a pair distance metric number between 0.0 for very
|
1128
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1129
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1130
|
+
* are either a Float or an Array of Floats respectively.
|
1131
|
+
*/
|
1132
|
+
static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
{
    /* Temporary matcher with the receiver string as pattern. */
    VALUE matcher = rb_PairDistance_new(rb_cPairDistance, self);

    /* argc == 1: no regexp argument is passed along. */
    return rb_PairDistance_match(1, &strings, matcher);
}
|
1137
|
+
|
1138
|
+
/*
|
1139
|
+
* Document-class: Amatch::Hamming
|
1140
|
+
*
|
1141
|
+
* This class computes the Hamming distance between two strings.
|
1142
|
+
*
|
1143
|
+
* The Hamming distance between two strings is the number of characters, that
|
1144
|
+
* are different. Thus a hamming distance of 0 means an exact
|
1145
|
+
* match, a hamming distance of 1 means one character is different, and so on.
|
1146
|
+
* If one string is longer than the other string, the missing characters are
|
1147
|
+
* counted as different characters.
|
1148
|
+
*/
|
1149
|
+
|
1150
|
+
DEF_RB_FREE(Hamming, General)
|
1151
|
+
|
1152
|
+
/*
|
1153
|
+
* call-seq: new(pattern)
|
1154
|
+
*
|
1155
|
+
* Creates a new Amatch::Hamming instance from <code>pattern</code>.
|
1156
|
+
*/
|
1157
|
+
static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Store the pattern; initialize returns the receiver itself. */
    General_pattern_set(amatch, pattern);
    return self;
}
|
1163
|
+
|
1164
|
+
DEF_CONSTRUCTOR(Hamming, General)
|
1165
|
+
|
1166
|
+
/*
|
1167
|
+
* call-seq: match(strings) -> results
|
1168
|
+
*
|
1169
|
+
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1170
|
+
* <code>strings</code>, that is compute the hamming distance between
|
1171
|
+
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
1172
|
+
* be either a String or an Array of Strings. The returned <code>results</code>
|
1173
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1174
|
+
*/
|
1175
|
+
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Hamming_match is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, Hamming_match);
}
|
1180
|
+
|
1181
|
+
/*
|
1182
|
+
* call-seq: similar(strings) -> results
|
1183
|
+
*
|
1184
|
+
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1185
|
+
* <code>strings</code>, and compute a Hamming distance metric number between
|
1186
|
+
* 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1187
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1188
|
+
* returned <code>results</code> are either a Float or an Array of Floats
|
1189
|
+
* respectively.
|
1190
|
+
*/
|
1191
|
+
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Hamming_similar is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, Hamming_similar);
}
|
1196
|
+
|
1197
|
+
/*
|
1198
|
+
* call-seq: hamming_similar(strings) -> results
|
1199
|
+
*
|
1200
|
+
* If called on a String, this string is used as a Amatch::Hamming#pattern to
|
1201
|
+
* match against <code>strings</code>. It returns a Hamming distance metric
|
1202
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1203
|
+
* <code>strings</code>
|
1204
|
+
* has to be either a String or an Array of Strings. The returned
|
1205
|
+
* <code>results</code> are either a Float or an Array of Floats respectively.
|
1206
|
+
*/
|
1207
|
+
static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
{
    /*
     * Temporary matcher with the receiver string as pattern. It is kept in a
     * local variable while rb_Hamming_similar works with it.
     */
    VALUE matcher = rb_Hamming_new(rb_cHamming, self);

    return rb_Hamming_similar(matcher, strings);
}
|
1212
|
+
|
1213
|
+
|
1214
|
+
/*
|
1215
|
+
* Document-class: Amatch::LongestSubsequence
|
1216
|
+
*
|
1217
|
+
* This class computes the length of the longest subsequence common to two
|
1218
|
+
* strings. A subsequence doesn't have to be contiguous. The longer the common
|
1219
|
+
* subsequence is, the more similar the two strings will be.
|
1220
|
+
*
|
1221
|
+
* The longest common subsequence between "test" and "test" is of length 4,
|
1222
|
+
* because "test" itself is this subsequence. The longest common subsequence
|
1223
|
+
* between "test" and "east" is "e", "s", "t" and the length of the
|
1224
|
+
* sequence is 3.
|
1225
|
+
*/
|
1226
|
+
DEF_RB_FREE(LongestSubsequence, General)
|
1227
|
+
|
1228
|
+
/*
|
1229
|
+
* call-seq: new(pattern)
|
1230
|
+
*
|
1231
|
+
* Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
|
1232
|
+
*/
|
1233
|
+
static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Store the pattern; initialize returns the receiver itself. */
    General_pattern_set(amatch, pattern);
    return self;
}
|
1239
|
+
|
1240
|
+
DEF_CONSTRUCTOR(LongestSubsequence, General)
|
1241
|
+
|
1242
|
+
/*
|
1243
|
+
* call-seq: match(strings) -> results
|
1244
|
+
*
|
1245
|
+
* Uses this Amatch::LongestSubsequence instance to match
|
1246
|
+
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1247
|
+
* length of the longest common subsequence. <code>strings</code> has to be
|
1248
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1249
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1250
|
+
*/
|
1251
|
+
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* LongestSubsequence_match is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, LongestSubsequence_match);
}
|
1256
|
+
|
1257
|
+
/*
|
1258
|
+
* call-seq: similar(strings) -> results
|
1259
|
+
*
|
1260
|
+
* Uses this Amatch::LongestSubsequence instance to match
|
1261
|
+
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1262
|
+
* a longest subsequence distance metric number between 0.0 for very unsimilar
|
1263
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1264
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
1265
|
+
* a Float or an Array of Floats respectively.
|
1266
|
+
*/
|
1267
|
+
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* LongestSubsequence_similar is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
}
|
1272
|
+
|
1273
|
+
/*
|
1274
|
+
* call-seq: longest_subsequence_similar(strings) -> results
|
1275
|
+
*
|
1276
|
+
* If called on a String, this string is used as a
|
1277
|
+
* Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
|
1278
|
+
* returns a longest subsequence distance metric number between 0.0 for very
|
1279
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1280
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1281
|
+
* are either a Float or an Array of Floats respectively.
|
1282
|
+
*/
|
1283
|
+
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
{
    /*
     * Temporary matcher with the receiver string as pattern. It is kept in a
     * local variable while rb_LongestSubsequence_similar works with it.
     */
    VALUE matcher = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);

    return rb_LongestSubsequence_similar(matcher, strings);
}
|
1288
|
+
|
1289
|
+
/*
|
1290
|
+
* Document-class: Amatch::LongestSubstring
|
1291
|
+
*
|
1292
|
+
* The longest common substring is the longest substring, that is part of
|
1293
|
+
* two strings. A substring is contiguous, while a subsequence need not
|
1294
|
+
* be. The longer the common substring is, the more similar the two strings
|
1295
|
+
* will be.
|
1296
|
+
*
|
1297
|
+
* The longest common substring between 'string' and 'string' is 'string'
|
1298
|
+
* again, thus the longest common substring length is 6. The longest common
|
1299
|
+
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1300
|
+
* substring length is 4.
|
1301
|
+
*/
|
1302
|
+
|
1303
|
+
DEF_RB_FREE(LongestSubstring, General)
|
1304
|
+
|
1305
|
+
/*
|
1306
|
+
* call-seq: new(pattern)
|
1307
|
+
*
|
1308
|
+
* Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
|
1309
|
+
*/
|
1310
|
+
static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* Store the pattern; initialize returns the receiver itself. */
    General_pattern_set(amatch, pattern);
    return self;
}
|
1316
|
+
|
1317
|
+
DEF_CONSTRUCTOR(LongestSubstring, General)
|
1318
|
+
|
1319
|
+
/*
|
1320
|
+
* call-seq: match(strings) -> results
|
1321
|
+
*
|
1322
|
+
* Uses this Amatch::LongestSubstring instance to match
|
1323
|
+
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1324
|
+
* length of the longest common substring. <code>strings</code> has to be
|
1325
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1326
|
+
* are either a Fixnum or an Array of Fixnums respectively.
|
1327
|
+
*/
|
1328
|
+
static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* LongestSubstring_match is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, LongestSubstring_match);
}
|
1333
|
+
|
1334
|
+
/*
|
1335
|
+
* call-seq: similar(strings) -> results
|
1336
|
+
*
|
1337
|
+
* Uses this Amatch::LongestSubstring instance to match
|
1338
|
+
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1339
|
+
* longest substring distance metric number between 0.0 for very unsimilar
|
1340
|
+
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1341
|
+
* String or an Array of Strings. The returned <code>results</code> are either
|
1342
|
+
* a Float or an Array of Floats
|
1343
|
+
* respectively.
|
1344
|
+
*/
|
1345
|
+
static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(General)

    /* LongestSubstring_similar is handed over as the per-string matcher. */
    return General_iterate_strings(amatch, strings, LongestSubstring_similar);
}
|
1350
|
+
|
1351
|
+
/*
|
1352
|
+
* call-seq: longest_substring_similar(strings) -> results
|
1353
|
+
*
|
1354
|
+
* If called on a String, this string is used as a
|
1355
|
+
* Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
|
1356
|
+
* returns a longest substring distance metric number between 0.0 for very
|
1357
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1358
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1359
|
+
* are either a Float or an Array of Floats respectively.
|
1360
|
+
*/
|
1361
|
+
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
{
    /*
     * Temporary matcher with the receiver string as pattern. It is kept in a
     * local variable while rb_LongestSubstring_similar works with it.
     */
    VALUE matcher = rb_LongestSubstring_new(rb_cLongestSubstring, self);

    return rb_LongestSubstring_similar(matcher, strings);
}
|
1366
|
+
|
1367
|
+
/*
|
1368
|
+
* Document-class: Amatch::Jaro
|
1369
|
+
*
|
1370
|
+
* This class computes the Jaro metric for two strings.
|
1371
|
+
* The Jaro metric computes the similarity between 0 (no match)
|
1372
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1373
|
+
*/
|
1374
|
+
DEF_RB_FREE(Jaro, Jaro)
|
1375
|
+
|
1376
|
+
/*
|
1377
|
+
* Document-method: ignore_case
|
1378
|
+
*
|
1379
|
+
* call-seq: ignore_case -> true/false
|
1380
|
+
*
|
1381
|
+
* Returns whether case is ignored when computing matching characters.
|
1382
|
+
*/
|
1383
|
+
DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
|
1384
|
+
|
1385
|
+
/*
|
1386
|
+
* Document-method: ignore_case=
|
1387
|
+
*
|
1388
|
+
* call-seq: ignore_case=(true/false)
|
1389
|
+
*
|
1390
|
+
* Sets whether case is ignored when computing matching characters.
|
1391
|
+
*/
|
1392
|
+
DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
|
1393
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1394
|
+
|
1395
|
+
/*
|
1396
|
+
* call-seq: new(pattern)
|
1397
|
+
*
|
1398
|
+
* Creates a new Amatch::Jaro instance from <code>pattern</code>.
|
1399
|
+
*/
|
1400
|
+
static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Jaro)

    Jaro_pattern_set(amatch, pattern);
    /* New instances compare case-insensitively until ignore_case= is used. */
    amatch->ignore_case = 1;
    return self;
}
|
1407
|
+
|
1408
|
+
DEF_CONSTRUCTOR(Jaro, Jaro)
|
1409
|
+
|
1410
|
+
/*
|
1411
|
+
* call-seq: match(strings) -> results
|
1412
|
+
*
|
1413
|
+
* Uses this Amatch::Jaro instance to match
|
1414
|
+
* Jaro#pattern against <code>strings</code>, that is compute the
|
1415
|
+
* jaro metric with the strings. <code>strings</code> has to be
|
1416
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1417
|
+
* are either a Float or an Array of Floats respectively.
|
1418
|
+
*/
|
1419
|
+
static VALUE rb_Jaro_match(VALUE self, VALUE strings)
{
    /* GET_STRUCT (macro defined elsewhere) introduces the local `amatch`. */
    GET_STRUCT(Jaro)

    /* Jaro_match is handed over as the per-string matcher. */
    return Jaro_iterate_strings(amatch, strings, Jaro_match);
}
|
1424
|
+
|
1425
|
+
/*
|
1426
|
+
* call-seq: jaro_similar(strings) -> results
|
1427
|
+
*
|
1428
|
+
* If called on a String, this string is used as a
|
1429
|
+
* Amatch::Jaro#pattern to match against <code>strings</code>. It
|
1430
|
+
* returns a Jaro metric number between 0.0 for very
|
1431
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1432
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1433
|
+
* are either a Float or an Array of Floats respectively.
|
1434
|
+
*/
|
1435
|
+
static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
{
    /*
     * Temporary matcher with the receiver string as pattern. It is kept in a
     * local variable while rb_Jaro_match works with it.
     */
    VALUE matcher = rb_Jaro_new(rb_cJaro, self);

    return rb_Jaro_match(matcher, strings);
}
|
1440
|
+
|
1441
|
+
/*
|
1442
|
+
* Document-class: Amatch::JaroWinkler
|
1443
|
+
*
|
1444
|
+
* This class computes the Jaro-Winkler metric for two strings.
|
1445
|
+
* The Jaro-Winkler metric computes the similarity between 0 (no match)
|
1446
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1447
|
+
*
|
1448
|
+
* It is a variant of the Jaro metric, with additional weighting towards
|
1449
|
+
* common prefixes.
|
1450
|
+
*/
|
1451
|
+
DEF_RB_FREE(JaroWinkler, JaroWinkler)
|
1452
|
+
|
1453
|
+
/*
|
1454
|
+
* Document-method: ignore_case
|
1455
|
+
*
|
1456
|
+
* call-seq: ignore_case -> true/false
|
1457
|
+
*
|
1458
|
+
* Returns whether case is ignored when computing matching characters.
|
1459
|
+
* Default is true.
|
1460
|
+
*/
|
1461
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
|
1462
|
+
|
1463
|
+
/*
|
1464
|
+
* Document-method: scaling_factor
|
1465
|
+
*
|
1466
|
+
* call-seq: scaling_factor -> weight
|
1467
|
+
*
|
1468
|
+
* The scaling factor is how much weight to give common prefixes.
|
1469
|
+
* Default is 0.1.
|
1470
|
+
*/
|
1471
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
|
1472
|
+
|
1473
|
+
/*
|
1474
|
+
* Document-method: ignore_case=
|
1475
|
+
*
|
1476
|
+
* call-seq: ignore_case=(true/false)
|
1477
|
+
*
|
1478
|
+
* Sets whether case is ignored when computing matching characters.
|
1479
|
+
*/
|
1480
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
|
1481
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1482
|
+
|
1483
|
+
/*
|
1484
|
+
* Document-method: scaling_factor=
|
1485
|
+
*
|
1486
|
+
* call-seq: scaling_factor=(weight)
|
1487
|
+
*
|
1488
|
+
* Sets the weight to give common prefixes.
|
1489
|
+
*/
|
1490
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
|
1491
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
1492
|
+
|
1493
|
+
/*
|
1494
|
+
* call-seq: new(pattern)
|
1495
|
+
*
|
1496
|
+
* Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
|
1497
|
+
*/
|
1498
|
+
static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
|
1499
|
+
{
|
1500
|
+
GET_STRUCT(JaroWinkler)
|
1501
|
+
JaroWinkler_pattern_set(amatch, pattern);
|
1502
|
+
amatch->ignore_case = 1;
|
1503
|
+
amatch->scaling_factor = 0.1;
|
1504
|
+
return self;
|
1505
|
+
}
|
1506
|
+
|
1507
|
+
/* Instantiates DEF_CONSTRUCTOR (macro defined outside this excerpt) for
 * JaroWinkler. rb_JaroWinkler_s_allocate (registered as the alloc func in
 * Init_amatch) and rb_JaroWinkler_new (called by rb_str_jarowinkler_similar)
 * are not defined anywhere in this excerpt, so they presumably come from this
 * expansion -- TODO confirm against the macro body. */
DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
|
1508
|
+
|
1509
|
+
/*
|
1510
|
+
* call-seq: match(strings) -> results
|
1511
|
+
*
|
1512
|
+
* Uses this Amatch::Jaro instance to match
|
1513
|
+
* Jaro#pattern against <code>strings</code>, that is compute the
|
1514
|
+
* jaro metric with the strings. <code>strings</code> has to be
|
1515
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1516
|
+
* are either a Float or an Array of Floats respectively.
|
1517
|
+
*/
|
1518
|
+
static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
|
1519
|
+
{
|
1520
|
+
GET_STRUCT(JaroWinkler)
|
1521
|
+
return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
|
1522
|
+
}
|
1523
|
+
|
1524
|
+
/*
|
1525
|
+
* call-seq: jarowinkler_similar(strings) -> results
|
1526
|
+
*
|
1527
|
+
* If called on a String, this string is used as a
|
1528
|
+
* Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
|
1529
|
+
* returns a Jaro-Winkler metric number between 0.0 for very
|
1530
|
+
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1531
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1532
|
+
* are either a Float or an Array of Floats respectively.
|
1533
|
+
*/
|
1534
|
+
static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
|
1535
|
+
{
|
1536
|
+
VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
|
1537
|
+
return rb_JaroWinkler_match(amatch, strings);
|
1538
|
+
}
|
1539
|
+
|
1540
|
+
void Init_amatch()
|
1541
|
+
{
|
1542
|
+
rb_require("amatch/version");
|
1543
|
+
rb_mAmatch = rb_define_module("Amatch");
|
1544
|
+
|
1545
|
+
/* Levenshtein */
|
1546
|
+
rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
|
1547
|
+
rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
|
1548
|
+
rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
|
1549
|
+
rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
|
1550
|
+
rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1551
|
+
rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
|
1552
|
+
rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
|
1553
|
+
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1554
|
+
rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1555
|
+
|
1556
|
+
/* Sellers */
|
1557
|
+
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1558
|
+
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|
1559
|
+
rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
|
1560
|
+
rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
|
1561
|
+
rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
|
1562
|
+
rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
|
1563
|
+
rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
|
1564
|
+
rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
|
1565
|
+
rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
|
1566
|
+
rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
|
1567
|
+
rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
|
1568
|
+
rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
|
1569
|
+
rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
|
1570
|
+
rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
|
1571
|
+
rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
|
1572
|
+
|
1573
|
+
/* Hamming */
|
1574
|
+
rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
|
1575
|
+
rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
|
1576
|
+
rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
|
1577
|
+
rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
|
1578
|
+
rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
|
1579
|
+
rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
|
1580
|
+
rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
|
1581
|
+
rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
|
1582
|
+
|
1583
|
+
/* Pair Distance Metric */
|
1584
|
+
rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
|
1585
|
+
rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
|
1586
|
+
rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
|
1587
|
+
rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
|
1588
|
+
rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
|
1589
|
+
rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
|
1590
|
+
rb_define_alias(rb_cPairDistance, "similar", "match");
|
1591
|
+
rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
|
1592
|
+
|
1593
|
+
/* Longest Common Subsequence */
|
1594
|
+
rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
|
1595
|
+
rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
|
1596
|
+
rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
|
1597
|
+
rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
|
1598
|
+
rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
|
1599
|
+
rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
|
1600
|
+
rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
|
1601
|
+
rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
|
1602
|
+
|
1603
|
+
/* Longest Common Substring */
|
1604
|
+
rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
|
1605
|
+
rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
|
1606
|
+
rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
|
1607
|
+
rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
|
1608
|
+
rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
|
1609
|
+
rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
|
1610
|
+
rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
|
1611
|
+
rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
|
1612
|
+
|
1613
|
+
/* Jaro */
|
1614
|
+
rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
|
1615
|
+
rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
|
1616
|
+
rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
|
1617
|
+
rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
|
1618
|
+
rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
|
1619
|
+
rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
|
1620
|
+
rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
|
1621
|
+
rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
|
1622
|
+
rb_define_alias(rb_cJaro, "similar", "match");
|
1623
|
+
rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
|
1624
|
+
|
1625
|
+
/* Jaro-Winkler */
|
1626
|
+
rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
|
1627
|
+
rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
|
1628
|
+
rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
|
1629
|
+
rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
|
1630
|
+
rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
|
1631
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
|
1632
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
|
1633
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
|
1634
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
|
1635
|
+
rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
|
1636
|
+
rb_define_alias(rb_cJaroWinkler, "similar", "match");
|
1637
|
+
rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
|
1638
|
+
|
1639
|
+
id_split = rb_intern("split");
|
1640
|
+
id_to_f = rb_intern("to_f");
|
1641
|
+
}
|