amatch 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +4 -0
- data/{GPL → COPYING} +7 -7
- data/README +25 -0
- data/Rakefile +66 -64
- data/VERSION +1 -1
- data/bin/agrep.rb +0 -4
- data/doc-main.txt +115 -0
- data/ext/amatch.c +351 -98
- data/ext/common.h +25 -0
- data/ext/extconf.rb +0 -5
- data/ext/pair.c +5 -6
- data/ext/pair.h +1 -1
- data/install.rb +28 -1015
- data/lib/amatch/version.rb +8 -0
- data/tests/test_hamming.rb +1 -2
- data/tests/test_jaro.rb +29 -0
- data/tests/test_jaro_winkler.rb +38 -0
- data/tests/test_levenshtein.rb +25 -27
- data/tests/test_longest_subsequence.rb +1 -2
- data/tests/test_longest_substring.rb +1 -2
- data/tests/test_pair_distance.rb +1 -2
- data/tests/test_sellers.rb +52 -54
- metadata +76 -55
- data/README.en +0 -31
- data/ext/MANIFEST +0 -2
- data/ext/tags +0 -24
- data/tests/runner.rb +0 -26
data/CHANGES
CHANGED
data/{GPL → COPYING}
RENAMED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
GNU GENERAL PUBLIC LICENSE
|
2
|
+
Version 2, June 1991
|
3
3
|
|
4
4
|
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
|
5
5
|
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
6
6
|
Everyone is permitted to copy and distribute verbatim copies
|
7
7
|
of this license document, but changing it is not allowed.
|
8
8
|
|
9
|
-
|
9
|
+
Preamble
|
10
10
|
|
11
11
|
The licenses for most software are designed to take away your
|
12
12
|
freedom to share and change it. By contrast, the GNU General Public
|
@@ -56,7 +56,7 @@ patent must be licensed for everyone's free use or not licensed at all.
|
|
56
56
|
The precise terms and conditions for copying, distribution and
|
57
57
|
modification follow.
|
58
58
|
|
59
|
-
|
59
|
+
GNU GENERAL PUBLIC LICENSE
|
60
60
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
61
61
|
|
62
62
|
0. This License applies to any program or other work which contains
|
@@ -255,7 +255,7 @@ make exceptions for this. Our decision will be guided by the two goals
|
|
255
255
|
of preserving the free status of all derivatives of our free software and
|
256
256
|
of promoting the sharing and reuse of software generally.
|
257
257
|
|
258
|
-
|
258
|
+
NO WARRANTY
|
259
259
|
|
260
260
|
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
261
261
|
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
@@ -277,9 +277,9 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
|
277
277
|
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
278
278
|
POSSIBILITY OF SUCH DAMAGES.
|
279
279
|
|
280
|
-
|
280
|
+
END OF TERMS AND CONDITIONS
|
281
281
|
|
282
|
-
|
282
|
+
How to Apply These Terms to Your New Programs
|
283
283
|
|
284
284
|
If you develop a new program, and you want it to be of the greatest
|
285
285
|
possible use to the public, the best way to achieve this is to make it
|
data/README
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Installation
|
2
|
+
============
|
3
|
+
|
4
|
+
Just type into the command line as root:
|
5
|
+
|
6
|
+
# ruby install.rb
|
7
|
+
|
8
|
+
If you have installed rake (rake.rubyforge.org), you can also type:
|
9
|
+
|
10
|
+
# rake install
|
11
|
+
|
12
|
+
To install this extension as a gem, type:
|
13
|
+
|
14
|
+
# gem install amatch
|
15
|
+
|
16
|
+
Author
|
17
|
+
======
|
18
|
+
|
19
|
+
Florian Frank <flori@ping.de>
|
20
|
+
|
21
|
+
License
|
22
|
+
=======
|
23
|
+
|
24
|
+
GNU General Public License, Version 2 (GPLv2)
|
25
|
+
|
data/Rakefile
CHANGED
@@ -1,32 +1,33 @@
|
|
1
|
-
# vim: set et sw=2 ts=2:
|
2
|
-
require 'rake/clean'
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'rake/gempackagetask'
|
5
|
-
require 'rake/rdoctask'
|
6
|
-
require 'rbconfig'
|
1
|
+
# vim: set filetype=ruby et sw=2 ts=2:
|
7
2
|
|
3
|
+
begin
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
rescue LoadError
|
6
|
+
end
|
7
|
+
require 'rbconfig'
|
8
8
|
include Config
|
9
|
+
require 'rake/clean'
|
10
|
+
CLEAN.include 'coverage', 'doc'
|
11
|
+
require 'rake/testtask'
|
9
12
|
|
13
|
+
MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
|
14
|
+
PKG_NAME = 'amatch'
|
10
15
|
PKG_VERSION = File.read('VERSION').chomp
|
11
|
-
PKG_FILES = FileList[
|
12
|
-
|
13
|
-
PKG_FILES.exclude(/^pkg/)
|
14
|
-
PKG_FILES.exclude(/^doc/)
|
16
|
+
PKG_FILES = FileList["**/*"].exclude(/^(pkg|coverage|doc)/)
|
17
|
+
PKG_DOC_FILES = [ "ext/amatch.c" ].concat(Dir['lib/**/*.rb']) << 'doc-main.txt'
|
15
18
|
|
16
19
|
task :default => :test
|
17
20
|
|
18
21
|
desc "Run unit tests"
|
19
|
-
task :test => :
|
20
|
-
|
21
|
-
ruby %{-I../ext runner.rb}
|
22
|
-
end
|
22
|
+
task :test => :compile_ext do
|
23
|
+
sh %{testrb -Iext:lib tests/test_*.rb}
|
23
24
|
end
|
24
25
|
|
25
26
|
desc "Compiling library"
|
26
|
-
task :
|
27
|
+
task :compile_ext do
|
27
28
|
cd 'ext' do
|
28
29
|
ruby %{extconf.rb}
|
29
|
-
sh
|
30
|
+
sh MAKE
|
30
31
|
end
|
31
32
|
end
|
32
33
|
|
@@ -40,72 +41,73 @@ end
|
|
40
41
|
|
41
42
|
desc "Removing generated files"
|
42
43
|
task :clean do
|
43
|
-
|
44
|
-
cd 'ext' do
|
44
|
+
cd 'ext' do
|
45
45
|
ruby 'extconf.rb'
|
46
|
-
sh "
|
46
|
+
sh "#{MAKE} distclean" if File.exist?('Makefile')
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
rd.rdoc_dir = 'doc'
|
50
|
+
desc "Build the documentation"
|
51
|
+
task :doc do
|
52
|
+
sh "rdoc -m doc-main.txt -t '#{PKG_NAME} - Approximate Matching' #{PKG_DOC_FILES * ' '}"
|
54
53
|
end
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
s.description = <<EOF
|
55
|
+
if defined? Gem
|
56
|
+
spec = Gem::Specification.new do |s|
|
57
|
+
s.name = 'amatch'
|
58
|
+
s.version = PKG_VERSION
|
59
|
+
s.summary = "Approximate String Matching library"
|
60
|
+
s.description = <<EOF
|
63
61
|
Amatch is a library for approximate string matching and searching in strings.
|
64
62
|
Several algorithms can be used to do this, and it's also possible to compute a
|
65
63
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
66
64
|
EOF
|
67
65
|
|
68
|
-
|
66
|
+
s.files = PKG_FILES
|
69
67
|
|
70
|
-
|
71
|
-
#s.requirements << ""
|
68
|
+
s.extensions << "ext/extconf.rb"
|
72
69
|
|
73
|
-
|
70
|
+
s.require_path = 'ext'
|
74
71
|
|
75
|
-
|
72
|
+
s.bindir = "bin"
|
73
|
+
s.executables = ["agrep.rb"]
|
74
|
+
s.default_executable = "agrep.rb"
|
76
75
|
|
77
|
-
|
76
|
+
s.has_rdoc = true
|
77
|
+
s.extra_rdoc_files.concat PKG_DOC_FILES
|
78
|
+
s.rdoc_options << '--main' << 'doc-main.txt' <<
|
79
|
+
'--title' << "#{PKG_NAME} - Approximate Matching"
|
80
|
+
s.test_files.concat Dir['tests/test_*.rb']
|
78
81
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
s.bindir = "bin" # Use these for applications.
|
85
|
-
s.executables = ["agrep.rb"]
|
86
|
-
s.default_executable = "agrep.rb"
|
87
|
-
|
88
|
-
#### Documentation and testing.
|
89
|
-
|
90
|
-
s.has_rdoc = true
|
91
|
-
#s.extra_rdoc_files = FileList['ext/amatch.c']
|
92
|
-
s.rdoc_options <<
|
93
|
-
'--title' << 'Amatch -- Approximate Matching' <<
|
94
|
-
'--main' << 'Amatch' <<
|
95
|
-
'--line-numbers'
|
96
|
-
s.test_files << 'tests/runner.rb'
|
97
|
-
|
98
|
-
#### Author and project details.
|
82
|
+
s.author = "Florian Frank"
|
83
|
+
s.email = "flori@ping.de"
|
84
|
+
s.homepage = "http://amatch.rubyforge.org"
|
85
|
+
s.rubyforge_project = "amatch"
|
86
|
+
end
|
99
87
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
88
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
89
|
+
pkg.need_tar = true
|
90
|
+
pkg.package_files += PKG_FILES
|
91
|
+
end
|
104
92
|
end
|
105
93
|
|
106
|
-
|
107
|
-
|
108
|
-
|
94
|
+
desc m = "Writing version information for #{PKG_VERSION}"
|
95
|
+
task :version do
|
96
|
+
puts m
|
97
|
+
File.open(File.join('lib', 'amatch', 'version.rb'), 'w') do |v|
|
98
|
+
v.puts <<EOT
|
99
|
+
module Amatch
|
100
|
+
# Amatch version
|
101
|
+
VERSION = '#{PKG_VERSION}'
|
102
|
+
VERSION_ARRAY = VERSION.split(/\\./).map { |x| x.to_i } # :nodoc:
|
103
|
+
VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
|
104
|
+
VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
|
105
|
+
VERSION_BUILD = VERSION_ARRAY[2] # :nodoc:
|
109
106
|
end
|
107
|
+
EOT
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
110
111
|
|
111
|
-
|
112
|
+
desc "Prepare a new release"
|
113
|
+
task :release => [ :clean, :version, :package ]
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.4
|
data/bin/agrep.rb
CHANGED
data/doc-main.txt
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
== amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
=== Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for Approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, the Jaro metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
=== Author
|
12
|
+
|
13
|
+
Florian Frank mailto:flori@ping.de
|
14
|
+
|
15
|
+
=== License
|
16
|
+
|
17
|
+
This is free software; you can redistribute it and/or modify it under
|
18
|
+
the terms of the GNU General Public License Version 2 as published by
|
19
|
+
the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
20
|
+
|
21
|
+
=== Download
|
22
|
+
|
23
|
+
The latest version of <b>amatch</b> can be found at
|
24
|
+
|
25
|
+
* http://rubyforge.org/frs/?group_id=390
|
26
|
+
|
27
|
+
Online Documentation should be located at
|
28
|
+
|
29
|
+
* http://amatch.rubyforge.org
|
30
|
+
|
31
|
+
=== Examples
|
32
|
+
require 'amatch'
|
33
|
+
# => true
|
34
|
+
include Amatch
|
35
|
+
# => Object
|
36
|
+
|
37
|
+
m = Sellers.new("pattern")
|
38
|
+
# => #<Amatch::Sellers:0x40366324>
|
39
|
+
m.match("pattren")
|
40
|
+
# => 2.0
|
41
|
+
m.substitution = m.insertion = 3
|
42
|
+
# => 3
|
43
|
+
m.match("pattren")
|
44
|
+
# => 4.0
|
45
|
+
m.reset_weights
|
46
|
+
# => #<Amatch::Sellers:0x40366324>
|
47
|
+
m.match(["pattren","parent"])
|
48
|
+
# => [2.0, 4.0]
|
49
|
+
m.search("abcpattrendef")
|
50
|
+
# => 2.0
|
51
|
+
|
52
|
+
m = Levenshtein.new("pattern")
|
53
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
54
|
+
m.match("pattren")
|
55
|
+
# => 2
|
56
|
+
m.search("abcpattrendef")
|
57
|
+
# => 2
|
58
|
+
"pattern language".levenshtein_similar("language of patterns")
|
59
|
+
# => 0.2
|
60
|
+
|
61
|
+
m = Hamming.new("pattern")
|
62
|
+
# => #<Amatch::Hamming:0x40350858>
|
63
|
+
m.match("pattren")
|
64
|
+
# => 2
|
65
|
+
"pattern language".hamming_similar("language of patterns")
|
66
|
+
# => 0.1
|
67
|
+
|
68
|
+
m = PairDistance.new("pattern")
|
69
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
70
|
+
m.match("pattr en")
|
71
|
+
# => 0.545454545454545
|
72
|
+
m.match("pattr en", nil)
|
73
|
+
# => 0.461538461538462
|
74
|
+
m.match("pattr en", /t+/)
|
75
|
+
# => 0.285714285714286
|
76
|
+
"pattern language".pair_distance_similar("language of patterns")
|
77
|
+
# => 0.928571428571429
|
78
|
+
|
79
|
+
m = LongestSubsequence.new("pattern")
|
80
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
81
|
+
m.match("pattren")
|
82
|
+
# => 6
|
83
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
84
|
+
# => 0.4
|
85
|
+
|
86
|
+
m = LongestSubstring.new("pattern")
|
87
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
88
|
+
m.match("pattren")
|
89
|
+
# => 4
|
90
|
+
"pattern language".longest_substring_similar("language of patterns")
|
91
|
+
# => 0.4
|
92
|
+
|
93
|
+
m = Jaro.new("pattern")
|
94
|
+
# => #<Amatch::Jaro:0x363b70>
|
95
|
+
m.match("paTTren")
|
96
|
+
# => 0.952380952380952
|
97
|
+
m.ignore_case = false
|
98
|
+
m.match("paTTren")
|
99
|
+
# => 0.742857142857143
|
100
|
+
"pattern language".jaro_similar("language of patterns")
|
101
|
+
# => 0.672222222222222
|
102
|
+
|
103
|
+
m = JaroWinkler.new("pattern")
|
104
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
105
|
+
m.match("paTTren")
|
106
|
+
# => 0.971428571712403
|
107
|
+
m.ignore_case = false
|
108
|
+
m.match("paTTren")
|
109
|
+
# => 0.79428571505206
|
110
|
+
m.scaling_factor = 0.05
|
111
|
+
m.match("pattren")
|
112
|
+
# => 0.961904762046678
|
113
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
114
|
+
# => 0.672222222222222
|
115
|
+
|
data/ext/amatch.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include "pair.h"
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "common.h"
|
3
5
|
|
4
6
|
/*
|
5
7
|
* Document-method: pattern
|
@@ -19,7 +21,8 @@
|
|
19
21
|
|
20
22
|
|
21
23
|
static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
22
|
-
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring
|
24
|
+
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
|
+
rb_cJaro, rb_cJaroWinkler;
|
23
26
|
|
24
27
|
static ID id_split, id_to_f;
|
25
28
|
|
@@ -62,10 +65,10 @@ static void type##_pattern_set(type *amatch, VALUE pattern) \
|
|
62
65
|
{ \
|
63
66
|
Check_Type(pattern, T_STRING); \
|
64
67
|
free(amatch->pattern); \
|
65
|
-
amatch->pattern_len =
|
68
|
+
amatch->pattern_len = RSTRING_LEN(pattern); \
|
66
69
|
amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
|
67
|
-
MEMCPY(amatch->pattern,
|
68
|
-
|
70
|
+
MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
|
71
|
+
RSTRING_LEN(pattern)); \
|
69
72
|
} \
|
70
73
|
static VALUE rb_##type##_pattern(VALUE self) \
|
71
74
|
{ \
|
@@ -80,16 +83,16 @@ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
|
|
80
83
|
}
|
81
84
|
|
82
85
|
#define DEF_ITERATE_STRINGS(type) \
|
83
|
-
static VALUE type##_iterate_strings(type *amatch, VALUE strings,
|
84
|
-
VALUE (*match_function) (type *amatch, VALUE strings))
|
86
|
+
static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
|
87
|
+
VALUE (*match_function) (type *amatch, VALUE strings)) \
|
85
88
|
{ \
|
86
89
|
if (TYPE(strings) == T_STRING) { \
|
87
90
|
return match_function(amatch, strings); \
|
88
91
|
} else { \
|
89
92
|
Check_Type(strings, T_ARRAY); \
|
90
93
|
int i; \
|
91
|
-
VALUE result = rb_ary_new2(
|
92
|
-
for (i = 0; i <
|
94
|
+
VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
|
95
|
+
for (i = 0; i < RARRAY_LEN(strings); i++) { \
|
93
96
|
VALUE string = rb_ary_entry(strings, i); \
|
94
97
|
if (TYPE(string) != T_STRING) { \
|
95
98
|
rb_raise(rb_eTypeError, \
|
@@ -130,17 +133,25 @@ VALUE function(VALUE self, VALUE value) \
|
|
130
133
|
obj = rb_funcall(obj, id_to_f, 0, 0); \
|
131
134
|
else \
|
132
135
|
Check_Type(obj, T_FLOAT)
|
133
|
-
#define FLOAT2C(obj)
|
136
|
+
#define FLOAT2C(obj) (RFLOAT_VALUE(obj))
|
137
|
+
|
138
|
+
#define CAST2BOOL(obj) \
|
139
|
+
if (obj == Qfalse || obj == Qnil) \
|
140
|
+
obj = Qfalse; \
|
141
|
+
else \
|
142
|
+
obj = Qtrue;
|
143
|
+
#define BOOL2C(obj) (obj == Qtrue)
|
144
|
+
#define C2BOOL(obj) (obj ? Qtrue : Qfalse)
|
134
145
|
|
135
146
|
#define OPTIMIZE_TIME \
|
136
|
-
if (amatch->pattern_len <
|
147
|
+
if (amatch->pattern_len < RSTRING_LEN(string)) { \
|
137
148
|
a_ptr = amatch->pattern; \
|
138
149
|
a_len = amatch->pattern_len; \
|
139
|
-
b_ptr =
|
140
|
-
b_len =
|
150
|
+
b_ptr = RSTRING_PTR(string); \
|
151
|
+
b_len = RSTRING_LEN(string); \
|
141
152
|
} else { \
|
142
|
-
a_ptr =
|
143
|
-
a_len =
|
153
|
+
a_ptr = RSTRING_PTR(string); \
|
154
|
+
a_len = RSTRING_LEN(string); \
|
144
155
|
b_ptr = amatch->pattern; \
|
145
156
|
b_len = amatch->pattern_len; \
|
146
157
|
}
|
@@ -148,8 +159,8 @@ VALUE function(VALUE self, VALUE value) \
|
|
148
159
|
#define DONT_OPTIMIZE \
|
149
160
|
a_ptr = amatch->pattern; \
|
150
161
|
a_len = amatch->pattern_len; \
|
151
|
-
b_ptr =
|
152
|
-
b_len =
|
162
|
+
b_ptr = RSTRING_PTR(string); \
|
163
|
+
b_len = RSTRING_LEN(string); \
|
153
164
|
|
154
165
|
/*
|
155
166
|
* C structures of the Amatch classes
|
@@ -192,6 +203,27 @@ typedef struct PairDistanceStruct {
|
|
192
203
|
DEF_ALLOCATOR(PairDistance)
|
193
204
|
DEF_PATTERN_ACCESSOR(PairDistance)
|
194
205
|
|
206
|
+
typedef struct JaroStruct {
|
207
|
+
char *pattern;
|
208
|
+
int pattern_len;
|
209
|
+
int ignore_case;
|
210
|
+
} Jaro;
|
211
|
+
|
212
|
+
DEF_ALLOCATOR(Jaro)
|
213
|
+
DEF_PATTERN_ACCESSOR(Jaro)
|
214
|
+
DEF_ITERATE_STRINGS(Jaro)
|
215
|
+
|
216
|
+
typedef struct JaroWinklerStruct {
|
217
|
+
char *pattern;
|
218
|
+
int pattern_len;
|
219
|
+
int ignore_case;
|
220
|
+
float scaling_factor;
|
221
|
+
} JaroWinkler;
|
222
|
+
|
223
|
+
DEF_ALLOCATOR(JaroWinkler)
|
224
|
+
DEF_PATTERN_ACCESSOR(JaroWinkler)
|
225
|
+
DEF_ITERATE_STRINGS(JaroWinkler)
|
226
|
+
|
195
227
|
/*
|
196
228
|
* Levenshtein edit distances are computed here:
|
197
229
|
*/
|
@@ -616,6 +648,123 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
616
648
|
return rb_float_new(((double) result) / b_len);
|
617
649
|
}
|
618
650
|
|
651
|
+
/*
|
652
|
+
* Jaro computation
|
653
|
+
*/
|
654
|
+
|
655
|
+
#define COMPUTE_JARO \
|
656
|
+
l[0] = ALLOC_N(int, a_len); \
|
657
|
+
MEMZERO(l[0], int, a_len); \
|
658
|
+
l[1] = ALLOC_N(int, b_len); \
|
659
|
+
MEMZERO(l[1], int, b_len); \
|
660
|
+
max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1; \
|
661
|
+
m = 0; \
|
662
|
+
for (i = 0; i < a_len; i++) { \
|
663
|
+
low = (i > max_dist ? i - max_dist : 0); \
|
664
|
+
high = (i + max_dist < b_len ? i + max_dist : b_len); \
|
665
|
+
for (j = low; j <= high; j++) { \
|
666
|
+
if (!l[1][j] && a_ptr[i] == b_ptr[j]) { \
|
667
|
+
l[0][i] = 1; \
|
668
|
+
l[1][j] = 1; \
|
669
|
+
m++; \
|
670
|
+
break; \
|
671
|
+
} \
|
672
|
+
} \
|
673
|
+
} \
|
674
|
+
if (m == 0) { \
|
675
|
+
result = 0.0; \
|
676
|
+
} else { \
|
677
|
+
k = t = 0; \
|
678
|
+
for (i = 0; i < a_len; i++) { \
|
679
|
+
if (l[0][i]) { \
|
680
|
+
for (j = k; j < b_len; j++) { \
|
681
|
+
if (l[1][j]) { \
|
682
|
+
k = j + 1; \
|
683
|
+
break; \
|
684
|
+
} \
|
685
|
+
} \
|
686
|
+
if (a_ptr[i] != b_ptr[j]) { \
|
687
|
+
t++; \
|
688
|
+
} \
|
689
|
+
} \
|
690
|
+
} \
|
691
|
+
t = t / 2; \
|
692
|
+
result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
|
693
|
+
}
|
694
|
+
|
695
|
+
#define LOWERCASE_STRINGS \
|
696
|
+
char *ying = ALLOC_N(char, a_len); \
|
697
|
+
MEMCPY(ying, a_ptr, char, a_len); \
|
698
|
+
a_ptr = ying; \
|
699
|
+
char *yang = ALLOC_N(char, b_len); \
|
700
|
+
MEMCPY(yang, b_ptr, char, b_len); \
|
701
|
+
b_ptr = yang; \
|
702
|
+
for (i = 0; i < a_len; i++) { \
|
703
|
+
if (islower(a_ptr[i])) a_ptr[i] = toupper(a_ptr[i]); \
|
704
|
+
} \
|
705
|
+
for (i = 0; i < b_len; i++) { \
|
706
|
+
if (islower(b_ptr[i])) b_ptr[i] = toupper(b_ptr[i]); \
|
707
|
+
}
|
708
|
+
|
709
|
+
#define FREE_STRINGS \
|
710
|
+
xfree(a_ptr); \
|
711
|
+
xfree(b_ptr);
|
712
|
+
|
713
|
+
static VALUE Jaro_match(Jaro *amatch, VALUE string)
|
714
|
+
{
|
715
|
+
char *a_ptr, *b_ptr;
|
716
|
+
int a_len, b_len, max_dist, m, t, i, j, k, low, high;
|
717
|
+
int *l[2];
|
718
|
+
double result;
|
719
|
+
|
720
|
+
Check_Type(string, T_STRING);
|
721
|
+
OPTIMIZE_TIME
|
722
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
723
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
724
|
+
if (amatch->ignore_case) {
|
725
|
+
LOWERCASE_STRINGS
|
726
|
+
}
|
727
|
+
COMPUTE_JARO
|
728
|
+
if (amatch->ignore_case) {
|
729
|
+
FREE_STRINGS
|
730
|
+
}
|
731
|
+
return rb_float_new(result);
|
732
|
+
}
|
733
|
+
|
734
|
+
/*
|
735
|
+
* Jaro-Winkler computation
|
736
|
+
*/
|
737
|
+
|
738
|
+
static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
739
|
+
{
|
740
|
+
char *a_ptr, *b_ptr;
|
741
|
+
int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
|
742
|
+
int *l[2];
|
743
|
+
double result;
|
744
|
+
|
745
|
+
Check_Type(string, T_STRING);
|
746
|
+
OPTIMIZE_TIME
|
747
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
748
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
749
|
+
if (amatch->ignore_case) {
|
750
|
+
LOWERCASE_STRINGS
|
751
|
+
}
|
752
|
+
COMPUTE_JARO
|
753
|
+
n = 0;
|
754
|
+
for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
|
755
|
+
if (a_ptr[i] == b_ptr[i]) {
|
756
|
+
n++;
|
757
|
+
} else {
|
758
|
+
break;
|
759
|
+
}
|
760
|
+
}
|
761
|
+
result = result + n*amatch->scaling_factor*(1-result);
|
762
|
+
if (amatch->ignore_case) {
|
763
|
+
FREE_STRINGS
|
764
|
+
}
|
765
|
+
return rb_float_new(result);
|
766
|
+
}
|
767
|
+
|
619
768
|
/*
|
620
769
|
* Ruby API
|
621
770
|
*/
|
@@ -951,8 +1100,8 @@ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
|
951
1100
|
} else {
|
952
1101
|
Check_Type(strings, T_ARRAY);
|
953
1102
|
int i;
|
954
|
-
result = rb_ary_new2(
|
955
|
-
for (i = 0; i <
|
1103
|
+
result = rb_ary_new2(RARRAY_LEN(strings));
|
1104
|
+
for (i = 0; i < RARRAY_LEN(strings); i++) {
|
956
1105
|
VALUE string = rb_ary_entry(strings, i);
|
957
1106
|
if (TYPE(string) != T_STRING) {
|
958
1107
|
rb_raise(rb_eTypeError,
|
@@ -1214,104 +1363,183 @@ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
|
1214
1363
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1215
1364
|
return rb_LongestSubstring_similar(amatch, strings);
|
1216
1365
|
}
|
1366
|
+
|
1367
|
+
/*
|
1368
|
+
* Document-class: Amatch::Jaro
|
1369
|
+
*
|
1370
|
+
* This class computes the Jaro metric for two strings.
|
1371
|
+
* The Jaro metric computes the similarity between 0 (no match)
|
1372
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1373
|
+
*/
|
1374
|
+
DEF_RB_FREE(Jaro, Jaro)
|
1375
|
+
|
1376
|
+
/*
|
1377
|
+
* Document-method: ignore_case
|
1378
|
+
*
|
1379
|
+
* call-seq: ignore_case -> true/false
|
1380
|
+
*
|
1381
|
+
* Returns whether case is ignored when computing matching characters.
|
1382
|
+
*/
|
1383
|
+
DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
|
1384
|
+
|
1385
|
+
/*
|
1386
|
+
* Document-method: ignore_case=
|
1387
|
+
*
|
1388
|
+
* call-seq: ignore_case=(true/false)
|
1389
|
+
*
|
1390
|
+
* Sets whether case is ignored when computing matching characters.
|
1391
|
+
*/
|
1392
|
+
DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
|
1393
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1394
|
+
|
1395
|
+
/*
|
1396
|
+
* call-seq: new(pattern)
|
1397
|
+
*
|
1398
|
+
* Creates a new Amatch::Jaro instance from <code>pattern</code>.
|
1399
|
+
*/
|
1400
|
+
static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
|
1401
|
+
{
|
1402
|
+
GET_STRUCT(Jaro)
|
1403
|
+
Jaro_pattern_set(amatch, pattern);
|
1404
|
+
amatch->ignore_case = 1;
|
1405
|
+
return self;
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
DEF_CONSTRUCTOR(Jaro, Jaro)
|
1217
1409
|
|
1218
1410
|
/*
|
1219
|
-
*
|
1411
|
+
* call-seq: match(strings) -> results
|
1220
1412
|
*
|
1221
|
-
*
|
1413
|
+
* Uses this Amatch::Jaro instance to match
|
1414
|
+
* Jaro#pattern against <code>strings</code>, that is compute the
|
1415
|
+
* jaro metric with the strings. <code>strings</code> has to be
|
1416
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1417
|
+
* are either a Float or an Array of Floats respectively.
|
1418
|
+
*/
|
1419
|
+
static VALUE rb_Jaro_match(VALUE self, VALUE strings)
|
1420
|
+
{
|
1421
|
+
GET_STRUCT(Jaro)
|
1422
|
+
return Jaro_iterate_strings(amatch, strings, Jaro_match);
|
1423
|
+
}
|
1424
|
+
|
1425
|
+
/*
|
1426
|
+
* call-seq: jaro_similar(strings) -> results
|
1222
1427
|
*
|
1223
|
-
*
|
1224
|
-
*
|
1225
|
-
*
|
1226
|
-
*
|
1227
|
-
*
|
1428
|
+
* If called on a String, this string is used as a
|
1429
|
+
* Amatch::Jaro#pattern to match against <code>strings</code>. It
|
1430
|
+
* returns a Jaro metric number between 0.0 for very
|
1431
|
+
* dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1432
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1433
|
+
* are either a Float or an Array of Floats respectively.
|
1434
|
+
*/
|
1435
|
+
static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
|
1436
|
+
{
|
1437
|
+
VALUE amatch = rb_Jaro_new(rb_cJaro, self);
|
1438
|
+
return rb_Jaro_match(amatch, strings);
|
1439
|
+
}
|
1440
|
+
|
1441
|
+
/*
|
1442
|
+
* Document-class: Amatch::JaroWinkler
|
1228
1443
|
*
|
1229
|
-
*
|
1444
|
+
* This class computes the Jaro-Winkler metric for two strings.
|
1445
|
+
* The Jaro-Winkler metric computes the similarity between 0 (no match)
|
1446
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1230
1447
|
*
|
1231
|
-
*
|
1448
|
+
* It is a variant of the Jaro metric, with additional weighting towards
|
1449
|
+
* common prefixes.
|
1450
|
+
*/
|
1451
|
+
DEF_RB_FREE(JaroWinkler, JaroWinkler)
|
1452
|
+
|
1453
|
+
/*
|
1454
|
+
* Document-method: ignore_case
|
1232
1455
|
*
|
1233
|
-
*
|
1456
|
+
* call-seq: ignore_case -> true/false
|
1234
1457
|
*
|
1235
|
-
*
|
1236
|
-
*
|
1237
|
-
|
1458
|
+
* Returns whether case is ignored when computing matching characters.
|
1459
|
+
* Default is true.
|
1460
|
+
*/
|
1461
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
|
1462
|
+
|
1463
|
+
/*
|
1464
|
+
* Document-method: scaling_factor
|
1238
1465
|
*
|
1239
|
-
*
|
1466
|
+
* call-seq: scaling_factor -> weight
|
1240
1467
|
*
|
1241
|
-
* The
|
1468
|
+
* The scaling factor is how much weight to give common prefixes.
|
1469
|
+
* Default is 0.1.
|
1470
|
+
*/
|
1471
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
|
1472
|
+
|
1473
|
+
/*
|
1474
|
+
* Document-method: ignore_case=
|
1242
1475
|
*
|
1243
|
-
*
|
1476
|
+
* call-seq: ignore_case=(true/false)
|
1244
1477
|
*
|
1245
|
-
*
|
1478
|
+
* Sets whether case is ignored when computing matching characters.
|
1479
|
+
*/
|
1480
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
|
1481
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1482
|
+
|
1483
|
+
/*
|
1484
|
+
* Document-method: scaling_factor=
|
1485
|
+
*
|
1486
|
+
* call-seq: scaling_factor=(weight)
|
1487
|
+
*
|
1488
|
+
* Sets the weight to give common prefixes.
|
1489
|
+
*/
|
1490
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
|
1491
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
1492
|
+
|
1493
|
+
/*
|
1494
|
+
* call-seq: new(pattern)
|
1246
1495
|
*
|
1247
|
-
*
|
1496
|
+
* Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
|
1497
|
+
*/
|
1498
|
+
static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
|
1499
|
+
{
|
1500
|
+
GET_STRUCT(JaroWinkler)
|
1501
|
+
JaroWinkler_pattern_set(amatch, pattern);
|
1502
|
+
amatch->ignore_case = 1;
|
1503
|
+
amatch->scaling_factor = 0.1;
|
1504
|
+
return self;
|
1505
|
+
}
|
1506
|
+
|
1507
|
+
DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
|
1508
|
+
|
1509
|
+
/*
|
1510
|
+
* call-seq: match(strings) -> results
|
1248
1511
|
*
|
1249
|
-
*
|
1250
|
-
*
|
1251
|
-
*
|
1252
|
-
*
|
1253
|
-
*
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
*
|
1263
|
-
* m.reset_weights
|
1264
|
-
* # => #<Amatch::Sellers:0x40366324>
|
1265
|
-
* m.match(["pattren","parent"])
|
1266
|
-
* # => [2.0, 4.0]
|
1267
|
-
* m.search("abcpattrendef")
|
1268
|
-
* # => 2.0
|
1269
|
-
*
|
1270
|
-
* m = Levenshtein.new("pattern")
|
1271
|
-
* # => #<Amatch::Levenshtein:0x4035919c>
|
1272
|
-
* m.match("pattren")
|
1273
|
-
* # => 2
|
1274
|
-
* m.search("abcpattrendef")
|
1275
|
-
* # => 2
|
1276
|
-
* "pattern language".levenshtein_similar("language of patterns")
|
1277
|
-
* # => 0.2
|
1278
|
-
*
|
1279
|
-
* m = Hamming.new("pattern")
|
1280
|
-
* # => #<Amatch::Hamming:0x40350858>
|
1281
|
-
* m.match("pattren")
|
1282
|
-
* # => 2
|
1283
|
-
* "pattern language".hamming_similar("language of patterns")
|
1284
|
-
* # => 0.1
|
1285
|
-
*
|
1286
|
-
* m = PairDistance.new("pattern")
|
1287
|
-
* # => #<Amatch::PairDistance:0x40349be8>
|
1288
|
-
* m.match("pattr en")
|
1289
|
-
* # => 0.545454545454545
|
1290
|
-
* m.match("pattr en", nil)
|
1291
|
-
* # => 0.461538461538462
|
1292
|
-
* m.match("pattr en", /t+/)
|
1293
|
-
* # => 0.285714285714286
|
1294
|
-
* "pattern language".pair_distance_similar("language of patterns")
|
1295
|
-
* # => 0.928571428571429
|
1296
|
-
*
|
1297
|
-
* m = LongestSubsequence.new("pattern")
|
1298
|
-
* # => #<Amatch::LongestSubsequence:0x4033e900>
|
1299
|
-
* m.match("pattren")
|
1300
|
-
* # => 6
|
1301
|
-
* "pattern language".longest_subsequence_similar("language of patterns")
|
1302
|
-
* # => 0.4
|
1303
|
-
*
|
1304
|
-
* m = LongestSubstring.new("pattern")
|
1305
|
-
* # => #<Amatch::LongestSubstring:0x403378d0>
|
1306
|
-
* m.match("pattren")
|
1307
|
-
* # => 4
|
1308
|
-
* "pattern language".longest_substring_similar("language of patterns")
|
1309
|
-
* # => 0.4
|
1512
|
+
* Uses this Amatch::JaroWinkler instance to match
|
1513
|
+
* JaroWinkler#pattern against <code>strings</code>, that is compute the
|
1514
|
+
* Jaro-Winkler metric with the strings. <code>strings</code> has to be
|
1515
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1516
|
+
* are either a Float or an Array of Floats respectively.
|
1517
|
+
*/
|
1518
|
+
static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
|
1519
|
+
{
|
1520
|
+
GET_STRUCT(JaroWinkler)
|
1521
|
+
return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
|
1522
|
+
}
|
1523
|
+
|
1524
|
+
/*
|
1525
|
+
* call-seq: jarowinkler_similar(strings) -> results
|
1310
1526
|
*
|
1527
|
+
* If called on a String, this string is used as a
|
1528
|
+
* Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
|
1529
|
+
* returns a Jaro-Winkler metric number between 0.0 for very
|
1530
|
+
* dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1531
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1532
|
+
* are either a Float or an Array of Floats respectively.
|
1311
1533
|
*/
|
1534
|
+
static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
|
1535
|
+
{
|
1536
|
+
VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
|
1537
|
+
return rb_JaroWinkler_match(amatch, strings);
|
1538
|
+
}
|
1312
1539
|
|
1313
1540
|
void Init_amatch()
|
1314
1541
|
{
|
1542
|
+
rb_require("amatch/version");
|
1315
1543
|
rb_mAmatch = rb_define_module("Amatch");
|
1316
1544
|
|
1317
1545
|
/* Levenshtein */
|
@@ -1382,7 +1610,32 @@ void Init_amatch()
|
|
1382
1610
|
rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
|
1383
1611
|
rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
|
1384
1612
|
|
1613
|
+
/* Jaro */
|
1614
|
+
rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
|
1615
|
+
rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
|
1616
|
+
rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
|
1617
|
+
rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
|
1618
|
+
rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
|
1619
|
+
rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
|
1620
|
+
rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
|
1621
|
+
rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
|
1622
|
+
rb_define_alias(rb_cJaro, "similar", "match");
|
1623
|
+
rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
|
1624
|
+
|
1625
|
+
/* Jaro-Winkler */
|
1626
|
+
rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
|
1627
|
+
rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
|
1628
|
+
rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
|
1629
|
+
rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
|
1630
|
+
rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
|
1631
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
|
1632
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
|
1633
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
|
1634
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
|
1635
|
+
rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
|
1636
|
+
rb_define_alias(rb_cJaroWinkler, "similar", "match");
|
1637
|
+
rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
|
1638
|
+
|
1385
1639
|
id_split = rb_intern("split");
|
1386
1640
|
id_to_f = rb_intern("to_f");
|
1387
1641
|
}
|
1388
|
-
/* vim: set et cin sw=4 ts=4: */
|