amatch 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +4 -0
- data/{GPL → COPYING} +7 -7
- data/README +25 -0
- data/Rakefile +66 -64
- data/VERSION +1 -1
- data/bin/agrep.rb +0 -4
- data/doc-main.txt +115 -0
- data/ext/amatch.c +351 -98
- data/ext/common.h +25 -0
- data/ext/extconf.rb +0 -5
- data/ext/pair.c +5 -6
- data/ext/pair.h +1 -1
- data/install.rb +28 -1015
- data/lib/amatch/version.rb +8 -0
- data/tests/test_hamming.rb +1 -2
- data/tests/test_jaro.rb +29 -0
- data/tests/test_jaro_winkler.rb +38 -0
- data/tests/test_levenshtein.rb +25 -27
- data/tests/test_longest_subsequence.rb +1 -2
- data/tests/test_longest_substring.rb +1 -2
- data/tests/test_pair_distance.rb +1 -2
- data/tests/test_sellers.rb +52 -54
- metadata +76 -55
- data/README.en +0 -31
- data/ext/MANIFEST +0 -2
- data/ext/tags +0 -24
- data/tests/runner.rb +0 -26
data/CHANGES
CHANGED
data/{GPL → COPYING}
RENAMED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
GNU GENERAL PUBLIC LICENSE
|
2
|
+
Version 2, June 1991
|
3
3
|
|
4
4
|
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
|
5
5
|
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
6
6
|
Everyone is permitted to copy and distribute verbatim copies
|
7
7
|
of this license document, but changing it is not allowed.
|
8
8
|
|
9
|
-
|
9
|
+
Preamble
|
10
10
|
|
11
11
|
The licenses for most software are designed to take away your
|
12
12
|
freedom to share and change it. By contrast, the GNU General Public
|
@@ -56,7 +56,7 @@ patent must be licensed for everyone's free use or not licensed at all.
|
|
56
56
|
The precise terms and conditions for copying, distribution and
|
57
57
|
modification follow.
|
58
58
|
|
59
|
-
|
59
|
+
GNU GENERAL PUBLIC LICENSE
|
60
60
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
61
61
|
|
62
62
|
0. This License applies to any program or other work which contains
|
@@ -255,7 +255,7 @@ make exceptions for this. Our decision will be guided by the two goals
|
|
255
255
|
of preserving the free status of all derivatives of our free software and
|
256
256
|
of promoting the sharing and reuse of software generally.
|
257
257
|
|
258
|
-
|
258
|
+
NO WARRANTY
|
259
259
|
|
260
260
|
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
261
261
|
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
@@ -277,9 +277,9 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
|
277
277
|
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
278
278
|
POSSIBILITY OF SUCH DAMAGES.
|
279
279
|
|
280
|
-
|
280
|
+
END OF TERMS AND CONDITIONS
|
281
281
|
|
282
|
-
|
282
|
+
How to Apply These Terms to Your New Programs
|
283
283
|
|
284
284
|
If you develop a new program, and you want it to be of the greatest
|
285
285
|
possible use to the public, the best way to achieve this is to make it
|
data/README
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Installation
|
2
|
+
============
|
3
|
+
|
4
|
+
Just type into the command line as root:
|
5
|
+
|
6
|
+
# ruby install.rb
|
7
|
+
|
8
|
+
If you have installed rake (rake.rubyforge.org), you can also type:
|
9
|
+
|
10
|
+
# rake install
|
11
|
+
|
12
|
+
To install this extension as a gem type
|
13
|
+
|
14
|
+
# gem install amatch
|
15
|
+
|
16
|
+
Author
|
17
|
+
======
|
18
|
+
|
19
|
+
Florian Frank <flori@ping.de>
|
20
|
+
|
21
|
+
License
|
22
|
+
=======
|
23
|
+
|
24
|
+
GNU General Public License, Version 2 (GPLv2)
|
25
|
+
|
data/Rakefile
CHANGED
@@ -1,32 +1,33 @@
|
|
1
|
-
# vim: set et sw=2 ts=2:
|
2
|
-
require 'rake/clean'
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'rake/gempackagetask'
|
5
|
-
require 'rake/rdoctask'
|
6
|
-
require 'rbconfig'
|
1
|
+
# vim: set filetype=ruby et sw=2 ts=2:
|
7
2
|
|
3
|
+
begin
|
4
|
+
require 'rake/gempackagetask'
|
5
|
+
rescue LoadError
|
6
|
+
end
|
7
|
+
require 'rbconfig'
|
8
8
|
include Config
|
9
|
+
require 'rake/clean'
|
10
|
+
CLEAN.include 'coverage', 'doc'
|
11
|
+
require 'rake/testtask'
|
9
12
|
|
13
|
+
MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
|
14
|
+
PKG_NAME = 'amatch'
|
10
15
|
PKG_VERSION = File.read('VERSION').chomp
|
11
|
-
PKG_FILES = FileList[
|
12
|
-
|
13
|
-
PKG_FILES.exclude(/^pkg/)
|
14
|
-
PKG_FILES.exclude(/^doc/)
|
16
|
+
PKG_FILES = FileList["**/*"].exclude(/^(pkg|coverage|doc)/)
|
17
|
+
PKG_DOC_FILES = [ "ext/amatch.c" ].concat(Dir['lib/**/*.rb']) << 'doc-main.txt'
|
15
18
|
|
16
19
|
task :default => :test
|
17
20
|
|
18
21
|
desc "Run unit tests"
|
19
|
-
task :test => :
|
20
|
-
|
21
|
-
ruby %{-I../ext runner.rb}
|
22
|
-
end
|
22
|
+
task :test => :compile_ext do
|
23
|
+
sh %{testrb -Iext:lib tests/test_*.rb}
|
23
24
|
end
|
24
25
|
|
25
26
|
desc "Compiling library"
|
26
|
-
task :
|
27
|
+
task :compile_ext do
|
27
28
|
cd 'ext' do
|
28
29
|
ruby %{extconf.rb}
|
29
|
-
sh
|
30
|
+
sh MAKE
|
30
31
|
end
|
31
32
|
end
|
32
33
|
|
@@ -40,72 +41,73 @@ end
|
|
40
41
|
|
41
42
|
desc "Removing generated files"
|
42
43
|
task :clean do
|
43
|
-
|
44
|
-
cd 'ext' do
|
44
|
+
cd 'ext' do
|
45
45
|
ruby 'extconf.rb'
|
46
|
-
sh "
|
46
|
+
sh "#{MAKE} distclean" if File.exist?('Makefile')
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
rd.rdoc_dir = 'doc'
|
50
|
+
desc "Build the documentation"
|
51
|
+
task :doc do
|
52
|
+
sh "rdoc -m doc-main.txt -t '#{PKG_NAME} - Approximate Matching' #{PKG_DOC_FILES * ' '}"
|
54
53
|
end
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
s.description = <<EOF
|
55
|
+
if defined? Gem
|
56
|
+
spec = Gem::Specification.new do |s|
|
57
|
+
s.name = 'amatch'
|
58
|
+
s.version = PKG_VERSION
|
59
|
+
s.summary = "Approximate String Matching library"
|
60
|
+
s.description = <<EOF
|
63
61
|
Amatch is a library for approximate string matching and searching in strings.
|
64
62
|
Several algorithms can be used to do this, and it's also possible to compute a
|
65
63
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
66
64
|
EOF
|
67
65
|
|
68
|
-
|
66
|
+
s.files = PKG_FILES
|
69
67
|
|
70
|
-
|
71
|
-
#s.requirements << ""
|
68
|
+
s.extensions << "ext/extconf.rb"
|
72
69
|
|
73
|
-
|
70
|
+
s.require_path = 'ext'
|
74
71
|
|
75
|
-
|
72
|
+
s.bindir = "bin"
|
73
|
+
s.executables = ["agrep.rb"]
|
74
|
+
s.default_executable = "agrep.rb"
|
76
75
|
|
77
|
-
|
76
|
+
s.has_rdoc = true
|
77
|
+
s.extra_rdoc_files.concat PKG_DOC_FILES
|
78
|
+
s.rdoc_options << '--main' << 'doc-main.txt' <<
|
79
|
+
'--title' << "#{PKG_NAME} - Approximate Matching"
|
80
|
+
s.test_files.concat Dir['tests/test_*.rb']
|
78
81
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
s.bindir = "bin" # Use these for applications.
|
85
|
-
s.executables = ["agrep.rb"]
|
86
|
-
s.default_executable = "agrep.rb"
|
87
|
-
|
88
|
-
#### Documentation and testing.
|
89
|
-
|
90
|
-
s.has_rdoc = true
|
91
|
-
#s.extra_rdoc_files = FileList['ext/amatch.c']
|
92
|
-
s.rdoc_options <<
|
93
|
-
'--title' << 'Amatch -- Approximate Matching' <<
|
94
|
-
'--main' << 'Amatch' <<
|
95
|
-
'--line-numbers'
|
96
|
-
s.test_files << 'tests/runner.rb'
|
97
|
-
|
98
|
-
#### Author and project details.
|
82
|
+
s.author = "Florian Frank"
|
83
|
+
s.email = "flori@ping.de"
|
84
|
+
s.homepage = "http://amatch.rubyforge.org"
|
85
|
+
s.rubyforge_project = "amatch"
|
86
|
+
end
|
99
87
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
88
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
89
|
+
pkg.need_tar = true
|
90
|
+
pkg.package_files += PKG_FILES
|
91
|
+
end
|
104
92
|
end
|
105
93
|
|
106
|
-
|
107
|
-
|
108
|
-
|
94
|
+
desc m = "Writing version information for #{PKG_VERSION}"
|
95
|
+
task :version do
|
96
|
+
puts m
|
97
|
+
File.open(File.join('lib', 'amatch', 'version.rb'), 'w') do |v|
|
98
|
+
v.puts <<EOT
|
99
|
+
module Amatch
|
100
|
+
# Amatch version
|
101
|
+
VERSION = '#{PKG_VERSION}'
|
102
|
+
VERSION_ARRAY = VERSION.split(/\\./).map { |x| x.to_i } # :nodoc:
|
103
|
+
VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
|
104
|
+
VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
|
105
|
+
VERSION_BUILD = VERSION_ARRAY[2] # :nodoc:
|
109
106
|
end
|
107
|
+
EOT
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
110
111
|
|
111
|
-
|
112
|
+
desc "Prepare a new release"
|
113
|
+
task :release => [ :clean, :version, :package ]
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.2.4
|
data/bin/agrep.rb
CHANGED
data/doc-main.txt
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
== amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
=== Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for Approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, the Jaro metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
=== Author
|
12
|
+
|
13
|
+
Florian Frank mailto:flori@ping.de
|
14
|
+
|
15
|
+
=== License
|
16
|
+
|
17
|
+
This is free software; you can redistribute it and/or modify it under
|
18
|
+
the terms of the GNU General Public License Version 2 as published by
|
19
|
+
the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
20
|
+
|
21
|
+
=== Download
|
22
|
+
|
23
|
+
The latest version of <b>amatch</b> can be found at
|
24
|
+
|
25
|
+
* http://rubyforge.org/frs/?group_id=390
|
26
|
+
|
27
|
+
Online Documentation should be located at
|
28
|
+
|
29
|
+
* http://amatch.rubyforge.org
|
30
|
+
|
31
|
+
=== Examples
|
32
|
+
require 'amatch'
|
33
|
+
# => true
|
34
|
+
include Amatch
|
35
|
+
# => Object
|
36
|
+
|
37
|
+
m = Sellers.new("pattern")
|
38
|
+
# => #<Amatch::Sellers:0x40366324>
|
39
|
+
m.match("pattren")
|
40
|
+
# => 2.0
|
41
|
+
m.substitution = m.insertion = 3
|
42
|
+
# => 3
|
43
|
+
m.match("pattren")
|
44
|
+
# => 4.0
|
45
|
+
m.reset_weights
|
46
|
+
# => #<Amatch::Sellers:0x40366324>
|
47
|
+
m.match(["pattren","parent"])
|
48
|
+
# => [2.0, 4.0]
|
49
|
+
m.search("abcpattrendef")
|
50
|
+
# => 2.0
|
51
|
+
|
52
|
+
m = Levenshtein.new("pattern")
|
53
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
54
|
+
m.match("pattren")
|
55
|
+
# => 2
|
56
|
+
m.search("abcpattrendef")
|
57
|
+
# => 2
|
58
|
+
"pattern language".levenshtein_similar("language of patterns")
|
59
|
+
# => 0.2
|
60
|
+
|
61
|
+
m = Hamming.new("pattern")
|
62
|
+
# => #<Amatch::Hamming:0x40350858>
|
63
|
+
m.match("pattren")
|
64
|
+
# => 2
|
65
|
+
"pattern language".hamming_similar("language of patterns")
|
66
|
+
# => 0.1
|
67
|
+
|
68
|
+
m = PairDistance.new("pattern")
|
69
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
70
|
+
m.match("pattr en")
|
71
|
+
# => 0.545454545454545
|
72
|
+
m.match("pattr en", nil)
|
73
|
+
# => 0.461538461538462
|
74
|
+
m.match("pattr en", /t+/)
|
75
|
+
# => 0.285714285714286
|
76
|
+
"pattern language".pair_distance_similar("language of patterns")
|
77
|
+
# => 0.928571428571429
|
78
|
+
|
79
|
+
m = LongestSubsequence.new("pattern")
|
80
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
81
|
+
m.match("pattren")
|
82
|
+
# => 6
|
83
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
84
|
+
# => 0.4
|
85
|
+
|
86
|
+
m = LongestSubstring.new("pattern")
|
87
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
88
|
+
m.match("pattren")
|
89
|
+
# => 4
|
90
|
+
"pattern language".longest_substring_similar("language of patterns")
|
91
|
+
# => 0.4
|
92
|
+
|
93
|
+
m = Jaro.new("pattern")
|
94
|
+
# => #<Amatch::Jaro:0x363b70>
|
95
|
+
m.match("paTTren")
|
96
|
+
# => 0.952380952380952
|
97
|
+
m.ignore_case = false
|
98
|
+
m.match("paTTren")
|
99
|
+
# => 0.742857142857143
|
100
|
+
"pattern language".jaro_similar("language of patterns")
|
101
|
+
# => 0.672222222222222
|
102
|
+
|
103
|
+
m = JaroWinkler.new("pattern")
|
104
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
105
|
+
m.match("paTTren")
|
106
|
+
# => 0.971428571712403
|
107
|
+
m.ignore_case = false
|
108
|
+
m.match("paTTren")
|
109
|
+
# => 0.79428571505206
|
110
|
+
m.scaling_factor = 0.05
|
111
|
+
m.match("pattren")
|
112
|
+
# => 0.961904762046678
|
113
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
114
|
+
# => 0.672222222222222
|
115
|
+
|
data/ext/amatch.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
#include "pair.h"
|
3
|
+
#include <ctype.h>
|
4
|
+
#include "common.h"
|
3
5
|
|
4
6
|
/*
|
5
7
|
* Document-method: pattern
|
@@ -19,7 +21,8 @@
|
|
19
21
|
|
20
22
|
|
21
23
|
static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
22
|
-
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring
|
24
|
+
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
|
+
rb_cJaro, rb_cJaroWinkler;
|
23
26
|
|
24
27
|
static ID id_split, id_to_f;
|
25
28
|
|
@@ -62,10 +65,10 @@ static void type##_pattern_set(type *amatch, VALUE pattern) \
|
|
62
65
|
{ \
|
63
66
|
Check_Type(pattern, T_STRING); \
|
64
67
|
free(amatch->pattern); \
|
65
|
-
amatch->pattern_len =
|
68
|
+
amatch->pattern_len = RSTRING_LEN(pattern); \
|
66
69
|
amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
|
67
|
-
MEMCPY(amatch->pattern,
|
68
|
-
|
70
|
+
MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
|
71
|
+
RSTRING_LEN(pattern)); \
|
69
72
|
} \
|
70
73
|
static VALUE rb_##type##_pattern(VALUE self) \
|
71
74
|
{ \
|
@@ -80,16 +83,16 @@ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
|
|
80
83
|
}
|
81
84
|
|
82
85
|
#define DEF_ITERATE_STRINGS(type) \
|
83
|
-
static VALUE type##_iterate_strings(type *amatch, VALUE strings,
|
84
|
-
VALUE (*match_function) (type *amatch, VALUE strings))
|
86
|
+
static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
|
87
|
+
VALUE (*match_function) (type *amatch, VALUE strings)) \
|
85
88
|
{ \
|
86
89
|
if (TYPE(strings) == T_STRING) { \
|
87
90
|
return match_function(amatch, strings); \
|
88
91
|
} else { \
|
89
92
|
Check_Type(strings, T_ARRAY); \
|
90
93
|
int i; \
|
91
|
-
VALUE result = rb_ary_new2(
|
92
|
-
for (i = 0; i <
|
94
|
+
VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
|
95
|
+
for (i = 0; i < RARRAY_LEN(strings); i++) { \
|
93
96
|
VALUE string = rb_ary_entry(strings, i); \
|
94
97
|
if (TYPE(string) != T_STRING) { \
|
95
98
|
rb_raise(rb_eTypeError, \
|
@@ -130,17 +133,25 @@ VALUE function(VALUE self, VALUE value) \
|
|
130
133
|
obj = rb_funcall(obj, id_to_f, 0, 0); \
|
131
134
|
else \
|
132
135
|
Check_Type(obj, T_FLOAT)
|
133
|
-
#define FLOAT2C(obj)
|
136
|
+
#define FLOAT2C(obj) (RFLOAT_VALUE(obj))
|
137
|
+
|
138
|
+
#define CAST2BOOL(obj) \
|
139
|
+
if (obj == Qfalse || obj == Qnil) \
|
140
|
+
obj = Qfalse; \
|
141
|
+
else \
|
142
|
+
obj = Qtrue;
|
143
|
+
#define BOOL2C(obj) (obj == Qtrue)
|
144
|
+
#define C2BOOL(obj) (obj ? Qtrue : Qfalse)
|
134
145
|
|
135
146
|
#define OPTIMIZE_TIME \
|
136
|
-
if (amatch->pattern_len <
|
147
|
+
if (amatch->pattern_len < RSTRING_LEN(string)) { \
|
137
148
|
a_ptr = amatch->pattern; \
|
138
149
|
a_len = amatch->pattern_len; \
|
139
|
-
b_ptr =
|
140
|
-
b_len =
|
150
|
+
b_ptr = RSTRING_PTR(string); \
|
151
|
+
b_len = RSTRING_LEN(string); \
|
141
152
|
} else { \
|
142
|
-
a_ptr =
|
143
|
-
a_len =
|
153
|
+
a_ptr = RSTRING_PTR(string); \
|
154
|
+
a_len = RSTRING_LEN(string); \
|
144
155
|
b_ptr = amatch->pattern; \
|
145
156
|
b_len = amatch->pattern_len; \
|
146
157
|
}
|
@@ -148,8 +159,8 @@ VALUE function(VALUE self, VALUE value) \
|
|
148
159
|
#define DONT_OPTIMIZE \
|
149
160
|
a_ptr = amatch->pattern; \
|
150
161
|
a_len = amatch->pattern_len; \
|
151
|
-
b_ptr =
|
152
|
-
b_len =
|
162
|
+
b_ptr = RSTRING_PTR(string); \
|
163
|
+
b_len = RSTRING_LEN(string); \
|
153
164
|
|
154
165
|
/*
|
155
166
|
* C structures of the Amatch classes
|
@@ -192,6 +203,27 @@ typedef struct PairDistanceStruct {
|
|
192
203
|
DEF_ALLOCATOR(PairDistance)
|
193
204
|
DEF_PATTERN_ACCESSOR(PairDistance)
|
194
205
|
|
206
|
+
typedef struct JaroStruct {
|
207
|
+
char *pattern;
|
208
|
+
int pattern_len;
|
209
|
+
int ignore_case;
|
210
|
+
} Jaro;
|
211
|
+
|
212
|
+
DEF_ALLOCATOR(Jaro)
|
213
|
+
DEF_PATTERN_ACCESSOR(Jaro)
|
214
|
+
DEF_ITERATE_STRINGS(Jaro)
|
215
|
+
|
216
|
+
typedef struct JaroWinklerStruct {
|
217
|
+
char *pattern;
|
218
|
+
int pattern_len;
|
219
|
+
int ignore_case;
|
220
|
+
float scaling_factor;
|
221
|
+
} JaroWinkler;
|
222
|
+
|
223
|
+
DEF_ALLOCATOR(JaroWinkler)
|
224
|
+
DEF_PATTERN_ACCESSOR(JaroWinkler)
|
225
|
+
DEF_ITERATE_STRINGS(JaroWinkler)
|
226
|
+
|
195
227
|
/*
|
196
228
|
* Levenshtein edit distances are computed here:
|
197
229
|
*/
|
@@ -616,6 +648,123 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
616
648
|
return rb_float_new(((double) result) / b_len);
|
617
649
|
}
|
618
650
|
|
651
|
+
/*
|
652
|
+
* Jaro computation
|
653
|
+
*/
|
654
|
+
|
655
|
+
#define COMPUTE_JARO \
|
656
|
+
l[0] = ALLOC_N(int, a_len); \
|
657
|
+
MEMZERO(l[0], int, a_len); \
|
658
|
+
l[1] = ALLOC_N(int, b_len); \
|
659
|
+
MEMZERO(l[1], int, b_len); \
|
660
|
+
max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1; \
|
661
|
+
m = 0; \
|
662
|
+
for (i = 0; i < a_len; i++) { \
|
663
|
+
low = (i > max_dist ? i - max_dist : 0); \
|
664
|
+
high = (i + max_dist < b_len ? i + max_dist : b_len); \
|
665
|
+
for (j = low; j <= high; j++) { \
|
666
|
+
if (!l[1][j] && a_ptr[i] == b_ptr[j]) { \
|
667
|
+
l[0][i] = 1; \
|
668
|
+
l[1][j] = 1; \
|
669
|
+
m++; \
|
670
|
+
break; \
|
671
|
+
} \
|
672
|
+
} \
|
673
|
+
} \
|
674
|
+
if (m == 0) { \
|
675
|
+
result = 0.0; \
|
676
|
+
} else { \
|
677
|
+
k = t = 0; \
|
678
|
+
for (i = 0; i < a_len; i++) { \
|
679
|
+
if (l[0][i]) { \
|
680
|
+
for (j = k; j < b_len; j++) { \
|
681
|
+
if (l[1][j]) { \
|
682
|
+
k = j + 1; \
|
683
|
+
break; \
|
684
|
+
} \
|
685
|
+
} \
|
686
|
+
if (a_ptr[i] != b_ptr[j]) { \
|
687
|
+
t++; \
|
688
|
+
} \
|
689
|
+
} \
|
690
|
+
} \
|
691
|
+
t = t / 2; \
|
692
|
+
result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
|
693
|
+
}
|
694
|
+
|
695
|
+
#define LOWERCASE_STRINGS \
|
696
|
+
char *ying = ALLOC_N(char, a_len); \
|
697
|
+
MEMCPY(ying, a_ptr, char, a_len); \
|
698
|
+
a_ptr = ying; \
|
699
|
+
char *yang = ALLOC_N(char, b_len); \
|
700
|
+
MEMCPY(yang, b_ptr, char, b_len); \
|
701
|
+
b_ptr = yang; \
|
702
|
+
for (i = 0; i < a_len; i++) { \
|
703
|
+
if (islower(a_ptr[i])) a_ptr[i] = toupper(a_ptr[i]); \
|
704
|
+
} \
|
705
|
+
for (i = 0; i < b_len; i++) { \
|
706
|
+
if (islower(b_ptr[i])) b_ptr[i] = toupper(b_ptr[i]); \
|
707
|
+
}
|
708
|
+
|
709
|
+
#define FREE_STRINGS \
|
710
|
+
xfree(a_ptr); \
|
711
|
+
xfree(b_ptr);
|
712
|
+
|
713
|
+
static VALUE Jaro_match(Jaro *amatch, VALUE string)
|
714
|
+
{
|
715
|
+
char *a_ptr, *b_ptr;
|
716
|
+
int a_len, b_len, max_dist, m, t, i, j, k, low, high;
|
717
|
+
int *l[2];
|
718
|
+
double result;
|
719
|
+
|
720
|
+
Check_Type(string, T_STRING);
|
721
|
+
OPTIMIZE_TIME
|
722
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
723
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
724
|
+
if (amatch->ignore_case) {
|
725
|
+
LOWERCASE_STRINGS
|
726
|
+
}
|
727
|
+
COMPUTE_JARO
|
728
|
+
if (amatch->ignore_case) {
|
729
|
+
FREE_STRINGS
|
730
|
+
}
|
731
|
+
return rb_float_new(result);
|
732
|
+
}
|
733
|
+
|
734
|
+
/*
|
735
|
+
* Jaro-Winkler computation
|
736
|
+
*/
|
737
|
+
|
738
|
+
static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
739
|
+
{
|
740
|
+
char *a_ptr, *b_ptr;
|
741
|
+
int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
|
742
|
+
int *l[2];
|
743
|
+
double result;
|
744
|
+
|
745
|
+
Check_Type(string, T_STRING);
|
746
|
+
OPTIMIZE_TIME
|
747
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
748
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
749
|
+
if (amatch->ignore_case) {
|
750
|
+
LOWERCASE_STRINGS
|
751
|
+
}
|
752
|
+
COMPUTE_JARO
|
753
|
+
n = 0;
|
754
|
+
for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
|
755
|
+
if (a_ptr[i] == b_ptr[i]) {
|
756
|
+
n++;
|
757
|
+
} else {
|
758
|
+
break;
|
759
|
+
}
|
760
|
+
}
|
761
|
+
result = result + n*amatch->scaling_factor*(1-result);
|
762
|
+
if (amatch->ignore_case) {
|
763
|
+
FREE_STRINGS
|
764
|
+
}
|
765
|
+
return rb_float_new(result);
|
766
|
+
}
|
767
|
+
|
619
768
|
/*
|
620
769
|
* Ruby API
|
621
770
|
*/
|
@@ -951,8 +1100,8 @@ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
|
951
1100
|
} else {
|
952
1101
|
Check_Type(strings, T_ARRAY);
|
953
1102
|
int i;
|
954
|
-
result = rb_ary_new2(
|
955
|
-
for (i = 0; i <
|
1103
|
+
result = rb_ary_new2(RARRAY_LEN(strings));
|
1104
|
+
for (i = 0; i < RARRAY_LEN(strings); i++) {
|
956
1105
|
VALUE string = rb_ary_entry(strings, i);
|
957
1106
|
if (TYPE(string) != T_STRING) {
|
958
1107
|
rb_raise(rb_eTypeError,
|
@@ -1214,104 +1363,183 @@ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
|
1214
1363
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1215
1364
|
return rb_LongestSubstring_similar(amatch, strings);
|
1216
1365
|
}
|
1366
|
+
|
1367
|
+
/*
|
1368
|
+
* Document-class: Amatch::Jaro
|
1369
|
+
*
|
1370
|
+
* This class computes the Jaro metric for two strings.
|
1371
|
+
* The Jaro metric computes the similarity between 0 (no match)
|
1372
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1373
|
+
*/
|
1374
|
+
DEF_RB_FREE(Jaro, Jaro)
|
1375
|
+
|
1376
|
+
/*
|
1377
|
+
* Document-method: ignore_case
|
1378
|
+
*
|
1379
|
+
* call-seq: ignore_case -> true/false
|
1380
|
+
*
|
1381
|
+
* Returns whether case is ignored when computing matching characters.
|
1382
|
+
*/
|
1383
|
+
DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
|
1384
|
+
|
1385
|
+
/*
|
1386
|
+
* Document-method: ignore_case=
|
1387
|
+
*
|
1388
|
+
* call-seq: ignore_case=(true/false)
|
1389
|
+
*
|
1390
|
+
* Sets whether case is ignored when computing matching characters.
|
1391
|
+
*/
|
1392
|
+
DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
|
1393
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1394
|
+
|
1395
|
+
/*
|
1396
|
+
* call-seq: new(pattern)
|
1397
|
+
*
|
1398
|
+
* Creates a new Amatch::Jaro instance from <code>pattern</code>.
|
1399
|
+
*/
|
1400
|
+
static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
|
1401
|
+
{
|
1402
|
+
GET_STRUCT(Jaro)
|
1403
|
+
Jaro_pattern_set(amatch, pattern);
|
1404
|
+
amatch->ignore_case = 1;
|
1405
|
+
return self;
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
DEF_CONSTRUCTOR(Jaro, Jaro)
|
1217
1409
|
|
1218
1410
|
/*
|
1219
|
-
*
|
1411
|
+
* call-seq: match(strings) -> results
|
1220
1412
|
*
|
1221
|
-
*
|
1413
|
+
* Uses this Amatch::Jaro instance to match
|
1414
|
+
* Jaro#pattern against <code>strings</code>, that is compute the
|
1415
|
+
* jaro metric with the strings. <code>strings</code> has to be
|
1416
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1417
|
+
* are either a Float or an Array of Floats respectively.
|
1418
|
+
*/
|
1419
|
+
static VALUE rb_Jaro_match(VALUE self, VALUE strings)
|
1420
|
+
{
|
1421
|
+
GET_STRUCT(Jaro)
|
1422
|
+
return Jaro_iterate_strings(amatch, strings, Jaro_match);
|
1423
|
+
}
|
1424
|
+
|
1425
|
+
/*
|
1426
|
+
* call-seq: jaro_similar(strings) -> results
|
1222
1427
|
*
|
1223
|
-
*
|
1224
|
-
*
|
1225
|
-
*
|
1226
|
-
*
|
1227
|
-
*
|
1428
|
+
* If called on a String, this string is used as a
|
1429
|
+
* Amatch::Jaro#pattern to match against <code>strings</code>. It
|
1430
|
+
* returns a Jaro metric number between 0.0 for very
|
1431
|
+
* dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1432
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1433
|
+
* are either a Float or an Array of Floats respectively.
|
1434
|
+
*/
|
1435
|
+
static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
|
1436
|
+
{
|
1437
|
+
VALUE amatch = rb_Jaro_new(rb_cJaro, self);
|
1438
|
+
return rb_Jaro_match(amatch, strings);
|
1439
|
+
}
|
1440
|
+
|
1441
|
+
/*
|
1442
|
+
* Document-class: Amatch::JaroWinkler
|
1228
1443
|
*
|
1229
|
-
*
|
1444
|
+
* This class computes the Jaro-Winkler metric for two strings.
|
1445
|
+
* The Jaro-Winkler metric computes the similarity between 0 (no match)
|
1446
|
+
* and 1 (exact match) by looking for matching and transposed characters.
|
1230
1447
|
*
|
1231
|
-
*
|
1448
|
+
* It is a variant of the Jaro metric, with additional weighting towards
|
1449
|
+
* common prefixes.
|
1450
|
+
*/
|
1451
|
+
DEF_RB_FREE(JaroWinkler, JaroWinkler)
|
1452
|
+
|
1453
|
+
/*
|
1454
|
+
* Document-method: ignore_case
|
1232
1455
|
*
|
1233
|
-
*
|
1456
|
+
* call-seq: ignore_case -> true/false
|
1234
1457
|
*
|
1235
|
-
*
|
1236
|
-
*
|
1237
|
-
|
1458
|
+
* Returns whether case is ignored when computing matching characters.
|
1459
|
+
* Default is true.
|
1460
|
+
*/
|
1461
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
|
1462
|
+
|
1463
|
+
/*
|
1464
|
+
* Document-method: scaling_factor
|
1238
1465
|
*
|
1239
|
-
*
|
1466
|
+
* call-seq: scaling_factor -> weight
|
1240
1467
|
*
|
1241
|
-
* The
|
1468
|
+
* The scaling factor is how much weight to give common prefixes.
|
1469
|
+
* Default is 0.1.
|
1470
|
+
*/
|
1471
|
+
DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
|
1472
|
+
|
1473
|
+
/*
|
1474
|
+
* Document-method: ignore_case=
|
1242
1475
|
*
|
1243
|
-
*
|
1476
|
+
* call-seq: ignore_case=(true/false)
|
1244
1477
|
*
|
1245
|
-
*
|
1478
|
+
* Sets whether case is ignored when computing matching characters.
|
1479
|
+
*/
|
1480
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
|
1481
|
+
int, CAST2BOOL, BOOL2C, != Qundef)
|
1482
|
+
|
1483
|
+
/*
|
1484
|
+
* Document-method: scaling_factor=
|
1485
|
+
*
|
1486
|
+
* call-seq: scaling_factor=(weight)
|
1487
|
+
*
|
1488
|
+
* Sets the weight to give common prefixes.
|
1489
|
+
*/
|
1490
|
+
DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
|
1491
|
+
double, CAST2FLOAT, FLOAT2C, >= 0)
|
1492
|
+
|
1493
|
+
/*
|
1494
|
+
* call-seq: new(pattern)
|
1246
1495
|
*
|
1247
|
-
*
|
1496
|
+
* Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
|
1497
|
+
*/
|
1498
|
+
static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
|
1499
|
+
{
|
1500
|
+
GET_STRUCT(JaroWinkler)
|
1501
|
+
JaroWinkler_pattern_set(amatch, pattern);
|
1502
|
+
amatch->ignore_case = 1;
|
1503
|
+
amatch->scaling_factor = 0.1;
|
1504
|
+
return self;
|
1505
|
+
}
|
1506
|
+
|
1507
|
+
DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
|
1508
|
+
|
1509
|
+
/*
|
1510
|
+
* call-seq: match(strings) -> results
|
1248
1511
|
*
|
1249
|
-
*
|
1250
|
-
*
|
1251
|
-
*
|
1252
|
-
*
|
1253
|
-
*
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
*
|
1263
|
-
* m.reset_weights
|
1264
|
-
* # => #<Amatch::Sellers:0x40366324>
|
1265
|
-
* m.match(["pattren","parent"])
|
1266
|
-
* # => [2.0, 4.0]
|
1267
|
-
* m.search("abcpattrendef")
|
1268
|
-
* # => 2.0
|
1269
|
-
*
|
1270
|
-
* m = Levenshtein.new("pattern")
|
1271
|
-
* # => #<Amatch::Levenshtein:0x4035919c>
|
1272
|
-
* m.match("pattren")
|
1273
|
-
* # => 2
|
1274
|
-
* m.search("abcpattrendef")
|
1275
|
-
* # => 2
|
1276
|
-
* "pattern language".levenshtein_similar("language of patterns")
|
1277
|
-
* # => 0.2
|
1278
|
-
*
|
1279
|
-
* m = Hamming.new("pattern")
|
1280
|
-
* # => #<Amatch::Hamming:0x40350858>
|
1281
|
-
* m.match("pattren")
|
1282
|
-
* # => 2
|
1283
|
-
* "pattern language".hamming_similar("language of patterns")
|
1284
|
-
* # => 0.1
|
1285
|
-
*
|
1286
|
-
* m = PairDistance.new("pattern")
|
1287
|
-
* # => #<Amatch::PairDistance:0x40349be8>
|
1288
|
-
* m.match("pattr en")
|
1289
|
-
* # => 0.545454545454545
|
1290
|
-
* m.match("pattr en", nil)
|
1291
|
-
* # => 0.461538461538462
|
1292
|
-
* m.match("pattr en", /t+/)
|
1293
|
-
* # => 0.285714285714286
|
1294
|
-
* "pattern language".pair_distance_similar("language of patterns")
|
1295
|
-
* # => 0.928571428571429
|
1296
|
-
*
|
1297
|
-
* m = LongestSubsequence.new("pattern")
|
1298
|
-
* # => #<Amatch::LongestSubsequence:0x4033e900>
|
1299
|
-
* m.match("pattren")
|
1300
|
-
* # => 6
|
1301
|
-
* "pattern language".longest_subsequence_similar("language of patterns")
|
1302
|
-
* # => 0.4
|
1303
|
-
*
|
1304
|
-
* m = LongestSubstring.new("pattern")
|
1305
|
-
* # => #<Amatch::LongestSubstring:0x403378d0>
|
1306
|
-
* m.match("pattren")
|
1307
|
-
* # => 4
|
1308
|
-
* "pattern language".longest_substring_similar("language of patterns")
|
1309
|
-
* # => 0.4
|
1512
|
+
* Uses this Amatch::JaroWinkler instance to match
|
1513
|
+
* JaroWinkler#pattern against <code>strings</code>, that is compute the
|
1514
|
+
* Jaro-Winkler metric with the strings. <code>strings</code> has to be
|
1515
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1516
|
+
* are either a Float or an Array of Floats respectively.
|
1517
|
+
*/
|
1518
|
+
static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
|
1519
|
+
{
|
1520
|
+
GET_STRUCT(JaroWinkler)
|
1521
|
+
return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
|
1522
|
+
}
|
1523
|
+
|
1524
|
+
/*
|
1525
|
+
* call-seq: jarowinkler_similar(strings) -> results
|
1310
1526
|
*
|
1527
|
+
* If called on a String, this string is used as a
|
1528
|
+
* Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
|
1529
|
+
* returns a Jaro-Winkler metric number between 0.0 for very
|
1530
|
+
* dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1531
|
+
* either a String or an Array of Strings. The returned <code>results</code>
|
1532
|
+
* are either a Float or an Array of Floats respectively.
|
1311
1533
|
*/
|
1534
|
+
static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
|
1535
|
+
{
|
1536
|
+
VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
|
1537
|
+
return rb_JaroWinkler_match(amatch, strings);
|
1538
|
+
}
|
1312
1539
|
|
1313
1540
|
void Init_amatch()
|
1314
1541
|
{
|
1542
|
+
rb_require("amatch/version");
|
1315
1543
|
rb_mAmatch = rb_define_module("Amatch");
|
1316
1544
|
|
1317
1545
|
/* Levenshtein */
|
@@ -1382,7 +1610,32 @@ void Init_amatch()
|
|
1382
1610
|
rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
|
1383
1611
|
rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
|
1384
1612
|
|
1613
|
+
/* Jaro */
|
1614
|
+
rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
|
1615
|
+
rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
|
1616
|
+
rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
|
1617
|
+
rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
|
1618
|
+
rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
|
1619
|
+
rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
|
1620
|
+
rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
|
1621
|
+
rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
|
1622
|
+
rb_define_alias(rb_cJaro, "similar", "match");
|
1623
|
+
rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
|
1624
|
+
|
1625
|
+
/* Jaro-Winkler */
|
1626
|
+
rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
|
1627
|
+
rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
|
1628
|
+
rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
|
1629
|
+
rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
|
1630
|
+
rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
|
1631
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
|
1632
|
+
rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
|
1633
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
|
1634
|
+
rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
|
1635
|
+
rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
|
1636
|
+
rb_define_alias(rb_cJaroWinkler, "similar", "match");
|
1637
|
+
rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
|
1638
|
+
|
1385
1639
|
id_split = rb_intern("split");
|
1386
1640
|
id_to_f = rb_intern("to_f");
|
1387
1641
|
}
|
1388
|
-
/* vim: set et cin sw=4 ts=4: */
|