amatch 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +5 -2
- data/InstalledFiles +5 -0
- data/README.en +1 -1
- data/Rakefile +67 -58
- data/VERSION +1 -1
- data/bin/agrep.rb +65 -52
- data/config.save +12 -0
- data/ext/amatch.bundle +0 -0
- data/ext/amatch.c +1301 -225
- data/ext/extconf.rb +6 -1
- data/ext/pair.c +78 -0
- data/ext/pair.h +29 -0
- data/ext/tags +24 -0
- data/tests/runner.rb +26 -0
- data/tests/test_hamming.rb +54 -0
- data/tests/test_levenshtein.rb +74 -0
- data/tests/test_longest_subsequence.rb +57 -0
- data/tests/test_longest_substring.rb +57 -0
- data/tests/test_pair_distance.rb +81 -0
- data/tests/test_sellers.rb +94 -0
- metadata +26 -8
- data/amatch.txt.en +0 -117
- data/tests/test.rb +0 -94
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
require 'test_levenshtein'
|
4
|
+
|
5
|
+
class TC_Sellers < TC_Levenshtein
|
6
|
+
include Amatch
|
7
|
+
|
8
|
+
D = 0.000001
|
9
|
+
|
10
|
+
def setup
|
11
|
+
@empty = Sellers.new('')
|
12
|
+
@simple = Sellers.new('test')
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_weights
|
16
|
+
assert_in_delta 1, @simple.substitution, D
|
17
|
+
assert_in_delta 1, @simple.insertion, D
|
18
|
+
assert_in_delta 1, @simple.deletion, D
|
19
|
+
@simple.insertion = 1
|
20
|
+
@simple.substitution = @simple.deletion = 1000
|
21
|
+
assert_in_delta 1, @simple.match('tst'), D
|
22
|
+
assert_in_delta 1, @simple.search('bbbtstccc'), D
|
23
|
+
@simple.deletion = 1
|
24
|
+
@simple.substitution = @simple.insertion = 1000
|
25
|
+
assert_in_delta 1, @simple.match('tedst'), D
|
26
|
+
assert_in_delta 1, @simple.search('bbbtedstccc'), D
|
27
|
+
@simple.substitution = 1
|
28
|
+
@simple.deletion = @simple.insertion = 1000
|
29
|
+
assert_in_delta 1, @simple.match('tast'), D
|
30
|
+
assert_in_delta 1, @simple.search('bbbtastccc'), D
|
31
|
+
@simple.insertion = 0.5
|
32
|
+
@simple.substitution = @simple.deletion = 1000
|
33
|
+
assert_in_delta 0.5, @simple.match('tst'), D
|
34
|
+
assert_in_delta 0.5, @simple.search('bbbtstccc'), D
|
35
|
+
@simple.deletion = 0.5
|
36
|
+
@simple.substitution = @simple.insertion = 1000
|
37
|
+
assert_in_delta 0.5, @simple.match('tedst'), D
|
38
|
+
assert_in_delta 0.5, @simple.search('bbbtedstccc'), D
|
39
|
+
@simple.substitution = 0.5
|
40
|
+
@simple.deletion = @simple.insertion = 1000
|
41
|
+
assert_in_delta 0.5, @simple.match('tast'), D
|
42
|
+
assert_in_delta 0.5, @simple.search('bbbtastccc'), D
|
43
|
+
@simple.reset_weights
|
44
|
+
assert_in_delta 1, @simple.substitution, D
|
45
|
+
assert_in_delta 1, @simple.insertion, D
|
46
|
+
assert_in_delta 1, @simple.deletion, D
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_weight_exceptions
|
50
|
+
assert_raises(TypeError) { @simple.substitution = :something }
|
51
|
+
assert_raises(TypeError) { @simple.insertion = :something }
|
52
|
+
assert_raises(TypeError) { @simple.deletion = :something }
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_similar
|
56
|
+
assert_in_delta 0.0, @simple.similar(''), D
|
57
|
+
assert_in_delta 1.0, @simple.similar('test'), D
|
58
|
+
assert_in_delta 0.8, @simple.similar('testa'), D
|
59
|
+
assert_in_delta 0.8, @simple.similar('atest'), D
|
60
|
+
assert_in_delta 0.8, @simple.similar('teast'), D
|
61
|
+
assert_in_delta 0.75, @simple.similar('est'), D
|
62
|
+
assert_in_delta 0.75, @simple.similar('tes'), D
|
63
|
+
assert_in_delta 0.75, @simple.similar('tst'), D
|
64
|
+
assert_in_delta 0.75, @simple.similar('best'), D
|
65
|
+
assert_in_delta 0.75, @simple.similar('tost'), D
|
66
|
+
assert_in_delta 0.75, @simple.similar('tesa'), D
|
67
|
+
assert_in_delta 0.25, @simple.similar('taex'), D
|
68
|
+
assert_in_delta 0.4, @simple.similar('aaatestbbb'), D
|
69
|
+
assert_in_delta 0.75, @simple.pattern.levenshtein_similar('est'), D
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_similar
|
73
|
+
assert_in_delta 1, @empty.similar(''), D
|
74
|
+
assert_in_delta 0, @empty.similar('not empty'), D
|
75
|
+
assert_in_delta 0.0, @simple.similar(''), D
|
76
|
+
assert_in_delta 1.0, @simple.similar('test'), D
|
77
|
+
assert_in_delta 0.8, @simple.similar('testa'), D
|
78
|
+
assert_in_delta 0.8, @simple.similar('atest'), D
|
79
|
+
assert_in_delta 0.8, @simple.similar('teast'), D
|
80
|
+
assert_in_delta 0.75, @simple.similar('est'), D
|
81
|
+
assert_in_delta 0.75, @simple.similar('tes'), D
|
82
|
+
assert_in_delta 0.75, @simple.similar('tst'), D
|
83
|
+
assert_in_delta 0.75, @simple.similar('best'), D
|
84
|
+
assert_in_delta 0.75, @simple.similar('tost'), D
|
85
|
+
assert_in_delta 0.75, @simple.similar('tesa'), D
|
86
|
+
assert_in_delta 0.25, @simple.similar('taex'), D
|
87
|
+
assert_in_delta 0.4, @simple.similar('aaatestbbb'), D
|
88
|
+
assert_in_delta 0.75, @simple.pattern.levenshtein_similar('est'), D
|
89
|
+
@simple.insertion = 1
|
90
|
+
@simple.substitution = @simple.deletion = 2
|
91
|
+
assert_in_delta 0.875, @simple.similar('tst'), D
|
92
|
+
end
|
93
|
+
end
|
94
|
+
# vim: set et sw=2 ts=2:
|
metadata
CHANGED
@@ -3,15 +3,17 @@ rubygems_version: 0.8.10
|
|
3
3
|
specification_version: 1
|
4
4
|
name: amatch
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.5
|
7
|
-
date: 2005-
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2005-06-01
|
8
8
|
summary: Approximate String Matching library
|
9
9
|
require_paths:
|
10
|
-
-
|
10
|
+
- ext
|
11
11
|
email: flori@ping.de
|
12
12
|
homepage: http://amatch.rubyforge.org
|
13
13
|
rubyforge_project: amatch
|
14
|
-
description: Amatch is a library for approximate string matching and searching
|
14
|
+
description: "Amatch is a library for approximate string matching and searching in strings.
|
15
|
+
Several algorithms can be used to do this, and it's also possible to compute a
|
16
|
+
similarity metric number between 0.0 and 1.0 for two given strings."
|
15
17
|
autorequire: amatch
|
16
18
|
default_executable: agrep.rb
|
17
19
|
bindir: bin
|
@@ -36,15 +38,31 @@ files:
|
|
36
38
|
- ext/MANIFEST
|
37
39
|
- ext/amatch.c
|
38
40
|
- ext/extconf.rb
|
41
|
+
- ext/amatch.bundle
|
42
|
+
- ext/tags
|
43
|
+
- ext/pair.h
|
44
|
+
- ext/pair.c
|
39
45
|
- Rakefile
|
40
46
|
- VERSION
|
41
|
-
- amatch.txt.en
|
42
47
|
- install.rb
|
48
|
+
- config.save
|
49
|
+
- InstalledFiles
|
43
50
|
- tests
|
44
|
-
- tests/test.rb
|
51
|
+
- tests/test_levenshtein.rb
|
52
|
+
- tests/test_hamming.rb
|
53
|
+
- tests/test_pair_distance.rb
|
54
|
+
- tests/runner.rb
|
55
|
+
- tests/test_sellers.rb
|
56
|
+
- tests/test_longest_subsequence.rb
|
57
|
+
- tests/test_longest_substring.rb
|
45
58
|
test_files:
|
46
|
-
- tests/test.rb
|
47
|
-
rdoc_options:
|
59
|
+
- tests/runner.rb
|
60
|
+
rdoc_options:
|
61
|
+
- "--title"
|
62
|
+
- "Amatch -- Approximate Matching"
|
63
|
+
- "--main"
|
64
|
+
- Amatch
|
65
|
+
- "--line-numbers"
|
48
66
|
extra_rdoc_files: []
|
49
67
|
executables:
|
50
68
|
- agrep.rb
|
data/amatch.txt.en
DELETED
@@ -1,117 +0,0 @@
|
|
1
|
-
AMatch
|
2
|
-
|
3
|
-
Approximate Matching/Searching/Comparing
|
4
|
-
|
5
|
-
SYNOPSIS
|
6
|
-
|
7
|
-
require 'amatch'
|
8
|
-
|
9
|
-
m = Amatch.new("pattern")
|
10
|
-
|
11
|
-
p m.match("pattren")
|
12
|
-
p m.match(["pattren","parent"])
|
13
|
-
p m.matchr("pattren")
|
14
|
-
p m.compare("pattren")
|
15
|
-
p m.comparer("pattren")
|
16
|
-
p m.compare("pattn")
|
17
|
-
p m.comparer("pattn")
|
18
|
-
p m.search("abcpattrendef")
|
19
|
-
p m.searchr("abcpattrendef")
|
20
|
-
|
21
|
-
DESCRIPTION
|
22
|
-
|
23
|
-
This class enables your programs to do approximate matching, searching and
|
24
|
-
comparing of strings. It uses an algorithm that calculates the Levenshtein
|
25
|
-
distance between those strings to implement those features.
|
26
|
-
|
27
|
-
The Levenshtein edit distance is defined as the minimal cost involved to
|
28
|
-
transform one string into another by using three elementary operations:
|
29
|
-
deletion, insertion and substitution of a character. To transform "water" into
|
30
|
-
"wine", for instance, you have to substitute ?a -> ?i: "witer", ?t -> ?n:
|
31
|
-
"winer" and delete ?r: "wine". The edit distance between "water" and "wine" is
|
32
|
-
3, because you have to apply three operations. The edit distance between
|
33
|
-
"wine" and "wine" is 0, of course: no operation is necessary for the
|
34
|
-
transformation -- they're already the same string. It's easy to see that more
|
35
|
-
similar strings have smaller edit distances than strings that differ a lot.
|
36
|
-
|
37
|
-
You can also use different weights for every operation to prefer special
|
38
|
-
operations over others. There are three different kinds of match methods
|
39
|
-
defined in this class: "match" computes the Levenshtein distance between a
|
40
|
-
pattern and some strings, "search" searches in some text for a special pattern
|
41
|
-
returning a minimal distance, "compare" calculates a value that can be used to
|
42
|
-
define a partial order between strings in relation to a given pattern. It's
|
43
|
-
also possible to compute a relative distance. This floating point value is
|
44
|
-
computed as absolute distance / length of search pattern.
|
45
|
-
|
46
|
-
CONSTRUCTOR
|
47
|
-
|
48
|
-
- Amatch#new(pattern)
|
49
|
-
|
50
|
-
constructs an Amatch object and initializes it with 'pattern'. If no 'pattern'
|
51
|
-
is given it has to be set with Amatch#pattern before matching.
|
52
|
-
|
53
|
-
METHODS
|
54
|
-
|
55
|
-
- Amatch#pattern pattern string to match against
|
56
|
-
|
57
|
-
- Amatch#subw weight of one substitution (type Fixnum)
|
58
|
-
|
59
|
-
- Amatch#delw weight of one deletion (type Fixnum)
|
60
|
-
|
61
|
-
- Amatch#insw weight of one insertion (type Fixnum)
|
62
|
-
|
63
|
-
- Amatch#resetw resets all weights to their default values (=1).
|
64
|
-
|
65
|
-
The following methods require the parameter 'strings'. This parameter can be
|
66
|
-
of type String or Array of Strings. The method executes the matching operation
|
67
|
-
and returns a number if a string was given. If an array of strings was given
|
68
|
-
it returns an array of numbers.
|
69
|
-
|
70
|
-
- Amatch#match(strings)
|
71
|
-
|
72
|
-
calculates the absolute edit distance(s) between 'pattern' and 'strings' =
|
73
|
-
the Levenshtein distance in char operations. See also Amatch#pattern.
|
74
|
-
|
75
|
-
- Amatch#matchr(strings)
|
76
|
-
|
77
|
-
calculates the relative edit distance as float. This value is defined as the
|
78
|
-
edit distance divided by the length of 'pattern'. See also Amatch#pattern.
|
79
|
-
|
80
|
-
- Amatch#search(strings)
|
81
|
-
|
82
|
-
searches 'pattern' in strings and returns the edit distance by greedily
|
83
|
-
trimming prefixes or suffixes of the match.
|
84
|
-
|
85
|
-
- Amatch#searchr(strings)
|
86
|
-
|
87
|
-
does the same as Amatch#search but divides the edit distance by the length
|
88
|
-
of 'pattern' and returns the value as float.
|
89
|
-
|
90
|
-
- Amatch#compare(strings)
|
91
|
-
|
92
|
-
calculates the same absolute value as Amatch#match. The sign of the result
|
93
|
-
value is negative if the strings are shorter than 'pattern' or positive
|
94
|
-
otherwise.
|
95
|
-
|
96
|
-
- Amatch#comparer(strings)
|
97
|
-
|
98
|
-
calculates the same absolute value as Amatch#matchr. The sign of the
|
99
|
-
result value is negative if the strings are shorter than 'pattern' or
|
100
|
-
positive otherwise.
|
101
|
-
|
102
|
-
EXAMPLES
|
103
|
-
|
104
|
-
An agrep utility will be installed that demonstrates the usage of this
|
105
|
-
library.
|
106
|
-
|
107
|
-
AUTHOR
|
108
|
-
|
109
|
-
Florian Frank <flori@ping.de>
|
110
|
-
|
111
|
-
COPYRIGHT
|
112
|
-
|
113
|
-
Copyright (c) 2002 Florian Frank <flori@ping.de>
|
114
|
-
|
115
|
-
This is free software; you can redistribute it and/or modify it under the
|
116
|
-
terms of the GNU General Public License Version 2 as published by the Free
|
117
|
-
Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
data/tests/test.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
require 'amatch'
|
3
|
-
|
4
|
-
class TC_AmatchTest < Test::Unit::TestCase
|
5
|
-
|
6
|
-
def setup
|
7
|
-
@matcher = Amatch.new('test')
|
8
|
-
end
|
9
|
-
|
10
|
-
def test_match
|
11
|
-
assert(@matcher.match('') == 4)
|
12
|
-
assert(@matcher.match('test') == 0)
|
13
|
-
assert(@matcher.match('test') == 0)
|
14
|
-
assert(@matcher.match('testa') == 1)
|
15
|
-
assert(@matcher.match('atest') == 1)
|
16
|
-
assert(@matcher.match('teast') == 1)
|
17
|
-
assert(@matcher.match('est') == 1)
|
18
|
-
assert(@matcher.match('tes') == 1)
|
19
|
-
assert(@matcher.match('tst') == 1)
|
20
|
-
assert(@matcher.match('best') == 1)
|
21
|
-
assert(@matcher.match('tost') == 1)
|
22
|
-
assert(@matcher.match('tesa') == 1)
|
23
|
-
assert(@matcher.match('taex') == 3)
|
24
|
-
assert(@matcher.matchr('') == 1.0)
|
25
|
-
assert(@matcher.matchr('tesa') == 0.25)
|
26
|
-
assert(@matcher.match('aaatestbbb') == 6)
|
27
|
-
end
|
28
|
-
|
29
|
-
def test_search
|
30
|
-
assert(@matcher.search('') == 4)
|
31
|
-
assert(@matcher.searchr('') == 1.0)
|
32
|
-
assert(@matcher.search('aaatestbbb') == 0)
|
33
|
-
assert(@matcher.search('aaataexbbb') == 3)
|
34
|
-
assert(@matcher.searchr('aaataexbbb') == 0.75)
|
35
|
-
assert(@matcher.search('aaaaaaaaa') == 4)
|
36
|
-
assert(@matcher.searchr('aaaaaaaaa') == 1.0)
|
37
|
-
end
|
38
|
-
|
39
|
-
def test_compare
|
40
|
-
assert(@matcher.compare('') == -4)
|
41
|
-
assert(@matcher.comparer('') == -1.0)
|
42
|
-
assert(@matcher.compare('taex') == 3)
|
43
|
-
assert(@matcher.comparer('tesa') == 0.25)
|
44
|
-
assert(@matcher.compare('aaatestbbb') == 6)
|
45
|
-
assert(@matcher.compare('test') == 0)
|
46
|
-
assert(@matcher.compare('tex') == -2)
|
47
|
-
assert(@matcher.comparer('tsa') == -0.5)
|
48
|
-
assert(@matcher.compare('wxyz') == 4)
|
49
|
-
assert(@matcher.comparer('wxyz') == 1.0)
|
50
|
-
assert_raises(TypeError) { @matcher.match(:foo) }
|
51
|
-
end
|
52
|
-
|
53
|
-
def test_array_result
|
54
|
-
assert(@matcher.match([]) == []);
|
55
|
-
assert(@matcher.match(["tets", "test"]) == [2, 0]);
|
56
|
-
assert(@matcher.matchr(["tets", "test"]) == [0.5, 0]);
|
57
|
-
assert(@matcher.compare(["tets", "test"]) == [2, 0]);
|
58
|
-
assert(@matcher.comparer(["tets", "test"]) == [0.5, 0]);
|
59
|
-
assert(@matcher.search(["tetsaaa", "testaaa"]) == [1, 0]);
|
60
|
-
assert(@matcher.searchr(["tetsaaa", "testaaa"]) == [0.25, 0]);
|
61
|
-
assert_raises(TypeError) { @matcher.match([:foo, "bar"]) }
|
62
|
-
end
|
63
|
-
|
64
|
-
def test_weights
|
65
|
-
assert(@matcher.subw == 1)
|
66
|
-
assert(@matcher.insw == 1)
|
67
|
-
assert(@matcher.delw == 1)
|
68
|
-
@matcher.subw = 2
|
69
|
-
assert(@matcher.subw == 2)
|
70
|
-
assert(@matcher.match('tast') == 2)
|
71
|
-
@matcher.subw = 1
|
72
|
-
assert(@matcher.match('tast') == 1)
|
73
|
-
@matcher.delw = 2
|
74
|
-
assert(@matcher.delw == 2)
|
75
|
-
assert(@matcher.match('teist') == 2)
|
76
|
-
@matcher.insw = 2
|
77
|
-
assert(@matcher.insw == 2)
|
78
|
-
assert(@matcher.match('tst') == 2)
|
79
|
-
@matcher.resetw
|
80
|
-
assert(@matcher.subw == 1)
|
81
|
-
assert(@matcher.insw == 1)
|
82
|
-
assert(@matcher.delw == 1)
|
83
|
-
@matcher.subw = :something
|
84
|
-
assert_raises(TypeError) { @matcher.match('anything') }
|
85
|
-
@matcher.subw = 1
|
86
|
-
@matcher.insw = :something
|
87
|
-
assert_raises(TypeError) { @matcher.match('anything') }
|
88
|
-
@matcher.insw = 1
|
89
|
-
@matcher.delw = :something
|
90
|
-
assert_raises(TypeError) { @matcher.match('anything') }
|
91
|
-
end
|
92
|
-
|
93
|
-
end
|
94
|
-
# vim: set noet sw=4 ts=4:
|