amatch 0.2.5-x86-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +31 -0
- data/COPYING +340 -0
- data/README +130 -0
- data/Rakefile +141 -0
- data/VERSION +1 -0
- data/amatch.gemspec +31 -0
- data/bin/agrep.rb +79 -0
- data/ext/amatch.c +1641 -0
- data/ext/common.h +25 -0
- data/ext/extconf.rb +6 -0
- data/ext/pair.c +77 -0
- data/ext/pair.h +29 -0
- data/install.rb +28 -0
- data/lib/amatch.so +0 -0
- data/lib/amatch/version.rb +8 -0
- data/tests/test_hamming.rb +58 -0
- data/tests/test_jaro.rb +29 -0
- data/tests/test_jaro_winkler.rb +38 -0
- data/tests/test_levenshtein.rb +83 -0
- data/tests/test_longest_subsequence.rb +61 -0
- data/tests/test_longest_substring.rb +61 -0
- data/tests/test_pair_distance.rb +86 -0
- data/tests/test_sellers.rb +96 -0
- metadata +95 -0
data/ext/common.h
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
/*
 * common.h - fallback definitions for Ruby accessor macros.
 *
 * Each macro below is defined only when the Ruby headers included before
 * this file have not already provided it.  The fallbacks read the
 * corresponding field directly from the RSTRING / RARRAY / RFLOAT structs.
 *
 * The include guard deliberately avoids a leading double underscore, which
 * is reserved for the C implementation, and follows the *_H_INCLUDED
 * naming used by pair.h.
 */
#ifndef COMMON_H_INCLUDED
#define COMMON_H_INCLUDED

/* Pointer to the character data of a Ruby String. */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif

/* Length field of a Ruby String. */
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif

/* Pointer to the element storage of a Ruby Array. */
#ifndef RARRAY_PTR
#define RARRAY_PTR(ary) (RARRAY(ary)->ptr)
#endif

/* Length field of a Ruby Array. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif

/* Value field of a Ruby Float. */
#ifndef RFLOAT_VALUE
#define RFLOAT_VALUE(val) (RFLOAT(val)->value)
#endif

#endif /* COMMON_H_INCLUDED */
|
data/ext/extconf.rb
ADDED
data/ext/pair.c
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#include "pair.h"
|
2
|
+
|
3
|
+
#define DEBUG 0
|
4
|
+
|
5
|
+
/*
 * Count how many character pairs PairArray_new will produce for tokens.
 *
 * Every entry of the tokens array contributes (length - 1) pairs; entries
 * whose length is 0 or 1 contribute nothing.  Entries are read with
 * RSTRING_LEN, so they are expected to be Ruby strings.
 */
static int predict_length(VALUE tokens)
{
    int total = 0;
    int idx;

    for (idx = 0; idx < RARRAY_LEN(tokens); idx++) {
        VALUE token = rb_ary_entry(tokens, idx);
        int pair_count = RSTRING_LEN(token) - 1;

        if (pair_count <= 0) {
            continue;
        }
        total += pair_count;
    }
    return total;
}
|
15
|
+
|
16
|
+
/*
 * Build a PairArray from a Ruby array of string tokens.
 *
 * The storage is sized up front with predict_length() and zero-filled.
 * Then, for every token, each pair of adjacent characters (string[j],
 * string[j + 1]) is appended and marked PAIR_ACTIVE.
 *
 * Both the PairArray and its pairs buffer are allocated here with
 * ALLOC / ALLOC_N; the result is returned to the caller (see
 * pair_array_destroy for the matching release).
 */
PairArray *PairArray_new(VALUE tokens)
{
    int capacity = predict_length(tokens);
    PairArray *result = ALLOC(PairArray);
    Pair *storage = ALLOC_N(Pair, capacity);
    Pair *next = storage;   /* slot that receives the next pair */
    int t;

    MEMZERO(storage, Pair, capacity);
    result->pairs = storage;
    result->len = capacity;

    for (t = 0; t < RARRAY_LEN(tokens); t++) {
        VALUE token = rb_ary_entry(tokens, t);
        char *chars = RSTRING_PTR(token);
        int c;

        for (c = 0; c < RSTRING_LEN(token) - 1; c++, next++) {
            next->fst = chars[c];
            next->snd = chars[c + 1];
            next->status = PAIR_ACTIVE;
        }
    }
    return result;
}
|
36
|
+
|
37
|
+
/*
 * Mark every pair in self as PAIR_ACTIVE again.
 *
 * pair_array_match() flags matched pairs of its second argument as
 * PAIR_INACTIVE; this resets all status fields of an array.
 */
void pair_array_reactivate(PairArray *self)
{
    int remaining = self->len;

    /* Order of the writes is irrelevant, so walk from the back. */
    while (remaining-- > 0) {
        self->pairs[remaining].status = PAIR_ACTIVE;
    }
}
|
44
|
+
|
45
|
+
/*
 * Compute a similarity score between two pair arrays.
 *
 * For each pair of self, the first pair of other that compares equal
 * under pair_equal() is counted as a match and flagged PAIR_INACTIVE so
 * that it cannot be matched a second time.  The result is
 *
 *     2 * matches / (self->len + other->len)
 *
 * and 1.0 when both arrays are empty.
 *
 * Side effect: status fields in other are modified; they are not reset
 * here (pair_array_reactivate does that).
 */
double pair_array_match(PairArray *self, PairArray *other)
{
    int total = self->len + other->len;
    int hits = 0;
    int s, o;

    if (total == 0) {
        return 1.0;
    }

    for (s = 0; s < self->len; s++) {
        for (o = 0; o < other->len; o++) {
#if DEBUG
            pair_print(self->pairs[s]);
            putc(' ', stdout);
            pair_print(other->pairs[o]);
            printf(" -> %d\n", pair_equal(self->pairs[s], other->pairs[o]));
#endif
            if (!pair_equal(self->pairs[s], other->pairs[o])) {
                continue;
            }
            /* Consume the matched pair and move on to the next pair of self. */
            hits++;
            other->pairs[o].status = PAIR_INACTIVE;
            break;
        }
    }
    return ((double) (2 * hits)) / total;
}
|
67
|
+
|
68
|
+
/*
 * Write a pair to stdout as its two characters followed by the numeric
 * status in parentheses.  No newline is emitted.
 */
void pair_print(Pair pair)
{
    char first = pair.fst;
    char second = pair.snd;
    int state = pair.status;

    printf("%c%c (%d)", first, second, state);
}
|
72
|
+
|
73
|
+
/*
 * Release a PairArray created by PairArray_new().
 *
 * PairArray_new() obtains both the struct and its pairs buffer through
 * Ruby's ALLOC / ALLOC_N macros, so the memory has to be handed back with
 * Ruby's xfree() rather than the C library's free(): mixing the two
 * allocators is not guaranteed to work (notably when the extension and
 * the interpreter are linked against different C runtimes).
 *
 * Passing NULL is allowed and does nothing.
 */
void pair_array_destroy(PairArray *pair_array)
{
    if (!pair_array) {
        return;
    }
    xfree(pair_array->pairs);
    xfree(pair_array);
}
|
data/ext/pair.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#ifndef PAIR_H_INCLUDED
|
2
|
+
#define PAIR_H_INCLUDED
|
3
|
+
|
4
|
+
#include "ruby.h"
|
5
|
+
#include "common.h"
|
6
|
+
|
7
|
+
/* Values stored in Pair.status. */
enum {
    PAIR_ACTIVE   = 1,  /* pair may still take part in a match */
    PAIR_INACTIVE = 2   /* pair has already been matched */
};

/* Two adjacent characters of a token plus a matching status. */
typedef struct PairStruct {
    char fst;       /* first character of the pair */
    char snd;       /* second character of the pair */
    char status;    /* PAIR_ACTIVE or PAIR_INACTIVE */
    char __align;   /* unused; presumably pads the struct to four bytes */
} Pair;
|
15
|
+
|
16
|
+
typedef struct PairArrayStruct {
|
17
|
+
Pair *pairs;
|
18
|
+
int len;
|
19
|
+
} PairArray;
|
20
|
+
|
21
|
+
PairArray *PairArray_new(VALUE tokens);
|
22
|
+
/*
 * Nonzero when a and b hold the same two characters and the PAIR_ACTIVE
 * bit is set in the status of both.
 *
 * Both arguments are expanded several times, so they must be free of
 * side effects.
 */
#define pair_equal(a, b)                              \
    ((a).fst == (b).fst                               \
     && (a).snd == (b).snd                            \
     && ((a).status & (b).status & PAIR_ACTIVE))
|
24
|
+
double pair_array_match(PairArray *self, PairArray *other);
|
25
|
+
void pair_array_destroy(PairArray *pair_array);
|
26
|
+
void pair_print(Pair pair);
|
27
|
+
void pair_array_reactivate(PairArray *self);
|
28
|
+
|
29
|
+
#endif
|
data/install.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Manual installer: builds the C extension in ext/ and copies the extension,
# the agrep.rb script and lib/amatch/version.rb into the directories
# reported by RbConfig.

require 'rbconfig'
# RbConfig is the supported name of the configuration module; the old
# top-level Config alias is no longer available in current Ruby versions.
include RbConfig
require 'fileutils'
include FileUtils::Verbose

# Honour an explicit MAKE, otherwise pick the first make program that runs.
MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
MAKE or abort ' *** No make program found, set the MAKE environment variable.'

bindir  = CONFIG['bindir']
archdir = CONFIG['sitearchdir']
libdir  = CONFIG['sitelibdir']
dlext   = CONFIG['DLEXT']

# Build the extension and install the resulting shared object.
cd 'ext' do
  system 'ruby extconf.rb' or exit 1
  system "#{MAKE}" or exit 1
  mkdir_p archdir
  install "amatch.#{dlext}", archdir
end

# Install the command line tool.
cd 'bin' do
  install('agrep.rb', bindir)
end

# Install the pure Ruby part of the library.
cd 'lib/amatch' do
  mkdir_p d = File.join(libdir, 'amatch')
  install 'version.rb', d
end
warn " *** Installed amatch extension."
|
data/lib/amatch.so
ADDED
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Tests for Amatch::Hamming: #match, #similar and String#hamming_similar.
class TestHamming < Test::Unit::TestCase
  include Amatch

  # Tolerance used for all floating point comparisons.
  D = 0.000001

  def setup
    @small = Hamming.new('test')
    @empty = Hamming.new('')
    @long  = Hamming.new('A' * 160)
  end

  def test_empty
    assert_in_delta 0, @empty.match(''), D
    assert_in_delta 9, @empty.match('not empty'), D
    assert_in_delta 1, @empty.similar(''), D
    assert_in_delta 0, @empty.similar('not empty'), D
  end

  def test_small_match
    # [expected result, string matched against the pattern 'test']
    [
      [4, ''],
      [0, 'test'],
      [1, 'testa'],
      [5, 'atest'],
      [3, 'teast'],
      [4, 'est'],
      [1, 'tes'],
      [3, 'tst'],
      [1, 'best'],
      [1, 'tost'],
      [1, 'tesa'],
      [3, 'taex'],
      [9, 'aaatestbbb'],
    ].each do |expected, string|
      assert_in_delta expected, @small.match(string), D
    end
  end

  def test_small_similar
    # [expected similarity, string compared with the pattern 'test']
    [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.0,  'atest'],
      [0.4,  'teast'],
      [0,    'est'],
      [0.75, 'tes'],
      [0.25, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.25, 'taex'],
      [0.1,  'aaatestbbb'],
    ].each do |expected, string|
      assert_in_delta expected, @small.similar(string), D
    end
    assert_in_delta 0.8, @small.pattern.hamming_similar('testa'), D
  end

  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), D
  end
end
|
data/tests/test_jaro.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'test/unit'
# Load the extension through the load path, like the other amatch tests do,
# instead of a path relative to this file that only exists in a source tree
# with a freshly built ext/ directory.
require 'amatch'

# Tests for Amatch::Jaro#match, with and without case folding.
class TestJaro < Test::Unit::TestCase
  include Amatch

  # Tolerance used for all floating point comparisons.
  D = 0.0005

  def setup
    @martha = Jaro.new('Martha')
    @dwayne = Jaro.new('dwayne')
    @dixon  = Jaro.new('DIXON')
    @one    = Jaro.new('one')
  end

  def test_case
    @martha.ignore_case = true
    assert_in_delta 0.944, @martha.match('MARHTA'), D
    @martha.ignore_case = false
    assert_in_delta 0.444, @martha.match('MARHTA'), D
  end

  def test_match
    assert_in_delta 0.944, @martha.match('MARHTA'), D
    assert_in_delta 0.822, @dwayne.match('DUANE'), D
    assert_in_delta 0.767, @dixon.match('DICKSONX'), D
    assert_in_delta 0.667, @one.match('orange'), D
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Tests for Amatch::JaroWinkler: #match, #ignore_case= and #scaling_factor.
class TestJaroWinkler < Test::Unit::TestCase
  include Amatch

  # Tolerance used for the match score comparisons.
  D = 0.0005

  def setup
    @martha = JaroWinkler.new('Martha')
    @dwayne = JaroWinkler.new('dwayne')
    @dixon  = JaroWinkler.new('DIXON')
    @one    = JaroWinkler.new('one')
  end

  def test_case
    # [value assigned to ignore_case, expected score for 'MARHTA']
    [[true, 0.961], [false, 0.500]].each do |flag, expected|
      @martha.ignore_case = flag
      assert_in_delta expected, @martha.match('MARHTA'), D
    end
  end

  def test_match
    # [expected score, matcher, string matched against it]
    [
      [0.961, @martha, 'MARHTA'],
      [0.840, @dwayne, 'DUANE'],
      [0.813, @dixon,  'DICKSONX'],
      [0,     @one,    'two'],
      [0.700, @one,    'orange'],
    ].each do |expected, matcher, string|
      assert_in_delta expected, matcher.match(string), D
    end
  end

  def test_scaling_factor
    assert_in_delta 0.1, @martha.scaling_factor, 0.0000001
    @martha.scaling_factor = 0.2
    assert_in_delta 0.978, @martha.match('MARHTA'), D
    @martha.scaling_factor = 0.5 # this is far too high
    assert_in_delta 1.028, @martha.match('MARHTA'), D
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Tests for Amatch::Levenshtein: #match, #search, #similar, #pattern= and
# String#levenshtein_similar.
class TestLevenshtein < Test::Unit::TestCase
  include Amatch

  def setup
    @d      = 0.000001 # tolerance for floating point comparisons
    @empty  = Levenshtein.new('')
    @simple = Levenshtein.new('test')
    @long   = Levenshtein.new('A' * 160)
  end

  def test_match
    # [expected result, string matched against the pattern 'test']
    [
      [4, ''],
      [0, 'test'],
      [0, 'test'],
      [1, 'testa'],
      [1, 'atest'],
      [1, 'teast'],
      [1, 'est'],
      [1, 'tes'],
      [1, 'tst'],
      [1, 'best'],
      [1, 'tost'],
      [1, 'tesa'],
      [3, 'taex'],
      [6, 'aaatestbbb'],
    ].each do |expected, string|
      assert_equal expected, @simple.match(string)
    end
  end

  def test_search
    # [expected result, matcher, string searched]
    [
      [4, @simple, ''],
      [0, @empty,  ''],
      [0, @empty,  'test'],
      [0, @simple, 'aaatestbbb'],
      [3, @simple, 'aaataexbbb'],
      [4, @simple, 'aaaaaaaaa'],
    ].each do |expected, matcher, string|
      assert_equal expected, matcher.search(string)
    end
  end

  def test_array_result
    assert_equal [2, 0], @simple.match(["tets", "test"])
    assert_equal [1, 0], @simple.search(["tetsaaa", "testaaa"])
    assert_raises(TypeError) { @simple.match([:foo, "bar"]) }
  end

  def test_pattern_setting
    assert_raises(TypeError) { @simple.pattern = :something }
    assert_equal 0, @simple.match('test')
    @simple.pattern = ''
    assert_equal 4, @simple.match('test')
    @simple.pattern = 'test'
    assert_equal 0, @simple.match('test')
  end

  def test_similar
    assert_in_delta 1, @empty.similar(''), @d
    assert_in_delta 0, @empty.similar('not empty'), @d
    # [expected similarity, string compared with the pattern 'test']
    [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.8,  'atest'],
      [0.8,  'teast'],
      [0.75, 'est'],
      [0.75, 'tes'],
      [0.75, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.25, 'taex'],
      [0.4,  'aaatestbbb'],
    ].each do |expected, string|
      assert_in_delta expected, @simple.similar(string), @d
    end
    assert_in_delta 0.75, @simple.pattern.levenshtein_similar('est'), @d
  end

  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), @d
  end

  # Only checks that comparing two longer multi-line strings returns a
  # truthy value; no particular score is asserted.
  def test_long2
    a = "lost this fantasy, this fantasy, this fantasy, this fantasy, this fantasy, this fantasy\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
    b = "lost\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
    assert a.levenshtein_similar(b)
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Tests for Amatch::LongestSubsequence: #match, #similar and
# String#longest_subsequence_similar.
class TestLongestSubsequence < Test::Unit::TestCase
  include Amatch

  # Tolerance used for all floating point comparisons.
  D = 0.000001

  def setup
    @small = LongestSubsequence.new('test')
    @empty = LongestSubsequence.new('')
    @long  = LongestSubsequence.new('A' * 160)
  end

  def test_empty_subsequence
    # [matcher, string]; every combination is expected to yield 0
    [
      [@empty, ''],
      [@empty, 'a'],
      [@small, ''],
      [@empty, 'not empty'],
    ].each do |matcher, string|
      assert_equal 0, matcher.match(string)
    end
  end

  def test_small_subsequence
    # [expected result, string matched against the pattern 'test']
    [
      [4, 'test'],
      [4, 'testa'],
      [4, 'atest'],
      [4, 'teast'],
      [3, 'est'],
      [3, 'tes'],
      [3, 'tst'],
      [3, 'best'],
      [3, 'tost'],
      [3, 'tesa'],
      [2, 'taex'],
      [1, 'aaatbbb'],
      [1, 'aaasbbb'],
      [4, 'aaatestbbb'],
    ].each do |expected, string|
      assert_equal expected, @small.match(string)
    end
  end

  def test_similar
    assert_in_delta 1, @empty.similar(''), D
    assert_in_delta 0, @empty.similar('not empty'), D
    # [expected similarity, string compared with the pattern 'test']
    [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.8,  'atest'],
      [0.8,  'teast'],
      [0.75, 'est'],
      [0.75, 'tes'],
      [0.75, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.50, 'taex'],
      [0.4,  'aaatestbbb'],
    ].each do |expected, string|
      assert_in_delta expected, @small.similar(string), D
    end
    assert_in_delta 0.75, @small.pattern.longest_subsequence_similar('est'), D
  end

  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), D
  end
end
|