amatch 0.2.5-x86-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +31 -0
- data/COPYING +340 -0
- data/README +130 -0
- data/Rakefile +141 -0
- data/VERSION +1 -0
- data/amatch.gemspec +31 -0
- data/bin/agrep.rb +79 -0
- data/ext/amatch.c +1641 -0
- data/ext/common.h +25 -0
- data/ext/extconf.rb +6 -0
- data/ext/pair.c +77 -0
- data/ext/pair.h +29 -0
- data/install.rb +28 -0
- data/lib/amatch.so +0 -0
- data/lib/amatch/version.rb +8 -0
- data/tests/test_hamming.rb +58 -0
- data/tests/test_jaro.rb +29 -0
- data/tests/test_jaro_winkler.rb +38 -0
- data/tests/test_levenshtein.rb +83 -0
- data/tests/test_longest_subsequence.rb +61 -0
- data/tests/test_longest_substring.rb +61 -0
- data/tests/test_pair_distance.rb +86 -0
- data/tests/test_sellers.rb +96 -0
- metadata +95 -0
data/ext/common.h
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#ifndef COMMON_H_INCLUDED
#define COMMON_H_INCLUDED

/*
 * Fallback accessor macros.
 *
 * Each macro below is defined only when the including translation unit has
 * not already defined it (presumably by a ruby.h that ships its own
 * version -- confirm against the targeted Ruby releases). The fallbacks
 * reach directly into the RSTRING/RARRAY/RFLOAT structures.
 *
 * The include guard deliberately avoids a double-underscore name: such
 * identifiers are reserved for the implementation, and the sibling header
 * pair.h already uses the *_H_INCLUDED convention.
 */

/* Pointer to the character data of a String. */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif

/* Length of a String as stored in its len field. */
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif

/* Pointer to the element storage of an Array. */
#ifndef RARRAY_PTR
#define RARRAY_PTR(ary) (RARRAY(ary)->ptr)
#endif

/* Number of elements of an Array as stored in its len field. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif

/* The value field of a Float. */
#ifndef RFLOAT_VALUE
#define RFLOAT_VALUE(val) (RFLOAT(val)->value)
#endif


#endif /* COMMON_H_INCLUDED */
|
data/ext/extconf.rb
ADDED
data/ext/pair.c
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#include "pair.h"
|
2
|
+
|
3
|
+
#define DEBUG 0
|
4
|
+
|
5
|
+
/*
 * Count how many character pairs the given token array will produce.
 *
 * Every token of length n contributes n - 1 pairs; tokens for which that
 * value is not positive contribute nothing.
 *
 * tokens: a Ruby Array whose entries are read with RSTRING_LEN (they are
 *         not type-checked here).
 * Returns the summed pair count.
 */
static int predict_length(VALUE tokens)
{
    int total = 0;
    int idx = 0;

    while (idx < RARRAY_LEN(tokens)) {
        VALUE token = rb_ary_entry(tokens, idx);
        int pair_count = RSTRING_LEN(token) - 1;

        if (pair_count > 0) {
            total += pair_count;
        }
        idx++;
    }
    return total;
}
|
15
|
+
|
16
|
+
/*
 * Build a PairArray holding every pair of adjacent characters found in the
 * strings of the Ruby Array `tokens`.
 *
 * The pair storage is sized with predict_length(), zeroed, and then filled
 * token by token; every stored pair starts out as PAIR_ACTIVE.
 *
 * Returns a newly allocated PairArray (allocated with ALLOC / ALLOC_N).
 */
PairArray *PairArray_new(VALUE tokens)
{
    int capacity = predict_length(tokens);
    PairArray *result = ALLOC(PairArray);
    Pair *slot = ALLOC_N(Pair, capacity);
    int t;

    MEMZERO(slot, Pair, capacity);
    result->pairs = slot;
    result->len = capacity;

    /* `slot` now doubles as a write cursor into the pair storage. */
    for (t = 0; t < RARRAY_LEN(tokens); t++) {
        VALUE token = rb_ary_entry(tokens, t);
        char *chars = RSTRING_PTR(token);
        int pos;

        for (pos = 0; pos < RSTRING_LEN(token) - 1; pos++, slot++) {
            slot->fst = chars[pos];
            slot->snd = chars[pos + 1];
            slot->status = PAIR_ACTIVE;
        }
    }
    return result;
}
|
36
|
+
|
37
|
+
/*
 * Mark every pair in `self` as PAIR_ACTIVE again.
 */
void pair_array_reactivate(PairArray *self)
{
    int remaining;

    /* The writes are independent, so walking backwards is equivalent. */
    for (remaining = self->len; remaining > 0; remaining--) {
        self->pairs[remaining - 1].status = PAIR_ACTIVE;
    }
}
|
44
|
+
|
45
|
+
/*
 * Compute a similarity score between two pair arrays.
 *
 * For each pair in `self`, the first pair in `other` that compares equal
 * under pair_equal() is counted as a match and switched to PAIR_INACTIVE,
 * so it cannot be matched a second time. Note that this modifies `other`.
 *
 * Returns 2 * matches / (self->len + other->len), or 1.0 when the combined
 * length is zero.
 */
double pair_array_match(PairArray *self, PairArray *other)
{
    int total = self->len + other->len;
    int hits = 0;
    int a;

    if (total == 0) {
        return 1.0;
    }

    for (a = 0; a < self->len; a++) {
        int b = 0;
        int found = 0;

        /* Stop scanning `other` as soon as this pair has found a partner. */
        while (!found && b < other->len) {
#if DEBUG
            pair_print(self->pairs[a]);
            putc(' ', stdout);
            pair_print(other->pairs[b]);
            printf(" -> %d\n", pair_equal(self->pairs[a], other->pairs[b]));
#endif
            if (pair_equal(self->pairs[a], other->pairs[b])) {
                other->pairs[b].status = PAIR_INACTIVE;
                hits++;
                found = 1;
            }
            b++;
        }
    }
    return ((double) (2 * hits)) / total;
}
|
67
|
+
|
68
|
+
/*
 * Write a pair to stdout as its two characters followed by the numeric
 * status in parentheses. No newline is emitted.
 */
void pair_print(Pair pair)
{
    int status = pair.status;

    printf("%c%c (%d)", pair.fst, pair.snd, status);
}
|
72
|
+
|
73
|
+
/*
 * Release a PairArray and its pair storage.
 *
 * Both allocations are made with Ruby's ALLOC / ALLOC_N in PairArray_new,
 * so they are returned through xfree() rather than the C library's free():
 * Ruby's allocator wrappers must be paired with Ruby's deallocator.
 *
 * pair_array: the array to destroy; a NULL pointer is ignored.
 */
void pair_array_destroy(PairArray *pair_array)
{
    if (!pair_array) {
        return;
    }
    xfree(pair_array->pairs);
    xfree(pair_array);
}
|
data/ext/pair.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#ifndef PAIR_H_INCLUDED
#define PAIR_H_INCLUDED

#include "ruby.h"
#include "common.h"

/* Values stored in Pair.status. */
enum {
    PAIR_ACTIVE   = 1,
    PAIR_INACTIVE = 2
};

/* Two characters plus a status flag. */
typedef struct PairStruct {
    char fst;      /* first character of the pair */
    char snd;      /* second character of the pair */
    char status;   /* PAIR_ACTIVE or PAIR_INACTIVE */
    char __align;  /* NOTE(review): presumably padding only -- confirm */
} Pair;

/* A counted array of Pair entries. */
typedef struct PairArrayStruct {
    Pair *pairs;   /* pair storage */
    int len;       /* number of entries in pairs */
} PairArray;

/*
 * Non-zero when both pairs hold the same two characters and both have the
 * PAIR_ACTIVE bit set in their status. Arguments are evaluated more than
 * once, so pass side-effect-free expressions.
 */
#define pair_equal(a, b) \
    ((a).fst == (b).fst && (a).snd == (b).snd && ((a).status & (b).status & PAIR_ACTIVE))

PairArray *PairArray_new(VALUE tokens);
void pair_array_reactivate(PairArray *self);
double pair_array_match(PairArray *self, PairArray *other);
void pair_array_destroy(PairArray *pair_array);
void pair_print(Pair pair);

#endif
|
data/install.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Builds the amatch C extension in ext/ and installs the resulting library,
# the agrep.rb script and lib/amatch/version.rb into the directories
# reported by the running Ruby's configuration.

require 'rbconfig'
# Use RbConfig rather than the legacy top-level Config alias, which newer
# Ruby releases no longer provide.
include RbConfig
require 'fileutils'
include FileUtils::Verbose

# Honour $MAKE if set; otherwise pick the first make program that runs.
MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }

bindir  = CONFIG['bindir']
archdir = CONFIG['sitearchdir']
libdir  = CONFIG['sitelibdir']
dlext   = CONFIG['DLEXT']

# Configure, compile and install the native extension.
cd 'ext' do
  system 'ruby extconf.rb' or exit 1
  system "#{MAKE}" or exit 1
  mkdir_p archdir
  install "amatch.#{dlext}", archdir
end

# Install the command line script.
cd 'bin' do
  install('agrep.rb', bindir)
end

# Install the pure-Ruby version file.
cd 'lib/amatch' do
  mkdir_p d = File.join(libdir, 'amatch')
  install 'version.rb', d
end
warn " *** Installed amatch extension."
|
data/lib/amatch.so
ADDED
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Exercises Amatch::Hamming#match and #similar against fixed expectations.
class TestHamming < Test::Unit::TestCase
  include Amatch

  # Tolerance used for every floating point comparison below.
  D = 0.000001

  def setup
    @small = Hamming.new('test')
    @empty = Hamming.new('')
    @long  = Hamming.new('A' * 160)
  end

  # An empty pattern compared with empty and non-empty input.
  def test_empty
    [[0, ''], [9, 'not empty']].each do |expected, text|
      assert_in_delta expected, @empty.match(text), D
    end
    [[1, ''], [0, 'not empty']].each do |expected, text|
      assert_in_delta expected, @empty.similar(text), D
    end
  end

  # Expected #match results for the pattern 'test'.
  def test_small_match
    expectations = [
      [4, ''],
      [0, 'test'],
      [1, 'testa'],
      [5, 'atest'],
      [3, 'teast'],
      [4, 'est'],
      [1, 'tes'],
      [3, 'tst'],
      [1, 'best'],
      [1, 'tost'],
      [1, 'tesa'],
      [3, 'taex'],
      [9, 'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_in_delta expected, @small.match(text), D
    end
  end

  # Expected #similar results for the pattern 'test', plus the String helper.
  def test_small_similar
    expectations = [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.0,  'atest'],
      [0.4,  'teast'],
      [0,    'est'],
      [0.75, 'tes'],
      [0.25, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.25, 'taex'],
      [0.1,  'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_in_delta expected, @small.similar(text), D
    end
    assert_in_delta 0.8, @small.pattern.hamming_similar('testa'), D
  end

  # A long pattern is fully similar to itself.
  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), D
  end
end
|
data/tests/test_jaro.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'test/unit'
# Load the extension through the load path, like the other test files do,
# instead of a path relative to ext/ that only exists in a build tree.
require 'amatch'

# Exercises Amatch::Jaro#match against fixed expectations.
class TestJaro < Test::Unit::TestCase
  include Amatch

  # Tolerance used for every floating point comparison below.
  D = 0.0005

  def setup
    @martha = Jaro.new('Martha')
    @dwayne = Jaro.new('dwayne')
    @dixon = Jaro.new('DIXON')
    @one = Jaro.new('one')
  end

  # Toggling ignore_case changes the score for differently cased input.
  def test_case
    @martha.ignore_case = true
    assert_in_delta 0.944, @martha.match('MARHTA'), D
    @martha.ignore_case = false
    assert_in_delta 0.444, @martha.match('MARHTA'), D
  end

  # Expected scores for a few fixed pattern/input combinations.
  def test_match
    assert_in_delta 0.944, @martha.match('MARHTA'), D
    assert_in_delta 0.822, @dwayne.match('DUANE'), D
    assert_in_delta 0.767, @dixon.match('DICKSONX'), D
    assert_in_delta 0.667, @one.match('orange'), D
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Exercises Amatch::JaroWinkler#match and its scaling_factor accessor.
class TestJaroWinkler < Test::Unit::TestCase
  include Amatch

  # Tolerance used for the score comparisons below.
  D = 0.0005

  def setup
    @martha = JaroWinkler.new('Martha')
    @dwayne = JaroWinkler.new('dwayne')
    @dixon  = JaroWinkler.new('DIXON')
    @one    = JaroWinkler.new("one")
  end

  # Toggling ignore_case changes the score for differently cased input.
  def test_case
    [[true, 0.961], [false, 0.500]].each do |flag, expected|
      @martha.ignore_case = flag
      assert_in_delta expected, @martha.match('MARHTA'), D
    end
  end

  # Expected scores for a few fixed pattern/input combinations.
  def test_match
    expectations = [
      [0.961, @martha, 'MARHTA'],
      [0.840, @dwayne, 'DUANE'],
      [0.813, @dixon,  'DICKSONX'],
      [0,     @one,    'two'],
      [0.700, @one,    'orange'],
    ]
    expectations.each do |expected, matcher, text|
      assert_in_delta expected, matcher.match(text), D
    end
  end

  # The scaling factor starts at 0.1 and influences the score when changed.
  def test_scaling_factor
    assert_in_delta 0.1, @martha.scaling_factor, 0.0000001
    # 0.5 is far too high: the expected score exceeds 1.
    [[0.2, 0.978], [0.5, 1.028]].each do |factor, expected|
      @martha.scaling_factor = factor
      assert_in_delta expected, @martha.match('MARHTA'), D
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Exercises Amatch::Levenshtein#match, #search, #similar and pattern setting.
class TestLevenshtein < Test::Unit::TestCase
  include Amatch

  def setup
    @d      = 0.000001
    @empty  = Levenshtein.new('')
    @simple = Levenshtein.new('test')
    @long   = Levenshtein.new('A' * 160)
  end

  # Expected #match results for the pattern 'test'.
  def test_match
    expectations = [
      [4, ''],
      [0, 'test'],
      [0, 'test'],
      [1, 'testa'],
      [1, 'atest'],
      [1, 'teast'],
      [1, 'est'],
      [1, 'tes'],
      [1, 'tst'],
      [1, 'best'],
      [1, 'tost'],
      [1, 'tesa'],
      [3, 'taex'],
      [6, 'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_equal expected, @simple.match(text)
    end
  end

  # Expected #search results for the 'test' and empty patterns.
  def test_search
    expectations = [
      [4, @simple, ''],
      [0, @empty,  ''],
      [0, @empty,  'test'],
      [0, @simple, 'aaatestbbb'],
      [3, @simple, 'aaataexbbb'],
      [4, @simple, 'aaaaaaaaa'],
    ]
    expectations.each do |expected, matcher, text|
      assert_equal expected, matcher.search(text)
    end
  end

  # Arrays of strings yield arrays of results; non-strings raise TypeError.
  def test_array_result
    assert_equal [2, 0], @simple.match(["tets", "test"])
    assert_equal [1, 0], @simple.search(["tetsaaa", "testaaa"])
    assert_raises(TypeError) do
      @simple.match([:foo, "bar"])
    end
  end

  # A rejected pattern assignment raises; accepted ones change the result.
  def test_pattern_setting
    assert_raises(TypeError) do
      @simple.pattern = :something
    end
    assert_equal 0, @simple.match('test')
    [['', 4], ['test', 0]].each do |pattern, expected|
      @simple.pattern = pattern
      assert_equal expected, @simple.match('test')
    end
  end

  # Expected #similar results, plus the String helper.
  def test_similar
    [[1, ''], [0, 'not empty']].each do |expected, text|
      assert_in_delta expected, @empty.similar(text), @d
    end
    expectations = [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.8,  'atest'],
      [0.8,  'teast'],
      [0.75, 'est'],
      [0.75, 'tes'],
      [0.75, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.25, 'taex'],
      [0.4,  'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_in_delta expected, @simple.similar(text), @d
    end
    assert_in_delta 0.75, @simple.pattern.levenshtein_similar('est'), @d
  end

  # A long pattern is fully similar to itself.
  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), @d
  end

  # Two long multi-line strings produce a truthy similarity.
  def test_long2
    a = "lost this fantasy, this fantasy, this fantasy, this fantasy, this fantasy, this fantasy\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
    b = "lost\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
    assert a.levenshtein_similar(b)
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
# Exercises Amatch::LongestSubsequence#match and #similar.
class TestLongestSubsequence < Test::Unit::TestCase
  include Amatch

  # Tolerance used for every floating point comparison below.
  D = 0.000001

  def setup
    @small = LongestSubsequence.new('test')
    @empty = LongestSubsequence.new('')
    @long  = LongestSubsequence.new('A' * 160)
  end

  # Whenever either side is empty the expected match is 0.
  def test_empty_subsequence
    expectations = [
      [@empty, ''],
      [@empty, 'a'],
      [@small, ''],
      [@empty, 'not empty'],
    ]
    expectations.each do |matcher, text|
      assert_equal 0, matcher.match(text)
    end
  end

  # Expected #match results for the pattern 'test'.
  def test_small_subsequence
    expectations = [
      [4, 'test'],
      [4, 'testa'],
      [4, 'atest'],
      [4, 'teast'],
      [3, 'est'],
      [3, 'tes'],
      [3, 'tst'],
      [3, 'best'],
      [3, 'tost'],
      [3, 'tesa'],
      [2, 'taex'],
      [1, 'aaatbbb'],
      [1, 'aaasbbb'],
      [4, 'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_equal expected, @small.match(text)
    end
  end

  # Expected #similar results, plus the String helper.
  def test_similar
    [[1, ''], [0, 'not empty']].each do |expected, text|
      assert_in_delta expected, @empty.similar(text), D
    end
    expectations = [
      [0.0,  ''],
      [1.0,  'test'],
      [0.8,  'testa'],
      [0.8,  'atest'],
      [0.8,  'teast'],
      [0.75, 'est'],
      [0.75, 'tes'],
      [0.75, 'tst'],
      [0.75, 'best'],
      [0.75, 'tost'],
      [0.75, 'tesa'],
      [0.50, 'taex'],
      [0.4,  'aaatestbbb'],
    ]
    expectations.each do |expected, text|
      assert_in_delta expected, @small.similar(text), D
    end
    assert_in_delta 0.75, @small.pattern.longest_subsequence_similar('est'), D
  end

  # A long pattern is fully similar to itself.
  def test_long
    assert_in_delta 1.0, @long.similar(@long.pattern), D
  end
end
|