amatch 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/CHANGES +3 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +5 -12
- data/VERSION +1 -1
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +298 -74
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +27 -8
- data/README.rdoc +0 -128
Binary file
|
data/lib/amatch/version.rb
CHANGED
data/tests/test_damerau_levenshtein.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'amatch'
|
3
|
+
|
4
|
+
class TestDamerauLevenshtein < Test::Unit::TestCase
|
5
|
+
include Amatch
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@d = 0.000001
|
9
|
+
@empty = DamerauLevenshtein.new('')
|
10
|
+
@simple = DamerauLevenshtein.new('test')
|
11
|
+
@long = DamerauLevenshtein.new('A' * 160)
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_version
|
15
|
+
assert_kind_of String, Amatch::VERSION
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_match
|
19
|
+
assert_equal 4, @simple.match('')
|
20
|
+
assert_equal 0, @simple.match('test')
|
21
|
+
assert_equal 1, @simple.match('testa')
|
22
|
+
assert_equal 1, @simple.match('atest')
|
23
|
+
assert_equal 1, @simple.match('teast')
|
24
|
+
assert_equal 1, @simple.match('est')
|
25
|
+
assert_equal 1, @simple.match('tes')
|
26
|
+
assert_equal 1, @simple.match('tst')
|
27
|
+
assert_equal 1, @simple.match('best')
|
28
|
+
assert_equal 1, @simple.match('tost')
|
29
|
+
assert_equal 1, @simple.match('tesa')
|
30
|
+
assert_equal 3, @simple.match('taex')
|
31
|
+
assert_equal 6, @simple.match('aaatestbbb')
|
32
|
+
assert_equal 1, @simple.match('tset')
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_search
|
36
|
+
assert_equal 4, @simple.search('')
|
37
|
+
assert_equal 0, @empty.search('')
|
38
|
+
assert_equal 0, @empty.search('test')
|
39
|
+
assert_equal 0, @simple.search('aaatestbbb')
|
40
|
+
assert_equal 3, @simple.search('aaataexbbb')
|
41
|
+
assert_equal 4, @simple.search('aaaaaaaaa')
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_array_result
|
45
|
+
assert_equal [1, 0], @simple.match(["tets", "test"])
|
46
|
+
assert_equal [1, 0], @simple.search(["tetsaaa", "testaaa"])
|
47
|
+
assert_raises(TypeError) { @simple.match([:foo, "bar"]) }
|
48
|
+
end
|
49
|
+
|
50
|
+
def test_pattern_setting
|
51
|
+
assert_raises(TypeError) { @simple.pattern = :something }
|
52
|
+
assert_equal 0, @simple.match('test')
|
53
|
+
@simple.pattern = ''
|
54
|
+
assert_equal 4, @simple.match('test')
|
55
|
+
@simple.pattern = 'test'
|
56
|
+
assert_equal 0, @simple.match('test')
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_similar
|
60
|
+
assert_in_delta 1, @empty.similar(''), @d
|
61
|
+
assert_in_delta 0, @empty.similar('not empty'), @d
|
62
|
+
assert_in_delta 0.0, @simple.similar(''), @d
|
63
|
+
assert_in_delta 1.0, @simple.similar('test'), @d
|
64
|
+
assert_in_delta 0.8, @simple.similar('testa'), @d
|
65
|
+
assert_in_delta 0.8, @simple.similar('atest'), @d
|
66
|
+
assert_in_delta 0.8, @simple.similar('teast'), @d
|
67
|
+
assert_in_delta 0.75, @simple.similar('est'), @d
|
68
|
+
assert_in_delta 0.75, @simple.similar('tes'), @d
|
69
|
+
assert_in_delta 0.75, @simple.similar('tst'), @d
|
70
|
+
assert_in_delta 0.75, @simple.similar('best'), @d
|
71
|
+
assert_in_delta 0.75, @simple.similar('tost'), @d
|
72
|
+
assert_in_delta 0.75, @simple.similar('tesa'), @d
|
73
|
+
assert_in_delta 0.25, @simple.similar('taex'), @d
|
74
|
+
assert_in_delta 0.4, @simple.similar('aaatestbbb'), @d
|
75
|
+
assert_in_delta 0.75, @simple.pattern.damerau_levenshtein_similar('est'), @d
|
76
|
+
end
|
77
|
+
|
78
|
+
def test_transpositions
|
79
|
+
assert_in_delta 1.0, 'atestatest'.damerau_levenshtein_similar('atestatest'), @d
|
80
|
+
assert_in_delta 0.9, 'atestatest'.damerau_levenshtein_similar('atetsatest'), @d
|
81
|
+
assert_in_delta 0.8, 'atestatest'.damerau_levenshtein_similar('atetsatset'), @d
|
82
|
+
end
|
83
|
+
|
84
|
+
def test_long
|
85
|
+
assert_in_delta 1.0, @long.similar(@long.pattern), @d
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_long2
|
89
|
+
a = "lost this fantasy, this fantasy, this fantasy, this fantasy, this fantasy, this fantasy\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
|
90
|
+
b = "lost\r\n\r\nGood love Neat work\r\n\r\nSuper job Fancy work\r\n\r\nPants job Cool work"
|
91
|
+
assert a.damerau_levenshtein_similar(b)
|
92
|
+
end
|
93
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amatch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Florian Frank
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: gem_hadar
|
@@ -52,17 +52,32 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: |
|
56
70
|
Amatch is a library for approximate string matching and searching in strings.
|
57
71
|
Several algorithms can be used to do this, and it's also possible to compute a
|
58
72
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
59
73
|
email: flori@ping.de
|
60
74
|
executables:
|
61
|
-
- agrep.rb
|
75
|
+
- agrep
|
76
|
+
- dupfind
|
62
77
|
extensions:
|
63
78
|
- ext/extconf.rb
|
64
79
|
extra_rdoc_files:
|
65
|
-
- README.rdoc
|
80
|
+
- README.md
|
66
81
|
- lib/amatch.rb
|
67
82
|
- lib/amatch/polite.rb
|
68
83
|
- lib/amatch/rude.rb
|
@@ -75,22 +90,25 @@ files:
|
|
75
90
|
- CHANGES
|
76
91
|
- COPYING
|
77
92
|
- Gemfile
|
78
|
-
- README.rdoc
|
93
|
+
- README.md
|
79
94
|
- Rakefile
|
80
95
|
- VERSION
|
81
96
|
- amatch.gemspec
|
82
|
-
- bin/agrep.rb
|
97
|
+
- bin/agrep
|
98
|
+
- bin/dupfind
|
83
99
|
- ext/amatch_ext.c
|
84
100
|
- ext/common.h
|
85
101
|
- ext/extconf.rb
|
86
102
|
- ext/pair.c
|
87
103
|
- ext/pair.h
|
104
|
+
- images/amatch_ext.png
|
88
105
|
- install.rb
|
89
106
|
- lib/amatch.rb
|
90
107
|
- lib/amatch/.keep
|
91
108
|
- lib/amatch/polite.rb
|
92
109
|
- lib/amatch/rude.rb
|
93
110
|
- lib/amatch/version.rb
|
111
|
+
- tests/test_damerau_levenshtein.rb
|
94
112
|
- tests/test_hamming.rb
|
95
113
|
- tests/test_jaro.rb
|
96
114
|
- tests/test_jaro_winkler.rb
|
@@ -101,14 +119,14 @@ files:
|
|
101
119
|
- tests/test_sellers.rb
|
102
120
|
homepage: http://github.com/flori/amatch
|
103
121
|
licenses:
|
104
|
-
-
|
122
|
+
- Apache-2.0
|
105
123
|
metadata: {}
|
106
124
|
post_install_message:
|
107
125
|
rdoc_options:
|
108
126
|
- "--title"
|
109
127
|
- Amatch - Approximate Matching
|
110
128
|
- "--main"
|
111
|
-
- README.rdoc
|
129
|
+
- README.md
|
112
130
|
require_paths:
|
113
131
|
- lib
|
114
132
|
- ext
|
@@ -129,6 +147,7 @@ signing_key:
|
|
129
147
|
specification_version: 4
|
130
148
|
summary: Approximate String Matching library
|
131
149
|
test_files:
|
150
|
+
- tests/test_damerau_levenshtein.rb
|
132
151
|
- tests/test_hamming.rb
|
133
152
|
- tests/test_jaro.rb
|
134
153
|
- tests/test_jaro_winkler.rb
|
data/README.rdoc
DELETED
@@ -1,128 +0,0 @@
|
|
1
|
-
= amatch - Approximate Matching Extension for Ruby
|
2
|
-
|
3
|
-
== Description
|
4
|
-
|
5
|
-
This is a collection of classes that can be used for Approximate
|
6
|
-
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
-
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
-
Hamming distance, the longest common subsequence length, the longest common
|
9
|
-
substring length, the pair distance metric, the Jaro-Winkler metric.
|
10
|
-
|
11
|
-
== Download
|
12
|
-
|
13
|
-
The latest version of <b>amatch</b> can be found at
|
14
|
-
|
15
|
-
* http://rubyforge.org/frs/?group_id=390
|
16
|
-
|
17
|
-
Online Documentation should be located at
|
18
|
-
|
19
|
-
* http://amatch.rubyforge.org
|
20
|
-
|
21
|
-
== Installation
|
22
|
-
|
23
|
-
Just type into the command line as root:
|
24
|
-
|
25
|
-
# ruby install.rb
|
26
|
-
|
27
|
-
If you have installed rake (rake.rubyforge.org), you can also type:
|
28
|
-
|
29
|
-
# rake install
|
30
|
-
|
31
|
-
To install this extension as a gem type
|
32
|
-
|
33
|
-
# gem install amatch
|
34
|
-
|
35
|
-
== Examples
|
36
|
-
require 'amatch'
|
37
|
-
# => true
|
38
|
-
include Amatch
|
39
|
-
# => Object
|
40
|
-
|
41
|
-
m = Sellers.new("pattern")
|
42
|
-
# => #<Amatch::Sellers:0x40366324>
|
43
|
-
m.match("pattren")
|
44
|
-
# => 2.0
|
45
|
-
m.substitution = m.insertion = 3
|
46
|
-
# => 3
|
47
|
-
m.match("pattren")
|
48
|
-
# => 4.0
|
49
|
-
m.reset_weights
|
50
|
-
# => #<Amatch::Sellers:0x40366324>
|
51
|
-
m.match(["pattren","parent"])
|
52
|
-
# => [2.0, 4.0]
|
53
|
-
m.search("abcpattrendef")
|
54
|
-
# => 2.0
|
55
|
-
|
56
|
-
m = Levenshtein.new("pattern")
|
57
|
-
# => #<Amatch::Levenshtein:0x4035919c>
|
58
|
-
m.match("pattren")
|
59
|
-
# => 2
|
60
|
-
m.search("abcpattrendef")
|
61
|
-
# => 2
|
62
|
-
"pattern language".levenshtein_similar("language of patterns")
|
63
|
-
# => 0.2
|
64
|
-
|
65
|
-
m = Hamming.new("pattern")
|
66
|
-
# => #<Amatch::Hamming:0x40350858>
|
67
|
-
m.match("pattren")
|
68
|
-
# => 2
|
69
|
-
"pattern language".hamming_similar("language of patterns")
|
70
|
-
# => 0.1
|
71
|
-
|
72
|
-
m = PairDistance.new("pattern")
|
73
|
-
# => #<Amatch::PairDistance:0x40349be8>
|
74
|
-
m.match("pattr en")
|
75
|
-
# => 0.545454545454545
|
76
|
-
m.match("pattr en", nil)
|
77
|
-
# => 0.461538461538462
|
78
|
-
m.match("pattr en", /t+/)
|
79
|
-
# => 0.285714285714286
|
80
|
-
"pattern language".pair_distance_similar("language of patterns")
|
81
|
-
# => 0.928571428571429
|
82
|
-
|
83
|
-
m = LongestSubsequence.new("pattern")
|
84
|
-
# => #<Amatch::LongestSubsequence:0x4033e900>
|
85
|
-
m.match("pattren")
|
86
|
-
# => 6
|
87
|
-
"pattern language".longest_subsequence_similar("language of patterns")
|
88
|
-
# => 0.4
|
89
|
-
|
90
|
-
m = LongestSubstring.new("pattern")
|
91
|
-
# => #<Amatch::LongestSubstring:0x403378d0>
|
92
|
-
m.match("pattren")
|
93
|
-
# => 4
|
94
|
-
"pattern language".longest_substring_similar("language of patterns")
|
95
|
-
# => 0.4
|
96
|
-
|
97
|
-
m = Jaro.new("pattern")
|
98
|
-
# => #<Amatch::Jaro:0x363b70>
|
99
|
-
m.match("paTTren")
|
100
|
-
# => 0.952380952380952
|
101
|
-
m.ignore_case = false
|
102
|
-
m.match("paTTren")
|
103
|
-
# => 0.742857142857143
|
104
|
-
"pattern language".jaro_similar("language of patterns")
|
105
|
-
# => 0.672222222222222
|
106
|
-
|
107
|
-
m = JaroWinkler.new("pattern")
|
108
|
-
# #<Amatch::JaroWinkler:0x3530b8>
|
109
|
-
m.match("paTTren")
|
110
|
-
# => 0.971428571712403
|
111
|
-
m.ignore_case = false
|
112
|
-
m.match("paTTren")
|
113
|
-
# => 0.79428571505206
|
114
|
-
m.scaling_factor = 0.05
|
115
|
-
m.match("pattren")
|
116
|
-
# => 0.961904762046678
|
117
|
-
"pattern language".jarowinkler_similar("language of patterns")
|
118
|
-
# => 0.672222222222222
|
119
|
-
|
120
|
-
== Author
|
121
|
-
|
122
|
-
Florian Frank mailto:flori@ping.de
|
123
|
-
|
124
|
-
== License
|
125
|
-
|
126
|
-
This is free software; you can redistribute it and/or modify it under
|
127
|
-
the terms of the GNU General Public License Version 2 as published by
|
128
|
-
the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
|