blackwinter-perseus_match 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +5 -0
- data/README +41 -0
- data/Rakefile +24 -0
- data/bin/perseus_match +298 -0
- data/lib/perseus_match.rb +169 -0
- data/lib/perseus_match/cluster.rb +94 -0
- data/lib/perseus_match/list.rb +77 -0
- data/lib/perseus_match/token_set.rb +248 -0
- data/lib/perseus_match/version.rb +27 -0
- data/spec/perseus_match/cluster_spec.rb +45 -0
- data/spec/perseus_match/list_spec.rb +16 -0
- data/spec/perseus_match/token_set_spec.rb +65 -0
- data/spec/perseus_match_spec.rb +168 -0
- data/spec/spec_helper.rb +18 -0
- metadata +95 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
# Checks how many pairs PerseusMatch::List.pair yields for a three-phrase
# input: size squared for the default call, and (size squared - size) / 2
# when called with :minimal => true.
describe PerseusMatch::List, '::pair' do

  before(:all) do
    @phrases = ['foo', 'bar', 'baz']
    @size    = @phrases.length
  end

  it 'should produce full list of pairs with correct size' do
    pairs = PerseusMatch::List.pair(@phrases)

    pairs.size.should == @size * @size
  end

  it 'should produce minimal list of pairs with correct size' do
    pairs = PerseusMatch::List.pair(@phrases, {}, :minimal => true)

    pairs.size.should == (@size * @size - @size) / 2
  end

end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# These examples are only defined when LINGO_FOUND is truthy.
if LINGO_FOUND
  describe PerseusMatch::TokenSet, ' with lingo' do

    # Reset TokenSet's class-level @tokens before every example.
    before(:each) do
      PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
    end

    it 'should tokenize a string' do
      token_set = PerseusMatch::TokenSet.tokenize('foo bar')

      token_set.should be_an_instance_of(PerseusMatch::TokenSet)
    end

    it 'should report strictly equal TokenSets as ==' do
      left  = PerseusMatch::TokenSet.new('foo bar')
      right = PerseusMatch::TokenSet.new('foo bar')

      left.should == right
    end

    it 'should report strictly equal TokenSets as eql' do
      left  = PerseusMatch::TokenSet.new('foo bar')
      right = PerseusMatch::TokenSet.new('foo bar')

      left.should be_eql(right)
    end

    # Same words, different capitalisation: expected to be ==, but not eql.
    it 'should report slightly equal TokenSets as ==' do
      lower_case = PerseusMatch::TokenSet.new('foo bar')
      title_case = PerseusMatch::TokenSet.new('Foo Bar')

      lower_case.should == title_case
    end

    it 'should *not* report slightly equal TokenSets as eql' do
      lower_case = PerseusMatch::TokenSet.new('foo bar')
      title_case = PerseusMatch::TokenSet.new('Foo Bar')

      lower_case.should_not be_eql(title_case)
    end

    it 'should include form in inspect' do
      inspected = PerseusMatch::TokenSet.new('foo', []).inspect.to_s

      inspected.should =~ /<foo>/
    end

  end
end
|
32
|
+
|
33
|
+
# Exercises TokenSet.tokenize with LINGO_BASE emptied, feeding it a
# prepared token file instead.
describe PerseusMatch::TokenSet, ' without lingo' do

  # Reset TokenSet's class-level @tokens before every example.
  before :each do
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end

  it 'should take a prepared file for tokenization' do
    # prevent lingo from being used
    lingo_base = LINGO_BASE.dup
    LINGO_BASE.replace('')

    temp = nil

    begin
      # NOTE(review): relies on Tempfile.open (with a block) returning the
      # Tempfile object itself -- presumably provided by the nuggets
      # 'tempfile/open' extension; confirm it is loaded for this spec.
      temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
        t.puts(*%w[<foo|?> <bar|?>])
      }

      path = temp.path
      link = 'perseus.tokens'

      # The prepared file is exposed as 'perseus.tokens' in the working
      # directory while tokenizing (presumably where TokenSet looks for
      # it -- verify against lib/perseus_match/token_set.rb).
      Dir.chdir(File.dirname(path)) {
        File.symlink(path, link)

        begin
          PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
        ensure
          # Remove the symlink even when the expectation fails, so it
          # cannot leak into later examples.
          File.unlink(link)
        end
      }
    ensure
      temp.unlink if temp

      # reset lingo base -- must happen even on failure, since LINGO_BASE
      # is mutated in place and shared with every other example
      LINGO_BASE.replace(lingo_base)
    end
  end

end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nuggets/tempfile/open'
|
3
|
+
require 'nuggets/util/i18n'
|
4
|
+
|
5
|
+
# Behavioural spec for PerseusMatch: computes all matchings for a fixed set
# of phrases once, then checks the resulting similarities against several
# thresholds. Only defined when LINGO_FOUND is truthy.
describe PerseusMatch do

  before :all do
    @highly_similar = [
      'Anbetung der Könige',
      'Die Anbetung der Könige'
    ]  # ok

    @similar = [
      # @highly_similar + ...
      'Die Anbetung der Heiligen Drei Könige',
      'dIE AnBeTuNg der heILIGen dREI KÖniGE'
    ]  # ok

    @unfortunately_similar = [
      # @similar + ...
      'Die Die Die Anbetung der Könige',
      'Die Könige der Anbetung',
      'Königsanbetung hoch drei'
    ]  # *not* ok -- eventually try to drop these below the threshold

    @somewhat_similar = @highly_similar + @similar + @unfortunately_similar

    # Two unrelated phrases plus the empty string round out the input.
    phrases = @somewhat_similar + [
      'Drei mal drei macht sechs',
      'Das Ende dieses Blödsinns',
      ''
    ]

    # Write one phrase per line and hand the file to the tokenizer.
    temp = Tempfile.open('perseus_match_spec_temp') { |t|
      t.puts(*phrases)
    }

    begin
      PerseusMatch::TokenSet.tokenize(temp.path)
    ensure
      # Remove the temporary file even if tokenization raises.
      temp.unlink
    end

    @matchings = PerseusMatch.match(phrases)
  end

  it 'should identify identical (non-empty) strings as identical' do
    @matchings.each { |matching|
      if !matching.phrase.empty? && matching.phrase == matching.target
        inform_on_error(matching) { matching.similarity.should == 1.0 }
      end
    }
  end

  it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each { |matching|
      if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase
        inform_on_error(matching) { matching.similarity.should > 0.95 }
      end
    }
  end

  it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each { |matching|
      if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase
        inform_on_error(matching) { matching.similarity.should < 0.98 }
      end
    }
  end

  it 'should identify disjunct (non-empty) strings as disjunct' do
    @matchings.each { |matching|
      if !matching.phrase.empty? && matching.phrase_tokens.disjoint?(matching.target_tokens)
        inform_on_error(matching) { matching.similarity.should == 0.0 }
      end
    }
  end

  it 'should identify empty string as disjunct with anything, even with itself' do
    @matchings.each { |matching|
      if matching.phrase.empty? || matching.target.empty?
        inform_on_error(matching) { matching.similarity.should == 0.0 }
      end
    }
  end

  # Variant (1) inspects the precomputed matchings; variant (2) goes
  # through PerseusMatch.check with the same threshold.
  it 'should identify certain strings as highly similar (1)' do
    @matchings.each { |matching|
      if @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)
        inform_on_error(matching) { matching.similarity.should > 0.9 }
      end
    }
  end

  it 'should identify certain strings as highly similar (2)' do
    @highly_similar.each { |phrase|
      @highly_similar.each { |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.9).should be_true }
      }
    }
  end

  it 'should identify certain strings as similar (1)' do
    @matchings.each { |matching|
      if @similar.include?(matching.phrase) && @similar.include?(matching.target)
        inform_on_error(matching) { matching.similarity.should > 0.8 }
      end
    }
  end

  it 'should identify certain strings as similar (2)' do
    @similar.each { |phrase|
      @similar.each { |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.8).should be_true }
      }
    }
  end

  it 'should *not* identify other strings as similar (1)' do
    @matchings.each { |matching|
      if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
        inform_on_error(matching) { matching.similarity.should_not > 0.8 }
      end
    }
  end

  it 'should *not* identify other strings as similar (2)' do
    @matchings.each { |matching|
      if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
        inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
      end
    }
  end

  it 'should be symmetrical' do
    # Similarity seen so far, keyed by [phrase, target].
    similarities = {}

    @matchings.each { |matching|
      # If the reversed pair was recorded earlier, both directions must
      # agree; otherwise remember this direction for later comparison.
      if similarity = similarities[[matching.target, matching.phrase]]
        inform_on_error(matching) { similarity.should == matching.similarity }
      else
        similarities[[matching.phrase, matching.target]] = matching.similarity
      end
    }
  end

  it 'should calculate pair distance' do
    PerseusMatch.distance('foo', 'bar').class.should < Numeric
  end

  it 'should be clusterable' do
    PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
  end

  it 'should be checkable (1)' do
    PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
  end

  it 'should be checkable (2)' do
    lambda {
      begin
        PerseusMatch.check!('foo', 'bar', 0, :>)
      rescue PerseusMatch::CheckFailedError => err
        # Inspect the message, then re-raise so the outer raise_error
        # expectation still sees the exception.
        err.to_s.should =~ /0/
        raise err
      end
    }.should raise_error(PerseusMatch::CheckFailedError)
  end

end if LINGO_FOUND
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Load the library from ../lib (relative to this file), but only when
# PerseusMatch has not been defined already.
if !Object.const_defined?(:PerseusMatch)
  lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')

  $LOAD_PATH.push(lib_dir)
  require 'perseus_match'
end
|
5
|
+
|
6
|
+
# Runs the given block and returns its value. If the block raises
# Spec::Expectations::ExpectationNotMetError, the supplied +args+ (if any)
# are printed with Kernel#p, framed by blank lines, and the same exception
# is then re-raised. Any other exception propagates without output.
#
# args:: context objects to print when the expectation fails
def inform_on_error(*args)
  yield
rescue Spec::Expectations::ExpectationNotMetError
  unless args.empty?
    puts
    p(*args)
    puts
  end

  # Bare raise re-raises the exception currently being handled.
  raise
end
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: blackwinter-perseus_match
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jens Wille
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-12-09 00:00:00 -08:00
|
13
|
+
default_executable: perseus_match
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: ruby-backports
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: ruby-nuggets
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 0.4.0
|
32
|
+
version:
|
33
|
+
description: Fuzzy string matching based on linguistic analysis
|
34
|
+
email: jens.wille@uni-koeln.de
|
35
|
+
executables:
|
36
|
+
- perseus_match
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files:
|
40
|
+
- COPYING
|
41
|
+
- ChangeLog
|
42
|
+
- README
|
43
|
+
files:
|
44
|
+
- lib/perseus_match/list.rb
|
45
|
+
- lib/perseus_match/version.rb
|
46
|
+
- lib/perseus_match/token_set.rb
|
47
|
+
- lib/perseus_match/cluster.rb
|
48
|
+
- lib/perseus_match.rb
|
49
|
+
- bin/perseus_match
|
50
|
+
- Rakefile
|
51
|
+
- COPYING
|
52
|
+
- ChangeLog
|
53
|
+
- LINGO_BASE
|
54
|
+
- README
|
55
|
+
- spec/spec_helper.rb
|
56
|
+
- spec/perseus_match/list_spec.rb
|
57
|
+
- spec/perseus_match/cluster_spec.rb
|
58
|
+
- spec/perseus_match/token_set_spec.rb
|
59
|
+
- spec/perseus_match_spec.rb
|
60
|
+
has_rdoc: true
|
61
|
+
homepage: http://prometheus.rubyforge.org/perseus_match
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options:
|
64
|
+
- --line-numbers
|
65
|
+
- --inline-source
|
66
|
+
- --title
|
67
|
+
- perseus_match Application documentation
|
68
|
+
- --main
|
69
|
+
- README
|
70
|
+
- --charset
|
71
|
+
- UTF-8
|
72
|
+
- --all
|
73
|
+
require_paths:
|
74
|
+
- lib
|
75
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0"
|
80
|
+
version:
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
requirements: []
|
88
|
+
|
89
|
+
rubyforge_project: prometheus
|
90
|
+
rubygems_version: 1.2.0
|
91
|
+
signing_key:
|
92
|
+
specification_version: 2
|
93
|
+
summary: Fuzzy string matching based on linguistic analysis
|
94
|
+
test_files: []
|
95
|
+
|