blackwinter-perseus_match 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,16 @@
1
# Checks how many pairs PerseusMatch::List.pair builds from a list of phrases,
# both for the default call and for the :minimal variant.
describe PerseusMatch::List, '::pair' do

  before(:all) do
    @phrases = ['foo', 'bar', 'baz']
    @count   = @phrases.length
  end

  it 'should produce full list of pairs with correct size' do
    # Expected size is n * n for n phrases.
    pairs = PerseusMatch::List.pair(@phrases)
    pairs.size.should == @count * @count
  end

  it 'should produce minimal list of pairs with correct size' do
    # Expected size is n * (n - 1) / 2 for n phrases.
    pairs = PerseusMatch::List.pair(@phrases, {}, :minimal => true)
    pairs.size.should == @count * (@count - 1) / 2
  end

end
@@ -0,0 +1,65 @@
1
# TokenSet behaviour that is only exercised when LINGO_FOUND is truthy:
# tokenization, the difference between == and eql?, and the inspect output.
describe PerseusMatch::TokenSet, ' with lingo' do

  before(:each) do
    # Reset the class-level @tokens so no example sees state from another one.
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end

  it 'should tokenize a string' do
    result = PerseusMatch::TokenSet.tokenize('foo bar')
    result.should be_an_instance_of(PerseusMatch::TokenSet)
  end

  it 'should report strictly equal TokenSets as ==' do
    left  = PerseusMatch::TokenSet.new('foo bar')
    right = PerseusMatch::TokenSet.new('foo bar')
    left.should == right
  end

  it 'should report strictly equal TokenSets as eql' do
    left  = PerseusMatch::TokenSet.new('foo bar')
    right = PerseusMatch::TokenSet.new('foo bar')
    left.should be_eql(right)
  end

  it 'should report slightly equal TokenSets as ==' do
    lower = PerseusMatch::TokenSet.new('foo bar')
    mixed = PerseusMatch::TokenSet.new('Foo Bar')
    lower.should == mixed
  end

  it 'should *not* report slightly equal TokenSets as eql' do
    lower = PerseusMatch::TokenSet.new('foo bar')
    mixed = PerseusMatch::TokenSet.new('Foo Bar')
    lower.should_not be_eql(mixed)
  end

  it 'should include form in inspect' do
    token_set = PerseusMatch::TokenSet.new('foo', [])
    token_set.inspect.to_s.should =~ /<foo>/
  end

end if LINGO_FOUND
32
+
33
# TokenSet behaviour when lingo is made unavailable: tokenization is expected
# to work from a prepared 'perseus.tokens' file in the current directory.
describe PerseusMatch::TokenSet, ' without lingo' do

  before :each do
    # Reset the class-level @tokens so no example sees state from another one.
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end

  it 'should take a prepared file for tokenization' do
    # prevent lingo from being used
    lingo_base = LINGO_BASE.dup
    LINGO_BASE.replace('')

    begin
      # NOTE(review): relies on Tempfile.open returning the Tempfile object
      # when given a block (presumably via nuggets/tempfile/open) -- confirm.
      temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
        t.puts(*%w[<foo|?> <bar|?>])
      }

      begin
        path = temp.path
        link = 'perseus.tokens'

        Dir.chdir(File.dirname(path)) {
          File.symlink(path, link)

          begin
            PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
          ensure
            # Always remove the link; a stale one would make the next run's
            # File.symlink fail.
            File.unlink(link)
          end
        }
      ensure
        temp.unlink
      end
    ensure
      # reset lingo base, even when the example fails, so later specs are
      # not left with an emptied LINGO_BASE
      LINGO_BASE.replace(lingo_base)
    end
  end

end
@@ -0,0 +1,168 @@
1
+ require 'rubygems'
2
+ require 'nuggets/tempfile/open'
3
+ require 'nuggets/util/i18n'
4
+
5
# Similarity specs for PerseusMatch, run only when LINGO_FOUND is truthy.
# A fixed set of German phrases is matched once in before(:all); most examples
# then walk the resulting matchings and assert on the ones that satisfy a
# given precondition.
describe PerseusMatch do

  before :all do
    @highly_similar = [
      'Anbetung der Könige',
      'Die Anbetung der Könige'
    ] # ok

    @similar = [
      # @highly_similar + ...
      'Die Anbetung der Heiligen Drei Könige',
      'dIE AnBeTuNg der heILIGen dREI KÖniGE'
    ] # ok

    @unfortunately_similar = [
      # @similar + ...
      'Die Die Die Anbetung der Könige',
      'Die Könige der Anbetung',
      'Königsanbetung hoch drei'
    ] # *not* ok -- eventually try to drop these below the threshold

    @somewhat_similar = @highly_similar + @similar + @unfortunately_similar

    phrases = @somewhat_similar + [
      'Drei mal drei macht sechs',
      'Das Ende dieses Blödsinns',
      ''
    ]

    temp = Tempfile.open('perseus_match_spec_temp') { |t|
      t.puts(*phrases)
    }

    # Tokenize all phrases from the file up front; the temp file is removed
    # even if tokenization raises.
    begin
      PerseusMatch::TokenSet.tokenize(temp.path)
    ensure
      temp.unlink
    end

    @matchings = PerseusMatch.match(phrases)
  end

  it 'should identify identical (non-empty) strings as identical' do
    @matchings.each do |matching|
      next unless !matching.phrase.empty? && matching.phrase == matching.target

      inform_on_error(matching) { matching.similarity.should == 1.0 }
    end
  end

  it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |matching|
      next unless !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase

      inform_on_error(matching) { matching.similarity.should > 0.95 }
    end
  end

  it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |matching|
      next unless !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase

      inform_on_error(matching) { matching.similarity.should < 0.98 }
    end
  end

  it 'should identify disjunct (non-empty) strings as disjunct' do
    @matchings.each do |matching|
      next unless !matching.phrase.empty? && matching.phrase_tokens.disjoint?(matching.target_tokens)

      inform_on_error(matching) { matching.similarity.should == 0.0 }
    end
  end

  it 'should identify empty string as disjunct with anything, even with itself' do
    @matchings.each do |matching|
      next unless matching.phrase.empty? || matching.target.empty?

      inform_on_error(matching) { matching.similarity.should == 0.0 }
    end
  end

  it 'should identify certain strings as highly similar (1)' do
    @matchings.each do |matching|
      next unless @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should > 0.9 }
    end
  end

  it 'should identify certain strings as highly similar (2)' do
    @highly_similar.each do |phrase|
      @highly_similar.each do |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.9).should be_true }
      end
    end
  end

  it 'should identify certain strings as similar (1)' do
    @matchings.each do |matching|
      next unless @similar.include?(matching.phrase) && @similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should > 0.8 }
    end
  end

  it 'should identify certain strings as similar (2)' do
    @similar.each do |phrase|
      @similar.each do |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.8).should be_true }
      end
    end
  end

  it 'should *not* identify other strings as similar (1)' do
    @matchings.each do |matching|
      next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should_not > 0.8 }
    end
  end

  it 'should *not* identify other strings as similar (2)' do
    @matchings.each do |matching|
      next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

      inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
    end
  end

  it 'should be symmetrical' do
    # Maps [phrase, target] to the similarity seen for that ordering, so the
    # reversed pair can be compared against it when it shows up later.
    similarities = {}

    @matchings.each do |matching|
      if (similarity = similarities[[matching.target, matching.phrase]])
        inform_on_error(matching) { similarity.should == matching.similarity }
      else
        similarities[[matching.phrase, matching.target]] = matching.similarity
      end
    end
  end

  it 'should calculate pair distance' do
    PerseusMatch.distance('foo', 'bar').class.should < Numeric
  end

  it 'should be clusterable' do
    PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
  end

  it 'should be checkable (1)' do
    PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
  end

  it 'should be checkable (2)' do
    lambda {
      begin
        PerseusMatch.check!('foo', 'bar', 0, :>)
      rescue PerseusMatch::CheckFailedError => err
        # The error message is expected to mention the threshold before the
        # error is passed on to raise_error.
        err.to_s.should =~ /0/
        raise err
      end
    }.should raise_error(PerseusMatch::CheckFailedError)
  end

end if LINGO_FOUND
@@ -0,0 +1,18 @@
1
# When PerseusMatch has not been loaded yet, append the sibling lib/ directory
# to the load path and require the library from there.
unless Object.const_defined?(:PerseusMatch)
  lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
  $LOAD_PATH.push(lib_dir)
  require 'perseus_match'
end
5
+
6
# Runs the given block and returns its value. If the block raises
# Spec::Expectations::ExpectationNotMetError, the given +args+ (if any) are
# printed with +p+, framed by blank lines, and the same error is re-raised.
# Any other exception passes through without output.
def inform_on_error(*args)
  yield
rescue Spec::Expectations::ExpectationNotMetError
  unless args.empty?
    puts
    p(*args)
    puts
  end

  # Bare raise re-raises the expectation failure that is being handled.
  raise
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: blackwinter-perseus_match
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Jens Wille
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-12-09 00:00:00 -08:00
13
+ default_executable: perseus_match
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: ruby-backports
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: ruby-nuggets
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 0.4.0
32
+ version:
33
+ description: Fuzzy string matching based on linguistic analysis
34
+ email: jens.wille@uni-koeln.de
35
+ executables:
36
+ - perseus_match
37
+ extensions: []
38
+
39
+ extra_rdoc_files:
40
+ - COPYING
41
+ - ChangeLog
42
+ - README
43
+ files:
44
+ - lib/perseus_match/list.rb
45
+ - lib/perseus_match/version.rb
46
+ - lib/perseus_match/token_set.rb
47
+ - lib/perseus_match/cluster.rb
48
+ - lib/perseus_match.rb
49
+ - bin/perseus_match
50
+ - Rakefile
51
+ - COPYING
52
+ - ChangeLog
53
+ - LINGO_BASE
54
+ - README
55
+ - spec/spec_helper.rb
56
+ - spec/perseus_match/list_spec.rb
57
+ - spec/perseus_match/cluster_spec.rb
58
+ - spec/perseus_match/token_set_spec.rb
59
+ - spec/perseus_match_spec.rb
60
+ has_rdoc: true
61
+ homepage: http://prometheus.rubyforge.org/perseus_match
62
+ post_install_message:
63
+ rdoc_options:
64
+ - --line-numbers
65
+ - --inline-source
66
+ - --title
67
+ - perseus_match Application documentation
68
+ - --main
69
+ - README
70
+ - --charset
71
+ - UTF-8
72
+ - --all
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project: prometheus
90
+ rubygems_version: 1.2.0
91
+ signing_key:
92
+ specification_version: 2
93
+ summary: Fuzzy string matching based on linguistic analysis
94
+ test_files: []
95
+