text 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/text/version.rb +1 -1
- data/lib/text/white_similarity.rb +14 -11
- data/test/test_white_similarity.rb +11 -0
- metadata +2 -2
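data/lib/text/version.rb
CHANGED
(the +1 -1 hunk for this file is missing from this extract)
data/lib/text/white_similarity.rb
CHANGED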
@@ -34,25 +34,28 @@ module Text
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def similarity(str1, str2)
|
37
|
-
pairs1
|
38
|
-
pairs2
|
37
|
+
pairs1 = word_letter_pairs(str1)
|
38
|
+
pairs2 = word_letter_pairs(str2).dup
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
union = pairs1.count + pairs2.count
|
41
|
+
|
42
|
+
intersection = 0
|
43
|
+
pairs1.each_with_index do |pair1|
|
44
|
+
if index = pairs2.index(pair1)
|
45
|
+
intersection += 1
|
46
|
+
pairs2.delete_at(index)
|
47
|
+
end
|
48
|
+
end
|
44
49
|
|
45
50
|
(2.0 * intersection) / union
|
46
51
|
end
|
47
52
|
|
48
53
|
private
|
49
54
|
def word_letter_pairs(str)
|
50
|
-
@word_letter_pairs[str] ||=
|
51
|
-
|
55
|
+
@word_letter_pairs[str] ||=
|
56
|
+
str.upcase.split(/\s+/).map{ |word|
|
52
57
|
(0 ... (word.length - 1)).map { |i| word[i, 2] }
|
53
|
-
}.flatten
|
54
|
-
[Set.new(pairs), pairs.length]
|
55
|
-
)
|
58
|
+
}.flatten.freeze
|
56
59
|
end
|
57
60
|
end
|
58
61
|
end
|
data/test/test_white_similarity.rb
CHANGED
@@ -27,6 +27,12 @@ class WhiteSimilarityTest < Test::Unit::TestCase
|
|
27
27
|
assert_in_delta 0.0, white.similarity(word, "Sold"), 0.01
|
28
28
|
end
|
29
29
|
|
30
|
+
def test_should_not_clobber_cached_values
|
31
|
+
white = Text::WhiteSimilarity.new
|
32
|
+
word = "Healed"
|
33
|
+
assert_equal white.similarity(word, word), white.similarity(word, word)
|
34
|
+
end
|
35
|
+
|
30
36
|
def test_similarity_with_examples_from_article
|
31
37
|
assert_in_delta 0.4, Text::WhiteSimilarity.similarity("GGGGG", "GG"), 0.01
|
32
38
|
assert_in_delta 0.56, Text::WhiteSimilarity.similarity("REPUBLIC OF FRANCE", "FRANCE"), 0.01
|
@@ -35,4 +41,9 @@ class WhiteSimilarityTest < Test::Unit::TestCase
|
|
35
41
|
assert_in_delta 0.61, Text::WhiteSimilarity.similarity("FRENCH REPUBLIC", "REPUBLIC OF CUBA"), 0.01
|
36
42
|
end
|
37
43
|
|
44
|
+
def test_similarity_with_equal_strings
|
45
|
+
assert_equal 1.0, Text::WhiteSimilarity.similarity("aaaaa", "aaaaa")
|
46
|
+
assert_equal 1.0, Text::WhiteSimilarity.similarity("REPUBLIC OF CUBA", "REPUBLIC OF CUBA")
|
47
|
+
end
|
48
|
+
|
38
49
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2012-01-09 00:00:00.000000000 Z
|
15
15
|
dependencies: []
|
16
16
|
description: ! 'A collection of text algorithms: Levenshtein, Soundex, Metaphone,
|
17
17
|
Double Metaphone, Figlet, Porter Stemming'
|