similis 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/similis.rb +131 -0
- data/lib/similis/levenshtein_processor.rb +28 -0
- data/lib/similis/soundex_processor.rb +14 -0
- data/lib/similis/version.rb +4 -0
- metadata +49 -0
data/lib/similis.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'similis/soundex_processor'
|
2
|
+
require 'similis/levenshtein_processor'
|
3
|
+
|
4
|
+
# Similis looks up the entries of a word list that are most similar to a
# given word. Plain lists are compared with a weighted Levenshtein distance;
# lists registered through Similis.preprocess are compared using the values
# computed ahead of time by the chosen algorithm (currently only :soundex).
module Similis
  # Preprocessed lists registered through Similis.preprocess, keyed by the
  # caller-supplied key.
  @@lists = Hash.new
  # Symbol of the algorithm most recently run through Similis.soundex or
  # Similis.levenshtein; nil until one of them has been called.
  @@last_method_used = nil
  VERSION = "0.1"

  # Returns :soundex or :levenshtein depending on which algorithm ran last,
  # or nil when none has run yet.
  def self.last_method_used
    @@last_method_used
  end

  # Holds a list of words together with the values computed for them by a
  # preprocessing algorithm such as soundex.
  class ToBeProcessedList
    # Maps the algorithm symbol accepted by #with to the class implementing it.
    PROCESSORS = {:soundex => SoundexProcessor}

    # list - the words to preprocess (anything responding to #each).
    def initialize(list)
      @words = list
      @processed_words = []
      @method_used = nil
    end

    # Returns the array of {:word => ..., :value => ...} hashes built by
    # #with (empty until #with has been called).
    def get_processed_list
      @processed_words
    end

    # Returns the algorithm symbol passed to the last #with call, or nil.
    def get_method
      @method_used
    end

    # Computes and stores the value of every word using the given algorithm.
    #
    # method - a key of PROCESSORS (e.g. :soundex).
    #
    # Returns the original word list, as the previous implementation did.
    # Raises ArgumentError when the algorithm is not listed in PROCESSORS.
    def with(method)
      processor = PROCESSORS[method]
      if processor.nil?
        raise ArgumentError, "unknown preprocessing method: #{method.inspect}"
      end

      @method_used = method
      # Rebuild in place so a second call does not append duplicate entries,
      # while references handed out by #get_processed_list stay valid.
      @processed_words.replace(
        @words.map { |w| {:word => w, :value => processor.calculate(w)} }
      )
      @words
    end
  end

  # Placeholder; always returns nil.
  def self.metaphone
    #TODO: Implement
  end

  # Returns the value SoundexProcessor.calculate produces for the string and
  # records :soundex as the last method used.
  def self.soundex(string)
    @@last_method_used = :soundex
    SoundexProcessor.calculate string
  end

  # Placeholder; always returns nil.
  def self.needleman_wunsch
    #TODO: Implement
  end

  # Registers word lists for later lookup by key.
  #
  # Usage: Similis.preprocess({:key => list_of_words}).with(:algorithm)
  #
  # my_hash - hash of key => list of words. Every pair is registered (the
  #           previous implementation silently dropped all but the first).
  #
  # Returns the ToBeProcessedList created for the first key, so that #with
  # can be chained as shown above.
  # Raises ArgumentError when the hash is empty.
  def self.preprocess(my_hash)
    if my_hash.empty?
      raise ArgumentError, "preprocess expects a non-empty hash of key => word list"
    end

    my_hash.each { |key, words| @@lists[key] = ToBeProcessedList.new(words) }
    @@lists[my_hash.keys.first]
  end

  # Calculates the weighted Levenshtein distance between two words and
  # records :levenshtein as the last method used.
  #
  # ins, del, sub - costs of an insertion, a deletion and a substitution.
  #
  # Returns whatever LevenshteinProcessor.calculate returns for the pair.
  def self.levenshtein(word, other, ins=2, del=1, sub=1)
    @@last_method_used = :levenshtein
    LevenshteinProcessor.calculate(word, other, ins, del, sub)
  end

  # The default method for finding a similar word: Levenshtein distance with
  # the default costs of Similis.levenshtein.
  #
  # word - the word to look up.
  # list - the candidate words. String candidates are compared (and
  #        returned) with "\n" and "\r" removed; the caller's strings are
  #        left untouched.
  #
  # Returns the closest candidate when exactly one has the smallest
  # distance, an array of all closest candidates (in list order) on a tie,
  # or nil when no candidate produced a distance.
  def self.default_method(word, list)
    best_distance = nil
    closest = []

    list.each do |candidate|
      # Work on a cleaned copy: mutating the caller's strings is a surprising
      # side effect and fails outright on frozen strings.
      cleaned = candidate.is_a?(String) ? candidate.delete("\r\n") : candidate

      distance = levenshtein(word, cleaned)
      # A nil distance means one side of the comparison was nil; skip it.
      next if distance.nil?

      if best_distance.nil? || distance < best_distance
        best_distance = distance
        closest = [cleaned]
      elsif distance == best_distance
        closest << cleaned
      end
    end

    single_or_all(closest)
  end

  # Finds the most similar words using a preprocessed list.
  #
  # word - the word to look up.
  # list - array of {:word => ..., :value => ...} hashes as built by
  #        ToBeProcessedList#with(:soundex).
  #
  # Returns the single matching word, an array of words when several share
  # the soundex value of word, or nil when nothing matches or word has no
  # soundex value.
  def self.find_using_soundex(word, list)
    soundex_key = soundex word
    # Without a code there is nothing meaningful to match; in particular a
    # nil key must not match entries whose own value is nil.
    return nil if soundex_key.nil?

    matches = list.select { |item| item[:value] == soundex_key }
    single_or_all(matches.map { |item| item[:word] })
  end

  # Finds the most similar word to "word" from the list.
  #
  # word - the word to look up.
  # list - either an array of candidate words, or the Symbol key of a list
  #        registered with Similis.preprocess.
  #
  # Returns a single word, an array of equally similar words, or nil.
  # Raises ArgumentError when list is a Symbol that was never registered.
  def self.find_similar(word, list)
    method = nil

    if list.is_a? Symbol #If we're referencing the list, using a key for a preprocessed list...
      registered = @@lists[list]
      if registered.nil?
        raise ArgumentError, "no preprocessed list registered under #{list.inspect}"
      end
      method = registered.get_method
      list = registered.get_processed_list
    end

    # Check for the finder that will actually be invoked, not for a method
    # that merely shares the algorithm's name.
    finder = method.nil? ? nil : "find_using_#{method}".to_sym

    if !finder.nil? && respond_to?(finder)
      send(finder, word, list)
    else
      default_method word, list
    end
  end

  # Shapes a result the way the finders report it: nil for no match, the
  # bare element for exactly one match, the whole array otherwise.
  def self.single_or_all(matches)
    return nil if matches.empty?
    matches.length == 1 ? matches.first : matches
  end
  private_class_method :single_or_all

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Computes a weighted Levenshtein (edit) distance between two sequences.
class LevenshteinProcessor

  # Calculates the cost of the cheapest series of edits that turns +other+
  # into +word+.
  #
  # word, other - the sequences to compare; anything responding to #length
  #               and integer #[] (normally Strings).
  # ins - cost of an element that has to be inserted into other to reach word.
  # del - cost of an element that has to be deleted from other.
  # sub - cost of replacing one element by a different one.
  #
  # Returns the distance as a number, or nil when either argument is nil.
  def self.calculate(word, other, ins, del, sub)
    return nil if word.nil? || other.nil?

    # Only the previous row of the distance matrix is needed at any time.
    # Row 0: building each prefix of word out of nothing costs j insertions.
    previous = (0..word.length).map { |j| j * ins }

    (1..other.length).each do |i|
      # Column 0: reducing the first i elements of other to nothing.
      current = [i * del]

      (1..word.length).each do |j|
        # The word index must follow the column (j), not the row (i).
        same = word[j - 1] == other[i - 1]
        current << [
          previous[j - 1] + (same ? 0 : sub),
          current[j - 1] + ins,
          previous[j] + del
        ].min
      end

      previous = current
    end

    # Last cell of the last row; this also covers empty inputs, where the
    # answer is a pure run of insertions or deletions.
    previous.last
  end
end
|
28
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# Computes Soundex codes (a letter followed by three digits) for words.
class SoundexProcessor
  # Letters and, position by position, the digit each one is coded as.
  # "0" marks letters that carry no code of their own (vowels, H, W, Y).
  LETTERS = 'AEHIOUWYBFPVCGJKQSXZDTLMNR'.freeze
  DIGITS  = '00000000111122222222334556'.freeze
  # Number of digits following the leading letter.
  DIGIT_COUNT = 3

  # Based on Mike Stok's version of this algorithm.
  #
  # string - the word to encode. Characters outside A-Z (after upcasing)
  #          are ignored.
  #
  # Returns a 4-character String such as "R163", or nil when the input is
  # not a String, contains no letters A-Z, or cannot be processed because
  # of its encoding.
  #
  # Note: H and W are coded like vowels here, so they separate equal digits
  # on either side of them (a simplified Soundex variant).
  def self.calculate(string)
    return nil unless string.is_a?(String)

    letters = string.upcase.tr('^A-Z', '')
    return nil if letters.empty?

    # tr_s translates every letter to its digit and collapses each run of
    # equal digits into one, which implements the "adjacent letters with the
    # same code count once" rule.
    codes = letters.tr_s(LETTERS, DIGITS)

    # Drop the digit belonging to the first letter (it is represented by the
    # letter itself), then the uncoded "0" placeholders.
    digits = codes.sub(/\A(.)\1*/, '').delete('0')

    # Pad short codes with zeros and cut long ones to the standard length.
    "#{letters[0, 1]}#{digits.ljust(DIGIT_COUNT, '0')[0, DIGIT_COUNT]}"
  rescue ArgumentError, EncodingError
    # Strings with invalid byte sequences or incompatible encodings have no
    # usable code; report that like any other unencodable input.
    nil
  end
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: similis
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Fernando Doglio
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-01 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: This gem allows you to search for the closest word inside a list of specified
|
15
|
+
known/valid words
|
16
|
+
email: deleteman@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/similis.rb
|
22
|
+
- lib/similis/levenshtein_processor.rb
|
23
|
+
- lib/similis/soundex_processor.rb
|
24
|
+
- lib/similis/version.rb
|
25
|
+
homepage: https://github.com/deleteman/similis
|
26
|
+
licenses: []
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
none: false
|
33
|
+
requirements:
|
34
|
+
- - ! '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 1.9.2
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ! '>='
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
requirements: []
|
44
|
+
rubyforge_project:
|
45
|
+
rubygems_version: 1.8.15
|
46
|
+
signing_key:
|
47
|
+
specification_version: 3
|
48
|
+
summary: Finds the similarities between words, using different algorithms
|
49
|
+
test_files: []
|