similis 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/similis.rb ADDED
@@ -0,0 +1,131 @@
1
+ require 'similis/soundex_processor'
2
+ require 'similis/levenshtein_processor'
3
+
4
# Similis finds the word(s) in a list that most resemble a given word, either
# by Levenshtein distance (the default) or through a soundex index built
# beforehand with Similis.preprocess.
module Similis
  # Preprocessed lists registered through Similis.preprocess, by caller key.
  @@lists = Hash.new
  # Symbol of the algorithm most recently run (:soundex or :levenshtein).
  @@last_method_used = nil
  VERSION = "0.1".freeze

  # Returns the symbol of the last algorithm run through this module
  # (:soundex or :levenshtein), or nil when none has run yet.
  def self.last_method_used
    @@last_method_used
  end

  # Holds a list of words together with the values computed for them by a
  # preprocessing algorithm such as soundex.
  class ToBeProcessedList
    # Maps an algorithm name to the class whose +calculate+ yields the value.
    PROCESSORS = {:soundex => SoundexProcessor}

    # list - the words to be preprocessed later by #with.
    def initialize(list)
      @words = list
      @processed_words = []
      @method_used = nil
    end

    # Returns the array of {:word => ..., :value => ...} hashes built by
    # #with (empty until #with has been called).
    def get_processed_list
      @processed_words
    end

    # Returns the algorithm symbol given to the last #with call, or nil.
    def get_method
      @method_used
    end

    # Computes the value of every word with the given algorithm.
    #
    # method - a key of PROCESSORS (currently only :soundex).
    #
    # Returns the original word list, as earlier versions did.
    # Raises ArgumentError when the algorithm is unknown.
    def with(method)
      processor = PROCESSORS[method]
      if processor.nil?
        raise ArgumentError, "unknown preprocessing method: #{method.inspect}"
      end

      @method_used = method
      # Replace rather than append, so calling #with twice does not leave
      # duplicated entries behind.
      @processed_words.replace(
        @words.map { |w| {:word => w, :value => processor.calculate(w)} }
      )
      @words
    end
  end

  def self.metaphone
    # TODO: Implement
  end

  # Returns the soundex code of +string+ as computed by SoundexProcessor and
  # records :soundex as the last method used.
  def self.soundex(string)
    @@last_method_used = :soundex
    SoundexProcessor.calculate string
  end

  def self.needleman_wunsch
    # TODO: Implement
  end

  # Registers a list of words under a key so it can be preprocessed once and
  # referenced later from find_similar. Only the first key of the hash is used.
  #
  # Usage: Similis.preprocess({:key => list_of_words}).with(:algorithm)
  #
  # Returns the ToBeProcessedList created for that key.
  def self.preprocess(my_hash)
    k = my_hash.keys.first
    @@lists[k] = ToBeProcessedList.new(my_hash[k])
    @@lists[k]
  end

  # Calculates the Levenshtein distance between two words using
  # LevenshteinProcessor and records :levenshtein as the last method used.
  #
  # ins, del, sub - costs of an insertion, a deletion and a substitution.
  def self.levenshtein(word, other, ins=2, del=1, sub=1)
    @@last_method_used = :levenshtein
    LevenshteinProcessor.calculate(word, other, ins, del, sub)
  end

  # The default method for finding a similar word: Levenshtein distance.
  #
  # word - the word to look for.
  # list - an enumerable of candidate words.
  #
  # Returns the closest word when exactly one candidate has the smallest
  # distance, an array of the tied candidates (in list order) when several
  # do, or nil when no candidate could be compared.
  def self.default_method(word, list)
    scored = []
    list.each do |w|
      # Strip line breaks on a copy: the caller's strings are left untouched
      # and frozen strings are handled the same way as any other.
      candidate = w.is_a?(String) ? w.delete("\r\n") : w
      distance = levenshtein(word, candidate)
      # levenshtein returns nil when either side is nil; nothing to rank.
      scored << [distance, candidate] unless distance.nil?
    end
    return nil if scored.empty?

    best_distance = scored.map { |distance, _| distance }.min
    closest = scored.select { |distance, _| distance == best_distance }
    unwrap_matches(closest.map { |_, candidate| candidate })
  end

  # Finds the most similar words using a preprocessed list.
  #
  # word - the word to look for.
  # list - array of {:word => ..., :value => ...} hashes, as returned by
  #        ToBeProcessedList#get_processed_list.
  #
  # Returns the single matching word, an array when several entries share
  # the soundex code of +word+, or nil when none does.
  def self.find_using_soundex(word, list)
    soundex_key = soundex word
    matching = list.select { |item| item[:value] == soundex_key }
    unwrap_matches(matching.map { |item| item[:word] })
  end

  # Finds the most similar word to "word" from the list.
  #
  # word - the word to look for.
  # list - either an enumerable of words, or the Symbol key of a list
  #        registered with Similis.preprocess.
  #
  # Raises ArgumentError when a Symbol key has not been registered.
  def self.find_similar(word, list)
    method = nil

    if list.is_a? Symbol # referencing a preprocessed list by its key
      preprocessed = @@lists[list]
      if preprocessed.nil?
        raise ArgumentError, "no preprocessed list registered under #{list.inspect}"
      end
      method = preprocessed.get_method
      list = preprocessed.get_processed_list
    end

    # Check for the finder that is actually dispatched to, not for a module
    # method that merely shares the algorithm's name.
    finder = method.nil? ? nil : "find_using_#{method}".to_sym
    if !finder.nil? && respond_to?(finder)
      send(finder, word, list)
    else
      default_method word, list
    end
  end

  # Collapses a list of matches to the shape the finders return: nil when
  # empty, the sole element when there is one, the array itself otherwise.
  def self.unwrap_matches(matches)
    return nil if matches.empty?
    matches.length == 1 ? matches.first : matches
  end
  private_class_method :unwrap_matches
end
@@ -0,0 +1,28 @@
1
# Computes a weighted Levenshtein (edit) distance between two strings.
class LevenshteinProcessor
  # Returns the minimal cost of editing +other+ into +word+.
  #
  # word  - target string.
  # other - string being edited.
  # ins   - cost of inserting one character of +word+.
  # del   - cost of deleting one character of +other+.
  # sub   - cost of replacing a character of +other+ with one of +word+.
  #
  # Returns an Integer-like cost (word.length * ins when +other+ is empty,
  # other.length * del when +word+ is empty), or nil when either argument
  # is nil.
  def self.calculate(word, other, ins, del, sub)
    return nil if word.nil? || other.nil?

    # Only two rows of the distance matrix are ever needed. previous[j] is
    # the cost between the first i-1 characters of +other+ and the first j
    # characters of +word+; row 0 is built from insertions alone.
    previous = (0..word.length).map { |j| j * ins }

    (1..other.length).each do |i|
      current = [i * del]
      (1..word.length).each do |j|
        # Compare the j-th character of +word+ with the i-th of +other+.
        replace_cost = word[j - 1] == other[i - 1] ? 0 : sub
        current << [
          previous[j - 1] + replace_cost,
          current[j - 1] + ins,
          previous[j] + del
        ].min
      end
      previous = current
    end

    previous.last
  end
end
28
+
@@ -0,0 +1,14 @@
1
# Computes soundex codes (Mike Stok's version of the algorithm).
class SoundexProcessor
  # Letters and the digit each one is translated to, position by position.
  # Vowels as well as H, W and Y map to "0", which is dropped from the code.
  LETTERS = 'AEHIOUWYBFPVCGJKQSXZDTLMNR'.freeze
  DIGITS  = '00000000111122222222334556'.freeze

  # Returns the soundex code of +string+: its first letter (upcased)
  # followed by exactly three digits, e.g. "Robert" => "R163".
  #
  # Returns nil when +string+ is not string-like, is not validly encoded,
  # or contains no A-Z letters.
  def self.calculate(string)
    # Explicit guards replace the former catch-all rescue, which hid every
    # error raised inside the method.
    return nil unless string.respond_to?(:to_str)
    text = string.to_str
    return nil unless text.valid_encoding?

    letters = text.upcase.tr('^A-Z', '')
    return nil if letters.empty?

    # tr_s squeezes runs of letters sharing a code into a single digit, so
    # the first character of the result is the code of the leading letter
    # (and of any same-coded letters right after it).
    codes = letters.tr_s(LETTERS, DIGITS)
    digits = codes[1..-1].delete('0')

    # Pad short codes with zeros and cut long ones to three digits.
    "#{letters[0, 1]}#{digits.ljust(3, '0')[0, 3]}"
  end
end
@@ -0,0 +1,4 @@
1
+
2
module Similis
  # Version of the similis gem. Frozen so the constant cannot be modified in
  # place by code that reads it.
  VERSION = "0.1".freeze
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: similis
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Fernando Doglio
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-01 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: This gem allows you to search for the closest word inside a list of specified
15
+ known/valid words
16
+ email: deleteman@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/similis.rb
22
+ - lib/similis/levenshtein_processor.rb
23
+ - lib/similis/soundex_processor.rb
24
+ - lib/similis/version.rb
25
+ homepage: https://github.com/deleteman/similis
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 1.9.2
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.15
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: Finds the similarities between words, using different algorithms
49
+ test_files: []