similis 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/similis.rb ADDED
@@ -0,0 +1,131 @@
1
+ require 'similis/soundex_processor'
2
+ require 'similis/levenshtein_processor'
3
+
4
+ module Similis
5
+ @@lists = Hash.new
6
+ @@last_method_used = nil
7
+ VERSION = "0.1"
8
+
9
# Reports which algorithm the module ran most recently.
#
# Returns the Symbol recorded by the last call to Similis.soundex or
# Similis.levenshtein (:soundex / :levenshtein), or nil if neither has been
# called yet.
def self.last_method_used
  class_variable_get(:@@last_method_used)
end
12
+
13
# Holds a list of words together with the values an algorithm (such as
# soundex) computed for them, so later lookups can reuse those values.
class ToBeProcessedList
  # Algorithms accepted by #with, keyed by the symbol callers pass in.
  PROCESSORS = {:soundex => SoundexProcessor}

  # list - the collection of words to process. It is stored as given and is
  #        only iterated when #with is called.
  def initialize(list)
    @words = list
    @processed_words = []
    @method_used = nil
  end

  # Returns the Array of {:word => word, :value => computed_value} hashes
  # produced by the last #with call (empty until #with has been called).
  def get_processed_list
    @processed_words
  end

  # Returns the algorithm Symbol given to the last successful #with call,
  # or nil if #with has not been called.
  def get_method
    @method_used
  end

  # Runs every word through the processor registered under +method+ and
  # stores the results for #get_processed_list.
  #
  # method - a key of PROCESSORS (currently only :soundex).
  #
  # Raises ArgumentError when +method+ is not a key of PROCESSORS; in that
  # case the previously stored results and method are left untouched.
  # Returns the original word list, exactly as the previous implementation did.
  def with(method)
    processor = PROCESSORS.fetch(method) do
      raise ArgumentError, "unknown preprocessing method: #{method.inspect}"
    end

    processed = @words.map { |w| {:word => w, :value => processor.calculate(w)} }

    # Replace the contents in place (same Array object) so a second call
    # overwrites earlier results instead of appending duplicates.
    @processed_words.replace(processed)
    @method_used = method
    @words
  end
end
42
+
43
# Placeholder for a Metaphone-based comparison.
# TODO: Implement. Takes no arguments and currently always returns nil.
def self.metaphone
  nil
end
46
+
47
# Computes the soundex code of +string+ and records :soundex as the last
# method used.
#
# Returns whatever SoundexProcessor.calculate returns for +string+.
def self.soundex(string)
  class_variable_set(:@@last_method_used, :soundex)
  SoundexProcessor.calculate(string)
end
51
+
52
# Placeholder for a Needleman-Wunsch based comparison.
# TODO: Implement. Takes no arguments and currently always returns nil.
def self.needleman_wunsch
  nil
end
55
+
56
# Registers a word list under a key so it can be preprocessed and later
# referenced by that key.
#
# Usage: Similis.preprocess({:key => list_of_words}).with(:algorithm)
#
# my_hash - a Hash of {key => list_of_words}. Only the first key/value pair
#           is registered; any further pairs are ignored (unchanged
#           behaviour). Registering an existing key replaces its list.
#
# Raises ArgumentError when +my_hash+ has no keys (empty, nil or not
# hash-like) instead of silently registering a nil list under a nil key.
# Returns the new ToBeProcessedList so that #with can be chained.
def self.preprocess(my_hash)
  keys = my_hash.respond_to?(:keys) ? my_hash.keys : []
  if keys.empty?
    raise ArgumentError, "preprocess expects a non-empty Hash of {key => list_of_words}"
  end

  key = keys.first
  @@lists[key] = ToBeProcessedList.new(my_hash[key])
end
62
+
63
+
64
# Calculates the Levenshtein distance between two words and records
# :levenshtein as the last method used.
#
# word, other   - the two words to compare.
# ins, del, sub - cost weights, passed straight through to
#                 LevenshteinProcessor.calculate (defaults: 2, 1, 1).
#
# Returns whatever LevenshteinProcessor.calculate returns.
def self.levenshtein(word, other, ins=2, del=1, sub=1)
  class_variable_set(:@@last_method_used, :levenshtein)
  LevenshteinProcessor.calculate(word, other, ins, del, sub)
end
69
+
70
# The default way of finding a similar word: pick the entries of +list+ with
# the smallest Levenshtein distance to +word+.
#
# Line-break characters ("\n" and "\r") are ignored when comparing. They are
# removed from a copy of each String entry, so the caller's list is never
# modified (frozen strings are fine too). Entries for which no distance can
# be computed (levenshtein returns nil, e.g. for a nil entry) are skipped.
#
# word - the word to look for.
# list - the candidate words; anything that responds to #each.
#
# Returns the single closest word, an Array of words (in list order) when
# several tie for the smallest distance, or nil when nothing was comparable.
def self.default_method(word, list)
  best_distance = nil
  closest = []

  list.each do |entry|
    candidate = entry.is_a?(String) ? entry.delete("\r\n") : entry
    distance = levenshtein(word, candidate)
    next if distance.nil?

    if best_distance.nil? || distance < best_distance
      # Strictly better match: forget everything collected so far.
      best_distance = distance
      closest = [candidate]
    elsif distance == best_distance
      closest << candidate
    end
  end

  # One match is returned bare, several as a list, none as nil.
  closest.size > 1 ? closest : closest.first
end
97
+
98
# Finds the words of a preprocessed list whose stored value equals the
# soundex code of +word+.
#
# word - the word to look for.
# list - entries of the form {:word => w, :value => v}, as built by
#        ToBeProcessedList#with.
#
# Returns the matching word when exactly one entry matches, an Array of
# words when several match, or nil when none does.
def self.find_using_soundex(word, list)
  wanted = soundex(word)

  hits = []
  list.each do |entry|
    hits << entry[:word] if entry[:value] == wanted
  end

  return nil if hits.empty?

  hits.size == 1 ? hits.first : hits
end
114
+
115
# Finds the most similar word to +word+ from the list.
#
# word - the word to look for.
# list - either a collection of words, or the Symbol key of a list that was
#        registered with Similis.preprocess.
#
# For a registered list the search uses the matching "find_using_<method>"
# finder (e.g. find_using_soundex) when one exists for the method the list
# was preprocessed with. In every other case the Levenshtein-based
# default_method is used.
#
# Raises ArgumentError when +list+ is a Symbol that was never registered.
# Returns whatever the selected finder returns.
def self.find_similar(word, list)
  method = nil
  from_store = false

  if list.is_a?(Symbol) # a key referencing a preprocessed list
    stored = @@lists[list]
    if stored.nil?
      raise ArgumentError, "no preprocessed list registered under #{list.inspect}"
    end
    method = stored.get_method
    list = stored.get_processed_list
    from_store = true
  end

  # Check for the finder that will actually be called, not for a method that
  # merely shares the algorithm's name.
  finder = method.nil? ? nil : "find_using_#{method}"

  if finder && respond_to?(finder)
    send(finder, word, list)
  else
    # Stored entries are {:word => w, :value => v} hashes; the default search
    # compares plain words, so unwrap them first.
    words = from_store ? list.map { |entry| entry[:word] } : list
    default_method(word, words)
  end
end
130
+
131
+ end
@@ -0,0 +1,28 @@
1
# Computes weighted Levenshtein (edit) distances between two words.
class LevenshteinProcessor
  # Returns the minimal total cost of aligning +other+ with +word+.
  #
  # word  - the first word (its characters are the columns of the table).
  # other - the second word (its characters are the rows of the table).
  # ins   - cost charged per character of +word+ that has no counterpart in
  #         +other+ (default 2, same as Similis.levenshtein).
  # del   - cost charged per character of +other+ that has no counterpart in
  #         +word+ (default 1).
  # sub   - cost charged when two aligned characters differ (default 1).
  #
  # Returns nil when either word is nil, otherwise the distance. Empty
  # strings are valid input: the result is then the full insert/delete cost
  # of the other word.
  def self.calculate(word, other, ins = 2, del = 1, sub = 1)
    return nil if word.nil? || other.nil?

    # previous[j] is the cost of aligning the rows handled so far with the
    # first j characters of word; row 0 means "nothing of other used yet".
    previous = (0..word.length).map { |j| j * ins }

    1.upto(other.length) do |i|
      current = [i * del]
      1.upto(word.length) do |j|
        # Compare the j-th character of word with the i-th character of other.
        # (The previous code indexed word with i, giving wrong distances.)
        diagonal = previous[j - 1] + (word[j - 1] == other[i - 1] ? 0 : sub)
        current << [diagonal, current[j - 1] + ins, previous[j] + del].min
      end
      previous = current
    end

    previous[word.length]
  end
end
28
+
@@ -0,0 +1,14 @@
1
# Computes soundex-style phonetic codes (Mike Stok's version of the algorithm).
class SoundexProcessor
  # Builds the code for +string+: its first letter followed by the digit
  # codes of the remaining letters, padded with "0" to at least three digits.
  # Longer digit sequences are NOT cut off, so codes can exceed four
  # characters.
  #
  # Returns nil when +string+ contains no letters A-Z (after upcasing) or
  # when any error occurs while processing it (e.g. +string+ is nil).
  def self.calculate(string)
    letters = string.upcase.tr('^A-Z', '')
    return nil if letters.empty?

    initial = letters[0, 1]

    # Translate each letter to its digit and squeeze runs of equal digits.
    digits = letters.tr_s('AEHIOUWYBFPVCGJKQSXZDTLMNR', '00000000111122222222334556')

    # Drop the digit(s) standing for the first letter, then the "0"
    # placeholders of the letters that carry no code.
    digits = digits.sub(/^(.)\1*/, '').delete('0')

    "#{initial}#{digits.ljust(3, "0")}"
  rescue
    nil
  end
end
@@ -0,0 +1,4 @@
1
+
2
# Version information for the similis gem.
module Similis
  # lib/similis.rb assigns VERSION as well. Only define it here when it is
  # not set yet, so loading this file after lib/similis.rb does not trigger
  # Ruby's "already initialized constant" warning.
  VERSION = "0.1" unless const_defined?(:VERSION, false)
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: similis
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Fernando Doglio
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-01 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: This gem allows you to search for the closest word inside a list of specified
15
+ known/valid words
16
+ email: deleteman@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/similis.rb
22
+ - lib/similis/levenshtein_processor.rb
23
+ - lib/similis/soundex_processor.rb
24
+ - lib/similis/version.rb
25
+ homepage: https://github.com/deleteman/similis
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 1.9.2
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.15
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: Finds the similarities between words, using different algorithms
49
+ test_files: []