text 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ = Text
2
+
3
+ A collection of text algorithms.
4
+
5
+
6
+ = Usage
7
+
8
+ require 'text'
9
+
10
+ font = Text::Figlet::Font.new('big.flf')
11
+ figlet = Text::Figlet::Typesetter.new(font)
12
+ figlet['Hello World'] # => '...'
13
+
14
+ Text::Levenshtein.distance('test', 'test') # => 0
15
+ Text::Levenshtein.distance('test', 'tent') # => 1
16
+
17
+ Text::Metaphone.metaphone('BRIAN') # => 'BRN'
18
+ Text::Metaphone.double_metaphone('Coburn') # => ['KPRN', nil]
19
+ Text::Metaphone.double_metaphone('Angier') # => ['ANJ', 'ANJR']
20
+
21
+ Text::Soundex.soundex('Knuth') # => 'K530'
22
+
23
+ Text::PorterStemming.stem('abatements') # => 'abat'
24
+
25
+
26
+ = License
27
+
28
+ Same as Ruby.
@@ -0,0 +1,48 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/packagetask'
4
+ require 'rake/gempackagetask'
5
+ require 'rcov/rcovtask'
6
+ require 'rake/rdoctask'
7
+
8
+ $:.unshift(File.dirname(__FILE__) + '/lib')
9
+ require 'text/version'
10
+
11
# Gem metadata for the text library. The resulting specification object is
# reused by the packaging tasks defined further down in this Rakefile.
gemspec = Gem::Specification.new do |spec|
  # Identity and descriptive text.
  spec.name              = 'text'
  spec.version           = Text::VERSION::STRING
  spec.summary           = 'A collection of text algorithms'
  spec.description       = 'A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming'

  # Packaged files and load path.
  spec.files             = FileList['{lib,test}/**/*', 'README.rdoc', 'Rakefile']
  spec.require_path      = 'lib'

  # Documentation settings.
  spec.has_rdoc          = true
  spec.extra_rdoc_files  = %w[README.rdoc]

  # Project and author information.
  spec.rubyforge_project = 'text'
  spec.homepage          = 'http://github.com/threedaymonk/text'
  spec.authors           = ['Paul Battley', 'Michael Neumann', 'Tim Fletcher']
  spec.email             = "pbattley@gmail.com"
end
25
+
26
# Gem packaging task driven by the gemspec; also requests a .tar.gz archive.
Rake::GemPackageTask.new(gemspec) { |gem_task| gem_task.need_tar_gz = true }

# Plain package task named after the gem and its version, containing the
# same file list as the gemspec.
Rake::PackageTask.new(gemspec.name, gemspec.version) do |archive|
  archive.need_tar_gz = true
  archive.package_files.include(gemspec.files)
end

# Test task with verbose output switched off.
Rake::TestTask.new { |tests| tests.verbose = false }

# Coverage task with an empty list of rcov options.
Rcov::RcovTask.new { |coverage| coverage.rcov_opts = [] }

# RDoc task covering the README and every Ruby file under lib/.
Rake::RDocTask.new do |docs|
  docs.main = 'README.rdoc'
  docs.rdoc_files.include('README.rdoc', 'lib/**/*.rb')
end

# Running plain `rake` runs the test task.
task :default => :test
@@ -0,0 +1,7 @@
1
+ require 'text/double_metaphone'
2
+ require 'text/figlet'
3
+ require 'text/levenshtein'
4
+ require 'text/metaphone'
5
+ require 'text/porter_stemming'
6
+ require 'text/soundex'
7
+ require 'text/version'
@@ -0,0 +1,356 @@
1
+ #
2
+ # Ruby implementation of the Double Metaphone algorithm by Lawrence Philips,
3
+ # originally published in the June 2000 issue of C/C++ Users Journal.
4
+ #
5
+ # Based on Stephen Woodbridge's PHP version - http://swoodbridge.com/DoubleMetaPhone/
6
+ #
7
+ # Author: Tim Fletcher (twoggle@gmail.com)
8
+ #
9
+
10
module Text # :nodoc:
  module Metaphone

    # Returns the primary and secondary double metaphone tokens
    # (the secondary will be nil if equal to the primary).
    #
    # +str+ is upcased and given a trailing space before lookup, so several
    # lookup rules can match on a trailing ' '. Each code is truncated to at
    # most four characters.
    #
    #   double_metaphone('Coburn')  # => ['KPRN', nil]
    #   double_metaphone('Angier')  # => ['ANJ', 'ANJR']
    def double_metaphone(str)
      primary, secondary, current = [], [], 0
      original, length, last = "#{str} ".upcase, str.length, str.length - 1
      # Skip the first letter of these initial pairs.
      if /^GN|KN|PN|WR|PS$/ =~ original[0, 2]
        current += 1
      end
      # An initial 'X' is encoded as 'S'.
      if 'X' == original[0, 1]
        primary << :S
        secondary << :S
        current += 1
      end
      # Keep consuming input until both codes have four fragments or the
      # input (including the trailing space) is exhausted.
      while primary.length < 4 || secondary.length < 4
        break if current > str.length
        a, b, c = double_metaphone_lookup(original, current, length, last)
        primary << a if a
        secondary << b if b
        current += c if c
      end
      # The fragments are a mix of Symbols, Strings and the Integer 0, so they
      # must be concatenated with join. Array#to_s only concatenated on
      # Ruby 1.8; on later versions it returns the inspect form ("[:K, :P]").
      primary, secondary = primary.join('')[0, 4], secondary.join('')[0, 4]
      return primary, (primary == secondary ? nil : secondary)
    end


    private

    # Truthy when +str+ contains W, K, CZ or WITZ anywhere.
    def slavo_germanic?(str)
      /W|K|CZ|WITZ/ =~ str
    end

    # Truthy when the single-character string +str+ is A, E, I, O, U or Y.
    # Returns nil for a nil argument.
    def vowel?(str)
      /^A|E|I|O|U|Y$/ =~ str
    end

    # Encodes the character of +str+ at index +pos+.
    #
    # +str+ is the upcased input with a trailing space; +length+ and +last+
    # are the length and last index of the input without that space.
    #
    # Returns three values: the primary fragment, the secondary fragment
    # (either may be nil when nothing is emitted) and the number of
    # characters to advance.
    def double_metaphone_lookup(str, pos, length, last)
      case str[pos, 1]
      when /^A|E|I|O|U|Y$/
        # Vowels are only encoded at the very start of the word.
        if 0 == pos
          return :A, :A, 1
        else
          return nil, nil, 1
        end
      when 'B'
        return :P, :P, ('B' == str[pos + 1, 1] ? 2 : 1)
      when 'Ç'
        return :S, :S, 1
      when 'C'
        if pos > 1 &&
          !vowel?(str[pos - 2, 1]) &&
          'ACH' == str[pos - 1, 3] &&
          str[pos + 2, 1] != 'I' && (
            str[pos + 2, 1] != 'E' ||
            str[pos - 2, 6] =~ /^(B|M)ACHER$/
          ) then
          return :K, :K, 2
        elsif 0 == pos && 'CAESAR' == str[pos, 6]
          return :S, :S, 2
        elsif 'CHIA' == str[pos, 4]
          return :K, :K, 2
        elsif 'CH' == str[pos, 2]
          if pos > 0 && 'CHAE' == str[pos, 4]
            return :K, :X, 2
          elsif 0 == pos && (
              ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) ||
              ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3])
            ) && str[0, 5] != 'CHORE' then
            return :K, :K, 2
          elsif ['VAN ','VON '].include?(str[0, 4]) ||
                'SCH' == str[0, 3] ||
                ['ORCHES','ARCHIT','ORCHID'].include?(str[pos - 2, 6]) ||
                ['T','S'].include?(str[pos + 2, 1]) || (
                  ((0 == pos) || ['A','O','U','E'].include?(str[pos - 1, 1])) &&
                  ['L','R','N','M','B','H','F','V','W',' '].include?(str[pos + 2, 1])
                ) then
            return :K, :K, 2
          elsif pos > 0
            return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2
          else
            return :X, :X, 2
          end
        elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4]
          return :S, :X, 2
        elsif 'CIA' == str[pos + 1, 3]
          return :X, :X, 3
        elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1])
          if /^I|E|H$/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2]
            if (1 == pos && 'A' == str[pos - 1, 1]) ||
              /^UCCE(E|S)$/ =~ str[pos - 1, 5] then
              return :KS, :KS, 3
            else
              return :X, :X, 3
            end
          else
            return :K, :K, 2
          end
        elsif /^C(K|G|Q)$/ =~ str[pos, 2]
          return :K, :K, 2
        elsif /^C(I|E|Y)$/ =~ str[pos, 2]
          return :S, (/^CI(O|E|A)$/ =~ str[pos, 3] ? :X : :S), 2
        else
          if /^ (C|Q|G)$/ =~ str[pos + 1, 2]
            return :K, :K, 3
          else
            return :K, :K, (/^C|K|Q$/ =~ str[pos + 1, 1] && !(['CE','CI'].include?(str[pos + 1, 2])) ? 2 : 1)
          end
        end
      when 'D'
        if 'DG' == str[pos, 2]
          if /^I|E|Y$/ =~ str[pos + 2, 1]
            return :J, :J, 3
          else
            return :TK, :TK, 2
          end
        else
          return :T, :T, (/^D(T|D)$/ =~ str[pos, 2] ? 2 : 1)
        end
      when 'F'
        return :F, :F, ('F' == str[pos + 1, 1] ? 2 : 1)
      when 'G'
        if 'H' == str[pos + 1, 1]
          if pos > 0 && !vowel?(str[pos - 1, 1])
            return :K, :K, 2
          elsif 0 == pos
            if 'I' == str[pos + 2, 1]
              return :J, :J, 2
            else
              return :K, :K, 2
            end
          elsif (pos > 1 && /^B|H|D$/ =~ str[pos - 2, 1]) ||
                (pos > 2 && /^B|H|D$/ =~ str[pos - 3, 1]) ||
                (pos > 3 && /^B|H$/ =~ str[pos - 4, 1])
            return nil, nil, 2
          else
            if (pos > 2 && 'U' == str[pos - 1, 1] && /^C|G|L|R|T$/ =~ str[pos - 3, 1])
              return :F, :F, 2
            elsif pos > 0 && 'I' != str[pos - 1, 1]
              return :K, :K, 2
            else
              return nil, nil, 2
            end
          end
        elsif 'N' == str[pos + 1, 1]
          if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str)
            return :KN, :N, 2
          else
            if 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str)
              return :N, :KN, 2
            else
              return :KN, :KN, 2
            end
          end
        elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str)
          return :KL, :L, 2
        elsif 0 == pos && ('Y' == str[pos + 1, 1] || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/ =~ str[pos + 1, 2])
          return :K, :J, 2
        elsif (('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) &&
               /^(D|R|M)ANGER$/ !~ str[0, 6] &&
               /^E|I$/ !~ str[pos - 1, 1] &&
               /^(R|O)GY$/ !~ str[pos - 1, 3])
          return :K, :J, 2
        elsif /^E|I|Y$/ =~ str[pos + 1, 1] || /^(A|O)GGI$/ =~ str[pos - 1, 4]
          if (/^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]) || 'ET' == str[pos + 1, 2]
            return :K, :K, 2
          else
            if 'IER ' == str[pos + 1, 4]
              return :J, :J, 2
            else
              return :J, :K, 2
            end
          end
        elsif 'G' == str[pos + 1, 1]
          return :K, :K, 2
        else
          return :K, :K, 1
        end
      when 'H'
        # Only kept at the start or after a vowel, and only before a vowel.
        if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1])
          return :H, :H, 2
        else
          return nil, nil, 1
        end
      when 'J'
        if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4]
          if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4]
            return :H, :H, 1
          else
            return :J, :H, 1
          end
        else
          current = ('J' == str[pos + 1, 1] ? 2 : 1)

          if 0 == pos && 'JOSE' != str[pos, 4]
            return :J, :A, current
          else
            if vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /^A|O$/ =~ str[pos + 1, 1]
              return :J, :H, current
            else
              if last == pos
                return :J, nil, current
              else
                if /^L|T|K|S|N|M|B|Z$/ !~ str[pos + 1, 1] && /^S|K|L$/ !~ str[pos - 1, 1]
                  return :J, :J, current
                else
                  return nil, nil, current
                end
              end
            end
          end
        end
      when 'K'
        return :K, :K, ('K' == str[pos + 1, 1] ? 2 : 1)
      when 'L'
        if 'L' == str[pos + 1, 1]
          if (((length - 3) == pos && /^(ILL(O|A)|ALLE)$/ =~ str[pos - 1, 4]) ||
              ((/^(A|O)S$/ =~ str[last - 1, 2] || /^A|O$/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4]))
            return :L, nil, 2
          else
            return :L, :L, 2
          end
        else
          return :L, :L, 1
        end
      when 'M'
        if ('UMB' == str[pos - 1, 3] &&
            ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1]
          return :M, :M, 2
        else
          return :M, :M, 1
        end
      when 'N'
        return :N, :N, ('N' == str[pos + 1, 1] ? 2 : 1)
      when 'Ñ'
        return :N, :N, 1
      when 'P'
        if 'H' == str[pos + 1, 1]
          return :F, :F, 2
        else
          return :P, :P, (/^P|B$/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'Q'
        return :K, :K, ('Q' == str[pos + 1, 1] ? 2 : 1)
      when 'R'
        current = ('R' == str[pos + 1, 1] ? 2 : 1)

        if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /^M(E|A)$/ !~ str[pos - 4, 2]
          return nil, :R, current
        else
          return :R, :R, current
        end
      when 'S'
        if /^(I|Y)SL$/ =~ str[pos - 1, 3]
          return nil, nil, 1
        elsif 0 == pos && 'SUGAR' == str[pos, 5]
          return :X, :S, 1
        elsif 'SH' == str[pos, 2]
          if /^H(EIM|OEK|OLM|OLZ)$/ =~ str[pos + 1, 4]
            return :S, :S, 2
          else
            return :X, :X, 2
          end
        elsif /^SI(O|A)$/ =~ str[pos, 3] || 'SIAN' == str[pos, 4]
          return :S, (slavo_germanic?(str) ? :S : :X), 3
        elsif (0 == pos && /^M|N|L|W$/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1]
          return :S, :X, ('Z' == str[pos + 1, 1] ? 2 : 1)
        elsif 'SC' == str[pos, 2]
          if 'H' == str[pos + 2, 1]
            if /^OO|ER|EN|UY|ED|EM$/ =~ str[pos + 3, 2]
              return (/^E(R|N)$/ =~ str[pos + 3, 2] ? :X : :SK), :SK, 3
            else
              return :X, ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? :S : :X), 3
            end
          elsif /^I|E|Y$/ =~ str[pos + 2, 1]
            return :S, :S, 3
          else
            return :SK, :SK, 3
          end
        else
          return (last == pos && /^(A|O)I$/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/^S|Z$/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'T'
        if 'TION' == str[pos, 4]
          return :X, :X, 3
        elsif /^T(IA|CH)$/ =~ str[pos, 3]
          return :X, :X, 3
        elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3]
          if /^(O|A)M$/ =~ str[pos + 2, 2] || /^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]
            return :T, :T, 2
          else
            # The Integer 0 becomes the character '0' in the joined primary code.
            return 0, :T, 2
          end
        else
          return :T, :T, (/^T|D$/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'V'
        return :F, :F, ('V' == str[pos + 1, 1] ? 2 : 1)
      when 'W'
        if 'WR' == str[pos, 2]
          return :R, :R, 2
        end
        pri, sec = nil, nil

        if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2])
          pri = :A
          sec = vowel?(str[pos + 1, 1]) ? :F : :A
        end

        if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] ||
           /^EWSKI|EWSKY|OWSKI|OWSKY$/ =~ str[pos - 1, 5]
          return pri, "#{sec}F".intern, 1
        elsif /^WI(C|T)Z$/ =~ str[pos, 4]
          return "#{pri}TS".intern, "#{sec}FX".intern, 4
        else
          return pri, sec, 1
        end
      when 'X'
        current = (/^C|X$/ =~ str[pos + 1, 1] ? 2 : 1)

        if !(last == pos && (/^(I|E)AU$/ =~ str[pos - 3, 3] || /^(A|O)U$/ =~ str[pos - 2, 2]))
          return :KS, :KS, current
        else
          return nil, nil, current
        end
      when 'Z'
        if 'H' == str[pos + 1, 1]
          return :J, :J, 2
        else
          current = ('Z' == str[pos + 1, 1] ? 2 : 1)

          if /^Z(O|I|A)$/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1]))
            return :S, :TS, current
          else
            return :S, :S, current
          end
        end
      else
        # Any other character (including the trailing space) emits nothing.
        return nil, nil, 1
      end
    end # def double_metaphone_lookup

    extend self

  end # module Metaphone
end # module Text
@@ -0,0 +1,17 @@
1
+ #
2
+ # Ruby implementation of the Figlet program (http://www.figlet.org/).
3
+ #
4
+ # Author: Tim Fletcher (twoggle@gmail.com)
5
+ #
6
+ # Usage:
7
+ #
8
+ # big_font = Text::Figlet::Font.new('big.flf')
9
+ #
10
+ # figlet = Text::Figlet::Typesetter.new(big_font)
11
+ #
12
+ # puts figlet['hello world']
13
+ #
14
+ #
15
+ require 'text/figlet/font'
16
+ require 'text/figlet/smusher'
17
+ require 'text/figlet/typesetter'