Text 1.1.2 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- metadata +51 -67
- data/README.rdoc +0 -28
- data/lib/text.rb +0 -6
- data/lib/text/double_metaphone.rb +0 -356
- data/lib/text/figlet.rb +0 -17
- data/lib/text/figlet/font.rb +0 -117
- data/lib/text/figlet/smusher.rb +0 -64
- data/lib/text/figlet/typesetter.rb +0 -68
- data/lib/text/levenshtein.rb +0 -65
- data/lib/text/metaphone.rb +0 -97
- data/lib/text/porter_stemming.rb +0 -171
- data/lib/text/soundex.rb +0 -61
- data/rakefile.rb +0 -44
- data/test/data/big.flf +0 -2204
- data/test/data/big.txt +0 -8
- data/test/data/chunky.flf +0 -512
- data/test/data/chunky.txt +0 -5
- data/test/data/double_metaphone.csv +0 -1218
- data/test/data/metaphone.txt +0 -51
- data/test/data/metaphone_buggy.txt +0 -52
- data/test/data/porter_stemming_input.txt +0 -23531
- data/test/data/porter_stemming_output.txt +0 -23531
- data/test/preamble.rb +0 -10
- data/test/test_double_metaphone.rb +0 -23
- data/test/test_figlet.rb +0 -17
- data/test/test_levenshtein.rb +0 -80
- data/test/test_metaphone.rb +0 -39
- data/test/test_porter_stemming.rb +0 -16
- data/test/test_soundex.rb +0 -27
data/lib/text/figlet.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Ruby implementation of the Figlet program (http://www.figlet.org/).
|
3
|
-
#
|
4
|
-
# Author: Tim Fletcher (twoggle@gmail.com)
|
5
|
-
#
|
6
|
-
# Usage:
|
7
|
-
#
|
8
|
-
# big_font = Text::Figlet::Font.new('big.flf')
|
9
|
-
#
|
10
|
-
# figlet = Text::Figlet::Typesetter.new(big_font)
|
11
|
-
#
|
12
|
-
# puts figlet['hello world']
|
13
|
-
#
|
14
|
-
#
|
15
|
-
require 'text/figlet/font'
|
16
|
-
require 'text/figlet/smusher'
|
17
|
-
require 'text/figlet/typesetter'
|
data/lib/text/figlet/font.rb
DELETED
@@ -1,117 +0,0 @@
|
|
1
|
-
module Text
  module Figlet

    # Raised when a file does not start with the "flf2a" FIGlet font signature.
    class UnknownFontFormat < StandardError
    end

    # A FIGlet font loaded from a .flf file.
    #
    # Every glyph is stored under its Integer character code as an Array of
    # +height+ row strings. Each row has its end-mark character removed and a
    # NUL byte ("\x00") appended, which the typesetter and smusher use as the
    # boundary marker between adjacent glyphs.
    class Font
      # Codes of the seven glyphs that follow the ASCII block in the file,
      # in the order in which they are stored.
      GERMAN_CODES = [91, 92, 93, 123, 124, 125, 126].freeze

      attr_reader :height, :hard_blank, :old_layout

      # Loads the font stored in +filename+.
      #
      # filename::    path of the .flf file.
      # load_german:: when true (the default) the seven glyphs stored after
      #               the ASCII block replace the glyphs for GERMAN_CODES;
      #               when false they are read past and discarded.
      #
      # Raises UnknownFontFormat when the first line does not begin with
      # "flf2a" (this includes an empty file).
      def initialize(filename, load_german = true)
        @load_german, @characters = load_german, {}

        # The block form closes the handle even when parsing raises.
        File.open(filename, 'rb') do |file|
          parse_header file
          load_comments file
          load_ascii_characters file
          load_german_characters file
          load_extended_characters file
        end
      end

      # Returns the rows of the glyph stored under +char+ (an Integer code),
      # or nil when the font has no such glyph.
      def [](char)
        @characters[char]
      end

      # True when a glyph is stored under +char+.
      def has_char?(char)
        @characters.has_key? char
      end

      # True when the header's print-direction field is 1.
      def right_to_left?
        @right_to_left
      end


      private

      # Reads the header line: signature + hard blank, height, baseline,
      # maximum length, old layout, comment line count, print direction.
      def parse_header(file)
        header = file.gets.to_s.strip.split(/\s+/)
        signature = header.shift.to_s

        raise UnknownFontFormat if 'flf2a' != signature[0, 5]

        @hard_blank = signature[-1, 1]
        @height = header.shift.to_i
        @baseline = header.shift
        @max_length = header.shift
        @old_layout = header.shift.to_i
        @comment_count = header.shift.to_i
        # A missing field converts to 0, i.e. left-to-right.
        @right_to_left = header.shift.to_i == 1
      end

      # Reads past the comment lines announced by the header.
      def load_comments(file)
        @comment_count.times { file.gets }
      end

      # Loads the glyphs for codes 32..126. Stops at the first glyph that
      # cannot be read completely so that a truncated file never registers
      # +false+ as a glyph.
      def load_ascii_characters(file)
        (32..126).each do |code|
          glyph = load_char(file)
          return unless glyph
          @characters[code] = glyph
        end
      end

      # Loads (or skips, when +load_german+ was false) the seven glyphs that
      # follow the ASCII block.
      def load_german_characters(file)
        GERMAN_CODES.each do |code|
          if @load_german
            glyph = load_char(file)
            return unless glyph
            @characters[code] = glyph
          else
            skip_char file
          end
        end
      end

      # Loads the code-tagged glyphs that make up the rest of the file. Each
      # glyph is preceded by a line whose first word is its character code.
      def load_extended_characters(file)
        until file.eof?
          tag = file.gets.strip.split(/\s+/, 2).first.to_s
          next if tag.empty? # blank line between glyphs

          if /\A-0x/i =~ tag
            # Negative hexadecimal codes are not loaded.
            skip_char file
            next
          end

          glyph = load_char(file)
          return unless glyph
          @characters[parse_code(tag)] = glyph
        end
      end

      # Converts a code tag to an Integer. Kernel#Integer understands the
      # "0x" (hexadecimal) and leading "0" (octal) prefixes as well as plain
      # decimal. A tag that is not a number is returned unchanged.
      def parse_code(tag)
        Integer(tag)
      rescue ArgumentError
        tag
      end

      # Reads one glyph (+height+ lines). Returns the Array of rows, or
      # +false+ when the file ends before the glyph is complete.
      def load_char(file)
        rows = []
        @height.times do
          return false if file.eof?
          line = file.gets.rstrip
          # The last character of the line is the end mark; every occurrence
          # of it is removed from the row.
          if match = /(.){1,2}$/.match(line)
            line.gsub! match[1], ''
          end
          line << "\x00"
          rows << line
        end
        rows
      end

      # Reads past one glyph (+height+ lines) without storing it.
      def skip_char(file)
        @height.times do
          return if file.eof?
          file.gets
        end
      end

    end

  end # module Figlet
end # module Text
|
data/lib/text/figlet/smusher.rb
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
module Text
  module Figlet

    # Merges ("smushes") the two characters that meet at a glyph boundary.
    #
    # Rows are expected to contain a NUL byte ("\x00") at each glyph boundary.
    # Which merge rules are active is decided by the bits of the font's
    # +old_layout+ value (1, 2, 4, 8, 16, 32).
    class Smusher

      # Lookup tables for the merge rules, keyed the same way the original
      # +symbols+ hash was:
      # 24:: characters that may replace an underscore (rule 2) and whose
      #      position decides the winner of a hierarchy merge (rule 4)
      # 8::  opposite bracket pairs, which merge to "|"
      # 16:: two-character combinations and their single replacement
      SYMBOLS = {
        24 => '|/\\[]{}()<>',
        8 => {'[' => ']', ']' => '[', '{' => '}', '}' => '{', '(' => ')', ')' => '('},
        16 => {"/\\" => '|', "\\/" => 'Y', '><' => 'X'}
      }.freeze

      # font:: object responding to +height+, +hard_blank+ and +old_layout+.
      def initialize(font)
        @font = font
      end

      # Smushes the first glyph boundary of every row in +result+.
      #
      # +result+ (an Array holding one string per font row) is updated in
      # place and also returned. When any row could be merged, a blank next
      # to the boundary is removed from every row so the rows stay aligned.
      def [](result)
        todo = false

        @font.height.times do |j|
          result[j] = result[j].sub(pattern) do
            todo, merged = callback(todo, $1, $2)
            merged
          end
        end

        @font.height.times do |j|
          row = result[j]
          row = row.sub(/\s\x00(?!$)|\x00\s/, '') if todo
          # Drop the boundary marker unless it is the one ending the row.
          result[j] = row.sub(/\x00(?!$)/, '')
        end

        result
      end

      # Regexp matching two visible characters (not blank, not hard blank,
      # not NUL) separated by a boundary marker. The hard blank is escaped so
      # that a character such as "\\" cannot alter the character class.
      def pattern
        @pattern ||= begin
          visible = "[^#{Regexp.escape(@font.hard_blank)}\\x00\\s]"
          /(#{visible})\x00(#{visible})/
        end
      end

      # The rule lookup tables (see SYMBOLS).
      def symbols
        SYMBOLS
      end

      # True when bit +n+ is set in the font's +old_layout+ value.
      def old_layout?(n)
        (@font.old_layout & n) > 0
      end

      # Decides how the characters +a+ (left) and +b+ (right) merge.
      #
      # Returns a pair: the updated "something was merged" flag and the
      # replacement text. When no rule applies the flag +s+ is passed through
      # and both characters are kept with the boundary marker between them.
      def callback(s, a, b)
        combined = a + b

        if old_layout?(1) && a == b
          return true, a
        elsif old_layout?(2) && ('_' == a && symbols[24].include?(b) || '_' == b && symbols[24].include?(a))
          # The underscore is the character that gets replaced, so keep the
          # other one whichever side it is on.
          return true, ('_' == a ? b : a)
        elsif old_layout?(4) && ((left = symbols[24].index(a)) && (right = symbols[24].index(b)))
          return true, (right > left ? b : a)
        elsif old_layout?(8) && (symbols[8].has_key?(b) && symbols[8][b] == a)
          return true, '|'
        elsif old_layout?(16) && symbols[16].has_key?(combined)
          return true, symbols[16][combined]
        elsif old_layout?(32) && (a == b && @font.hard_blank == a)
          return true, @font.hard_blank
        else
          return s, "#{a}\x00#{b}"
        end
      end

    end

  end # module Figlet
end # module Text
|
data/lib/text/figlet/typesetter.rb
DELETED
@@ -1,68 +0,0 @@
|
|
1
|
-
module Text
  module Figlet

    # Renders a string with the glyphs of a FIGlet font.
    class Typesetter

      # font::    object responding to +[]+, +has_char?+, +height+,
      #           +hard_blank+, +old_layout+ and +right_to_left?+.
      # options:: optional Hash; <tt>:smush => false</tt> disables smushing
      #           of adjacent glyphs (it is enabled by default).
      def initialize(font, options = nil)
        @font = font
        @options = options || {}
        @smush = @options.has_key?(:smush) ? @options[:smush] : true
      end

      # Returns +str+ rendered as +height+ lines joined by newlines.
      #
      # Characters are looked up by Integer code. A character the font does
      # not have is replaced by the glyph for code 0 when the font has one
      # and is skipped otherwise. Glyphs are only kerned (and smushed) when
      # the font's +old_layout+ is greater than -1.
      def [](str)
        result = []
        char_codes(str).each_with_index do |char, i|
          unless @font.has_char?(char)
            if @font.has_char?(0)
              char = 0
            else
              next
            end
          end
          @font.height.times do |j|
            line = @font[char][j]
            if result[j].nil?
              result[j] = line
            else
              result[j] = @font.right_to_left? ? (line + result[j]) : (result[j] + line)
            end
          end
          if @font.old_layout > -1 && i > 0
            kern result
            smush[result] if @smush
          end
        end
        return result.join("\n").gsub(/\0/, '').gsub(@font.hard_blank, ' ')
      end


      private

      # Integer codes of the characters of +str+.
      #
      # Indexing a String yields an Integer only on Ruby 1.8; on later
      # versions it yields a one-character String, which never matches the
      # font's Integer keys. Code points are used where the string's encoding
      # is valid and raw bytes otherwise (which is also what Ruby 1.8 gets).
      def char_codes(str)
        if str.respond_to?(:valid_encoding?) && str.valid_encoding?
          str.codepoints.to_a
        else
          str.unpack('C*')
        end
      end

      # Removes the blank columns shared by every row around the first glyph
      # boundary (marked by NUL), leaving one column of slack.
      def kern(result)
        diff = -1
        @font.height.times do |j|
          if match = /\S(\s*\x00\s*)\S/.match(result[j])
            len = match[1].length
            diff = (diff == -1 ? len : min(diff, len))
          end
        end
        diff -= 1
        return unless diff > 0

        @font.height.times do |j|
          if match = /\x00(\s{0,#{diff}})/.match(result[j])
            after = match[1].length
            before = diff - after
            # The removed blanks are replaced by a single boundary marker.
            result[j] = result[j].sub(/\s{0,#{before}}\x00\s{#{after}}/, "\x00")
          end
        end
      end

      # The smaller of +a+ and +b+.
      def min(a, b)
        a > b ? b : a
      end

      # Lazily created Smusher for this typesetter's font.
      def smush
        @smusher ||= Smusher.new(@font)
      end

    end

  end # module Figlet
end # module Text
|
data/lib/text/levenshtein.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
|
3
|
-
#
|
4
|
-
# The Levenshtein distance is a measure of how similar two strings s and t are,
|
5
|
-
# calculated as the number of deletions/insertions/substitutions needed to
|
6
|
-
# transform s into t. The greater the distance, the more the strings differ.
|
7
|
-
#
|
8
|
-
# The Levenshtein distance is also sometimes referred to as the
|
9
|
-
# easier-to-pronounce-and-spell 'edit distance'.
|
10
|
-
#
|
11
|
-
# Author: Paul Battley (pbattley@gmail.com)
|
12
|
-
#
|
13
|
-
|
14
|
-
module Text # :nodoc:
  module Levenshtein

    # Calculate the Levenshtein distance between two strings +str1+ and +str2+:
    # the number of single-character insertions, deletions and substitutions
    # needed to turn one into the other.
    #
    # On Ruby versions whose strings carry an encoding, the strings are
    # compared code point by code point when both are validly encoded, and
    # byte by byte otherwise. On Ruby 1.8 the strings are treated as UTF-8
    # when $KCODE is set appropriately (i.e. 'u') and compared byte by byte
    # otherwise.
    #
    # When using Unicode text, be aware that this algorithm does not perform
    # normalisation. If there is a possibility of different normalised forms
    # being used, normalisation should be performed beforehand.
    #
    def distance(str1, str2)
      s, t = unpack_pair(str1, str2)
      n = s.length
      m = t.length
      return m if (0 == n)
      return n if (0 == m)

      # d holds the previous row of the edit-distance table; it is
      # overwritten in place while the current row is computed.
      d = (0..m).to_a
      x = nil

      (0...n).each do |i|
        e = i + 1 # first cell of the current row
        (0...m).each do |j|
          cost = (s[i] == t[j]) ? 0 : 1
          x = [
            d[j + 1] + 1, # insertion
            e + 1,        # deletion
            d[j] + cost   # substitution
          ].min
          d[j] = e
          e = x
        end
        d[m] = x
      end

      return x
    end

    private

    # Converts both strings to Arrays of Integers using one common rule, so
    # that the two arrays are always comparable with each other.
    def unpack_pair(str1, str2)
      strings = [str1, str2]
      if strings.all? { |str| str.respond_to?(:valid_encoding?) }
        # $KCODE has no effect here; the encoding of the strings decides.
        if strings.all? { |str| str.valid_encoding? }
          return strings.map { |str| str.codepoints.to_a }
        end
        unpack_rule = 'C*'
      elsif $KCODE =~ /^U/i
        unpack_rule = 'U*'
      else
        unpack_rule = 'C*'
      end
      strings.map { |str| str.unpack(unpack_rule) }
    end

    extend self
  end
end
|
data/lib/text/metaphone.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# An implementation of the Metaphone phonetic coding system in Ruby.
|
3
|
-
#
|
4
|
-
# Metaphone encodes names into a phonetic form such that similar-sounding names
|
5
|
-
# have the same or similar Metaphone encodings.
|
6
|
-
#
|
7
|
-
# The original system was described by Lawrence Philips in Computer Language
|
8
|
-
# Vol. 7 No. 12, December 1990, pp 39-43.
|
9
|
-
#
|
10
|
-
# As there are multiple implementations of Metaphone, each with their own
|
11
|
-
# quirks, I have based this on my interpretation of the algorithm specification.
|
12
|
-
# Even LP's original BASIC implementation appears to contain bugs (specifically
|
13
|
-
# with the handling of CC and MB), when compared to his explanation of the
|
14
|
-
# algorithm.
|
15
|
-
#
|
16
|
-
# I have also compared this implementation with that found in PHP's standard
|
17
|
-
# library, which appears to mimic the behaviour of LP's original BASIC
|
18
|
-
# implementation. For compatibility, these rules can also be used by passing
|
19
|
-
# :buggy=>true to the methods.
|
20
|
-
#
|
21
|
-
# Author: Paul Battley (pbattley@gmail.com)
|
22
|
-
#
|
23
|
-
|
24
|
-
module Text # :nodoc:
  module Metaphone

    module Rules # :nodoc:all

      # Metaphone rules. These are simply applied in order.
      #
      # The input is lower case and every replacement is upper case, so text
      # that has already been converted is never matched by a later rule.
      # The table is frozen because it is shared by every call.
      #
      STANDARD = [
        # Regexp, replacement
        [ /([bcdfhjklmnpqrstvwxyz])\1+/,
                           '\1' ],  # Remove doubled consonants except g.
                                    # [PHP] remove c from regexp.
        [ /^ae/,            'E' ],
        [ /^[gkp]n/,        'N' ],
        [ /^wr/,            'R' ],
        [ /^x/,             'S' ],
        [ /^wh/,            'W' ],
        [ /mb$/,            'M' ],  # [PHP] remove $ from regexp.
        [ /(?!^)sch/,      'SK' ],
        [ /th/,             '0' ],
        [ /t?ch|sh/,        'X' ],
        [ /c(?=ia)/,        'X' ],
        [ /[st](?=i[ao])/,  'X' ],
        [ /s?c(?=[iey])/,   'S' ],
        [ /[cq]/,           'K' ],
        [ /dg(?=[iey])/,    'J' ],
        [ /d/,              'T' ],
        [ /g(?=h[^aeiou])/, ''  ],
        [ /gn(ed)?/,        'N' ],
        [ /([^g]|^)g(?=[iey])/,
                          '\1J' ],
        [ /g+/,             'K' ],
        [ /ph/,             'F' ],
        [ /([aeiou])h(?=\b|[^aeiou])/,
                           '\1' ],
        [ /[wy](?![aeiou])/, '' ],
        [ /z/,              'S' ],
        [ /v/,              'F' ],
        [ /(?!^)[aeiou]+/,  ''  ],
      ].freeze

      # The rules for the 'buggy' alternate implementation used by PHP etc.
      # Identical to STANDARD except for the two rules marked [PHP] above.
      #
      buggy = STANDARD.dup
      buggy[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ]
      buggy[6] = [ /mb/, 'M' ]
      BUGGY = buggy.freeze
    end

    # Returns the Metaphone representation of a string. If the string contains
    # multiple words, each word in turn is converted into its Metaphone
    # representation and the results are joined by single spaces. Note that
    # only the letters A-Z are supported, so any language-specific processing
    # should be done beforehand.
    #
    # If the :buggy option is set, alternate 'buggy' rules are used.
    #
    def metaphone(str, options={})
      words = str.strip.split(/\s+/)
      words.map { |w| metaphone_word(w, options) }.join(' ')
    end

  private

    # Converts a single word: lower-cases it, drops everything that is not
    # a-z, applies each rule of the selected table in order and upper-cases
    # the result.
    def metaphone_word(w, options={})
      # Normalise case and remove non-ASCII
      s = w.downcase.gsub(/[^a-z]/, '')
      # Apply the Metaphone rules
      rules = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
      rules.each { |rx, rep| s.gsub!(rx, rep) }
      return s.upcase
    end

    extend self

  end
end
|
data/lib/text/porter_stemming.rb
DELETED
@@ -1,171 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# This is the Porter Stemming algorithm, ported to Ruby from the
|
3
|
-
# version coded up in Perl. It's easy to follow against the rules
|
4
|
-
# in the original paper in:
|
5
|
-
#
|
6
|
-
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
7
|
-
# no. 3, pp 130-137,
|
8
|
-
#
|
9
|
-
# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
|
10
|
-
#
|
11
|
-
module Text # :nodoc:
  module PorterStemming

    # Step 2 suffixes and their replacements.
    STEP_2_LIST = {
      'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
      'izer' => 'ize', 'bli' => 'ble',
      'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
      'ization' => 'ize', 'ation' => 'ate',
      'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
      'ousness' => 'ous', 'aliti' => 'al',
      'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
    }.freeze

    # Step 3 suffixes and their replacements.
    STEP_3_LIST = {
      'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
      'ical' => 'ic', 'ful' => '', 'ness' => ''
    }.freeze

    # Matches any step 2 suffix at the end of the word.
    SUFFIX_1_REGEXP = /(
                      ational  |
                      tional   |
                      enci     |
                      anci     |
                      izer     |
                      bli      |
                      alli     |
                      entli    |
                      eli      |
                      ousli    |
                      ization  |
                      ation    |
                      ator     |
                      alism    |
                      iveness  |
                      fulness  |
                      ousness  |
                      aliti    |
                      iviti    |
                      biliti   |
                      logi)$/x

    # Matches any step 4 suffix at the end of the word.
    SUFFIX_2_REGEXP = /(
                        al       |
                        ance     |
                        ence     |
                        er       |
                        ic       |
                        able     |
                        ible     |
                        ant      |
                        ement    |
                        ment     |
                        ent      |
                        ou       |
                        ism      |
                        ate      |
                        iti      |
                        ous      |
                        ive      |
                        ize)$/x

    C = "[^aeiou]".freeze                 # consonant
    V = "[aeiouy]".freeze                 # vowel
    CC = "#{C}(?>[^aeiouy]*)".freeze      # consonant sequence
    VV = "#{V}(?>[aeiou]*)".freeze        # vowel sequence

    MGR0 = /^(#{CC})?#{VV}#{CC}/                # [cc]vvcc... is m>0
    MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/       # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/      # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CC})?#{V}/             # vowel in stem

    # A consonant sequence, one vowel and one final consonant other than
    # w, x or y. Used by steps 1b and 5.
    SHORT_STEM = /^#{CC}#{V}[^aeiouwxy]$/

    # Returns the Porter stem of +word+.
    #
    # +word+ must respond to +to_str+; a copy is stemmed, so the argument is
    # not modified. Words shorter than three characters are returned
    # unchanged. No case conversion is performed.
    def self.stem(word)

      # make a copy of the given object and convert it to a string.
      word = word.dup.to_str

      return word if word.length < 3

      # now map initial y to Y so that the patterns never treat it as vowel
      word[0] = 'Y' if word[0] == ?y

      # Step 1a
      if word =~ /(ss|i)es$/
        word = $` + $1
      elsif word =~ /([^s])s$/
        word = $` + $1
      end

      # Step 1b
      if word =~ /eed$/
        word.chop! if $` =~ MGR0
      elsif word =~ /(ed|ing)$/
        stem = $`
        if stem =~ VOWEL_IN_STEM
          word = stem
          case word
            when /(at|bl|iz)$/             then word << "e"
            when /([^aeiouylsz])\1$/       then word.chop!
            when SHORT_STEM                then word << "e"
          end
        end
      end

      # Step 1c: a final y becomes i when the stem contains a vowel
      if word =~ /y$/
        stem = $`
        word = stem + "i" if stem =~ VOWEL_IN_STEM
      end

      # Step 2
      if word =~ SUFFIX_1_REGEXP
        stem = $`
        suffix = $1
        if stem =~ MGR0
          word = stem + STEP_2_LIST[suffix]
        end
      end

      # Step 3
      if word =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
        stem = $`
        suffix = $1
        if stem =~ MGR0
          word = stem + STEP_3_LIST[suffix]
        end
      end

      # Step 4
      if word =~ SUFFIX_2_REGEXP
        stem = $`
        if stem =~ MGR1
          word = stem
        end
      elsif word =~ /(s|t)(ion)$/
        stem = $` + $1
        if stem =~ MGR1
          word = stem
        end
      end

      #  Step 5
      if word =~ /e$/
        stem = $`
        if (stem =~ MGR1) ||
            (stem =~ MEQ1 && stem !~ SHORT_STEM)
          word = stem
        end
      end

      if word =~ /ll$/ && word =~ MGR1
        word.chop!
      end

      # and turn initial Y back to y
      word[0] = 'y' if word[0] == ?Y

      word
    end

  end
end
|