text 0.1.13
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +28 -0
- data/Rakefile +48 -0
- data/lib/text.rb +7 -0
- data/lib/text/double_metaphone.rb +356 -0
- data/lib/text/figlet.rb +17 -0
- data/lib/text/figlet/font.rb +117 -0
- data/lib/text/figlet/smusher.rb +64 -0
- data/lib/text/figlet/typesetter.rb +68 -0
- data/lib/text/levenshtein.rb +65 -0
- data/lib/text/metaphone.rb +97 -0
- data/lib/text/porter_stemming.rb +171 -0
- data/lib/text/soundex.rb +61 -0
- data/lib/text/version.rb +9 -0
- data/test/data/big.flf +2204 -0
- data/test/data/big.txt +8 -0
- data/test/data/chunky.flf +512 -0
- data/test/data/chunky.txt +5 -0
- data/test/data/double_metaphone.csv +1218 -0
- data/test/data/metaphone.txt +51 -0
- data/test/data/metaphone_buggy.txt +52 -0
- data/test/data/porter_stemming_input.txt +23531 -0
- data/test/data/porter_stemming_output.txt +23531 -0
- data/test/preamble.rb +10 -0
- data/test/test_double_metaphone.rb +23 -0
- data/test/test_figlet.rb +17 -0
- data/test/test_levenshtein.rb +80 -0
- data/test/test_metaphone.rb +39 -0
- data/test/test_porter_stemming.rb +16 -0
- data/test/test_soundex.rb +27 -0
- metadata +85 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
module Text
|
2
|
+
module Figlet
|
3
|
+
|
4
|
+
# Error type for font files that are not in a recognised FIGfont format.
class UnknownFontFormat < StandardError; end
|
6
|
+
|
7
|
+
# A FIGlet font loaded from a FIGfont version 2 (+.flf+) file.
#
# Each glyph is stored under its integer character code as an array of
# +height+ strings (one per output row). Every row has its end-mark
# characters removed and a NUL byte appended; the typesetter and smusher use
# that NUL to find the boundary between adjacent glyphs.
class Font
  attr_reader :height, :hard_blank, :old_layout

  # filename    - path of the font file to read.
  # load_german - when true (the default) the seven glyphs stored after the
  #               ASCII set replace the glyphs for codes 91-93 and 123-126;
  #               when false those glyphs are read past and discarded.
  #
  # Raises UnknownFontFormat if the first line does not start with "flf2a".
  def initialize(filename, load_german = true)
    @load_german, @characters = load_german, {}

    # The block form closes the handle even when parsing raises.
    File.open(filename, 'rb') do |file|
      load_header file
      load_comments file
      load_ascii_characters file
      load_german_characters file
      load_extended_characters file
    end
  end

  # Returns the glyph rows for the given character code, or nil if the font
  # has no such glyph.
  def [](char)
    @characters[char]
  end

  # True if the font defines a glyph for the given character code.
  def has_char?(char)
    @characters.has_key? char
  end

  # True if the header's print-direction field is 1.
  def right_to_left?
    @right_to_left
  end


  private

  # Parses the space-separated header line:
  #   flf2a<hard_blank> height baseline max_length old_layout comment_count [direction]
  def load_header(file)
    header = file.gets.to_s.strip.split(/ /)

    # An empty file or blank first line is not a font either.
    raise UnknownFontFormat if header.empty? || 'flf2a' != header[0][0, 5]

    @hard_blank    = header.shift[-1, 1]
    @height        = header.shift.to_i
    @baseline      = header.shift
    @max_length    = header.shift
    @old_layout    = header.shift.to_i
    @comment_count = header.shift.to_i
    direction      = header.shift
    @right_to_left = !direction.nil? && direction.to_i == 1
  end

  # Skips the comment lines announced by the header.
  def load_comments(file)
    @comment_count.times { file.gets }
  end

  # Reads the glyphs for the printable ASCII range (codes 32..126).
  def load_ascii_characters(file)
    (32..126).each { |i| @characters[i] = load_char(file) }
  end

  # Reads (or skips) the seven glyphs that follow the ASCII set. Stops
  # quietly if the file ends before all of them are present.
  def load_german_characters(file)
    [91, 92, 93, 123, 124, 125, 126].each do |i|
      if @load_german
        char = load_char(file)
        return unless char
        @characters[i] = char
      else
        skip_char file
      end
    end
  end

  # Reads the code-tagged glyphs that make up the rest of the file. Each
  # glyph is preceded by a line whose first word is its character code.
  def load_extended_characters(file)
    until file.eof?
      tag = file.gets.strip.split(/ /).first

      # Blank separator lines yield no tag at all.
      next if tag.nil? || tag.empty?

      if /^\-0x/i =~ tag
        # Negative hex tags are not loaded; discard their glyph.
        skip_char file
        next
      end

      char = load_char(file)
      return unless char
      @characters[parse_char_code(tag)] = char
    end
  end

  # Converts a code tag to an Integer: "0x.." is hexadecimal, a leading "0"
  # or "-0" is octal, anything else made of digits is decimal. A tag that is
  # not numeric at all is returned unchanged.
  def parse_char_code(tag)
    if /^0x/i =~ tag
      tag[2..-1].hex
    elsif ('0' == tag[0, 1] && '0' != tag) || '-0' == tag[0, 2]
      tag.oct
    elsif /^-?\d+$/ =~ tag
      tag.to_i
    else
      tag
    end
  end

  # Reads one glyph (+height+ rows). Returns the array of rows, or false if
  # the file ends before the glyph is complete.
  def load_char(file)
    char = []
    @height.times do
      return false if file.eof?
      line = file.gets.rstrip
      # The last character of the row is its end mark; remove every
      # occurrence of that character from the row.
      if match = /(.){1,2}$/.match(line)
        line.gsub! match[1], ''
      end
      line << "\x00"
      char << line
    end
    return char
  end

  # Reads past one glyph without storing it.
  def skip_char(file)
    @height.times do
      return if file.eof?
      file.gets
    end
  end

end
|
115
|
+
|
116
|
+
end # module Figlet
|
117
|
+
end # module Text
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Text
|
2
|
+
module Figlet
|
3
|
+
|
4
|
+
# Merges ("smushes") the touching edge characters of adjacent glyphs in
# place, according to the bit flags in the font's old_layout value.
#
# Rows are expected to contain a NUL byte at every glyph boundary.
class Smusher

  # Lookup tables used by the smushing rules, keyed by an arbitrary id:
  #   24 - characters taking part in the underscore and hierarchy rules
  #    8 - opposite bracket pairs
  #   16 - two-character combinations that collapse into one character
  SYMBOLS = {
    24 => '|/\\[]{}()<>',
    8  => {'[' => ']', ']' => '[', '{' => '}', '}' => '{', '(' => ')', ')' => '('},
    16 => {"/\\" => '|', "\\/" => 'Y', '><' => 'X'}
  }

  # font - any object responding to height, hard_blank and old_layout.
  def initialize(font)
    @font = font
  end

  # Smushes the first glyph boundary of every row of +result+ (an array of
  # strings, modified in place).
  def [](result)
    todo = false

    # Pass 1: try to combine the two characters either side of a boundary.
    @font.height.times do |j|
      result[j] = result[j].sub(pattern) { todo, x = callback(todo, $1, $2); x }
    end
    # Pass 2: drop the boundary marker; if any row was smushed, rows with
    # blank space at the boundary give up one space as well.
    @font.height.times do |j|
      result[j] = if todo
        result[j].sub(/\s\x00(?!$)|\x00\s/, '').sub(/\x00(?!$)/, '')
      else
        result[j].sub(/\x00(?!$)/, '')
      end
    end
  end

  # Regexp matching two visible characters (neither hard blank, NUL nor
  # whitespace) separated by a glyph boundary.
  def pattern
    @pattern ||= begin
      # Escape the hard blank: it is font-defined and may be a character
      # that is special inside a character class (e.g. a backslash).
      blank = Regexp.escape(@font.hard_blank)
      /([^#{blank}\x00\s])\x00([^#{blank}\x00\s])/
    end
  end

  # The rule lookup tables (see SYMBOLS).
  def symbols
    SYMBOLS
  end

  # True if bit +n+ is set in the font's old_layout value.
  def old_layout?(n)
    @font.old_layout & n > 0
  end

  # Decides how boundary characters +a+ (left) and +b+ (right) combine.
  #
  # s - the "something was smushed" flag accumulated so far.
  #
  # Returns [flag, replacement]: flag is true when a rule applied, otherwise
  # +s+ is passed through and the replacement leaves the boundary intact.
  def callback(s, a, b)
    combined = a + b

    if old_layout?(1) && a == b
      # Equal characters collapse into one.
      return true, a
    elsif old_layout?(2) && ('_' == a && symbols[24].include?(b) || '_' == b && symbols[24].include?(a))
      # NOTE(review): this keeps +a+ even when +a+ is the underscore; the
      # FIGfont underscore rule appears to keep the other character. Confirm
      # before changing, as it alters rendered output.
      return true, a
    elsif old_layout?(4) && ((left = symbols[24].index(a)) && (right = symbols[24].index(b)))
      # The character appearing later in the table wins.
      return true, (right > left ? b : a)
    elsif old_layout?(8) && (symbols[8].has_key?(b) && symbols[8][b] == a)
      # Opposite brackets become a vertical bar.
      return true, '|'
    elsif old_layout?(16) && symbols[16].has_key?(combined)
      return true, symbols[16][combined]
    elsif old_layout?(32) && (a == b && @font.hard_blank == a)
      return true, @font.hard_blank
    else
      return s, "#{a}\00#{b}"
    end
  end

end
|
62
|
+
|
63
|
+
end # module Figlet
|
64
|
+
end # module Text
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Text
|
2
|
+
module Figlet
|
3
|
+
|
4
|
+
# Renders a string as FIGlet text using a font.
class Typesetter

  # font    - object responding to has_char?, [], height, old_layout,
  #           right_to_left? and hard_blank.
  # options - optional Hash; :smush => false disables smushing of adjacent
  #           glyphs (it is enabled by default).
  def initialize(font, options = nil)
    @font = font
    @options = options || {}
    @smush = @options.has_key?(:smush) ? @options[:smush] : true
  end

  # Returns +str+ rendered as multi-line text (rows joined by "\n").
  #
  # Characters the font does not define are rendered with glyph 0 when the
  # font has one and are skipped otherwise.
  def [](str)
    result = []

    # Font glyphs are keyed by Integer code, so iterate over codes rather
    # than over one-character Strings.
    char_codes(str).each_with_index do |char, i|
      unless @font.has_char?(char)
        next unless @font.has_char?(0)
        char = 0
      end

      append_glyph result, @font[char]

      # A negative old_layout means full width: no kerning, no smushing.
      next unless @font.old_layout > -1 && i > 0
      kern result
      smush[result] if @smush
    end

    return result.join("\n").gsub(/\0/, '').gsub(@font.hard_blank, ' ')
  end


  private

  # Integer codes for +str+: code points when the string is validly encoded,
  # raw bytes otherwise (and on Rubies without string encodings).
  def char_codes(str)
    if str.respond_to?(:valid_encoding?) && str.valid_encoding?
      str.codepoints.to_a
    else
      str.unpack('C*')
    end
  end

  # Adds each row of +glyph+ to the matching row of +result+, on the side
  # dictated by the font's print direction.
  def append_glyph(result, glyph)
    @font.height.times do |j|
      line = glyph[j]
      result[j] = if result[j].nil?
        line
      elsif @font.right_to_left?
        line + result[j]
      else
        result[j] + line
      end
    end
  end

  # Closes up the blank space around the first glyph boundary (NUL byte) of
  # every row by the largest amount all rows can spare.
  def kern(result)
    gaps = []
    @font.height.times do |j|
      if match = /\S(\s*\x00\s*)\S/.match(result[j])
        gaps << match[1].length
      end
    end

    # The measured gap includes the NUL itself, hence the extra -1.
    diff = (gaps.min || -1) - 1
    return unless diff > 0

    @font.height.times do |j|
      if match = /\x00(\s{0,#{diff}})/.match(result[j])
        b = diff - match[1].length
        result[j] = result[j].sub(/\s{0,#{b}}\x00\s{#{match[1].length}}/, "\0")
      end
    end
  end

  # Lazily built smusher for this font.
  def smush
    @smusher ||= Smusher.new(@font)
  end

end
|
66
|
+
|
67
|
+
end # module Figlet
|
68
|
+
end # module Text
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#
|
2
|
+
# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
|
3
|
+
#
|
4
|
+
# The Levenshtein distance is a measure of how similar two strings s and t are,
|
5
|
+
# calculated as the number of deletions/insertions/substitutions needed to
|
6
|
+
# transform s into t. The greater the distance, the more the strings differ.
|
7
|
+
#
|
8
|
+
# The Levenshtein distance is also sometimes referred to as the
|
9
|
+
# easier-to-pronounce-and-spell 'edit distance'.
|
10
|
+
#
|
11
|
+
# Author: Paul Battley (pbattley@gmail.com)
|
12
|
+
#
|
13
|
+
|
14
|
+
module Text # :nodoc:
|
15
|
+
module Levenshtein

  # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
  #
  # On Rubies whose strings carry an encoding, each validly encoded string is
  # compared code point by code point in its own encoding, so UTF-8 text is
  # handled per character; a string that is not valid in its encoding is
  # compared byte by byte. Both arguments should therefore use the same
  # encoding.
  #
  # On older Rubies the strings are treated as UTF-8 if $KCODE is set
  # appropriately (i.e. 'u') and are otherwise compared byte by byte. There
  # is no specific support for Shift-JIS or EUC strings there.
  #
  # When using Unicode text, be aware that this algorithm does not perform
  # normalisation. If there is a possibility of different normalised forms
  # being used, normalisation should be performed beforehand.
  #
  # Returns the distance as an Integer.
  #
  def distance(str1, str2)
    # $KCODE has no effect on encoding-aware Rubies, so only consult it when
    # strings have no encoding of their own.
    to_units =
      if ''.respond_to?(:valid_encoding?)
        lambda { |str| str.valid_encoding? ? str.codepoints.to_a : str.unpack('C*') }
      else
        rule = ($KCODE =~ /^U/i) ? 'U*' : 'C*'
        lambda { |str| str.unpack(rule) }
      end

    s = to_units.call(str1)
    t = to_units.call(str2)
    n = s.length
    m = t.length
    return m if (0 == n)
    return n if (0 == m)

    # d holds the previous row of the distance matrix; it is overwritten in
    # place as the current row is computed, with e trailing one cell behind.
    d = (0..m).to_a
    x = nil

    (0...n).each do |i|
      e = i+1
      (0...m).each do |j|
        cost = (s[i] == t[j]) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end

    return x
  end

  extend self
end
|
65
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#
|
2
|
+
# An implementation of the Metaphone phonetic coding system in Ruby.
|
3
|
+
#
|
4
|
+
# Metaphone encodes names into a phonetic form such that similar-sounding names
|
5
|
+
# have the same or similar Metaphone encodings.
|
6
|
+
#
|
7
|
+
# The original system was described by Lawrence Philips in Computer Language
|
8
|
+
# Vol. 7 No. 12, December 1990, pp 39-43.
|
9
|
+
#
|
10
|
+
# As there are multiple implementations of Metaphone, each with their own
|
11
|
+
# quirks, I have based this on my interpretation of the algorithm specification.
|
12
|
+
# Even LP's original BASIC implementation appears to contain bugs (specifically
|
13
|
+
# with the handling of CC and MB), when compared to his explanation of the
|
14
|
+
# algorithm.
|
15
|
+
#
|
16
|
+
# I have also compared this implementation with that found in PHP's standard
|
17
|
+
# library, which appears to mimic the behaviour of LP's original BASIC
|
18
|
+
# implementation. For compatibility, these rules can also be used by passing
|
19
|
+
# :buggy=>true to the methods.
|
20
|
+
#
|
21
|
+
# Author: Paul Battley (pbattley@gmail.com)
|
22
|
+
#
|
23
|
+
|
24
|
+
module Text # :nodoc:
|
25
|
+
module Metaphone

  module Rules # :nodoc:all

    # The Metaphone rewrite rules as [pattern, replacement] pairs. They are
    # applied one after another, in the order listed. Replacements are upper
    # case and patterns lower case, so rewritten letters are never matched
    # again by a later rule.
    #
    STANDARD = [
      # Collapse doubled consonants, except g. ([PHP] also leaves c alone.)
      [ /([bcdfhjklmnpqrstvwxyz])\1+/, '\1' ],
      [ /^ae/,                         'E'  ],
      [ /^[gkp]n/,                     'N'  ],
      [ /^wr/,                         'R'  ],
      [ /^x/,                          'S'  ],
      [ /^wh/,                         'W'  ],
      # [PHP] matches mb anywhere, not only at the end.
      [ /mb$/,                         'M'  ],
      [ /(?!^)sch/,                    'SK' ],
      [ /th/,                          '0'  ],
      [ /t?ch|sh/,                     'X'  ],
      [ /c(?=ia)/,                     'X'  ],
      [ /[st](?=i[ao])/,               'X'  ],
      [ /s?c(?=[iey])/,                'S'  ],
      [ /[cq]/,                        'K'  ],
      [ /dg(?=[iey])/,                 'J'  ],
      [ /d/,                           'T'  ],
      [ /g(?=h[^aeiou])/,              ''   ],
      [ /gn(ed)?/,                     'N'  ],
      [ /([^g]|^)g(?=[iey])/,          '\1J' ],
      [ /g+/,                          'K'  ],
      [ /ph/,                          'F'  ],
      [ /([aeiou])h(?=\b|[^aeiou])/,   '\1' ],
      [ /[wy](?![aeiou])/,             ''   ],
      [ /z/,                           'S'  ],
      [ /v/,                           'F'  ],
      [ /(?!^)[aeiou]+/,               ''   ],
    ]

    # Rule set reproducing the 'buggy' behaviour of PHP and similar
    # implementations: identical to STANDARD except for the doubled-consonant
    # rule and the mb rule.
    #
    BUGGY = STANDARD.dup
    BUGGY[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ]
    BUGGY[6] = [ /mb/, 'M' ]
  end

  # Converts +str+ to its Metaphone code. Whitespace-separated words are
  # encoded one by one and joined with single spaces. Only the letters A-Z
  # take part in the encoding, so language-specific preprocessing has to
  # happen before calling this.
  #
  # Pass :buggy => true in +options+ to use the alternate 'buggy' rules.
  #
  def metaphone(str, options={})
    words = str.strip.split(/\s+/)
    encoded = words.map { |word| metaphone_word(word, options) }
    return encoded.join(' ')
  end

private

  # Encodes a single word with the selected rule set.
  def metaphone_word(w, options={})
    rule_set = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
    # Lower-case the word and drop every character that is not a-z.
    letters = w.downcase.gsub(/[^a-z]/, '')
    # Feed the result of each rule into the next one.
    code = rule_set.inject(letters) do |text, (pattern, replacement)|
      text.gsub(pattern, replacement)
    end
    return code.upcase
  end

  extend self

end
|
97
|
+
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# This is the Porter Stemming algorithm, ported to Ruby from the
|
3
|
+
# version coded up in Perl. It's easy to follow against the rules
|
4
|
+
# in the original paper in:
|
5
|
+
#
|
6
|
+
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
7
|
+
# no. 3, pp 130-137,
|
8
|
+
#
|
9
|
+
# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
|
10
|
+
#
|
11
|
+
module Text # :nodoc:
|
12
|
+
module PorterStemming

  # Step 2 suffix replacements.
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }

  # Step 3 suffix replacements.
  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes removed outright by step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"             # consonant
  V = "[aeiouy]"             # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  # consonant-vowel-consonant word whose last letter is not w, x or y
  SHORT_SYLLABLE = /^#{CC}#{V}[^aeiouwxy]$/o

  # Returns the Porter stem of +word+. The argument is not modified; words
  # shorter than three characters are returned as an unchanged copy.
  def self.stem(word)

    # make a copy of the given object and convert it to a string.
    word = word.dup.to_str

    return word if word.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    word[0] = 'Y' if word[0] == ?y

    word = step_1a(word)
    word = step_1b(word)
    word = step_1c(word)
    word = step_2(word)
    word = step_3(word)
    word = step_4(word)
    word = step_5(word)

    # and turn initial Y back to y
    word[0] = 'y' if word[0] == ?Y

    word
  end

  # Step 1a: plurals (sses -> ss, ies -> i, s -> nothing unless after an s).
  def self.step_1a(word)
    if word =~ /(ss|i)es$/
      $` + $1
    elsif word =~ /([^s])s$/
      $` + $1
    else
      word
    end
  end

  # Step 1b: eed -> ee when m>0; otherwise strip ed/ing from a stem that
  # contains a vowel and tidy up what is left.
  def self.step_1b(word)
    if word =~ /eed$/
      stem = $`
      return (stem =~ MGR0) ? word.chop : word
    end

    return word unless word =~ /(ed|ing)$/
    stem = $`
    return word unless stem =~ VOWEL_IN_STEM

    case stem
    when /(at|bl|iz)$/             then stem + "e"
    when /([^aeiouylsz])\1$/       then stem.chop
    when SHORT_SYLLABLE            then stem + "e"
    else stem
    end
  end

  # Step 1c: final y -> i when the stem contains a vowel.
  def self.step_1c(word)
    return word unless word =~ /y$/
    stem = $`
    (stem =~ VOWEL_IN_STEM) ? stem + "i" : word
  end

  # Step 2: map double suffixes to single ones when m>0.
  def self.step_2(word)
    return word unless word =~ SUFFIX_1_REGEXP
    stem, suffix = $`, $1
    (stem =~ MGR0) ? stem + STEP_2_LIST[suffix] : word
  end

  # Step 3: shorten or drop -icate, -ative, -ful, -ness etc. when m>0.
  def self.step_3(word)
    return word unless word =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
    stem, suffix = $`, $1
    (stem =~ MGR0) ? stem + STEP_3_LIST[suffix] : word
  end

  # Step 4: drop a remaining suffix (or the "ion" of sion/tion) when m>1.
  def self.step_4(word)
    if word =~ SUFFIX_2_REGEXP
      stem = $`
    elsif word =~ /(s|t)(ion)$/
      stem = $` + $1
    else
      return word
    end
    (stem =~ MGR1) ? stem : word
  end

  # Step 5: drop a final e (m>1, or m=1 and the stem is not a short
  # syllable), then reduce a final ll to l when m>1.
  def self.step_5(word)
    if word =~ /e$/
      stem = $`
      if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ SHORT_SYLLABLE)
        word = stem
      end
    end

    (word =~ /ll$/ && word =~ MGR1) ? word.chop : word
  end

  private_class_method :step_1a, :step_1b, :step_1c,
                       :step_2, :step_3, :step_4, :step_5

end
|
171
|
+
end
|