lingo 1.8.1 → 1.8.2
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/lib/lingo/attendee/noneword_filter.rb

@@ -71,32 +71,39 @@ class Lingo
       protected

       def init
-        @nonewords = []
+        @nonewords, @sort = [], get_key('sort', true)
       end

-
-      # For each file, a new set of unrecognized words is registered.
-      def control(cmd, par)
+      def control(cmd, param)
         case cmd
           when STR_CMD_FILE
             @nonewords.clear
           when STR_CMD_EOL
             skip_command
           when STR_CMD_RECORD, STR_CMD_EOF
-            nones = @nonewords.sort.uniq
-            nones.each(&method(:forward))
-            add('Objekte gefiltert', nones.size)
-            @nonewords.clear
+            send_nonewords unless @nonewords.empty?
         end
       end

       def process(obj)
         if obj.is_a?(Word) && obj.unknown?
           inc('Anzahl nicht erkannter Wörter')
-          @nonewords << obj.form.downcase
+
+          non = obj.form.downcase
+          @sort ? @nonewords << non : forward(non)
         end
       end

+      private
+
+      def send_nonewords
+        @nonewords.sort!
+        @nonewords.uniq!
+
+        add('Objekte gefiltert', @nonewords.size)
+        @nonewords.each(&method(:forward)).clear
+      end
+
     end

     # For backwards compatibility.
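The noneword filter change above makes sorting optional: with the new 'sort' option (default true) nonewords are collected per record and emitted sorted and deduplicated by send_nonewords, while with sort disabled each one is forwarded immediately. A minimal standalone sketch of that flow in plain Ruby (forward is stubbed with puts, and the Attendee plumbing such as get_key and command handling is omitted):

    # Sketch of the collect-and-sort vs. pass-through behaviour.
    class NonewordSink
      def initialize(sort = true)
        @sort, @nonewords = sort, []
      end

      def process(form)
        non = form.downcase
        @sort ? @nonewords << non : forward(non)  # mirrors the new process(obj)
      end

      # roughly what send_nonewords does at record/EOF boundaries
      def flush
        @nonewords.sort!
        @nonewords.uniq!
        @nonewords.each { |n| forward(n) }.clear
      end

      def forward(non)
        puts non
      end
    end

    sink = NonewordSink.new
    %w[Zb Ab Zb].each { |w| sink.process(w) }
    sink.flush  # prints "ab" then "zb"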
data/lib/lingo/attendee/object_filter.rb

@@ -59,7 +59,7 @@ class Lingo
 #     - text_reader: { out: lines, files: '$(files)' }
 #     - tokenizer: { in: lines, out: token }
 #     - word_searcher: { in: token, out: words, source: 'sys-dic' }
-#     - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==
+#     - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_NOUN' }
 #     - debugger: { in: filtr, prompt: 'out>' }
 # yields the output via the debugger: <tt>lingo -c t1 test.txt</tt>
 #   out> *FILE('test.txt')
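The corrected 'objects' expression is a Ruby predicate evaluated against each passing object (obj), here keeping only words whose first lexical is a noun. A rough sketch of how such a string predicate can be applied, assuming it is simply eval'd with obj in scope, which the corrected line suggests but which is not spelled out in this hunk:

    # Hypothetical stand-in for the object_filter predicate mechanism.
    objects = 'obj.kind_of?(String) && obj.size > 3'

    %w[a abcd xyz wxyz].each do |obj|
      puts obj if eval(objects)  # 'obj' is visible to the eval'd expression
    end
    # prints "abcd" and "wxyz"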
data/lib/lingo/attendee/sequencer.rb

@@ -96,92 +96,61 @@ class Lingo
       protected

       def init
-
-        @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map(&:upcase)
-        @seq_strings = get_key('sequences').map { |e| WordSequence.new(*e) }
+        @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)

-
+        @seq = get_key('sequences').map { |string, format|
+          [string = string.downcase, string.split(//), format]
+        }
+
+        raise MissingConfigError.new(:sequences) if @seq.empty?
       end

-      def control(cmd, par)
-        # Every control object also triggers processing.
+      def control(cmd, param)
         process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
       end

       def process_buffer?
-
-        item = @buffer.last
-        (item.is_a?(WordForm) && @stopper.include?(item.attr.upcase)) ||
-        (item.is_a?(Word) && item.unknown?)
+        (obj = @buffer.last).is_a?(WordForm) && (obj.is_a?(Word) &&
+          obj.unknown? || @stopper.include?(obj.attr.upcase))
       end

       def process_buffer
-
-
-        unless @buffer.size < 2
-          matches = Hash.new { |h, k| h[k] = [] }
-
-          sequences(@buffer.map { |obj|
-            obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
-          }).uniq.each { |sequence|
-            @seq_strings.each { |wordseq|
-              wordseq.scan(sequence) { |pos, form, classes|
-                inc('Anzahl erkannter Sequenzen')
-
-                classes.each_with_index { |wc, index|
-                  @buffer[pos + index].lexicals.find { |lex|
-                    form.gsub!(index.succ.to_s, lex.form) if lex.attr == wc
-                  } or break
-                } or next
-
-                matches[pos] << form
-              }
-            }
-          }
-
-          matches.sort.each { |pos, forms|
-            forms.uniq.each { |form|
-              deferred_insert(pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE))
-            }
-          }
-        end
-
+        insert_sequences if @buffer.size > 1
         forward_buffer
       end

       private

-      def
-
+      def insert_sequences
+        matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq

-        map.
-
-        res.each { |wc1| classes.each { |wc2| temp << (wc1 + wc2) } }
-        res = temp
+        map = buf.map { |obj|
+          obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
         }

-
-
-
-
+        map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
+          seq.each { |string, classes, format|
+            while pos = q.index(string, pos || 0)
+              inc('Anzahl erkannter Sequenzen')

-
+              fmt = format.dup

-
-
-
-
-
+              classes.each_with_index { |wc, i|
+                buf[pos + i].lexicals.find { |l|
+                  fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
+                } or break
+              } or next

-
-        pos = 0
+              matches[pos] << fmt

-
-
-
-
-        end
+              pos += 1
+            end
+          }
+        }

+        matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
+          @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
+        } }
       end

     end
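The rewritten sequencer drops the old WordSequence helper: each buffer slot now contributes its possible word classes (or '#' for unusable tokens), the cartesian product of those lists yields every class string the buffer can spell, and each configured sequence is then located with plain String#index. A toy illustration of that enumeration idiom (the class letters and the sequence 'as' are made up for the example):

    # Each position's candidate classes; '#' marks a token that can't match.
    map = [['a'], ['a', 's'], ['s'], ['#']]

    # Cartesian product over all positions, joined into class strings.
    seqs = map.shift.product(*map).map!(&:join).tap(&:uniq!)
    p seqs  # => ["aas#", "ass#"]

    # Scan each string for the configured sequence, as the while loop does.
    seqs.each do |q|
      pos = nil
      while pos = q.index('as', pos || 0)
        puts "'as' found at #{pos} in #{q}"
        pos += 1
      end
    end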
data/lib/lingo/attendee/stemmer/porter.rb (new file)

@@ -0,0 +1,343 @@
+# encoding: utf-8
+
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+
+class Lingo
+
+  class Attendee
+
+    class Stemmer
+
+      module Porter
+
+        extend self
+
+        # Rules for Porter-Stemmer, based on:
+        #
+        # An algorithm for suffix stripping
+        #
+        # M.F. Porter
+        # 1980
+        #
+        # Originally published in Program, 14 no. 3, pp 130-137, July 1980.
+        # (A few typos have been corrected.)
+        #
+        # http://tartarus.org/~martin/PorterStemmer/def.txt
+        #
+        # -------------------------------------------------------------------
+        #
+        # 2. THE ALGORITHM
+        #
+        # To present the suffix stripping algorithm in its entirety we will
+        # need a few definitions.
+        #
+        # A _consonant_ in a word is a letter other than A, E, I, O or U,
+        # and other than Y preceded by a consonant. (The fact that the term
+        # `consonant' is defined to some extent in terms of itself does not
+        # make it ambiguous.) So in TOY the consonants are T and Y, and in
+        # SYZYGY they are S, Z and G. If a letter is not a consonant it is
+        # a _vowel_.
+        #
+        # A consonant will be denoted by c, a vowel by v. A list ccc... of
+        # length greater than 0 will be denoted by C, and a list vvv... of
+        # length greater than 0 will be denoted by V. Any word, or part of
+        # a word, therefore has one of the four forms:
+        #
+        #     CVCV ... C
+        #     CVCV ... V
+        #     VCVC ... C
+        #     VCVC ... V
+        #
+        # These may all be represented by the single form
+        #
+        #     [C]VCVC ... [V]
+        #
+        # where the square brackets denote arbitrary presence of their
+        # contents. Using (VC){m} to denote VC repeated m times, this
+        # may again be written as
+        #
+        #     [C](VC){m}[V].
+        #
+        # m will be called the _measure_ of any word or word part when
+        # represented in this form. The case m = 0 covers the null word.
+        # Here are some examples:
+        #
+        #     m=0    TR, EE, TREE, Y, BY.
+        #     m=1    TROUBLE, OATS, TREES, IVY.
+        #     m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
+        #
+        # The _rules_ for removing a suffix will be given in the form
+        #
+        #     (condition) S1 -> S2
+        #
+        # This means that if a word ends with the suffix S1, and the stem
+        # before S1 satisfies the given condition, S1 is replaced by S2.
+        # The condition is usually given in terms of m, e.g.
+        #
+        #     (m > 1) EMENT ->
+        #
+        # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to
+        # REPLAC, since REPLAC is a word part for which m = 2.
+        #
+        # The `condition' part may also contain the following:
+        #
+        # *S  - the stem ends with S (and similarly for the other letters).
+        #
+        # *v* - the stem contains a vowel.
+        #
+        # *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+        #
+        # *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+        #       -WIL, -HOP).
+        #
+        # And the condition part may also contain expressions with _and_,
+        # _or_ and _not_, so that
+        #
+        #     (m>1 and (*S or *T))
+        #
+        # tests for a stem with m>1 ending in S or T, while
+        #
+        #     (*d and not (*L or *S or *Z))
+        #
+        # tests for a stem ending with a double consonant other than L, S
+        # or Z. Elaborate conditions like this are required only rarely.
+        #
+        # In a set of rules written beneath each other, only one is obeyed,
+        # and this will be the one with the longest matching S1 for the
+        # given word. For example, with
+        #
+        #     SSES -> SS
+        #     IES  -> I
+        #     SS   -> SS
+        #     S    ->
+        #
+        # (here the conditions are all null) CARESSES maps to CARESS since
+        # SSES is the longest match for S1. Equally CARESS maps to CARESS
+        # (S1=`SS') and CARES to CARE (S1=`S').
+        #
+        # In the rules below, examples of their application, successful or
+        # otherwise, are given on the right in lower case. The algorithm
+        # now follows: see RULES.
+        #
+        # The algorithm is careful not to remove a suffix when the stem is
+        # too short, the length of the stem being given by its measure, m.
+        # There is no linguistic basis for this approach. It was merely
+        # observed that m could be used quite effectively to help decide
+        # whether or not it was wise to take off a suffix.
+        #
+        # -------------------------------------------------------------------
+
+        #
+
+        RULES = {
+          # Step 1a
+          S100: [
+            'SSES -> SS',                 # caresses -> caress
+            'IES -> I',                   # ponies -> poni, ties -> ti
+            'SS -> SS',                   # caress -> caress
+            'S -> '                       # cats -> cat
+          ],
+
+          # Step 1b
+          S110: [
+            '(m>0) EED -> EE goto(S120)', # agreed -> agree, feed -> feed
+            '(*v*) ED -> goto(S111)',     # plastered -> plaster, bled -> bled
+            '(*v*) ING -> goto(S111)',    # motoring -> motor, sing -> sing
+            'goto(S120)'
+          ],
+
+          # If the second or third of the rules in Step 1b is successful,
+          # the following is done:
+          S111: [
+            'AT -> ATE',                  # conflat(ed) -> conflate
+            'BL -> BLE',                  # troubl(ed) -> trouble
+            'IZ -> IZE',                  # siz(ed) -> size
+            '(*d and not (*L or *S or *Z)) -> -1', # hopp(ing) -> hop
+                                          # tann(ed) -> tan
+                                          # fall(ing) -> fall
+                                          # hiss(ing) -> hiss
+                                          # fizz(ed) -> fizz
+            '(m=1 and *o) -> E'           # fail(ing) -> fail
+                                          # fil(ing) -> file
+          ],
+
+          # The rule to map to a single letter causes the removal of one of
+          # the double letter pair. The -E is put back on -AT, -BL and -IZ,
+          # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
+          # This E may be removed in step 4.
+
+          # Step 1c
+          S120: [
+            '(*v*) Y -> I'                # happy -> happi, sky -> sky
+          ],
+
+          # Step 1 deals with plurals and past participles. The subsequent
+          # steps are much more straightforward.
+
+          # Step 2
+          S200: [
+            '(m>0) ATIONAL -> ATE',       # relational -> relate
+            '(m>0) TIONAL -> TION',       # conditional -> condition, rational -> rational
+            '(m>0) ENCI -> ENCE',         # valenci -> valence
+            '(m>0) ANCI -> ANCE',         # hesitanci -> hesitance
+            '(m>0) IZER -> IZE',          # digitizer -> digitize
+            '(m>0) ABLI -> ABLE',         # conformabli -> conformable
+            '(m>0) ALLI -> AL',           # radicalli -> radical
+            '(m>0) ENTLI -> ENT',         # differentli -> different
+            '(m>0) ELI -> E',             # vileli -> vile
+            '(m>0) OUSLI -> OUS',         # analogousli -> analogous
+            '(m>0) IZATION -> IZE',       # vietnamization -> vietnamize
+            '(m>0) ATION -> ATE',         # predication -> predicate
+            '(m>0) ATOR -> ATE',          # operator -> operate
+            '(m>0) ALISM -> AL',          # feudalism -> feudal
+            '(m>0) IVENESS -> IVE',       # decisiveness -> decisive
+            '(m>0) FULNESS -> FUL',       # hopefulness -> hopeful
+            '(m>0) OUSNESS -> OUS',       # callousness -> callous
+            '(m>0) ALITI -> AL',          # formaliti -> formal
+            '(m>0) IVITI -> IVE',         # sensitiviti -> sensitive
+            '(m>0) BILITI -> BLE'         # sensibiliti -> sensible
+          ],
+
+          # The test for the string S1 can be made fast by doing a program
+          # switch on the penultimate letter of the word being tested. This
+          # gives a fairly even breakdown of the possible values of the
+          # string S1. It will be seen in fact that the S1-strings in step 2
+          # are presented here in the alphabetical order of their penultimate
+          # letter. Similar techniques may be applied in the other steps.
+
+          # Step 3
+          S300: [
+            '(m>0) ICATE -> IC',          # triplicate -> triplic
+            '(m>0) ATIVE -> ',            # formative -> form
+            '(m>0) ALIZE -> AL',          # formalize -> formal
+            '(m>0) ICITI -> IC',          # electriciti -> electric
+            '(m>0) ICAL -> IC',           # electrical -> electric
+            '(m>0) FUL -> ',              # hopeful -> hope
+            '(m>0) NESS -> '              # goodness -> good
+          ],
+
+          # Step 4
+          S400: [
+            '(m>1) AL -> ',               # revival -> reviv
+            '(m>1) ANCE -> ',             # allowance -> allow
+            '(m>1) ENCE -> ',             # inference -> infer
+            '(m>1) ER -> ',               # airliner -> airlin
+            '(m>1) IC -> ',               # gyroscopic -> gyroscop
+            '(m>1) ABLE -> ',             # adjustable -> adjust
+            '(m>1) IBLE -> ',             # defensible -> defens
+            '(m>1) ANT -> ',              # irritant -> irrit
+            '(m>1) EMENT -> ',            # replacement -> replac
+            '(m>1) MENT -> ',             # adjustment -> adjust
+            '(m>1) ENT -> ',              # dependent -> depend
+            '(m>1 and (*S or *T)) ION -> ', # adoption -> adopt
+            '(m>1) OU -> ',               # homologou -> homolog
+            '(m>1) ISM -> ',              # communism -> commun
+            '(m>1) ATE -> ',              # activate -> activ
+            '(m>1) ITI -> ',              # angulariti -> angular
+            '(m>1) OUS -> ',              # homologous -> homolog
+            '(m>1) IVE -> ',              # effective -> effect
+            '(m>1) IZE -> '               # bowdlerize -> bowdler
+          ],
+
+          # The suffixes are now removed. All that remains is a little
+          # tidying up.
+
+          # Step 5a
+          S500: [
+            '(m>1) E -> ',                # probate -> probat, rate -> rate
+            '(m=1 and not *o) E -> '      # cease -> ceas
+          ],
+
+          # Step 5b
+          S510: [
+            '(m > 1 and *d and *L) -> -1' # controll -> control, roll -> roll
+          ]
+        }
+
+        GOTO_RE = %r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
+
+        RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
+
+        def stem(word, found = false)
+          goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }
+
+          RULES.each { |key, rules|
+            next if goto && goto != key.to_s
+
+            rules.each { |rule|
+              case rule
+                when RULE_RE
+                  cond, repl, goto = $1, $3, $4
+                  stem = word[/(.+)#{$2.downcase}$/, 1] or next
+                when GOTO_RE
+                  goto = $1
+                  break
+              end
+
+              conv[shad = stem.dup,
+                /[^aeiouy]/ => 'c',
+                /[aeiou]/   => 'v',
+                /cy/        => 'cv',
+                /y/         => 'c'
+              ]
+
+              if cond
+                conv[cond,
+                  'm'   => shad.scan(/vc/).size,
+                  '*v*' => shad.include?('v'),
+                  '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
+                  '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
+                  'and' => '&&',
+                  'or'  => '||',
+                  'not' => '!',
+                  '='   => '=='
+                ]
+
+                last.upcase! if last
+                cond.gsub!(/\*(\w)/) { last == $1 }
+
+                next unless eval(cond)
+              end
+
+              found, word = true, begin
+                stem[0...Integer(repl)]
+              rescue ArgumentError
+                stem << repl.downcase
+              end
+
+              break
+            }
+          }
+
+          word if found
+        end
+
+      end
+
+    end
+
+  end
+
+end
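At the heart of stem is the conversion of a candidate stem into a consonant/vowel "shadow" (the conv[shad = stem.dup, ...] call), from which the measure m of the Porter definition is just the number of 'vc' pairs. That computation, extracted into a standalone sketch:

    # Measure m per the Porter definition, as derived in stem's conv call.
    def measure(stem)
      shad = stem.dup
      { /[^aeiouy]/ => 'c',   # consonants (y handled below)
        /[aeiou]/   => 'v',   # vowels
        /cy/        => 'cv',  # y after a consonant acts as a vowel
        /y/         => 'c'    # any remaining y is a consonant
      }.each { |re, sub| shad.gsub!(re, sub) }
      shad.scan(/vc/).size
    end

    %w[tree trouble troubles private].each { |w| puts "#{w}: m=#{measure(w)}" }
    # tree: m=0, trouble: m=1, troubles: m=2, private: m=2
    # (matches the examples in the comment block above)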
data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb}

@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 #--
 ###############################################################################
 #                                                                             #
@@ -22,3 +24,34 @@
 ###############################################################################
 #++

+class Lingo
+
+  class Attendee
+
+    class Stemmer < self
+
+      protected
+
+      def init
+        extend(Lingo.get_const(get_key('type', 'porter'), self.class))
+
+        @wc  = get_key('wordclass', LA_STEM)
+        @all = get_key('mode', '').downcase == 'all'
+      end
+
+      def process(obj)
+        if obj.is_a?(Word) && obj.unknown?
+          stem = stem(obj.form.downcase, @all)
+          obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
+        end
+
+        forward(obj)
+      end
+
+    end
+
+  end
+
+end
+
+require_relative 'stemmer/porter'
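The new Stemmer attendee selects its algorithm at runtime by extending a module resolved from the 'type' option ('porter' by default). Reduced to plain Ruby, the dispatch pattern looks like this (const_get stands in for Lingo.get_const, and the stemming rule is a dummy):

    class Stemmer
      module Porter
        def stem(word)
          word.sub(/s$/, '')  # stand-in, not the real Porter rules
        end
      end

      def initialize(type = 'porter')
        # Resolve e.g. 'porter' => Stemmer::Porter and mix it into this
        # single instance, so self.stem dispatches to the chosen module.
        extend(self.class.const_get(type.capitalize))
      end
    end

    puts Stemmer.new.stem('cats')  # => cat

In a pipeline configuration the attendee would presumably be wired up like the other attendees shown above, with type, wordclass and mode as its options; the in/out channel names here are illustrative only:

    - stemmer: { in: words, out: stems, type: 'porter', mode: 'all' }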
data/lib/lingo/attendee/synonymer.rb

@@ -74,24 +74,25 @@ class Lingo

       def init
         set_dic
-        @skip = get_array('skip', WA_UNKNOWN
+        @skip = get_array('skip', WA_UNKNOWN, :upcase)
       end

-      def control(cmd, par)
-
+      def control(cmd, param)
+        report_on(cmd, @dic)
       end

       def process(obj)
-        if obj.is_a?(Word) &&
+        if obj.is_a?(Word) && !@skip.include?(obj.attr)
           inc('Anzahl gesuchter Wörter')

-          synos = @dic.find_synonyms(obj)
-
-          obj.lexicals += synos.sort.uniq
+          unless (syn = @dic.find_synonyms(obj)).empty?
+            inc('Anzahl erweiteter Wörter')

-
-
+            obj.add_lexicals(syn.tap(&:uniq!))
+            add('Anzahl gefundener Synonyme', syn.size)
+          end
         end
+
         forward(obj)
       end

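The synonymer now uses syn.tap(&:uniq!) rather than building a new array with sort.uniq. The idiom matters because Array#uniq! returns nil when there was nothing to remove, while tap always returns the (possibly mutated) receiver, so the result is safe to pass straight to add_lexicals:

    syn = ['haus', 'gebäude', 'haus']
    p syn.tap(&:uniq!)          # => ["haus", "gebäude"]

    p ['a', 'b'].uniq!          # => nil   (unsafe to pass along)
    p ['a', 'b'].tap(&:uniq!)   # => ["a", "b"]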