lingo 1.8.1 → 1.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/lib/lingo/attendee/noneword_filter.rb

@@ -71,32 +71,39 @@ class Lingo
       protected
 
       def init
-        @nonewords = []
+        @nonewords, @sort = [], get_key('sort', true)
       end
 
-
-      # A new set of unrecognized words is registered for each file.
-      def control(cmd, par)
+      def control(cmd, param)
         case cmd
           when STR_CMD_FILE
             @nonewords.clear
           when STR_CMD_EOL
             skip_command
           when STR_CMD_RECORD, STR_CMD_EOF
-
-            nones.each(&method(:forward))
-            add('Objekte gefiltert', nones.size)
-            @nonewords.clear
+            send_nonewords unless @nonewords.empty?
         end
       end
 
       def process(obj)
         if obj.is_a?(Word) && obj.unknown?
           inc('Anzahl nicht erkannter Wörter')
-
+
+          non = obj.form.downcase
+          @sort ? @nonewords << non : forward(non)
         end
       end
 
+      private
+
+      def send_nonewords
+        @nonewords.sort!
+        @nonewords.uniq!
+
+        add('Objekte gefiltert', @nonewords.size)
+        @nonewords.each(&method(:forward)).clear
+      end
+
     end
 
     # For backwards compatibility.
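The new 'sort' option (enabled by default) decides when nonewords leave the filter: collected, then sorted and deduplicated in one batch per record or file, versus forwarded immediately as they are seen. A minimal standalone sketch of the two modes, with made-up input forms:

    nonewords, sort = [], true

    %w[Zebra Apfel apfel].each do |form|
      non = form.downcase
      sort ? nonewords << non : puts(non) # sort: false would forward right away
    end

    # what send_nonewords then forwards in one batch:
    nonewords.sort!
    nonewords.uniq!
    nonewords #=> ["apfel", "zebra"]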
data/lib/lingo/attendee/object_filter.rb

@@ -59,7 +59,7 @@ class Lingo
 #   - text_reader: { out: lines, files: '$(files)' }
 #   - tokenizer: { in: lines, out: token }
 #   - word_searcher: { in: token, out: words, source: 'sys-dic' }
-#   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==
+#   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_NOUN' }
 #   - debugger: { in: filtr, prompt: 'out>' }
 # produces the following output via the debugger: <tt>lingo -c t1 test.txt</tt>
 #   out> *FILE('test.txt')
data/lib/lingo/attendee/sequencer.rb

@@ -96,92 +96,61 @@ class Lingo
       protected
 
       def init
-
-        @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map(&:upcase)
-        @seq_strings = get_key('sequences').map { |e| WordSequence.new(*e) }
+        @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
 
-
+        @seq = get_key('sequences').map { |string, format|
+          [string = string.downcase, string.split(//), format]
+        }
+
+        raise MissingConfigError.new(:sequences) if @seq.empty?
       end
 
-      # Every control object also triggers processing.
-      def control(cmd, par)
+      def control(cmd, param)
         process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
       end
 
       def process_buffer?
-
-        item = @buffer.last
-        (item.is_a?(WordForm) && @stopper.include?(item.attr.upcase)) ||
-        (item.is_a?(Word) && item.unknown?)
+        (obj = @buffer.last).is_a?(WordForm) && (obj.is_a?(Word) &&
+          obj.unknown? || @stopper.include?(obj.attr.upcase))
       end
 
       def process_buffer
-
-
-        unless @buffer.size < 2
-          matches = Hash.new { |h, k| h[k] = [] }
-
-          sequences(@buffer.map { |obj|
-            obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
-          }).uniq.each { |sequence|
-            @seq_strings.each { |wordseq|
-              wordseq.scan(sequence) { |pos, form, classes|
-                inc('Anzahl erkannter Sequenzen')
-
-                classes.each_with_index { |wc, index|
-                  @buffer[pos + index].lexicals.find { |lex|
-                    form.gsub!(index.succ.to_s, lex.form) if lex.attr == wc
-                  } or break
-                } or next
-
-                matches[pos] << form
-              }
-            }
-          }
-
-          matches.sort.each { |pos, forms|
-            forms.uniq.each { |form|
-              deferred_insert(pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE))
-            }
-          }
-        end
-
+        insert_sequences if @buffer.size > 1
         forward_buffer
       end
 
       private
 
-      def
-
+      def insert_sequences
+        matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq
 
-        map.
-
-          res.each { |wc1| classes.each { |wc2| temp << (wc1 + wc2) } }
-          res = temp
+        map = buf.map { |obj|
+          obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
         }
 
-
-
-
-
+        map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
+          seq.each { |string, classes, format|
+            while pos = q.index(string, pos || 0)
+              inc('Anzahl erkannter Sequenzen')
 
-
+              fmt = format.dup
 
-
-
-
-
-
+              classes.each_with_index { |wc, i|
+                buf[pos + i].lexicals.find { |l|
+                  fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
+                } or break
+              } or next
 
-
-        pos = 0
+              matches[pos] << fmt
 
-
-
-
-
-        end
+              pos += 1
+            end
+          }
+        }
 
+        matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
+          @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
+        } }
       end
 
     end
data/lib/lingo/attendee/stemmer/porter.rb (new file)

@@ -0,0 +1,343 @@
+# encoding: utf-8
+
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+
+class Lingo
+
+  class Attendee
+
+    class Stemmer
+
+      module Porter
+
+        extend self
+
+        # Rules for Porter-Stemmer, based on:
+        #
+        #   An algorithm for suffix stripping
+        #
+        #   M.F. Porter
+        #   1980
+        #
+        # Originally published in Program, 14 no. 3, pp 130-137, July 1980.
+        # (A few typos have been corrected.)
+        #
+        # http://tartarus.org/~martin/PorterStemmer/def.txt
+        #
+        # -------------------------------------------------------------------
+        #
+        # 2. THE ALGORITHM
+        #
+        # To present the suffix stripping algorithm in its entirety we will
+        # need a few definitions.
+        #
+        # A _consonant_ in a word is a letter other than A, E, I, O or U,
+        # and other than Y preceded by a consonant. (The fact that the term
+        # `consonant' is defined to some extent in terms of itself does not
+        # make it ambiguous.) So in TOY the consonants are T and Y, and in
+        # SYZYGY they are S, Z and G. If a letter is not a consonant it is
+        # a _vowel_.
+        #
+        # A consonant will be denoted by c, a vowel by v. A list ccc... of
+        # length greater than 0 will be denoted by C, and a list vvv... of
+        # length greater than 0 will be denoted by V. Any word, or part of
+        # a word, therefore has one of the four forms:
+        #
+        #   CVCV ... C
+        #   CVCV ... V
+        #   VCVC ... C
+        #   VCVC ... V
+        #
+        # These may all be represented by the single form
+        #
+        #   [C]VCVC ... [V]
+        #
+        # where the square brackets denote arbitrary presence of their
+        # contents. Using (VC){m} to denote VC repeated m times, this
+        # may again be written as
+        #
+        #   [C](VC){m}[V].
+        #
+        # m will be called the _measure_ of any word or word part when
+        # represented in this form. The case m = 0 covers the null word.
+        # Here are some examples:
+        #
+        #   m=0    TR,  EE,  TREE,  Y,  BY.
+        #   m=1    TROUBLE,  OATS,  TREES,  IVY.
+        #   m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
+        #
+        # The _rules_ for removing a suffix will be given in the form
+        #
+        #   (condition) S1 -> S2
+        #
+        # This means that if a word ends with the suffix S1, and the stem
+        # before S1 satisfies the given condition, S1 is replaced by S2.
+        # The condition is usually given in terms of m, e.g.
+        #
+        #   (m > 1) EMENT ->
+        #
+        # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to
+        # REPLAC, since REPLAC is a word part for which m = 2.
+        #
+        # The `condition' part may also contain the following:
+        #
+        #   *S  - the stem ends with S (and similarly for the other letters).
+        #
+        #   *v* - the stem contains a vowel.
+        #
+        #   *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+        #
+        #   *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+        #         -WIL, -HOP).
+        #
+        # And the condition part may also contain expressions with _and_,
+        # _or_ and _not_, so that
+        #
+        #   (m>1 and (*S or *T))
+        #
+        # tests for a stem with m>1 ending in S or T, while
+        #
+        #   (*d and not (*L or *S or *Z))
+        #
+        # tests for a stem ending with a double consonant other than L, S
+        # or Z. Elaborate conditions like this are required only rarely.
+        #
+        # In a set of rules written beneath each other, only one is obeyed,
+        # and this will be the one with the longest matching S1 for the
+        # given word. For example, with
+        #
+        #   SSES -> SS
+        #   IES  -> I
+        #   SS   -> SS
+        #   S    ->
+        #
+        # (here the conditions are all null) CARESSES maps to CARESS since
+        # SSES is the longest match for S1. Equally CARESS maps to CARESS
+        # (S1=`SS') and CARES to CARE (S1=`S').
+        #
+        # In the rules below, examples of their application, successful or
+        # otherwise, are given on the right in lower case. The algorithm
+        # now follows: see RULES.
+        #
+        # The algorithm is careful not to remove a suffix when the stem is
+        # too short, the length of the stem being given by its measure, m.
+        # There is no linguistic basis for this approach. It was merely
+        # observed that m could be used quite effectively to help decide
+        # whether or not it was wise to take off a suffix.
+        #
+        # -------------------------------------------------------------------
+
+        #
+
+        RULES = {
+          # Step 1a
+          S100: [
+            'SSES -> SS', # caresses -> caress
+            'IES -> I',   # ponies -> poni, ties -> ti
+            'SS -> SS',   # caress -> caress
+            'S -> '       # cats -> cat
+          ],
+
+          # Step 1b
+          S110: [
+            '(m>0) EED -> EE goto(S120)', # agreed -> agree, feed -> feed
+            '(*v*) ED -> goto(S111)',     # plastered -> plaster, bled -> bled
+            '(*v*) ING -> goto(S111)',    # motoring -> motor, sing -> sing
+            'goto(S120)'
+          ],
+
+          # If the second or third of the rules in Step 1b is successful,
+          # the following is done:
+          S111: [
+            'AT -> ATE',                           # conflat(ed) -> conflate
+            'BL -> BLE',                           # troubl(ed) -> trouble
+            'IZ -> IZE',                           # siz(ed) -> size
+            '(*d and not (*L or *S or *Z)) -> -1', # hopp(ing) -> hop
+                                                   # tann(ed) -> tan
+                                                   # fall(ing) -> fall
+                                                   # hiss(ing) -> hiss
+                                                   # fizz(ed) -> fizz
+            '(m=1 and *o) -> E'                    # fail(ing) -> fail
+                                                   # fil(ing) -> file
+          ],
+
+          # The rule to map to a single letter causes the removal of one of
+          # the double letter pair. The -E is put back on -AT, -BL and -IZ,
+          # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
+          # This E may be removed in step 4.
+
+          # Step 1c
+          S120: [
+            '(*v*) Y -> I' # happy -> happi, sky -> sky
+          ],
+
+          # Step 1 deals with plurals and past participles. The subsequent
+          # steps are much more straightforward.
+
+          # Step 2
+          S200: [
+            '(m>0) ATIONAL -> ATE', # relational -> relate
+            '(m>0) TIONAL -> TION', # conditional -> condition, rational -> rational
+            '(m>0) ENCI -> ENCE',   # valenci -> valence
+            '(m>0) ANCI -> ANCE',   # hesitanci -> hesitance
+            '(m>0) IZER -> IZE',    # digitizer -> digitize
+            '(m>0) ABLI -> ABLE',   # conformabli -> conformable
+            '(m>0) ALLI -> AL',     # radicalli -> radical
+            '(m>0) ENTLI -> ENT',   # differentli -> different
+            '(m>0) ELI -> E',       # vileli -> vile
+            '(m>0) OUSLI -> OUS',   # analogousli -> analogous
+            '(m>0) IZATION -> IZE', # vietnamization -> vietnamize
+            '(m>0) ATION -> ATE',   # predication -> predicate
+            '(m>0) ATOR -> ATE',    # operator -> operate
+            '(m>0) ALISM -> AL',    # feudalism -> feudal
+            '(m>0) IVENESS -> IVE', # decisiveness -> decisive
+            '(m>0) FULNESS -> FUL', # hopefulness -> hopeful
+            '(m>0) OUSNESS -> OUS', # callousness -> callous
+            '(m>0) ALITI -> AL',    # formaliti -> formal
+            '(m>0) IVITI -> IVE',   # sensitiviti -> sensitive
+            '(m>0) BILITI -> BLE'   # sensibiliti -> sensible
+          ],
+
+          # The test for the string S1 can be made fast by doing a program
+          # switch on the penultimate letter of the word being tested. This
+          # gives a fairly even breakdown of the possible values of the
+          # string S1. It will be seen in fact that the S1-strings in step 2
+          # are presented here in the alphabetical order of their penultimate
+          # letter. Similar techniques may be applied in the other steps.
+
+          # Step 3
+          S300: [
+            '(m>0) ICATE -> IC', # triplicate -> triplic
+            '(m>0) ATIVE -> ',   # formative -> form
+            '(m>0) ALIZE -> AL', # formalize -> formal
+            '(m>0) ICITI -> IC', # electriciti -> electric
+            '(m>0) ICAL -> IC',  # electrical -> electric
+            '(m>0) FUL -> ',     # hopeful -> hope
+            '(m>0) NESS -> '     # goodness -> good
+          ],
+
+          # Step 4
+          S400: [
+            '(m>1) AL -> ',                 # revival -> reviv
+            '(m>1) ANCE -> ',               # allowance -> allow
+            '(m>1) ENCE -> ',               # inference -> infer
+            '(m>1) ER -> ',                 # airliner -> airlin
+            '(m>1) IC -> ',                 # gyroscopic -> gyroscop
+            '(m>1) ABLE -> ',               # adjustable -> adjust
+            '(m>1) IBLE -> ',               # defensible -> defens
+            '(m>1) ANT -> ',                # irritant -> irrit
+            '(m>1) EMENT -> ',              # replacement -> replac
+            '(m>1) MENT -> ',               # adjustment -> adjust
+            '(m>1) ENT -> ',                # dependent -> depend
+            '(m>1 and (*S or *T)) ION -> ', # adoption -> adopt
+            '(m>1) OU -> ',                 # homologou -> homolog
+            '(m>1) ISM -> ',                # communism -> commun
+            '(m>1) ATE -> ',                # activate -> activ
+            '(m>1) ITI -> ',                # angulariti -> angular
+            '(m>1) OUS -> ',                # homologous -> homolog
+            '(m>1) IVE -> ',                # effective -> effect
+            '(m>1) IZE -> '                 # bowdlerize -> bowdler
+          ],
+
+          # The suffixes are now removed. All that remains is a little
+          # tidying up.
+
+          # Step 5a
+          S500: [
+            '(m>1) E -> ',           # probate -> probat, rate -> rate
+            '(m=1 and not *o) E -> ' # cease -> ceas
+          ],
+
+          # Step 5b
+          S510: [
+            '(m > 1 and *d and *L) -> -1' # controll -> control, roll -> roll
+          ]
+        }
+
+        GOTO_RE = %r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
+
+        RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
+
+        def stem(word, found = false)
+          goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }
+
+          RULES.each { |key, rules|
+            next if goto && goto != key.to_s
+
+            rules.each { |rule|
+              case rule
+                when RULE_RE
+                  cond, repl, goto = $1, $3, $4
+                  stem = word[/(.+)#{$2.downcase}$/, 1] or next
+                when GOTO_RE
+                  goto = $1
+                  break
+              end
+
+              conv[shad = stem.dup,
+                /[^aeiouy]/ => 'c',
+                /[aeiou]/   => 'v',
+                /cy/        => 'cv',
+                /y/         => 'c'
+              ]
+
+              if cond
+                conv[cond,
+                  'm'   => shad.scan(/vc/).size,
+                  '*v*' => shad.include?('v'),
+                  '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
+                  '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
+                  'and' => '&&',
+                  'or'  => '||',
+                  'not' => '!',
+                  '='   => '=='
+                ]
+
+                last.upcase! if last
+                cond.gsub!(/\*(\w)/) { last == $1 }
+
+                next unless eval(cond)
+              end
+
+              found, word = true, begin
+                stem[0...Integer(repl)]
+              rescue ArgumentError
+                stem << repl.downcase
+              end
+
+              break
+            }
+          }
+
+          word if found
+        end
+
+      end
+
+    end
+
+  end
+
+end
data/lib/lingo/attendee/stemmer.rb (renamed from data/info/gpl-hdr.txt)

@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 #--
 ###############################################################################
 #                                                                             #
@@ -22,3 +24,34 @@
 ###############################################################################
 #++
 
+class Lingo
+
+  class Attendee
+
+    class Stemmer < self
+
+      protected
+
+      def init
+        extend(Lingo.get_const(get_key('type', 'porter'), self.class))
+
+        @wc = get_key('wordclass', LA_STEM)
+        @all = get_key('mode', '').downcase == 'all'
+      end
+
+      def process(obj)
+        if obj.is_a?(Word) && obj.unknown?
+          stem = stem(obj.form.downcase, @all)
+          obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
+        end
+
+        forward(obj)
+      end
+
+    end
+
+  end
+
+end
+
+require_relative 'stemmer/porter'
data/lib/lingo/attendee/synonymer.rb

@@ -74,24 +74,25 @@ class Lingo
 
       def init
         set_dic
-        @skip = get_array('skip', WA_UNKNOWN).map(&:upcase)
+        @skip = get_array('skip', WA_UNKNOWN, :upcase)
       end
 
-      def control(cmd, par)
-
+      def control(cmd, param)
+        report_on(cmd, @dic)
       end
 
       def process(obj)
-        if obj.is_a?(Word) &&
+        if obj.is_a?(Word) && !@skip.include?(obj.attr)
           inc('Anzahl gesuchter Wörter')
 
-
-
-          obj.lexicals += synos.sort.uniq
+          unless (syn = @dic.find_synonyms(obj)).empty?
+            inc('Anzahl erweiteter Wörter')
 
-
-
+            obj.add_lexicals(syn.tap(&:uniq!))
+            add('Anzahl gefundener Synonyme', syn.size)
+          end
         end
+
         forward(obj)
       end
 
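Both this hunk and the sequencer one use syn.tap(&:uniq!) where a bare uniq! might seem simpler: Array#uniq! returns nil when nothing was removed, while tap always returns the receiver, so the deduplicated array is safe to pass along in either case. A quick illustration:

    [1, 2, 2].uniq!     #=> [1, 2]
    [1, 2].uniq!        #=> nil     (would hand nil to add_lexicals)
    [1, 2].tap(&:uniq!) #=> [1, 2]  (receiver survives either way)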