lingo 1.8.1 → 1.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/porter/stem.cfg
DELETED
@@ -1,311 +0,0 @@
|
|
1
|
-
# Stem.cfg
|
2
|
-
#
|
3
|
-
# Rules for Porter-Stemmer
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# based on:
|
7
|
-
# An algorithm for suffix stripping
|
8
|
-
#
|
9
|
-
# M.F.Porter
|
10
|
-
# 1980
|
11
|
-
#
|
12
|
-
# Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
|
13
|
-
# few typos have been corrected.)
|
14
|
-
#
|
15
|
-
# http://tartarus.org/~martin/PorterStemmer/def.txt
|
16
|
-
#
|
17
|
-
# --------------------------------------------------
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
# 2. THE ALGORITHM
|
23
|
-
#
|
24
|
-
# To present the suffix stripping algorithm in its entirety we will need a few
|
25
|
-
# definitions.
|
26
|
-
#
|
27
|
-
# A \consonant\ in a word is a letter other than A, E, I, O or U, and other
|
28
|
-
# than Y preceded by a consonant. (The fact that the term `consonant' is
|
29
|
-
# defined to some extent in terms of itself does not make it ambiguous.) So in
|
30
|
-
# TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
|
31
|
-
# letter is not a consonant it is a \vowel\.
|
32
|
-
#
|
33
|
-
# A consonant will be denoted by c, a vowel by v. A list ccc... of length
|
34
|
-
# greater than 0 will be denoted by C, and a list vvv... of length greater
|
35
|
-
# than 0 will be denoted by V. Any word, or part of a word, therefore has one
|
36
|
-
# of the four forms:
|
37
|
-
#
|
38
|
-
# CVCV ... C
|
39
|
-
# CVCV ... V
|
40
|
-
# VCVC ... C
|
41
|
-
# VCVC ... V
|
42
|
-
#
|
43
|
-
# These may all be represented by the single form
|
44
|
-
#
|
45
|
-
# [C]VCVC ... [V]
|
46
|
-
#
|
47
|
-
# where the square brackets denote arbitrary presence of their contents.
|
48
|
-
# Using (VC){m} to denote VC repeated m times, this may again be written as
|
49
|
-
#
|
50
|
-
# [C](VC){m}[V].
|
51
|
-
#
|
52
|
-
# m will be called the \measure\ of any word or word part when represented in
|
53
|
-
# this form. The case m = 0 covers the null word. Here are some examples:
|
54
|
-
#
|
55
|
-
# m=0 TR, EE, TREE, Y, BY.
|
56
|
-
# m=1 TROUBLE, OATS, TREES, IVY.
|
57
|
-
# m=2 TROUBLES, PRIVATE, OATEN, ORRERY.
|
58
|
-
#
|
59
|
-
# The \rules\ for removing a suffix will be given in the form
|
60
|
-
#
|
61
|
-
# (condition) S1 -> S2
|
62
|
-
#
|
63
|
-
# This means that if a word ends with the suffix S1, and the stem before S1
|
64
|
-
# satisfies the given condition, S1 is replaced by S2. The condition is
|
65
|
-
# usually given in terms of m, e.g.
|
66
|
-
#
|
67
|
-
# (m > 1) EMENT ->
|
68
|
-
#
|
69
|
-
# Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
|
70
|
-
# since REPLAC is a word part for which m = 2.
|
71
|
-
#
|
72
|
-
# The `condition' part may also contain the following:
|
73
|
-
#
|
74
|
-
# *S - the stem ends with S (and similarly for the other letters).
|
75
|
-
#
|
76
|
-
# *v* - the stem contains a vowel.
|
77
|
-
#
|
78
|
-
# *d - the stem ends with a double consonant (e.g. -TT, -SS).
|
79
|
-
#
|
80
|
-
# *o - the stem ends cvc, where the second c is not W, X or Y (e.g.
|
81
|
-
# -WIL, -HOP).
|
82
|
-
#
|
83
|
-
# And the condition part may also contain expressions with \and\, \or\ and
|
84
|
-
# \not\, so that
|
85
|
-
#
|
86
|
-
# (m>1 and (*S or *T))
|
87
|
-
#
|
88
|
-
# tests for a stem with m>1 ending in S or T, while
|
89
|
-
#
|
90
|
-
# (*d and not (*L or *S or *Z))
|
91
|
-
#
|
92
|
-
# tests for a stem ending with a double consonant other than L, S or Z.
|
93
|
-
# Elaborate conditions like this are required only rarely.
|
94
|
-
#
|
95
|
-
# In a set of rules written beneath each other, only one is obeyed, and this
|
96
|
-
# will be the one with the longest matching S1 for the given word. For
|
97
|
-
# example, with
|
98
|
-
#
|
99
|
-
# SSES -> SS
|
100
|
-
# IES -> I
|
101
|
-
# SS -> SS
|
102
|
-
# S ->
|
103
|
-
#
|
104
|
-
# (here the conditions are all null) CARESSES maps to CARESS since SSES is
|
105
|
-
# the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
|
106
|
-
# to CARE (S1=`S').
|
107
|
-
#
|
108
|
-
#
|
109
|
-
---
|
110
|
-
stemmer:
|
111
|
-
# In the rules below, examples of their application, successful or otherwise,
|
112
|
-
# are given on the right in lower case. The algorithm now follows:
|
113
|
-
#
|
114
|
-
# Step 1a
|
115
|
-
# SSES -> SS caresses -> caress
|
116
|
-
# IES -> I ponies -> poni
|
117
|
-
# ties -> ti
|
118
|
-
# SS -> SS caress -> caress
|
119
|
-
# S -> cats -> cat
|
120
|
-
S100:
|
121
|
-
- SSES -> SS
|
122
|
-
- IES -> I
|
123
|
-
- SS -> SS
|
124
|
-
- S ->
|
125
|
-
#
|
126
|
-
# Step 1b
|
127
|
-
#
|
128
|
-
# (m>0) EED -> EE feed -> feed
|
129
|
-
# agreed -> agree
|
130
|
-
# (*v*) ED -> plastered -> plaster
|
131
|
-
# bled -> bled
|
132
|
-
# (*v*) ING -> motoring -> motor
|
133
|
-
# sing -> sing
|
134
|
-
S110:
|
135
|
-
- (m>0) EED -> EE goto(S120)
|
136
|
-
- (*v*) ED -> goto(S111)
|
137
|
-
- (*v*) ING -> goto(S111)
|
138
|
-
- goto(S120)
|
139
|
-
#
|
140
|
-
# If the second or third of the rules in Step 1b is successful, the following
|
141
|
-
# is done:
|
142
|
-
#
|
143
|
-
# AT -> ATE conflat(ed) -> conflate
|
144
|
-
# BL -> BLE troubl(ed) -> trouble
|
145
|
-
# IZ -> IZE siz(ed) -> size
|
146
|
-
# (*d and not (*L or *S or *Z))
|
147
|
-
# -> single letter
|
148
|
-
# hopp(ing) -> hop
|
149
|
-
# tann(ed) -> tan
|
150
|
-
# fall(ing) -> fall
|
151
|
-
# hiss(ing) -> hiss
|
152
|
-
# fizz(ed) -> fizz
|
153
|
-
# (m=1 and *o) -> E fail(ing) -> fail
|
154
|
-
# fil(ing) -> file
|
155
|
-
S111:
|
156
|
-
- AT -> ATE
|
157
|
-
- BL -> BLE
|
158
|
-
- IZ -> IZE
|
159
|
-
- (*d and not (*L or *S or *Z)) -> -1
|
160
|
-
- (m=1 and *o) -> E
|
161
|
-
#
|
162
|
-
# The rule to map to a single letter causes the removal of one of the double
|
163
|
-
# letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
|
164
|
-
# -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
|
165
|
-
# 4.
|
166
|
-
#
|
167
|
-
# Step 1c
|
168
|
-
#
|
169
|
-
# (*v*) Y -> I happy -> happi
|
170
|
-
# sky -> sky
|
171
|
-
S120:
|
172
|
-
- (*v*) Y -> I
|
173
|
-
#
|
174
|
-
# Step 1 deals with plurals and past participles. The subsequent steps are
|
175
|
-
# much more straightforward.
|
176
|
-
#
|
177
|
-
# Step 2
|
178
|
-
#
|
179
|
-
# (m>0) ATIONAL -> ATE relational -> relate
|
180
|
-
# (m>0) TIONAL -> TION conditional -> condition
|
181
|
-
# rational -> rational
|
182
|
-
# (m>0) ENCI -> ENCE valenci -> valence
|
183
|
-
# (m>0) ANCI -> ANCE hesitanci -> hesitance
|
184
|
-
# (m>0) IZER -> IZE digitizer -> digitize
|
185
|
-
# (m>0) ABLI -> ABLE conformabli -> conformable
|
186
|
-
# (m>0) ALLI -> AL radicalli -> radical
|
187
|
-
# (m>0) ENTLI -> ENT differentli -> different
|
188
|
-
# (m>0) ELI -> E vileli -> vile
|
189
|
-
# (m>0) OUSLI -> OUS analogousli -> analogous
|
190
|
-
# (m>0) IZATION -> IZE vietnamization -> vietnamize
|
191
|
-
# (m>0) ATION -> ATE predication -> predicate
|
192
|
-
# (m>0) ATOR -> ATE operator -> operate
|
193
|
-
# (m>0) ALISM -> AL feudalism -> feudal
|
194
|
-
# (m>0) IVENESS -> IVE decisiveness -> decisive
|
195
|
-
# (m>0) FULNESS -> FUL hopefulness -> hopeful
|
196
|
-
# (m>0) OUSNESS -> OUS callousness -> callous
|
197
|
-
# (m>0) ALITI -> AL formaliti -> formal
|
198
|
-
# (m>0) IVITI -> IVE sensitiviti -> sensitive
|
199
|
-
# (m>0) BILITI -> BLE sensibiliti -> sensible
|
200
|
-
S200:
|
201
|
-
- (m>0) ATIONAL -> ATE
|
202
|
-
- (m>0) TIONAL -> TION
|
203
|
-
- (m>0) ENCI -> ENCE
|
204
|
-
- (m>0) ANCI -> ANCE
|
205
|
-
- (m>0) IZER -> IZE
|
206
|
-
- (m>0) ABLI -> ABLE
|
207
|
-
- (m>0) ALLI -> AL
|
208
|
-
- (m>0) ENTLI -> ENT
|
209
|
-
- (m>0) ELI -> E
|
210
|
-
- (m>0) OUSLI -> OUS
|
211
|
-
- (m>0) IZATION -> IZE
|
212
|
-
- (m>0) ATION -> ATE
|
213
|
-
- (m>0) ATOR -> ATE
|
214
|
-
- (m>0) ALISM -> AL
|
215
|
-
- (m>0) IVENESS -> IVE
|
216
|
-
- (m>0) FULNESS -> FUL
|
217
|
-
- (m>0) OUSNESS -> OUS
|
218
|
-
- (m>0) ALITI -> AL
|
219
|
-
- (m>0) IVITI -> IVE
|
220
|
-
- (m>0) BILITI -> BLE
|
221
|
-
#
|
222
|
-
# The test for the string S1 can be made fast by doing a program switch on
|
223
|
-
# the penultimate letter of the word being tested. This gives a fairly even
|
224
|
-
# breakdown of the possible values of the string S1. It will be seen in fact
|
225
|
-
# that the S1-strings in step 2 are presented here in the alphabetical order
|
226
|
-
# of their penultimate letter. Similar techniques may be applied in the other
|
227
|
-
# steps.
|
228
|
-
#
|
229
|
-
# Step 3
|
230
|
-
#
|
231
|
-
# (m>0) ICATE -> IC triplicate -> triplic
|
232
|
-
# (m>0) ATIVE -> formative -> form
|
233
|
-
# (m>0) ALIZE -> AL formalize -> formal
|
234
|
-
# (m>0) ICITI -> IC electriciti -> electric
|
235
|
-
# (m>0) ICAL -> IC electrical -> electric
|
236
|
-
# (m>0) FUL -> hopeful -> hope
|
237
|
-
# (m>0) NESS -> goodness -> good
|
238
|
-
S300:
|
239
|
-
- (m>0) ICATE -> IC
|
240
|
-
- (m>0) ATIVE ->
|
241
|
-
- (m>0) ALIZE -> AL
|
242
|
-
- (m>0) ICITI -> IC
|
243
|
-
- (m>0) ICAL -> IC
|
244
|
-
- (m>0) FUL ->
|
245
|
-
- (m>0) NESS ->
|
246
|
-
#
|
247
|
-
# Step 4
|
248
|
-
#
|
249
|
-
# (m>1) AL -> revival -> reviv
|
250
|
-
# (m>1) ANCE -> allowance -> allow
|
251
|
-
# (m>1) ENCE -> inference -> infer
|
252
|
-
# (m>1) ER -> airliner -> airlin
|
253
|
-
# (m>1) IC -> gyroscopic -> gyroscop
|
254
|
-
# (m>1) ABLE -> adjustable -> adjust
|
255
|
-
# (m>1) IBLE -> defensible -> defens
|
256
|
-
# (m>1) ANT -> irritant -> irrit
|
257
|
-
# (m>1) EMENT -> replacement -> replac
|
258
|
-
# (m>1) MENT -> adjustment -> adjust
|
259
|
-
# (m>1) ENT -> dependent -> depend
|
260
|
-
# (m>1 and (*S or *T)) ION -> adoption -> adopt
|
261
|
-
# (m>1) OU -> homologou -> homolog
|
262
|
-
# (m>1) ISM -> communism -> commun
|
263
|
-
# (m>1) ATE -> activate -> activ
|
264
|
-
# (m>1) ITI -> angulariti -> angular
|
265
|
-
# (m>1) OUS -> homologous -> homolog
|
266
|
-
# (m>1) IVE -> effective -> effect
|
267
|
-
# (m>1) IZE -> bowdlerize -> bowdler
|
268
|
-
S400:
|
269
|
-
- (m>1) AL ->
|
270
|
-
- (m>1) ANCE ->
|
271
|
-
- (m>1) ENCE ->
|
272
|
-
- (m>1) ER ->
|
273
|
-
- (m>1) IC ->
|
274
|
-
- (m>1) ABLE ->
|
275
|
-
- (m>1) IBLE ->
|
276
|
-
- (m>1) ANT ->
|
277
|
-
- (m>1) EMENT ->
|
278
|
-
- (m>1) MENT ->
|
279
|
-
- (m>1) ENT ->
|
280
|
-
- (m>1 and (*S or *T)) ION ->
|
281
|
-
- (m>1) OU ->
|
282
|
-
- (m>1) ISM ->
|
283
|
-
- (m>1) ATE ->
|
284
|
-
- (m>1) ITI ->
|
285
|
-
- (m>1) OUS ->
|
286
|
-
- (m>1) IVE ->
|
287
|
-
- (m>1) IZE ->
|
288
|
-
#
|
289
|
-
# The suffixes are now removed. All that remains is a little tidying up.
|
290
|
-
#
|
291
|
-
# Step 5a
|
292
|
-
#
|
293
|
-
# (m>1) E -> probate -> probat
|
294
|
-
# rate -> rate
|
295
|
-
# (m=1 and not *o) E -> cease -> ceas
|
296
|
-
S500:
|
297
|
-
- (m>1) E ->
|
298
|
-
- (m=1 and not *o) E ->
|
299
|
-
#
|
300
|
-
# Step 5b
|
301
|
-
#
|
302
|
-
# (m > 1 and *d and *L) -> single letter
|
303
|
-
# controll -> control
|
304
|
-
# roll -> roll
|
305
|
-
S510:
|
306
|
-
- (m > 1 and *d and *L) -> -1
|
307
|
-
#
|
308
|
-
# The algorithm is careful not to remove a suffix when the stem is too short,
|
309
|
-
# the length of the stem being given by its measure, m. There is no linguistic
|
310
|
-
# basis for this approach. It was merely observed that m could be used quite
|
311
|
-
# effectively to help decide whether or not it was wise to take off a suffix.
|
data/porter/stem.rb
DELETED
@@ -1,150 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require "yaml"
|
4
|
-
class String
|
5
|
-
def to_shadow
|
6
|
-
shadow = self.gsub(/[^aeiouy]/, 'c')
|
7
|
-
shadow.gsub!(/[aeiou]/, 'v')
|
8
|
-
shadow.gsub!(/cy/, 'cv')
|
9
|
-
shadow.gsub!(/y/, 'c')
|
10
|
-
shadow
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
# => condition nil oder eine evaluierbare regel
|
17
|
-
# => matchExp eine Regexp
|
18
|
-
# => replacement ist downcase
|
19
|
-
# => return new stem or nil, if rule didn't match
|
20
|
-
def checkSingleRule(word, condition, matchExp, replacement)
|
21
|
-
|
22
|
-
# => check for matching rule
|
23
|
-
return nil unless matchExp.match(word)
|
24
|
-
|
25
|
-
# => remember stem
|
26
|
-
stem = $1
|
27
|
-
|
28
|
-
# => check condition for rule
|
29
|
-
unless condition.nil?
|
30
|
-
evalCondition = condition.dup
|
31
|
-
|
32
|
-
stemShadow = stem.to_shadow
|
33
|
-
|
34
|
-
unless condition.index("m").nil?
|
35
|
-
m = stemShadow.squeeze.scan(/vc/).size
|
36
|
-
evalCondition.gsub!(/m/, m.to_s)
|
37
|
-
end
|
38
|
-
|
39
|
-
unless condition.index("*v*").nil?
|
40
|
-
evalCondition.gsub!(/\*v\*/, stemShadow.index("v").nil? ? "false" : "true")
|
41
|
-
end
|
42
|
-
|
43
|
-
unless condition.index("*d").nil?
|
44
|
-
evalCondition.gsub!(/\*d/, (stemShadow[-1..-1]=="c" && stem[-1]==stem[-2]) ? "true" : "false")
|
45
|
-
end
|
46
|
-
|
47
|
-
unless condition.index("*o").nil?
|
48
|
-
bool = /cvc$/.match(stemShadow) && "wxy".index(stemShadow[-1..-1]).nil?
|
49
|
-
evalCondition.gsub!(/\*o/, bool ? "true" : "false")
|
50
|
-
end
|
51
|
-
|
52
|
-
while /\*(\w)/.match(evalCondition)
|
53
|
-
char = $1
|
54
|
-
if char.downcase == char
|
55
|
-
abort "unbekannter Buchstabe %s in Regel: %" % [char, condition]
|
56
|
-
end
|
57
|
-
|
58
|
-
bool = (stem[-1..-1].upcase == char)
|
59
|
-
evalCondition.gsub!(Regexp.new(Regexp.escape("*#{char}")), bool ? "true" : "false")
|
60
|
-
end
|
61
|
-
|
62
|
-
evalCondition.gsub!(/and/, '&&')
|
63
|
-
evalCondition.gsub!(/or/, '||')
|
64
|
-
evalCondition.gsub!(/not/, '!')
|
65
|
-
evalCondition.gsub!(/=/, '==')
|
66
|
-
p evalCondition
|
67
|
-
return unless eval(evalCondition)
|
68
|
-
end
|
69
|
-
|
70
|
-
# => stem with replacement
|
71
|
-
if /^(-\d+)$/.match(replacement)
|
72
|
-
# => delete last characters from stem, if replacement looks like '-1' or '-2'
|
73
|
-
stem[0...($1.to_i)]
|
74
|
-
else
|
75
|
-
# => append replacement to stem
|
76
|
-
stem + replacement
|
77
|
-
end
|
78
|
-
|
79
|
-
end
|
80
|
-
|
81
|
-
def checkAllRules(word, rules)
|
82
|
-
sequence = rules.keys.sort.reverse
|
83
|
-
|
84
|
-
actualRuleSet = sequence.pop.to_s
|
85
|
-
|
86
|
-
begin
|
87
|
-
label = nil
|
88
|
-
|
89
|
-
rules[actualRuleSet].each do |rule|
|
90
|
-
unless /^(\(.+\)){0,1}\s*(\S*)\s*->\s*(\S*?)\s*(?:goto\((\S+)\))*\s*$/.match(rule)
|
91
|
-
unless /^\s*goto\s*\(\s*(\S+)\s*\)$/.match(rule)
|
92
|
-
abort "ungültige Regel: %s" % rule
|
93
|
-
else
|
94
|
-
label = $1
|
95
|
-
break
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
condition, ending, replacement, label = $1, $2.downcase, $3.downcase, $4
|
100
|
-
p [rule, word, condition, ending, replacement, label ]
|
101
|
-
result = checkSingleRule(word, condition, Regexp.new("(.+)#{ending}$"), replacement)
|
102
|
-
|
103
|
-
unless result.nil?
|
104
|
-
p [word, actualRuleSet, rule]
|
105
|
-
word = result
|
106
|
-
break
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
if label.nil?
|
111
|
-
actualRuleSet = sequence.pop.to_s
|
112
|
-
else
|
113
|
-
while label != actualRuleSet && !actualRuleSet.nil?
|
114
|
-
actualRuleSet = sequence.pop.to_s
|
115
|
-
end
|
116
|
-
end
|
117
|
-
end until actualRuleSet.empty?
|
118
|
-
|
119
|
-
word
|
120
|
-
end
|
121
|
-
|
122
|
-
stemmerConfig = YAML::load_file("stem.cfg")
|
123
|
-
|
124
|
-
$rules = stemmerConfig["stemmer"]
|
125
|
-
|
126
|
-
word = $*[0]
|
127
|
-
p checkAllRules(word, $rules)
|
128
|
-
|
129
|
-
def test(word, stem)
|
130
|
-
result = checkAllRules(word, $rules)
|
131
|
-
if stem != result
|
132
|
-
warn "Falsches Wort %s, Stem %s, Result %s" % [word, stem, result]
|
133
|
-
else
|
134
|
-
warn "Korrekt: Wort %s, Stem %s" % [word, stem]
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
|
-
|
139
|
-
#test("caresses", "caress")
|
140
|
-
#test("ponies", "poni")
|
141
|
-
#test("ties", "ti")
|
142
|
-
#test("caress", "caress")
|
143
|
-
#test("cats", "cat")
|
144
|
-
|
145
|
-
#test("feed", "feed")
|
146
|
-
#?test("agreed", "agree")
|
147
|
-
#test("plastered", "plaster")
|
148
|
-
#test("bled", "bled")
|
149
|
-
#test("motoring", "motor")
|
150
|
-
#test("sing", "sing")
|