lingo 1.8.1 → 1.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. data/ChangeLog +23 -5
  2. data/README +1 -1
  3. data/Rakefile +5 -7
  4. data/TODO +2 -0
  5. data/bin/lingo +5 -1
  6. data/de.lang +1 -1
  7. data/en/lingo-syn.txt +0 -0
  8. data/en.lang +2 -1
  9. data/lib/lingo/attendee/abbreviator.rb +8 -9
  10. data/lib/lingo/attendee/debugger.rb +5 -4
  11. data/lib/lingo/attendee/decomposer.rb +8 -3
  12. data/lib/lingo/attendee/dehyphenizer.rb +19 -63
  13. data/lib/lingo/attendee/formatter.rb +1 -1
  14. data/lib/lingo/attendee/multi_worder.rb +67 -155
  15. data/lib/lingo/attendee/noneword_filter.rb +16 -9
  16. data/lib/lingo/attendee/object_filter.rb +1 -1
  17. data/lib/lingo/attendee/sequencer.rb +32 -63
  18. data/lib/lingo/attendee/stemmer/porter.rb +343 -0
  19. data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
  20. data/lib/lingo/attendee/synonymer.rb +10 -9
  21. data/lib/lingo/attendee/text_reader.rb +102 -76
  22. data/lib/lingo/attendee/text_writer.rb +23 -26
  23. data/lib/lingo/attendee/tokenizer.rb +13 -27
  24. data/lib/lingo/attendee/variator.rb +26 -66
  25. data/lib/lingo/attendee/vector_filter.rb +42 -43
  26. data/lib/lingo/attendee/word_searcher.rb +6 -7
  27. data/lib/lingo/attendee.rb +25 -7
  28. data/lib/lingo/buffered_attendee.rb +36 -10
  29. data/lib/lingo/cachable.rb +8 -8
  30. data/lib/lingo/config.rb +5 -6
  31. data/lib/lingo/ctl.rb +2 -3
  32. data/lib/lingo/database/crypter.rb +9 -26
  33. data/lib/lingo/database/gdbm_store.rb +3 -5
  34. data/lib/lingo/database/libcdb_store.rb +4 -6
  35. data/lib/lingo/database/sdbm_store.rb +11 -6
  36. data/lib/lingo/database/show_progress.rb +3 -43
  37. data/lib/lingo/database/source/key_value.rb +2 -6
  38. data/lib/lingo/database/source/multi_key.rb +3 -5
  39. data/lib/lingo/database/source/multi_value.rb +2 -6
  40. data/lib/lingo/database/source/single_word.rb +4 -6
  41. data/lib/lingo/database/source/word_class.rb +4 -10
  42. data/lib/lingo/database/source.rb +20 -18
  43. data/lib/lingo/database.rb +84 -59
  44. data/lib/lingo/error.rb +57 -1
  45. data/lib/lingo/language/dictionary.rb +21 -18
  46. data/lib/lingo/language/grammar.rb +40 -49
  47. data/lib/lingo/language/lexical.rb +6 -6
  48. data/lib/lingo/language/lexical_hash.rb +6 -0
  49. data/lib/lingo/language/word.rb +32 -15
  50. data/lib/lingo/language/word_form.rb +1 -1
  51. data/lib/lingo/language.rb +14 -25
  52. data/lib/lingo/reportable.rb +12 -10
  53. data/lib/lingo/show_progress.rb +81 -0
  54. data/lib/lingo/version.rb +1 -1
  55. data/lib/lingo.rb +63 -24
  56. data/lingo-call.cfg +6 -10
  57. data/lingo.cfg +60 -44
  58. data/lir.cfg +42 -41
  59. data/test/attendee/ts_abbreviator.rb +3 -5
  60. data/test/attendee/ts_decomposer.rb +3 -5
  61. data/test/attendee/ts_multi_worder.rb +87 -145
  62. data/test/attendee/ts_noneword_filter.rb +5 -3
  63. data/test/attendee/ts_object_filter.rb +5 -3
  64. data/test/attendee/ts_sequencer.rb +3 -5
  65. data/test/attendee/ts_stemmer.rb +309 -0
  66. data/test/attendee/ts_synonymer.rb +15 -11
  67. data/test/attendee/ts_text_reader.rb +12 -15
  68. data/test/attendee/ts_text_writer.rb +24 -29
  69. data/test/attendee/ts_tokenizer.rb +9 -7
  70. data/test/attendee/ts_variator.rb +4 -4
  71. data/test/attendee/ts_vector_filter.rb +24 -16
  72. data/test/attendee/ts_word_searcher.rb +20 -36
  73. data/test/{lir.csv → lir.vec} +0 -0
  74. data/test/ref/artikel.vec +943 -943
  75. data/test/ref/artikel.ven +943 -943
  76. data/test/ref/lir.non +201 -201
  77. data/test/ref/lir.seq +178 -178
  78. data/test/ref/lir.syn +49 -49
  79. data/test/ref/lir.vec +329 -0
  80. data/test/test_helper.rb +20 -36
  81. data/test/ts_database.rb +10 -10
  82. data/test/ts_language.rb +279 -319
  83. metadata +93 -104
  84. data/info/Objekte.png +0 -0
  85. data/info/Typen.png +0 -0
  86. data/info/database.png +0 -0
  87. data/info/db_small.png +0 -0
  88. data/info/download.png +0 -0
  89. data/info/kerze.png +0 -0
  90. data/info/language.png +0 -0
  91. data/info/lingo.png +0 -0
  92. data/info/logo.png +0 -0
  93. data/info/meeting.png +0 -0
  94. data/info/types.png +0 -0
  95. data/lingo-all.cfg +0 -89
  96. data/porter/stem.cfg +0 -311
  97. data/porter/stem.rb +0 -150
  98. data/test/ref/lir.csv +0 -329
  99. data/test.cfg +0 -79
data/lib/lingo/attendee/noneword_filter.rb
@@ -71,32 +71,39 @@ class Lingo
   protected
 
   def init
-    @nonewords = []
+    @nonewords, @sort = [], get_key('sort', true)
   end
 
-  # Control handles the commands for opening and closing a file.
-  # A new set of unrecognized words is registered for each file.
-  def control(cmd, par)
+  def control(cmd, param)
     case cmd
     when STR_CMD_FILE
       @nonewords.clear
     when STR_CMD_EOL
       skip_command
     when STR_CMD_RECORD, STR_CMD_EOF
-      nones = @nonewords.sort.uniq
-      nones.each(&method(:forward))
-      add('Objekte gefiltert', nones.size)
-      @nonewords.clear
+      send_nonewords unless @nonewords.empty?
     end
   end
 
   def process(obj)
     if obj.is_a?(Word) && obj.unknown?
       inc('Anzahl nicht erkannter Wörter')
-      @nonewords << obj.form.downcase
+
+      non = obj.form.downcase
+      @sort ? @nonewords << non : forward(non)
     end
   end
 
+  private
+
+  def send_nonewords
+    @nonewords.sort!
+    @nonewords.uniq!
+
+    add('Objekte gefiltert', @nonewords.size)
+    @nonewords.each(&method(:forward)).clear
+  end
+
 end
 
 # For backwards compatibility.
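
The hunk above introduces a sort option: with sort: true (the default via get_key('sort', true)) non-words are buffered and emitted sorted and de-duplicated at record/EOF boundaries; with sort: false each non-word is forwarded immediately in encounter order. A minimal sketch of that dispatch outside the attendee framework (the emit_nonewords driver below is hypothetical, not part of Lingo):

    # Hypothetical stand-in for NonewordFilter's two modes: sorted batch
    # emission at record boundaries vs. immediate forwarding per word.
    def emit_nonewords(forms, sort: true)
      return forms.each { |f| puts f } unless sort
      forms.sort.uniq.each { |f| puts f }  # mirrors @nonewords.sort!; uniq!
    end

    emit_nonewords(%w[zebra apple apple])               # apple, zebra
    emit_nonewords(%w[zebra apple apple], sort: false)  # zebra, apple, apple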
data/lib/lingo/attendee/object_filter.rb
@@ -59,7 +59,7 @@ class Lingo
   #   - text_reader: { out: lines, files: '$(files)' }
   #   - tokenizer: { in: lines, out: token }
   #   - word_searcher: { in: token, out: words, source: 'sys-dic' }
-  #   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_SUBSTANTIV' }
+  #   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_NOUN' }
   #   - debugger: { in: filtr, prompt: 'out>' }
   # yields this output via the debugger: <tt>lingo -c t1 test.txt</tt>
   #   out> *FILE('test.txt')
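
For reference, the objects parameter holds a Ruby expression evaluated against each stream object, with obj bound to the current object. A hedged, self-contained illustration (plain eval over block locals is an assumption here, not necessarily ObjectFilter's exact mechanism, and the String predicate stands in for the Word/LA_NOUN test above):

    # Hypothetical: keep only objects for which the configured expression
    # evaluates to true; eval without an explicit binding sees the
    # block-local obj.
    expr    = 'obj.kind_of?(String) && obj.size > 3'
    objects = ['lingo', 'ab', 'filter']
    kept    = objects.select { |obj| eval(expr) }
    p kept  # => ["lingo", "filter"]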
data/lib/lingo/attendee/sequencer.rb
@@ -96,92 +96,61 @@ class Lingo
   protected
 
   def init
-    # evaluate parameters
-    @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map(&:upcase)
-    @seq_strings = get_key('sequences').map { |e| WordSequence.new(*e) }
+    @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
 
-    raise MissingConfigError.new(:sequences) if @seq_strings.empty?
+    @seq = get_key('sequences').map { |string, format|
+      [string = string.downcase, string.split(//), format]
+    }
+
+    raise MissingConfigError.new(:sequences) if @seq.empty?
   end
 
-  def control(cmd, par)
-    # every control object also triggers processing
+  def control(cmd, param)
     process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
   end
 
   def process_buffer?
-    # start buffer processing when stopper tokens are found or at unknown words
-    item = @buffer.last
-    (item.is_a?(WordForm) && @stopper.include?(item.attr.upcase)) ||
-    (item.is_a?(Word) && item.unknown?)
+    (obj = @buffer.last).is_a?(WordForm) && (obj.is_a?(Word) &&
+      obj.unknown? || @stopper.include?(obj.attr.upcase))
   end
 
   def process_buffer
-    return if @buffer.empty?
-
-    unless @buffer.size < 2
-      matches = Hash.new { |h, k| h[k] = [] }
-
-      sequences(@buffer.map { |obj|
-        obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
-      }).uniq.each { |sequence|
-        @seq_strings.each { |wordseq|
-          wordseq.scan(sequence) { |pos, form, classes|
-            inc('Anzahl erkannter Sequenzen')
-
-            classes.each_with_index { |wc, index|
-              @buffer[pos + index].lexicals.find { |lex|
-                form.gsub!(index.succ.to_s, lex.form) if lex.attr == wc
-              } or break
-            } or next
-
-            matches[pos] << form
-          }
-        }
-      }
-
-      matches.sort.each { |pos, forms|
-        forms.uniq.each { |form|
-          deferred_insert(pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE))
-        }
-      }
-    end
-
+    insert_sequences if @buffer.size > 1
     forward_buffer
   end
 
   private
 
-  def sequences(map)
-    res = map.shift
+  def insert_sequences
+    matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq
 
-    map.each { |classes|
-      temp = []
-      res.each { |wc1| classes.each { |wc2| temp << (wc1 + wc2) } }
-      res = temp
+    map = buf.map { |obj|
+      obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
    }
 
-    res
-  end
-
-  class WordSequence
+    map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
+      seq.each { |string, classes, format|
+        while pos = q.index(string, pos || 0)
+          inc('Anzahl erkannter Sequenzen')
 
-    attr_reader :classes, :format, :string
+          fmt = format.dup
 
-    def initialize(wordclasses, format)
-      @string = wordclasses.downcase
-      @classes = @string.split(//)
-      @format = format
-    end
+          classes.each_with_index { |wc, i|
+            buf[pos + i].lexicals.find { |l|
+              fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
+            } or break
+          } or next
 
-    def scan(sequence)
-      pos = 0
+          matches[pos] << fmt
 
-      while pos = sequence.index(string, pos)
-        yield pos, format.dup, classes
-        pos += 1
-      end
-    end
+          pos += 1
+        end
+      }
+    }
 
+    matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
+      @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
+    } }
   end
 
 end
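
The rewritten insert_sequences enumerates every combination of word-class attributes with Array#product instead of the old hand-rolled sequences helper. A self-contained illustration of that core step (the class letters below are made up):

    # Each buffer position contributes its possible word-class letters;
    # unknown words contribute the placeholder '#', as in the code above.
    map    = [['s'], ['a', 's'], ['#']]
    combos = map.shift.product(*map).map!(&:join).tap(&:uniq!)
    p combos  # => ["sa#", "ss#"]

    # A sequence string such as 'sa' is then located inside each
    # combination with String#index, exactly as the while-loop does.
    p 'ssa#'.index('sa', 0)  # => 1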
data/lib/lingo/attendee/stemmer/porter.rb (new file)
@@ -0,0 +1,343 @@
+# encoding: utf-8
+
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+
+class Lingo
+
+  class Attendee
+
+    class Stemmer
+
+      module Porter
+
+        extend self
+
+        # Rules for Porter-Stemmer, based on:
+        #
+        #   An algorithm for suffix stripping
+        #
+        #   M.F. Porter
+        #   1980
+        #
+        #   Originally published in Program, 14 no. 3, pp 130-137, July 1980.
+        #   (A few typos have been corrected.)
+        #
+        #   http://tartarus.org/~martin/PorterStemmer/def.txt
+        #
+        # -------------------------------------------------------------------
+        #
+        # 2. THE ALGORITHM
+        #
+        # To present the suffix stripping algorithm in its entirety we will
+        # need a few definitions.
+        #
+        # A _consonant_ in a word is a letter other than A, E, I, O or U,
+        # and other than Y preceded by a consonant. (The fact that the term
+        # `consonant' is defined to some extent in terms of itself does not
+        # make it ambiguous.) So in TOY the consonants are T and Y, and in
+        # SYZYGY they are S, Z and G. If a letter is not a consonant it is
+        # a _vowel_.
+        #
+        # A consonant will be denoted by c, a vowel by v. A list ccc... of
+        # length greater than 0 will be denoted by C, and a list vvv... of
+        # length greater than 0 will be denoted by V. Any word, or part of
+        # a word, therefore has one of the four forms:
+        #
+        #   CVCV ... C
+        #   CVCV ... V
+        #   VCVC ... C
+        #   VCVC ... V
+        #
+        # These may all be represented by the single form
+        #
+        #   [C]VCVC ... [V]
+        #
+        # where the square brackets denote arbitrary presence of their
+        # contents. Using (VC){m} to denote VC repeated m times, this
+        # may again be written as
+        #
+        #   [C](VC){m}[V].
+        #
+        # m will be called the _measure_ of any word or word part when
+        # represented in this form. The case m = 0 covers the null word.
+        # Here are some examples:
+        #
+        #   m=0  TR, EE, TREE, Y, BY.
+        #   m=1  TROUBLE, OATS, TREES, IVY.
+        #   m=2  TROUBLES, PRIVATE, OATEN, ORRERY.
+        #
+        # The _rules_ for removing a suffix will be given in the form
+        #
+        #   (condition) S1 -> S2
+        #
+        # This means that if a word ends with the suffix S1, and the stem
+        # before S1 satisfies the given condition, S1 is replaced by S2.
+        # The condition is usually given in terms of m, e.g.
+        #
+        #   (m > 1) EMENT ->
+        #
+        # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to
+        # REPLAC, since REPLAC is a word part for which m = 2.
+        #
+        # The `condition' part may also contain the following:
+        #
+        # *S  - the stem ends with S (and similarly for the other letters).
+        #
+        # *v* - the stem contains a vowel.
+        #
+        # *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+        #
+        # *o  - the stem ends cvc, where the second c is not W, X or Y (e.g.
+        #       -WIL, -HOP).
+        #
+        # And the condition part may also contain expressions with _and_,
+        # _or_ and _not_, so that
+        #
+        #   (m>1 and (*S or *T))
+        #
+        # tests for a stem with m>1 ending in S or T, while
+        #
+        #   (*d and not (*L or *S or *Z))
+        #
+        # tests for a stem ending with a double consonant other than L, S
+        # or Z. Elaborate conditions like this are required only rarely.
+        #
+        # In a set of rules written beneath each other, only one is obeyed,
+        # and this will be the one with the longest matching S1 for the
+        # given word. For example, with
+        #
+        #   SSES -> SS
+        #   IES  -> I
+        #   SS   -> SS
+        #   S    ->
+        #
+        # (here the conditions are all null) CARESSES maps to CARESS since
+        # SSES is the longest match for S1. Equally CARESS maps to CARESS
+        # (S1=`SS') and CARES to CARE (S1=`S').
+        #
+        # In the rules below, examples of their application, successful or
+        # otherwise, are given on the right in lower case. The algorithm
+        # now follows: see RULES.
+        #
+        # The algorithm is careful not to remove a suffix when the stem is
+        # too short, the length of the stem being given by its measure, m.
+        # There is no linguistic basis for this approach. It was merely
+        # observed that m could be used quite effectively to help decide
+        # whether or not it was wise to take off a suffix.
+        #
+        # -------------------------------------------------------------------
+
+        #
+
+        RULES = {
+          # Step 1a
+          S100: [
+            'SSES -> SS', # caresses -> caress
+            'IES -> I',   # ponies -> poni, ties -> ti
+            'SS -> SS',   # caress -> caress
+            'S -> '       # cats -> cat
+          ],
+
+          # Step 1b
+          S110: [
+            '(m>0) EED -> EE goto(S120)', # agreed -> agree, feed -> feed
+            '(*v*) ED -> goto(S111)',     # plastered -> plaster, bled -> bled
+            '(*v*) ING -> goto(S111)',    # motoring -> motor, sing -> sing
+            'goto(S120)'
+          ],
+
+          # If the second or third of the rules in Step 1b is successful,
+          # the following is done:
+          S111: [
+            'AT -> ATE', # conflat(ed) -> conflate
+            'BL -> BLE', # troubl(ed) -> trouble
+            'IZ -> IZE', # siz(ed) -> size
+            '(*d and not (*L or *S or *Z)) -> -1', # hopp(ing) -> hop
+                                                   # tann(ed) -> tan
+                                                   # fall(ing) -> fall
+                                                   # hiss(ing) -> hiss
+                                                   # fizz(ed) -> fizz
+            '(m=1 and *o) -> E' # fail(ing) -> fail
+                                # fil(ing) -> file
+          ],
+
+          # The rule to map to a single letter causes the removal of one of
+          # the double letter pair. The -E is put back on -AT, -BL and -IZ,
+          # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
+          # This E may be removed in step 4.
+
+          # Step 1c
+          S120: [
+            '(*v*) Y -> I' # happy -> happi, sky -> sky
+          ],
+
+          # Step 1 deals with plurals and past participles. The subsequent
+          # steps are much more straightforward.
+
+          # Step 2
+          S200: [
+            '(m>0) ATIONAL -> ATE', # relational -> relate
+            '(m>0) TIONAL -> TION', # conditional -> condition, rational -> rational
+            '(m>0) ENCI -> ENCE',   # valenci -> valence
+            '(m>0) ANCI -> ANCE',   # hesitanci -> hesitance
+            '(m>0) IZER -> IZE',    # digitizer -> digitize
+            '(m>0) ABLI -> ABLE',   # conformabli -> conformable
+            '(m>0) ALLI -> AL',     # radicalli -> radical
+            '(m>0) ENTLI -> ENT',   # differentli -> different
+            '(m>0) ELI -> E',       # vileli -> vile
+            '(m>0) OUSLI -> OUS',   # analogousli -> analogous
+            '(m>0) IZATION -> IZE', # vietnamization -> vietnamize
+            '(m>0) ATION -> ATE',   # predication -> predicate
+            '(m>0) ATOR -> ATE',    # operator -> operate
+            '(m>0) ALISM -> AL',    # feudalism -> feudal
+            '(m>0) IVENESS -> IVE', # decisiveness -> decisive
+            '(m>0) FULNESS -> FUL', # hopefulness -> hopeful
+            '(m>0) OUSNESS -> OUS', # callousness -> callous
+            '(m>0) ALITI -> AL',    # formaliti -> formal
+            '(m>0) IVITI -> IVE',   # sensitiviti -> sensitive
+            '(m>0) BILITI -> BLE'   # sensibiliti -> sensible
+          ],
+
+          # The test for the string S1 can be made fast by doing a program
+          # switch on the penultimate letter of the word being tested. This
+          # gives a fairly even breakdown of the possible values of the
+          # string S1. It will be seen in fact that the S1-strings in step 2
+          # are presented here in the alphabetical order of their penultimate
+          # letter. Similar techniques may be applied in the other steps.
+
+          # Step 3
+          S300: [
+            '(m>0) ICATE -> IC', # triplicate -> triplic
+            '(m>0) ATIVE -> ',   # formative -> form
+            '(m>0) ALIZE -> AL', # formalize -> formal
+            '(m>0) ICITI -> IC', # electriciti -> electric
+            '(m>0) ICAL -> IC',  # electrical -> electric
+            '(m>0) FUL -> ',     # hopeful -> hope
+            '(m>0) NESS -> '     # goodness -> good
+          ],
+
+          # Step 4
+          S400: [
+            '(m>1) AL -> ',    # revival -> reviv
+            '(m>1) ANCE -> ',  # allowance -> allow
+            '(m>1) ENCE -> ',  # inference -> infer
+            '(m>1) ER -> ',    # airliner -> airlin
+            '(m>1) IC -> ',    # gyroscopic -> gyroscop
+            '(m>1) ABLE -> ',  # adjustable -> adjust
+            '(m>1) IBLE -> ',  # defensible -> defens
+            '(m>1) ANT -> ',   # irritant -> irrit
+            '(m>1) EMENT -> ', # replacement -> replac
+            '(m>1) MENT -> ',  # adjustment -> adjust
+            '(m>1) ENT -> ',   # dependent -> depend
+            '(m>1 and (*S or *T)) ION -> ', # adoption -> adopt
+            '(m>1) OU -> ',    # homologou -> homolog
+            '(m>1) ISM -> ',   # communism -> commun
+            '(m>1) ATE -> ',   # activate -> activ
+            '(m>1) ITI -> ',   # angulariti -> angular
+            '(m>1) OUS -> ',   # homologous -> homolog
+            '(m>1) IVE -> ',   # effective -> effect
+            '(m>1) IZE -> '    # bowdlerize -> bowdler
+          ],
+
+          # The suffixes are now removed. All that remains is a little
+          # tidying up.
+
+          # Step 5a
+          S500: [
+            '(m>1) E -> ',           # probate -> probat, rate -> rate
+            '(m=1 and not *o) E -> ' # cease -> ceas
+          ],
+
+          # Step 5b
+          S510: [
+            '(m > 1 and *d and *L) -> -1' # controll -> control, roll -> roll
+          ]
+        }
+
+        GOTO_RE = %r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
+
+        RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
+
+        def stem(word, found = false)
+          goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }
+
+          RULES.each { |key, rules|
+            next if goto && goto != key.to_s
+
+            rules.each { |rule|
+              case rule
+              when RULE_RE
+                cond, repl, goto = $1, $3, $4
+                stem = word[/(.+)#{$2.downcase}$/, 1] or next
+              when GOTO_RE
+                goto = $1
+                break
+              end
+
+              conv[shad = stem.dup,
+                /[^aeiouy]/ => 'c',
+                /[aeiou]/   => 'v',
+                /cy/        => 'cv',
+                /y/         => 'c'
+              ]
+
+              if cond
+                conv[cond,
+                  'm'   => shad.scan(/vc/).size,
+                  '*v*' => shad.include?('v'),
+                  '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
+                  '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
+                  'and' => '&&',
+                  'or'  => '||',
+                  'not' => '!',
+                  '='   => '=='
+                ]
+
+                last.upcase! if last
+                cond.gsub!(/\*(\w)/) { last == $1 }
+
+                next unless eval(cond)
+              end
+
+              found, word = true, begin
+                stem[0...Integer(repl)]
+              rescue ArgumentError
+                stem << repl.downcase
+              end
+
+              break
+            }
+          }
+
+          word if found
+        end
+
+      end
+
+    end
+
+  end
+
+end
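
The measure m from the comment block above is computed by rewriting the word into a consonant/vowel shadow and counting 'vc' pairs, exactly what the conv lambda inside stem does. A standalone sketch of just that step (the measure method name is made up for illustration):

    # Reduce a word to its c/v shadow and count VC pairs, mirroring the
    # conversion table used in Porter#stem above.
    def measure(stem)
      shad  = stem.dup
      table = { /[^aeiouy]/ => 'c', /[aeiou]/ => 'v', /cy/ => 'cv', /y/ => 'c' }
      table.each { |pattern, repl| shad.gsub!(pattern, repl) }
      shad.scan(/vc/).size  # m in [C](VC){m}[V]
    end

    %w[tree trouble troubles].each { |w| puts "#{w}: m=#{measure(w)}" }
    # tree: m=0, trouble: m=1, troubles: m=2

As for the module itself, tracing the rules above: Porter.stem('caresses') yields "caress" via rule S100, while Porter.stem('tree') returns nil because no rule fires and found stays false. Passing found = true (the second parameter) forces a result even for untouched words, which is how the attendee's mode option uses it.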
data/lib/lingo/attendee/stemmer.rb (renamed from data/info/gpl-hdr.txt)
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 #--
 ###############################################################################
 #                                                                             #
@@ -22,3 +24,34 @@
 ###############################################################################
 #++
 
+class Lingo
+
+  class Attendee
+
+    class Stemmer < self
+
+      protected
+
+      def init
+        extend(Lingo.get_const(get_key('type', 'porter'), self.class))
+
+        @wc  = get_key('wordclass', LA_STEM)
+        @all = get_key('mode', '').downcase == 'all'
+      end
+
+      def process(obj)
+        if obj.is_a?(Word) && obj.unknown?
+          stem = stem(obj.form.downcase, @all)
+          obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
+        end
+
+        forward(obj)
+      end
+
+    end
+
+  end
+
+end
+
+require_relative 'stemmer/porter'
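
Wired into a pipeline, the new attendee reads its type, wordclass and mode settings via get_key as shown. A hedged configuration sketch in the style of the object_filter example earlier in this diff (the channel names and neighbouring attendee are illustrative, not taken from this release):

    # excerpt of a hypothetical lingo.cfg pipeline
    - word_searcher: { in: token, out: words, source: 'sys-dic' }
    - stemmer:       { in: words, out: stems, type: 'porter', mode: 'all' }

    # mode: 'all' passes found = true into stem, so even words no rule
    # touched come back (unchanged) and receive a stem lexical.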
data/lib/lingo/attendee/synonymer.rb
@@ -74,24 +74,25 @@ class Lingo
 
   def init
     set_dic
-    @skip = get_array('skip', WA_UNKNOWN).map(&:upcase)
+    @skip = get_array('skip', WA_UNKNOWN, :upcase)
   end
 
-  def control(cmd, par)
-    @dic.report.each_pair { |k, v| set( k, v ) } if cmd == STR_CMD_STATUS
+  def control(cmd, param)
+    report_on(cmd, @dic)
   end
 
   def process(obj)
-    if obj.is_a?(Word) && @skip.index(obj.attr).nil?
+    if obj.is_a?(Word) && !@skip.include?(obj.attr)
       inc('Anzahl gesuchter Wörter')
 
-      # find the synonyms for all lexicals of the word
-      synos = @dic.find_synonyms(obj)
-      obj.lexicals += synos.sort.uniq
+      unless (syn = @dic.find_synonyms(obj)).empty?
+        inc('Anzahl erweiteter Wörter')
 
-      inc('Anzahl erweiteter Wörter') if synos.size>0
-      add('Anzahl gefundener Synonyme', synos.size)
+        obj.add_lexicals(syn.tap(&:uniq!))
+        add('Anzahl gefundener Synonyme', syn.size)
+      end
     end
+
     forward(obj)
   end
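
One behavioral nuance of this rewrite: the old path sorted and de-duplicated the synonyms (synos.sort.uniq) before appending, while the new path only de-duplicates in place, preserving dictionary order, and counts after de-duplication. A quick illustration of the tap(&:uniq!) idiom used above:

    syn = ['b', 'a', 'b']
    p syn.tap(&:uniq!)  # => ["b", "a"], de-duplicated, original order kept
    # uniq! alone returns nil when nothing changes; tap always returns
    # the receiver, so the chained call stays nil-safe.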