lingo 1.8.1 → 1.8.2

Files changed (99)
  1. data/ChangeLog +23 -5
  2. data/README +1 -1
  3. data/Rakefile +5 -7
  4. data/TODO +2 -0
  5. data/bin/lingo +5 -1
  6. data/de.lang +1 -1
  7. data/en/lingo-syn.txt +0 -0
  8. data/en.lang +2 -1
  9. data/lib/lingo/attendee/abbreviator.rb +8 -9
  10. data/lib/lingo/attendee/debugger.rb +5 -4
  11. data/lib/lingo/attendee/decomposer.rb +8 -3
  12. data/lib/lingo/attendee/dehyphenizer.rb +19 -63
  13. data/lib/lingo/attendee/formatter.rb +1 -1
  14. data/lib/lingo/attendee/multi_worder.rb +67 -155
  15. data/lib/lingo/attendee/noneword_filter.rb +16 -9
  16. data/lib/lingo/attendee/object_filter.rb +1 -1
  17. data/lib/lingo/attendee/sequencer.rb +32 -63
  18. data/lib/lingo/attendee/stemmer/porter.rb +343 -0
  19. data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
  20. data/lib/lingo/attendee/synonymer.rb +10 -9
  21. data/lib/lingo/attendee/text_reader.rb +102 -76
  22. data/lib/lingo/attendee/text_writer.rb +23 -26
  23. data/lib/lingo/attendee/tokenizer.rb +13 -27
  24. data/lib/lingo/attendee/variator.rb +26 -66
  25. data/lib/lingo/attendee/vector_filter.rb +42 -43
  26. data/lib/lingo/attendee/word_searcher.rb +6 -7
  27. data/lib/lingo/attendee.rb +25 -7
  28. data/lib/lingo/buffered_attendee.rb +36 -10
  29. data/lib/lingo/cachable.rb +8 -8
  30. data/lib/lingo/config.rb +5 -6
  31. data/lib/lingo/ctl.rb +2 -3
  32. data/lib/lingo/database/crypter.rb +9 -26
  33. data/lib/lingo/database/gdbm_store.rb +3 -5
  34. data/lib/lingo/database/libcdb_store.rb +4 -6
  35. data/lib/lingo/database/sdbm_store.rb +11 -6
  36. data/lib/lingo/database/show_progress.rb +3 -43
  37. data/lib/lingo/database/source/key_value.rb +2 -6
  38. data/lib/lingo/database/source/multi_key.rb +3 -5
  39. data/lib/lingo/database/source/multi_value.rb +2 -6
  40. data/lib/lingo/database/source/single_word.rb +4 -6
  41. data/lib/lingo/database/source/word_class.rb +4 -10
  42. data/lib/lingo/database/source.rb +20 -18
  43. data/lib/lingo/database.rb +84 -59
  44. data/lib/lingo/error.rb +57 -1
  45. data/lib/lingo/language/dictionary.rb +21 -18
  46. data/lib/lingo/language/grammar.rb +40 -49
  47. data/lib/lingo/language/lexical.rb +6 -6
  48. data/lib/lingo/language/lexical_hash.rb +6 -0
  49. data/lib/lingo/language/word.rb +32 -15
  50. data/lib/lingo/language/word_form.rb +1 -1
  51. data/lib/lingo/language.rb +14 -25
  52. data/lib/lingo/reportable.rb +12 -10
  53. data/lib/lingo/show_progress.rb +81 -0
  54. data/lib/lingo/version.rb +1 -1
  55. data/lib/lingo.rb +63 -24
  56. data/lingo-call.cfg +6 -10
  57. data/lingo.cfg +60 -44
  58. data/lir.cfg +42 -41
  59. data/test/attendee/ts_abbreviator.rb +3 -5
  60. data/test/attendee/ts_decomposer.rb +3 -5
  61. data/test/attendee/ts_multi_worder.rb +87 -145
  62. data/test/attendee/ts_noneword_filter.rb +5 -3
  63. data/test/attendee/ts_object_filter.rb +5 -3
  64. data/test/attendee/ts_sequencer.rb +3 -5
  65. data/test/attendee/ts_stemmer.rb +309 -0
  66. data/test/attendee/ts_synonymer.rb +15 -11
  67. data/test/attendee/ts_text_reader.rb +12 -15
  68. data/test/attendee/ts_text_writer.rb +24 -29
  69. data/test/attendee/ts_tokenizer.rb +9 -7
  70. data/test/attendee/ts_variator.rb +4 -4
  71. data/test/attendee/ts_vector_filter.rb +24 -16
  72. data/test/attendee/ts_word_searcher.rb +20 -36
  73. data/test/{lir.csv → lir.vec} +0 -0
  74. data/test/ref/artikel.vec +943 -943
  75. data/test/ref/artikel.ven +943 -943
  76. data/test/ref/lir.non +201 -201
  77. data/test/ref/lir.seq +178 -178
  78. data/test/ref/lir.syn +49 -49
  79. data/test/ref/lir.vec +329 -0
  80. data/test/test_helper.rb +20 -36
  81. data/test/ts_database.rb +10 -10
  82. data/test/ts_language.rb +279 -319
  83. metadata +93 -104
  84. data/info/Objekte.png +0 -0
  85. data/info/Typen.png +0 -0
  86. data/info/database.png +0 -0
  87. data/info/db_small.png +0 -0
  88. data/info/download.png +0 -0
  89. data/info/kerze.png +0 -0
  90. data/info/language.png +0 -0
  91. data/info/lingo.png +0 -0
  92. data/info/logo.png +0 -0
  93. data/info/meeting.png +0 -0
  94. data/info/types.png +0 -0
  95. data/lingo-all.cfg +0 -89
  96. data/porter/stem.cfg +0 -311
  97. data/porter/stem.rb +0 -150
  98. data/test/ref/lir.csv +0 -329
  99. data/test.cfg +0 -79
data/lib/lingo/attendee/noneword_filter.rb

@@ -71,32 +71,39 @@ class Lingo
       protected
 
       def init
-        @nonewords = []
+        @nonewords, @sort = [], get_key('sort', true)
       end
 
-      # Control handles the commands for opening and closing a file.
-      # A new set of unrecognized words is registered for each file.
-      def control(cmd, par)
+      def control(cmd, param)
         case cmd
         when STR_CMD_FILE
           @nonewords.clear
         when STR_CMD_EOL
           skip_command
         when STR_CMD_RECORD, STR_CMD_EOF
-          nones = @nonewords.sort.uniq
-          nones.each(&method(:forward))
-          add('Objekte gefiltert', nones.size)
-          @nonewords.clear
+          send_nonewords unless @nonewords.empty?
         end
       end
 
       def process(obj)
         if obj.is_a?(Word) && obj.unknown?
           inc('Anzahl nicht erkannter Wörter')
-          @nonewords << obj.form.downcase
+
+          non = obj.form.downcase
+          @sort ? @nonewords << non : forward(non)
         end
       end
 
+      private
+
+      def send_nonewords
+        @nonewords.sort!
+        @nonewords.uniq!
+
+        add('Objekte gefiltert', @nonewords.size)
+        @nonewords.each(&method(:forward)).clear
+      end
+
     end
 
     # For backwards compatibility.
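The new 'sort' option (read in init, default true) keeps the old behavior of emitting a sorted, de-duplicated list per record or file; with 'sort: false' each non-word is forwarded as soon as it is seen. A hypothetical lingo.cfg entry (channel names are illustrative, only the 'sort' key is new in this release):

    - noneword_filter: { in: words, out: nones, sort: false }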
data/lib/lingo/attendee/object_filter.rb

@@ -59,7 +59,7 @@ class Lingo
     #   - text_reader:   { out: lines, files: '$(files)' }
     #   - tokenizer:     { in: lines, out: token }
     #   - word_searcher: { in: token, out: words, source: 'sys-dic' }
-    #   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_SUBSTANTIV' }
+    #   - object_filter: { in: words, out: filtr, objects: 'obj.kind_of?(Word) && obj.lexicals.size>0 && obj.lexicals[0].attr==LA_NOUN' }
     #   - debugger:      { in: filtr, prompt: 'out>' }
     # Running <tt>lingo -c t1 test.txt</tt> produces this debugger output:
     #   out> *FILE('test.txt')
data/lib/lingo/attendee/sequencer.rb

@@ -96,92 +96,61 @@ class Lingo
       protected
 
       def init
-        # evaluate the parameters
-        @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map(&:upcase)
-        @seq_strings = get_key('sequences').map { |e| WordSequence.new(*e) }
+        @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
 
-        raise MissingConfigError.new(:sequences) if @seq_strings.empty?
+        @seq = get_key('sequences').map { |string, format|
+          [string = string.downcase, string.split(//), format]
+        }
+
+        raise MissingConfigError.new(:sequences) if @seq.empty?
       end
 
-      def control(cmd, par)
-        # every control object also triggers processing
+      def control(cmd, param)
        process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
       end
 
       def process_buffer?
-        # start buffer processing when stopper tokens are found or at unknown words
-        item = @buffer.last
-        (item.is_a?(WordForm) && @stopper.include?(item.attr.upcase)) ||
-          (item.is_a?(Word) && item.unknown?)
+        (obj = @buffer.last).is_a?(WordForm) && (obj.is_a?(Word) &&
+          obj.unknown? || @stopper.include?(obj.attr.upcase))
       end
 
       def process_buffer
-        return if @buffer.empty?
-
-        unless @buffer.size < 2
-          matches = Hash.new { |h, k| h[k] = [] }
-
-          sequences(@buffer.map { |obj|
-            obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
-          }).uniq.each { |sequence|
-            @seq_strings.each { |wordseq|
-              wordseq.scan(sequence) { |pos, form, classes|
-                inc('Anzahl erkannter Sequenzen')
-
-                classes.each_with_index { |wc, index|
-                  @buffer[pos + index].lexicals.find { |lex|
-                    form.gsub!(index.succ.to_s, lex.form) if lex.attr == wc
-                  } or break
-                } or next
-
-                matches[pos] << form
-              }
-            }
-          }
-
-          matches.sort.each { |pos, forms|
-            forms.uniq.each { |form|
-              deferred_insert(pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE))
-            }
-          }
-        end
-
+        insert_sequences if @buffer.size > 1
        forward_buffer
       end
 
       private
 
-      def sequences(map)
-        res = map.shift
+      def insert_sequences
+        matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq
 
-        map.each { |classes|
-          temp = []
-          res.each { |wc1| classes.each { |wc2| temp << (wc1 + wc2) } }
-          res = temp
+        map = buf.map { |obj|
+          obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
        }
 
-        res
-      end
-
-      class WordSequence
+        map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
+          seq.each { |string, classes, format|
+            while pos = q.index(string, pos || 0)
+              inc('Anzahl erkannter Sequenzen')
 
-        attr_reader :classes, :format, :string
+              fmt = format.dup
 
-        def initialize(wordclasses, format)
-          @string = wordclasses.downcase
-          @classes = @string.split(//)
-          @format = format
-        end
+              classes.each_with_index { |wc, i|
+                buf[pos + i].lexicals.find { |l|
+                  fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
+                } or break
+              } or next
 
-        def scan(sequence)
-          pos = 0
+              matches[pos] << fmt
 
-          while pos = sequence.index(string, pos)
-            yield pos, format.dup, classes
-            pos += 1
-          end
-        end
+              pos += 1
+            end
+          }
+        }
 
+        matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
+          @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
+        } }
       end
 
     end
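The rewrite folds the former 'sequences' helper and the WordSequence class into insert_sequences, using Array#product to enumerate all candidate word-class strings at once. A minimal Ruby sketch of that expansion step, with made-up class attributes ('s' noun, 'a' adjective, '#' unknown):

    # one word-class list per buffer item
    map = [['s'], ['a', 's'], ['s']]

    # cartesian product of the lists, joined into candidate class strings
    map.shift.product(*map).map!(&:join).tap(&:uniq!)
    # => ["sas", "sss"]

A configured sequence string such as 'as' then matches "sas" at index 1, and its format (e.g. '1 2') is filled in from the matching lexicals.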
data/lib/lingo/attendee/stemmer/porter.rb (new file)

@@ -0,0 +1,343 @@
+# encoding: utf-8
+
+#--
+###############################################################################
+#                                                                             #
+# Lingo -- A full-featured automatic indexing system                          #
+#                                                                             #
+# Copyright (C) 2005-2007 John Vorhauer                                       #
+# Copyright (C) 2007-2012 John Vorhauer, Jens Wille                           #
+#                                                                             #
+# Lingo is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with Lingo. If not, see <http://www.gnu.org/licenses/>.               #
+#                                                                             #
+###############################################################################
+#++
+
+class Lingo
+
+  class Attendee
+
+    class Stemmer
+
+      module Porter
+
+        extend self
+
+        # Rules for Porter-Stemmer, based on:
+        #
+        #   An algorithm for suffix stripping
+        #
+        #   M.F. Porter
+        #   1980
+        #
+        # Originally published in Program, 14 no. 3, pp 130-137, July 1980.
+        # (A few typos have been corrected.)
+        #
+        # http://tartarus.org/~martin/PorterStemmer/def.txt
+        #
+        # -------------------------------------------------------------------
+        #
+        # 2. THE ALGORITHM
+        #
+        # To present the suffix stripping algorithm in its entirety we will
+        # need a few definitions.
+        #
+        # A _consonant_ in a word is a letter other than A, E, I, O or U,
+        # and other than Y preceded by a consonant. (The fact that the term
+        # `consonant' is defined to some extent in terms of itself does not
+        # make it ambiguous.) So in TOY the consonants are T and Y, and in
+        # SYZYGY they are S, Z and G. If a letter is not a consonant it is
+        # a _vowel_.
+        #
+        # A consonant will be denoted by c, a vowel by v. A list ccc... of
+        # length greater than 0 will be denoted by C, and a list vvv... of
+        # length greater than 0 will be denoted by V. Any word, or part of
+        # a word, therefore has one of the four forms:
+        #
+        #   CVCV ... C
+        #   CVCV ... V
+        #   VCVC ... C
+        #   VCVC ... V
+        #
+        # These may all be represented by the single form
+        #
+        #   [C]VCVC ... [V]
+        #
+        # where the square brackets denote arbitrary presence of their
+        # contents. Using (VC){m} to denote VC repeated m times, this
+        # may again be written as
+        #
+        #   [C](VC){m}[V].
+        #
+        # m will be called the _measure_ of any word or word part when
+        # represented in this form. The case m = 0 covers the null word.
+        # Here are some examples:
+        #
+        #   m=0  TR, EE, TREE, Y, BY.
+        #   m=1  TROUBLE, OATS, TREES, IVY.
+        #   m=2  TROUBLES, PRIVATE, OATEN, ORRERY.
+        #
+        # The _rules_ for removing a suffix will be given in the form
+        #
+        #   (condition) S1 -> S2
+        #
+        # This means that if a word ends with the suffix S1, and the stem
+        # before S1 satisfies the given condition, S1 is replaced by S2.
+        # The condition is usually given in terms of m, e.g.
+        #
+        #   (m > 1) EMENT ->
+        #
+        # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to
+        # REPLAC, since REPLAC is a word part for which m = 2.
+        #
+        # The `condition' part may also contain the following:
+        #
+        #   *S  - the stem ends with S (and similarly for the other letters).
+        #
+        #   *v* - the stem contains a vowel.
+        #
+        #   *d  - the stem ends with a double consonant (e.g. -TT, -SS).
+        #
+        #   *o  - the stem ends cvc, where the second c is not W, X or Y
+        #         (e.g. -WIL, -HOP).
+        #
+        # And the condition part may also contain expressions with _and_,
+        # _or_ and _not_, so that
+        #
+        #   (m>1 and (*S or *T))
+        #
+        # tests for a stem with m>1 ending in S or T, while
+        #
+        #   (*d and not (*L or *S or *Z))
+        #
+        # tests for a stem ending with a double consonant other than L, S
+        # or Z. Elaborate conditions like this are required only rarely.
+        #
+        # In a set of rules written beneath each other, only one is obeyed,
+        # and this will be the one with the longest matching S1 for the
+        # given word. For example, with
+        #
+        #   SSES -> SS
+        #   IES  -> I
+        #   SS   -> SS
+        #   S    ->
+        #
+        # (here the conditions are all null) CARESSES maps to CARESS since
+        # SSES is the longest match for S1. Equally CARESS maps to CARESS
+        # (S1=`SS') and CARES to CARE (S1=`S').
+        #
+        # In the rules below, examples of their application, successful or
+        # otherwise, are given on the right in lower case. The algorithm
+        # now follows: see RULES.
+        #
+        # The algorithm is careful not to remove a suffix when the stem is
+        # too short, the length of the stem being given by its measure, m.
+        # There is no linguistic basis for this approach. It was merely
+        # observed that m could be used quite effectively to help decide
+        # whether or not it was wise to take off a suffix.
+        #
+        # -------------------------------------------------------------------
+
+        #
+
+        RULES = {
+          # Step 1a
+          S100: [
+            'SSES -> SS',  # caresses -> caress
+            'IES -> I',    # ponies -> poni, ties -> ti
+            'SS -> SS',    # caress -> caress
+            'S -> '        # cats -> cat
+          ],
+
+          # Step 1b
+          S110: [
+            '(m>0) EED -> EE goto(S120)',  # agreed -> agree, feed -> feed
+            '(*v*) ED -> goto(S111)',      # plastered -> plaster, bled -> bled
+            '(*v*) ING -> goto(S111)',     # motoring -> motor, sing -> sing
+            'goto(S120)'
+          ],
+
+          # If the second or third of the rules in Step 1b is successful,
+          # the following is done:
+          S111: [
+            'AT -> ATE',  # conflat(ed) -> conflate
+            'BL -> BLE',  # troubl(ed) -> trouble
+            'IZ -> IZE',  # siz(ed) -> size
+            '(*d and not (*L or *S or *Z)) -> -1',  # hopp(ing) -> hop
+                                                    # tann(ed) -> tan
+                                                    # fall(ing) -> fall
+                                                    # hiss(ing) -> hiss
+                                                    # fizz(ed) -> fizz
+            '(m=1 and *o) -> E'  # fail(ing) -> fail
+                                 # fil(ing) -> file
+          ],
+
+          # The rule to map to a single letter causes the removal of one of
+          # the double letter pair. The -E is put back on -AT, -BL and -IZ,
+          # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
+          # This E may be removed in step 4.
+
+          # Step 1c
+          S120: [
+            '(*v*) Y -> I'  # happy -> happi, sky -> sky
+          ],
+
+          # Step 1 deals with plurals and past participles. The subsequent
+          # steps are much more straightforward.
+
+          # Step 2
+          S200: [
+            '(m>0) ATIONAL -> ATE',  # relational -> relate
+            '(m>0) TIONAL -> TION',  # conditional -> condition, rational -> rational
+            '(m>0) ENCI -> ENCE',    # valenci -> valence
+            '(m>0) ANCI -> ANCE',    # hesitanci -> hesitance
+            '(m>0) IZER -> IZE',     # digitizer -> digitize
+            '(m>0) ABLI -> ABLE',    # conformabli -> conformable
+            '(m>0) ALLI -> AL',      # radicalli -> radical
+            '(m>0) ENTLI -> ENT',    # differentli -> different
+            '(m>0) ELI -> E',        # vileli -> vile
+            '(m>0) OUSLI -> OUS',    # analogousli -> analogous
+            '(m>0) IZATION -> IZE',  # vietnamization -> vietnamize
+            '(m>0) ATION -> ATE',    # predication -> predicate
+            '(m>0) ATOR -> ATE',     # operator -> operate
+            '(m>0) ALISM -> AL',     # feudalism -> feudal
+            '(m>0) IVENESS -> IVE',  # decisiveness -> decisive
+            '(m>0) FULNESS -> FUL',  # hopefulness -> hopeful
+            '(m>0) OUSNESS -> OUS',  # callousness -> callous
+            '(m>0) ALITI -> AL',     # formaliti -> formal
+            '(m>0) IVITI -> IVE',    # sensitiviti -> sensitive
+            '(m>0) BILITI -> BLE'    # sensibiliti -> sensible
+          ],
+
+          # The test for the string S1 can be made fast by doing a program
+          # switch on the penultimate letter of the word being tested. This
+          # gives a fairly even breakdown of the possible values of the
+          # string S1. It will be seen in fact that the S1-strings in step 2
+          # are presented here in the alphabetical order of their penultimate
+          # letter. Similar techniques may be applied in the other steps.
+
+          # Step 3
+          S300: [
+            '(m>0) ICATE -> IC',  # triplicate -> triplic
+            '(m>0) ATIVE -> ',    # formative -> form
+            '(m>0) ALIZE -> AL',  # formalize -> formal
+            '(m>0) ICITI -> IC',  # electriciti -> electric
+            '(m>0) ICAL -> IC',   # electrical -> electric
+            '(m>0) FUL -> ',      # hopeful -> hope
+            '(m>0) NESS -> '      # goodness -> good
+          ],
+
+          # Step 4
+          S400: [
+            '(m>1) AL -> ',                  # revival -> reviv
+            '(m>1) ANCE -> ',                # allowance -> allow
+            '(m>1) ENCE -> ',                # inference -> infer
+            '(m>1) ER -> ',                  # airliner -> airlin
+            '(m>1) IC -> ',                  # gyroscopic -> gyroscop
+            '(m>1) ABLE -> ',                # adjustable -> adjust
+            '(m>1) IBLE -> ',                # defensible -> defens
+            '(m>1) ANT -> ',                 # irritant -> irrit
+            '(m>1) EMENT -> ',               # replacement -> replac
+            '(m>1) MENT -> ',                # adjustment -> adjust
+            '(m>1) ENT -> ',                 # dependent -> depend
+            '(m>1 and (*S or *T)) ION -> ',  # adoption -> adopt
+            '(m>1) OU -> ',                  # homologou -> homolog
+            '(m>1) ISM -> ',                 # communism -> commun
+            '(m>1) ATE -> ',                 # activate -> activ
+            '(m>1) ITI -> ',                 # angulariti -> angular
+            '(m>1) OUS -> ',                 # homologous -> homolog
+            '(m>1) IVE -> ',                 # effective -> effect
+            '(m>1) IZE -> '                  # bowdlerize -> bowdler
+          ],
+
+          # The suffixes are now removed. All that remains is a little
+          # tidying up.
+
+          # Step 5a
+          S500: [
+            '(m>1) E -> ',           # probate -> probat, rate -> rate
+            '(m=1 and not *o) E -> ' # cease -> ceas
+          ],
+
+          # Step 5b
+          S510: [
+            '(m > 1 and *d and *L) -> -1'  # controll -> control, roll -> roll
+          ]
+        }
+
+        GOTO_RE = %r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
+
+        RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
+
+        def stem(word, found = false)
+          goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }
+
+          RULES.each { |key, rules|
+            next if goto && goto != key.to_s
+
+            rules.each { |rule|
+              case rule
+              when RULE_RE
+                cond, repl, goto = $1, $3, $4
+                stem = word[/(.+)#{$2.downcase}$/, 1] or next
+              when GOTO_RE
+                goto = $1
+                break
+              end
+
+              conv[shad = stem.dup,
+                /[^aeiouy]/ => 'c',
+                /[aeiou]/   => 'v',
+                /cy/        => 'cv',
+                /y/         => 'c'
+              ]
+
+              if cond
+                conv[cond,
+                  'm'   => shad.scan(/vc/).size,
+                  '*v*' => shad.include?('v'),
+                  '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
+                  '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
+                  'and' => '&&',
+                  'or'  => '||',
+                  'not' => '!',
+                  '='   => '=='
+                ]
+
+                last.upcase! if last
+                cond.gsub!(/\*(\w)/) { last == $1 }
+
+                next unless eval(cond)
+              end
+
+              found, word = true, begin
+                stem[0...Integer(repl)]
+              rescue ArgumentError
+                stem << repl.downcase
+              end
+
+              break
+            }
+          }
+
+          word if found
+        end
+
+      end
+
+    end
+
+  end
+
+end
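Two sanity checks against the rules documented above, assuming lowercase input as in the attendee's call site (extend self makes the module its own receiver; stem returns nil when no rule fired and found was not preset):

    porter = Lingo::Attendee::Stemmer::Porter

    porter.stem('caresses')     # => "caress" (Step 1a: SSES -> SS)
    porter.stem('replacement')  # => "replac" (Step 4: (m>1) EMENT -> )

The measure m is computed on a consonant/vowel shadow of the stem: 'troubles' shadows to "ccvvccvc", which contains two "vc" pairs, hence m = 2, consistent with the m=2 examples in the comment block.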
data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} (renamed)

@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 #--
 ###############################################################################
 #                                                                             #
@@ -22,3 +24,34 @@
 ###############################################################################
 #++
 
+class Lingo
+
+  class Attendee
+
+    class Stemmer < self
+
+      protected
+
+      def init
+        extend(Lingo.get_const(get_key('type', 'porter'), self.class))
+
+        @wc = get_key('wordclass', LA_STEM)
+        @all = get_key('mode', '').downcase == 'all'
+      end
+
+      def process(obj)
+        if obj.is_a?(Word) && obj.unknown?
+          stem = stem(obj.form.downcase, @all)
+          obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
+        end
+
+        forward(obj)
+      end
+
+    end
+
+  end
+
+end
+
+require_relative 'stemmer/porter'
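Note that process passes @all as the stemmer's found flag, so mode: 'all' presumably yields a stem even when no rule applied. A hypothetical pipeline entry (channel names are illustrative; type, wordclass and mode are the keys read in init):

    - stemmer: { in: words, out: stems, type: 'porter', mode: 'all' }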
data/lib/lingo/attendee/synonymer.rb

@@ -74,24 +74,25 @@ class Lingo
 
       def init
         set_dic
-        @skip = get_array('skip', WA_UNKNOWN).map(&:upcase)
+        @skip = get_array('skip', WA_UNKNOWN, :upcase)
       end
 
-      def control(cmd, par)
-        @dic.report.each_pair { |k, v| set( k, v ) } if cmd == STR_CMD_STATUS
+      def control(cmd, param)
+        report_on(cmd, @dic)
       end
 
       def process(obj)
-        if obj.is_a?(Word) && @skip.index(obj.attr).nil?
+        if obj.is_a?(Word) && !@skip.include?(obj.attr)
           inc('Anzahl gesuchter Wörter')
 
-          # find the synonyms for all lexicals of the word
-          synos = @dic.find_synonyms(obj)
-          obj.lexicals += synos.sort.uniq
+          unless (syn = @dic.find_synonyms(obj)).empty?
+            inc('Anzahl erweiteter Wörter')
 
-          inc('Anzahl erweiteter Wörter') if synos.size>0
-          add('Anzahl gefundener Synonyme', synos.size)
+            obj.add_lexicals(syn.tap(&:uniq!))
+            add('Anzahl gefundener Synonyme', syn.size)
+          end
         end
+
         forward(obj)
       end
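Judging from the call sites changed in this release (here and for the sequencer's stopper option), the extended get_array signature appears to fold the per-element mapping into the helper. The presumed equivalence, not shown in this diff:

    get_array('skip', WA_UNKNOWN, :upcase)
    # ~ get_array('skip', WA_UNKNOWN).map(&:upcase)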