lingo 1.8.1 → 1.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. data/ChangeLog +23 -5
  2. data/README +1 -1
  3. data/Rakefile +5 -7
  4. data/TODO +2 -0
  5. data/bin/lingo +5 -1
  6. data/de.lang +1 -1
  7. data/en/lingo-syn.txt +0 -0
  8. data/en.lang +2 -1
  9. data/lib/lingo/attendee/abbreviator.rb +8 -9
  10. data/lib/lingo/attendee/debugger.rb +5 -4
  11. data/lib/lingo/attendee/decomposer.rb +8 -3
  12. data/lib/lingo/attendee/dehyphenizer.rb +19 -63
  13. data/lib/lingo/attendee/formatter.rb +1 -1
  14. data/lib/lingo/attendee/multi_worder.rb +67 -155
  15. data/lib/lingo/attendee/noneword_filter.rb +16 -9
  16. data/lib/lingo/attendee/object_filter.rb +1 -1
  17. data/lib/lingo/attendee/sequencer.rb +32 -63
  18. data/lib/lingo/attendee/stemmer/porter.rb +343 -0
  19. data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
  20. data/lib/lingo/attendee/synonymer.rb +10 -9
  21. data/lib/lingo/attendee/text_reader.rb +102 -76
  22. data/lib/lingo/attendee/text_writer.rb +23 -26
  23. data/lib/lingo/attendee/tokenizer.rb +13 -27
  24. data/lib/lingo/attendee/variator.rb +26 -66
  25. data/lib/lingo/attendee/vector_filter.rb +42 -43
  26. data/lib/lingo/attendee/word_searcher.rb +6 -7
  27. data/lib/lingo/attendee.rb +25 -7
  28. data/lib/lingo/buffered_attendee.rb +36 -10
  29. data/lib/lingo/cachable.rb +8 -8
  30. data/lib/lingo/config.rb +5 -6
  31. data/lib/lingo/ctl.rb +2 -3
  32. data/lib/lingo/database/crypter.rb +9 -26
  33. data/lib/lingo/database/gdbm_store.rb +3 -5
  34. data/lib/lingo/database/libcdb_store.rb +4 -6
  35. data/lib/lingo/database/sdbm_store.rb +11 -6
  36. data/lib/lingo/database/show_progress.rb +3 -43
  37. data/lib/lingo/database/source/key_value.rb +2 -6
  38. data/lib/lingo/database/source/multi_key.rb +3 -5
  39. data/lib/lingo/database/source/multi_value.rb +2 -6
  40. data/lib/lingo/database/source/single_word.rb +4 -6
  41. data/lib/lingo/database/source/word_class.rb +4 -10
  42. data/lib/lingo/database/source.rb +20 -18
  43. data/lib/lingo/database.rb +84 -59
  44. data/lib/lingo/error.rb +57 -1
  45. data/lib/lingo/language/dictionary.rb +21 -18
  46. data/lib/lingo/language/grammar.rb +40 -49
  47. data/lib/lingo/language/lexical.rb +6 -6
  48. data/lib/lingo/language/lexical_hash.rb +6 -0
  49. data/lib/lingo/language/word.rb +32 -15
  50. data/lib/lingo/language/word_form.rb +1 -1
  51. data/lib/lingo/language.rb +14 -25
  52. data/lib/lingo/reportable.rb +12 -10
  53. data/lib/lingo/show_progress.rb +81 -0
  54. data/lib/lingo/version.rb +1 -1
  55. data/lib/lingo.rb +63 -24
  56. data/lingo-call.cfg +6 -10
  57. data/lingo.cfg +60 -44
  58. data/lir.cfg +42 -41
  59. data/test/attendee/ts_abbreviator.rb +3 -5
  60. data/test/attendee/ts_decomposer.rb +3 -5
  61. data/test/attendee/ts_multi_worder.rb +87 -145
  62. data/test/attendee/ts_noneword_filter.rb +5 -3
  63. data/test/attendee/ts_object_filter.rb +5 -3
  64. data/test/attendee/ts_sequencer.rb +3 -5
  65. data/test/attendee/ts_stemmer.rb +309 -0
  66. data/test/attendee/ts_synonymer.rb +15 -11
  67. data/test/attendee/ts_text_reader.rb +12 -15
  68. data/test/attendee/ts_text_writer.rb +24 -29
  69. data/test/attendee/ts_tokenizer.rb +9 -7
  70. data/test/attendee/ts_variator.rb +4 -4
  71. data/test/attendee/ts_vector_filter.rb +24 -16
  72. data/test/attendee/ts_word_searcher.rb +20 -36
  73. data/test/{lir.csv → lir.vec} +0 -0
  74. data/test/ref/artikel.vec +943 -943
  75. data/test/ref/artikel.ven +943 -943
  76. data/test/ref/lir.non +201 -201
  77. data/test/ref/lir.seq +178 -178
  78. data/test/ref/lir.syn +49 -49
  79. data/test/ref/lir.vec +329 -0
  80. data/test/test_helper.rb +20 -36
  81. data/test/ts_database.rb +10 -10
  82. data/test/ts_language.rb +279 -319
  83. metadata +93 -104
  84. data/info/Objekte.png +0 -0
  85. data/info/Typen.png +0 -0
  86. data/info/database.png +0 -0
  87. data/info/db_small.png +0 -0
  88. data/info/download.png +0 -0
  89. data/info/kerze.png +0 -0
  90. data/info/language.png +0 -0
  91. data/info/lingo.png +0 -0
  92. data/info/logo.png +0 -0
  93. data/info/meeting.png +0 -0
  94. data/info/types.png +0 -0
  95. data/lingo-all.cfg +0 -89
  96. data/porter/stem.cfg +0 -311
  97. data/porter/stem.rb +0 -150
  98. data/test/ref/lir.csv +0 -329
  99. data/test.cfg +0 -79
data/porter/stem.cfg DELETED
@@ -1,311 +0,0 @@
1
- # Stem.cfg
2
- #
3
- # Rules for Porter-Stemmer
4
- #
5
- #
6
- # based on:
7
- # An algorithm for suffix stripping
8
- #
9
- # M.F.Porter
10
- # 1980
11
- #
12
- # Originally published in Program, 14 no. 3, pp 130-137, July 1980. (A
13
- # few typos have been corrected.)
14
- #
15
- # http://tartarus.org/~martin/PorterStemmer/def.txt
16
- #
17
- # --------------------------------------------------
18
- #
19
- #
20
- #
21
- #
22
- # 2. THE ALGORITHM
23
- #
24
- # To present the suffix stripping algorithm in its entirety we will need a few
25
- # definitions.
26
- #
27
- # A \consonant\ in a word is a letter other than A, E, I, O or U, and other
28
- # than Y preceded by a consonant. (The fact that the term `consonant' is
29
- # defined to some extent in terms of itself does not make it ambiguous.) So in
30
- # TOY the consonants are T and Y, and in SYZYGY they are S, Z and G. If a
31
- # letter is not a consonant it is a \vowel\.
32
- #
33
- # A consonant will be denoted by c, a vowel by v. A list ccc... of length
34
- # greater than 0 will be denoted by C, and a list vvv... of length greater
35
- # than 0 will be denoted by V. Any word, or part of a word, therefore has one
36
- # of the four forms:
37
- #
38
- # CVCV ... C
39
- # CVCV ... V
40
- # VCVC ... C
41
- # VCVC ... V
42
- #
43
- # These may all be represented by the single form
44
- #
45
- # [C]VCVC ... [V]
46
- #
47
- # where the square brackets denote arbitrary presence of their contents.
48
- # Using (VC){m} to denote VC repeated m times, this may again be written as
49
- #
50
- # [C](VC){m}[V].
51
- #
52
- # m will be called the \measure\ of any word or word part when represented in
53
- # this form. The case m = 0 covers the null word. Here are some examples:
54
- #
55
- # m=0 TR, EE, TREE, Y, BY.
56
- # m=1 TROUBLE, OATS, TREES, IVY.
57
- # m=2 TROUBLES, PRIVATE, OATEN, ORRERY.
58
- #
59
- # The \rules\ for removing a suffix will be given in the form
60
- #
61
- # (condition) S1 -> S2
62
- #
63
- # This means that if a word ends with the suffix S1, and the stem before S1
64
- # satisfies the given condition, S1 is replaced by S2. The condition is
65
- # usually given in terms of m, e.g.
66
- #
67
- # (m > 1) EMENT ->
68
- #
69
- # Here S1 is `EMENT' and S2 is null. This would map REPLACEMENT to REPLAC,
70
- # since REPLAC is a word part for which m = 2.
71
- #
72
- # The `condition' part may also contain the following:
73
- #
74
- # *S - the stem ends with S (and similarly for the other letters).
75
- #
76
- # *v* - the stem contains a vowel.
77
- #
78
- # *d - the stem ends with a double consonant (e.g. -TT, -SS).
79
- #
80
- # *o - the stem ends cvc, where the second c is not W, X or Y (e.g.
81
- # -WIL, -HOP).
82
- #
83
- # And the condition part may also contain expressions with \and\, \or\ and
84
- # \not\, so that
85
- #
86
- # (m>1 and (*S or *T))
87
- #
88
- # tests for a stem with m>1 ending in S or T, while
89
- #
90
- # (*d and not (*L or *S or *Z))
91
- #
92
- # tests for a stem ending with a double consonant other than L, S or Z.
93
- # Elaborate conditions like this are required only rarely.
94
- #
95
- # In a set of rules written beneath each other, only one is obeyed, and this
96
- # will be the one with the longest matching S1 for the given word. For
97
- # example, with
98
- #
99
- # SSES -> SS
100
- # IES -> I
101
- # SS -> SS
102
- # S ->
103
- #
104
- # (here the conditions are all null) CARESSES maps to CARESS since SSES is
105
- # the longest match for S1. Equally CARESS maps to CARESS (S1=`SS') and CARES
106
- # to CARE (S1=`S').
107
- #
108
- #
109
- ---
110
- stemmer:
111
- # In the rules below, examples of their application, successful or otherwise,
112
- # are given on the right in lower case. The algorithm now follows:
113
- #
114
- # Step 1a
115
- # SSES -> SS caresses -> caress
116
- # IES -> I ponies -> poni
117
- # ties -> ti
118
- # SS -> SS caress -> caress
119
- # S -> cats -> cat
120
- S100:
121
- - SSES -> SS
122
- - IES -> I
123
- - SS -> SS
124
- - S ->
125
- #
126
- # Step 1b
127
- #
128
- # (m>0) EED -> EE feed -> feed
129
- # agreed -> agree
130
- # (*v*) ED -> plastered -> plaster
131
- # bled -> bled
132
- # (*v*) ING -> motoring -> motor
133
- # sing -> sing
134
- S110:
135
- - (m>0) EED -> EE goto(S120)
136
- - (*v*) ED -> goto(S111)
137
- - (*v*) ING -> goto(S111)
138
- - goto(S120)
139
- #
140
- # If the second or third of the rules in Step 1b is successful, the following
141
- # is done:
142
- #
143
- # AT -> ATE conflat(ed) -> conflate
144
- # BL -> BLE troubl(ed) -> trouble
145
- # IZ -> IZE siz(ed) -> size
146
- # (*d and not (*L or *S or *Z))
147
- # -> single letter
148
- # hopp(ing) -> hop
149
- # tann(ed) -> tan
150
- # fall(ing) -> fall
151
- # hiss(ing) -> hiss
152
- # fizz(ed) -> fizz
153
- # (m=1 and *o) -> E fail(ing) -> fail
154
- # fil(ing) -> file
155
- S111:
156
- - AT -> ATE
157
- - BL -> BLE
158
- - IZ -> IZE
159
- - (*d and not (*L or *S or *Z)) -> -1
160
- - (m=1 and *o) -> E
161
- #
162
- # The rule to map to a single letter causes the removal of one of the double
163
- # letter pair. The -E is put back on -AT, -BL and -IZ, so that the suffixes
164
- # -ATE, -BLE and -IZE can be recognised later. This E may be removed in step
165
- # 4.
166
- #
167
- # Step 1c
168
- #
169
- # (*v*) Y -> I happy -> happi
170
- # sky -> sky
171
- S120:
172
- - (*v*) Y -> I
173
- #
174
- # Step 1 deals with plurals and past participles. The subsequent steps are
175
- # much more straightforward.
176
- #
177
- # Step 2
178
- #
179
- # (m>0) ATIONAL -> ATE relational -> relate
180
- # (m>0) TIONAL -> TION conditional -> condition
181
- # rational -> rational
182
- # (m>0) ENCI -> ENCE valenci -> valence
183
- # (m>0) ANCI -> ANCE hesitanci -> hesitance
184
- # (m>0) IZER -> IZE digitizer -> digitize
185
- # (m>0) ABLI -> ABLE conformabli -> conformable
186
- # (m>0) ALLI -> AL radicalli -> radical
187
- # (m>0) ENTLI -> ENT differentli -> different
188
- # (m>0) ELI -> E vileli -> vile
189
- # (m>0) OUSLI -> OUS analogousli -> analogous
190
- # (m>0) IZATION -> IZE vietnamization -> vietnamize
191
- # (m>0) ATION -> ATE predication -> predicate
192
- # (m>0) ATOR -> ATE operator -> operate
193
- # (m>0) ALISM -> AL feudalism -> feudal
194
- # (m>0) IVENESS -> IVE decisiveness -> decisive
195
- # (m>0) FULNESS -> FUL hopefulness -> hopeful
196
- # (m>0) OUSNESS -> OUS callousness -> callous
197
- # (m>0) ALITI -> AL formaliti -> formal
198
- # (m>0) IVITI -> IVE sensitiviti -> sensitive
199
- # (m>0) BILITI -> BLE sensibiliti -> sensible
200
- S200:
201
- - (m>0) ATIONAL -> ATE
202
- - (m>0) TIONAL -> TION
203
- - (m>0) ENCI -> ENCE
204
- - (m>0) ANCI -> ANCE
205
- - (m>0) IZER -> IZE
206
- - (m>0) ABLI -> ABLE
207
- - (m>0) ALLI -> AL
208
- - (m>0) ENTLI -> ENT
209
- - (m>0) ELI -> E
210
- - (m>0) OUSLI -> OUS
211
- - (m>0) IZATION -> IZE
212
- - (m>0) ATION -> ATE
213
- - (m>0) ATOR -> ATE
214
- - (m>0) ALISM -> AL
215
- - (m>0) IVENESS -> IVE
216
- - (m>0) FULNESS -> FUL
217
- - (m>0) OUSNESS -> OUS
218
- - (m>0) ALITI -> AL
219
- - (m>0) IVITI -> IVE
220
- - (m>0) BILITI -> BLE
221
- #
222
- # The test for the string S1 can be made fast by doing a program switch on
223
- # the penultimate letter of the word being tested. This gives a fairly even
224
- # breakdown of the possible values of the string S1. It will be seen in fact
225
- # that the S1-strings in step 2 are presented here in the alphabetical order
226
- # of their penultimate letter. Similar techniques may be applied in the other
227
- # steps.
228
- #
229
- # Step 3
230
- #
231
- # (m>0) ICATE -> IC triplicate -> triplic
232
- # (m>0) ATIVE -> formative -> form
233
- # (m>0) ALIZE -> AL formalize -> formal
234
- # (m>0) ICITI -> IC electriciti -> electric
235
- # (m>0) ICAL -> IC electrical -> electric
236
- # (m>0) FUL -> hopeful -> hope
237
- # (m>0) NESS -> goodness -> good
238
- S300:
239
- - (m>0) ICATE -> IC
240
- - (m>0) ATIVE ->
241
- - (m>0) ALIZE -> AL
242
- - (m>0) ICITI -> IC
243
- - (m>0) ICAL -> IC
244
- - (m>0) FUL ->
245
- - (m>0) NESS ->
246
- #
247
- # Step 4
248
- #
249
- # (m>1) AL -> revival -> reviv
250
- # (m>1) ANCE -> allowance -> allow
251
- # (m>1) ENCE -> inference -> infer
252
- # (m>1) ER -> airliner -> airlin
253
- # (m>1) IC -> gyroscopic -> gyroscop
254
- # (m>1) ABLE -> adjustable -> adjust
255
- # (m>1) IBLE -> defensible -> defens
256
- # (m>1) ANT -> irritant -> irrit
257
- # (m>1) EMENT -> replacement -> replac
258
- # (m>1) MENT -> adjustment -> adjust
259
- # (m>1) ENT -> dependent -> depend
260
- # (m>1 and (*S or *T)) ION -> adoption -> adopt
261
- # (m>1) OU -> homologou -> homolog
262
- # (m>1) ISM -> communism -> commun
263
- # (m>1) ATE -> activate -> activ
264
- # (m>1) ITI -> angulariti -> angular
265
- # (m>1) OUS -> homologous -> homolog
266
- # (m>1) IVE -> effective -> effect
267
- # (m>1) IZE -> bowdlerize -> bowdler
268
- S400:
269
- - (m>1) AL ->
270
- - (m>1) ANCE ->
271
- - (m>1) ENCE ->
272
- - (m>1) ER ->
273
- - (m>1) IC ->
274
- - (m>1) ABLE ->
275
- - (m>1) IBLE ->
276
- - (m>1) ANT ->
277
- - (m>1) EMENT ->
278
- - (m>1) MENT ->
279
- - (m>1) ENT ->
280
- - (m>1 and (*S or *T)) ION ->
281
- - (m>1) OU ->
282
- - (m>1) ISM ->
283
- - (m>1) ATE ->
284
- - (m>1) ITI ->
285
- - (m>1) OUS ->
286
- - (m>1) IVE ->
287
- - (m>1) IZE ->
288
- #
289
- # The suffixes are now removed. All that remains is a little tidying up.
290
- #
291
- # Step 5a
292
- #
293
- # (m>1) E -> probate -> probat
294
- # rate -> rate
295
- # (m=1 and not *o) E -> cease -> ceas
296
- S500:
297
- - (m>1) E ->
298
- - (m=1 and not *o) E ->
299
- #
300
- # Step 5b
301
- #
302
- # (m > 1 and *d and *L) -> single letter
303
- # controll -> control
304
- # roll -> roll
305
- S510:
306
- - (m > 1 and *d and *L) -> -1
307
- #
308
- # The algorithm is careful not to remove a suffix when the stem is too short,
309
- # the length of the stem being given by its measure, m. There is no linguistic
310
- # basis for this approach. It was merely observed that m could be used quite
311
- # effectively to help decide whether or not it was wise to take off a suffix.
data/porter/stem.rb DELETED
@@ -1,150 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require "yaml"
4
- class String
5
- def to_shadow
6
- shadow = self.gsub(/[^aeiouy]/, 'c')
7
- shadow.gsub!(/[aeiou]/, 'v')
8
- shadow.gsub!(/cy/, 'cv')
9
- shadow.gsub!(/y/, 'c')
10
- shadow
11
- end
12
- end
13
-
14
-
15
-
16
- # => condition nil oder eine evaluierbare regel
17
- # => matchExp eine Regexp
18
- # => replacement ist downcase
19
- # => return new stem or nil, if rule didn't match
20
- def checkSingleRule(word, condition, matchExp, replacement)
21
-
22
- # => check for matching rule
23
- return nil unless matchExp.match(word)
24
-
25
- # => remember stem
26
- stem = $1
27
-
28
- # => check condition for rule
29
- unless condition.nil?
30
- evalCondition = condition.dup
31
-
32
- stemShadow = stem.to_shadow
33
-
34
- unless condition.index("m").nil?
35
- m = stemShadow.squeeze.scan(/vc/).size
36
- evalCondition.gsub!(/m/, m.to_s)
37
- end
38
-
39
- unless condition.index("*v*").nil?
40
- evalCondition.gsub!(/\*v\*/, stemShadow.index("v").nil? ? "false" : "true")
41
- end
42
-
43
- unless condition.index("*d").nil?
44
- evalCondition.gsub!(/\*d/, (stemShadow[-1..-1]=="c" && stem[-1]==stem[-2]) ? "true" : "false")
45
- end
46
-
47
- unless condition.index("*o").nil?
48
- bool = /cvc$/.match(stemShadow) && "wxy".index(stemShadow[-1..-1]).nil?
49
- evalCondition.gsub!(/\*o/, bool ? "true" : "false")
50
- end
51
-
52
- while /\*(\w)/.match(evalCondition)
53
- char = $1
54
- if char.downcase == char
55
- abort "unbekannter Buchstabe %s in Regel: %" % [char, condition]
56
- end
57
-
58
- bool = (stem[-1..-1].upcase == char)
59
- evalCondition.gsub!(Regexp.new(Regexp.escape("*#{char}")), bool ? "true" : "false")
60
- end
61
-
62
- evalCondition.gsub!(/and/, '&&')
63
- evalCondition.gsub!(/or/, '||')
64
- evalCondition.gsub!(/not/, '!')
65
- evalCondition.gsub!(/=/, '==')
66
- p evalCondition
67
- return unless eval(evalCondition)
68
- end
69
-
70
- # => stem with replacement
71
- if /^(-\d+)$/.match(replacement)
72
- # => delete last characters from stem, if replacement looks like '-1' oder '-2'
73
- stem[0...($1.to_i)]
74
- else
75
- # => append replacement to stem
76
- stem + replacement
77
- end
78
-
79
- end
80
-
81
- def checkAllRules(word, rules)
82
- sequence = rules.keys.sort.reverse
83
-
84
- actualRuleSet = sequence.pop.to_s
85
-
86
- begin
87
- label = nil
88
-
89
- rules[actualRuleSet].each do |rule|
90
- unless /^(\(.+\)){0,1}\s*(\S*)\s*->\s*(\S*?)\s*(?:goto\((\S+)\))*\s*$/.match(rule)
91
- unless /^\s*goto\s*\(\s*(\S+)\s*\)$/.match(rule)
92
- abort "ungültige Regel: %s" % rule
93
- else
94
- label = $1
95
- break
96
- end
97
- end
98
-
99
- condition, ending, replacement, label = $1, $2.downcase, $3.downcase, $4
100
- p [rule, word, condition, ending, replacement, label ]
101
- result = checkSingleRule(word, condition, Regexp.new("(.+)#{ending}$"), replacement)
102
-
103
- unless result.nil?
104
- p [word, actualRuleSet, rule]
105
- word = result
106
- break
107
- end
108
- end
109
-
110
- if label.nil?
111
- actualRuleSet = sequence.pop.to_s
112
- else
113
- while label != actualRuleSet && !actualRuleSet.nil?
114
- actualRuleSet = sequence.pop.to_s
115
- end
116
- end
117
- end until actualRuleSet.empty?
118
-
119
- word
120
- end
121
-
122
- stemmerConfig = YAML::load_file("stem.cfg")
123
-
124
- $rules = stemmerConfig["stemmer"]
125
-
126
- word = $*[0]
127
- p checkAllRules(word, $rules)
128
-
129
- def test(word, stem)
130
- result = checkAllRules(word, $rules)
131
- if stem != result
132
- warn "Falsches Wort %s, Stem %s, Result %s" % [word, stem, result]
133
- else
134
- warn "Korrekt: Wort %s, Stem %s" % [word, stem]
135
- end
136
- end
137
-
138
-
139
- #test("caresses", "caress")
140
- #test("ponies", "poni")
141
- #test("ties", "ti")
142
- #test("caress", "caress")
143
- #test("cats", "cat")
144
-
145
- #test("feed", "feed")
146
- #?test("agreed", "agree")
147
- #test("plastered", "plaster")
148
- #test("bled", "bled")
149
- #test("motoring", "motor")
150
- #test("sing", "sing")