lingo 1.8.1 → 1.8.2
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
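The most visible additions are the new Porter stemmer attendee (lib/lingo/attendee/stemmer.rb and lib/lingo/attendee/stemmer/porter.rb, with test/attendee/ts_stemmer.rb) and a rewrite of the attendee test suites: each test now appears to pass its options, input and expected output directly to the meet helper instead of setting @input/@expect and then calling meet(options) as before. The real helper lives in test/test_helper.rb (+20 -36), which this view does not show; the following self-contained toy only illustrates the calling convention the rewritten suites below rely on. It is not lingo code, it uses Minitest rather than the gem's own test setup, and run_attendee is an invented stand-in.

require 'minitest/autorun'

class ToyAttendeeTestCase < Minitest::Test
  # Toy "attendee": upcases every string it is fed; options are ignored here.
  def run_attendee(_options, input)
    input.map(&:upcase)
  end

  # The convention used throughout the diffs below:
  # meet(options, input, expect) runs one attendee over `input` and, when
  # `expect` is given, compares what it emitted against it.
  def meet(options, input, expect = nil)
    output = run_attendee(options, input)
    assert_equal(expect, output) if expect
    output
  end

  def test_meet_convention
    meet({}, ['bla', 'blub'], ['BLA', 'BLUB'])
  end
end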
data/test/attendee/ts_stemmer.rb (new file)
@@ -0,0 +1,309 @@
+# encoding: utf-8
+
+require_relative '../test_helper'
+
+class TestAttendeeStemmer < AttendeeTestCase
+
+  def test_type
+    assert_raise(Lingo::NameNotFoundError) { meet({ 'type' => 'bla' }, []) }
+  end
+
+  def test_basic
+    meet({}, [
+      wd('bla|IDF'),
+      wd('blub|?'),
+      wd('blubs|?'),
+      ai('EOF|')
+    ], [
+      wd('bla|IDF'),
+      wd('blub|?'),
+      wd('blubs|?', 'blub|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_wc
+    meet({ 'wordclass' => 'w' }, [
+      wd('bla|IDF'),
+      wd('blub|?'),
+      wd('blubs|?'),
+      ai('EOF|')
+    ], [
+      wd('bla|IDF'),
+      wd('blub|?'),
+      wd('blubs|?', 'blub|w'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_mode
+    meet({ 'mode' => '' }, [
+      wd('bla|IDF'),
+      wd('a|?'),
+      wd('yet|?'),
+      wd('blubs|?'),
+      ai('EOF|')
+    ], [
+      wd('bla|IDF'),
+      wd('a|?'),
+      wd('yet|?'),
+      wd('blubs|?', 'blub|z'),
+      ai('EOF|')
+    ])
+
+    meet({ 'mode' => 'all' }, [
+      wd('bla|IDF'),
+      wd('a|?'),
+      wd('yet|?'),
+      wd('blubs|?'),
+      ai('EOF|')
+    ], [
+      wd('bla|IDF'),
+      wd('a|?', 'a|z'),
+      wd('yet|?', 'yet|z'),
+      wd('blubs|?', 'blub|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_100
+    meet({}, [
+      wd('S100|IDF'),
+      wd('caresses|?'),
+      wd('ponies|?'),
+      wd('ties|?'),
+      wd('caress|?'),
+      wd('cats|?'),
+      ai('EOF|')
+    ], [
+      wd('S100|IDF'),
+      wd('caresses|?', 'caress|z'),
+      wd('ponies|?', 'poni|z'),
+      wd('ties|?', 'ti|z'), # snowball: tie
+      wd('caress|?', 'caress|z'),
+      wd('cats|?', 'cat|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_110
+    meet({ 'mode' => 'all' }, [
+      wd('S110|IDF'),
+      wd('agreed|?'),
+      wd('feed|?'),
+      wd('plastered|?'),
+      wd('bled|?'),
+      wd('motoring|?'),
+      wd('sing|?'),
+      ai('EOF|')
+    ], [
+      wd('S110|IDF'),
+      wd('agreed|?', 'agre|z'),
+      wd('feed|?', 'fe|z'), # snowball: feed
+      wd('plastered|?', 'plaster|z'),
+      wd('bled|?', 'bled|z'),
+      wd('motoring|?', 'motor|z'),
+      wd('sing|?', 'sing|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_111
+    meet({}, [
+      wd('S111|IDF'),
+      wd('conflated|?'),
+      wd('troubled|?'),
+      wd('sized|?'),
+      wd('hopping|?'),
+      wd('tanned|?'),
+      wd('falling|?'),
+      wd('hissing|?'),
+      wd('fizzed|?'),
+      wd('failing|?'),
+      wd('filing|?'),
+      ai('EOF|')
+    ], [
+      wd('S111|IDF'),
+      wd('conflated|?', 'conflat|z'),
+      wd('troubled|?', 'troubl|z'),
+      wd('sized|?', 'size|z'),
+      wd('hopping|?', 'hop|z'),
+      wd('tanned|?', 'tan|z'),
+      wd('falling|?', 'fall|z'),
+      wd('hissing|?', 'hiss|z'),
+      wd('fizzed|?', 'fizz|z'),
+      wd('failing|?', 'fail|z'),
+      wd('filing|?', 'file|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_120
+    meet({ 'mode' => 'all' }, [
+      wd('S120|IDF'),
+      wd('happy|?'),
+      wd('sky|?'),
+      ai('EOF|')
+    ], [
+      wd('S120|IDF'),
+      wd('happy|?', 'happi|z'),
+      wd('sky|?', 'sky|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_200
+    meet({}, [
+      wd('S200|IDF'),
+      wd('relational|?'),
+      wd('conditional|?'),
+      wd('rational|?'),
+      wd('valency|?'),
+      wd('hesitancy|?'),
+      wd('digitizer|?'),
+      wd('conformably|?'),
+      wd('radically|?'),
+      wd('differently|?'),
+      wd('vilely|?'),
+      wd('analogously|?'),
+      wd('vietnamization|?'),
+      wd('predication|?'),
+      wd('operator|?'),
+      wd('feudalism|?'),
+      wd('decisiveness|?'),
+      wd('hopefulness|?'),
+      wd('callousness|?'),
+      wd('formality|?'),
+      wd('sensitivity|?'),
+      wd('sensibility|?'),
+      ai('EOF|')
+    ], [
+      wd('S200|IDF'),
+      wd('relational|?', 'relat|z'),
+      wd('conditional|?', 'condit|z'),
+      wd('rational|?', 'ration|z'),
+      wd('valency|?', 'valenc|z'),
+      wd('hesitancy|?', 'hesit|z'),
+      wd('digitizer|?', 'digit|z'),
+      wd('conformably|?', 'conform|z'),
+      wd('radically|?', 'radic|z'),
+      wd('differently|?', 'differ|z'),
+      wd('vilely|?', 'vile|z'),
+      wd('analogously|?', 'analog|z'),
+      wd('vietnamization|?', 'vietnam|z'),
+      wd('predication|?', 'predic|z'),
+      wd('operator|?', 'oper|z'),
+      wd('feudalism|?', 'feudal|z'),
+      wd('decisiveness|?', 'decis|z'),
+      wd('hopefulness|?', 'hope|z'),
+      wd('callousness|?', 'callous|z'),
+      wd('formality|?', 'formal|z'),
+      wd('sensitivity|?', 'sensit|z'),
+      wd('sensibility|?', 'sensibl|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_300
+    meet({}, [
+      wd('S300|IDF'),
+      wd('triplicate|?'),
+      wd('formative|?'),
+      wd('formalize|?'),
+      wd('electricity|?'),
+      wd('electrical|?'),
+      wd('hopeful|?'),
+      wd('goodness|?'),
+      ai('EOF|')
+    ], [
+      wd('S300|IDF'),
+      wd('triplicate|?', 'triplic|z'),
+      wd('formative|?', 'form|z'), # snowball: format
+      wd('formalize|?', 'formal|z'),
+      wd('electricity|?', 'electr|z'),
+      wd('electrical|?', 'electr|z'),
+      wd('hopeful|?', 'hope|z'),
+      wd('goodness|?', 'good|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_400
+    meet({}, [
+      wd('S400|IDF'),
+      wd('revival|?'),
+      wd('allowance|?'),
+      wd('inference|?'),
+      wd('airliner|?'),
+      wd('gyroscopic|?'),
+      wd('adjustable|?'),
+      wd('defensible|?'),
+      wd('irritant|?'),
+      wd('replacement|?'),
+      wd('adjustment|?'),
+      wd('dependent|?'),
+      wd('adoption|?'),
+      wd('homologou|?'),
+      wd('communism|?'),
+      wd('activate|?'),
+      wd('angularity|?'),
+      wd('homologous|?'),
+      wd('effective|?'),
+      wd('bowdlerize|?'),
+      ai('EOF|')
+    ], [
+      wd('S400|IDF'),
+      wd('revival|?', 'reviv|z'),
+      wd('allowance|?', 'allow|z'),
+      wd('inference|?', 'infer|z'),
+      wd('airliner|?', 'airlin|z'),
+      wd('gyroscopic|?', 'gyroscop|z'),
+      wd('adjustable|?', 'adjust|z'),
+      wd('defensible|?', 'defens|z'),
+      wd('irritant|?', 'irrit|z'),
+      wd('replacement|?', 'replac|z'),
+      wd('adjustment|?', 'adjust|z'),
+      wd('dependent|?', 'depend|z'),
+      wd('adoption|?', 'adopt|z'),
+      wd('homologou|?', 'homolog|z'), # snowball: homologou
+      wd('communism|?', 'commun|z'), # snowball: communism
+      wd('activate|?', 'activ|z'),
+      wd('angularity|?', 'angular|z'),
+      wd('homologous|?', 'homolog|z'),
+      wd('effective|?', 'effect|z'),
+      wd('bowdlerize|?', 'bowdler|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_500
+    meet({ 'mode' => 'all' }, [
+      wd('S500|IDF'),
+      wd('probate|?'),
+      wd('rate|?'),
+      wd('cease|?'),
+      ai('EOF|')
+    ], [
+      wd('S500|IDF'),
+      wd('probate|?', 'probat|z'),
+      wd('rate|?', 'rate|z'),
+      wd('cease|?', 'ceas|z'),
+      ai('EOF|')
+    ])
+  end
+
+  def test_examples_510
+    meet({ 'mode' => 'all' }, [
+      wd('S510|IDF'),
+      wd('controll|?'),
+      wd('roll|?'),
+      ai('EOF|')
+    ], [
+      wd('S510|IDF'),
+      wd('controll|?', 'control|z'),
+      wd('roll|?', 'roll|z'),
+      ai('EOF|')
+    ])
+  end
+
+end
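The S100 block above is the standard plural group from Porter's 1980 algorithm: "sses" becomes "ss", "ies" becomes "i", a final "ss" is kept, and a lone trailing "s" is dropped. For reference, that single step can be written in a few lines of Ruby; this is only an illustration of the rule those examples exercise, not the gem's stemmer (which lives in lib/lingo/attendee/stemmer/porter.rb and implements all steps):

# Step 1a of the Porter (1980) algorithm, the rule behind the S100 cases.
def porter_step_1a(word)
  if    word.end_with?('sses') then word.sub(/sses\z/, 'ss') # caresses -> caress
  elsif word.end_with?('ies')  then word.sub(/ies\z/, 'i')   # ponies -> poni, ties -> ti
  elsif word.end_with?('ss')   then word                     # caress -> caress
  elsif word.end_with?('s')    then word.chomp('s')          # cats -> cat
  else  word
  end
end

%w[caresses ponies ties caress cats].map { |w| porter_step_1a(w) }
# => ["caress", "poni", "ti", "caress", "cat"]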
data/test/attendee/ts_synonymer.rb
@@ -5,23 +5,27 @@ require_relative '../test_helper'
 class TestAttendeeSynonymer < AttendeeTestCase
 
   def test_basic
-    … (old test body not preserved in this view)
+    meet({ 'source' => 'sys-syn', 'check' => '-,MUL' }, [
+      wd('abtastzeiten|IDF', 'abtastzeit|s')
+    ], [
+      wd('abtastzeiten|IDF', 'abtastzeit|s', 'abtastfrequenz|y', 'abtastperiode|y')
+    ])
   end
 
   def test_first
-    …
+    meet({ 'source' => 'sys-syn,tst-syn', 'check' => '-,MUL', 'mode' => 'first' }, [
+      wd('Aktienanleihe|IDF', 'aktienanleihe|s')
+    ], [
+      wd('Aktienanleihe|IDF', 'aktienanleihe|s', 'aktien-anleihe|y', 'reverse convertible bond|y', 'reverse convertibles|y')
+    ])
   end
 
   def test_all
-    …
+    meet({ 'source' => 'sys-syn,tst-syn', 'check' => '-,MUL', 'mode' => 'all' }, [
+      wd('Kerlchen|IDF', 'kerlchen|s')
+    ], [
+      wd('Kerlchen|IDF', 'kerlchen|s', 'kerlchen|y', 'zwerg-nase|y')
+    ])
   end
 
 end
data/test/attendee/ts_text_reader.rb
@@ -5,8 +5,8 @@ require_relative '../test_helper'
 class TestAttendeeTextReader < AttendeeTestCase
 
   def test_lir_file
-    …
-      ai('LIR-FORMAT|'), ai('…
+    meet({ 'files' => 'test/lir.txt', 'records' => true }, nil, [
+      ai('LIR-FORMAT|'), ai("FILE|#{path = File.expand_path('test/lir.txt')}"),
       ai('RECORD|00237'),
       '020: GERHARD.',
       '025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen.',
@@ -17,14 +17,13 @@ class TestAttendeeTextReader < AttendeeTestCase
       ai('RECORD|00239'),
       '020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
       '056: "Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.',
-      ai(…
-    ]
-    meet({'files'=>'test/lir.txt', 'lir-record-pattern'=>'^\[(\d+)\.\]'})
+      ai("EOF|#{path}")
+    ])
   end
 
   def test_lir_file_another_pattern
-    …
-      ai('LIR-FORMAT|'), ai('…
+    meet({ 'files' => 'test/lir2.txt', 'records' => '^\021(\d+)\022' }, nil, [
+      ai('LIR-FORMAT|'), ai("FILE|#{path = File.expand_path('test/lir2.txt')}"),
       ai('RECORD|00237'),
       '020: GERHARD.',
       '025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen.',
@@ -35,18 +34,16 @@ class TestAttendeeTextReader < AttendeeTestCase
       ai('RECORD|00239'),
       '020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
       '056: "Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.',
-      ai(…
-    ]
-    meet({'files'=>'test/lir2.txt', 'lir-record-pattern'=>'^\021(\d+)\022'})
+      ai("EOF|#{path}")
+    ])
   end
 
   def test_normal_file
-    …
-      ai('…
+    meet({ 'files' => 'test/mul.txt' }, nil, [
+      ai("FILE|#{path = File.expand_path('test/mul.txt')}"),
       'Die abstrakte Kunst ist schön.',
-      ai(…
-    ]
-    meet({'files'=>'test/mul.txt'})
+      ai("EOF|#{path}")
+    ])
   end
 
 end
data/test/attendee/ts_text_writer.rb
@@ -5,7 +5,7 @@ require_relative '../test_helper'
 class TestAttendeeTextWriter < AttendeeTestCase
 
   def setup
-    @…
+    @input = [
       ai('FILE|test/test.txt'),
       wd('Dies|IDF'),
       wd('ist|IDF'),
@@ -25,34 +25,31 @@ class TestAttendeeTextWriter < AttendeeTestCase
   end
 
   def test_basic
-    …
-    @expect = [ "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n" ]
-    meet({'ext'=>'tst', 'sep'=>','}, false)
+    meet({ 'ext' => 'tst', 'sep' => ',' }, @input)
 
-    …
+    assert_equal([
+      "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
+    ], File.readlines('test/test.tst', encoding: Lingo::ENC))
   end
 
   def test_complex
-    …
-    @expect = [ "Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n" ]
-    meet({'ext'=>'yip', 'sep'=>'-'}, false)
+    meet({ 'ext' => 'yip', 'sep' => '-' }, @input)
 
-    …
+    assert_equal([
+      "Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n"
+    ], File.readlines('test/test.yip', encoding: Lingo::ENC))
   end
 
   def test_crlf
-    …
-    @expect = [ "Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n" ]
-    meet({'sep'=>"\n"}, false)
+    meet({ 'sep' => "\n" }, @input)
 
-    …
+    assert_equal([
+      "Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n"
+    ], File.readlines('test/test.txt2', encoding: Lingo::ENC))
  end
 
   def test_lir_file
-    …
+    meet({ 'ext' => 'vec', 'lir-format' => nil }, [
       ai('LIR-FORMAT|'), ai('FILE|test/lir.txt'),
       ai('RECORD|00237'),
       '020: GERHARD.',
@@ -65,28 +62,26 @@ class TestAttendeeTextWriter < AttendeeTestCase
       '020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
       "056: \"Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\r",
       ai('EOF|test/lir.txt')
-    ]
-    …
+    ])
+
+    assert_equal([
       "00237*020: GERHARD. 025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressour\
cen. 056: Die intellektuelle Erschließung des Internet befindet sich in einer Krise. GERHARD ist derzeit weltweit der einzige.\r\n",
       "00238*020: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen. 025: das D\
FG-Projekt GERHARD.\r\n",
       "00239*020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter. 056: \"Das Buch ist ein praxisbezogenes VADEMECUM\
 für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\r\n"
-    ]
-    meet({'ext'=>'csv', 'lir-format'=>nil}, false)
-
-    @output = File.readlines('test/lir.csv', encoding: Lingo::ENC)
-    assert_equal(@expect, @output)
+    ], File.readlines('test/lir.vec', encoding: Lingo::ENC))
   end
 
   def test_nonewords
-    …
+    meet({ 'ext' => 'non', 'sep' => "\n" }, [
+      ai('FILE|test/text.txt'), 'Nonwörter', 'Nonsense', ai('EOF|test/text.txt')
+    ])
 
-    …
+    assert_equal([
+      "Nonwörter\n", "Nonsense"
+    ], File.readlines('test/text.non', encoding: Lingo::ENC))
   end
 
 end
data/test/attendee/ts_tokenizer.rb
@@ -3,14 +3,17 @@
 class TestAttendeeTokenizer < AttendeeTestCase
 
   def test_basic
-    …
+    meet({}, [
+      "Dies ist ein Test."
+    ], [
+      tk('Dies|WORD'), tk('ist|WORD'), tk('ein|WORD'), tk('Test|WORD'), tk('.|PUNC')
+    ])
   end
 
   def test_complex
-    …
+    meet({}, [
+      "1964 www.vorhauer.de bzw. nasenbär, ()"
+    ], [
       tk('1964|NUMS'),
       tk('www.vorhauer.de|URLS'),
       tk('bzw|WORD'),
@@ -19,8 +22,7 @@ class TestAttendeeTokenizer < AttendeeTestCase
       tk(',|PUNC'),
       tk('(|OTHR'),
       tk(')|OTHR')
-    ]
-    meet({})
+    ])
   end
 
 end
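For orientation, the token classes asserted above (WORD, NUMS, URLS, PUNC, OTHR) can be reproduced with a handful of ordered regex rules. The sketch below is a deliberately naive stand-in and not the gem's tokenizer (that lives in lib/lingo/attendee/tokenizer.rb, +13 -27 in the list above); the rule set, and the tokens falling between 'bzw|WORD' and ',|PUNC' that this diff view omits, are assumptions:

# Toy tokenizer: the first matching rule wins; rules are checked in order.
RULES = [
  [:URLS, /\A(?:www\.)?[\w.-]+\.[a-z]{2,}/],
  [:NUMS, /\A\d+/],
  [:WORD, /\A[[:alpha:]]+/],
  [:PUNC, /\A[.,;:!?]/],
  [:OTHR, /\A\S/]
].freeze

def toy_tokenize(text)
  tokens, rest = [], text.lstrip
  until rest.empty?
    type, re = RULES.find { |_, r| rest.match?(r) }
    match = rest[re]
    tokens << "#{match}|#{type}"
    rest = rest[match.size..-1].lstrip
  end
  tokens
end

toy_tokenize('1964 www.vorhauer.de bzw. nasenbär, ()')
# => ["1964|NUMS", "www.vorhauer.de|URLS", "bzw|WORD", ".|PUNC",
#     "nasenbär|WORD", ",|PUNC", "(|OTHR", ")|OTHR"]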
data/test/attendee/ts_variator.rb
@@ -5,14 +5,14 @@ require_relative '../test_helper'
 class TestAttendeeVariator < AttendeeTestCase
 
   def test_basic
-    …
+    meet({ 'source' => 'sys-dic' }, [
+      wd('fchwarz|?'), wd('fchilling|?'), wd('iehwarzfchilling|?'), wd('fchiiiirg|?')
+    ], [
       wd('*schwarz|IDF', 'schwarz|s', 'schwarz|a'),
       wd('*schilling|IDF', 'schilling|s'),
       wd('*schwarzschilling|KOM', 'schwarzschilling|k', 'schwarz|a+', 'schilling|s+', 'schwarz|s+'),
       wd('fchiiiirg|?')
-    ]
-    meet({'source'=>'sys-dic'})
+    ])
   end
 
 end
data/test/attendee/ts_vector_filter.rb
@@ -14,43 +14,51 @@ class TestAttendeeVectorFilter < AttendeeTestCase
   end
 
   def test_basic
-    …
+    meet({}, @input, [
+      ai('FILE|test'), 'substantiv', ai('EOF|test')
+    ])
   end
 
   def test_lexicals
-    …
+    meet({ 'lexicals' => '[save]' }, @input, [
+      ai('FILE|test'), 'adjektiv', 'eigenname', 'substantiv', 'verb', ai('EOF|test')
+    ])
   end
 
   def test_sort_term_abs
-    …
+    meet({ 'lexicals' => '[save]', 'sort' => 'term_abs' }, @input, [
+      ai('FILE|test'), '1 adjektiv', '1 eigenname', '1 substantiv', '1 verb', ai('EOF|test')
+    ])
   end
 
   def test_sort_term_rel
-    …
+    meet({ 'lexicals' => '[save]', 'sort' => 'term_rel' }, @input, [
+      ai('FILE|test'), '0.50000 adjektiv', '0.50000 eigenname', '0.50000 substantiv', '0.50000 verb', ai('EOF|test')
+    ])
   end
 
   def test_sort_sto_abs
-    …
+    meet({ 'lexicals' => '[save]', 'sort' => 'sto_abs' }, @input, [
+      ai('FILE|test'), 'adjektiv {1}', 'eigenname {1}', 'substantiv {1}', 'verb {1}', ai('EOF|test')
+    ])
   end
 
   def test_sort_sto_rel
-    …
+    meet({ 'lexicals' => '[save]', 'sort' => 'sto_rel' }, @input, [
+      ai('FILE|test'), 'adjektiv {0.50000}', 'eigenname {0.50000}', 'substantiv {0.50000}', 'verb {0.50000}', ai('EOF|test')
+    ])
   end
 
   def test_nonword
-    …
+    meet({ 'lexicals' => '\?' }, @input, [
+      ai('FILE|test'), 'unknown', ai('EOF|test')
+    ])
   end
 
   def test_nonword_sort_term_abs
-    …
+    meet({ 'lexicals' => '\?', 'sort' => 'term_abs' }, @input, [
+      ai('FILE|test'), '1 unknown', ai('EOF|test')
+    ])
   end
 
 end