lingo 1.8.1 → 1.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
@@ -0,0 +1,309 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative '../test_helper'
|
4
|
+
|
5
|
+
class TestAttendeeStemmer < AttendeeTestCase
|
6
|
+
|
7
|
+
def test_type
|
8
|
+
assert_raise(Lingo::NameNotFoundError) { meet({ 'type' => 'bla' }, []) }
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_basic
|
12
|
+
meet({}, [
|
13
|
+
wd('bla|IDF'),
|
14
|
+
wd('blub|?'),
|
15
|
+
wd('blubs|?'),
|
16
|
+
ai('EOF|')
|
17
|
+
], [
|
18
|
+
wd('bla|IDF'),
|
19
|
+
wd('blub|?'),
|
20
|
+
wd('blubs|?', 'blub|z'),
|
21
|
+
ai('EOF|')
|
22
|
+
])
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_wc
|
26
|
+
meet({ 'wordclass' => 'w' }, [
|
27
|
+
wd('bla|IDF'),
|
28
|
+
wd('blub|?'),
|
29
|
+
wd('blubs|?'),
|
30
|
+
ai('EOF|')
|
31
|
+
], [
|
32
|
+
wd('bla|IDF'),
|
33
|
+
wd('blub|?'),
|
34
|
+
wd('blubs|?', 'blub|w'),
|
35
|
+
ai('EOF|')
|
36
|
+
])
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_mode
|
40
|
+
meet({ 'mode' => '' }, [
|
41
|
+
wd('bla|IDF'),
|
42
|
+
wd('a|?'),
|
43
|
+
wd('yet|?'),
|
44
|
+
wd('blubs|?'),
|
45
|
+
ai('EOF|')
|
46
|
+
], [
|
47
|
+
wd('bla|IDF'),
|
48
|
+
wd('a|?'),
|
49
|
+
wd('yet|?'),
|
50
|
+
wd('blubs|?', 'blub|z'),
|
51
|
+
ai('EOF|')
|
52
|
+
])
|
53
|
+
|
54
|
+
meet({ 'mode' => 'all' }, [
|
55
|
+
wd('bla|IDF'),
|
56
|
+
wd('a|?'),
|
57
|
+
wd('yet|?'),
|
58
|
+
wd('blubs|?'),
|
59
|
+
ai('EOF|')
|
60
|
+
], [
|
61
|
+
wd('bla|IDF'),
|
62
|
+
wd('a|?', 'a|z'),
|
63
|
+
wd('yet|?', 'yet|z'),
|
64
|
+
wd('blubs|?', 'blub|z'),
|
65
|
+
ai('EOF|')
|
66
|
+
])
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_examples_100
|
70
|
+
meet({}, [
|
71
|
+
wd('S100|IDF'),
|
72
|
+
wd('caresses|?'),
|
73
|
+
wd('ponies|?'),
|
74
|
+
wd('ties|?'),
|
75
|
+
wd('caress|?'),
|
76
|
+
wd('cats|?'),
|
77
|
+
ai('EOF|')
|
78
|
+
], [
|
79
|
+
wd('S100|IDF'),
|
80
|
+
wd('caresses|?', 'caress|z'),
|
81
|
+
wd('ponies|?', 'poni|z'),
|
82
|
+
wd('ties|?', 'ti|z'), # snowball: tie
|
83
|
+
wd('caress|?', 'caress|z'),
|
84
|
+
wd('cats|?', 'cat|z'),
|
85
|
+
ai('EOF|')
|
86
|
+
])
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_examples_110
|
90
|
+
meet({ 'mode' => 'all' }, [
|
91
|
+
wd('S110|IDF'),
|
92
|
+
wd('agreed|?'),
|
93
|
+
wd('feed|?'),
|
94
|
+
wd('plastered|?'),
|
95
|
+
wd('bled|?'),
|
96
|
+
wd('motoring|?'),
|
97
|
+
wd('sing|?'),
|
98
|
+
ai('EOF|')
|
99
|
+
], [
|
100
|
+
wd('S110|IDF'),
|
101
|
+
wd('agreed|?', 'agre|z'),
|
102
|
+
wd('feed|?', 'fe|z'), # snowball: feed
|
103
|
+
wd('plastered|?', 'plaster|z'),
|
104
|
+
wd('bled|?', 'bled|z'),
|
105
|
+
wd('motoring|?', 'motor|z'),
|
106
|
+
wd('sing|?', 'sing|z'),
|
107
|
+
ai('EOF|')
|
108
|
+
])
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_examples_111
|
112
|
+
meet({}, [
|
113
|
+
wd('S111|IDF'),
|
114
|
+
wd('conflated|?'),
|
115
|
+
wd('troubled|?'),
|
116
|
+
wd('sized|?'),
|
117
|
+
wd('hopping|?'),
|
118
|
+
wd('tanned|?'),
|
119
|
+
wd('falling|?'),
|
120
|
+
wd('hissing|?'),
|
121
|
+
wd('fizzed|?'),
|
122
|
+
wd('failing|?'),
|
123
|
+
wd('filing|?'),
|
124
|
+
ai('EOF|')
|
125
|
+
], [
|
126
|
+
wd('S111|IDF'),
|
127
|
+
wd('conflated|?', 'conflat|z'),
|
128
|
+
wd('troubled|?', 'troubl|z'),
|
129
|
+
wd('sized|?', 'size|z'),
|
130
|
+
wd('hopping|?', 'hop|z'),
|
131
|
+
wd('tanned|?', 'tan|z'),
|
132
|
+
wd('falling|?', 'fall|z'),
|
133
|
+
wd('hissing|?', 'hiss|z'),
|
134
|
+
wd('fizzed|?', 'fizz|z'),
|
135
|
+
wd('failing|?', 'fail|z'),
|
136
|
+
wd('filing|?', 'file|z'),
|
137
|
+
ai('EOF|')
|
138
|
+
])
|
139
|
+
end
|
140
|
+
|
141
|
+
def test_examples_120
|
142
|
+
meet({ 'mode' => 'all' }, [
|
143
|
+
wd('S120|IDF'),
|
144
|
+
wd('happy|?'),
|
145
|
+
wd('sky|?'),
|
146
|
+
ai('EOF|')
|
147
|
+
], [
|
148
|
+
wd('S120|IDF'),
|
149
|
+
wd('happy|?', 'happi|z'),
|
150
|
+
wd('sky|?', 'sky|z'),
|
151
|
+
ai('EOF|')
|
152
|
+
])
|
153
|
+
end
|
154
|
+
|
155
|
+
def test_examples_200
|
156
|
+
meet({}, [
|
157
|
+
wd('S200|IDF'),
|
158
|
+
wd('relational|?'),
|
159
|
+
wd('conditional|?'),
|
160
|
+
wd('rational|?'),
|
161
|
+
wd('valency|?'),
|
162
|
+
wd('hesitancy|?'),
|
163
|
+
wd('digitizer|?'),
|
164
|
+
wd('conformably|?'),
|
165
|
+
wd('radically|?'),
|
166
|
+
wd('differently|?'),
|
167
|
+
wd('vilely|?'),
|
168
|
+
wd('analogously|?'),
|
169
|
+
wd('vietnamization|?'),
|
170
|
+
wd('predication|?'),
|
171
|
+
wd('operator|?'),
|
172
|
+
wd('feudalism|?'),
|
173
|
+
wd('decisiveness|?'),
|
174
|
+
wd('hopefulness|?'),
|
175
|
+
wd('callousness|?'),
|
176
|
+
wd('formality|?'),
|
177
|
+
wd('sensitivity|?'),
|
178
|
+
wd('sensibility|?'),
|
179
|
+
ai('EOF|')
|
180
|
+
], [
|
181
|
+
wd('S200|IDF'),
|
182
|
+
wd('relational|?', 'relat|z'),
|
183
|
+
wd('conditional|?', 'condit|z'),
|
184
|
+
wd('rational|?', 'ration|z'),
|
185
|
+
wd('valency|?', 'valenc|z'),
|
186
|
+
wd('hesitancy|?', 'hesit|z'),
|
187
|
+
wd('digitizer|?', 'digit|z'),
|
188
|
+
wd('conformably|?', 'conform|z'),
|
189
|
+
wd('radically|?', 'radic|z'),
|
190
|
+
wd('differently|?', 'differ|z'),
|
191
|
+
wd('vilely|?', 'vile|z'),
|
192
|
+
wd('analogously|?', 'analog|z'),
|
193
|
+
wd('vietnamization|?', 'vietnam|z'),
|
194
|
+
wd('predication|?', 'predic|z'),
|
195
|
+
wd('operator|?', 'oper|z'),
|
196
|
+
wd('feudalism|?', 'feudal|z'),
|
197
|
+
wd('decisiveness|?', 'decis|z'),
|
198
|
+
wd('hopefulness|?', 'hope|z'),
|
199
|
+
wd('callousness|?', 'callous|z'),
|
200
|
+
wd('formality|?', 'formal|z'),
|
201
|
+
wd('sensitivity|?', 'sensit|z'),
|
202
|
+
wd('sensibility|?', 'sensibl|z'),
|
203
|
+
ai('EOF|')
|
204
|
+
])
|
205
|
+
end
|
206
|
+
|
207
|
+
def test_examples_300
|
208
|
+
meet({}, [
|
209
|
+
wd('S300|IDF'),
|
210
|
+
wd('triplicate|?'),
|
211
|
+
wd('formative|?'),
|
212
|
+
wd('formalize|?'),
|
213
|
+
wd('electricity|?'),
|
214
|
+
wd('electrical|?'),
|
215
|
+
wd('hopeful|?'),
|
216
|
+
wd('goodness|?'),
|
217
|
+
ai('EOF|')
|
218
|
+
], [
|
219
|
+
wd('S300|IDF'),
|
220
|
+
wd('triplicate|?', 'triplic|z'),
|
221
|
+
wd('formative|?', 'form|z'), # snowball: format
|
222
|
+
wd('formalize|?', 'formal|z'),
|
223
|
+
wd('electricity|?', 'electr|z'),
|
224
|
+
wd('electrical|?', 'electr|z'),
|
225
|
+
wd('hopeful|?', 'hope|z'),
|
226
|
+
wd('goodness|?', 'good|z'),
|
227
|
+
ai('EOF|')
|
228
|
+
])
|
229
|
+
end
|
230
|
+
|
231
|
+
def test_examples_400
|
232
|
+
meet({}, [
|
233
|
+
wd('S400|IDF'),
|
234
|
+
wd('revival|?'),
|
235
|
+
wd('allowance|?'),
|
236
|
+
wd('inference|?'),
|
237
|
+
wd('airliner|?'),
|
238
|
+
wd('gyroscopic|?'),
|
239
|
+
wd('adjustable|?'),
|
240
|
+
wd('defensible|?'),
|
241
|
+
wd('irritant|?'),
|
242
|
+
wd('replacement|?'),
|
243
|
+
wd('adjustment|?'),
|
244
|
+
wd('dependent|?'),
|
245
|
+
wd('adoption|?'),
|
246
|
+
wd('homologou|?'),
|
247
|
+
wd('communism|?'),
|
248
|
+
wd('activate|?'),
|
249
|
+
wd('angularity|?'),
|
250
|
+
wd('homologous|?'),
|
251
|
+
wd('effective|?'),
|
252
|
+
wd('bowdlerize|?'),
|
253
|
+
ai('EOF|')
|
254
|
+
], [
|
255
|
+
wd('S400|IDF'),
|
256
|
+
wd('revival|?', 'reviv|z'),
|
257
|
+
wd('allowance|?', 'allow|z'),
|
258
|
+
wd('inference|?', 'infer|z'),
|
259
|
+
wd('airliner|?', 'airlin|z'),
|
260
|
+
wd('gyroscopic|?', 'gyroscop|z'),
|
261
|
+
wd('adjustable|?', 'adjust|z'),
|
262
|
+
wd('defensible|?', 'defens|z'),
|
263
|
+
wd('irritant|?', 'irrit|z'),
|
264
|
+
wd('replacement|?', 'replac|z'),
|
265
|
+
wd('adjustment|?', 'adjust|z'),
|
266
|
+
wd('dependent|?', 'depend|z'),
|
267
|
+
wd('adoption|?', 'adopt|z'),
|
268
|
+
wd('homologou|?', 'homolog|z'), # snowball: homologou
|
269
|
+
wd('communism|?', 'commun|z'), # snowball: communism
|
270
|
+
wd('activate|?', 'activ|z'),
|
271
|
+
wd('angularity|?', 'angular|z'),
|
272
|
+
wd('homologous|?', 'homolog|z'),
|
273
|
+
wd('effective|?', 'effect|z'),
|
274
|
+
wd('bowdlerize|?', 'bowdler|z'),
|
275
|
+
ai('EOF|')
|
276
|
+
])
|
277
|
+
end
|
278
|
+
|
279
|
+
def test_examples_500
|
280
|
+
meet({ 'mode' => 'all' }, [
|
281
|
+
wd('S500|IDF'),
|
282
|
+
wd('probate|?'),
|
283
|
+
wd('rate|?'),
|
284
|
+
wd('cease|?'),
|
285
|
+
ai('EOF|')
|
286
|
+
], [
|
287
|
+
wd('S500|IDF'),
|
288
|
+
wd('probate|?', 'probat|z'),
|
289
|
+
wd('rate|?', 'rate|z'),
|
290
|
+
wd('cease|?', 'ceas|z'),
|
291
|
+
ai('EOF|')
|
292
|
+
])
|
293
|
+
end
|
294
|
+
|
295
|
+
def test_examples_510
|
296
|
+
meet({ 'mode' => 'all' }, [
|
297
|
+
wd('S510|IDF'),
|
298
|
+
wd('controll|?'),
|
299
|
+
wd('roll|?'),
|
300
|
+
ai('EOF|')
|
301
|
+
], [
|
302
|
+
wd('S510|IDF'),
|
303
|
+
wd('controll|?', 'control|z'),
|
304
|
+
wd('roll|?', 'roll|z'),
|
305
|
+
ai('EOF|')
|
306
|
+
])
|
307
|
+
end
|
308
|
+
|
309
|
+
end
|
@@ -5,23 +5,27 @@ require_relative '../test_helper'
|
|
5
5
|
class TestAttendeeSynonymer < AttendeeTestCase
|
6
6
|
|
7
7
|
def test_basic
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
meet({ 'source' => 'sys-syn', 'check' => '-,MUL' }, [
|
9
|
+
wd('abtastzeiten|IDF', 'abtastzeit|s')
|
10
|
+
], [
|
11
|
+
wd('abtastzeiten|IDF', 'abtastzeit|s', 'abtastfrequenz|y', 'abtastperiode|y')
|
12
|
+
])
|
12
13
|
end
|
13
14
|
|
14
15
|
def test_first
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
meet({ 'source' => 'sys-syn,tst-syn', 'check' => '-,MUL', 'mode' => 'first' }, [
|
17
|
+
wd('Aktienanleihe|IDF', 'aktienanleihe|s')
|
18
|
+
], [
|
19
|
+
wd('Aktienanleihe|IDF', 'aktienanleihe|s', 'aktien-anleihe|y', 'reverse convertible bond|y', 'reverse convertibles|y')
|
20
|
+
])
|
19
21
|
end
|
20
22
|
|
21
23
|
def test_all
|
22
|
-
|
23
|
-
|
24
|
-
|
24
|
+
meet({ 'source' => 'sys-syn,tst-syn', 'check' => '-,MUL', 'mode' => 'all' }, [
|
25
|
+
wd('Kerlchen|IDF', 'kerlchen|s')
|
26
|
+
], [
|
27
|
+
wd('Kerlchen|IDF', 'kerlchen|s', 'kerlchen|y', 'zwerg-nase|y')
|
28
|
+
])
|
25
29
|
end
|
26
30
|
|
27
31
|
end
|
@@ -5,8 +5,8 @@ require_relative '../test_helper'
|
|
5
5
|
class TestAttendeeTextReader < AttendeeTestCase
|
6
6
|
|
7
7
|
def test_lir_file
|
8
|
-
|
9
|
-
ai('LIR-FORMAT|'), ai('
|
8
|
+
meet({ 'files' => 'test/lir.txt', 'records' => true }, nil, [
|
9
|
+
ai('LIR-FORMAT|'), ai("FILE|#{path = File.expand_path('test/lir.txt')}"),
|
10
10
|
ai('RECORD|00237'),
|
11
11
|
'020: GERHARD.',
|
12
12
|
'025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen.',
|
@@ -17,14 +17,13 @@ class TestAttendeeTextReader < AttendeeTestCase
|
|
17
17
|
ai('RECORD|00239'),
|
18
18
|
'020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
|
19
19
|
'056: "Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.',
|
20
|
-
ai(
|
21
|
-
]
|
22
|
-
meet({'files'=>'test/lir.txt', 'lir-record-pattern'=>'^\[(\d+)\.\]'})
|
20
|
+
ai("EOF|#{path}")
|
21
|
+
])
|
23
22
|
end
|
24
23
|
|
25
24
|
def test_lir_file_another_pattern
|
26
|
-
|
27
|
-
ai('LIR-FORMAT|'), ai('
|
25
|
+
meet({ 'files' => 'test/lir2.txt', 'records' => '^\021(\d+)\022' }, nil, [
|
26
|
+
ai('LIR-FORMAT|'), ai("FILE|#{path = File.expand_path('test/lir2.txt')}"),
|
28
27
|
ai('RECORD|00237'),
|
29
28
|
'020: GERHARD.',
|
30
29
|
'025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen.',
|
@@ -35,18 +34,16 @@ class TestAttendeeTextReader < AttendeeTestCase
|
|
35
34
|
ai('RECORD|00239'),
|
36
35
|
'020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
|
37
36
|
'056: "Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.',
|
38
|
-
ai(
|
39
|
-
]
|
40
|
-
meet({'files'=>'test/lir2.txt', 'lir-record-pattern'=>'^\021(\d+)\022'})
|
37
|
+
ai("EOF|#{path}")
|
38
|
+
])
|
41
39
|
end
|
42
40
|
|
43
41
|
def test_normal_file
|
44
|
-
|
45
|
-
ai('
|
42
|
+
meet({ 'files' => 'test/mul.txt' }, nil, [
|
43
|
+
ai("FILE|#{path = File.expand_path('test/mul.txt')}"),
|
46
44
|
'Die abstrakte Kunst ist schön.',
|
47
|
-
ai(
|
48
|
-
]
|
49
|
-
meet({'files'=>'test/mul.txt'})
|
45
|
+
ai("EOF|#{path}")
|
46
|
+
])
|
50
47
|
end
|
51
48
|
|
52
49
|
end
|
@@ -5,7 +5,7 @@ require_relative '../test_helper'
|
|
5
5
|
class TestAttendeeTextWriter < AttendeeTestCase
|
6
6
|
|
7
7
|
def setup
|
8
|
-
@
|
8
|
+
@input = [
|
9
9
|
ai('FILE|test/test.txt'),
|
10
10
|
wd('Dies|IDF'),
|
11
11
|
wd('ist|IDF'),
|
@@ -25,34 +25,31 @@ class TestAttendeeTextWriter < AttendeeTestCase
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_basic
|
28
|
-
|
29
|
-
@expect = [ "Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n" ]
|
30
|
-
meet({'ext'=>'tst', 'sep'=>','}, false)
|
28
|
+
meet({ 'ext' => 'tst', 'sep' => ',' }, @input)
|
31
29
|
|
32
|
-
|
33
|
-
|
30
|
+
assert_equal([
|
31
|
+
"Dies,ist,eine,Zeile,.\n", "Dies,ist,eine,zweite,Zeile,.\n"
|
32
|
+
], File.readlines('test/test.tst', encoding: Lingo::ENC))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_complex
|
37
|
-
|
38
|
-
@expect = [ "Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n" ]
|
39
|
-
meet({'ext'=>'yip', 'sep'=>'-'}, false)
|
36
|
+
meet({ 'ext' => 'yip', 'sep' => '-' }, @input)
|
40
37
|
|
41
|
-
|
42
|
-
|
38
|
+
assert_equal([
|
39
|
+
"Dies-ist-eine-Zeile-.\n", "Dies-ist-eine-zweite-Zeile-.\n"
|
40
|
+
], File.readlines('test/test.yip', encoding: Lingo::ENC))
|
43
41
|
end
|
44
42
|
|
45
43
|
def test_crlf
|
46
|
-
|
47
|
-
@expect = [ "Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n" ]
|
48
|
-
meet({'sep'=>"\n"}, false)
|
44
|
+
meet({ 'sep' => "\n" }, @input)
|
49
45
|
|
50
|
-
|
51
|
-
|
46
|
+
assert_equal([
|
47
|
+
"Dies\n", "ist\n", "eine\n", "Zeile\n", ".\n", "Dies\n", "ist\n", "eine\n", "zweite\n", "Zeile\n", ".\n"
|
48
|
+
], File.readlines('test/test.txt2', encoding: Lingo::ENC))
|
52
49
|
end
|
53
50
|
|
54
51
|
def test_lir_file
|
55
|
-
|
52
|
+
meet({ 'ext' => 'vec', 'lir-format' => nil }, [
|
56
53
|
ai('LIR-FORMAT|'), ai('FILE|test/lir.txt'),
|
57
54
|
ai('RECORD|00237'),
|
58
55
|
'020: GERHARD.',
|
@@ -65,28 +62,26 @@ class TestAttendeeTextWriter < AttendeeTestCase
|
|
65
62
|
'020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter.',
|
66
63
|
"056: \"Das Buch ist ein praxisbezogenes VADEMECUM für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\r",
|
67
64
|
ai('EOF|test/lir.txt')
|
68
|
-
]
|
69
|
-
|
65
|
+
])
|
66
|
+
|
67
|
+
assert_equal([
|
70
68
|
"00237*020: GERHARD. 025: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressour\
|
71
69
|
cen. 056: Die intellektuelle Erschließung des Internet befindet sich in einer Krise. GERHARD ist derzeit weltweit der einzige.\r\n",
|
72
70
|
"00238*020: Automatisches Sammeln, Klassifizieren und Indexieren von wissenschaftlich relevanten Informationsressourcen. 025: das D\
|
73
71
|
FG-Projekt GERHARD.\r\n",
|
74
72
|
"00239*020: Information Retrieval und Dokumentmanagement im Multimedia-Zeitalter. 056: \"Das Buch ist ein praxisbezogenes VADEMECUM\
|
75
73
|
für alle, die in einer Welt der Datennetze Wissen/Informationen sammeln.\r\n"
|
76
|
-
]
|
77
|
-
meet({'ext'=>'csv', 'lir-format'=>nil}, false)
|
78
|
-
|
79
|
-
@output = File.readlines('test/lir.csv', encoding: Lingo::ENC)
|
80
|
-
assert_equal(@expect, @output)
|
74
|
+
], File.readlines('test/lir.vec', encoding: Lingo::ENC))
|
81
75
|
end
|
82
76
|
|
83
77
|
def test_nonewords
|
84
|
-
|
85
|
-
|
86
|
-
|
78
|
+
meet({ 'ext' => 'non', 'sep' => "\n" }, [
|
79
|
+
ai('FILE|test/text.txt'), 'Nonwörter', 'Nonsense', ai('EOF|test/text.txt')
|
80
|
+
])
|
87
81
|
|
88
|
-
|
89
|
-
|
82
|
+
assert_equal([
|
83
|
+
"Nonwörter\n", "Nonsense"
|
84
|
+
], File.readlines('test/text.non', encoding: Lingo::ENC))
|
90
85
|
end
|
91
86
|
|
92
87
|
end
|
@@ -3,14 +3,17 @@
|
|
3
3
|
class TestAttendeeTokenizer < AttendeeTestCase
|
4
4
|
|
5
5
|
def test_basic
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
meet({}, [
|
7
|
+
"Dies ist ein Test."
|
8
|
+
], [
|
9
|
+
tk('Dies|WORD'), tk('ist|WORD'), tk('ein|WORD'), tk('Test|WORD'), tk('.|PUNC')
|
10
|
+
])
|
9
11
|
end
|
10
12
|
|
11
13
|
def test_complex
|
12
|
-
|
13
|
-
|
14
|
+
meet({}, [
|
15
|
+
"1964 www.vorhauer.de bzw. nasenbär, ()"
|
16
|
+
], [
|
14
17
|
tk('1964|NUMS'),
|
15
18
|
tk('www.vorhauer.de|URLS'),
|
16
19
|
tk('bzw|WORD'),
|
@@ -19,8 +22,7 @@ class TestAttendeeTokenizer < AttendeeTestCase
|
|
19
22
|
tk(',|PUNC'),
|
20
23
|
tk('(|OTHR'),
|
21
24
|
tk(')|OTHR')
|
22
|
-
]
|
23
|
-
meet({})
|
25
|
+
])
|
24
26
|
end
|
25
27
|
|
26
28
|
end
|
@@ -5,14 +5,14 @@ require_relative '../test_helper'
|
|
5
5
|
class TestAttendeeVariator < AttendeeTestCase
|
6
6
|
|
7
7
|
def test_basic
|
8
|
-
|
9
|
-
|
8
|
+
meet({ 'source' => 'sys-dic' }, [
|
9
|
+
wd('fchwarz|?'), wd('fchilling|?'), wd('iehwarzfchilling|?'), wd('fchiiiirg|?')
|
10
|
+
], [
|
10
11
|
wd('*schwarz|IDF', 'schwarz|s', 'schwarz|a'),
|
11
12
|
wd('*schilling|IDF', 'schilling|s'),
|
12
13
|
wd('*schwarzschilling|KOM', 'schwarzschilling|k', 'schwarz|a+', 'schilling|s+', 'schwarz|s+'),
|
13
14
|
wd('fchiiiirg|?')
|
14
|
-
]
|
15
|
-
meet({'source'=>'sys-dic'})
|
15
|
+
])
|
16
16
|
end
|
17
17
|
|
18
18
|
end
|
@@ -14,43 +14,51 @@ class TestAttendeeVectorFilter < AttendeeTestCase
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_basic
|
17
|
-
|
18
|
-
|
17
|
+
meet({}, @input, [
|
18
|
+
ai('FILE|test'), 'substantiv', ai('EOF|test')
|
19
|
+
])
|
19
20
|
end
|
20
21
|
|
21
22
|
def test_lexicals
|
22
|
-
|
23
|
-
|
23
|
+
meet({ 'lexicals' => '[save]' }, @input, [
|
24
|
+
ai('FILE|test'), 'adjektiv', 'eigenname', 'substantiv', 'verb', ai('EOF|test')
|
25
|
+
])
|
24
26
|
end
|
25
27
|
|
26
28
|
def test_sort_term_abs
|
27
|
-
|
28
|
-
|
29
|
+
meet({ 'lexicals' => '[save]', 'sort' => 'term_abs' }, @input, [
|
30
|
+
ai('FILE|test'), '1 adjektiv', '1 eigenname', '1 substantiv', '1 verb', ai('EOF|test')
|
31
|
+
])
|
29
32
|
end
|
30
33
|
|
31
34
|
def test_sort_term_rel
|
32
|
-
|
33
|
-
|
35
|
+
meet({ 'lexicals' => '[save]', 'sort' => 'term_rel' }, @input, [
|
36
|
+
ai('FILE|test'), '0.50000 adjektiv', '0.50000 eigenname', '0.50000 substantiv', '0.50000 verb', ai('EOF|test')
|
37
|
+
])
|
34
38
|
end
|
35
39
|
|
36
40
|
def test_sort_sto_abs
|
37
|
-
|
38
|
-
|
41
|
+
meet({ 'lexicals' => '[save]', 'sort' => 'sto_abs' }, @input, [
|
42
|
+
ai('FILE|test'), 'adjektiv {1}', 'eigenname {1}', 'substantiv {1}', 'verb {1}', ai('EOF|test')
|
43
|
+
])
|
39
44
|
end
|
40
45
|
|
41
46
|
def test_sort_sto_rel
|
42
|
-
|
43
|
-
|
47
|
+
meet({ 'lexicals' => '[save]', 'sort' => 'sto_rel' }, @input, [
|
48
|
+
ai('FILE|test'), 'adjektiv {0.50000}', 'eigenname {0.50000}', 'substantiv {0.50000}', 'verb {0.50000}', ai('EOF|test')
|
49
|
+
])
|
44
50
|
end
|
45
51
|
|
46
52
|
def test_nonword
|
47
|
-
|
48
|
-
|
53
|
+
meet({ 'lexicals' => '\?' }, @input, [
|
54
|
+
ai('FILE|test'), 'unknown', ai('EOF|test')
|
55
|
+
])
|
49
56
|
end
|
50
57
|
|
51
58
|
def test_nonword_sort_term_abs
|
52
|
-
|
53
|
-
|
59
|
+
meet({ 'lexicals' => '\?', 'sort' => 'term_abs' }, @input, [
|
60
|
+
ai('FILE|test'), '1 unknown', ai('EOF|test')
|
61
|
+
])
|
54
62
|
end
|
55
63
|
|
56
64
|
end
|