lingo 1.8.0 → 1.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
data/info/gpl-hdr.txt
CHANGED
@@ -1,27 +1,24 @@
|
|
1
1
|
#--
|
2
|
-
|
3
|
-
#
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# Copyright (C) 2007
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
|
23
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
24
|
-
#
|
25
|
-
# Lex Lingo rules from here on
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# Lingo -- A full-featured automatic indexing system #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
7
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
8
|
+
# #
|
9
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
10
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
11
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
12
|
+
# any later version. #
|
13
|
+
# #
|
14
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
15
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
16
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
17
|
+
# more details. #
|
18
|
+
# #
|
19
|
+
# You should have received a copy of the GNU Affero General Public License #
|
20
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
21
|
+
# #
|
22
|
+
###############################################################################
|
26
23
|
#++
|
27
24
|
|
data/lib/lingo.rb
CHANGED
@@ -1,55 +1,35 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
#--
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# Copyright (C) 2007
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
26
|
-
#
|
27
|
-
# Lex Lingo rules from here on
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
28
25
|
#++
|
29
26
|
|
30
27
|
require 'stringio'
|
31
28
|
require 'benchmark'
|
29
|
+
require 'nuggets/file/ext'
|
32
30
|
require 'nuggets/env/user_home'
|
33
31
|
require 'nuggets/numeric/duration'
|
34
|
-
|
35
|
-
require_relative 'lingo/config'
|
36
|
-
require_relative 'lingo/attendees'
|
37
|
-
require_relative 'lingo/attendee/abbreviator'
|
38
|
-
require_relative 'lingo/attendee/debugger'
|
39
|
-
require_relative 'lingo/attendee/decomposer'
|
40
|
-
require_relative 'lingo/attendee/dehyphenizer'
|
41
|
-
require_relative 'lingo/attendee/multiworder'
|
42
|
-
require_relative 'lingo/attendee/noneword_filter'
|
43
|
-
require_relative 'lingo/attendee/objectfilter'
|
44
|
-
require_relative 'lingo/attendee/variator'
|
45
|
-
require_relative 'lingo/attendee/sequencer'
|
46
|
-
require_relative 'lingo/attendee/synonymer'
|
47
|
-
require_relative 'lingo/attendee/textreader'
|
48
|
-
require_relative 'lingo/attendee/textwriter'
|
49
|
-
require_relative 'lingo/attendee/tokenizer'
|
50
|
-
require_relative 'lingo/attendee/vector_filter'
|
51
|
-
require_relative 'lingo/attendee/wordsearcher'
|
52
|
-
require_relative 'lingo/version'
|
32
|
+
require 'nuggets/string/camelscore'
|
53
33
|
|
54
34
|
class Lingo
|
55
35
|
|
@@ -65,13 +45,22 @@ class Lingo
|
|
65
45
|
# The search path for Lingo dictionary and configuration files.
|
66
46
|
PATH = ENV['LINGO_PATH'] || [CURR, HOME, BASE].join(File::PATH_SEPARATOR)
|
67
47
|
|
48
|
+
ENV['LINGO_PLUGIN_PATH'] ||= File.join(HOME, 'plugins')
|
49
|
+
|
50
|
+
# Map of file types to their standard location and file extension.
|
68
51
|
FIND_OPTIONS = {
|
69
52
|
config: { dir: 'config', ext: 'cfg' },
|
70
53
|
dict: { dir: 'dict', ext: 'txt' },
|
71
54
|
lang: { dir: 'lang', ext: 'lang' },
|
72
|
-
store: { dir: 'store', ext:
|
55
|
+
store: { dir: 'store', ext: nil },
|
56
|
+
sample: { dir: 'txt', ext: 'txt' }
|
73
57
|
}
|
74
58
|
|
59
|
+
# Default encoding
|
60
|
+
ENC = 'UTF-8'.freeze
|
61
|
+
|
62
|
+
STRING_SEPARATOR_RE = %r{[; ,|]}
|
63
|
+
|
75
64
|
class << self
|
76
65
|
|
77
66
|
def talk(*args)
|
@@ -82,10 +71,6 @@ class Lingo
|
|
82
71
|
Call.new(['-c', cfg, *args]).call(&block)
|
83
72
|
end
|
84
73
|
|
85
|
-
def error(msg)
|
86
|
-
abort(msg)
|
87
|
-
end
|
88
|
-
|
89
74
|
def list(type, options = {})
|
90
75
|
options = options_for(type, options)
|
91
76
|
path = path_for(options)
|
@@ -130,10 +115,12 @@ class Lingo
|
|
130
115
|
def find_file(file, path, options)
|
131
116
|
pn = Pathname.new(file_with_ext(file, options)).cleanpath
|
132
117
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
118
|
+
if pn.relative?
|
119
|
+
walk(path, options) { |dir|
|
120
|
+
pn2 = pn.expand_path(dir)
|
121
|
+
pn = pn2 and break if pn2.exist?
|
122
|
+
}
|
123
|
+
end
|
137
124
|
|
138
125
|
realpath_for(pn, path)
|
139
126
|
end
|
@@ -142,18 +129,14 @@ class Lingo
|
|
142
129
|
base = basename(:dict, find(:dict, file, path))
|
143
130
|
|
144
131
|
walk(path.reverse, options, false) { |dir|
|
145
|
-
Pathname.new(dir).ascend { |
|
146
|
-
break
|
147
|
-
|
148
|
-
|
149
|
-
s.chomp!(File.extname(s))
|
150
|
-
} if r.writable?
|
151
|
-
|
152
|
-
break true if r.exist?
|
132
|
+
Pathname.new(dir).ascend { |i|
|
133
|
+
break true if i.file?
|
134
|
+
return File.chomp_ext(File.join(dir, base)) if i.writable?
|
135
|
+
break true if i.exist?
|
153
136
|
}
|
154
137
|
}
|
155
138
|
|
156
|
-
raise
|
139
|
+
raise NoWritableStoreError.new(file, path)
|
157
140
|
end
|
158
141
|
|
159
142
|
def options_for(type, options = {})
|
@@ -190,6 +173,11 @@ class Lingo
|
|
190
173
|
pn.realpath(path.first).to_s
|
191
174
|
end
|
192
175
|
|
176
|
+
def require_optional(lib)
|
177
|
+
require lib unless ENV["LINGO_NO_#{lib.upcase}"]
|
178
|
+
rescue LoadError
|
179
|
+
end
|
180
|
+
|
193
181
|
end
|
194
182
|
|
195
183
|
attr_reader :dictionaries, :report_status, :report_time
|
@@ -205,10 +193,19 @@ class Lingo
|
|
205
193
|
|
206
194
|
def dictionary_config
|
207
195
|
@dictionary_config ||= config['language/dictionary']
|
196
|
+
rescue => err
|
197
|
+
raise ConfigLoadError.new(err)
|
208
198
|
end
|
209
199
|
|
210
200
|
def database_config(id)
|
211
|
-
dictionary_config['databases'][id]
|
201
|
+
dictionary_config['databases'][id].tap { |cfg|
|
202
|
+
raise NoDatabaseConfigError.new(id) unless cfg
|
203
|
+
raise InvalidDatabaseConfigError.new(id) unless cfg.has_key?('name')
|
204
|
+
}
|
205
|
+
end
|
206
|
+
|
207
|
+
def lexical_hash(src)
|
208
|
+
@lexical_hash[src]
|
212
209
|
end
|
213
210
|
|
214
211
|
def talk
|
@@ -226,7 +223,7 @@ class Lingo
|
|
226
223
|
|
227
224
|
list.each { |hash|
|
228
225
|
# {'attendee' => {'name'=>'Attendee', 'in'=>'nase', 'out'=>'ohr', 'param'=>'hase'}}
|
229
|
-
cfg = hash.values.first.merge('name' => hash.keys.first.
|
226
|
+
cfg = hash.values.first.merge('name' => hash.keys.first.camelcase)
|
230
227
|
|
231
228
|
%w[in out].each { |key| (cfg[key] ||= '').downcase! }
|
232
229
|
|
@@ -240,16 +237,16 @@ class Lingo
|
|
240
237
|
attendee = Attendee.const_get(cfg['name']).new(cfg, self)
|
241
238
|
@attendees << attendee
|
242
239
|
|
243
|
-
cfg['in'].split(
|
240
|
+
cfg['in'].split(STRING_SEPARATOR_RE).each { |interest|
|
244
241
|
subscriber[interest] << attendee
|
245
242
|
}
|
246
|
-
cfg['out'].split(
|
243
|
+
cfg['out'].split(STRING_SEPARATOR_RE).each { |theme|
|
247
244
|
supplier[theme] << attendee
|
248
245
|
}
|
249
246
|
}
|
250
247
|
|
251
|
-
supplier.each { |channel, attendees| attendees.each { |
|
252
|
-
|
248
|
+
supplier.each { |channel, attendees| attendees.each { |attendee|
|
249
|
+
attendee.add_subscriber(subscriber[channel])
|
253
250
|
} }
|
254
251
|
end
|
255
252
|
|
@@ -257,65 +254,39 @@ class Lingo
|
|
257
254
|
@report_status, @report_time = report_status, report_time
|
258
255
|
|
259
256
|
time = Benchmark.realtime {
|
260
|
-
@attendees.first.listen(AgendaItem.new(STR_CMD_TALK))
|
257
|
+
@attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_TALK))
|
261
258
|
}
|
262
259
|
|
263
260
|
if report_status || report_time
|
264
|
-
|
265
|
-
@attendees.first.listen(AgendaItem.new(STR_CMD_STATUS))
|
266
|
-
|
261
|
+
warn "Require protocol...\n#{separator = '-' * 61}"
|
262
|
+
@attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_STATUS))
|
263
|
+
warn "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
|
267
264
|
end
|
268
265
|
end
|
269
266
|
|
270
267
|
def reset(close = true)
|
271
268
|
dictionaries.each(&:close) if close
|
272
269
|
@dictionaries, @attendees = [], []
|
270
|
+
@lexical_hash = Hash.new { |h, k| h[k] = Language::LexicalHash.new(k, self) }
|
273
271
|
end
|
274
272
|
|
275
|
-
|
276
|
-
|
277
|
-
def initialize(args = [])
|
278
|
-
super(args, StringIO.new, StringIO.new, StringIO.new)
|
279
|
-
end
|
280
|
-
|
281
|
-
def call
|
282
|
-
invite
|
283
|
-
|
284
|
-
if block_given?
|
285
|
-
begin
|
286
|
-
yield self
|
287
|
-
ensure
|
288
|
-
reset
|
289
|
-
end
|
290
|
-
else
|
291
|
-
self
|
292
|
-
end
|
293
|
-
end
|
294
|
-
|
295
|
-
def talk(str)
|
296
|
-
config.stdin.reopen(str)
|
297
|
-
|
298
|
-
start
|
299
|
-
|
300
|
-
%w[stdout stderr].flat_map { |key|
|
301
|
-
io = config.send(key).tap(&:rewind)
|
302
|
-
io.readlines.each(&:chomp!).tap {
|
303
|
-
io.truncate(0)
|
304
|
-
io.rewind
|
305
|
-
}
|
306
|
-
}.tap { |res|
|
307
|
-
if block_given?
|
308
|
-
res.map!(&Proc.new)
|
309
|
-
else
|
310
|
-
res.sort!
|
311
|
-
res.uniq!
|
312
|
-
end
|
313
|
-
}
|
314
|
-
end
|
315
|
-
|
273
|
+
def warn(*msg)
|
274
|
+
config.stderr.puts(*msg)
|
316
275
|
end
|
317
276
|
|
318
277
|
end
|
319
278
|
|
279
|
+
require_relative 'lingo/call'
|
280
|
+
require_relative 'lingo/error'
|
281
|
+
require_relative 'lingo/config'
|
282
|
+
require_relative 'lingo/core_ext'
|
283
|
+
require_relative 'lingo/cachable'
|
284
|
+
require_relative 'lingo/reportable'
|
285
|
+
require_relative 'lingo/agenda_item'
|
286
|
+
require_relative 'lingo/database'
|
287
|
+
require_relative 'lingo/language'
|
288
|
+
require_relative 'lingo/attendee'
|
289
|
+
require_relative 'lingo/version'
|
290
|
+
|
320
291
|
require 'nuggets/util/pluggable'
|
321
292
|
Util::Pluggable.load_plugins_for(Lingo)
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class AgendaItem
|
30
|
+
|
31
|
+
include Comparable
|
32
|
+
|
33
|
+
attr_reader :cmd, :param
|
34
|
+
|
35
|
+
def initialize(cmd, param = nil)
|
36
|
+
@cmd, @param = cmd || '', param || ''
|
37
|
+
end
|
38
|
+
|
39
|
+
def <=>(other)
|
40
|
+
other.is_a?(self.class) ? to_a <=> other.to_a : 1
|
41
|
+
end
|
42
|
+
|
43
|
+
def to_a
|
44
|
+
[cmd, param]
|
45
|
+
end
|
46
|
+
|
47
|
+
def inspect
|
48
|
+
"*#{cmd.upcase}('#{param}')"
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
@@ -0,0 +1,261 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
# Lingo ist als universelles Indexierungssystem entworfen worden. Seine Stärke liegt in der einfachen Konfigurierbarkeit für
|
30
|
+
# spezifische Aufgaben und in der schnellen Entwicklung weiterer Funktionen durch systematische Kapselung der Komplexität auf
|
31
|
+
# kleine Verarbeitungseinheiten. Die kleinste Verarbeitungseinheit wird Attendee genannt. Um ein gewünschtes Verarbeitungsergebnis
|
32
|
+
# zu bekommen, werden die benötigten Attendees einfach in einer Reihe hintereinandergeschaltet. Ein einfaches Beispiel hierfür ist
|
33
|
+
# eine direkte Verbindung zwischen einem Textreader, einem Tokenizer und einem Textwriter. Alle drei Klassen sind von der Klasse
|
34
|
+
# Attendee abgeleitet.
|
35
|
+
#
|
36
|
+
# Der Textreader liest beispielsweise Zeilen aus einer Textdatei und leitet sie weiter an den Tokenizer. Der Tokenizer zerlegt eine
|
37
|
+
# Textzeile in einzelne Wörter und gibt diese weiter an den Textwriter, der diese in eine (andere) Datei schreibt. Über vielfältige
|
38
|
+
# Konfigurationsmöglichkeiten kann das Verhalten der Attendees an die eigenen Bedürfnisse angepasst werden.
|
39
|
+
#
|
40
|
+
# Die Verkettung einzelner Attendees findet über die Schnittstellen +listen+ und +talk+ statt. An +listen+ können beliebige Objekte
|
41
|
+
# zur Ver- und Bearbeitung übergeben werden. Nach der Verarbeitung werden sie mittels +talk+ an die verketteten Attendees weiter
|
42
|
+
# gegeben. Objekte der Klasse AgendaItem dienen dabei der Steuerung der Verarbeitung und sind nicht Bestandteil der normalen
|
43
|
+
# Verarbeitung. Beispiele für AgendaItems sind die Kommandos TALK (Aufforderung zum Start der Verarbeitung), WARN (zur Ausgabe von
|
44
|
+
# Warnungen eines Attendees) und EOL (End of Line, Ende einer Textzeile nach Zerlegung in einzelne Wörter). Eine vollständige
|
45
|
+
# Übersicht benutzter AgendaItems (oder auch Stream Commands) steht in lib/lingo/attendee.rb mit dem Präfix STR_CMD_.
|
46
|
+
#
|
47
|
+
# Um die Entwicklung von neuen Attendees zu beschleunigen, sind durch die Vererbung in der Regel nur
|
48
|
+
# drei abstrakte Methoden zu implementieren: +init+, +control+ und +process+. Die Methode +init+ wird bei der Instanziierung eines
|
49
|
+
# Objektes einmalig aufgerufen. Sie dient der Vorbereitung der Verarbeitung, z.B. durch das Öffnen und Bereitstellen von
|
50
|
+
# Wörterbüchern zur linguistischen Analyse. An die Methode +control+ werden alle eingehenden AgendaItems weitergeleitet. Dort erfolgt
|
51
|
+
# die Verarbeitungssteuerung, also z.B. bei STR_CMD_FILE das Öffnen einer Datei und bei STR_CMD_EOF respektive das Schließen. Die
|
52
|
+
# echte Verarbeitung von Daten findet daher durch die Methode +process+ statt.
|
53
|
+
#
|
54
|
+
# was macht attendee
|
55
|
+
# - verkettung der attendees anhand von konfigurationsinformationen
|
56
|
+
# - bereitstellung von globalen und spezifischen konfigurationsinformationen
|
57
|
+
# - behandlung von bestimmten übergreifenden Kommandos, z.B. STR_CMD_TALK, STR_CMD_STATUS
|
58
|
+
# - separierung und routing von kommando bzw. datenobjekten
|
59
|
+
#
|
60
|
+
# was macht die abgeleitete klasse
|
61
|
+
# - verarbeitet und/oder transformiert datenobjekte
|
62
|
+
# - wird gesteuert durch kommandos
|
63
|
+
# - schreibt verarbeitungsstatistiken
|
64
|
+
|
65
|
+
class Attendee
|
66
|
+
|
67
|
+
include Language
|
68
|
+
include Reportable
|
69
|
+
|
70
|
+
STR_CMD_TALK = 'TALK'
|
71
|
+
STR_CMD_STATUS = 'STATUS'
|
72
|
+
STR_CMD_LIR = 'LIR-FORMAT'
|
73
|
+
STR_CMD_FILE = 'FILE'
|
74
|
+
STR_CMD_EOL = 'EOL'
|
75
|
+
STR_CMD_RECORD = 'RECORD'
|
76
|
+
STR_CMD_EOF = 'EOF'
|
77
|
+
|
78
|
+
STA_NUM_COMMANDS = 'Received Commands'
|
79
|
+
STA_NUM_OBJECTS = 'Received Objects '
|
80
|
+
STA_TIM_COMMANDS = 'Time to control '
|
81
|
+
STA_TIM_OBJECTS = 'Time to process '
|
82
|
+
|
83
|
+
def initialize(config, lingo)
|
84
|
+
@lingo = lingo
|
85
|
+
|
86
|
+
init_reportable
|
87
|
+
|
88
|
+
# Make sure config exists
|
89
|
+
lingo.dictionary_config
|
90
|
+
|
91
|
+
@config, @subscriber = config, []
|
92
|
+
|
93
|
+
init if self.class.method_defined?(:init)
|
94
|
+
|
95
|
+
@can_control = self.class.method_defined?(:control)
|
96
|
+
@can_process = self.class.method_defined?(:process)
|
97
|
+
|
98
|
+
@skip_command, @timer = false, nil
|
99
|
+
end
|
100
|
+
|
101
|
+
def add_subscriber(subscriber)
|
102
|
+
@subscriber.concat(subscriber)
|
103
|
+
end
|
104
|
+
|
105
|
+
def listen(obj)
|
106
|
+
unless obj.is_a?(AgendaItem)
|
107
|
+
@can_process ? stat_timer(:objects) { process(obj) } : forward(obj)
|
108
|
+
else
|
109
|
+
args = obj.to_a
|
110
|
+
stat_timer(:commands) { control(*args) } if @can_control
|
111
|
+
|
112
|
+
case obj.cmd
|
113
|
+
when STR_CMD_TALK
|
114
|
+
nil
|
115
|
+
when STR_CMD_STATUS
|
116
|
+
report_time
|
117
|
+
report_status
|
118
|
+
|
119
|
+
forward(*args)
|
120
|
+
else
|
121
|
+
forward(*args) unless skip_command!
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def talk(obj)
|
127
|
+
charge_timer { @subscriber.each { |attendee| attendee.listen(obj) } }
|
128
|
+
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
def sta_for(key)
|
133
|
+
%w[NUM TIM].map { |i| self.class.const_get("STA_#{i}_#{key.upcase}") }
|
134
|
+
end
|
135
|
+
|
136
|
+
def stat_timer(key)
|
137
|
+
n, t = sta_for(key)
|
138
|
+
inc(n)
|
139
|
+
|
140
|
+
return yield unless @lingo.report_time
|
141
|
+
|
142
|
+
@timer = Time.new
|
143
|
+
res = yield
|
144
|
+
add(t, Time.new - @timer)
|
145
|
+
res
|
146
|
+
end
|
147
|
+
|
148
|
+
def charge_timer
|
149
|
+
return yield unless @lingo.report_time
|
150
|
+
|
151
|
+
res = nil
|
152
|
+
@timer += Benchmark.realtime { res = yield }
|
153
|
+
res
|
154
|
+
end
|
155
|
+
|
156
|
+
def report_time
|
157
|
+
return unless @lingo.report_time
|
158
|
+
|
159
|
+
msg = 'Perf: %-15s ' <<
|
160
|
+
'=> %7d commands in %s (%s/cmd)' <<
|
161
|
+
', %8d objects in %s (%s/obj)'
|
162
|
+
|
163
|
+
arg = [@config['name']]
|
164
|
+
|
165
|
+
%w[commands objects].each { |k|
|
166
|
+
n, t = sta_for(k).map(&method(:get))
|
167
|
+
arg << n
|
168
|
+
|
169
|
+
arg.concat([1, n].map { |m|
|
170
|
+
s = m.zero? ? 0.0 : t / m.to_f
|
171
|
+
|
172
|
+
'%9.3f %-2s' %
|
173
|
+
if s < 0.001
|
174
|
+
[s * 1000.0 ** 2, 'µs']
|
175
|
+
elsif s < 1.0
|
176
|
+
[s * 1000.0, 'ms']
|
177
|
+
elsif s < 60.0
|
178
|
+
[s, 's']
|
179
|
+
elsif s < 60.0 ** 2
|
180
|
+
[s / 60.0, 'm']
|
181
|
+
else
|
182
|
+
[s / 60.0 ** 2, 'h']
|
183
|
+
end
|
184
|
+
})
|
185
|
+
}
|
186
|
+
|
187
|
+
@lingo.warn msg % arg
|
188
|
+
end
|
189
|
+
|
190
|
+
def report_status
|
191
|
+
return unless @lingo.report_status
|
192
|
+
|
193
|
+
msg = "Attendee <%s> was connected from '%s' to '%s' reporting..."
|
194
|
+
|
195
|
+
@lingo.warn msg % @config.values_at(*%w[name in out]),
|
196
|
+
nil, report.sort.map { |k, v| " #{k} = #{v}" }, nil
|
197
|
+
end
|
198
|
+
|
199
|
+
def skip_command
|
200
|
+
@skip_command = true
|
201
|
+
end
|
202
|
+
|
203
|
+
def skip_command!
|
204
|
+
@skip_command.tap { @skip_command &&= false }
|
205
|
+
end
|
206
|
+
|
207
|
+
def forward(obj, param = nil)
|
208
|
+
talk(param ? AgendaItem.new(obj, param) : obj)
|
209
|
+
end
|
210
|
+
|
211
|
+
def has_key?(key)
|
212
|
+
@config && @config.has_key?(key)
|
213
|
+
end
|
214
|
+
|
215
|
+
def get_key(key, default = nodefault = Object.new)
|
216
|
+
raise MissingConfigError.new(key) if nodefault && !has_key?(key)
|
217
|
+
@config.fetch(key, default)
|
218
|
+
end
|
219
|
+
|
220
|
+
def get_array(key, default = nil)
|
221
|
+
get_key(key, default).split(STRING_SEPARATOR_RE)
|
222
|
+
end
|
223
|
+
|
224
|
+
def dictionary(src, mod)
|
225
|
+
Language::Dictionary.new({ 'source' => src, 'mode' => mod }, @lingo)
|
226
|
+
end
|
227
|
+
|
228
|
+
def grammar(src, mod)
|
229
|
+
Language::Grammar.new({ 'source' => src, 'mode' => mod }, @lingo)
|
230
|
+
end
|
231
|
+
|
232
|
+
def set_dic
|
233
|
+
@dic = dictionary(get_array('source'), get_key('mode', 'all'))
|
234
|
+
end
|
235
|
+
|
236
|
+
def set_gra
|
237
|
+
@gra = grammar(get_array('source'), get_key('mode', 'all'))
|
238
|
+
end
|
239
|
+
|
240
|
+
end
|
241
|
+
|
242
|
+
end
|
243
|
+
|
244
|
+
require_relative 'buffered_attendee'
|
245
|
+
|
246
|
+
require_relative 'attendee/abbreviator'
|
247
|
+
require_relative 'attendee/debugger'
|
248
|
+
require_relative 'attendee/decomposer'
|
249
|
+
require_relative 'attendee/dehyphenizer'
|
250
|
+
require_relative 'attendee/multi_worder'
|
251
|
+
require_relative 'attendee/noneword_filter'
|
252
|
+
require_relative 'attendee/object_filter'
|
253
|
+
require_relative 'attendee/variator'
|
254
|
+
require_relative 'attendee/sequencer'
|
255
|
+
require_relative 'attendee/synonymer'
|
256
|
+
require_relative 'attendee/text_reader'
|
257
|
+
require_relative 'attendee/text_writer'
|
258
|
+
require_relative 'attendee/formatter'
|
259
|
+
require_relative 'attendee/tokenizer'
|
260
|
+
require_relative 'attendee/vector_filter'
|
261
|
+
require_relative 'attendee/word_searcher'
|