lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
data/info/gpl-hdr.txt
CHANGED
|
@@ -1,27 +1,24 @@
|
|
|
1
1
|
#--
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# Copyright (C) 2007
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
|
|
23
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
24
|
-
#
|
|
25
|
-
# Lex Lingo rules from here on
|
|
2
|
+
###############################################################################
|
|
3
|
+
# #
|
|
4
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
5
|
+
# #
|
|
6
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
7
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
8
|
+
# #
|
|
9
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
10
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
11
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
12
|
+
# any later version. #
|
|
13
|
+
# #
|
|
14
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
15
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
16
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
17
|
+
# more details. #
|
|
18
|
+
# #
|
|
19
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
20
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
21
|
+
# #
|
|
22
|
+
###############################################################################
|
|
26
23
|
#++
|
|
27
24
|
|
data/lib/lingo.rb
CHANGED
|
@@ -1,55 +1,35 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
#--
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# Copyright (C) 2007
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
|
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
26
|
-
#
|
|
27
|
-
# Lex Lingo rules from here on
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
28
25
|
#++
|
|
29
26
|
|
|
30
27
|
require 'stringio'
|
|
31
28
|
require 'benchmark'
|
|
29
|
+
require 'nuggets/file/ext'
|
|
32
30
|
require 'nuggets/env/user_home'
|
|
33
31
|
require 'nuggets/numeric/duration'
|
|
34
|
-
|
|
35
|
-
require_relative 'lingo/config'
|
|
36
|
-
require_relative 'lingo/attendees'
|
|
37
|
-
require_relative 'lingo/attendee/abbreviator'
|
|
38
|
-
require_relative 'lingo/attendee/debugger'
|
|
39
|
-
require_relative 'lingo/attendee/decomposer'
|
|
40
|
-
require_relative 'lingo/attendee/dehyphenizer'
|
|
41
|
-
require_relative 'lingo/attendee/multiworder'
|
|
42
|
-
require_relative 'lingo/attendee/noneword_filter'
|
|
43
|
-
require_relative 'lingo/attendee/objectfilter'
|
|
44
|
-
require_relative 'lingo/attendee/variator'
|
|
45
|
-
require_relative 'lingo/attendee/sequencer'
|
|
46
|
-
require_relative 'lingo/attendee/synonymer'
|
|
47
|
-
require_relative 'lingo/attendee/textreader'
|
|
48
|
-
require_relative 'lingo/attendee/textwriter'
|
|
49
|
-
require_relative 'lingo/attendee/tokenizer'
|
|
50
|
-
require_relative 'lingo/attendee/vector_filter'
|
|
51
|
-
require_relative 'lingo/attendee/wordsearcher'
|
|
52
|
-
require_relative 'lingo/version'
|
|
32
|
+
require 'nuggets/string/camelscore'
|
|
53
33
|
|
|
54
34
|
class Lingo
|
|
55
35
|
|
|
@@ -65,13 +45,22 @@ class Lingo
|
|
|
65
45
|
# The search path for Lingo dictionary and configuration files.
|
|
66
46
|
PATH = ENV['LINGO_PATH'] || [CURR, HOME, BASE].join(File::PATH_SEPARATOR)
|
|
67
47
|
|
|
48
|
+
ENV['LINGO_PLUGIN_PATH'] ||= File.join(HOME, 'plugins')
|
|
49
|
+
|
|
50
|
+
# Map of file types to their standard location and file extension.
|
|
68
51
|
FIND_OPTIONS = {
|
|
69
52
|
config: { dir: 'config', ext: 'cfg' },
|
|
70
53
|
dict: { dir: 'dict', ext: 'txt' },
|
|
71
54
|
lang: { dir: 'lang', ext: 'lang' },
|
|
72
|
-
store: { dir: 'store', ext:
|
|
55
|
+
store: { dir: 'store', ext: nil },
|
|
56
|
+
sample: { dir: 'txt', ext: 'txt' }
|
|
73
57
|
}
|
|
74
58
|
|
|
59
|
+
# Default encoding
|
|
60
|
+
ENC = 'UTF-8'.freeze
|
|
61
|
+
|
|
62
|
+
STRING_SEPARATOR_RE = %r{[; ,|]}
|
|
63
|
+
|
|
75
64
|
class << self
|
|
76
65
|
|
|
77
66
|
def talk(*args)
|
|
@@ -82,10 +71,6 @@ class Lingo
|
|
|
82
71
|
Call.new(['-c', cfg, *args]).call(&block)
|
|
83
72
|
end
|
|
84
73
|
|
|
85
|
-
def error(msg)
|
|
86
|
-
abort(msg)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
74
|
def list(type, options = {})
|
|
90
75
|
options = options_for(type, options)
|
|
91
76
|
path = path_for(options)
|
|
@@ -130,10 +115,12 @@ class Lingo
|
|
|
130
115
|
def find_file(file, path, options)
|
|
131
116
|
pn = Pathname.new(file_with_ext(file, options)).cleanpath
|
|
132
117
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
118
|
+
if pn.relative?
|
|
119
|
+
walk(path, options) { |dir|
|
|
120
|
+
pn2 = pn.expand_path(dir)
|
|
121
|
+
pn = pn2 and break if pn2.exist?
|
|
122
|
+
}
|
|
123
|
+
end
|
|
137
124
|
|
|
138
125
|
realpath_for(pn, path)
|
|
139
126
|
end
|
|
@@ -142,18 +129,14 @@ class Lingo
|
|
|
142
129
|
base = basename(:dict, find(:dict, file, path))
|
|
143
130
|
|
|
144
131
|
walk(path.reverse, options, false) { |dir|
|
|
145
|
-
Pathname.new(dir).ascend { |
|
|
146
|
-
break
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
s.chomp!(File.extname(s))
|
|
150
|
-
} if r.writable?
|
|
151
|
-
|
|
152
|
-
break true if r.exist?
|
|
132
|
+
Pathname.new(dir).ascend { |i|
|
|
133
|
+
break true if i.file?
|
|
134
|
+
return File.chomp_ext(File.join(dir, base)) if i.writable?
|
|
135
|
+
break true if i.exist?
|
|
153
136
|
}
|
|
154
137
|
}
|
|
155
138
|
|
|
156
|
-
raise
|
|
139
|
+
raise NoWritableStoreError.new(file, path)
|
|
157
140
|
end
|
|
158
141
|
|
|
159
142
|
def options_for(type, options = {})
|
|
@@ -190,6 +173,11 @@ class Lingo
|
|
|
190
173
|
pn.realpath(path.first).to_s
|
|
191
174
|
end
|
|
192
175
|
|
|
176
|
+
def require_optional(lib)
|
|
177
|
+
require lib unless ENV["LINGO_NO_#{lib.upcase}"]
|
|
178
|
+
rescue LoadError
|
|
179
|
+
end
|
|
180
|
+
|
|
193
181
|
end
|
|
194
182
|
|
|
195
183
|
attr_reader :dictionaries, :report_status, :report_time
|
|
@@ -205,10 +193,19 @@ class Lingo
|
|
|
205
193
|
|
|
206
194
|
def dictionary_config
|
|
207
195
|
@dictionary_config ||= config['language/dictionary']
|
|
196
|
+
rescue => err
|
|
197
|
+
raise ConfigLoadError.new(err)
|
|
208
198
|
end
|
|
209
199
|
|
|
210
200
|
def database_config(id)
|
|
211
|
-
dictionary_config['databases'][id]
|
|
201
|
+
dictionary_config['databases'][id].tap { |cfg|
|
|
202
|
+
raise NoDatabaseConfigError.new(id) unless cfg
|
|
203
|
+
raise InvalidDatabaseConfigError.new(id) unless cfg.has_key?('name')
|
|
204
|
+
}
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
def lexical_hash(src)
|
|
208
|
+
@lexical_hash[src]
|
|
212
209
|
end
|
|
213
210
|
|
|
214
211
|
def talk
|
|
@@ -226,7 +223,7 @@ class Lingo
|
|
|
226
223
|
|
|
227
224
|
list.each { |hash|
|
|
228
225
|
# {'attendee' => {'name'=>'Attendee', 'in'=>'nase', 'out'=>'ohr', 'param'=>'hase'}}
|
|
229
|
-
cfg = hash.values.first.merge('name' => hash.keys.first.
|
|
226
|
+
cfg = hash.values.first.merge('name' => hash.keys.first.camelcase)
|
|
230
227
|
|
|
231
228
|
%w[in out].each { |key| (cfg[key] ||= '').downcase! }
|
|
232
229
|
|
|
@@ -240,16 +237,16 @@ class Lingo
|
|
|
240
237
|
attendee = Attendee.const_get(cfg['name']).new(cfg, self)
|
|
241
238
|
@attendees << attendee
|
|
242
239
|
|
|
243
|
-
cfg['in'].split(
|
|
240
|
+
cfg['in'].split(STRING_SEPARATOR_RE).each { |interest|
|
|
244
241
|
subscriber[interest] << attendee
|
|
245
242
|
}
|
|
246
|
-
cfg['out'].split(
|
|
243
|
+
cfg['out'].split(STRING_SEPARATOR_RE).each { |theme|
|
|
247
244
|
supplier[theme] << attendee
|
|
248
245
|
}
|
|
249
246
|
}
|
|
250
247
|
|
|
251
|
-
supplier.each { |channel, attendees| attendees.each { |
|
|
252
|
-
|
|
248
|
+
supplier.each { |channel, attendees| attendees.each { |attendee|
|
|
249
|
+
attendee.add_subscriber(subscriber[channel])
|
|
253
250
|
} }
|
|
254
251
|
end
|
|
255
252
|
|
|
@@ -257,65 +254,39 @@ class Lingo
|
|
|
257
254
|
@report_status, @report_time = report_status, report_time
|
|
258
255
|
|
|
259
256
|
time = Benchmark.realtime {
|
|
260
|
-
@attendees.first.listen(AgendaItem.new(STR_CMD_TALK))
|
|
257
|
+
@attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_TALK))
|
|
261
258
|
}
|
|
262
259
|
|
|
263
260
|
if report_status || report_time
|
|
264
|
-
|
|
265
|
-
@attendees.first.listen(AgendaItem.new(STR_CMD_STATUS))
|
|
266
|
-
|
|
261
|
+
warn "Require protocol...\n#{separator = '-' * 61}"
|
|
262
|
+
@attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_STATUS))
|
|
263
|
+
warn "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
|
|
267
264
|
end
|
|
268
265
|
end
|
|
269
266
|
|
|
270
267
|
def reset(close = true)
|
|
271
268
|
dictionaries.each(&:close) if close
|
|
272
269
|
@dictionaries, @attendees = [], []
|
|
270
|
+
@lexical_hash = Hash.new { |h, k| h[k] = Language::LexicalHash.new(k, self) }
|
|
273
271
|
end
|
|
274
272
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def initialize(args = [])
|
|
278
|
-
super(args, StringIO.new, StringIO.new, StringIO.new)
|
|
279
|
-
end
|
|
280
|
-
|
|
281
|
-
def call
|
|
282
|
-
invite
|
|
283
|
-
|
|
284
|
-
if block_given?
|
|
285
|
-
begin
|
|
286
|
-
yield self
|
|
287
|
-
ensure
|
|
288
|
-
reset
|
|
289
|
-
end
|
|
290
|
-
else
|
|
291
|
-
self
|
|
292
|
-
end
|
|
293
|
-
end
|
|
294
|
-
|
|
295
|
-
def talk(str)
|
|
296
|
-
config.stdin.reopen(str)
|
|
297
|
-
|
|
298
|
-
start
|
|
299
|
-
|
|
300
|
-
%w[stdout stderr].flat_map { |key|
|
|
301
|
-
io = config.send(key).tap(&:rewind)
|
|
302
|
-
io.readlines.each(&:chomp!).tap {
|
|
303
|
-
io.truncate(0)
|
|
304
|
-
io.rewind
|
|
305
|
-
}
|
|
306
|
-
}.tap { |res|
|
|
307
|
-
if block_given?
|
|
308
|
-
res.map!(&Proc.new)
|
|
309
|
-
else
|
|
310
|
-
res.sort!
|
|
311
|
-
res.uniq!
|
|
312
|
-
end
|
|
313
|
-
}
|
|
314
|
-
end
|
|
315
|
-
|
|
273
|
+
def warn(*msg)
|
|
274
|
+
config.stderr.puts(*msg)
|
|
316
275
|
end
|
|
317
276
|
|
|
318
277
|
end
|
|
319
278
|
|
|
279
|
+
require_relative 'lingo/call'
|
|
280
|
+
require_relative 'lingo/error'
|
|
281
|
+
require_relative 'lingo/config'
|
|
282
|
+
require_relative 'lingo/core_ext'
|
|
283
|
+
require_relative 'lingo/cachable'
|
|
284
|
+
require_relative 'lingo/reportable'
|
|
285
|
+
require_relative 'lingo/agenda_item'
|
|
286
|
+
require_relative 'lingo/database'
|
|
287
|
+
require_relative 'lingo/language'
|
|
288
|
+
require_relative 'lingo/attendee'
|
|
289
|
+
require_relative 'lingo/version'
|
|
290
|
+
|
|
320
291
|
require 'nuggets/util/pluggable'
|
|
321
292
|
Util::Pluggable.load_plugins_for(Lingo)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
# A control message passed between attendees. It consists of a command
# string and an optional parameter string; a missing (nil) command or
# parameter is stored as the empty string.
#
# Agenda items are ordered by their [cmd, param] pair. Compared with
# anything that is not an agenda item, an agenda item sorts last.
class AgendaItem

  include Comparable

  attr_reader :cmd, :param

  # cmd::   the command string (nil becomes '')
  # param:: the command's parameter (nil becomes '')
  def initialize(cmd, param = nil)
    @cmd   = cmd || ''
    @param = param || ''
  end

  # Compares by [cmd, param]; returns 1 when +other+ is not an
  # instance of this class.
  def <=>(other)
    return 1 unless other.is_a?(self.class)

    to_a <=> other.to_a
  end

  # Returns the command and its parameter as a two-element array.
  def to_a
    [cmd, param]
  end

  # Short, human-readable form, e.g. *TALK('').
  def inspect
    "*#{cmd.upcase}('#{param}')"
  end

end
|
|
52
|
+
|
|
53
|
+
end
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
# Lingo was designed as a universal indexing system. Its strength lies in
# being easy to configure for specific tasks and in allowing further
# functionality to be developed quickly, by systematically encapsulating
# complexity in small processing units. The smallest processing unit is
# called an Attendee. To obtain a desired processing result, the required
# attendees are simply chained one after another. A simple example is a
# direct connection between a text reader, a tokenizer and a text writer.
# All three classes are derived from Attendee.
#
# The text reader, for example, reads lines from a text file and passes
# them on to the tokenizer. The tokenizer splits a line of text into
# individual words and passes these on to the text writer, which writes
# them to a (different) file. A wide range of configuration options allows
# the behaviour of the attendees to be adapted to one's own needs.
#
# Attendees are chained via the +listen+ and +talk+ interfaces. Arbitrary
# objects can be handed to +listen+ for processing. After processing they
# are passed on to the chained attendees via +talk+. Objects of class
# AgendaItem control the processing and are not part of the normal data
# flow. Examples are the commands TALK (request to start processing) and
# EOL (end of line, i.e. the end of a text line after it has been split
# into words). The stream commands in use are the STR_CMD_ constants
# defined in this class. (The original notes also mention a WARN command;
# no STR_CMD_ constant for it is defined here.)
#
# To speed up the development of new attendees, a subclass usually only has
# to implement three methods: +init+, +control+ and +process+. +init+ is
# called once when an object is instantiated; it prepares processing, e.g.
# by opening and providing dictionaries for linguistic analysis. All
# incoming AgendaItems are passed to +control+. That is where processing is
# steered, e.g. opening a file on STR_CMD_FILE and closing it on
# STR_CMD_EOF. The actual processing of data is done by +process+.
#
# What Attendee does:
# - chains attendees based on configuration information
# - provides global and attendee-specific configuration information
# - handles certain overarching commands, e.g. STR_CMD_TALK, STR_CMD_STATUS
# - separates and routes command objects and data objects
#
# What the derived class does:
# - processes and/or transforms data objects
# - is controlled by commands
# - writes processing statistics

class Attendee

  include Language
  include Reportable

  # Stream commands carried by AgendaItem objects.
  STR_CMD_TALK   = 'TALK'
  STR_CMD_STATUS = 'STATUS'
  STR_CMD_LIR    = 'LIR-FORMAT'
  STR_CMD_FILE   = 'FILE'
  STR_CMD_EOL    = 'EOL'
  STR_CMD_RECORD = 'RECORD'
  STR_CMD_EOF    = 'EOF'

  # Labels of the per-attendee counters (NUM) and timers (TIM); looked up
  # by name in +sta_for+.
  STA_NUM_COMMANDS = 'Received Commands'
  STA_NUM_OBJECTS  = 'Received Objects '
  STA_TIM_COMMANDS = 'Time to control '
  STA_TIM_OBJECTS  = 'Time to process '

  # config:: this attendee's configuration hash
  # lingo::  the owning Lingo instance
  #
  # Calls the subclass's +init+ hook if one is defined and records whether
  # the subclass implements +control+ and +process+.
  #
  # NOTE(review): init_reportable, inc, add, get and report are not defined
  # in this class -- presumably provided by Reportable; verify.
  def initialize(config, lingo)
    @lingo = lingo

    init_reportable

    # Make sure config exists
    lingo.dictionary_config

    @config, @subscriber = config, []

    init if self.class.method_defined?(:init)

    @can_control = self.class.method_defined?(:control)
    @can_process = self.class.method_defined?(:process)

    @skip_command, @timer = false, nil
  end

  # Appends the given attendees to the list of attendees that receive
  # everything this attendee passes to +talk+.
  def add_subscriber(subscriber)
    @subscriber.concat(subscriber)
  end

  # Entry point for incoming objects.
  #
  # Data objects go to +process+ when the subclass defines it and are
  # forwarded unchanged otherwise. AgendaItems go to +control+ (if
  # defined) and are then routed by command: TALK is not forwarded, STATUS
  # triggers the reports and is forwarded, and any other command is
  # forwarded unless +skip_command+ was requested while handling it.
  def listen(obj)
    if obj.is_a?(AgendaItem)
      args = obj.to_a
      stat_timer(:commands) { control(*args) } if @can_control

      case obj.cmd
        when STR_CMD_TALK
          nil
        when STR_CMD_STATUS
          report_time
          report_status

          forward(*args)
        else
          forward(*args) unless skip_command!
      end
    else
      @can_process ? stat_timer(:objects) { process(obj) } : forward(obj)
    end
  end

  # Hands +obj+ to every subscriber's +listen+.
  def talk(obj)
    charge_timer { @subscriber.each { |attendee| attendee.listen(obj) } }
  end

  private

  # Returns the [counter label, timer label] pair for +key+
  # (:commands or :objects).
  def sta_for(key)
    %w[NUM TIM].map { |i| self.class.const_get("STA_#{i}_#{key.upcase}") }
  end

  # Counts one command/object for +key+ and, when time reporting is on,
  # adds the time spent in the block to the matching timer. Returns the
  # block's result.
  def stat_timer(key)
    n, t = sta_for(key)
    inc(n)

    return yield unless @lingo.report_time

    @timer = Time.new
    res = yield
    add(t, Time.new - @timer)
    res
  end

  # Runs the block and, when time reporting is on, moves the running
  # timer's start forward by the block's duration, so that time spent in
  # subscribers is not charged to this attendee. Returns the block's
  # result.
  def charge_timer
    return yield unless @lingo.report_time

    # Read the start time before yielding, as `@timer += ...` would.
    started = @timer

    res = nil
    elapsed = Benchmark.realtime { res = yield }

    # No timer is running when talk/forward is reached outside of
    # stat_timer (e.g. first command on an attendee without +control+);
    # there is nothing to adjust in that case.
    @timer = started + elapsed if started

    res
  end

  # Emits one performance line (counts, total and per-item times for
  # commands and objects) via the Lingo instance, if time reporting is on.
  def report_time
    return unless @lingo.report_time

    msg = 'Perf: %-15s ' <<
          '=> %7d commands in %s (%s/cmd)' <<
          ', %8d objects in %s (%s/obj)'

    arg = [@config['name']]

    %w[commands objects].each { |k|
      n, t = sta_for(k).map(&method(:get))
      arg << n

      # Divisor 1 yields the total time, divisor n the time per item.
      arg.concat([1, n].map { |m|
        s = m.zero? ? 0.0 : t / m.to_f

        '%9.3f %-2s' %
          if s < 0.001
            [s * 1000.0 ** 2, 'µs']
          elsif s < 1.0
            [s * 1000.0, 'ms']
          elsif s < 60.0
            [s, 's']
          elsif s < 60.0 ** 2
            [s / 60.0, 'm']
          else
            [s / 60.0 ** 2, 'h']
          end
      })
    }

    @lingo.warn msg % arg
  end

  # Emits this attendee's name, channels and sorted report entries via the
  # Lingo instance, if status reporting is on.
  def report_status
    return unless @lingo.report_status

    msg = "Attendee <%s> was connected from '%s' to '%s' reporting..."

    @lingo.warn msg % @config.values_at(*%w[name in out]),
      nil, report.sort.map { |k, v| " #{k} = #{v}" }, nil
  end

  # Requests that the command currently being handled is not forwarded.
  def skip_command
    @skip_command = true
  end

  # Returns the current skip flag and clears it.
  def skip_command!
    @skip_command.tap { @skip_command &&= false }
  end

  # Passes +obj+ on to the subscribers; when +param+ is given, +obj+ is
  # taken as a command and wrapped in a new AgendaItem first.
  def forward(obj, param = nil)
    talk(param ? AgendaItem.new(obj, param) : obj)
  end

  # Whether this attendee's configuration contains +key+.
  def has_key?(key)
    @config && @config.has_key?(key)
  end

  # Returns the configuration value for +key+, or +default+ if the key is
  # absent. Raises MissingConfigError when the key is absent and the
  # caller supplied no default: +nodefault+ is only assigned when the
  # default expression is evaluated, i.e. when the argument was omitted.
  def get_key(key, default = nodefault = Object.new)
    raise MissingConfigError.new(key) if nodefault && !has_key?(key)
    @config.fetch(key, default)
  end

  # Returns the configuration value for +key+ split on
  # STRING_SEPARATOR_RE. +default+ is the unsplit fallback string.
  #
  # Without a default, a missing key raises MissingConfigError. Passing
  # the nil default through to +get_key+ would suppress that check and
  # fail later with NoMethodError on nil.
  def get_array(key, default = nil)
    value = default.nil? ? get_key(key) : get_key(key, default)
    value.split(STRING_SEPARATOR_RE)
  end

  # Builds a Language::Dictionary for the given sources and mode.
  def dictionary(src, mod)
    Language::Dictionary.new({ 'source' => src, 'mode' => mod }, @lingo)
  end

  # Builds a Language::Grammar for the given sources and mode.
  def grammar(src, mod)
    Language::Grammar.new({ 'source' => src, 'mode' => mod }, @lingo)
  end

  # Sets @dic from the 'source' and 'mode' (default 'all') config keys.
  def set_dic
    @dic = dictionary(get_array('source'), get_key('mode', 'all'))
  end

  # Sets @gra from the 'source' and 'mode' (default 'all') config keys.
  def set_gra
    @gra = grammar(get_array('source'), get_key('mode', 'all'))
  end

end
|
|
241
|
+
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
require_relative 'buffered_attendee'
|
|
245
|
+
|
|
246
|
+
require_relative 'attendee/abbreviator'
|
|
247
|
+
require_relative 'attendee/debugger'
|
|
248
|
+
require_relative 'attendee/decomposer'
|
|
249
|
+
require_relative 'attendee/dehyphenizer'
|
|
250
|
+
require_relative 'attendee/multi_worder'
|
|
251
|
+
require_relative 'attendee/noneword_filter'
|
|
252
|
+
require_relative 'attendee/object_filter'
|
|
253
|
+
require_relative 'attendee/variator'
|
|
254
|
+
require_relative 'attendee/sequencer'
|
|
255
|
+
require_relative 'attendee/synonymer'
|
|
256
|
+
require_relative 'attendee/text_reader'
|
|
257
|
+
require_relative 'attendee/text_writer'
|
|
258
|
+
require_relative 'attendee/formatter'
|
|
259
|
+
require_relative 'attendee/tokenizer'
|
|
260
|
+
require_relative 'attendee/vector_filter'
|
|
261
|
+
require_relative 'attendee/word_searcher'
|