lingo 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -1,27 +1,24 @@
1
1
  #--
2
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
3
- # Mehrworterkennung und Relationierung.
4
- #
5
- # Copyright (C) 2005-2007 John Vorhauer
6
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
7
- #
8
- # This program is free software; you can redistribute it and/or modify it under
9
- # the terms of the GNU Affero General Public License as published by the Free
10
- # Software Foundation; either version 3 of the License, or (at your option)
11
- # any later version.
12
- #
13
- # This program is distributed in the hope that it will be useful, but WITHOUT
14
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
16
- # details.
17
- #
18
- # You should have received a copy of the GNU Affero General Public License along
19
- # with this program; if not, write to the Free Software Foundation, Inc.,
20
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
21
- #
22
- # For more information visit http://www.lex-lingo.de or contact me at
23
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
24
- #
25
- # Lex Lingo rules from here on
2
+ ###############################################################################
3
+ # #
4
+ # Lingo -- A full-featured automatic indexing system #
5
+ # #
6
+ # Copyright (C) 2005-2007 John Vorhauer #
7
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
8
+ # #
9
+ # Lingo is free software; you can redistribute it and/or modify it under the #
10
+ # terms of the GNU Affero General Public License as published by the Free #
11
+ # Software Foundation; either version 3 of the License, or (at your option) #
12
+ # any later version. #
13
+ # #
14
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
15
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
16
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
17
+ # more details. #
18
+ # #
19
+ # You should have received a copy of the GNU Affero General Public License #
20
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
21
+ # #
22
+ ###############################################################################
26
23
  #++
27
24
 
@@ -1,55 +1,35 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
28
25
  #++
29
26
 
30
27
  require 'stringio'
31
28
  require 'benchmark'
29
+ require 'nuggets/file/ext'
32
30
  require 'nuggets/env/user_home'
33
31
  require 'nuggets/numeric/duration'
34
-
35
- require_relative 'lingo/config'
36
- require_relative 'lingo/attendees'
37
- require_relative 'lingo/attendee/abbreviator'
38
- require_relative 'lingo/attendee/debugger'
39
- require_relative 'lingo/attendee/decomposer'
40
- require_relative 'lingo/attendee/dehyphenizer'
41
- require_relative 'lingo/attendee/multiworder'
42
- require_relative 'lingo/attendee/noneword_filter'
43
- require_relative 'lingo/attendee/objectfilter'
44
- require_relative 'lingo/attendee/variator'
45
- require_relative 'lingo/attendee/sequencer'
46
- require_relative 'lingo/attendee/synonymer'
47
- require_relative 'lingo/attendee/textreader'
48
- require_relative 'lingo/attendee/textwriter'
49
- require_relative 'lingo/attendee/tokenizer'
50
- require_relative 'lingo/attendee/vector_filter'
51
- require_relative 'lingo/attendee/wordsearcher'
52
- require_relative 'lingo/version'
32
+ require 'nuggets/string/camelscore'
53
33
 
54
34
  class Lingo
55
35
 
@@ -65,13 +45,22 @@ class Lingo
65
45
  # The search path for Lingo dictionary and configuration files.
66
46
  PATH = ENV['LINGO_PATH'] || [CURR, HOME, BASE].join(File::PATH_SEPARATOR)
67
47
 
48
+ ENV['LINGO_PLUGIN_PATH'] ||= File.join(HOME, 'plugins')
49
+
50
+ # Map of file types to their standard location and file extension.
68
51
  FIND_OPTIONS = {
69
52
  config: { dir: 'config', ext: 'cfg' },
70
53
  dict: { dir: 'dict', ext: 'txt' },
71
54
  lang: { dir: 'lang', ext: 'lang' },
72
- store: { dir: 'store', ext: nil }
55
+ store: { dir: 'store', ext: nil },
56
+ sample: { dir: 'txt', ext: 'txt' }
73
57
  }
74
58
 
59
+ # Default encoding
60
+ ENC = 'UTF-8'.freeze
61
+
62
+ STRING_SEPARATOR_RE = %r{[; ,|]}
63
+
75
64
  class << self
76
65
 
77
66
  def talk(*args)
@@ -82,10 +71,6 @@ class Lingo
82
71
  Call.new(['-c', cfg, *args]).call(&block)
83
72
  end
84
73
 
85
- def error(msg)
86
- abort(msg)
87
- end
88
-
89
74
  def list(type, options = {})
90
75
  options = options_for(type, options)
91
76
  path = path_for(options)
@@ -130,10 +115,12 @@ class Lingo
130
115
  def find_file(file, path, options)
131
116
  pn = Pathname.new(file_with_ext(file, options)).cleanpath
132
117
 
133
- walk(path, options) { |dir|
134
- pn2 = pn.expand_path(dir)
135
- pn = pn2 and break if pn2.exist?
136
- } if pn.relative?
118
+ if pn.relative?
119
+ walk(path, options) { |dir|
120
+ pn2 = pn.expand_path(dir)
121
+ pn = pn2 and break if pn2.exist?
122
+ }
123
+ end
137
124
 
138
125
  realpath_for(pn, path)
139
126
  end
@@ -142,18 +129,14 @@ class Lingo
142
129
  base = basename(:dict, find(:dict, file, path))
143
130
 
144
131
  walk(path.reverse, options, false) { |dir|
145
- Pathname.new(dir).ascend { |r|
146
- break true if r.file?
147
-
148
- return File.join(dir, base).tap { |s|
149
- s.chomp!(File.extname(s))
150
- } if r.writable?
151
-
152
- break true if r.exist?
132
+ Pathname.new(dir).ascend { |i|
133
+ break true if i.file?
134
+ return File.chomp_ext(File.join(dir, base)) if i.writable?
135
+ break true if i.exist?
153
136
  }
154
137
  }
155
138
 
156
- raise 'No writable store found in search path'
139
+ raise NoWritableStoreError.new(file, path)
157
140
  end
158
141
 
159
142
  def options_for(type, options = {})
@@ -190,6 +173,11 @@ class Lingo
190
173
  pn.realpath(path.first).to_s
191
174
  end
192
175
 
176
+ def require_optional(lib)
177
+ require lib unless ENV["LINGO_NO_#{lib.upcase}"]
178
+ rescue LoadError
179
+ end
180
+
193
181
  end
194
182
 
195
183
  attr_reader :dictionaries, :report_status, :report_time
@@ -205,10 +193,19 @@ class Lingo
205
193
 
206
194
  def dictionary_config
207
195
  @dictionary_config ||= config['language/dictionary']
196
+ rescue => err
197
+ raise ConfigLoadError.new(err)
208
198
  end
209
199
 
210
200
  def database_config(id)
211
- dictionary_config['databases'][id]
201
+ dictionary_config['databases'][id].tap { |cfg|
202
+ raise NoDatabaseConfigError.new(id) unless cfg
203
+ raise InvalidDatabaseConfigError.new(id) unless cfg.has_key?('name')
204
+ }
205
+ end
206
+
207
+ def lexical_hash(src)
208
+ @lexical_hash[src]
212
209
  end
213
210
 
214
211
  def talk
@@ -226,7 +223,7 @@ class Lingo
226
223
 
227
224
  list.each { |hash|
228
225
  # {'attendee' => {'name'=>'Attendee', 'in'=>'nase', 'out'=>'ohr', 'param'=>'hase'}}
229
- cfg = hash.values.first.merge('name' => hash.keys.first.capitalize)
226
+ cfg = hash.values.first.merge('name' => hash.keys.first.camelcase)
230
227
 
231
228
  %w[in out].each { |key| (cfg[key] ||= '').downcase! }
232
229
 
@@ -240,16 +237,16 @@ class Lingo
240
237
  attendee = Attendee.const_get(cfg['name']).new(cfg, self)
241
238
  @attendees << attendee
242
239
 
243
- cfg['in'].split(STRING_SEPERATOR_PATTERN).each { |interest|
240
+ cfg['in'].split(STRING_SEPARATOR_RE).each { |interest|
244
241
  subscriber[interest] << attendee
245
242
  }
246
- cfg['out'].split(STRING_SEPERATOR_PATTERN).each { |theme|
243
+ cfg['out'].split(STRING_SEPARATOR_RE).each { |theme|
247
244
  supplier[theme] << attendee
248
245
  }
249
246
  }
250
247
 
251
- supplier.each { |channel, attendees| attendees.each { |att|
252
- att.add_subscriber(subscriber[channel])
248
+ supplier.each { |channel, attendees| attendees.each { |attendee|
249
+ attendee.add_subscriber(subscriber[channel])
253
250
  } }
254
251
  end
255
252
 
@@ -257,65 +254,39 @@ class Lingo
257
254
  @report_status, @report_time = report_status, report_time
258
255
 
259
256
  time = Benchmark.realtime {
260
- @attendees.first.listen(AgendaItem.new(STR_CMD_TALK))
257
+ @attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_TALK))
261
258
  }
262
259
 
263
260
  if report_status || report_time
264
- config.stderr.puts "Require protocol...\n#{separator = '-' * 61}"
265
- @attendees.first.listen(AgendaItem.new(STR_CMD_STATUS))
266
- config.stderr.puts "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
261
+ warn "Require protocol...\n#{separator = '-' * 61}"
262
+ @attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_STATUS))
263
+ warn "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
267
264
  end
268
265
  end
269
266
 
270
267
  def reset(close = true)
271
268
  dictionaries.each(&:close) if close
272
269
  @dictionaries, @attendees = [], []
270
+ @lexical_hash = Hash.new { |h, k| h[k] = Language::LexicalHash.new(k, self) }
273
271
  end
274
272
 
275
- class Call < Lingo
276
-
277
- def initialize(args = [])
278
- super(args, StringIO.new, StringIO.new, StringIO.new)
279
- end
280
-
281
- def call
282
- invite
283
-
284
- if block_given?
285
- begin
286
- yield self
287
- ensure
288
- reset
289
- end
290
- else
291
- self
292
- end
293
- end
294
-
295
- def talk(str)
296
- config.stdin.reopen(str)
297
-
298
- start
299
-
300
- %w[stdout stderr].flat_map { |key|
301
- io = config.send(key).tap(&:rewind)
302
- io.readlines.each(&:chomp!).tap {
303
- io.truncate(0)
304
- io.rewind
305
- }
306
- }.tap { |res|
307
- if block_given?
308
- res.map!(&Proc.new)
309
- else
310
- res.sort!
311
- res.uniq!
312
- end
313
- }
314
- end
315
-
273
+ def warn(*msg)
274
+ config.stderr.puts(*msg)
316
275
  end
317
276
 
318
277
  end
319
278
 
279
+ require_relative 'lingo/call'
280
+ require_relative 'lingo/error'
281
+ require_relative 'lingo/config'
282
+ require_relative 'lingo/core_ext'
283
+ require_relative 'lingo/cachable'
284
+ require_relative 'lingo/reportable'
285
+ require_relative 'lingo/agenda_item'
286
+ require_relative 'lingo/database'
287
+ require_relative 'lingo/language'
288
+ require_relative 'lingo/attendee'
289
+ require_relative 'lingo/version'
290
+
320
291
  require 'nuggets/util/pluggable'
321
292
  Util::Pluggable.load_plugins_for(Lingo)
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
# A command travelling through the attendee chain: a command name plus an
# optional parameter. Items are ordered and compared by the pair
# <tt>[cmd, param]</tt>.
class AgendaItem

  include Comparable

  attr_reader :cmd, :param

  # Stores the command and its parameter; a +nil+ (or +false+) value for
  # either one is replaced by an empty string.
  def initialize(cmd, param = nil)
    @cmd   = cmd || ''
    @param = param || ''
  end

  # Compares by <tt>[cmd, param]</tt>. Any object that is not an AgendaItem
  # is treated as smaller, i.e. the result is 1.
  def <=>(other)
    return 1 unless other.is_a?(self.class)

    to_a <=> other.to_a
  end

  # The item as a two-element array: command first, parameter second.
  def to_a
    [@cmd, @param]
  end

  # Compact representation, e.g. <tt>*TALK('')</tt>.
  def inspect
    format("*%s('%s')", cmd.upcase, param)
  end

end
52
+
53
+ end
@@ -0,0 +1,261 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ # Lingo ist als universelles Indexierungssystem entworfen worden. Seine Stärke liegt in der einfachen Konfigurierbarkeit für
30
+ # spezifische Aufgaben und in der schnellen Entwicklung weiterer Funktionen durch systematische Kapselung der Komplexität auf
31
+ # kleine Verarbeitungseinheiten. Die kleinste Verarbeitungseinheit wird Attendee genannt. Um ein gewünschtes Verarbeitungsergebnis
32
+ # zu bekommen, werden die benötigten Attendees einfach in einer Reihe hinter einander geschaltet. Ein einfaches Beispiel hierfür ist
33
+ # eine direkte Verbindung zwischen einem Textreader, einem Tokenizer und einem Textwriter. Alle drei Klassen sind von der Klasse
34
+ # Attendee abgeleitet.
35
+ #
36
+ # Der Textreader liest beispielsweise Zeilen aus einer Textdatei und leitet sie weiter an den Tokenizer. Der Tokenizer zerlegt eine
37
+ # Textzeile in einzelne Wörter und gibt diese weiter an den Textwriter, der diese in eine (andere) Datei schreibt. Über vielfältige
38
+ # Konfigurationsmöglichkeiten kann das Verhalten der Attendees an die eigenen Bedürfnisse angepasst werden.
39
+ #
40
+ # Die Verkettung einzelner Attendees findet über die Schnittstellen +listen+ und +talk+ statt. An +listen+ können beliebige Objekte
41
+ # zur Ver- und Bearbeitung übergeben werden. Nach der Verarbeitung werden sie mittels +talk+ an die verketteten Attendees weiter
42
+ # gegeben. Objekte der Klasse AgendaItem dienen dabei der Steuerung der Verarbeitung und sind nicht Bestandteil der normalen
43
+ # Verarbeitung. Beispiele für AgendaItems sind die Kommandos TALK (Aufforderung zum Start der Verarbeitung), WARN (zur Ausgabe von
44
+ # Warnungen eines Attendees) und EOL (End of Line, Ende einer Textzeile nach Zerlegung in einzelne Wörter). Eine vollständige
45
+ # Übersicht benutzter AgendaItems (oder auch Stream Commands) steht in der Klasse Attendee als Konstanten mit dem Prefix STR_CMD_.
46
+ #
47
+ # Um die Entwicklung von neuen Attendees zu beschleunigen, sind durch die Vererbung von der Klasse Attendee in der Regel nur
48
+ # drei abstrakte Methoden zu implementieren: +init+, +control+ und +process+. Die Methode +init+ wird bei der Instanziierung eines
49
+ # Objektes einmalig aufgerufen. Sie dient der Vorbereitung der Verarbeitung, z.B. durch das Öffnen und Bereitstellen von
50
+ # Wörterbüchern zur linguistischen Analyse. An die Methode +control+ werden alle eingehenden AgendaItems weitergeleitet. Dort erfolgt
51
+ # die Verarbeitungssteuerung, also z.B. bei STR_CMD_FILE das Öffnen einer Datei und bei STR_CMD_EOF respektive das Schließen. Die
52
+ # echte Verarbeitung von Daten findet daher durch die Methode +process+ statt.
53
+ #
54
+ # was macht attendee
55
+ # - verkettung der attendees anhand von konfigurationsinformationen
56
+ # - bereitstellung von globalen und spezifischen konfigurationsinformationen
57
+ # - behandlung von bestimmten übergreifenden Kommandos, z.B. STR_CMD_TALK, STR_CMD_STATUS
58
+ # - separierung und routing von kommando bzw. datenobjekten
59
+ #
60
+ # was macht die abgeleitet klasse
61
+ # - verarbeitet und/oder transformiert datenobjekte
62
+ # - wird gesteuert durch kommandos
63
+ # - schreibt verarbeitungsstatistiken
64
+
65
# Base class of all processing units ("attendees") of a Lingo pipeline.
#
# Attendees are chained: objects arrive through #listen and are handed on to
# all subscribed attendees through #talk. AgendaItem objects are control
# commands; everything else is regular data.
#
# Subclasses customise behaviour by defining any of these methods:
#
# +init+::    called once at the end of construction (if defined).
# +control+:: receives every incoming AgendaItem as <tt>(cmd, param)</tt>.
# +process+:: receives every incoming non-command object. When it is not
#             defined, such objects are forwarded unchanged.
#
# +init_reportable+, +inc+, +add+, +get+ and +report+ are not defined here;
# presumably they are provided by Reportable -- confirm against that module.
class Attendee

  include Language
  include Reportable

  # Command names carried by AgendaItem objects.
  STR_CMD_TALK   = 'TALK'
  STR_CMD_STATUS = 'STATUS'
  STR_CMD_LIR    = 'LIR-FORMAT'
  STR_CMD_FILE   = 'FILE'
  STR_CMD_EOL    = 'EOL'
  STR_CMD_RECORD = 'RECORD'
  STR_CMD_EOF    = 'EOF'

  # Statistics keys; looked up by name in #sta_for (NUM = counter, TIM = time).
  STA_NUM_COMMANDS = 'Received Commands'
  STA_NUM_OBJECTS  = 'Received Objects '
  STA_TIM_COMMANDS = 'Time to control '
  STA_TIM_OBJECTS  = 'Time to process '

  # config:: this attendee's configuration hash (read via #get_key etc.).
  # lingo::  the owning Lingo instance; used for configuration access,
  #          the report flags and for writing warnings.
  def initialize(config, lingo)
    @lingo = lingo

    init_reportable

    # Make sure config exists
    lingo.dictionary_config

    @config, @subscriber = config, []

    init if self.class.method_defined?(:init)

    @can_control = self.class.method_defined?(:control)
    @can_process = self.class.method_defined?(:process)

    @skip_command, @timer = false, nil
  end

  # Appends the given attendees to the list of receivers used by #talk.
  def add_subscriber(subscriber)
    @subscriber.concat(subscriber)
  end

  # Entry point for incoming objects.
  #
  # Data objects go to +process+ (or are forwarded untouched when the
  # subclass has no +process+). Commands go to +control+ first (if defined)
  # and are then routed: TALK is never forwarded, STATUS triggers the
  # reports and is forwarded, any other command is forwarded unless
  # +control+ requested skipping via #skip_command.
  def listen(obj)
    unless obj.is_a?(AgendaItem)
      @can_process ? stat_timer(:objects) { process(obj) } : forward(obj)
    else
      args = obj.to_a
      stat_timer(:commands) { control(*args) } if @can_control

      case obj.cmd
        when STR_CMD_TALK
          nil
        when STR_CMD_STATUS
          report_time
          report_status

          forward(*args)
        else
          forward(*args) unless skip_command!
      end
    end
  end

  # Hands +obj+ to every subscriber. The time spent downstream is charged
  # back to the running timer (see #charge_timer).
  def talk(obj)
    charge_timer { @subscriber.each { |attendee| attendee.listen(obj) } }
  end

  private

  # Returns the pair of statistics keys (counter, time) for +key+, which is
  # +commands+ or +objects+ (String or Symbol).
  def sta_for(key)
    %w[NUM TIM].map { |i| self.class.const_get("STA_#{i}_#{key.upcase}") }
  end

  # Counts one event for +key+ and, when time reporting is enabled, also
  # records how long the block took. Returns the block's result.
  def stat_timer(key)
    n, t = sta_for(key)
    inc(n)

    return yield unless @lingo.report_time

    @timer = Time.new
    res = yield
    add(t, Time.new - @timer)
    res
  end

  # Runs the block and, when time reporting is enabled, moves the start
  # time of the running #stat_timer measurement forward by the block's
  # duration, so that time spent in subscribers is not attributed to this
  # attendee. Returns the block's result.
  def charge_timer
    return yield unless @lingo.report_time

    res = nil
    elapsed = Benchmark.realtime { res = yield }

    # @timer stays nil until stat_timer has run at least once (e.g. when a
    # command is forwarded by an attendee without +control+); there is no
    # measurement to correct in that case.
    @timer += elapsed if @timer

    res
  end

  # Writes one performance line (counts, total and per-item durations for
  # commands and objects) through the Lingo instance, if time reporting is
  # enabled.
  def report_time
    return unless @lingo.report_time

    msg = 'Perf: %-15s ' <<
          '=> %7d commands in %s (%s/cmd)' <<
          ', %8d objects in %s (%s/obj)'

    arg = [@config['name']]

    %w[commands objects].each { |k|
      n, t = sta_for(k).map(&method(:get))
      arg << n

      # First the total (divide by 1), then the average per item.
      arg.concat([1, n].map { |m|
        s = m.zero? ? 0.0 : t / m.to_f

        # Pick the largest unit that keeps the number readable.
        '%9.3f %-2s' %
          if s < 0.001
            [s * 1000.0 ** 2, 'µs']
          elsif s < 1.0
            [s * 1000.0, 'ms']
          elsif s < 60.0
            [s, 's']
          elsif s < 60.0 ** 2
            [s / 60.0, 'm']
          else
            [s / 60.0 ** 2, 'h']
          end
      })
    }

    @lingo.warn msg % arg
  end

  # Writes this attendee's collected statistics through the Lingo instance,
  # if status reporting is enabled.
  def report_status
    return unless @lingo.report_status

    msg = "Attendee <%s> was connected from '%s' to '%s' reporting..."

    @lingo.warn msg % @config.values_at(*%w[name in out]),
      nil, report.sort.map { |k, v| " #{k} = #{v}" }, nil
  end

  # Requests that the command currently being handled is not forwarded.
  def skip_command
    @skip_command = true
  end

  # Returns whether skipping was requested and resets the request.
  def skip_command!
    @skip_command.tap { @skip_command &&= false }
  end

  # Passes +obj+ on to the subscribers. When +param+ is given (any truthy
  # value, including the empty string), +obj+ is taken as a command name
  # and wrapped in a new AgendaItem.
  def forward(obj, param = nil)
    talk(param ? AgendaItem.new(obj, param) : obj)
  end

  # Whether the attendee configuration contains +key+.
  def has_key?(key)
    @config && @config.has_key?(key)
  end

  # Returns the configuration value for +key+.
  #
  # Without a +default+ argument a missing key raises MissingConfigError.
  # (+nodefault+ is only assigned when the caller omitted +default+, which
  # is how the two cases are told apart.) With a +default+ -- even an
  # explicit +nil+ -- that value is returned for a missing key.
  def get_key(key, default = nodefault = Object.new)
    raise MissingConfigError.new(key) if nodefault && !has_key?(key)
    @config.fetch(key, default)
  end

  # Returns the configuration value for +key+ split on
  # STRING_SEPARATOR_RE.
  #
  # A +nil+ default means "no default": a missing key then raises
  # MissingConfigError (as #get_key does) instead of failing with a
  # NoMethodError on +nil+.
  def get_array(key, default = nil)
    value = default.nil? ? get_key(key) : get_key(key, default)
    value.split(STRING_SEPARATOR_RE)
  end

  # Builds a Language::Dictionary for the given sources and mode.
  def dictionary(src, mod)
    Language::Dictionary.new({ 'source' => src, 'mode' => mod }, @lingo)
  end

  # Builds a Language::Grammar for the given sources and mode.
  def grammar(src, mod)
    Language::Grammar.new({ 'source' => src, 'mode' => mod }, @lingo)
  end

  # Sets @dic from the +source+ and +mode+ (default 'all') config keys.
  def set_dic
    @dic = dictionary(get_array('source'), get_key('mode', 'all'))
  end

  # Sets @gra from the +source+ and +mode+ (default 'all') config keys.
  def set_gra
    @gra = grammar(get_array('source'), get_key('mode', 'all'))
  end

end
241
+
242
+ end
243
+
244
+ require_relative 'buffered_attendee'
245
+
246
+ require_relative 'attendee/abbreviator'
247
+ require_relative 'attendee/debugger'
248
+ require_relative 'attendee/decomposer'
249
+ require_relative 'attendee/dehyphenizer'
250
+ require_relative 'attendee/multi_worder'
251
+ require_relative 'attendee/noneword_filter'
252
+ require_relative 'attendee/object_filter'
253
+ require_relative 'attendee/variator'
254
+ require_relative 'attendee/sequencer'
255
+ require_relative 'attendee/synonymer'
256
+ require_relative 'attendee/text_reader'
257
+ require_relative 'attendee/text_writer'
258
+ require_relative 'attendee/formatter'
259
+ require_relative 'attendee/tokenizer'
260
+ require_relative 'attendee/vector_filter'
261
+ require_relative 'attendee/word_searcher'