lingo 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
@@ -1,27 +1,24 @@
1
1
  #--
2
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
3
- # Mehrworterkennung und Relationierung.
4
- #
5
- # Copyright (C) 2005-2007 John Vorhauer
6
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
7
- #
8
- # This program is free software; you can redistribute it and/or modify it under
9
- # the terms of the GNU Affero General Public License as published by the Free
10
- # Software Foundation; either version 3 of the License, or (at your option)
11
- # any later version.
12
- #
13
- # This program is distributed in the hope that it will be useful, but WITHOUT
14
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
16
- # details.
17
- #
18
- # You should have received a copy of the GNU Affero General Public License along
19
- # with this program; if not, write to the Free Software Foundation, Inc.,
20
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
21
- #
22
- # For more information visit http://www.lex-lingo.de or contact me at
23
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
24
- #
25
- # Lex Lingo rules from here on
2
+ ###############################################################################
3
+ # #
4
+ # Lingo -- A full-featured automatic indexing system #
5
+ # #
6
+ # Copyright (C) 2005-2007 John Vorhauer #
7
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
8
+ # #
9
+ # Lingo is free software; you can redistribute it and/or modify it under the #
10
+ # terms of the GNU Affero General Public License as published by the Free #
11
+ # Software Foundation; either version 3 of the License, or (at your option) #
12
+ # any later version. #
13
+ # #
14
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
15
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
16
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
17
+ # more details. #
18
+ # #
19
+ # You should have received a copy of the GNU Affero General Public License #
20
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
21
+ # #
22
+ ###############################################################################
26
23
  #++
27
24
 
@@ -1,55 +1,35 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
28
25
  #++
29
26
 
30
27
  require 'stringio'
31
28
  require 'benchmark'
29
+ require 'nuggets/file/ext'
32
30
  require 'nuggets/env/user_home'
33
31
  require 'nuggets/numeric/duration'
34
-
35
- require_relative 'lingo/config'
36
- require_relative 'lingo/attendees'
37
- require_relative 'lingo/attendee/abbreviator'
38
- require_relative 'lingo/attendee/debugger'
39
- require_relative 'lingo/attendee/decomposer'
40
- require_relative 'lingo/attendee/dehyphenizer'
41
- require_relative 'lingo/attendee/multiworder'
42
- require_relative 'lingo/attendee/noneword_filter'
43
- require_relative 'lingo/attendee/objectfilter'
44
- require_relative 'lingo/attendee/variator'
45
- require_relative 'lingo/attendee/sequencer'
46
- require_relative 'lingo/attendee/synonymer'
47
- require_relative 'lingo/attendee/textreader'
48
- require_relative 'lingo/attendee/textwriter'
49
- require_relative 'lingo/attendee/tokenizer'
50
- require_relative 'lingo/attendee/vector_filter'
51
- require_relative 'lingo/attendee/wordsearcher'
52
- require_relative 'lingo/version'
32
+ require 'nuggets/string/camelscore'
53
33
 
54
34
  class Lingo
55
35
 
@@ -65,13 +45,22 @@ class Lingo
65
45
  # The search path for Lingo dictionary and configuration files.
66
46
  PATH = ENV['LINGO_PATH'] || [CURR, HOME, BASE].join(File::PATH_SEPARATOR)
67
47
 
48
+ ENV['LINGO_PLUGIN_PATH'] ||= File.join(HOME, 'plugins')
49
+
50
+ # Map of file types to their standard location and file extension.
68
51
  FIND_OPTIONS = {
69
52
  config: { dir: 'config', ext: 'cfg' },
70
53
  dict: { dir: 'dict', ext: 'txt' },
71
54
  lang: { dir: 'lang', ext: 'lang' },
72
- store: { dir: 'store', ext: nil }
55
+ store: { dir: 'store', ext: nil },
56
+ sample: { dir: 'txt', ext: 'txt' }
73
57
  }
74
58
 
59
+ # Default encoding
60
+ ENC = 'UTF-8'.freeze
61
+
62
+ STRING_SEPARATOR_RE = %r{[; ,|]}
63
+
75
64
  class << self
76
65
 
77
66
  def talk(*args)
@@ -82,10 +71,6 @@ class Lingo
82
71
  Call.new(['-c', cfg, *args]).call(&block)
83
72
  end
84
73
 
85
- def error(msg)
86
- abort(msg)
87
- end
88
-
89
74
  def list(type, options = {})
90
75
  options = options_for(type, options)
91
76
  path = path_for(options)
@@ -130,10 +115,12 @@ class Lingo
130
115
  def find_file(file, path, options)
131
116
  pn = Pathname.new(file_with_ext(file, options)).cleanpath
132
117
 
133
- walk(path, options) { |dir|
134
- pn2 = pn.expand_path(dir)
135
- pn = pn2 and break if pn2.exist?
136
- } if pn.relative?
118
+ if pn.relative?
119
+ walk(path, options) { |dir|
120
+ pn2 = pn.expand_path(dir)
121
+ pn = pn2 and break if pn2.exist?
122
+ }
123
+ end
137
124
 
138
125
  realpath_for(pn, path)
139
126
  end
@@ -142,18 +129,14 @@ class Lingo
142
129
  base = basename(:dict, find(:dict, file, path))
143
130
 
144
131
  walk(path.reverse, options, false) { |dir|
145
- Pathname.new(dir).ascend { |r|
146
- break true if r.file?
147
-
148
- return File.join(dir, base).tap { |s|
149
- s.chomp!(File.extname(s))
150
- } if r.writable?
151
-
152
- break true if r.exist?
132
+ Pathname.new(dir).ascend { |i|
133
+ break true if i.file?
134
+ return File.chomp_ext(File.join(dir, base)) if i.writable?
135
+ break true if i.exist?
153
136
  }
154
137
  }
155
138
 
156
- raise 'No writable store found in search path'
139
+ raise NoWritableStoreError.new(file, path)
157
140
  end
158
141
 
159
142
  def options_for(type, options = {})
@@ -190,6 +173,11 @@ class Lingo
190
173
  pn.realpath(path.first).to_s
191
174
  end
192
175
 
176
+ def require_optional(lib)
177
+ require lib unless ENV["LINGO_NO_#{lib.upcase}"]
178
+ rescue LoadError
179
+ end
180
+
193
181
  end
194
182
 
195
183
  attr_reader :dictionaries, :report_status, :report_time
@@ -205,10 +193,19 @@ class Lingo
205
193
 
206
194
  def dictionary_config
207
195
  @dictionary_config ||= config['language/dictionary']
196
+ rescue => err
197
+ raise ConfigLoadError.new(err)
208
198
  end
209
199
 
210
200
  def database_config(id)
211
- dictionary_config['databases'][id]
201
+ dictionary_config['databases'][id].tap { |cfg|
202
+ raise NoDatabaseConfigError.new(id) unless cfg
203
+ raise InvalidDatabaseConfigError.new(id) unless cfg.has_key?('name')
204
+ }
205
+ end
206
+
207
+ def lexical_hash(src)
208
+ @lexical_hash[src]
212
209
  end
213
210
 
214
211
  def talk
@@ -226,7 +223,7 @@ class Lingo
226
223
 
227
224
  list.each { |hash|
228
225
  # {'attendee' => {'name'=>'Attendee', 'in'=>'nase', 'out'=>'ohr', 'param'=>'hase'}}
229
- cfg = hash.values.first.merge('name' => hash.keys.first.capitalize)
226
+ cfg = hash.values.first.merge('name' => hash.keys.first.camelcase)
230
227
 
231
228
  %w[in out].each { |key| (cfg[key] ||= '').downcase! }
232
229
 
@@ -240,16 +237,16 @@ class Lingo
240
237
  attendee = Attendee.const_get(cfg['name']).new(cfg, self)
241
238
  @attendees << attendee
242
239
 
243
- cfg['in'].split(STRING_SEPERATOR_PATTERN).each { |interest|
240
+ cfg['in'].split(STRING_SEPARATOR_RE).each { |interest|
244
241
  subscriber[interest] << attendee
245
242
  }
246
- cfg['out'].split(STRING_SEPERATOR_PATTERN).each { |theme|
243
+ cfg['out'].split(STRING_SEPARATOR_RE).each { |theme|
247
244
  supplier[theme] << attendee
248
245
  }
249
246
  }
250
247
 
251
- supplier.each { |channel, attendees| attendees.each { |att|
252
- att.add_subscriber(subscriber[channel])
248
+ supplier.each { |channel, attendees| attendees.each { |attendee|
249
+ attendee.add_subscriber(subscriber[channel])
253
250
  } }
254
251
  end
255
252
 
@@ -257,65 +254,39 @@ class Lingo
257
254
  @report_status, @report_time = report_status, report_time
258
255
 
259
256
  time = Benchmark.realtime {
260
- @attendees.first.listen(AgendaItem.new(STR_CMD_TALK))
257
+ @attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_TALK))
261
258
  }
262
259
 
263
260
  if report_status || report_time
264
- config.stderr.puts "Require protocol...\n#{separator = '-' * 61}"
265
- @attendees.first.listen(AgendaItem.new(STR_CMD_STATUS))
266
- config.stderr.puts "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
261
+ warn "Require protocol...\n#{separator = '-' * 61}"
262
+ @attendees.first.listen(AgendaItem.new(Attendee::STR_CMD_STATUS))
263
+ warn "#{separator}\nThe duration of the meeting was #{time.to_hms(2)}"
267
264
  end
268
265
  end
269
266
 
270
267
  def reset(close = true)
271
268
  dictionaries.each(&:close) if close
272
269
  @dictionaries, @attendees = [], []
270
+ @lexical_hash = Hash.new { |h, k| h[k] = Language::LexicalHash.new(k, self) }
273
271
  end
274
272
 
275
- class Call < Lingo
276
-
277
- def initialize(args = [])
278
- super(args, StringIO.new, StringIO.new, StringIO.new)
279
- end
280
-
281
- def call
282
- invite
283
-
284
- if block_given?
285
- begin
286
- yield self
287
- ensure
288
- reset
289
- end
290
- else
291
- self
292
- end
293
- end
294
-
295
- def talk(str)
296
- config.stdin.reopen(str)
297
-
298
- start
299
-
300
- %w[stdout stderr].flat_map { |key|
301
- io = config.send(key).tap(&:rewind)
302
- io.readlines.each(&:chomp!).tap {
303
- io.truncate(0)
304
- io.rewind
305
- }
306
- }.tap { |res|
307
- if block_given?
308
- res.map!(&Proc.new)
309
- else
310
- res.sort!
311
- res.uniq!
312
- end
313
- }
314
- end
315
-
273
+ def warn(*msg)
274
+ config.stderr.puts(*msg)
316
275
  end
317
276
 
318
277
  end
319
278
 
279
+ require_relative 'lingo/call'
280
+ require_relative 'lingo/error'
281
+ require_relative 'lingo/config'
282
+ require_relative 'lingo/core_ext'
283
+ require_relative 'lingo/cachable'
284
+ require_relative 'lingo/reportable'
285
+ require_relative 'lingo/agenda_item'
286
+ require_relative 'lingo/database'
287
+ require_relative 'lingo/language'
288
+ require_relative 'lingo/attendee'
289
+ require_relative 'lingo/version'
290
+
320
291
  require 'nuggets/util/pluggable'
321
292
  Util::Pluggable.load_plugins_for(Lingo)
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ class AgendaItem
30
+
31
+ include Comparable
32
+
33
+ attr_reader :cmd, :param
34
+
35
+ def initialize(cmd, param = nil)
36
+ @cmd, @param = cmd || '', param || ''
37
+ end
38
+
39
+ def <=>(other)
40
+ other.is_a?(self.class) ? to_a <=> other.to_a : 1
41
+ end
42
+
43
+ def to_a
44
+ [cmd, param]
45
+ end
46
+
47
+ def inspect
48
+ "*#{cmd.upcase}('#{param}')"
49
+ end
50
+
51
+ end
52
+
53
+ end
@@ -0,0 +1,261 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ # Lingo ist als universelles Indexierungssystem entworfen worden. Seine Stärke liegt in der einfachen Konfigurierbarkeit für
30
+ # spezifische Aufgaben und in der schnellen Entwicklung weiterer Funktionen durch systematische Kapselung der Komplexität auf
31
+ # kleine Verarbeitungseinheiten. Die kleinste Verarbeitungseinheit wird Attendee genannt. Um ein gewünschtes Verarbeitungsergebnis
32
+ # zu bekommen, werden die benötigten Attendees einfach in einer Reihe hinter einander geschaltet. Ein einfaches Beispiel hierfür ist
33
+ # eine direkte Verbindung zwischen einem Textreader, einem Tokenizer und einem Textwriter. Alle drei Klassen sind von der Klasse
34
+ # Attendee abgeleitet.
35
+ #
36
+ # Der Textreader liest beispielsweise Zeilen aus einer Textdatei und leitet sie weiter an den Tokenizer. Der Tokenizer zerlegt eine
37
+ # Textzeile in einzelne Wörter und gibt diese weiter an den Textwriter, der diese in eine (andere) Datei schreibt. Über vielfältige
38
+ # Konfigurationsmöglichkeiten kann das Verhalten der Attendees an die eigenen Bedürfnisse angepasst werden.
39
+ #
40
+ # Die Verkettung einzelner Attendees findet über die Schnittstellen +listen+ und +talk+ statt. An +listen+ können beliebige Objekte
41
+ # zur Ver- und Bearbeitung übergeben werden. Nach der Verarbeitung werden sie mittels +talk+ an die verketteten Attendees weiter
42
+ # gegeben. Objekte der Klasse AgendaItem dienen dabei der Steuerung der Verarbeitung und sind nicht Bestandteil der normalen
43
+ # Verarbeitung. Beispiele für AgendaItems sind die Kommandos TALK (Aufforderung zum Start der Verarbeitung), WARN (zur Ausgabe von
44
+ # Warnungen eines Attendees) und EOL (End of Line, Ende einer Textzeile nach Zerlegung in einzelne Wörter). Eine vollständige
45
+ # Übersicht benutzter AgendaItems (oder auch Stream Commands) steht in der Klasse Attendee (Konstanten mit dem Prefix STR_CMD_).
46
+ #
47
+ # Um die Entwicklung von neuen Attendees zu beschleunigen, sind dank der Vererbung von Attendee in der Regel nur
48
+ # drei abstrakte Methoden zu implementieren: +init+, +control+ und +process+. Die Methode +init+ wird bei der Instanziierung eines
49
+ # Objektes einmalig aufgerufen. Sie dient der Vorbereitung der Verarbeitung, z.B. durch das Öffnen und Bereitstellen von
50
+ # Wörterbüchern zur linguistischen Analyse. An die Methode +control+ werden alle eingehenden AgendaItems weitergeleitet. Dort erfolgt
51
+ # die Verarbeitungssteuerung, also z.B. bei STR_CMD_FILE das Öffnen einer Datei und bei STR_CMD_EOF respektive das Schließen. Die
52
+ # echte Verarbeitung von Daten findet daher durch die Methode +process+ statt.
53
+ #
54
+ # was macht attendee
55
+ # - verkettung der attendees anhand von konfigurationsinformationen
56
+ # - bereitstellung von globalen und spezifischen konfigurationsinformationen
57
+ # - behandlung von bestimmten übergreifenden Kommandos, z.B. STR_CMD_TALK, STR_CMD_STATUS
58
+ # - separierung und routing von kommando bzw. datenobjekten
59
+ #
60
+ # was macht die abgeleitete klasse
61
+ # - verarbeitet und/oder transformiert datenobjekte
62
+ # - wird gesteuert durch kommandos
63
+ # - schreibt verarbeitungsstatistiken
64
+
65
+ class Attendee
66
+
67
+ include Language
68
+ include Reportable
69
+
70
+ STR_CMD_TALK = 'TALK'
71
+ STR_CMD_STATUS = 'STATUS'
72
+ STR_CMD_LIR = 'LIR-FORMAT'
73
+ STR_CMD_FILE = 'FILE'
74
+ STR_CMD_EOL = 'EOL'
75
+ STR_CMD_RECORD = 'RECORD'
76
+ STR_CMD_EOF = 'EOF'
77
+
78
+ STA_NUM_COMMANDS = 'Received Commands'
79
+ STA_NUM_OBJECTS = 'Received Objects '
80
+ STA_TIM_COMMANDS = 'Time to control '
81
+ STA_TIM_OBJECTS = 'Time to process '
82
+
83
+ def initialize(config, lingo)
84
+ @lingo = lingo
85
+
86
+ init_reportable
87
+
88
+ # Make sure config exists
89
+ lingo.dictionary_config
90
+
91
+ @config, @subscriber = config, []
92
+
93
+ init if self.class.method_defined?(:init)
94
+
95
+ @can_control = self.class.method_defined?(:control)
96
+ @can_process = self.class.method_defined?(:process)
97
+
98
+ @skip_command, @timer = false, nil
99
+ end
100
+
101
+ def add_subscriber(subscriber)
102
+ @subscriber.concat(subscriber)
103
+ end
104
+
105
+ def listen(obj)
106
+ unless obj.is_a?(AgendaItem)
107
+ @can_process ? stat_timer(:objects) { process(obj) } : forward(obj)
108
+ else
109
+ args = obj.to_a
110
+ stat_timer(:commands) { control(*args) } if @can_control
111
+
112
+ case obj.cmd
113
+ when STR_CMD_TALK
114
+ nil
115
+ when STR_CMD_STATUS
116
+ report_time
117
+ report_status
118
+
119
+ forward(*args)
120
+ else
121
+ forward(*args) unless skip_command!
122
+ end
123
+ end
124
+ end
125
+
126
+ def talk(obj)
127
+ charge_timer { @subscriber.each { |attendee| attendee.listen(obj) } }
128
+ end
129
+
130
+ private
131
+
132
+ def sta_for(key)
133
+ %w[NUM TIM].map { |i| self.class.const_get("STA_#{i}_#{key.upcase}") }
134
+ end
135
+
136
+ def stat_timer(key)
137
+ n, t = sta_for(key)
138
+ inc(n)
139
+
140
+ return yield unless @lingo.report_time
141
+
142
+ @timer = Time.new
143
+ res = yield
144
+ add(t, Time.new - @timer)
145
+ res
146
+ end
147
+
148
+ def charge_timer
149
+ return yield unless @lingo.report_time
150
+
151
+ res = nil
152
+ @timer += Benchmark.realtime { res = yield }
153
+ res
154
+ end
155
+
156
+ def report_time
157
+ return unless @lingo.report_time
158
+
159
+ msg = 'Perf: %-15s ' <<
160
+ '=> %7d commands in %s (%s/cmd)' <<
161
+ ', %8d objects in %s (%s/obj)'
162
+
163
+ arg = [@config['name']]
164
+
165
+ %w[commands objects].each { |k|
166
+ n, t = sta_for(k).map(&method(:get))
167
+ arg << n
168
+
169
+ arg.concat([1, n].map { |m|
170
+ s = m.zero? ? 0.0 : t / m.to_f
171
+
172
+ '%9.3f %-2s' %
173
+ if s < 0.001
174
+ [s * 1000.0 ** 2, 'µs']
175
+ elsif s < 1.0
176
+ [s * 1000.0, 'ms']
177
+ elsif s < 60.0
178
+ [s, 's']
179
+ elsif s < 60.0 ** 2
180
+ [s / 60.0, 'm']
181
+ else
182
+ [s / 60.0 ** 2, 'h']
183
+ end
184
+ })
185
+ }
186
+
187
+ @lingo.warn msg % arg
188
+ end
189
+
190
+ def report_status
191
+ return unless @lingo.report_status
192
+
193
+ msg = "Attendee <%s> was connected from '%s' to '%s' reporting..."
194
+
195
+ @lingo.warn msg % @config.values_at(*%w[name in out]),
196
+ nil, report.sort.map { |k, v| " #{k} = #{v}" }, nil
197
+ end
198
+
199
+ def skip_command
200
+ @skip_command = true
201
+ end
202
+
203
+ def skip_command!
204
+ @skip_command.tap { @skip_command &&= false }
205
+ end
206
+
207
+ def forward(obj, param = nil)
208
+ talk(param ? AgendaItem.new(obj, param) : obj)
209
+ end
210
+
211
+ def has_key?(key)
212
+ @config && @config.has_key?(key)
213
+ end
214
+
215
+ def get_key(key, default = nodefault = Object.new)
216
+ raise MissingConfigError.new(key) if nodefault && !has_key?(key)
217
+ @config.fetch(key, default)
218
+ end
219
+
220
+ def get_array(key, default = nil)
221
+ get_key(key, default).split(STRING_SEPARATOR_RE)
222
+ end
223
+
224
+ def dictionary(src, mod)
225
+ Language::Dictionary.new({ 'source' => src, 'mode' => mod }, @lingo)
226
+ end
227
+
228
+ def grammar(src, mod)
229
+ Language::Grammar.new({ 'source' => src, 'mode' => mod }, @lingo)
230
+ end
231
+
232
+ def set_dic
233
+ @dic = dictionary(get_array('source'), get_key('mode', 'all'))
234
+ end
235
+
236
+ def set_gra
237
+ @gra = grammar(get_array('source'), get_key('mode', 'all'))
238
+ end
239
+
240
+ end
241
+
242
+ end
243
+
244
+ require_relative 'buffered_attendee'
245
+
246
+ require_relative 'attendee/abbreviator'
247
+ require_relative 'attendee/debugger'
248
+ require_relative 'attendee/decomposer'
249
+ require_relative 'attendee/dehyphenizer'
250
+ require_relative 'attendee/multi_worder'
251
+ require_relative 'attendee/noneword_filter'
252
+ require_relative 'attendee/object_filter'
253
+ require_relative 'attendee/variator'
254
+ require_relative 'attendee/sequencer'
255
+ require_relative 'attendee/synonymer'
256
+ require_relative 'attendee/text_reader'
257
+ require_relative 'attendee/text_writer'
258
+ require_relative 'attendee/formatter'
259
+ require_relative 'attendee/tokenizer'
260
+ require_relative 'attendee/vector_filter'
261
+ require_relative 'attendee/word_searcher'