lingo 1.8.1 → 1.8.2

Files changed (99)
  1. data/ChangeLog +23 -5
  2. data/README +1 -1
  3. data/Rakefile +5 -7
  4. data/TODO +2 -0
  5. data/bin/lingo +5 -1
  6. data/de.lang +1 -1
  7. data/en/lingo-syn.txt +0 -0
  8. data/en.lang +2 -1
  9. data/lib/lingo/attendee/abbreviator.rb +8 -9
  10. data/lib/lingo/attendee/debugger.rb +5 -4
  11. data/lib/lingo/attendee/decomposer.rb +8 -3
  12. data/lib/lingo/attendee/dehyphenizer.rb +19 -63
  13. data/lib/lingo/attendee/formatter.rb +1 -1
  14. data/lib/lingo/attendee/multi_worder.rb +67 -155
  15. data/lib/lingo/attendee/noneword_filter.rb +16 -9
  16. data/lib/lingo/attendee/object_filter.rb +1 -1
  17. data/lib/lingo/attendee/sequencer.rb +32 -63
  18. data/lib/lingo/attendee/stemmer/porter.rb +343 -0
  19. data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
  20. data/lib/lingo/attendee/synonymer.rb +10 -9
  21. data/lib/lingo/attendee/text_reader.rb +102 -76
  22. data/lib/lingo/attendee/text_writer.rb +23 -26
  23. data/lib/lingo/attendee/tokenizer.rb +13 -27
  24. data/lib/lingo/attendee/variator.rb +26 -66
  25. data/lib/lingo/attendee/vector_filter.rb +42 -43
  26. data/lib/lingo/attendee/word_searcher.rb +6 -7
  27. data/lib/lingo/attendee.rb +25 -7
  28. data/lib/lingo/buffered_attendee.rb +36 -10
  29. data/lib/lingo/cachable.rb +8 -8
  30. data/lib/lingo/config.rb +5 -6
  31. data/lib/lingo/ctl.rb +2 -3
  32. data/lib/lingo/database/crypter.rb +9 -26
  33. data/lib/lingo/database/gdbm_store.rb +3 -5
  34. data/lib/lingo/database/libcdb_store.rb +4 -6
  35. data/lib/lingo/database/sdbm_store.rb +11 -6
  36. data/lib/lingo/database/show_progress.rb +3 -43
  37. data/lib/lingo/database/source/key_value.rb +2 -6
  38. data/lib/lingo/database/source/multi_key.rb +3 -5
  39. data/lib/lingo/database/source/multi_value.rb +2 -6
  40. data/lib/lingo/database/source/single_word.rb +4 -6
  41. data/lib/lingo/database/source/word_class.rb +4 -10
  42. data/lib/lingo/database/source.rb +20 -18
  43. data/lib/lingo/database.rb +84 -59
  44. data/lib/lingo/error.rb +57 -1
  45. data/lib/lingo/language/dictionary.rb +21 -18
  46. data/lib/lingo/language/grammar.rb +40 -49
  47. data/lib/lingo/language/lexical.rb +6 -6
  48. data/lib/lingo/language/lexical_hash.rb +6 -0
  49. data/lib/lingo/language/word.rb +32 -15
  50. data/lib/lingo/language/word_form.rb +1 -1
  51. data/lib/lingo/language.rb +14 -25
  52. data/lib/lingo/reportable.rb +12 -10
  53. data/lib/lingo/show_progress.rb +81 -0
  54. data/lib/lingo/version.rb +1 -1
  55. data/lib/lingo.rb +63 -24
  56. data/lingo-call.cfg +6 -10
  57. data/lingo.cfg +60 -44
  58. data/lir.cfg +42 -41
  59. data/test/attendee/ts_abbreviator.rb +3 -5
  60. data/test/attendee/ts_decomposer.rb +3 -5
  61. data/test/attendee/ts_multi_worder.rb +87 -145
  62. data/test/attendee/ts_noneword_filter.rb +5 -3
  63. data/test/attendee/ts_object_filter.rb +5 -3
  64. data/test/attendee/ts_sequencer.rb +3 -5
  65. data/test/attendee/ts_stemmer.rb +309 -0
  66. data/test/attendee/ts_synonymer.rb +15 -11
  67. data/test/attendee/ts_text_reader.rb +12 -15
  68. data/test/attendee/ts_text_writer.rb +24 -29
  69. data/test/attendee/ts_tokenizer.rb +9 -7
  70. data/test/attendee/ts_variator.rb +4 -4
  71. data/test/attendee/ts_vector_filter.rb +24 -16
  72. data/test/attendee/ts_word_searcher.rb +20 -36
  73. data/test/{lir.csv → lir.vec} +0 -0
  74. data/test/ref/artikel.vec +943 -943
  75. data/test/ref/artikel.ven +943 -943
  76. data/test/ref/lir.non +201 -201
  77. data/test/ref/lir.seq +178 -178
  78. data/test/ref/lir.syn +49 -49
  79. data/test/ref/lir.vec +329 -0
  80. data/test/test_helper.rb +20 -36
  81. data/test/ts_database.rb +10 -10
  82. data/test/ts_language.rb +279 -319
  83. metadata +93 -104
  84. data/info/Objekte.png +0 -0
  85. data/info/Typen.png +0 -0
  86. data/info/database.png +0 -0
  87. data/info/db_small.png +0 -0
  88. data/info/download.png +0 -0
  89. data/info/kerze.png +0 -0
  90. data/info/language.png +0 -0
  91. data/info/lingo.png +0 -0
  92. data/info/logo.png +0 -0
  93. data/info/meeting.png +0 -0
  94. data/info/types.png +0 -0
  95. data/lingo-all.cfg +0 -89
  96. data/porter/stem.cfg +0 -311
  97. data/porter/stem.rb +0 -150
  98. data/test/ref/lir.csv +0 -329
  99. data/test.cfg +0 -79
data/ChangeLog CHANGED
@@ -1,5 +1,23 @@
 = Revision history for Lingo
 
+== 1.8.2 [2012-04-19]
+
+* Performance improvements regarding Attendee::VectorFilter's (as well as
+  Attendee::NonewordFilter's) memory usage; set <tt>sort: false</tt> in the config.
+* Added Attendee::Stemmer (implementing Porter's algorithm for suffix stripping).
+* Added progress reporting to Attendee::TextReader; set <tt>progress: true</tt>
+  in the config.
+* Added directory and glob processing to Attendee::TextReader (new options
+  +glob+ and +recursive+).
+* Renamed Attendee::TextReader's option +lir-record-pattern+ to +records+.
+* Fixed Attendee::Debugger to forward all objects so it can be inserted
+  between any two attendees in the config.
+* Fixed regression introduced in 1.8.0 where Lingo would not use existing
+  compiled dictionary when source file is not present.
+* Fixed "invalid byte sequence in UTF-8" on Windows for SDBM store.
+* Enabled pluggable (compiled) dictionaries and storage backends.
+* Extensive internal refactoring and cleanup. (Finished for now.)
+
 == 1.8.1 [2012-02-19]
 
 * Introduced alternative storage backends, mainly to circumvent SDBM's record
@@ -62,13 +80,13 @@
   (requires diff-lcs[http://raa.ruby-lang.org/project/diff-lcs/]).
 * Provide alternatives to standard zip command on windows platforms.
 * Use +UNK+ itself if it doesn't have any lexicals.
-* Use compo form instead of word form when lexicalizing compositum entry for
+* Use compo form instead of word form when lexicalizing compound entry for
   multiword dictionaries.
 * LexicalHash#[] must use target (not source) form.
 * Optionally, try to find matches for all lexicals a word has.
 * Make '-' a PRINTABLE_CHAR.
 * Allow synonyms to be considered for multiword matching.
-* Don't use compositum parts.
+* Don't use compound parts.
 * Introduced some (more or less arbitrary) line length limit. We can only
   store values of a certain length anyway (with SDBM). Entries exceeding this
   limit will be rejected and logged in the .rev file.
@@ -252,12 +270,12 @@
 * <b>Decomposer with additional validity check and marking</b>
 
   If required, the Decomposer can subject compounds to an additional check.
-  If the key <tt>de.lang:language/dictionary/compositum/skip-sequences</tt>
+  If the key <tt>de.lang:language/dictionary/compound/skip-sequences</tt>
   is given, defined e.g. in the form <tt>skip-sequences: [ VS ]</tt>, it is
   additionally checked whether the compound and its parts match these word
   classes; compounds consisting of a verb-noun combination are then rejected.
   Specifying this parameter is optional.
-  In addition, given the key <tt>de.lang:language/dictionary/compositum/append-wordclass</tt>,
+  In addition, given the key <tt>de.lang:language/dictionary/compound/append-wordclass</tt>,
   which usually holds a single-character string, the word stems recognized
   during decomposition are marked by appending the character defined via
   this key to their word class.
@@ -476,7 +494,7 @@
 * <b>Compound decomposition with a further restriction</b>
 
   A further parameter has been added for compound decomposition. The following
-  attributes of the tag <tt>XML:dictionary/compositum</tt> can now be specified:
+  attributes of the tag <tt>XML:dictionary/compound</tt> can now be specified:
 
   Attribute    Default    Function
   ============================================================================
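The 1.8.2 entries above mostly surface as new configuration keys. A minimal sketch of how they could be combined in a lingo.cfg pipeline; the attendee list and option values are illustrative, only the option names (progress, glob, recursive, records, sort) and the new stemmer attendee come from the entries above:

  meeting:
    attendees:
      - text_reader:   { files: $(files), progress: true, glob: '*.txt', recursive: true }
      - tokenizer
      - word_searcher: { source: sys-dic, mode: first }
      - stemmer                          # new in 1.8.2, Porter suffix stripping
      - vector_filter: { sort: false }   # avoids buffering and sorting the whole vector
      - text_writer:   { ext: vec }

For LIR-style input, the option formerly called lir-record-pattern is now passed to text_reader as records.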
data/README CHANGED
@@ -25,7 +25,7 @@
 
 == VERSION
 
-This documentation refers to Lingo version 1.8.1
+This documentation refers to Lingo version 1.8.2
 
 
 == DESCRIPTION
data/Rakefile CHANGED
@@ -39,10 +39,8 @@ The main functions of Lingo are:
     of word classes
   EOT
   extra_files: FileList[
-    'lingo.rb', 'lingo{,-all,-call}.cfg', 'lingo.opt', 'doc/**/*',
-    '{de,en}.lang', '{de,en}/{lingo-*,user-dic}.txt', 'txt/artikel{,-en}.txt',
-    'info/gpl-hdr.txt', 'info/*.png', 'lir.cfg', 'txt/lir.txt', 'porter/*',
-    'test.cfg', '{de,en}/test_*.txt'
+    'lingo.rb', 'lingo{,-call}.cfg', 'lir.cfg', '{de,en}.lang',
+    '{de,en}/{lingo-*,user-dic,test_*}.txt', 'txt/{artikel{,-en},lir}.txt'
   ].to_a,
   required_ruby_version: '>= 1.9',
  dependencies: [['ruby-nuggets', '>= 0.8.5'], 'unicode', 'highline'],
@@ -54,7 +52,7 @@ rescue LoadError => err
 end
 
 CLEAN.include(
-  'txt/*.{log,mul,non,seq,syn,ve?,csv}',
+  'txt/*.{log,mul,non,seq,ste,syn,ve?}',
   'test/{test.*,text.non}',
   'store/*/*.rev',
   'bench/tmp.*'
@@ -78,7 +76,7 @@ end
 
 desc 'Test against reference file (TXT)'
 task 'test:txt' do
-  test_ref('artikel', 'test')
+  test_ref('artikel', 'lingo')
 end
 
 desc 'Test against reference file (LIR)'
@@ -116,7 +114,7 @@ def test_ref(name, cfg = name)
   }.success? or abort msg.join("\n\n")
 
   Dir["test/ref/#{name}.*"].each { |ref|
-    puts "#{'#' * 60} #{org = ref.sub(/test\/ref/, 'txt')}"
+    puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
     continue += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
   }
 
data/TODO CHANGED
@@ -1,5 +1,7 @@
 = ToDo list for Lingo
 
+* Configuration parameter validation.
+* Replace regex-based tokenizer with a (Racc/Ragel/ANTLR-based?) lexer.
 * Update and translate old documentation.
 * Allow for handling of documents in various encodings, not just the one the
   dictionaries are encoded in.
data/bin/lingo CHANGED
@@ -26,4 +26,8 @@
 
 require 'lingo'
 
-Lingo.talk
+begin
+  Lingo.talk
+rescue Lingo::LingoError => err
+  $VERBOSE ? raise : abort(err.to_s)
+end
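The effect of the new guard in bin/lingo: by default a Lingo::LingoError now aborts with just its message, while running Ruby with warnings enabled (ruby -w, which sets $VERBOSE) re-raises the exception so the full backtrace is available for debugging.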
data/de.lang CHANGED
@@ -65,7 +65,7 @@ language:
     tst-cry: { name: de/test_cry.txt, txt-format: WordClass, crypt }    # TEST: encryption
     tst-sgw: { name: de/test_singleword.txt, txt-format: SingleWord }   # TEST: SingleWord format
 
-  compositum:
+  compound:
     min-word-size: "7"
     min-part-size: "3"
     max-parts: "5"
data/en/lingo-syn.txt ADDED
File without changes
data/en.lang CHANGED
@@ -52,12 +52,13 @@ language:
 
   # System dictionaries
   sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
+  sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
   sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
 
   # User dictionaries
   usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
 
-  compositum:
+  compound:
     min-word-size: "7"
     min-part-size: "3"
     max-parts: "5"
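The new sys-syn database points at en/lingo-syn.txt, a KeyValue dictionary with '=' as separator and default word class y; the shipped file is added empty in this release (see en/lingo-syn.txt above). Hypothetical entries, one key=value pair per line:

  buy=purchase
  car=automobile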
data/lib/lingo/attendee/abbreviator.rb CHANGED
@@ -76,17 +76,15 @@ class Lingo
       set_dic
     end
 
-    def control(cmd, par)
-      @dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
-
-      # Every control object also triggers processing
+    def control(cmd, param)
+      report_on(cmd, @dic)
       process_buffer
     end
 
     private
 
     def process_buffer?
-      @buffer[-1].kind_of?(Token) && @buffer[-1].form == CHAR_PUNCT
+      form_at(-1, Token) == CHAR_PUNCT
     end
 
     def process_buffer
@@ -95,13 +93,14 @@ class Lingo
         return
       end
 
-      # Look up the word preceding the period in the abbreviation dictionary
-      if @buffer[-2].kind_of?(Token)
+      if form = form_at(-2, Token)
         inc('Anzahl gesuchter Abkürzungen')
-        abbr = @dic.find_word(@buffer[-2].form)
-        if abbr.identified?
+
+        if (abbr = find_word(form)).identified?
           inc('Anzahl gefundener Abkürzungen')
+
           abbr.form += CHAR_PUNCT
+
           @buffer[-2] = abbr
           @buffer.delete_at(-1)
         end
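In effect, the rewritten process_buffer collapses an abbreviation and its trailing period into one token: with a buffer ending in the Token 'Chr' followed by the punctuation token '.', and 'Chr' present in the abbreviation dictionary, find_word identifies it, CHAR_PUNCT is appended to its form, and the two buffer entries are replaced by the single Word 'Chr.' (the example abbreviation is hypothetical).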
data/lib/lingo/attendee/debugger.rb CHANGED
@@ -95,14 +95,15 @@ class Lingo
       @prompt = get_key('prompt', 'lex:) ')
     end
 
-    def control(cmd, par)
-      if cmd != STR_CMD_STATUS
-        @lingo.warn "#{@prompt} #{AgendaItem.new(cmd, par).inspect}" if eval(@cmd_eval)
+    def control(cmd, param)
+      if cmd != STR_CMD_STATUS && eval(@cmd_eval)
+        warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
       end
     end
 
     def process(obj)
-      @lingo.warn "#{@prompt} #{obj.inspect}" if eval(@obj_eval)
+      warn "#{@prompt} #{obj.inspect}" if eval(@obj_eval)
+      forward(obj)
     end
 
   end
data/lib/lingo/attendee/decomposer.rb CHANGED
@@ -79,12 +79,17 @@ class Lingo
       set_gra
     end
 
-    def control(cmd, par)
-      @gra.report.each { |key, val| set(key, val) } if cmd == STR_CMD_STATUS
+    def control(cmd, param)
+      report_on(cmd, @gra)
     end
 
     def process(obj)
-      forward(obj.is_a?(Word) && obj.unknown? ? @gra.find_compositum(obj.form) : obj)
+      if obj.is_a?(Word) && obj.unknown?
+        com = @gra.find_compound(obj.form)
+        obj = com unless com.unknown?
+      end
+
+      forward(obj)
     end
 
   end
data/lib/lingo/attendee/dehyphenizer.rb CHANGED
@@ -41,9 +41,6 @@ class Lingo
     # <b>out</b>:: see the general description of the Attendee
     # <b>source</b>:: see the general description of the Dictionary
     # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-    # <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the delimiters between
-    #                         which the Multiworder is supposed to search, usually punctuation and
-    #                         special characters, since they hardly ever occur in multiword groups.
     #
     # === Examples
     # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>
@@ -74,87 +71,46 @@ class Lingo
     protected
 
     def init
-      @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-
       set_dic
       set_gra
 
-      @skip = get_array('skip', '').map(&:downcase)
-
-      @number_of_expected_tokens_in_buffer = 2
-      @eof_handling = false
-    end
+      @skip = get_array('skip', '', :downcase)
 
-    def control(cmd, par)
-      @dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
-
-      # Every control object also triggers processing
-      if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-        @eof_handling = true
-        while number_of_valid_tokens_in_buffer > 1
-          process_buffer
-        end
-        forward_number_of_token( @buffer.size, false )
-        @eof_handling = false
-      end
+      @expected_tokens_in_buffer, @eof_handling = 2, false
     end
 
-    def process_buffer?
-      number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
+    def control(cmd, param)
+      control_multi(cmd)
     end
 
     def process_buffer
-      if @buffer[0].is_a?(Word) &&
-         @buffer[0].form[-1..-1] == '-' &&
-         @buffer[1].is_a?(Word) &&
-         !(!( ttt = @buffer[1].get_class(/./) ).nil? &&
-         !@skip.index( ttt[0].attr ).nil?)
-
-        # Try a simple concatenation first
-        form = @buffer[0].form[0...-1] + @buffer[1].form
-        word = @dic.find_word(form)
-        word = @gra.find_compositum(form) unless word.identified?
-
-        unless word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
-          # Try the concatenation with hyphen
-          form = @buffer[0].form + @buffer[1].form
-          word = @dic.find_word(form)
-          word = @gra.find_compositum(form) unless word.identified?
-        end
+      a, b, h = *ab = @buffer.values_at(0, 1), '-'
 
-        unless word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
-          # Try the concatenation with hyphen
-          form = @buffer[0].form + @buffer[1].form
-          word = @dic.find_word(form)
-          word = @gra.find_compositum(form) unless word.identified?
-        end
+      if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
+        (c = b.get_class(/./).first) && @skip.include?(c.attr)
+      )
+        a, b = ab.map!(&:form)
 
-        if word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
+        word = dehyphenize(a.chomp(h) + b)
+        word = dehyphenize(a + b) unless dehyphenized?(word)
+
+        if dehyphenized?(word)
           @buffer[0] = word
-          @buffer.delete_at( 1 )
+          @buffer.delete_at(1)
         end
       end
 
-      # Shovel the buffer along
-      forward_number_of_token( 1, false )
+      forward_number_of_token(1, false)
     end
 
     private
 
-    # Forwards 'len' tokens
-    def forward_number_of_token( len, count_punc = true )
-      begin
-        unless @buffer.empty?
-          forward( @buffer[0] )
-          len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-          @buffer.delete_at( 0 )
-        end
-      end while len > 0
+    def dehyphenize(form)
+      find_word(form, &:identified?)
     end
 
-    # Returns the number of valid tokens in the buffer
-    def number_of_valid_tokens_in_buffer
-      @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
+    def dehyphenized?(word)
+      word.identified? || word.full_compound?
     end
 
   end
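The rewritten dehyphenization decision is easier to see in isolation: for an adjacent pair like 'Mehr-' and 'Wert' it first tries the joined form without the hyphen, then with it, accepting a reading only if the lookup yields an identified word or a fully analyzed compound. A standalone sketch, with lookup standing in for the dictionary/grammar search (not Lingo's actual API):

  # Join a word ending in '-' with its successor, mirroring the
  # dehyphenize/dehyphenized? pair above: try 'ab' before 'a-b'.
  def join_hyphenated(a, b, lookup)
    [a.chomp('-') + b, a + b].each { |form|
      word = lookup.call(form)
      return word if word.identified? || word.full_compound?
    }
    nil # no acceptable reading, keep both words separate
  end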
data/lib/lingo/attendee/formatter.rb CHANGED
@@ -43,7 +43,7 @@ class Lingo
     end
 
     def process(obj)
-      if obj.is_a?(Word) || obj.is_a?(Token)
+      if obj.is_a?(WordForm)
         str = obj.form
 
         if obj.respond_to?(:lexicals)
data/lib/lingo/attendee/multi_worder.rb CHANGED
@@ -48,9 +48,6 @@ class Lingo
     # <b>out</b>:: see the general description of the Attendee
     # <b>source</b>:: see the general description of the Dictionary
     # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-    # <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the delimiters between
-    #                         which the MultiWorder is supposed to search, usually punctuation and
-    #                         special characters, since they hardly ever occur in multiword groups.
     #
     # === Examples
     # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>
@@ -81,216 +78,131 @@ class Lingo
     protected
 
     def init
-      @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-      @mul_dic = dictionary(mul_src = get_array('source'), get_key('mode', 'all'))
-
       # combine lexical variants?
       #
       # false = old behaviour
       # true  = first match
       # 'all' = all matches
-      @combine  = get_key('combine', false)
-      @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
+      @combine = get_key('combine', false)
+      @all = @combine.is_a?(String) && @combine.downcase == 'all'
+
+      lex_src, lex_mod, d = nil, nil, @lingo.dictionary_config['databases']
 
-      lex_src, lex_mod, databases = nil, nil, @lingo.dictionary_config['databases']
+      (mul_src = get_array('source')).each { |src|
+        s, m = d[src].values_at('use-lex', 'lex-mode')
 
-      mul_src.each { |src|
-        this_src, this_mod = databases[src].values_at('use-lex', 'lex-mode')
-        if lex_src.nil? || lex_src == this_src
-          lex_src, lex_mod = this_src, this_mod
+        if lex_src.nil? || lex_src == s
+          lex_src, lex_mod = s, m
         else
-          @lingo.warn "#{self.class}: Dictionaries don't match: #{mul_src.join(',')}"
+          warn "#{self.class}: Dictionaries don't match: #{mul_src.join(',')}"
         end
       }
 
-      lex_src = lex_src.split(STRING_SEPARATOR_RE)
+      lex_src = lex_src.split(SEP_RE)
       lex_mod = get_key('lex-mode', lex_mod || 'first')
 
+      @mul_dic = dictionary(mul_src, get_key('mode', 'all'))
       @lex_dic = dictionary(lex_src, lex_mod)
       @lex_gra = grammar(lex_src, lex_mod)
 
-      if @combine && has_key?('use-syn')
-        @syn_dic = dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
+      @syn_dic = if @combine && has_key?('use-syn')
+        dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
      end
 
-      @number_of_expected_tokens_in_buffer = 3
-      @eof_handling = false
+      @expected_tokens_in_buffer, @eof_handling = 3, false
     end
 
-    def control(cmd, par)
-      @mul_dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
-
-      # Every control object also triggers processing
-      if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-        @eof_handling = true
-        while number_of_valid_tokens_in_buffer > 1
-          process_buffer
-        end
-        forward_number_of_token( @buffer.size, false )
-        @eof_handling = false
-      end
-    end
-
-    def process_buffer?
-      number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
+    def control(cmd, param)
+      control_multi(cmd, @mul_dic)
     end
 
     def process_buffer
-      unless @buffer[0].form == CHAR_PUNCT
-        # Check for a key of length 3
-        result = check_multiword_key( 3 )
-        unless result.empty?
-          # Key of length 3 found
-          lengths = sort_result_len( result )
-          unless lengths[0] > 3
-            # Longest recognized key = 3
-            create_and_forward_multiword( 3, result )
-            forward_number_of_token( 3 )
-            return
+      unless form_at(0) == CHAR_PUNCT
+        unless (res = check_multiword_key(3)).empty?
+          len = res.map { |r|
+            r.is_a?(Lexical) ? r.form.split(' ').size : r[/^\*(\d+)/, 1].to_i
+          }.sort!.reverse!
+
+          unless (max = len.first) > 3
+            create_and_forward_multiword(3, res)
+            forward_number_of_token(3)
           else
-            # Longest recognized key > 3; is the buffer full enough?
-            unless @buffer.size >= lengths[0] || @eof_handling
-              @number_of_expected_tokens_in_buffer = lengths[0]
-              return
+            unless @eof_handling || @buffer.size >= max
+              @expected_tokens_in_buffer = max
             else
-              # Buffer full enough, processing can begin
-              catch( :forward_one ) do
-                lengths.each do |len|
-                  result = check_multiword_key( len )
-                  unless result.empty?
-                    create_and_forward_multiword( len, result )
-                    forward_number_of_token( len )
-                    throw :forward_one
-                  end
-                end
-
-                # No match found
-                forward_number_of_token( 1 )
-              end
-
-              @number_of_expected_tokens_in_buffer = 3
+              forward_number_of_token(len.find { |l|
+                r = check_multiword_key(l)
+                create_and_forward_multiword(l, r) unless r.empty?
+              } || 1)
+
+              @expected_tokens_in_buffer = 3
               process_buffer if process_buffer?
-              return
             end
           end
+
+          return
         end
 
-        # Check for a key of length 2
-        result = check_multiword_key( 2 )
-        unless result.empty?
-          create_and_forward_multiword( 2, result )
-          forward_number_of_token( 1 )
+        unless (res = check_multiword_key(2)).empty?
+          create_and_forward_multiword(2, res)
+          forward_number_of_token(1)
        end
      end
 
-      # Shovel the buffer along
-      forward_number_of_token( 1, false )
-      @number_of_expected_tokens_in_buffer = 3
+      forward_number_of_token(1, false)
+      @expected_tokens_in_buffer = 3
     end
 
     private
 
-    def create_and_forward_multiword( len, lexicals )
-      # Read the form from the buffer and mark its parts
-      pos = 0
-      form_parts = []
+    def create_and_forward_multiword(len, lex)
+      pos, parts = 0, []
+
      begin
-        if @buffer[pos].form == CHAR_PUNCT
-          @buffer.delete_at( pos )
-          form_parts[-1] += CHAR_PUNCT
+        if (form = form_at(pos)) == CHAR_PUNCT
+          @buffer.delete_at(pos)
+          parts[-1] += CHAR_PUNCT
        else
          @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
-          form_parts << @buffer[pos].form
+          parts << form
          pos += 1
        end
      end while pos < len
 
-      form = form_parts.join( ' ' )
-
-      # Create the multiword
-      word = Word.new( form, WA_MULTIWORD )
-      word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - bail out on "*5" in the Synonymer
-
-      # Forward the multiword
-      forward( word )
-    end
-
-    # Forwards 'len' tokens
-    def forward_number_of_token( len, count_punc = true )
-      begin
-        unless @buffer.empty?
-          forward( @buffer[0] )
-          len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-          @buffer.delete_at( 0 )
-        end
-      end while len > 0
-    end
-
-    # Determines the maximum result length
-    def sort_result_len( result )
-      result.collect do |res|
-        if res.is_a?( Lexical )
-          res.form.split( ' ' ).size
-        else
-          res =~ /^\*(\d+)/
-          $1.to_i
-        end
-      end.sort.reverse
+      forward(Word.new_lexicals(parts.join(' '),
+        WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) }))
     end
 
     # Checks a key of the given length starting at position 0 in the buffer
-    def check_multiword_key( len )
-      return [] if number_of_valid_tokens_in_buffer < len
+    def check_multiword_key(len)
+      return [] if valid_tokens_in_buffer < len
 
-      # Read the word forms from the word list
-      sequence = @buffer.map { |obj|
+      seq = @buffer.map { |obj|
        next [obj] unless obj.is_a?(WordForm)
+        next if (form = obj.form) == CHAR_PUNCT
 
-        form = obj.form
-        next if form == CHAR_PUNCT
-
-        word = @lex_dic.find_word(form)
-        word = @lex_gra.find_compositum(form) if word.unknown?
-
-        lexicals = word.attr == WA_KOMPOSITUM ?
-          [word.lexicals.first] : word.lexicals.dup
-
-        lexicals << word if lexicals.empty?
-        lexicals += @syn_dic.find_synonyms(word) if @syn_dic
-
-        lexicals.map { |lex| lex.form }.uniq
-      }.compact[0, len]
-
-      if @combine
-        keys, muls = [], []
+        w = find_word(form, @lex_dic, @lex_gra)
+        l = w.lexicals
 
-        sequence.each { |forms|
-          keys = forms.map { |form|
-            keys.empty? ? form : keys.map { |key| "#{key} #{form}" }
-          }.flatten(1)
+        (w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup).tap { |i|
+          i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
+          i.map! { |j| j.form.downcase }.uniq!
        }
+      }
 
-        keys.each { |key|
-          mul = @mul_dic.select(key.downcase)
-
-          unless mul.empty?
-            muls.concat(mul)
-            break unless @all_keys
-          end
-        }
+      seq.compact!
+      seq.slice!(len..-1)
 
-        muls.uniq
+      if @combine
+        [].tap { |mul| seq.shift.product(*seq) { |key|
+          mul.concat(@mul_dic.select(key.join(' ')))
+          break unless @all || mul.empty?
+        } && mul.uniq! }
      else
-        key = sequence.map { |forms| forms.first }.join(' ')
-        @mul_dic.select(key.downcase)
+        @mul_dic.select(seq.map!(&:first).join(' '))
      end
    end
 
-    # Returns the number of valid tokens in the buffer
-    def number_of_valid_tokens_in_buffer
-      @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
-    end
-
  end
 
  # For backwards compatibility.
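Worth noting: the @combine branch of check_multiword_key now builds candidate keys with Array#product instead of the former hand-rolled keys loop. Each buffer position contributes a list of lexical variants, and every ordered combination is probed against the multiword dictionary, stopping at the first hit unless combine is 'all'. A minimal illustration of the enumeration in plain Ruby, independent of Lingo's classes:

  # Variants per buffer position; product enumerates every combination.
  variants = [['kind'], ['of', 'off']]
  keys = variants.shift.product(*variants).map { |key| key.join(' ') }
  keys # => ["kind of", "kind off"]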