lingo 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,22 +10,22 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2012-01-01 00:00:00.000000000 Z
13
+ date: 2012-02-19 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ruby-nuggets
17
- requirement: &12570880 !ruby/object:Gem::Requirement
17
+ requirement: &10045620 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ! '>='
21
21
  - !ruby/object:Gem::Version
22
- version: 0.8.2
22
+ version: 0.8.5
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *12570880
25
+ version_requirements: *10045620
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: unicode
28
- requirement: &12594600 !ruby/object:Gem::Requirement
28
+ requirement: &10045140 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ! '>='
@@ -33,10 +33,21 @@ dependencies:
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *12594600
36
+ version_requirements: *10045140
37
+ - !ruby/object:Gem::Dependency
38
+ name: highline
39
+ requirement: &10044660 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ type: :runtime
46
+ prerelease: false
47
+ version_requirements: *10044660
37
48
  - !ruby/object:Gem::Dependency
38
49
  name: diff-lcs
39
- requirement: &12594100 !ruby/object:Gem::Requirement
50
+ requirement: &10044140 !ruby/object:Gem::Requirement
40
51
  none: false
41
52
  requirements:
42
53
  - - ! '>='
@@ -44,10 +55,10 @@ dependencies:
44
55
  version: 1.1.3
45
56
  type: :development
46
57
  prerelease: false
47
- version_requirements: *12594100
58
+ version_requirements: *10044140
48
59
  - !ruby/object:Gem::Dependency
49
60
  name: open4
50
- requirement: &12593700 !ruby/object:Gem::Requirement
61
+ requirement: &10043720 !ruby/object:Gem::Requirement
51
62
  none: false
52
63
  requirements:
53
64
  - - ! '>='
@@ -55,8 +66,12 @@ dependencies:
55
66
  version: '0'
56
67
  type: :development
57
68
  prerelease: false
58
- version_requirements: *12593700
59
- description: The full-featured automatic indexing system
69
+ version_requirements: *10043720
70
+ description: ! "Lingo is an open source indexing system for research and teachings.\nThe
71
+ main functions of Lingo are:\n\n* identification of (i.e. reduction to) basic word
72
+ form by means of\n dictionaries and suffix lists\n* algorithmic decomposition\n*
73
+ dictionary-based synonymisation and identification of phrases\n* generic identification
74
+ of phrases/word sequences based on patterns\n of word classes\n"
60
75
  email:
61
76
  - lingo@vorhauer.de
62
77
  - jens.wille@uni-koeln.de
@@ -69,32 +84,55 @@ extra_rdoc_files:
69
84
  - COPYING
70
85
  - ChangeLog
71
86
  files:
72
- - lib/lingo/attendees.rb
73
87
  - lib/lingo/ctl.rb
74
88
  - lib/lingo/database.rb
75
- - lib/lingo/types.rb
89
+ - lib/lingo/error.rb
76
90
  - lib/lingo/version.rb
77
- - lib/lingo/utilities.rb
91
+ - lib/lingo/database/source.rb
92
+ - lib/lingo/database/libcdb_store.rb
93
+ - lib/lingo/database/sdbm_store.rb
94
+ - lib/lingo/database/show_progress.rb
95
+ - lib/lingo/database/crypter.rb
96
+ - lib/lingo/database/source/multi_key.rb
97
+ - lib/lingo/database/source/key_value.rb
98
+ - lib/lingo/database/source/single_word.rb
99
+ - lib/lingo/database/source/word_class.rb
100
+ - lib/lingo/database/source/multi_value.rb
101
+ - lib/lingo/database/gdbm_store.rb
102
+ - lib/lingo/database/hash_store.rb
78
103
  - lib/lingo/cli.rb
104
+ - lib/lingo/cachable.rb
79
105
  - lib/lingo/attendee/variator.rb
80
106
  - lib/lingo/attendee/debugger.rb
107
+ - lib/lingo/attendee/object_filter.rb
81
108
  - lib/lingo/attendee/synonymer.rb
82
- - lib/lingo/attendee/wordsearcher.rb
109
+ - lib/lingo/attendee/text_writer.rb
110
+ - lib/lingo/attendee/multi_worder.rb
111
+ - lib/lingo/attendee/text_reader.rb
83
112
  - lib/lingo/attendee/dehyphenizer.rb
84
- - lib/lingo/attendee/multiworder.rb
85
113
  - lib/lingo/attendee/tokenizer.rb
86
114
  - lib/lingo/attendee/abbreviator.rb
87
- - lib/lingo/attendee/textwriter.rb
88
- - lib/lingo/attendee/objectfilter.rb
115
+ - lib/lingo/attendee/formatter.rb
89
116
  - lib/lingo/attendee/noneword_filter.rb
90
117
  - lib/lingo/attendee/sequencer.rb
91
- - lib/lingo/attendee/textreader.rb
92
118
  - lib/lingo/attendee/decomposer.rb
119
+ - lib/lingo/attendee/word_searcher.rb
93
120
  - lib/lingo/attendee/vector_filter.rb
94
121
  - lib/lingo/config.rb
95
- - lib/lingo/const.rb
96
- - lib/lingo/modules.rb
122
+ - lib/lingo/core_ext.rb
123
+ - lib/lingo/agenda_item.rb
124
+ - lib/lingo/buffered_attendee.rb
125
+ - lib/lingo/reportable.rb
97
126
  - lib/lingo/language.rb
127
+ - lib/lingo/language/dictionary.rb
128
+ - lib/lingo/language/word.rb
129
+ - lib/lingo/language/lexical.rb
130
+ - lib/lingo/language/word_form.rb
131
+ - lib/lingo/language/token.rb
132
+ - lib/lingo/language/grammar.rb
133
+ - lib/lingo/language/lexical_hash.rb
134
+ - lib/lingo/attendee.rb
135
+ - lib/lingo/call.rb
98
136
  - lib/lingo.rb
99
137
  - bin/lingo
100
138
  - bin/lingoctl
@@ -147,17 +185,17 @@ files:
147
185
  - test/lir.csv
148
186
  - test/attendee/ts_abbreviator.rb
149
187
  - test/attendee/ts_noneword_filter.rb
150
- - test/attendee/ts_wordsearcher.rb
151
- - test/attendee/ts_textwriter.rb
188
+ - test/attendee/ts_word_searcher.rb
189
+ - test/attendee/ts_object_filter.rb
152
190
  - test/attendee/ts_vector_filter.rb
153
- - test/attendee/ts_multiworder.rb
154
- - test/attendee/ts_textreader.rb
155
- - test/attendee/ts_objectfilter.rb
191
+ - test/attendee/ts_text_writer.rb
156
192
  - test/attendee/ts_decomposer.rb
157
193
  - test/attendee/ts_sequencer.rb
158
194
  - test/attendee/ts_synonymer.rb
159
195
  - test/attendee/ts_tokenizer.rb
160
196
  - test/attendee/ts_variator.rb
197
+ - test/attendee/ts_text_reader.rb
198
+ - test/attendee/ts_multi_worder.rb
161
199
  - test/mul.txt
162
200
  - test/test_helper.rb
163
201
  - test/ref/artikel.ven
@@ -185,7 +223,7 @@ rdoc_options:
185
223
  - --line-numbers
186
224
  - --all
187
225
  - --title
188
- - lingo Application documentation (v1.8.0)
226
+ - lingo Application documentation (v1.8.1)
189
227
  - --main
190
228
  - README
191
229
  require_paths:
@@ -204,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
242
  version: '0'
205
243
  requirements: []
206
244
  rubyforge_project:
207
- rubygems_version: 1.8.13
245
+ rubygems_version: 1.8.17
208
246
  signing_key:
209
247
  specification_version: 3
210
248
  summary: The full-featured automatic indexing system
@@ -1,301 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
28
- #++
29
-
30
- class Lingo
31
-
32
- # Mit der bisher beschriebenen Vorgehensweise werden die durch den Tokenizer erkannten
33
- # Token aufgelöst und in Words verwandelt und über den Abbreviator und Decomposer auch
34
- # Spezialfälle behandelt, die einzelne Wörter betreffen.
35
- # Um jedoch auch Namen wie z.B. John F. Kennedy als Sinneinheit erkennen zu können, muss
36
- # eine Analyse über mehrere Objekte erfolgen. Dies ist die Hauptaufgabe des Multiworders.
37
- # Der Multiworder analysiert die Teile des Datenstroms, die z.B. durch Satzzeichen oder
38
- # weiteren Einzelzeichen (z.B. '(') begrenzt sind. Erkannte Mehrwortgruppen werden als
39
- # zusätzliches Objekt in den Datenstrom mit eingefügt.
40
- #
41
- # === Mögliche Verlinkung
42
- # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
43
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
44
- #
45
- # === Parameter
46
- # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
47
- # Alle anderen Parameter müssen zwingend angegeben werden.
48
- # <b>in</b>:: siehe allgemeine Beschreibung des Attendee
49
- # <b>out</b>:: siehe allgemeine Beschreibung des Attendee
50
- # <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
51
- # <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
52
- # <b><i>stopper</i></b>:: (Standard: TA_PUNCTUATION, TA_OTHER) Gibt die Begrenzungen an, zwischen
53
- # denen der Multiworder suchen soll, i.d.R. Satzzeichen und Sonderzeichen,
54
- # weil sie kaum in einer Mehrwortgruppen vorkommen.
55
- #
56
- # === Beispiele
57
- # Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
58
- # meeting:
59
- # attendees:
60
- # - textreader: { out: lines, files: '$(files)' }
61
- # - tokenizer: { in: lines, out: token }
62
- # - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
63
- # - wordsearcher: { in: abbrev, out: words, source: 'sys-dic' }
64
- # - decomposer: { in: words, out: comps, source: 'sys-dic' }
65
- # - multiworder: { in: comps, out: multi, source: 'sys-mul' }
66
- # - debugger: { in: multi, prompt: 'out>' }
67
- # ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
68
- # out> *FILE('test.txt')
69
- # out> <Sein = [(sein/s), (sein/v)]>
70
- # out> <Name = [(name/s)]>
71
- # out> <ist = [(sein/v)]>
72
- # out> <johann van siegen|MUL = [(johann van siegen/m)]>
73
- # out> <Johann = [(johann/e)]>
74
- # out> <van = [(van/w)]>
75
- # out> <Siegen = [(sieg/s), (siegen/v), (siegen/e)]>
76
- # out> :./PUNC:
77
- # out> *EOL('test.txt')
78
- # out> *EOF('test.txt')
79
-
80
- class Attendee::Multiworder < BufferedAttendee
81
-
82
- protected
83
-
84
- def init
85
- # Parameter verwerten
86
- @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).collect {|s| s.upcase }
87
-
88
- # Wörterbuch bereitstellen
89
- mul_src = get_array('source')
90
- mul_mod = get_key('mode', 'all')
91
- @mul_dic = Dictionary.new({'source'=>mul_src, 'mode'=>mul_mod}, @lingo)
92
-
93
- # combine lexical variants?
94
- #
95
- # false = old behaviour
96
- # true = first match
97
- # 'all' = all matches
98
- @combine = get_key('combine', false)
99
- @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
100
-
101
- # Lexikalisierungs-Wörterbuch aus angegebenen Quellen ermitteln
102
- lex_src, lex_mod, databases = nil, nil, @lingo.dictionary_config['databases']
103
- mul_src.each { |src|
104
- this_src, this_mod = databases[src].values_at('use-lex', 'lex-mode')
105
- if lex_src.nil? || lex_src==this_src
106
- lex_src, lex_mod = this_src, this_mod
107
- else
108
- forward(STR_CMD_WARN, "Die Mehrwortwörterbücher #{mul_src.join(',')} sind mit unterschiedlichen Wörterbüchern lexikalisiert worden")
109
- end
110
- }
111
- lex_mod = get_key('lex-mode', lex_mod || 'first')
112
- @lex_dic = Dictionary.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
113
- @lex_gra = Grammar.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
114
-
115
- if @combine && has_key?('use-syn')
116
- syn_src = get_array('use-syn')
117
- syn_mod = get_key('syn-mode', 'all')
118
- @syn_dic = Dictionary.new({'source'=>syn_src, 'mode'=>syn_mod}, @lingo)
119
- end
120
-
121
- @number_of_expected_tokens_in_buffer = 3
122
- @eof_handling = false
123
- end
124
-
125
- def control(cmd, par)
126
- @mul_dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
127
-
128
- # Jedes Control-Object ist auch Auslöser der Verarbeitung
129
- if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
130
- @eof_handling = true
131
- while number_of_valid_tokens_in_buffer > 1
132
- process_buffer
133
- end
134
- forward_number_of_token( @buffer.size, false )
135
- @eof_handling = false
136
- end
137
- end
138
-
139
- def process_buffer?
140
- number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
141
- end
142
-
143
- def process_buffer
144
- unless @buffer[0].form == CHAR_PUNCT
145
- # Prüfe 3er Schlüssel
146
- result = check_multiword_key( 3 )
147
- unless result.empty?
148
- # 3er Schlüssel gefunden
149
- lengths = sort_result_len( result )
150
- unless lengths[0] > 3
151
- # Längster erkannter Schlüssel = 3
152
- create_and_forward_multiword( 3, result )
153
- forward_number_of_token( 3 )
154
- return
155
- else
156
- # Längster erkannter Schlüssel > 3, Buffer voll genug?
157
- unless @buffer.size >= lengths[0] || @eof_handling
158
- @number_of_expected_tokens_in_buffer = lengths[0]
159
- return
160
- else
161
- # Buffer voll genug, Verarbeitung kann beginnen
162
- catch( :forward_one ) do
163
- lengths.each do |len|
164
- result = check_multiword_key( len )
165
- unless result.empty?
166
- create_and_forward_multiword( len, result )
167
- forward_number_of_token( len )
168
- throw :forward_one
169
- end
170
- end
171
-
172
- # Keinen Match gefunden
173
- forward_number_of_token( 1 )
174
- end
175
-
176
- @number_of_expected_tokens_in_buffer = 3
177
- process_buffer if process_buffer?
178
- return
179
- end
180
- end
181
- end
182
-
183
- # Prüfe 2er Schlüssel
184
- result = check_multiword_key( 2 )
185
- unless result.empty?
186
- create_and_forward_multiword( 2, result )
187
- forward_number_of_token( 1 )
188
- end
189
- end
190
-
191
- # Buffer weiterschaufeln
192
- forward_number_of_token( 1, false )
193
- @number_of_expected_tokens_in_buffer = 3
194
- end
195
-
196
- private
197
-
198
- def create_and_forward_multiword( len, lexicals )
199
- # Form aus Buffer auslesen und Teile markieren
200
- pos = 0
201
- form_parts = []
202
- begin
203
- if @buffer[pos].form == CHAR_PUNCT
204
- @buffer.delete_at( pos )
205
- form_parts[-1] += CHAR_PUNCT
206
- else
207
- @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].attr == WA_UNKNOWN
208
- form_parts << @buffer[pos].form
209
- pos += 1
210
- end
211
- end while pos < len
212
-
213
- form = form_parts.join( ' ' )
214
-
215
- # Multiword erstellen
216
- word = Word.new( form, WA_MULTIWORD )
217
- word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - Ausstieg bei "*5" im Synonymer
218
-
219
- # Forword Multiword
220
- forward( word )
221
- end
222
-
223
- # Leitet 'len' Token weiter
224
- def forward_number_of_token( len, count_punc = true )
225
- begin
226
- unless @buffer.empty?
227
- forward( @buffer[0] )
228
- len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
229
- @buffer.delete_at( 0 )
230
- end
231
- end while len > 0
232
- end
233
-
234
- # Ermittelt die maximale Ergebnislänge
235
- def sort_result_len( result )
236
- result.collect do |res|
237
- if res.is_a?( Lexical )
238
- res.form.split( ' ' ).size
239
- else
240
- res =~ /^\*(\d+)/
241
- $1.to_i
242
- end
243
- end.sort.reverse
244
- end
245
-
246
- # Prüft einen definiert langen Schlüssel ab Position 0 im Buffer
247
- def check_multiword_key( len )
248
- return [] if number_of_valid_tokens_in_buffer < len
249
-
250
- # Wortformen aus der Wortliste auslesen
251
- sequence = @buffer.map { |obj|
252
- next [obj] unless obj.is_a?(StringA)
253
-
254
- form = obj.form
255
- next if form == CHAR_PUNCT
256
-
257
- word = @lex_dic.find_word(form)
258
- word = @lex_gra.find_compositum(form) if word.attr == WA_UNKNOWN
259
-
260
- lexicals = word.attr == WA_KOMPOSITUM ?
261
- [word.lexicals.first] : word.lexicals.dup
262
-
263
- lexicals << word if lexicals.empty?
264
- lexicals += @syn_dic.find_synonyms(word) if @syn_dic
265
-
266
- lexicals.map { |lex| lex.form }.uniq
267
- }.compact[0, len]
268
-
269
- if @combine
270
- keys, muls = [], []
271
-
272
- sequence.each { |forms|
273
- keys = forms.map { |form|
274
- keys.empty? ? form : keys.map { |key| "#{key} #{form}" }
275
- }.flatten(1)
276
- }
277
-
278
- keys.each { |key|
279
- mul = @mul_dic.select(key.downcase)
280
-
281
- unless mul.empty?
282
- muls.concat(mul)
283
- break unless @all_keys
284
- end
285
- }
286
-
287
- muls.uniq
288
- else
289
- key = sequence.map { |forms| forms.first }.join(' ')
290
- @mul_dic.select(key.downcase)
291
- end
292
- end
293
-
294
- # Liefert die Anzahl gültiger Token zurück
295
- def number_of_valid_tokens_in_buffer
296
- @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
297
- end
298
-
299
- end
300
-
301
- end