lingo 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/ChangeLog +13 -0
  2. data/README +49 -29
  3. data/Rakefile +28 -4
  4. data/TODO +2 -9
  5. data/bin/lingo +24 -0
  6. data/bin/lingoctl +24 -0
  7. data/de/lingo-dic.txt +559 -74
  8. data/info/gpl-hdr.txt +21 -24
  9. data/lib/lingo.rb +83 -112
  10. data/lib/lingo/agenda_item.rb +53 -0
  11. data/lib/lingo/attendee.rb +261 -0
  12. data/lib/lingo/attendee/abbreviator.rb +95 -97
  13. data/lib/lingo/attendee/debugger.rb +94 -93
  14. data/lib/lingo/attendee/decomposer.rb +76 -83
  15. data/lib/lingo/attendee/dehyphenizer.rb +141 -144
  16. data/lib/lingo/attendee/formatter.rb +65 -0
  17. data/lib/lingo/attendee/multi_worder.rb +302 -0
  18. data/lib/lingo/attendee/noneword_filter.rb +89 -84
  19. data/lib/lingo/attendee/object_filter.rb +91 -0
  20. data/lib/lingo/attendee/sequencer.rb +159 -158
  21. data/lib/lingo/attendee/synonymer.rb +81 -84
  22. data/lib/lingo/attendee/text_reader.rb +242 -0
  23. data/lib/lingo/attendee/text_writer.rb +169 -0
  24. data/lib/lingo/attendee/tokenizer.rb +192 -191
  25. data/lib/lingo/attendee/variator.rb +152 -156
  26. data/lib/lingo/attendee/vector_filter.rb +140 -135
  27. data/lib/lingo/attendee/word_searcher.rb +98 -0
  28. data/lib/lingo/buffered_attendee.rb +69 -0
  29. data/lib/lingo/cachable.rb +58 -0
  30. data/lib/lingo/call.rb +72 -0
  31. data/lib/lingo/cli.rb +26 -0
  32. data/lib/lingo/config.rb +23 -26
  33. data/lib/lingo/core_ext.rb +42 -0
  34. data/lib/lingo/ctl.rb +239 -173
  35. data/lib/lingo/database.rb +148 -496
  36. data/lib/lingo/database/crypter.rb +85 -0
  37. data/lib/lingo/database/gdbm_store.rb +49 -0
  38. data/lib/lingo/database/hash_store.rb +67 -0
  39. data/lib/lingo/database/libcdb_store.rb +58 -0
  40. data/lib/lingo/database/sdbm_store.rb +64 -0
  41. data/lib/lingo/database/show_progress.rb +81 -0
  42. data/lib/lingo/database/source.rb +134 -0
  43. data/lib/lingo/database/source/key_value.rb +62 -0
  44. data/lib/lingo/database/source/multi_key.rb +65 -0
  45. data/lib/lingo/database/source/multi_value.rb +65 -0
  46. data/lib/lingo/database/source/single_word.rb +60 -0
  47. data/lib/lingo/database/source/word_class.rb +64 -0
  48. data/lib/lingo/error.rb +122 -0
  49. data/lib/lingo/language.rb +78 -518
  50. data/lib/lingo/language/dictionary.rb +173 -0
  51. data/lib/lingo/language/grammar.rb +211 -0
  52. data/lib/lingo/language/lexical.rb +66 -0
  53. data/lib/lingo/language/lexical_hash.rb +88 -0
  54. data/lib/lingo/language/token.rb +48 -0
  55. data/lib/lingo/language/word.rb +130 -0
  56. data/lib/lingo/language/word_form.rb +83 -0
  57. data/lib/lingo/reportable.rb +59 -0
  58. data/lib/lingo/version.rb +1 -1
  59. data/lingo-all.cfg +14 -10
  60. data/lingo-call.cfg +5 -5
  61. data/lingo.cfg +14 -12
  62. data/lingo.rb +26 -0
  63. data/lir.cfg +13 -9
  64. data/spec/spec_helper.rb +1 -0
  65. data/test.cfg +11 -11
  66. data/test/attendee/ts_abbreviator.rb +0 -6
  67. data/test/attendee/ts_decomposer.rb +0 -6
  68. data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
  69. data/test/attendee/ts_noneword_filter.rb +1 -7
  70. data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
  71. data/test/attendee/ts_sequencer.rb +0 -6
  72. data/test/attendee/ts_synonymer.rb +0 -6
  73. data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
  74. data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
  75. data/test/attendee/ts_tokenizer.rb +0 -6
  76. data/test/attendee/ts_variator.rb +0 -6
  77. data/test/attendee/ts_vector_filter.rb +1 -7
  78. data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
  79. data/test/ref/artikel.non +2 -29
  80. data/test/ref/artikel.seq +13 -8
  81. data/test/ref/artikel.vec +30 -15
  82. data/test/ref/artikel.ven +29 -14
  83. data/test/ref/artikel.ver +58 -43
  84. data/test/ref/lir.csv +146 -145
  85. data/test/ref/lir.non +186 -210
  86. data/test/ref/lir.seq +54 -50
  87. data/test/test_helper.rb +41 -36
  88. data/test/ts_database.rb +12 -11
  89. data/test/ts_language.rb +118 -68
  90. metadata +67 -29
  91. data/lib/lingo/attendee/multiworder.rb +0 -301
  92. data/lib/lingo/attendee/objectfilter.rb +0 -86
  93. data/lib/lingo/attendee/textreader.rb +0 -237
  94. data/lib/lingo/attendee/textwriter.rb +0 -196
  95. data/lib/lingo/attendee/wordsearcher.rb +0 -96
  96. data/lib/lingo/attendees.rb +0 -289
  97. data/lib/lingo/const.rb +0 -131
  98. data/lib/lingo/modules.rb +0 -98
  99. data/lib/lingo/types.rb +0 -285
  100. data/lib/lingo/utilities.rb +0 -40
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,22 +10,22 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2012-01-01 00:00:00.000000000 Z
13
+ date: 2012-02-19 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ruby-nuggets
17
- requirement: &12570880 !ruby/object:Gem::Requirement
17
+ requirement: &10045620 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ! '>='
21
21
  - !ruby/object:Gem::Version
22
- version: 0.8.2
22
+ version: 0.8.5
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *12570880
25
+ version_requirements: *10045620
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: unicode
28
- requirement: &12594600 !ruby/object:Gem::Requirement
28
+ requirement: &10045140 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ! '>='
@@ -33,10 +33,21 @@ dependencies:
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *12594600
36
+ version_requirements: *10045140
37
+ - !ruby/object:Gem::Dependency
38
+ name: highline
39
+ requirement: &10044660 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ type: :runtime
46
+ prerelease: false
47
+ version_requirements: *10044660
37
48
  - !ruby/object:Gem::Dependency
38
49
  name: diff-lcs
39
- requirement: &12594100 !ruby/object:Gem::Requirement
50
+ requirement: &10044140 !ruby/object:Gem::Requirement
40
51
  none: false
41
52
  requirements:
42
53
  - - ! '>='
@@ -44,10 +55,10 @@ dependencies:
44
55
  version: 1.1.3
45
56
  type: :development
46
57
  prerelease: false
47
- version_requirements: *12594100
58
+ version_requirements: *10044140
48
59
  - !ruby/object:Gem::Dependency
49
60
  name: open4
50
- requirement: &12593700 !ruby/object:Gem::Requirement
61
+ requirement: &10043720 !ruby/object:Gem::Requirement
51
62
  none: false
52
63
  requirements:
53
64
  - - ! '>='
@@ -55,8 +66,12 @@ dependencies:
55
66
  version: '0'
56
67
  type: :development
57
68
  prerelease: false
58
- version_requirements: *12593700
59
- description: The full-featured automatic indexing system
69
+ version_requirements: *10043720
70
+ description: ! "Lingo is an open source indexing system for research and teachings.\nThe
71
+ main functions of Lingo are:\n\n* identification of (i.e. reduction to) basic word
72
+ form by means of\n dictionaries and suffix lists\n* algorithmic decomposition\n*
73
+ dictionary-based synonymisation and identification of phrases\n* generic identification
74
+ of phrases/word sequences based on patterns\n of word classes\n"
60
75
  email:
61
76
  - lingo@vorhauer.de
62
77
  - jens.wille@uni-koeln.de
@@ -69,32 +84,55 @@ extra_rdoc_files:
69
84
  - COPYING
70
85
  - ChangeLog
71
86
  files:
72
- - lib/lingo/attendees.rb
73
87
  - lib/lingo/ctl.rb
74
88
  - lib/lingo/database.rb
75
- - lib/lingo/types.rb
89
+ - lib/lingo/error.rb
76
90
  - lib/lingo/version.rb
77
- - lib/lingo/utilities.rb
91
+ - lib/lingo/database/source.rb
92
+ - lib/lingo/database/libcdb_store.rb
93
+ - lib/lingo/database/sdbm_store.rb
94
+ - lib/lingo/database/show_progress.rb
95
+ - lib/lingo/database/crypter.rb
96
+ - lib/lingo/database/source/multi_key.rb
97
+ - lib/lingo/database/source/key_value.rb
98
+ - lib/lingo/database/source/single_word.rb
99
+ - lib/lingo/database/source/word_class.rb
100
+ - lib/lingo/database/source/multi_value.rb
101
+ - lib/lingo/database/gdbm_store.rb
102
+ - lib/lingo/database/hash_store.rb
78
103
  - lib/lingo/cli.rb
104
+ - lib/lingo/cachable.rb
79
105
  - lib/lingo/attendee/variator.rb
80
106
  - lib/lingo/attendee/debugger.rb
107
+ - lib/lingo/attendee/object_filter.rb
81
108
  - lib/lingo/attendee/synonymer.rb
82
- - lib/lingo/attendee/wordsearcher.rb
109
+ - lib/lingo/attendee/text_writer.rb
110
+ - lib/lingo/attendee/multi_worder.rb
111
+ - lib/lingo/attendee/text_reader.rb
83
112
  - lib/lingo/attendee/dehyphenizer.rb
84
- - lib/lingo/attendee/multiworder.rb
85
113
  - lib/lingo/attendee/tokenizer.rb
86
114
  - lib/lingo/attendee/abbreviator.rb
87
- - lib/lingo/attendee/textwriter.rb
88
- - lib/lingo/attendee/objectfilter.rb
115
+ - lib/lingo/attendee/formatter.rb
89
116
  - lib/lingo/attendee/noneword_filter.rb
90
117
  - lib/lingo/attendee/sequencer.rb
91
- - lib/lingo/attendee/textreader.rb
92
118
  - lib/lingo/attendee/decomposer.rb
119
+ - lib/lingo/attendee/word_searcher.rb
93
120
  - lib/lingo/attendee/vector_filter.rb
94
121
  - lib/lingo/config.rb
95
- - lib/lingo/const.rb
96
- - lib/lingo/modules.rb
122
+ - lib/lingo/core_ext.rb
123
+ - lib/lingo/agenda_item.rb
124
+ - lib/lingo/buffered_attendee.rb
125
+ - lib/lingo/reportable.rb
97
126
  - lib/lingo/language.rb
127
+ - lib/lingo/language/dictionary.rb
128
+ - lib/lingo/language/word.rb
129
+ - lib/lingo/language/lexical.rb
130
+ - lib/lingo/language/word_form.rb
131
+ - lib/lingo/language/token.rb
132
+ - lib/lingo/language/grammar.rb
133
+ - lib/lingo/language/lexical_hash.rb
134
+ - lib/lingo/attendee.rb
135
+ - lib/lingo/call.rb
98
136
  - lib/lingo.rb
99
137
  - bin/lingo
100
138
  - bin/lingoctl
@@ -147,17 +185,17 @@ files:
147
185
  - test/lir.csv
148
186
  - test/attendee/ts_abbreviator.rb
149
187
  - test/attendee/ts_noneword_filter.rb
150
- - test/attendee/ts_wordsearcher.rb
151
- - test/attendee/ts_textwriter.rb
188
+ - test/attendee/ts_word_searcher.rb
189
+ - test/attendee/ts_object_filter.rb
152
190
  - test/attendee/ts_vector_filter.rb
153
- - test/attendee/ts_multiworder.rb
154
- - test/attendee/ts_textreader.rb
155
- - test/attendee/ts_objectfilter.rb
191
+ - test/attendee/ts_text_writer.rb
156
192
  - test/attendee/ts_decomposer.rb
157
193
  - test/attendee/ts_sequencer.rb
158
194
  - test/attendee/ts_synonymer.rb
159
195
  - test/attendee/ts_tokenizer.rb
160
196
  - test/attendee/ts_variator.rb
197
+ - test/attendee/ts_text_reader.rb
198
+ - test/attendee/ts_multi_worder.rb
161
199
  - test/mul.txt
162
200
  - test/test_helper.rb
163
201
  - test/ref/artikel.ven
@@ -185,7 +223,7 @@ rdoc_options:
185
223
  - --line-numbers
186
224
  - --all
187
225
  - --title
188
- - lingo Application documentation (v1.8.0)
226
+ - lingo Application documentation (v1.8.1)
189
227
  - --main
190
228
  - README
191
229
  require_paths:
@@ -204,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
242
  version: '0'
205
243
  requirements: []
206
244
  rubyforge_project:
207
- rubygems_version: 1.8.13
245
+ rubygems_version: 1.8.17
208
246
  signing_key:
209
247
  specification_version: 3
210
248
  summary: The full-featured automatic indexing system
@@ -1,301 +0,0 @@
1
- # encoding: utf-8
2
-
3
- #--
4
- # LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
5
- # Mehrworterkennung und Relationierung.
6
- #
7
- # Copyright (C) 2005-2007 John Vorhauer
8
- # Copyright (C) 2007-2011 John Vorhauer, Jens Wille
9
- #
10
- # This program is free software; you can redistribute it and/or modify it under
11
- # the terms of the GNU Affero General Public License as published by the Free
12
- # Software Foundation; either version 3 of the License, or (at your option)
13
- # any later version.
14
- #
15
- # This program is distributed in the hope that it will be useful, but WITHOUT
16
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
- # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18
- # details.
19
- #
20
- # You should have received a copy of the GNU Affero General Public License along
21
- # with this program; if not, write to the Free Software Foundation, Inc.,
22
- # 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
23
- #
24
- # For more information visit http://www.lex-lingo.de or contact me at
25
- # welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
26
- #
27
- # Lex Lingo rules from here on
28
- #++
29
-
30
- class Lingo
31
-
32
- # Mit der bisher beschriebenen Vorgehensweise werden die durch den Tokenizer erkannten
33
- # Token aufgelöst und in Words verwandelt und über den Abbreviator und Decomposer auch
34
- # Spezialfälle behandelt, die einzelne Wörter betreffen.
35
- # Um jedoch auch Namen wie z.B. John F. Kennedy als Sinneinheit erkennen zu können, muss
36
- # eine Analyse über mehrere Objekte erfolgen. Dies ist die Hauptaufgabe des Multiworders.
37
- # Der Multiworder analysiert die Teile des Datenstroms, die z.B. durch Satzzeichen oder
38
- # weiteren Einzelzeichen (z.B. '(') begrenzt sind. Erkannte Mehrwortgruppen werden als
39
- # zusätzliches Objekt in den Datenstrom mit eingefügt.
40
- #
41
- # === Mögliche Verlinkung
42
- # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
43
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
44
- #
45
- # === Parameter
46
- # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
47
- # Alle anderen Parameter müssen zwingend angegeben werden.
48
- # <b>in</b>:: siehe allgemeine Beschreibung des Attendee
49
- # <b>out</b>:: siehe allgemeine Beschreibung des Attendee
50
- # <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
51
- # <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
52
- # <b><i>stopper</i></b>:: (Standard: TA_PUNCTUATION, TA_OTHER) Gibt die Begrenzungen an, zwischen
53
- # denen der Multiworder suchen soll, i.d.R. Satzzeichen und Sonderzeichen,
54
- # weil sie kaum in einer Mehrwortgruppen vorkommen.
55
- #
56
- # === Beispiele
57
- # Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
58
- # meeting:
59
- # attendees:
60
- # - textreader: { out: lines, files: '$(files)' }
61
- # - tokenizer: { in: lines, out: token }
62
- # - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
63
- # - wordsearcher: { in: abbrev, out: words, source: 'sys-dic' }
64
- # - decomposer: { in: words, out: comps, source: 'sys-dic' }
65
- # - multiworder: { in: comps, out: multi, source: 'sys-mul' }
66
- # - debugger: { in: multi, prompt: 'out>' }
67
- # ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
68
- # out> *FILE('test.txt')
69
- # out> <Sein = [(sein/s), (sein/v)]>
70
- # out> <Name = [(name/s)]>
71
- # out> <ist = [(sein/v)]>
72
- # out> <johann van siegen|MUL = [(johann van siegen/m)]>
73
- # out> <Johann = [(johann/e)]>
74
- # out> <van = [(van/w)]>
75
- # out> <Siegen = [(sieg/s), (siegen/v), (siegen/e)]>
76
- # out> :./PUNC:
77
- # out> *EOL('test.txt')
78
- # out> *EOF('test.txt')
79
-
80
- class Attendee::Multiworder < BufferedAttendee
81
-
82
- protected
83
-
84
- def init
85
- # Parameter verwerten
86
- @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).collect {|s| s.upcase }
87
-
88
- # Wörterbuch bereitstellen
89
- mul_src = get_array('source')
90
- mul_mod = get_key('mode', 'all')
91
- @mul_dic = Dictionary.new({'source'=>mul_src, 'mode'=>mul_mod}, @lingo)
92
-
93
- # combine lexical variants?
94
- #
95
- # false = old behaviour
96
- # true = first match
97
- # 'all' = all matches
98
- @combine = get_key('combine', false)
99
- @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
100
-
101
- # Lexikalisierungs-Wörterbuch aus angegebenen Quellen ermitteln
102
- lex_src, lex_mod, databases = nil, nil, @lingo.dictionary_config['databases']
103
- mul_src.each { |src|
104
- this_src, this_mod = databases[src].values_at('use-lex', 'lex-mode')
105
- if lex_src.nil? || lex_src==this_src
106
- lex_src, lex_mod = this_src, this_mod
107
- else
108
- forward(STR_CMD_WARN, "Die Mehrwortwörterbücher #{mul_src.join(',')} sind mit unterschiedlichen Wörterbüchern lexikalisiert worden")
109
- end
110
- }
111
- lex_mod = get_key('lex-mode', lex_mod || 'first')
112
- @lex_dic = Dictionary.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
113
- @lex_gra = Grammar.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
114
-
115
- if @combine && has_key?('use-syn')
116
- syn_src = get_array('use-syn')
117
- syn_mod = get_key('syn-mode', 'all')
118
- @syn_dic = Dictionary.new({'source'=>syn_src, 'mode'=>syn_mod}, @lingo)
119
- end
120
-
121
- @number_of_expected_tokens_in_buffer = 3
122
- @eof_handling = false
123
- end
124
-
125
- def control(cmd, par)
126
- @mul_dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
127
-
128
- # Jedes Control-Object ist auch Auslöser der Verarbeitung
129
- if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
130
- @eof_handling = true
131
- while number_of_valid_tokens_in_buffer > 1
132
- process_buffer
133
- end
134
- forward_number_of_token( @buffer.size, false )
135
- @eof_handling = false
136
- end
137
- end
138
-
139
- def process_buffer?
140
- number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
141
- end
142
-
143
- def process_buffer
144
- unless @buffer[0].form == CHAR_PUNCT
145
- # Prüfe 3er Schlüssel
146
- result = check_multiword_key( 3 )
147
- unless result.empty?
148
- # 3er Schlüssel gefunden
149
- lengths = sort_result_len( result )
150
- unless lengths[0] > 3
151
- # Längster erkannter Schlüssel = 3
152
- create_and_forward_multiword( 3, result )
153
- forward_number_of_token( 3 )
154
- return
155
- else
156
- # Längster erkannter Schlüssel > 3, Buffer voll genug?
157
- unless @buffer.size >= lengths[0] || @eof_handling
158
- @number_of_expected_tokens_in_buffer = lengths[0]
159
- return
160
- else
161
- # Buffer voll genug, Verarbeitung kann beginnen
162
- catch( :forward_one ) do
163
- lengths.each do |len|
164
- result = check_multiword_key( len )
165
- unless result.empty?
166
- create_and_forward_multiword( len, result )
167
- forward_number_of_token( len )
168
- throw :forward_one
169
- end
170
- end
171
-
172
- # Keinen Match gefunden
173
- forward_number_of_token( 1 )
174
- end
175
-
176
- @number_of_expected_tokens_in_buffer = 3
177
- process_buffer if process_buffer?
178
- return
179
- end
180
- end
181
- end
182
-
183
- # Prüfe 2er Schlüssel
184
- result = check_multiword_key( 2 )
185
- unless result.empty?
186
- create_and_forward_multiword( 2, result )
187
- forward_number_of_token( 1 )
188
- end
189
- end
190
-
191
- # Buffer weiterschaufeln
192
- forward_number_of_token( 1, false )
193
- @number_of_expected_tokens_in_buffer = 3
194
- end
195
-
196
- private
197
-
198
- def create_and_forward_multiword( len, lexicals )
199
- # Form aus Buffer auslesen und Teile markieren
200
- pos = 0
201
- form_parts = []
202
- begin
203
- if @buffer[pos].form == CHAR_PUNCT
204
- @buffer.delete_at( pos )
205
- form_parts[-1] += CHAR_PUNCT
206
- else
207
- @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].attr == WA_UNKNOWN
208
- form_parts << @buffer[pos].form
209
- pos += 1
210
- end
211
- end while pos < len
212
-
213
- form = form_parts.join( ' ' )
214
-
215
- # Multiword erstellen
216
- word = Word.new( form, WA_MULTIWORD )
217
- word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - Ausstieg bei "*5" im Synonymer
218
-
219
- # Forword Multiword
220
- forward( word )
221
- end
222
-
223
- # Leitet 'len' Token weiter
224
- def forward_number_of_token( len, count_punc = true )
225
- begin
226
- unless @buffer.empty?
227
- forward( @buffer[0] )
228
- len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
229
- @buffer.delete_at( 0 )
230
- end
231
- end while len > 0
232
- end
233
-
234
- # Ermittelt die maximale Ergebnislänge
235
- def sort_result_len( result )
236
- result.collect do |res|
237
- if res.is_a?( Lexical )
238
- res.form.split( ' ' ).size
239
- else
240
- res =~ /^\*(\d+)/
241
- $1.to_i
242
- end
243
- end.sort.reverse
244
- end
245
-
246
- # Prüft einen definiert langen Schlüssel ab Position 0 im Buffer
247
- def check_multiword_key( len )
248
- return [] if number_of_valid_tokens_in_buffer < len
249
-
250
- # Wortformen aus der Wortliste auslesen
251
- sequence = @buffer.map { |obj|
252
- next [obj] unless obj.is_a?(StringA)
253
-
254
- form = obj.form
255
- next if form == CHAR_PUNCT
256
-
257
- word = @lex_dic.find_word(form)
258
- word = @lex_gra.find_compositum(form) if word.attr == WA_UNKNOWN
259
-
260
- lexicals = word.attr == WA_KOMPOSITUM ?
261
- [word.lexicals.first] : word.lexicals.dup
262
-
263
- lexicals << word if lexicals.empty?
264
- lexicals += @syn_dic.find_synonyms(word) if @syn_dic
265
-
266
- lexicals.map { |lex| lex.form }.uniq
267
- }.compact[0, len]
268
-
269
- if @combine
270
- keys, muls = [], []
271
-
272
- sequence.each { |forms|
273
- keys = forms.map { |form|
274
- keys.empty? ? form : keys.map { |key| "#{key} #{form}" }
275
- }.flatten(1)
276
- }
277
-
278
- keys.each { |key|
279
- mul = @mul_dic.select(key.downcase)
280
-
281
- unless mul.empty?
282
- muls.concat(mul)
283
- break unless @all_keys
284
- end
285
- }
286
-
287
- muls.uniq
288
- else
289
- key = sequence.map { |forms| forms.first }.join(' ')
290
- @mul_dic.select(key.downcase)
291
- end
292
- end
293
-
294
- # Liefert die Anzahl gültiger Token zurück
295
- def number_of_valid_tokens_in_buffer
296
- @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
297
- end
298
-
299
- end
300
-
301
- end