lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lingo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.8.0
|
|
4
|
+
version: 1.8.1
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -10,22 +10,22 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: bin
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2012-
|
|
13
|
+
date: 2012-02-19 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ruby-nuggets
|
|
17
|
-
requirement: &
|
|
17
|
+
requirement: &10045620 !ruby/object:Gem::Requirement
|
|
18
18
|
none: false
|
|
19
19
|
requirements:
|
|
20
20
|
- - ! '>='
|
|
21
21
|
- !ruby/object:Gem::Version
|
|
22
|
-
version: 0.8.
|
|
22
|
+
version: 0.8.5
|
|
23
23
|
type: :runtime
|
|
24
24
|
prerelease: false
|
|
25
|
-
version_requirements: *
|
|
25
|
+
version_requirements: *10045620
|
|
26
26
|
- !ruby/object:Gem::Dependency
|
|
27
27
|
name: unicode
|
|
28
|
-
requirement: &
|
|
28
|
+
requirement: &10045140 !ruby/object:Gem::Requirement
|
|
29
29
|
none: false
|
|
30
30
|
requirements:
|
|
31
31
|
- - ! '>='
|
|
@@ -33,10 +33,21 @@ dependencies:
|
|
|
33
33
|
version: '0'
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
|
-
version_requirements: *
|
|
36
|
+
version_requirements: *10045140
|
|
37
|
+
- !ruby/object:Gem::Dependency
|
|
38
|
+
name: highline
|
|
39
|
+
requirement: &10044660 !ruby/object:Gem::Requirement
|
|
40
|
+
none: false
|
|
41
|
+
requirements:
|
|
42
|
+
- - ! '>='
|
|
43
|
+
- !ruby/object:Gem::Version
|
|
44
|
+
version: '0'
|
|
45
|
+
type: :runtime
|
|
46
|
+
prerelease: false
|
|
47
|
+
version_requirements: *10044660
|
|
37
48
|
- !ruby/object:Gem::Dependency
|
|
38
49
|
name: diff-lcs
|
|
39
|
-
requirement: &
|
|
50
|
+
requirement: &10044140 !ruby/object:Gem::Requirement
|
|
40
51
|
none: false
|
|
41
52
|
requirements:
|
|
42
53
|
- - ! '>='
|
|
@@ -44,10 +55,10 @@ dependencies:
|
|
|
44
55
|
version: 1.1.3
|
|
45
56
|
type: :development
|
|
46
57
|
prerelease: false
|
|
47
|
-
version_requirements: *
|
|
58
|
+
version_requirements: *10044140
|
|
48
59
|
- !ruby/object:Gem::Dependency
|
|
49
60
|
name: open4
|
|
50
|
-
requirement: &
|
|
61
|
+
requirement: &10043720 !ruby/object:Gem::Requirement
|
|
51
62
|
none: false
|
|
52
63
|
requirements:
|
|
53
64
|
- - ! '>='
|
|
@@ -55,8 +66,12 @@ dependencies:
|
|
|
55
66
|
version: '0'
|
|
56
67
|
type: :development
|
|
57
68
|
prerelease: false
|
|
58
|
-
version_requirements: *
|
|
59
|
-
description:
|
|
69
|
+
version_requirements: *10043720
|
|
70
|
+
description: ! "Lingo is an open source indexing system for research and teachings.\nThe
|
|
71
|
+
main functions of Lingo are:\n\n* identification of (i.e. reduction to) basic word
|
|
72
|
+
form by means of\n dictionaries and suffix lists\n* algorithmic decomposition\n*
|
|
73
|
+
dictionary-based synonymisation and identification of phrases\n* generic identification
|
|
74
|
+
of phrases/word sequences based on patterns\n of word classes\n"
|
|
60
75
|
email:
|
|
61
76
|
- lingo@vorhauer.de
|
|
62
77
|
- jens.wille@uni-koeln.de
|
|
@@ -69,32 +84,55 @@ extra_rdoc_files:
|
|
|
69
84
|
- COPYING
|
|
70
85
|
- ChangeLog
|
|
71
86
|
files:
|
|
72
|
-
- lib/lingo/attendees.rb
|
|
73
87
|
- lib/lingo/ctl.rb
|
|
74
88
|
- lib/lingo/database.rb
|
|
75
|
-
- lib/lingo/
|
|
89
|
+
- lib/lingo/error.rb
|
|
76
90
|
- lib/lingo/version.rb
|
|
77
|
-
- lib/lingo/
|
|
91
|
+
- lib/lingo/database/source.rb
|
|
92
|
+
- lib/lingo/database/libcdb_store.rb
|
|
93
|
+
- lib/lingo/database/sdbm_store.rb
|
|
94
|
+
- lib/lingo/database/show_progress.rb
|
|
95
|
+
- lib/lingo/database/crypter.rb
|
|
96
|
+
- lib/lingo/database/source/multi_key.rb
|
|
97
|
+
- lib/lingo/database/source/key_value.rb
|
|
98
|
+
- lib/lingo/database/source/single_word.rb
|
|
99
|
+
- lib/lingo/database/source/word_class.rb
|
|
100
|
+
- lib/lingo/database/source/multi_value.rb
|
|
101
|
+
- lib/lingo/database/gdbm_store.rb
|
|
102
|
+
- lib/lingo/database/hash_store.rb
|
|
78
103
|
- lib/lingo/cli.rb
|
|
104
|
+
- lib/lingo/cachable.rb
|
|
79
105
|
- lib/lingo/attendee/variator.rb
|
|
80
106
|
- lib/lingo/attendee/debugger.rb
|
|
107
|
+
- lib/lingo/attendee/object_filter.rb
|
|
81
108
|
- lib/lingo/attendee/synonymer.rb
|
|
82
|
-
- lib/lingo/attendee/
|
|
109
|
+
- lib/lingo/attendee/text_writer.rb
|
|
110
|
+
- lib/lingo/attendee/multi_worder.rb
|
|
111
|
+
- lib/lingo/attendee/text_reader.rb
|
|
83
112
|
- lib/lingo/attendee/dehyphenizer.rb
|
|
84
|
-
- lib/lingo/attendee/multiworder.rb
|
|
85
113
|
- lib/lingo/attendee/tokenizer.rb
|
|
86
114
|
- lib/lingo/attendee/abbreviator.rb
|
|
87
|
-
- lib/lingo/attendee/
|
|
88
|
-
- lib/lingo/attendee/objectfilter.rb
|
|
115
|
+
- lib/lingo/attendee/formatter.rb
|
|
89
116
|
- lib/lingo/attendee/noneword_filter.rb
|
|
90
117
|
- lib/lingo/attendee/sequencer.rb
|
|
91
|
-
- lib/lingo/attendee/textreader.rb
|
|
92
118
|
- lib/lingo/attendee/decomposer.rb
|
|
119
|
+
- lib/lingo/attendee/word_searcher.rb
|
|
93
120
|
- lib/lingo/attendee/vector_filter.rb
|
|
94
121
|
- lib/lingo/config.rb
|
|
95
|
-
- lib/lingo/
|
|
96
|
-
- lib/lingo/
|
|
122
|
+
- lib/lingo/core_ext.rb
|
|
123
|
+
- lib/lingo/agenda_item.rb
|
|
124
|
+
- lib/lingo/buffered_attendee.rb
|
|
125
|
+
- lib/lingo/reportable.rb
|
|
97
126
|
- lib/lingo/language.rb
|
|
127
|
+
- lib/lingo/language/dictionary.rb
|
|
128
|
+
- lib/lingo/language/word.rb
|
|
129
|
+
- lib/lingo/language/lexical.rb
|
|
130
|
+
- lib/lingo/language/word_form.rb
|
|
131
|
+
- lib/lingo/language/token.rb
|
|
132
|
+
- lib/lingo/language/grammar.rb
|
|
133
|
+
- lib/lingo/language/lexical_hash.rb
|
|
134
|
+
- lib/lingo/attendee.rb
|
|
135
|
+
- lib/lingo/call.rb
|
|
98
136
|
- lib/lingo.rb
|
|
99
137
|
- bin/lingo
|
|
100
138
|
- bin/lingoctl
|
|
@@ -147,17 +185,17 @@ files:
|
|
|
147
185
|
- test/lir.csv
|
|
148
186
|
- test/attendee/ts_abbreviator.rb
|
|
149
187
|
- test/attendee/ts_noneword_filter.rb
|
|
150
|
-
- test/attendee/
|
|
151
|
-
- test/attendee/
|
|
188
|
+
- test/attendee/ts_word_searcher.rb
|
|
189
|
+
- test/attendee/ts_object_filter.rb
|
|
152
190
|
- test/attendee/ts_vector_filter.rb
|
|
153
|
-
- test/attendee/
|
|
154
|
-
- test/attendee/ts_textreader.rb
|
|
155
|
-
- test/attendee/ts_objectfilter.rb
|
|
191
|
+
- test/attendee/ts_text_writer.rb
|
|
156
192
|
- test/attendee/ts_decomposer.rb
|
|
157
193
|
- test/attendee/ts_sequencer.rb
|
|
158
194
|
- test/attendee/ts_synonymer.rb
|
|
159
195
|
- test/attendee/ts_tokenizer.rb
|
|
160
196
|
- test/attendee/ts_variator.rb
|
|
197
|
+
- test/attendee/ts_text_reader.rb
|
|
198
|
+
- test/attendee/ts_multi_worder.rb
|
|
161
199
|
- test/mul.txt
|
|
162
200
|
- test/test_helper.rb
|
|
163
201
|
- test/ref/artikel.ven
|
|
@@ -185,7 +223,7 @@ rdoc_options:
|
|
|
185
223
|
- --line-numbers
|
|
186
224
|
- --all
|
|
187
225
|
- --title
|
|
188
|
-
- lingo Application documentation (v1.8.0)
|
|
226
|
+
- lingo Application documentation (v1.8.1)
|
|
189
227
|
- --main
|
|
190
228
|
- README
|
|
191
229
|
require_paths:
|
|
@@ -204,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
204
242
|
version: '0'
|
|
205
243
|
requirements: []
|
|
206
244
|
rubyforge_project:
|
|
207
|
-
rubygems_version: 1.8.
|
|
245
|
+
rubygems_version: 1.8.17
|
|
208
246
|
signing_key:
|
|
209
247
|
specification_version: 3
|
|
210
248
|
summary: The full-featured automatic indexing system
|
|
@@ -1,301 +0,0 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
|
-
|
|
3
|
-
#--
|
|
4
|
-
# LINGO ist ein Indexierungssystem mit Grundformreduktion, Kompositumzerlegung,
|
|
5
|
-
# Mehrworterkennung und Relationierung.
|
|
6
|
-
#
|
|
7
|
-
# Copyright (C) 2005-2007 John Vorhauer
|
|
8
|
-
# Copyright (C) 2007-2011 John Vorhauer, Jens Wille
|
|
9
|
-
#
|
|
10
|
-
# This program is free software; you can redistribute it and/or modify it under
|
|
11
|
-
# the terms of the GNU Affero General Public License as published by the Free
|
|
12
|
-
# Software Foundation; either version 3 of the License, or (at your option)
|
|
13
|
-
# any later version.
|
|
14
|
-
#
|
|
15
|
-
# This program is distributed in the hope that it will be useful, but WITHOUT
|
|
16
|
-
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
17
|
-
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
|
18
|
-
# details.
|
|
19
|
-
#
|
|
20
|
-
# You should have received a copy of the GNU Affero General Public License along
|
|
21
|
-
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
22
|
-
# 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
|
|
23
|
-
#
|
|
24
|
-
# For more information visit http://www.lex-lingo.de or contact me at
|
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
26
|
-
#
|
|
27
|
-
# Lex Lingo rules from here on
|
|
28
|
-
#++
|
|
29
|
-
|
|
30
|
-
class Lingo
|
|
31
|
-
|
|
32
|
-
# Mit der bisher beschriebenen Vorgehensweise werden die durch den Tokenizer erkannten
|
|
33
|
-
# Token aufgelöst und in Words verwandelt und über den Abbreviator und Decomposer auch
|
|
34
|
-
# Spezialfälle behandelt, die einzelne Wörter betreffen.
|
|
35
|
-
# Um jedoch auch Namen wie z.B. John F. Kennedy als Sinneinheit erkennen zu können, muss
|
|
36
|
-
# eine Analyse über mehrere Objekte erfolgen. Dies ist die Hauptaufgabe des Multiworders.
|
|
37
|
-
# Der Multiworder analysiert die Teile des Datenstroms, die z.B. durch Satzzeichen oder
|
|
38
|
-
# weiteren Einzelzeichen (z.B. '(') begrenzt sind. Erkannte Mehrwortgruppen werden als
|
|
39
|
-
# zusätzliches Objekt in den Datenstrom mit eingefügt.
|
|
40
|
-
#
|
|
41
|
-
# === Mögliche Verlinkung
|
|
42
|
-
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
|
43
|
-
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
|
|
44
|
-
#
|
|
45
|
-
# === Parameter
|
|
46
|
-
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
|
47
|
-
# Alle anderen Parameter müssen zwingend angegeben werden.
|
|
48
|
-
# <b>in</b>:: siehe allgemeine Beschreibung des Attendee
|
|
49
|
-
# <b>out</b>:: siehe allgemeine Beschreibung des Attendee
|
|
50
|
-
# <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
|
|
51
|
-
# <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
|
|
52
|
-
# <b><i>stopper</i></b>:: (Standard: TA_PUNCTUATION, TA_OTHER) Gibt die Begrenzungen an, zwischen
|
|
53
|
-
# denen der Multiworder suchen soll, i.d.R. Satzzeichen und Sonderzeichen,
|
|
54
|
-
# weil sie kaum in einer Mehrwortgruppen vorkommen.
|
|
55
|
-
#
|
|
56
|
-
# === Beispiele
|
|
57
|
-
# Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
|
|
58
|
-
# meeting:
|
|
59
|
-
# attendees:
|
|
60
|
-
# - textreader: { out: lines, files: '$(files)' }
|
|
61
|
-
# - tokenizer: { in: lines, out: token }
|
|
62
|
-
# - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
|
|
63
|
-
# - wordsearcher: { in: abbrev, out: words, source: 'sys-dic' }
|
|
64
|
-
# - decomposer: { in: words, out: comps, source: 'sys-dic' }
|
|
65
|
-
# - multiworder: { in: comps, out: multi, source: 'sys-mul' }
|
|
66
|
-
# - debugger: { in: multi, prompt: 'out>' }
|
|
67
|
-
# ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
|
|
68
|
-
# out> *FILE('test.txt')
|
|
69
|
-
# out> <Sein = [(sein/s), (sein/v)]>
|
|
70
|
-
# out> <Name = [(name/s)]>
|
|
71
|
-
# out> <ist = [(sein/v)]>
|
|
72
|
-
# out> <johann van siegen|MUL = [(johann van siegen/m)]>
|
|
73
|
-
# out> <Johann = [(johann/e)]>
|
|
74
|
-
# out> <van = [(van/w)]>
|
|
75
|
-
# out> <Siegen = [(sieg/s), (siegen/v), (siegen/e)]>
|
|
76
|
-
# out> :./PUNC:
|
|
77
|
-
# out> *EOL('test.txt')
|
|
78
|
-
# out> *EOF('test.txt')
|
|
79
|
-
|
|
80
|
-
class Attendee::Multiworder < BufferedAttendee
|
|
81
|
-
|
|
82
|
-
protected
|
|
83
|
-
|
|
84
|
-
def init
|
|
85
|
-
# Parameter verwerten
|
|
86
|
-
@stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).collect {|s| s.upcase }
|
|
87
|
-
|
|
88
|
-
# Wörterbuch bereitstellen
|
|
89
|
-
mul_src = get_array('source')
|
|
90
|
-
mul_mod = get_key('mode', 'all')
|
|
91
|
-
@mul_dic = Dictionary.new({'source'=>mul_src, 'mode'=>mul_mod}, @lingo)
|
|
92
|
-
|
|
93
|
-
# combine lexical variants?
|
|
94
|
-
#
|
|
95
|
-
# false = old behaviour
|
|
96
|
-
# true = first match
|
|
97
|
-
# 'all' = all matches
|
|
98
|
-
@combine = get_key('combine', false)
|
|
99
|
-
@all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
|
|
100
|
-
|
|
101
|
-
# Lexikalisierungs-Wörterbuch aus angegebenen Quellen ermitteln
|
|
102
|
-
lex_src, lex_mod, databases = nil, nil, @lingo.dictionary_config['databases']
|
|
103
|
-
mul_src.each { |src|
|
|
104
|
-
this_src, this_mod = databases[src].values_at('use-lex', 'lex-mode')
|
|
105
|
-
if lex_src.nil? || lex_src==this_src
|
|
106
|
-
lex_src, lex_mod = this_src, this_mod
|
|
107
|
-
else
|
|
108
|
-
forward(STR_CMD_WARN, "Die Mehrwortwörterbücher #{mul_src.join(',')} sind mit unterschiedlichen Wörterbüchern lexikalisiert worden")
|
|
109
|
-
end
|
|
110
|
-
}
|
|
111
|
-
lex_mod = get_key('lex-mode', lex_mod || 'first')
|
|
112
|
-
@lex_dic = Dictionary.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
|
|
113
|
-
@lex_gra = Grammar.new({'source'=>lex_src.split(STRING_SEPERATOR_PATTERN), 'mode'=>lex_mod}, @lingo)
|
|
114
|
-
|
|
115
|
-
if @combine && has_key?('use-syn')
|
|
116
|
-
syn_src = get_array('use-syn')
|
|
117
|
-
syn_mod = get_key('syn-mode', 'all')
|
|
118
|
-
@syn_dic = Dictionary.new({'source'=>syn_src, 'mode'=>syn_mod}, @lingo)
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
@number_of_expected_tokens_in_buffer = 3
|
|
122
|
-
@eof_handling = false
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
def control(cmd, par)
|
|
126
|
-
@mul_dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
|
|
127
|
-
|
|
128
|
-
# Jedes Control-Object ist auch Auslöser der Verarbeitung
|
|
129
|
-
if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
|
|
130
|
-
@eof_handling = true
|
|
131
|
-
while number_of_valid_tokens_in_buffer > 1
|
|
132
|
-
process_buffer
|
|
133
|
-
end
|
|
134
|
-
forward_number_of_token( @buffer.size, false )
|
|
135
|
-
@eof_handling = false
|
|
136
|
-
end
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
def process_buffer?
|
|
140
|
-
number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
def process_buffer
|
|
144
|
-
unless @buffer[0].form == CHAR_PUNCT
|
|
145
|
-
# Prüfe 3er Schlüssel
|
|
146
|
-
result = check_multiword_key( 3 )
|
|
147
|
-
unless result.empty?
|
|
148
|
-
# 3er Schlüssel gefunden
|
|
149
|
-
lengths = sort_result_len( result )
|
|
150
|
-
unless lengths[0] > 3
|
|
151
|
-
# Längster erkannter Schlüssel = 3
|
|
152
|
-
create_and_forward_multiword( 3, result )
|
|
153
|
-
forward_number_of_token( 3 )
|
|
154
|
-
return
|
|
155
|
-
else
|
|
156
|
-
# Längster erkannter Schlüssel > 3, Buffer voll genug?
|
|
157
|
-
unless @buffer.size >= lengths[0] || @eof_handling
|
|
158
|
-
@number_of_expected_tokens_in_buffer = lengths[0]
|
|
159
|
-
return
|
|
160
|
-
else
|
|
161
|
-
# Buffer voll genug, Verarbeitung kann beginnen
|
|
162
|
-
catch( :forward_one ) do
|
|
163
|
-
lengths.each do |len|
|
|
164
|
-
result = check_multiword_key( len )
|
|
165
|
-
unless result.empty?
|
|
166
|
-
create_and_forward_multiword( len, result )
|
|
167
|
-
forward_number_of_token( len )
|
|
168
|
-
throw :forward_one
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
# Keinen Match gefunden
|
|
173
|
-
forward_number_of_token( 1 )
|
|
174
|
-
end
|
|
175
|
-
|
|
176
|
-
@number_of_expected_tokens_in_buffer = 3
|
|
177
|
-
process_buffer if process_buffer?
|
|
178
|
-
return
|
|
179
|
-
end
|
|
180
|
-
end
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
# Prüfe 2er Schlüssel
|
|
184
|
-
result = check_multiword_key( 2 )
|
|
185
|
-
unless result.empty?
|
|
186
|
-
create_and_forward_multiword( 2, result )
|
|
187
|
-
forward_number_of_token( 1 )
|
|
188
|
-
end
|
|
189
|
-
end
|
|
190
|
-
|
|
191
|
-
# Buffer weiterschaufeln
|
|
192
|
-
forward_number_of_token( 1, false )
|
|
193
|
-
@number_of_expected_tokens_in_buffer = 3
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
private
|
|
197
|
-
|
|
198
|
-
def create_and_forward_multiword( len, lexicals )
|
|
199
|
-
# Form aus Buffer auslesen und Teile markieren
|
|
200
|
-
pos = 0
|
|
201
|
-
form_parts = []
|
|
202
|
-
begin
|
|
203
|
-
if @buffer[pos].form == CHAR_PUNCT
|
|
204
|
-
@buffer.delete_at( pos )
|
|
205
|
-
form_parts[-1] += CHAR_PUNCT
|
|
206
|
-
else
|
|
207
|
-
@buffer[pos].attr = WA_UNKMULPART if @buffer[pos].attr == WA_UNKNOWN
|
|
208
|
-
form_parts << @buffer[pos].form
|
|
209
|
-
pos += 1
|
|
210
|
-
end
|
|
211
|
-
end while pos < len
|
|
212
|
-
|
|
213
|
-
form = form_parts.join( ' ' )
|
|
214
|
-
|
|
215
|
-
# Multiword erstellen
|
|
216
|
-
word = Word.new( form, WA_MULTIWORD )
|
|
217
|
-
word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - Ausstieg bei "*5" im Synonymer
|
|
218
|
-
|
|
219
|
-
# Forword Multiword
|
|
220
|
-
forward( word )
|
|
221
|
-
end
|
|
222
|
-
|
|
223
|
-
# Leitet 'len' Token weiter
|
|
224
|
-
def forward_number_of_token( len, count_punc = true )
|
|
225
|
-
begin
|
|
226
|
-
unless @buffer.empty?
|
|
227
|
-
forward( @buffer[0] )
|
|
228
|
-
len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
|
|
229
|
-
@buffer.delete_at( 0 )
|
|
230
|
-
end
|
|
231
|
-
end while len > 0
|
|
232
|
-
end
|
|
233
|
-
|
|
234
|
-
# Ermittelt die maximale Ergebnislänge
|
|
235
|
-
def sort_result_len( result )
|
|
236
|
-
result.collect do |res|
|
|
237
|
-
if res.is_a?( Lexical )
|
|
238
|
-
res.form.split( ' ' ).size
|
|
239
|
-
else
|
|
240
|
-
res =~ /^\*(\d+)/
|
|
241
|
-
$1.to_i
|
|
242
|
-
end
|
|
243
|
-
end.sort.reverse
|
|
244
|
-
end
|
|
245
|
-
|
|
246
|
-
# Prüft einen definiert langen Schlüssel ab Position 0 im Buffer
|
|
247
|
-
def check_multiword_key( len )
|
|
248
|
-
return [] if number_of_valid_tokens_in_buffer < len
|
|
249
|
-
|
|
250
|
-
# Wortformen aus der Wortliste auslesen
|
|
251
|
-
sequence = @buffer.map { |obj|
|
|
252
|
-
next [obj] unless obj.is_a?(StringA)
|
|
253
|
-
|
|
254
|
-
form = obj.form
|
|
255
|
-
next if form == CHAR_PUNCT
|
|
256
|
-
|
|
257
|
-
word = @lex_dic.find_word(form)
|
|
258
|
-
word = @lex_gra.find_compositum(form) if word.attr == WA_UNKNOWN
|
|
259
|
-
|
|
260
|
-
lexicals = word.attr == WA_KOMPOSITUM ?
|
|
261
|
-
[word.lexicals.first] : word.lexicals.dup
|
|
262
|
-
|
|
263
|
-
lexicals << word if lexicals.empty?
|
|
264
|
-
lexicals += @syn_dic.find_synonyms(word) if @syn_dic
|
|
265
|
-
|
|
266
|
-
lexicals.map { |lex| lex.form }.uniq
|
|
267
|
-
}.compact[0, len]
|
|
268
|
-
|
|
269
|
-
if @combine
|
|
270
|
-
keys, muls = [], []
|
|
271
|
-
|
|
272
|
-
sequence.each { |forms|
|
|
273
|
-
keys = forms.map { |form|
|
|
274
|
-
keys.empty? ? form : keys.map { |key| "#{key} #{form}" }
|
|
275
|
-
}.flatten(1)
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
keys.each { |key|
|
|
279
|
-
mul = @mul_dic.select(key.downcase)
|
|
280
|
-
|
|
281
|
-
unless mul.empty?
|
|
282
|
-
muls.concat(mul)
|
|
283
|
-
break unless @all_keys
|
|
284
|
-
end
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
muls.uniq
|
|
288
|
-
else
|
|
289
|
-
key = sequence.map { |forms| forms.first }.join(' ')
|
|
290
|
-
@mul_dic.select(key.downcase)
|
|
291
|
-
end
|
|
292
|
-
end
|
|
293
|
-
|
|
294
|
-
# Liefert die Anzahl gültiger Token zurück
|
|
295
|
-
def number_of_valid_tokens_in_buffer
|
|
296
|
-
@buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
|
|
297
|
-
end
|
|
298
|
-
|
|
299
|
-
end
|
|
300
|
-
|
|
301
|
-
end
|