lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
|
@@ -1,165 +1,162 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
#--
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# Copyright (C) 2007
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
|
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
26
|
-
#
|
|
27
|
-
# Lex Lingo rules from here on
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
28
25
|
#++
|
|
29
26
|
|
|
30
27
|
class Lingo
|
|
31
28
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@eof_handling = false
|
|
89
|
-
|
|
90
|
-
@skip = get_array('skip', "").collect { |wc| wc.downcase }
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
def control(cmd, par)
|
|
94
|
-
@dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
|
|
95
|
-
|
|
96
|
-
# Jedes Control-Object ist auch Auslöser der Verarbeitung
|
|
97
|
-
if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
|
|
98
|
-
@eof_handling = true
|
|
99
|
-
while number_of_valid_tokens_in_buffer > 1
|
|
100
|
-
process_buffer
|
|
101
|
-
end
|
|
102
|
-
forward_number_of_token( @buffer.size, false )
|
|
29
|
+
class Attendee
|
|
30
|
+
|
|
31
|
+
# Der Dehyphenizer ... muss noch dokumentiert werden
|
|
32
|
+
#
|
|
33
|
+
# === Mögliche Verlinkung
|
|
34
|
+
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
|
35
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
|
|
36
|
+
#
|
|
37
|
+
# === Parameter
|
|
38
|
+
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
|
39
|
+
# Alle anderen Parameter müssen zwingend angegeben werden.
|
|
40
|
+
# <b>in</b>:: siehe allgemeine Beschreibung des Attendee
|
|
41
|
+
# <b>out</b>:: siehe allgemeine Beschreibung des Attendee
|
|
42
|
+
# <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
|
|
43
|
+
# <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
|
|
44
|
+
# <b><i>stopper</i></b>:: (Standard: TA_PUNCTUATION, TA_OTHER) Gibt die Begrenzungen an, zwischen
|
|
45
|
+
# denen der Multiworder suchen soll, i.d.R. Satzzeichen und Sonderzeichen,
|
|
46
|
+
# weil sie kaum in einer Mehrwortgruppe vorkommen.
|
|
47
|
+
#
|
|
48
|
+
# === Beispiele
|
|
49
|
+
# Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
|
|
50
|
+
# meeting:
|
|
51
|
+
# attendees:
|
|
52
|
+
# - text_reader: { out: lines, files: '$(files)' }
|
|
53
|
+
# - tokenizer: { in: lines, out: token }
|
|
54
|
+
# - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
|
|
55
|
+
# - word_searcher: { in: abbrev, out: words, source: 'sys-dic' }
|
|
56
|
+
# - decomposer: { in: words, out: comps, source: 'sys-dic' }
|
|
57
|
+
# - multi_worder: { in: comps, out: multi, source: 'sys-mul' }
|
|
58
|
+
# - debugger: { in: multi, prompt: 'out>' }
|
|
59
|
+
# ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
|
|
60
|
+
# out> *FILE('test.txt')
|
|
61
|
+
# out> <Sein = [(sein/s), (sein/v)]>
|
|
62
|
+
# out> <Name = [(name/s)]>
|
|
63
|
+
# out> <ist = [(sein/v)]>
|
|
64
|
+
# out> <johann van siegen|MUL = [(johann van siegen/m)]>
|
|
65
|
+
# out> <Johann = [(johann/e)]>
|
|
66
|
+
# out> <van = [(van/w)]>
|
|
67
|
+
# out> <Siegen = [(sieg/s), (siegen/v), (siegen/e)]>
|
|
68
|
+
# out> :./PUNC:
|
|
69
|
+
# out> *EOL('test.txt')
|
|
70
|
+
# out> *EOF('test.txt')
|
|
71
|
+
|
|
72
|
+
class Dehyphenizer < BufferedAttendee
|
|
73
|
+
|
|
74
|
+
protected
|
|
75
|
+
|
|
76
|
+
# Reads the attendee options and prepares the state used while joining
# hyphenated words.
#
# Sets:
#   @stopper - upper-cased entries of the 'stopper' option
#              (default: TA_PUNCTUATION and TA_OTHER)
#   @skip    - lower-cased entries of the 'skip' option (default: none);
#              consulted by process_buffer
#   @number_of_expected_tokens_in_buffer - 2
#   @eof_handling - false
#
# NOTE(review): set_dic / set_gra presumably provide @dic and @gra, which
# process_buffer uses -- confirm in the Attendee base class.
def init
  stopper_default = TA_PUNCTUATION + ',' + TA_OTHER
  @stopper = get_array('stopper', stopper_default).map { |entry| entry.upcase }

  set_dic
  set_gra

  @skip = get_array('skip', '').map { |entry| entry.downcase }

  @number_of_expected_tokens_in_buffer = 2
  @eof_handling = false
end
|
|
105
|
-
end
|
|
106
87
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
# Einfache Zusammensetzung versuchen
|
|
119
|
-
form = @buffer[0].form[0...-1] + @buffer[1].form
|
|
120
|
-
word = @dic.find_word( form )
|
|
121
|
-
word = @gra.find_compositum( form ) unless word.attr == WA_IDENTIFIED
|
|
122
|
-
|
|
123
|
-
unless word.attr == WA_IDENTIFIED || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
|
|
124
|
-
# Zusammensetzung mit Bindestrich versuchen
|
|
125
|
-
form = @buffer[0].form + @buffer[1].form
|
|
126
|
-
word = @dic.find_word( form )
|
|
127
|
-
word = @gra.find_compositum( form ) unless word.attr == WA_IDENTIFIED
|
|
88
|
+
# Handles a control command.
#
# STR_CMD_STATUS: every key/value pair of @dic.report is handed to set.
# STR_CMD_RECORD / STR_CMD_EOF: the buffer is processed for as long as it
# holds more than one non-punctuation token, and whatever remains is
# forwarded; @eof_handling is true for the duration of that work.
#
# cmd - the command being signalled
# par - the command's parameter (not used here)
def control(cmd, par)
  if cmd == STR_CMD_STATUS
    @dic.report.each_pair { |name, value| set(name, value) }
  end

  # Only record and end-of-file commands trigger processing
  return unless cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF

  @eof_handling = true
  process_buffer while number_of_valid_tokens_in_buffer > 1
  forward_number_of_token(@buffer.size, false)
  @eof_handling = false
end
|
|
129
101
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
word = @dic.find_word( form )
|
|
134
|
-
word = @gra.find_compositum( form ) unless word.attr == WA_IDENTIFIED
|
|
135
|
-
end
|
|
102
|
+
# Tells whether the buffer holds at least as many non-punctuation tokens
# as @number_of_expected_tokens_in_buffer.
def process_buffer?
  available = number_of_valid_tokens_in_buffer
  available >= @number_of_expected_tokens_in_buffer
end
|
|
136
105
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
@buffer.
|
|
106
|
+
# Tries to merge a word ending in a hyphen with the word that follows it,
# then forwards the first buffered token.
#
# The merge is attempted only when the first two buffer entries are Words,
# the first form ends in '-', and the first lexical of the second word does
# not carry an attribute listed in @skip. A second word without any lexical
# is never skipped (previously this case raised NoMethodError on nil).
#
# Two candidate forms are looked up, in this order:
#   1. without the hyphen  (first form minus its last character + second form)
#   2. with the hyphen     (first form + second form)
# Each candidate is looked up once; the former duplicate of step 2 repeated
# the same lookup and has been dropped.
#
# On success the merged word replaces the first buffer entry and the second
# entry is removed.
def process_buffer
  head, tail = @buffer[0], @buffer[1]

  # Dictionary lookup with fallback to compound analysis
  lookup = lambda do |form|
    word = @dic.find_word(form)
    word.identified? ? word : @gra.find_compositum(form)
  end

  # A result counts when it is identified, or is a compound without 'x+' parts
  accepted = lambda do |word|
    word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
  end

  # True when the first lexical of the word has an attribute listed in @skip
  skipped = lambda do |word|
    lexical = Array(word.get_class(/./)).first
    !lexical.nil? && @skip.include?(lexical.attr)
  end

  if head.is_a?(Word) && tail.is_a?(Word) &&
     head.form.end_with?('-') && !skipped.call(tail)

    # Try the simple composition first (hyphen removed)
    word = lookup.call(head.form[0...-1] + tail.form)

    # Otherwise try the composition that keeps the hyphen
    word = lookup.call(head.form + tail.form) unless accepted.call(word)

    if accepted.call(word)
      @buffer[0] = word
      @buffer.delete_at(1)
    end
  end

  # Move the buffer along by one token
  forward_number_of_token(1, false)
end
|
|
146
141
|
|
|
147
|
-
|
|
142
|
+
private
|
|
143
|
+
|
|
144
|
+
# Leitet 'len' Token weiter
|
|
145
|
+
# Forwards tokens from the front of @buffer and removes them from it.
#
# len        - number of tokens to forward. At least one token is forwarded
#              whenever the buffer is not empty, even for len <= 0.
# count_punc - when true (default), tokens whose form is CHAR_PUNCT are
#              forwarded but do not count towards len; when false every
#              token counts.
#
# Stops as soon as the buffer is exhausted. The previous do-while loop kept
# spinning forever when the buffer ran empty before len reached zero.
def forward_number_of_token( len, count_punc = true )
  until @buffer.empty?
    token = @buffer.first
    forward(token)
    len -= 1 unless count_punc && token.form == CHAR_PUNCT
    @buffer.shift
    break unless len > 0
  end
end
|
|
148
154
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
forward( @buffer[0] )
|
|
154
|
-
len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
|
|
155
|
-
@buffer.delete_at( 0 )
|
|
156
|
-
end
|
|
157
|
-
end while len > 0
|
|
158
|
-
end
|
|
155
|
+
# Liefert die Anzahl gültiger Token zurück
|
|
156
|
+
# Returns how many buffered tokens have a form other than CHAR_PUNCT.
def number_of_valid_tokens_in_buffer
  @buffer.count { |token| token.form != CHAR_PUNCT }
end
|
|
159
159
|
|
|
160
|
-
# Liefert die Anzahl gültiger Token zurück
|
|
161
|
-
def number_of_valid_tokens_in_buffer
|
|
162
|
-
@buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
|
|
163
160
|
end
|
|
164
161
|
|
|
165
162
|
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Attendee
|
|
30
|
+
|
|
31
|
+
# Writes every incoming object as text; Words and Tokens can additionally
# be rendered through a format string.
#
# Options read in init:
#   ext    - default '-'
#   format - format string applied to [form, lexical form, mapped attribute],
#            default '%s'
#   map    - maps a lexical's attribute to the value put into the format;
#            the default maps every attribute to itself
#
# NOTE(review): @ext, @no_puts, @lir, @lir_rec_buf and @file are presumably
# consumed or provided by TextWriter -- confirm there.
class Formatter < TextWriter

  protected

  def init
    super

    @ext    = get_key('ext', '-')
    @format = get_key('format', '%s')
    @map    = get_key('map', Hash.new { |hash, attr| hash[attr] = attr })

    @no_puts = true
  end

  # Renders +obj+ and appends the text to @lir_rec_buf when @lir is set,
  # otherwise prints it to @file.
  #
  # Words and Tokens are rendered as their form. If the object responds to
  # lexicals and the attribute of its first lexical has a truthy entry in
  # @map, the text is @format applied to [form, lexical form, mapped value].
  # Anything else is rendered with to_s.
  def process(obj)
    if obj.is_a?(Word) || obj.is_a?(Token)
      str = obj.form

      # TODO: only the first lexical is taken into account
      lex = obj.lexicals.first if obj.respond_to?(:lexicals)
      att = @map[lex.attr] if lex

      str = @format % [str, lex.form, att] if att
    else
      str = obj.to_s
    end

    if @lir
      @lir_rec_buf << str
    else
      @file.print(str)
    end
  end

end
|
|
62
|
+
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
end
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Attendee
|
|
30
|
+
|
|
31
|
+
# Mit der bisher beschriebenen Vorgehensweise werden die durch den Tokenizer erkannten
|
|
32
|
+
# Token aufgelöst und in Words verwandelt und über den Abbreviator und Decomposer auch
|
|
33
|
+
# Spezialfälle behandelt, die einzelne Wörter betreffen.
|
|
34
|
+
# Um jedoch auch Namen wie z.B. John F. Kennedy als Sinneinheit erkennen zu können, muss
|
|
35
|
+
# eine Analyse über mehrere Objekte erfolgen. Dies ist die Hauptaufgabe des MultiWorders.
|
|
36
|
+
# Der MultiWorder analysiert die Teile des Datenstroms, die z.B. durch Satzzeichen oder
|
|
37
|
+
# weiteren Einzelzeichen (z.B. '(') begrenzt sind. Erkannte Mehrwortgruppen werden als
|
|
38
|
+
# zusätzliches Objekt in den Datenstrom mit eingefügt.
|
|
39
|
+
#
|
|
40
|
+
# === Mögliche Verlinkung
|
|
41
|
+
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, MultiWorder
|
|
42
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
|
|
43
|
+
#
|
|
44
|
+
# === Parameter
|
|
45
|
+
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
|
46
|
+
# Alle anderen Parameter müssen zwingend angegeben werden.
|
|
47
|
+
# <b>in</b>:: siehe allgemeine Beschreibung des Attendee
|
|
48
|
+
# <b>out</b>:: siehe allgemeine Beschreibung des Attendee
|
|
49
|
+
# <b>source</b>:: siehe allgemeine Beschreibung des Dictionary
|
|
50
|
+
# <b><i>mode</i></b>:: (Standard: all) siehe allgemeine Beschreibung des Dictionary
|
|
51
|
+
# <b><i>stopper</i></b>:: (Standard: TA_PUNCTUATION, TA_OTHER) Gibt die Begrenzungen an, zwischen
|
|
52
|
+
# denen der MultiWorder suchen soll, i.d.R. Satzzeichen und Sonderzeichen,
|
|
53
|
+
# weil sie kaum in einer Mehrwortgruppe vorkommen.
|
|
54
|
+
#
|
|
55
|
+
# === Beispiele
|
|
56
|
+
# Bei der Verarbeitung einer normalen Textdatei mit der Ablaufkonfiguration <tt>t1.cfg</tt>
|
|
57
|
+
# meeting:
|
|
58
|
+
# attendees:
|
|
59
|
+
# - text_reader: { out: lines, files: '$(files)' }
|
|
60
|
+
# - tokenizer: { in: lines, out: token }
|
|
61
|
+
# - abbreviator: { in: token, out: abbrev, source: 'sys-abk' }
|
|
62
|
+
# - word_searcher: { in: abbrev, out: words, source: 'sys-dic' }
|
|
63
|
+
# - decomposer: { in: words, out: comps, source: 'sys-dic' }
|
|
64
|
+
# - multi_worder: { in: comps, out: multi, source: 'sys-mul' }
|
|
65
|
+
# - debugger: { in: multi, prompt: 'out>' }
|
|
66
|
+
# ergibt die Ausgabe über den Debugger: <tt>lingo -c t1 test.txt</tt>
|
|
67
|
+
# out> *FILE('test.txt')
|
|
68
|
+
# out> <Sein = [(sein/s), (sein/v)]>
|
|
69
|
+
# out> <Name = [(name/s)]>
|
|
70
|
+
# out> <ist = [(sein/v)]>
|
|
71
|
+
# out> <johann van siegen|MUL = [(johann van siegen/m)]>
|
|
72
|
+
# out> <Johann = [(johann/e)]>
|
|
73
|
+
# out> <van = [(van/w)]>
|
|
74
|
+
# out> <Siegen = [(sieg/s), (siegen/v), (siegen/e)]>
|
|
75
|
+
# out> :./PUNC:
|
|
76
|
+
# out> *EOL('test.txt')
|
|
77
|
+
# out> *EOF('test.txt')
|
|
78
|
+
|
|
79
|
+
class MultiWorder < BufferedAttendee
|
|
80
|
+
|
|
81
|
+
protected
|
|
82
|
+
|
|
83
|
+
# Reads the attendee options and opens the dictionaries needed for
# multiword detection.
#
# Options read: 'stopper', 'source', 'mode' (default 'all'), 'combine'
# (default false), 'lex-mode', and -- only when 'combine' is set and
# 'use-syn' is present -- 'use-syn' and 'syn-mode' (default 'all').
#
# 'combine' controls whether lexical variants are combined:
#   false - old behaviour
#   true  - first match
#   'all' - all matches (sets @all_keys)
#
# The lexical source and mode are taken from the 'use-lex' / 'lex-mode'
# entries of the configured multiword databases; a warning is issued when
# the databases name different 'use-lex' sources.
def init
  @stopper = get_array('stopper', TA_PUNCTUATION + ',' + TA_OTHER).map { |entry| entry.upcase }

  sources  = get_array('source')
  @mul_dic = dictionary(sources, get_key('mode', 'all'))

  @combine  = get_key('combine', false)
  @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'

  lex_src = lex_mod = nil
  db_config = @lingo.dictionary_config['databases']

  sources.each do |source|
    use_lex, lex_mode = db_config[source].values_at('use-lex', 'lex-mode')

    if lex_src.nil? || lex_src == use_lex
      lex_src, lex_mod = use_lex, lex_mode
    else
      @lingo.warn "#{self.class}: Dictionaries don't match: #{sources.join(',')}"
    end
  end

  lex_src = lex_src.split(STRING_SEPARATOR_RE)
  lex_mod = get_key('lex-mode', lex_mod || 'first')

  @lex_dic = dictionary(lex_src, lex_mod)
  @lex_gra = grammar(lex_src, lex_mod)

  if @combine && has_key?('use-syn')
    @syn_dic = dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
  end

  @number_of_expected_tokens_in_buffer = 3
  @eof_handling = false
end
|
|
119
|
+
|
|
120
|
+
# Handles a control command.
#
# STR_CMD_STATUS: every key/value pair of @mul_dic.report is handed to set.
# STR_CMD_RECORD / STR_CMD_EOF: the buffer is processed for as long as it
# holds more than one non-punctuation token, and whatever remains is
# forwarded; @eof_handling is true for the duration of that work.
#
# cmd - the command being signalled
# par - the command's parameter (not used here)
def control(cmd, par)
  if cmd == STR_CMD_STATUS
    @mul_dic.report.each_pair { |name, value| set(name, value) }
  end

  # Only record and end-of-file commands trigger processing
  return unless cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF

  @eof_handling = true
  process_buffer while number_of_valid_tokens_in_buffer > 1
  forward_number_of_token(@buffer.size, false)
  @eof_handling = false
end
|
|
133
|
+
|
|
134
|
+
# Tells whether the buffer holds at least as many non-punctuation tokens
# as @number_of_expected_tokens_in_buffer.
def process_buffer?
  available = number_of_valid_tokens_in_buffer
  available >= @number_of_expected_tokens_in_buffer
end
|
|
137
|
+
|
|
138
|
+
# Looks for a multiword starting at the front of the buffer and forwards
# the tokens that have been dealt with.
#
# Unless the first token is punctuation:
#   * A three-word key is checked first. If it hits and no hit is longer
#     than three words, the multiword is emitted and three tokens are
#     forwarded.
#   * If a hit announces a longer key and the buffer is still too short
#     (and no record/EOF is being handled), the expected buffer size is
#     raised and the method returns to wait for more tokens.
#   * Otherwise the announced lengths are tried, longest first; the first
#     one that hits is emitted and forwarded, else one token is forwarded.
#     Processing then continues while enough tokens are buffered.
#   * Without a three-word hit a two-word key is checked.
# Finally one token is forwarded and the expected buffer size is reset to 3.
def process_buffer
  unless @buffer[0].form == CHAR_PUNCT
    # Check the three-word key
    hits = check_multiword_key(3)

    unless hits.empty?
      sizes = sort_result_len(hits)

      # Longest recognised key has (at most) three words
      if sizes[0] <= 3
        create_and_forward_multiword(3, hits)
        forward_number_of_token(3)
        return
      end

      # Longer key announced, but the buffer is not full enough yet
      if @buffer.size < sizes[0] && !@eof_handling
        @number_of_expected_tokens_in_buffer = sizes[0]
        return
      end

      # Buffer is full enough: take the first announced length that hits
      size = sizes.find { |candidate| !(hits = check_multiword_key(candidate)).empty? }

      if size
        create_and_forward_multiword(size, hits)
        forward_number_of_token(size)
      else
        # No match found
        forward_number_of_token(1)
      end

      @number_of_expected_tokens_in_buffer = 3
      process_buffer if process_buffer?
      return
    end

    # Check the two-word key
    hits = check_multiword_key(2)

    unless hits.empty?
      create_and_forward_multiword(2, hits)
      forward_number_of_token(1)
    end
  end

  # Move the buffer along by one token
  forward_number_of_token(1, false)
  @number_of_expected_tokens_in_buffer = 3
end
|
|
190
|
+
|
|
191
|
+
private
|
|
192
|
+
|
|
193
|
+
# Builds a multiword from the first +len+ words in the buffer and forwards it.
#
# len      - number of word forms that make up the multiword
# lexicals - lookup results; only entries that are Lexicals are attached
#
# Punctuation tokens met on the way are removed from the buffer and appended
# to the preceding form part. Words that are unknown? get their attribute
# set to WA_UNKMULPART. The word forms themselves stay in the buffer.
def create_and_forward_multiword( len, lexicals )
  parts = []
  index = 0

  # Read the form from the buffer and mark its parts
  begin
    token = @buffer[index]

    if token.form == CHAR_PUNCT
      @buffer.delete_at(index)
      parts[-1] += CHAR_PUNCT
    else
      token.attr = WA_UNKMULPART if token.unknown?
      parts << token.form
      index += 1
    end
  end until index >= len

  # Create the multiword
  multiword = Word.new(parts.join(' '), WA_MULTIWORD)

  # FIXME 1.60 - non-Lexical results (e.g. "*5") are left out here
  multiword << lexicals.select { |lex| lex.is_a?(Lexical) }

  # Forward the multiword
  forward(multiword)
end
|
|
217
|
+
|
|
218
|
+
# Leitet 'len' Token weiter
|
|
219
|
+
# Forwards tokens from the front of @buffer and removes them from it.
#
# len        - number of tokens to forward. At least one token is forwarded
#              whenever the buffer is not empty, even for len <= 0.
# count_punc - when true (default), tokens whose form is CHAR_PUNCT are
#              forwarded but do not count towards len; when false every
#              token counts.
#
# Stops as soon as the buffer is exhausted. The previous do-while loop kept
# spinning forever when the buffer ran empty before len reached zero.
def forward_number_of_token( len, count_punc = true )
  until @buffer.empty?
    token = @buffer.first
    forward(token)
    len -= 1 unless count_punc && token.form == CHAR_PUNCT
    @buffer.shift
    break unless len > 0
  end
end
|
|
228
|
+
|
|
229
|
+
# Ermittelt die maximale Ergebnislänge
|
|
230
|
+
# Returns the key lengths announced by a lookup result, longest first.
#
# result - array of lookup results. A Lexical contributes the number of
#          space-separated words in its form; any other entry contributes
#          the number following a leading '*' (e.g. "*4" => 4), or 0 when
#          it has no such prefix.
#
# The number is extracted with String#[] instead of the global $1, so an
# entry that does not match can never pick up the capture of an earlier one.
def sort_result_len( result )
  lengths = result.map do |res|
    if res.is_a?( Lexical )
      res.form.split( ' ' ).size
    else
      res.to_s[/^\*(\d+)/, 1].to_i
    end
  end

  lengths.sort.reverse
end
|
|
240
|
+
|
|
241
|
+
# Prüft einen definiert langen Schlüssel ab Position 0 im Buffer
|
|
242
|
+
# Checks a key of +len+ words, starting at position 0 of the buffer,
# against the multiword dictionary.
#
# Returns [] right away when the buffer holds fewer than +len+
# non-punctuation tokens; otherwise returns what @mul_dic.select yields
# for the key (or, with @combine set, for the combined keys).
def check_multiword_key( len )
  return [] if number_of_valid_tokens_in_buffer < len

  # Collect the candidate forms for every buffered object
  sequence = @buffer.map { |obj|
    # Objects that are not WordForms stand in for themselves
    next [obj] unless obj.is_a?(WordForm)

    form = obj.form
    # Punctuation yields nil and is removed by compact below
    next if form == CHAR_PUNCT

    word = @lex_dic.find_word(form)
    word = @lex_gra.find_compositum(form) if word.unknown?

    # A compound contributes only its first lexical
    lexicals = word.attr == WA_KOMPOSITUM ?
      [word.lexicals.first] : word.lexicals.dup

    # Fall back to the word itself when there are no lexicals
    lexicals << word if lexicals.empty?
    lexicals += @syn_dic.find_synonyms(word) if @syn_dic

    lexicals.map { |lex| lex.form }.uniq
  }.compact[0, len]

  if @combine
    keys, muls = [], []

    # Build every combination of candidate forms, position by position
    sequence.each { |forms|
      keys = forms.map { |form|
        keys.empty? ? form : keys.map { |key| "#{key} #{form}" }
      }.flatten(1)
    }

    keys.each { |key|
      mul = @mul_dic.select(key.downcase)

      unless mul.empty?
        muls.concat(mul)
        # Stop at the first key that hits unless all keys are wanted
        break unless @all_keys
      end
    }

    muls.uniq
  else
    # Only the first candidate form of each position makes up the key
    key = sequence.map { |forms| forms.first }.join(' ')
    @mul_dic.select(key.downcase)
  end
end
|
|
288
|
+
|
|
289
|
+
# Liefert die Anzahl gültiger Token zurück
|
|
290
|
+
# Returns how many buffered tokens have a form other than CHAR_PUNCT.
def number_of_valid_tokens_in_buffer
  @buffer.count { |token| token.form != CHAR_PUNCT }
end
|
|
293
|
+
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
# For backwards compatibility.
|
|
297
|
+
Multiworder = MultiWorder
|
|
298
|
+
Multi_worder = MultiWorder
|
|
299
|
+
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
end
|