lingo 1.8.1 → 1.8.2
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/ChangeLog
CHANGED
@@ -1,5 +1,23 @@
 = Revision history for Lingo
 
+== 1.8.2 [2012-04-19]
+
+* Performance improvements regarding Attendee::VectorFilter's (as well as
+  Attendee::NonewordFilter's) memory usage; set <tt>sort: false</tt> in the config.
+* Added Attendee::Stemmer (implementing Porter's algorithm for suffix stripping).
+* Added progress reporting to Attendee::TextReader; set <tt>progress: true</tt>
+  in the config.
+* Added directory and glob processing to Attendee::TextReader (new options
+  +glob+ and +recursive+).
+* Renamed Attendee::TextReader's option +lir-record-pattern+ to +records+.
+* Fixed Attendee::Debugger to forward all objects so it can be inserted
+  between any two attendees in the config.
+* Fixed regression introduced in 1.8.0 where Lingo would not use existing
+  compiled dictionary when source file is not present.
+* Fixed "invalid byte sequence in UTF-8" on Windows for SDBM store.
+* Enabled pluggable (compiled) dictionaries and storage backends.
+* Extensive internal refactoring and cleanup. (Finished for now.)
+
 == 1.8.1 [2012-02-19]
 
 * Introduced alternative storage backends, mainly to circumvent SDBM's record

@@ -62,13 +80,13 @@
   (requires diff-lcs[http://raa.ruby-lang.org/project/diff-lcs/]).
 * Provide alternatives to standard zip command on windows platforms.
 * Use +UNK+ itself if it doesn't have any lexicals.
-* Use compo form instead of word form when lexicalizing
+* Use compo form instead of word form when lexicalizing compound entry for
   multiword dictionaries.
 * LexicalHash#[] must use target (not source) form.
 * Optionally, try to find matches for all lexicals a word has.
 * Make '-' a PRINTABLE_CHAR.
 * Allow synonyms to be considered for multiword matching.
-* Don't use
+* Don't use compound parts.
 * Introduced some (more or less arbitrary) line length limit. We can only
   store values of a certain length anyway (with SDBM). Entries exceeding this
   limit will be rejected and logged in the .rev file.

@@ -252,12 +270,12 @@
 * <b>Decomposer with additional validity check and marking</b>
 
   On demand, the Decomposer can subject compounds to an additional check.
-  If the key <tt>de.lang:language/dictionary/
+  If the key <tt>de.lang:language/dictionary/compound/skip-sequences</tt>
   is given, defined e.g. in the form <tt>skip-sequences: [ VS ]</tt>, it is also
   checked whether the compound and its parts match these word classes; compounds
   consisting of a verb-noun combination are then discarded. This parameter is
   optional.
-  Additionally, given the key <tt>de.lang:language/dictionary/
+  Additionally, given the key <tt>de.lang:language/dictionary/compound/append-wordclass</tt>,
   which usually holds a one-character string, the word stems recognized through
   decomposition are marked by appending the character defined via this key to
   their word class.

@@ -476,7 +494,7 @@
 * <b>Compound decomposition with a further restriction</b>
 
   A further parameter has been added for compound decomposition. As
-  attributes of the tag <tt>XML:dictionary/
+  attributes of the tag <tt>XML:dictionary/compound</tt> the following can now be given:
 
   Attribute   Default   Function
   ============================================================================
data/README
CHANGED
data/Rakefile
CHANGED
@@ -39,10 +39,8 @@ The main functions of Lingo are:
     of word classes
 EOT
   extra_files: FileList[
-    'lingo.rb', 'lingo{,-
-    '{de,en}
-    'info/gpl-hdr.txt', 'info/*.png', 'lir.cfg', 'txt/lir.txt', 'porter/*',
-    'test.cfg', '{de,en}/test_*.txt'
+    'lingo.rb', 'lingo{,-call}.cfg', 'lir.cfg', '{de,en}.lang',
+    '{de,en}/{lingo-*,user-dic,test_*}.txt', 'txt/{artikel{,-en},lir}.txt'
   ].to_a,
   required_ruby_version: '>= 1.9',
   dependencies: [['ruby-nuggets', '>= 0.8.5'], 'unicode', 'highline'],

@@ -54,7 +52,7 @@ rescue LoadError => err
 end
 
 CLEAN.include(
-  'txt/*.{log,mul,non,seq,syn,ve
+  'txt/*.{log,mul,non,seq,ste,syn,ve?}',
   'test/{test.*,text.non}',
   'store/*/*.rev',
   'bench/tmp.*'

@@ -78,7 +76,7 @@ end
 
 desc 'Test against reference file (TXT)'
 task 'test:txt' do
-  test_ref('artikel', '
+  test_ref('artikel', 'lingo')
 end
 
 desc 'Test against reference file (LIR)'

@@ -116,7 +114,7 @@ def test_ref(name, cfg = name)
   }.success? or abort msg.join("\n\n")
 
   Dir["test/ref/#{name}.*"].each { |ref|
-    puts "
+    puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
     continue += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
   }
 
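The last hunk drives diff-lcs programmatically instead of shelling out.
Diff::LCS::Ldiff.run takes an ldiff-style argument array and returns a
diff(1)-style status (zero when the inputs match), which is why the Rakefile
sums it into continue. A standalone sketch with throwaway files:

    require 'diff/lcs/ldiff' # from the diff-lcs gem

    File.write('a.txt', "foo\nbar\n")
    File.write('b.txt', "foo\nbaz\n")

    # '-a' forces text mode, mirroring the Rakefile's invocation;
    # the diff itself is printed to stdout.
    status = Diff::LCS::Ldiff.run(['-a', 'a.txt', 'b.txt'])
    puts 'files differ' unless status.zero?
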
data/TODO
CHANGED
@@ -1,5 +1,7 @@
 = ToDo list for Lingo
 
+* Configuration parameter validation.
+* Replace regex-based tokenizer with a (Racc/Ragel/ANTLR-based?) lexer.
 * Update and translate old documentation.
 * Allow for handling of documents in various encodings, not just the one the
   dictionaries are encoded in.
data/bin/lingo
CHANGED
data/de.lang
CHANGED
@@ -65,7 +65,7 @@ language:
       tst-cry: { name: de/test_cry.txt, txt-format: WordClass, crypt } # TEST: encryption
       tst-sgw: { name: de/test_singleword.txt, txt-format: SingleWord } # TEST: SingleWord format
 
-    compositum:
+    compound:
       min-word-size: "7"
       min-part-size: "3"
       max-parts: "5"
data/en/lingo-syn.txt
ADDED
File without changes
data/en.lang
CHANGED
@@ -52,12 +52,13 @@ language:
 
       # System dictionaries
       sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
+      sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
       sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
 
       # User dictionaries
       usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
 
-    compositum:
+    compound:
       min-word-size: "7"
       min-part-size: "3"
       max-parts: "5"
data/lib/lingo/attendee/abbreviator.rb
CHANGED

@@ -76,17 +76,15 @@ class Lingo
       set_dic
     end
 
-    def control(cmd,
-
-
-      # Every control object also triggers processing
+    def control(cmd, param)
+      report_on(cmd, @dic)
       process_buffer
     end
 
     private
 
     def process_buffer?
-
+      form_at(-1, Token) == CHAR_PUNCT
     end
 
     def process_buffer

@@ -95,13 +93,14 @@ class Lingo
         return
       end
 
-
-      if @buffer[-2].kind_of?(Token)
+      if form = form_at(-2, Token)
         inc('Anzahl gesuchter Abkürzungen')
-
-        if abbr.identified?
+
+        if (abbr = find_word(form)).identified?
           inc('Anzahl gefundener Abkürzungen')
+
           abbr.form += CHAR_PUNCT
+
           @buffer[-2] = abbr
           @buffer.delete_at(-1)
         end
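Both hunks above replace raw @buffer indexing with a form_at helper (defined in
buffered_attendee.rb, +36 -10 in this release). A hypothetical sketch of what such
a helper might look like -- an assumption for illustration, not Lingo's actual code:

    # Hypothetical form_at-style helper: return the form of the buffered
    # element at the given index, provided it is of the expected class.
    # (Assumption -- Lingo's real version lives in buffered_attendee.rb.)
    Token = Struct.new(:form)

    def form_at(buffer, index, klass = Object)
      obj = buffer[index]
      obj.form if obj.is_a?(klass) # nil when the slot holds something else
    end

    buffer = [Token.new('etc'), Token.new('.')]
    p form_at(buffer, -1, Token)   # => "."
    p form_at(buffer, -1, Integer) # => nil
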
data/lib/lingo/attendee/debugger.rb
CHANGED

@@ -95,14 +95,15 @@ class Lingo
       @prompt = get_key('prompt', 'lex:) ')
     end
 
-    def control(cmd,
-      if cmd != STR_CMD_STATUS
-
+    def control(cmd, param)
+      if cmd != STR_CMD_STATUS && eval(@cmd_eval)
+        warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
       end
     end
 
     def process(obj)
-
+      warn "#{@prompt} #{obj.inspect}" if eval(@obj_eval)
+      forward(obj)
     end
 
 end
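The Debugger now gates its output on eval'd expression strings and always forwards
the object afterwards, which is the ChangeLog's "can be inserted between any two
attendees" fix. A sketch of the eval-gating idea in isolation -- the expression
string and names here are stand-ins, not Lingo's config keys:

    # Eval-gated tracing: a config-supplied Ruby expression (here a stand-in)
    # decides per object whether to print; the object is always forwarded.
    def trace(obj, expr, prompt = 'lex:) ')
      warn "#{prompt} #{obj.inspect}" if eval(expr)
      obj # forwarding unconditionally is what the fix guarantees
    end

    trace('a token', 'obj.is_a?(String)') # warns and returns the object
    trace(42,        'obj.is_a?(String)') # silent, still returns the object
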
data/lib/lingo/attendee/decomposer.rb
CHANGED

@@ -79,12 +79,17 @@ class Lingo
       set_gra
     end
 
-    def control(cmd,
-
+    def control(cmd, param)
+      report_on(cmd, @gra)
     end
 
     def process(obj)
-
+      if obj.is_a?(Word) && obj.unknown?
+        com = @gra.find_compound(obj.form)
+        obj = com unless com.unknown?
+      end
+
+      forward(obj)
     end
 
 end
data/lib/lingo/attendee/dehyphenizer.rb
CHANGED

@@ -41,9 +41,6 @@ class Lingo
 # <b>out</b>:: see the general description of the Attendee
 # <b>source</b>:: see the general description of the Dictionary
 # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-# <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the boundaries
-#   between which the Multiworder should search, usually punctuation and special
-#   characters, since they hardly ever occur in multiword groups.
 #
 # === Examples
 # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>

@@ -74,87 +71,46 @@ class Lingo
     protected
 
     def init
-      @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-
       set_dic
       set_gra
 
-      @skip = get_array('skip', ''
-
-      @number_of_expected_tokens_in_buffer = 2
-      @eof_handling = false
-    end
+      @skip = get_array('skip', '', :downcase)
 
-
-      @dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
-
-      # Every control object also triggers processing
-      if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-        @eof_handling = true
-        while number_of_valid_tokens_in_buffer > 1
-          process_buffer
-        end
-        forward_number_of_token( @buffer.size, false )
-        @eof_handling = false
-      end
+      @expected_tokens_in_buffer, @eof_handling = 2, false
     end
 
-    def
-
+    def control(cmd, param)
+      control_multi(cmd)
     end
 
     def process_buffer
-
-      @buffer[0].form[-1..-1] == '-' &&
-      @buffer[1].is_a?(Word) &&
-      !(!( ttt = @buffer[1].get_class(/./) ).nil? &&
-      !@skip.index( ttt[0].attr ).nil?)
-
-        # Try simple joining
-        form = @buffer[0].form[0...-1] + @buffer[1].form
-        word = @dic.find_word(form)
-        word = @gra.find_compositum(form) unless word.identified?
-
-        unless word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
-          # Try joining with the hyphen kept
-          form = @buffer[0].form + @buffer[1].form
-          word = @dic.find_word(form)
-          word = @gra.find_compositum(form) unless word.identified?
-        end
+      a, b, h = *ab = @buffer.values_at(0, 1), '-'
 
-
-
-
-
-        word = @gra.find_compositum(form) unless word.identified?
-      end
+      if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
+        (c = b.get_class(/./).first) && @skip.include?(c.attr)
+      )
+        a, b = ab.map!(&:form)
 
-
+        word = dehyphenize(a.chomp(h) + b)
+        word = dehyphenize(a + b) unless dehyphenized?(word)
+
+        if dehyphenized?(word)
           @buffer[0] = word
-          @buffer.delete_at(
+          @buffer.delete_at(1)
         end
       end
 
-
-      forward_number_of_token( 1, false )
+      forward_number_of_token(1, false)
     end
 
     private
 
-
-
-      begin
-        unless @buffer.empty?
-          forward( @buffer[0] )
-          len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-          @buffer.delete_at( 0 )
-        end
-      end while len > 0
+    def dehyphenize(form)
+      find_word(form, &:identified?)
     end
 
-
-
-      @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
+    def dehyphenized?(word)
+      word.identified? || word.full_compound?
     end
 
 end
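The rewritten process_buffer tries two candidate joins for a hyphenated line break:
first with the trailing hyphen stripped, then with it kept. The candidate generation
itself is plain string work and can be shown standalone (the lookup, of course,
needs Lingo's dictionaries):

    # Candidate forms the Dehyphenizer probes, mirroring the diff above:
    # "Straßen-" + "bahn" first yields "Straßenbahn"; only if that lookup
    # fails is the hyphenated form "Straßen-bahn" tried.
    def dehyphenation_candidates(first, second, hyphen = '-')
      return [] unless first.end_with?(hyphen)
      [first.chomp(hyphen) + second, first + second]
    end

    p dehyphenation_candidates('Straßen-', 'bahn')
    # => ["Straßenbahn", "Straßen-bahn"]
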
data/lib/lingo/attendee/multi_worder.rb
CHANGED

@@ -48,9 +48,6 @@ class Lingo
 # <b>out</b>:: see the general description of the Attendee
 # <b>source</b>:: see the general description of the Dictionary
 # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-# <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the boundaries
-#   between which the MultiWorder should search, usually punctuation and special
-#   characters, since they hardly ever occur in multiword groups.
 #
 # === Examples
 # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>

@@ -81,216 +78,131 @@ class Lingo
     protected
 
     def init
-      @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-      @mul_dic = dictionary(mul_src = get_array('source'), get_key('mode', 'all'))
-
       # combine lexical variants?
       #
       # false = old behaviour
       # true  = first match
       # 'all' = all matches
-      @combine
-      @
+      @combine  = get_key('combine', false)
+      @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
+
+      lex_src, lex_mod, d = nil, nil, @lingo.dictionary_config['databases']
 
-
+      (mul_src = get_array('source')).each { |src|
+        s, m = d[src].values_at('use-lex', 'lex-mode')
 
-
-
-        if lex_src.nil? || lex_src == this_src
-          lex_src, lex_mod = this_src, this_mod
+        if lex_src.nil? || lex_src == s
+          lex_src, lex_mod = s, m
         else
-
+          warn "#{self.class}: Dictionaries don't match: #{mul_src.join(',')}"
         end
       }
 
-      lex_src = lex_src.split(
+      lex_src = lex_src.split(SEP_RE)
       lex_mod = get_key('lex-mode', lex_mod || 'first')
 
+      @mul_dic = dictionary(mul_src, get_key('mode', 'all'))
       @lex_dic = dictionary(lex_src, lex_mod)
       @lex_gra = grammar(lex_src, lex_mod)
 
-      if @combine && has_key?('use-syn')
-
+      @syn_dic = if @combine && has_key?('use-syn')
+        dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
       end
 
-      @
-      @eof_handling = false
+      @expected_tokens_in_buffer, @eof_handling = 3, false
     end
 
-    def control(cmd,
-
-
-      # Every control object also triggers processing
-      if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-        @eof_handling = true
-        while number_of_valid_tokens_in_buffer > 1
-          process_buffer
-        end
-        forward_number_of_token( @buffer.size, false )
-        @eof_handling = false
-      end
-    end
-
-    def process_buffer?
-      number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
+    def control(cmd, param)
+      control_multi(cmd, @mul_dic)
     end
 
     def process_buffer
-      unless
-
-
-
-
-
-        unless
-
-
-          forward_number_of_token( 3 )
-          return
+      unless form_at(0) == CHAR_PUNCT
+        unless (res = check_multiword_key(3)).empty?
+          len = res.map { |r|
+            r.is_a?(Lexical) ? r.form.split(' ').size : r[/^\*(\d+)/, 1].to_i
+          }.sort!.reverse!
+
+          unless (max = len.first) > 3
+            create_and_forward_multiword(3, res)
+            forward_number_of_token(3)
           else
-
-
-            @number_of_expected_tokens_in_buffer = lengths[0]
-            return
+            unless @eof_handling || @buffer.size >= max
+              @expected_tokens_in_buffer = max
             else
-
-
-
-
-
-
-              forward_number_of_token( len )
-              throw :forward_one
-            end
-          end
-
-          # No match found
-          forward_number_of_token( 1 )
-        end
-
-        @number_of_expected_tokens_in_buffer = 3
+              forward_number_of_token(len.find { |l|
+                r = check_multiword_key(l)
+                create_and_forward_multiword(l, r) unless r.empty?
+              } || 1)
+
+              @expected_tokens_in_buffer = 3
               process_buffer if process_buffer?
-        return
+            end
+          end
+
+          return
         end
 
-
-
-
-        create_and_forward_multiword( 2, result )
-        forward_number_of_token( 1 )
+        unless (res = check_multiword_key(2)).empty?
+          create_and_forward_multiword(2, res)
+          forward_number_of_token(1)
        end
      end
 
-
-
-      @number_of_expected_tokens_in_buffer = 3
+      forward_number_of_token(1, false)
+      @expected_tokens_in_buffer = 3
     end
 
     private
 
-    def create_and_forward_multiword(
-
-
-      form_parts = []
+    def create_and_forward_multiword(len, lex)
+      pos, parts = 0, []
+
       begin
-        if
-          @buffer.delete_at(
-
+        if (form = form_at(pos)) == CHAR_PUNCT
+          @buffer.delete_at(pos)
+          parts[-1] += CHAR_PUNCT
         else
           @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
-
+          parts << form
           pos += 1
         end
       end while pos < len
 
-
-
-      # Create the multiword
-      word = Word.new( form, WA_MULTIWORD )
-      word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - bail out at "*5" in Synonymer
-
-      # Forward the multiword
-      forward( word )
-    end
-
-    # Forwards 'len' tokens
-    def forward_number_of_token( len, count_punc = true )
-      begin
-        unless @buffer.empty?
-          forward( @buffer[0] )
-          len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-          @buffer.delete_at( 0 )
-        end
-      end while len > 0
-    end
-
-    # Determines the maximum result length
-    def sort_result_len( result )
-      result.collect do |res|
-        if res.is_a?( Lexical )
-          res.form.split( ' ' ).size
-        else
-          res =~ /^\*(\d+)/
-          $1.to_i
-        end
-      end.sort.reverse
+      forward(Word.new_lexicals(parts.join(' '),
+        WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) }))
     end
 
     # Checks a key of defined length starting at position 0 in the buffer
-    def check_multiword_key(
-      return [] if
+    def check_multiword_key(len)
+      return [] if valid_tokens_in_buffer < len
 
-      sequence = @buffer.map { |obj|
+      seq = @buffer.map { |obj|
         next [obj] unless obj.is_a?(WordForm)
+        next if (form = obj.form) == CHAR_PUNCT
 
-
-
-
-        word = @lex_dic.find_word(form)
-        word = @lex_gra.find_compositum(form) if word.unknown?
-
-        lexicals = word.attr == WA_KOMPOSITUM ?
-          [word.lexicals.first] : word.lexicals.dup
-
-        lexicals << word if lexicals.empty?
-        lexicals += @syn_dic.find_synonyms(word) if @syn_dic
-
-        lexicals.map { |lex| lex.form }.uniq
-      }.compact[0, len]
-
-      if @combine
-        keys, muls = [], []
+        w = find_word(form, @lex_dic, @lex_gra)
+        l = w.lexicals
 
-
-
-        }.flatten(1)
+        (w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup).tap { |i|
+          i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
+          i.map! { |j| j.form.downcase }.uniq!
         }
+      }
 
-
-
-
-        unless mul.empty?
-          muls.concat(mul)
-          break unless @all_keys
-        end
-      }
+      seq.compact!
+      seq.slice!(len..-1)
 
-
+      if @combine
+        [].tap { |mul| seq.shift.product(*seq) { |key|
+          mul.concat(@mul_dic.select(key.join(' ')))
+          break unless @all_keys || mul.empty?
+        } && mul.uniq! }
       else
-
-        @mul_dic.select(key.downcase)
+        @mul_dic.select(seq.map!(&:first).join(' '))
       end
     end
 
-    # Returns the number of valid tokens
-    def number_of_valid_tokens_in_buffer
-      @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
-    end
-
 end
 
 # For backwards compatibility.
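In combine mode, check_multiword_key above builds every combination of the
per-token lexical variants via seq.shift.product(*seq) and probes the multiword
dictionary with each joined key. The combinatorial part is plain Ruby and easy
to demonstrate in isolation (the variant lists here are made up):

    # Each buffer position contributes the forms of its lexical variants;
    # every combination becomes one candidate dictionary key.
    seq = [%w[new], %w[york], %w[city cities]]

    keys = []
    seq.shift.product(*seq) { |key| keys << key.join(' ') }

    p keys # => ["new york city", "new york cities"]

The real code additionally stops early (break unless @all_keys || mul.empty?)
once a match is found, unless 'all' matches were requested.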
|