lingo 1.8.1 → 1.8.2
This diff shows the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- data/ChangeLog +23 -5
- data/README +1 -1
- data/Rakefile +5 -7
- data/TODO +2 -0
- data/bin/lingo +5 -1
- data/de.lang +1 -1
- data/en/lingo-syn.txt +0 -0
- data/en.lang +2 -1
- data/lib/lingo/attendee/abbreviator.rb +8 -9
- data/lib/lingo/attendee/debugger.rb +5 -4
- data/lib/lingo/attendee/decomposer.rb +8 -3
- data/lib/lingo/attendee/dehyphenizer.rb +19 -63
- data/lib/lingo/attendee/formatter.rb +1 -1
- data/lib/lingo/attendee/multi_worder.rb +67 -155
- data/lib/lingo/attendee/noneword_filter.rb +16 -9
- data/lib/lingo/attendee/object_filter.rb +1 -1
- data/lib/lingo/attendee/sequencer.rb +32 -63
- data/lib/lingo/attendee/stemmer/porter.rb +343 -0
- data/{info/gpl-hdr.txt → lib/lingo/attendee/stemmer.rb} +33 -0
- data/lib/lingo/attendee/synonymer.rb +10 -9
- data/lib/lingo/attendee/text_reader.rb +102 -76
- data/lib/lingo/attendee/text_writer.rb +23 -26
- data/lib/lingo/attendee/tokenizer.rb +13 -27
- data/lib/lingo/attendee/variator.rb +26 -66
- data/lib/lingo/attendee/vector_filter.rb +42 -43
- data/lib/lingo/attendee/word_searcher.rb +6 -7
- data/lib/lingo/attendee.rb +25 -7
- data/lib/lingo/buffered_attendee.rb +36 -10
- data/lib/lingo/cachable.rb +8 -8
- data/lib/lingo/config.rb +5 -6
- data/lib/lingo/ctl.rb +2 -3
- data/lib/lingo/database/crypter.rb +9 -26
- data/lib/lingo/database/gdbm_store.rb +3 -5
- data/lib/lingo/database/libcdb_store.rb +4 -6
- data/lib/lingo/database/sdbm_store.rb +11 -6
- data/lib/lingo/database/show_progress.rb +3 -43
- data/lib/lingo/database/source/key_value.rb +2 -6
- data/lib/lingo/database/source/multi_key.rb +3 -5
- data/lib/lingo/database/source/multi_value.rb +2 -6
- data/lib/lingo/database/source/single_word.rb +4 -6
- data/lib/lingo/database/source/word_class.rb +4 -10
- data/lib/lingo/database/source.rb +20 -18
- data/lib/lingo/database.rb +84 -59
- data/lib/lingo/error.rb +57 -1
- data/lib/lingo/language/dictionary.rb +21 -18
- data/lib/lingo/language/grammar.rb +40 -49
- data/lib/lingo/language/lexical.rb +6 -6
- data/lib/lingo/language/lexical_hash.rb +6 -0
- data/lib/lingo/language/word.rb +32 -15
- data/lib/lingo/language/word_form.rb +1 -1
- data/lib/lingo/language.rb +14 -25
- data/lib/lingo/reportable.rb +12 -10
- data/lib/lingo/show_progress.rb +81 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo.rb +63 -24
- data/lingo-call.cfg +6 -10
- data/lingo.cfg +60 -44
- data/lir.cfg +42 -41
- data/test/attendee/ts_abbreviator.rb +3 -5
- data/test/attendee/ts_decomposer.rb +3 -5
- data/test/attendee/ts_multi_worder.rb +87 -145
- data/test/attendee/ts_noneword_filter.rb +5 -3
- data/test/attendee/ts_object_filter.rb +5 -3
- data/test/attendee/ts_sequencer.rb +3 -5
- data/test/attendee/ts_stemmer.rb +309 -0
- data/test/attendee/ts_synonymer.rb +15 -11
- data/test/attendee/ts_text_reader.rb +12 -15
- data/test/attendee/ts_text_writer.rb +24 -29
- data/test/attendee/ts_tokenizer.rb +9 -7
- data/test/attendee/ts_variator.rb +4 -4
- data/test/attendee/ts_vector_filter.rb +24 -16
- data/test/attendee/ts_word_searcher.rb +20 -36
- data/test/{lir.csv → lir.vec} +0 -0
- data/test/ref/artikel.vec +943 -943
- data/test/ref/artikel.ven +943 -943
- data/test/ref/lir.non +201 -201
- data/test/ref/lir.seq +178 -178
- data/test/ref/lir.syn +49 -49
- data/test/ref/lir.vec +329 -0
- data/test/test_helper.rb +20 -36
- data/test/ts_database.rb +10 -10
- data/test/ts_language.rb +279 -319
- metadata +93 -104
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lingo-all.cfg +0 -89
- data/porter/stem.cfg +0 -311
- data/porter/stem.rb +0 -150
- data/test/ref/lir.csv +0 -329
- data/test.cfg +0 -79
data/ChangeLog
CHANGED
@@ -1,5 +1,23 @@
 = Revision history for Lingo
 
+== 1.8.2 [2012-04-19]
+
+* Performance improvements regarding Attendee::VectorFilter's (as well as
+  Attendee::NonewordFilter's) memory usage; set <tt>sort: false</tt> in the config.
+* Added Attendee::Stemmer (implementing Porter's algorithm for suffix stripping).
+* Added progress reporting to Attendee::TextReader; set <tt>progress: true</tt>
+  in the config.
+* Added directory and glob processing to Attendee::TextReader (new options
+  +glob+ and +recursive+).
+* Renamed Attendee::TextReader's option +lir-record-pattern+ to +records+.
+* Fixed Attendee::Debugger to forward all objects so it can be inserted
+  between any two attendees in the config.
+* Fixed regression introduced in 1.8.0 where Lingo would not use existing
+  compiled dictionary when source file is not present.
+* Fixed "invalid byte sequence in UTF-8" on Windows for SDBM store.
+* Enabled pluggable (compiled) dictionaries and storage backends.
+* Extensive internal refactoring and cleanup. (Finished for now.)
+
 == 1.8.1 [2012-02-19]
 
 * Introduced alternative storage backends, mainly to circumvent SDBM's record
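Most of the options named in the 1.8.2 entries above are set per attendee in the pipeline configuration. A minimal sketch of a lingo.cfg-style meeting combining them (only <tt>sort</tt>, <tt>progress</tt>, <tt>glob</tt>, <tt>recursive</tt> and the stemmer attendee come from the entries above; the attendee list layout and the remaining values are illustrative assumptions, not taken from the shipped configs):

  meeting:
    attendees:
      - text_reader:   { files: 'txt', glob: '*.txt', recursive: true, progress: true }
      - tokenizer:     {  }
      - word_searcher: { source: 'sys-dic', mode: 'first' }
      - stemmer:       {  }                 # new in 1.8.2, Porter suffix stripping
      - vector_filter: { sort: false }      # avoids buffering and sorting all vectors
      - text_writer:   { ext: 'vec' }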
@@ -62,13 +80,13 @@
   (requires diff-lcs[http://raa.ruby-lang.org/project/diff-lcs/]).
 * Provide alternatives to standard zip command on windows platforms.
 * Use +UNK+ itself if it doesn't have any lexicals.
-* Use compo form instead of word form when lexicalizing
+* Use compo form instead of word form when lexicalizing compound entry for
   multiword dictionaries.
 * LexicalHash#[] must use target (not source) form.
 * Optionally, try to find matches for all lexicals a word has.
 * Make '-' a PRINTABLE_CHAR.
 * Allow synonyms to be considered for multiword matching.
-* Don't use
+* Don't use compound parts.
 * Introduced some (more or less arbitrary) line length limit. We can only
   store values of a certain length anyway (with SDBM). Entries exceeding this
   limit will be rejected and logged in the .rev file.
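The line length limit mentioned above stems from SDBM's fixed page size: a key/value pair beyond roughly one kilobyte cannot be stored at all (PAIRMAX is 1008 bytes in the common implementation; the exact limit and error depend on the SDBM build). A minimal Ruby sketch of the failure mode:

  require 'sdbm' # bundled with Ruby 1.9

  SDBM.open('demo') do |db|
    db['short'] = 'fits within one page'

    begin
      db['long'] = 'x' * 2000 # key + value exceed the page size
    rescue SDBMError => e
      warn "rejected oversized entry: #{e.message}"
    end
  end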
@@ -252,12 +270,12 @@
 * <b>Decomposer with additional validity check and marking</b>
 
   If required, the Decomposer can subject compounds to an additional check.
-  If the key <tt>de.lang:language/dictionary/
+  If the key <tt>de.lang:language/dictionary/compound/skip-sequences</tt>
   is given, defined e.g. in the form <tt>skip-sequences: [ VS ]</tt>, it is
   additionally checked whether the compound and its parts match these word
   classes. Compounds consisting of a verb-noun combination are then discarded.
   This parameter is optional.
-  In addition, if the key <tt>de.lang:language/dictionary/
+  In addition, if the key <tt>de.lang:language/dictionary/compound/append-wordclass</tt>
   is given, which usually contains a one-character string, the word stems
   recognized through decomposition are marked by appending the character
   defined via this key to their word class.
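In the current key names (this release renames the section to <tt>compound</tt>, as the de.lang hunk below shows), the two checks described above would sit in the language file roughly as follows. The min-word-size through max-parts values are copied from the de.lang hunk in this diff; the append-wordclass value is an illustrative assumption:

  language:
    dictionary:
      compound:
        min-word-size: "7"
        min-part-size: "3"
        max-parts: "5"
        append-wordclass: "+"    # mark stems recognized through decomposition
        skip-sequences: [ VS ]   # discard verb-noun compounds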
@@ -476,7 +494,7 @@
 * <b>Compound decomposition with further restriction</b>
 
   Another parameter has been added for compound decomposition. The following
-  attributes of the tag <tt>XML:dictionary/
+  attributes of the tag <tt>XML:dictionary/compound</tt> can now be specified:
 
     Attribute   Default   Function
     ============================================================================
data/README
CHANGED
data/Rakefile
CHANGED
@@ -39,10 +39,8 @@ The main functions of Lingo are:
       of word classes
   EOT
   extra_files: FileList[
-    'lingo.rb', 'lingo{,-
-    '{de,en}
-    'info/gpl-hdr.txt', 'info/*.png', 'lir.cfg', 'txt/lir.txt', 'porter/*',
-    'test.cfg', '{de,en}/test_*.txt'
+    'lingo.rb', 'lingo{,-call}.cfg', 'lir.cfg', '{de,en}.lang',
+    '{de,en}/{lingo-*,user-dic,test_*}.txt', 'txt/{artikel{,-en},lir}.txt'
   ].to_a,
   required_ruby_version: '>= 1.9',
   dependencies: [['ruby-nuggets', '>= 0.8.5'], 'unicode', 'highline'],
@@ -54,7 +52,7 @@ rescue LoadError => err
 end
 
 CLEAN.include(
-  'txt/*.{log,mul,non,seq,syn,ve
+  'txt/*.{log,mul,non,seq,ste,syn,ve?}',
   'test/{test.*,text.non}',
   'store/*/*.rev',
   'bench/tmp.*'
@@ -78,7 +76,7 @@ end
 
 desc 'Test against reference file (TXT)'
 task 'test:txt' do
-  test_ref('artikel', '
+  test_ref('artikel', 'lingo')
 end
 
 desc 'Test against reference file (LIR)'
@@ -116,7 +114,7 @@ def test_ref(name, cfg = name)
   }.success? or abort msg.join("\n\n")
 
   Dir["test/ref/#{name}.*"].each { |ref|
-    puts "
+    puts "## #{org = ref.sub(/test\/ref/, 'txt')}"
     continue += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
   }
 
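Diff::LCS::Ldiff.run, as used in test_ref above, takes an argv-style array and returns a nonzero count when the files differ, so a standalone check of one generated file against its reference looks like this (the paths are examples from this package's test layout):

  require 'diff/lcs/ldiff'

  # '-a' treats the files as text, as in the test_ref helper above.
  diffs = Diff::LCS::Ldiff.run(['-a', 'txt/artikel.vec', 'test/ref/artikel.vec'])
  abort 'reference mismatch' unless diffs.zero?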
data/TODO
CHANGED
@@ -1,5 +1,7 @@
 = ToDo list for Lingo
 
+* Configuration parameter validation.
+* Replace regex-based tokenizer with a (Racc/Ragel/ANTLR-based?) lexer.
 * Update and translate old documentation.
 * Allow for handling of documents in various encodings, not just the one the
   dictionaries are encoded in.
data/bin/lingo
CHANGED
data/de.lang
CHANGED
@@ -65,7 +65,7 @@ language:
       tst-cry: { name: de/test_cry.txt, txt-format: WordClass, crypt }   # TEST: encryption
       tst-sgw: { name: de/test_singleword.txt, txt-format: SingleWord }  # TEST: SingleWord format
 
-
+    compound:
       min-word-size: "7"
       min-part-size: "3"
       max-parts: "5"
data/en/lingo-syn.txt
ADDED
File without changes
data/en.lang
CHANGED
@@ -52,12 +52,13 @@ language:
 
     # System dictionaries
     sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
+    sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
    sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
 
    # User dictionaries
    usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
 
-
+    compound:
      min-word-size: "7"
      min-part-size: "3"
      max-parts: "5"
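The new sys-syn database points at en/lingo-syn.txt, which this release adds as an empty file (see above). In the KeyValue format with separator '=', each line maps a surface form to one synonym, and def-wc: y supplies the default word class; the entries below are hypothetical:

  fix=repair
  fix=mend
  quick=fast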
data/lib/lingo/attendee/abbreviator.rb
CHANGED
@@ -76,17 +76,15 @@ class Lingo
         set_dic
       end
 
-      def control(cmd,
-
-
-        # Every control object also triggers processing
+      def control(cmd, param)
+        report_on(cmd, @dic)
         process_buffer
       end
 
       private
 
       def process_buffer?
-
+        form_at(-1, Token) == CHAR_PUNCT
       end
 
       def process_buffer
@@ -95,13 +93,14 @@ class Lingo
           return
         end
 
-
-        if @buffer[-2].kind_of?(Token)
+        if form = form_at(-2, Token)
           inc('Anzahl gesuchter Abkürzungen')
-
-          if abbr.identified?
+
+          if (abbr = find_word(form)).identified?
             inc('Anzahl gefundener Abkürzungen')
+
             abbr.form += CHAR_PUNCT
+
             @buffer[-2] = abbr
             @buffer.delete_at(-1)
           end
data/lib/lingo/attendee/debugger.rb
CHANGED
@@ -95,14 +95,15 @@ class Lingo
         @prompt = get_key('prompt', 'lex:) ')
       end
 
-      def control(cmd,
-        if cmd != STR_CMD_STATUS
-
+      def control(cmd, param)
+        if cmd != STR_CMD_STATUS && eval(@cmd_eval)
+          warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
         end
       end
 
       def process(obj)
-
+        warn "#{@prompt} #{obj.inspect}" if eval(@obj_eval)
+        forward(obj)
       end
 
     end
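Since process now unconditionally forwards every object, the debugger can be dropped between any two attendees of a pipeline, as the ChangeLog entry above states. A hypothetical config excerpt, using only the prompt option that is read via get_key above:

  meeting:
    attendees:
      - tokenizer:     {  }
      - debugger:      { prompt: 'tok:) ' }
      - word_searcher: { source: 'sys-dic', mode: 'first' }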
data/lib/lingo/attendee/decomposer.rb
CHANGED
@@ -79,12 +79,17 @@ class Lingo
         set_gra
       end
 
-      def control(cmd,
-
+      def control(cmd, param)
+        report_on(cmd, @gra)
       end
 
       def process(obj)
-
+        if obj.is_a?(Word) && obj.unknown?
+          com = @gra.find_compound(obj.form)
+          obj = com unless com.unknown?
+        end
+
+        forward(obj)
       end
 
     end
data/lib/lingo/attendee/dehyphenizer.rb
CHANGED
@@ -41,9 +41,6 @@ class Lingo
     # <b>out</b>:: see the general description of the Attendee
     # <b>source</b>:: see the general description of the Dictionary
     # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-    # <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the boundaries
-    #                         between which the Multiworder is to search, usually punctuation
-    #                         and special characters, as they hardly ever occur in multiword groups.
     #
     # === Examples
     # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>
@@ -74,87 +71,46 @@ class Lingo
       protected
 
       def init
-        @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-
         set_dic
         set_gra
 
-        @skip = get_array('skip', ''
-
-        @number_of_expected_tokens_in_buffer = 2
-        @eof_handling = false
-      end
+        @skip = get_array('skip', '', :downcase)
 
-
-        @dic.report.each_pair { |key, value| set(key, value) } if cmd == STR_CMD_STATUS
-
-        # Every control object also triggers processing
-        if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-          @eof_handling = true
-          while number_of_valid_tokens_in_buffer > 1
-            process_buffer
-          end
-          forward_number_of_token( @buffer.size, false )
-          @eof_handling = false
-        end
+        @expected_tokens_in_buffer, @eof_handling = 2, false
       end
 
-      def
-
+      def control(cmd, param)
+        control_multi(cmd)
       end
 
       def process_buffer
-
-           @buffer[0].form[-1..-1] == '-' &&
-           @buffer[1].is_a?(Word) &&
-           !(!( ttt = @buffer[1].get_class(/./) ).nil? &&
-           !@skip.index( ttt[0].attr ).nil?)
-
-          # Try simple joining
-          form = @buffer[0].form[0...-1] + @buffer[1].form
-          word = @dic.find_word(form)
-          word = @gra.find_compositum(form) unless word.identified?
-
-          unless word.identified? || (word.attr == WA_KOMPOSITUM && word.get_class('x+').empty?)
-            # Try joining with hyphen
-            form = @buffer[0].form + @buffer[1].form
-            word = @dic.find_word(form)
-            word = @gra.find_compositum(form) unless word.identified?
-          end
+        a, b, h = *ab = @buffer.values_at(0, 1), '-'
 
-
-
-
-
-          word = @gra.find_compositum(form) unless word.identified?
-        end
+        if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
+          (c = b.get_class(/./).first) && @skip.include?(c.attr)
+        )
+          a, b = ab.map!(&:form)
 
-
+          word = dehyphenize(a.chomp(h) + b)
+          word = dehyphenize(a + b) unless dehyphenized?(word)
+
+          if dehyphenized?(word)
             @buffer[0] = word
-            @buffer.delete_at(
+            @buffer.delete_at(1)
           end
         end
 
-
-        forward_number_of_token( 1, false )
+        forward_number_of_token(1, false)
       end
 
       private
 
-
-
-        begin
-          unless @buffer.empty?
-            forward( @buffer[0] )
-            len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-            @buffer.delete_at( 0 )
-          end
-        end while len > 0
+      def dehyphenize(form)
+        find_word(form, &:identified?)
       end
 
-
-
-        @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
+      def dehyphenized?(word)
+        word.identified? || word.full_compound?
       end
 
     end
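The rewritten process_buffer tries the two joinings in a fixed order: first dropping the hyphen, then keeping it, accepting the first result that is identified or a fully decomposed compound. The candidate generation in isolation (a self-contained sketch; Lingo's dictionary classes do the actual lookup):

  # Candidate forms tried for a line-broken pair, mirroring
  # dehyphenize(a.chomp(h) + b) and dehyphenize(a + b) above.
  def dehyphenation_candidates(a, b, h = '-')
    return [] unless a.end_with?(h)
    [a.chomp(h) + b, a + b]
  end

  p dehyphenation_candidates('multi-', 'word')
  #=> ["multiword", "multi-word"]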
data/lib/lingo/attendee/multi_worder.rb
CHANGED
@@ -48,9 +48,6 @@ class Lingo
     # <b>out</b>:: see the general description of the Attendee
     # <b>source</b>:: see the general description of the Dictionary
     # <b><i>mode</i></b>:: (default: all) see the general description of the Dictionary
-    # <b><i>stopper</i></b>:: (default: TA_PUNCTUATION, TA_OTHER) Specifies the boundaries
-    #                         between which the MultiWorder is to search, usually punctuation
-    #                         and special characters, as they hardly ever occur in multiword groups.
     #
     # === Examples
     # When processing a normal text file with the pipeline configuration <tt>t1.cfg</tt>
@@ -81,216 +78,131 @@ class Lingo
       protected
 
       def init
-        @stopper = get_array('stopper', TA_PUNCTUATION+','+TA_OTHER).map(&:upcase)
-        @mul_dic = dictionary(mul_src = get_array('source'), get_key('mode', 'all'))
-
         # combine lexical variants?
         #
         # false = old behaviour
         # true  = first match
         # 'all' = all matches
-        @combine
-        @
+        @combine = get_key('combine', false)
+        @all_keys = @combine.is_a?(String) && @combine.downcase == 'all'
+
+        lex_src, lex_mod, d = nil, nil, @lingo.dictionary_config['databases']
 
-
+        (mul_src = get_array('source')).each { |src|
+          s, m = d[src].values_at('use-lex', 'lex-mode')
 
-
-
-          if lex_src.nil? || lex_src == this_src
-            lex_src, lex_mod = this_src, this_mod
+          if lex_src.nil? || lex_src == s
+            lex_src, lex_mod = s, m
           else
-
+            warn "#{self.class}: Dictionaries don't match: #{mul_src.join(',')}"
           end
         }
 
-        lex_src = lex_src.split(
+        lex_src = lex_src.split(SEP_RE)
         lex_mod = get_key('lex-mode', lex_mod || 'first')
 
+        @mul_dic = dictionary(mul_src, get_key('mode', 'all'))
         @lex_dic = dictionary(lex_src, lex_mod)
         @lex_gra = grammar(lex_src, lex_mod)
 
-        if @combine && has_key?('use-syn')
-
+        @syn_dic = if @combine && has_key?('use-syn')
+          dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
         end
 
-        @
-        @eof_handling = false
+        @expected_tokens_in_buffer, @eof_handling = 3, false
       end
 
-      def control(cmd,
-
-
-        # Every control object also triggers processing
-        if cmd == STR_CMD_RECORD || cmd == STR_CMD_EOF
-          @eof_handling = true
-          while number_of_valid_tokens_in_buffer > 1
-            process_buffer
-          end
-          forward_number_of_token( @buffer.size, false )
-          @eof_handling = false
-        end
-      end
-
-      def process_buffer?
-        number_of_valid_tokens_in_buffer >= @number_of_expected_tokens_in_buffer
+      def control(cmd, param)
+        control_multi(cmd, @mul_dic)
       end
 
       def process_buffer
-        unless
-
-
-
-
-
-          unless
-
-
-            forward_number_of_token( 3 )
-            return
+        unless form_at(0) == CHAR_PUNCT
+          unless (res = check_multiword_key(3)).empty?
+            len = res.map { |r|
+              r.is_a?(Lexical) ? r.form.split(' ').size : r[/^\*(\d+)/, 1].to_i
+            }.sort!.reverse!
+
+            unless (max = len.first) > 3
+              create_and_forward_multiword(3, res)
+              forward_number_of_token(3)
             else
-
-
-            @number_of_expected_tokens_in_buffer = lengths[0]
-            return
+              unless @eof_handling || @buffer.size >= max
+                @expected_tokens_in_buffer = max
              else
-
-
-
-
-
-
-            forward_number_of_token( len )
-            throw :forward_one
-          end
-        end
-
-        # No match found
-        forward_number_of_token( 1 )
-      end
-
-        @number_of_expected_tokens_in_buffer = 3
+                forward_number_of_token(len.find { |l|
+                  r = check_multiword_key(l)
+                  create_and_forward_multiword(l, r) unless r.empty?
+                } || 1)
+
+                @expected_tokens_in_buffer = 3
                 process_buffer if process_buffer?
-        return
               end
             end
+
+            return
           end
 
-
-
-
-        create_and_forward_multiword( 2, result )
-        forward_number_of_token( 1 )
+          unless (res = check_multiword_key(2)).empty?
+            create_and_forward_multiword(2, res)
+            forward_number_of_token(1)
           end
         end
 
-
-
-        @number_of_expected_tokens_in_buffer = 3
+        forward_number_of_token(1, false)
+        @expected_tokens_in_buffer = 3
       end
 
       private
 
-      def create_and_forward_multiword(
-
-
-        form_parts = []
+      def create_and_forward_multiword(len, lex)
+        pos, parts = 0, []
+
         begin
-          if
-            @buffer.delete_at(
-
+          if (form = form_at(pos)) == CHAR_PUNCT
+            @buffer.delete_at(pos)
+            parts[-1] += CHAR_PUNCT
           else
             @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
-
+            parts << form
             pos += 1
           end
         end while pos < len
 
-
-
-        # Create multiword
-        word = Word.new( form, WA_MULTIWORD )
-        word << lexicals.collect { |lex| (lex.is_a?(Lexical)) ? lex : nil }.compact # FIXME 1.60 - bails out at "*5" in the Synonymer
-
-        # Forward multiword
-        forward( word )
-      end
-
-      # Forwards 'len' tokens
-      def forward_number_of_token( len, count_punc = true )
-        begin
-          unless @buffer.empty?
-            forward( @buffer[0] )
-            len -= 1 unless count_punc && @buffer[0].form == CHAR_PUNCT
-            @buffer.delete_at( 0 )
-          end
-        end while len > 0
-      end
-
-      # Determines the maximum result length
-      def sort_result_len( result )
-        result.collect do |res|
-          if res.is_a?( Lexical )
-            res.form.split( ' ' ).size
-          else
-            res =~ /^\*(\d+)/
-            $1.to_i
-          end
-        end.sort.reverse
+        forward(Word.new_lexicals(parts.join(' '),
+          WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) }))
       end
 
       # Checks a key of defined length starting at position 0 in the buffer
-      def check_multiword_key(
-        return [] if
+      def check_multiword_key(len)
+        return [] if valid_tokens_in_buffer < len
 
-
-        sequence = @buffer.map { |obj|
+        seq = @buffer.map { |obj|
           next [obj] unless obj.is_a?(WordForm)
+          next if (form = obj.form) == CHAR_PUNCT
 
-
-
-
-          word = @lex_dic.find_word(form)
-          word = @lex_gra.find_compositum(form) if word.unknown?
-
-          lexicals = word.attr == WA_KOMPOSITUM ?
-            [word.lexicals.first] : word.lexicals.dup
-
-          lexicals << word if lexicals.empty?
-          lexicals += @syn_dic.find_synonyms(word) if @syn_dic
-
-          lexicals.map { |lex| lex.form }.uniq
-        }.compact[0, len]
-
-        if @combine
-          keys, muls = [], []
+          w = find_word(form, @lex_dic, @lex_gra)
+          l = w.lexicals
 
-
-
-
-          }.flatten(1)
+          (w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup).tap { |i|
+            i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
+            i.map! { |j| j.form.downcase }.uniq!
           }
+        }
 
-
-
-
-          unless mul.empty?
-            muls.concat(mul)
-            break unless @all_keys
-          end
-        }
+        seq.compact!
+        seq.slice!(len..-1)
 
-
+        if @combine
+          [].tap { |mul| seq.shift.product(*seq) { |key|
+            mul.concat(@mul_dic.select(key.join(' ')))
+            break unless @all_keys || mul.empty?
+          } && mul.uniq! }
         else
-
-          @mul_dic.select(key.downcase)
+          @mul_dic.select(seq.map!(&:first).join(' '))
         end
       end
 
-      # Returns the number of valid tokens
-      def number_of_valid_tokens_in_buffer
-        @buffer.collect { |token| (token.form == CHAR_PUNCT) ? nil : 1 }.compact.size
-      end
-
     end
 
     # For backwards compatibility.
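When combine is enabled, check_multiword_key builds every combination of the buffered words' lexical variants via Array#product and looks each key up in the multiword dictionary until a match is found (or exhaustively, with combine set to 'all'). The key generation in isolation, with hypothetical, already downcased forms:

  # Lexical variants per buffer position, as produced by the seq map above.
  seq = [['new', 'nova'], ['york'], ['city']]

  keys = []
  seq.shift.product(*seq) { |key| keys << key.join(' ') }

  p keys #=> ["new york city", "nova york city"]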
|