lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
data/lib/lingo/database.rb
CHANGED
|
@@ -1,585 +1,237 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
#--
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# Copyright (C) 2007
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
|
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
|
26
|
-
#
|
|
27
|
-
# Lex Lingo rules from here on
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
28
25
|
#++
|
|
29
26
|
|
|
30
|
-
require 'sdbm'
|
|
31
27
|
require 'pathname'
|
|
32
28
|
require 'fileutils'
|
|
33
29
|
require 'digest/sha1'
|
|
34
30
|
|
|
35
|
-
require_relative '
|
|
36
|
-
require_relative '
|
|
37
|
-
require_relative '
|
|
38
|
-
require_relative '
|
|
31
|
+
require_relative 'database/show_progress'
|
|
32
|
+
require_relative 'database/crypter'
|
|
33
|
+
require_relative 'database/source'
|
|
34
|
+
require_relative 'database/hash_store'
|
|
35
|
+
require_relative 'database/sdbm_store'
|
|
36
|
+
require_relative 'database/gdbm_store'
|
|
37
|
+
require_relative 'database/libcdb_store'
|
|
39
38
|
|
|
40
39
|
class Lingo
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# to actually substitute the placeholder.
|
|
49
|
-
length = (format % 0).length
|
|
50
|
-
|
|
51
|
-
# Now we know how far to "go back" to
|
|
52
|
-
# overwrite the formatted string...
|
|
53
|
-
back = "\b" * length
|
|
54
|
-
|
|
55
|
-
@format = format + back
|
|
56
|
-
@clear = ' ' * length + back
|
|
57
|
-
|
|
58
|
-
print msg, ': '
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
def start(msg, max)
|
|
62
|
-
@ratio, @count, @next_step = max / 100.0, 0, 0
|
|
63
|
-
print msg, ' '
|
|
64
|
-
step
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
def stop(msg)
|
|
68
|
-
print @clear
|
|
69
|
-
print msg, "\n"
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
def tick(value)
|
|
73
|
-
@count = value
|
|
74
|
-
step if @count >= @next_step
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
private
|
|
78
|
-
|
|
79
|
-
def step
|
|
80
|
-
percent = @count / @ratio
|
|
81
|
-
@next_step = (percent + 1) * @ratio
|
|
82
|
-
|
|
83
|
-
print @format % percent
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def print(*args)
|
|
87
|
-
@out.print(*args) if @active
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Crypter ermöglicht die Ver- und Entschlüsselung von Wörterbüchern
|
|
93
|
-
|
|
94
|
-
class Crypter
|
|
95
|
-
|
|
96
|
-
HEX_CHARS = '0123456789abcdef'.freeze
|
|
97
|
-
|
|
98
|
-
def digest(key)
|
|
99
|
-
Digest::SHA1.hexdigest(key)
|
|
100
|
-
end
|
|
101
|
-
|
|
102
|
-
def encode(key, val)
|
|
103
|
-
hex = ''
|
|
104
|
-
|
|
105
|
-
crypt(key, val).each_byte { |byte|
|
|
106
|
-
# To get a hex representation for a char we just utilize
|
|
107
|
-
# the quotient and the remainder of division by base 16.
|
|
108
|
-
q, r = byte.divmod(16)
|
|
109
|
-
hex << HEX_CHARS[q] << HEX_CHARS[r]
|
|
110
|
-
}
|
|
41
|
+
# Die Klasse Database stellt eine einheitliche Schnittstelle auf Lingo-Datenbanken bereit.
|
|
42
|
+
# Die Identifizierung der Datenbank erfolgt über die ID der Datenbank, so wie sie in der
|
|
43
|
+
# Sprachkonfigurationsdatei <tt>de.lang</tt> unter <tt>language/dictionary/databases</tt>
|
|
44
|
+
# hinterlegt ist.
|
|
45
|
+
#
|
|
46
|
+
# Das Lesen und Schreiben der Datenbank erfolgt über die Funktionen []() und []=().
|
|
111
47
|
|
|
112
|
-
|
|
113
|
-
end
|
|
48
|
+
class Database
|
|
114
49
|
|
|
115
|
-
|
|
116
|
-
str, q, first = '', 0, false
|
|
50
|
+
include Cachable
|
|
117
51
|
|
|
118
|
-
|
|
119
|
-
byte = byte.chr(ENC)
|
|
52
|
+
BACKENDS = %w[LibCDB SDBM GDBM].unshift(ENV['LINGO_BACKEND']).compact.uniq
|
|
120
53
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
else
|
|
126
|
-
# Now we got both parts, so let's revert the divmod(16)
|
|
127
|
-
str << q * 16 + HEX_CHARS.index(byte)
|
|
128
|
-
end
|
|
129
|
-
}
|
|
54
|
+
FLD_SEP = '|'
|
|
55
|
+
IDX_REF = '^'
|
|
56
|
+
KEY_REF = '*'
|
|
57
|
+
SYS_KEY = '~'
|
|
130
58
|
|
|
131
|
-
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
private
|
|
59
|
+
INDEX_PATTERN = %r{\A#{Regexp.escape(IDX_REF)}\d+\z}
|
|
135
60
|
|
|
136
|
-
def
|
|
137
|
-
|
|
138
|
-
v.each_codepoint { |x| c << (x ^ y.next).chr(ENC) }
|
|
139
|
-
c
|
|
61
|
+
def self.open(*args, &block)
|
|
62
|
+
new(*args).open(&block)
|
|
140
63
|
end
|
|
141
64
|
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
# Die Klasse TxtFile stellt eine einheitliche Schnittstelle auf die unterschiedlichen Formate
|
|
145
|
-
# von Wörterbuch-Quelldateien bereit. Die Identifizierung der Quelldatei erfolgt über die ID
|
|
146
|
-
# der Datei, so wie sie in der Sprachkonfigurationsdatei <tt>de.lang</tt> unter
|
|
147
|
-
# <tt>language/dictionary/databases</tt> hinterlegt ist.
|
|
148
|
-
#
|
|
149
|
-
# Die Verarbeitung der Wörterbücher erfolgt mittels des Iterators <b>each</b>, der für jede
|
|
150
|
-
# Zeile der Quelldatei ein Array bereitstellt in der Form <tt>[ key, [val1, val2, ...] ]</tt>.
|
|
151
|
-
#
|
|
152
|
-
# Nicht korrekt erkannte Zeilen werden abgewiesen und in eine Revoke-Datei gespeichert, die
|
|
153
|
-
# an der Dateiendung <tt>.rev</tt> zu erkennen ist.
|
|
154
|
-
|
|
155
|
-
class TxtFile
|
|
156
|
-
|
|
157
|
-
attr_reader :position
|
|
158
|
-
|
|
159
65
|
def initialize(id, lingo)
|
|
160
|
-
# Konfiguration der Datenbank auslesen
|
|
161
66
|
@config = lingo.database_config(id)
|
|
162
67
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
@
|
|
166
|
-
@pn_reject = Pathname.new(Lingo.find(:store, source_file) << '.rev')
|
|
68
|
+
@id, @lingo = id, lingo
|
|
69
|
+
@src_file = Lingo.find(:dict, @config['name'])
|
|
70
|
+
@crypter = Crypter.new if @config.has_key?('crypt')
|
|
167
71
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
@legal_word = '(?:' + PRINTABLE_CHAR + '|[' + Regexp.escape('- /&()[].,') + '])+' # TODO: v1.60 - ',' bei TxtFile zulassen; in const.rb einbauen
|
|
174
|
-
@line_pattern = Regexp.new('^'+@legal_word+'$')
|
|
175
|
-
|
|
176
|
-
@position = 0
|
|
177
|
-
end
|
|
178
|
-
|
|
179
|
-
def size
|
|
180
|
-
@pn_source.size
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
def each
|
|
184
|
-
# Reject-Datei öffnen
|
|
185
|
-
fail_msg = "Fehler beim öffnen der Reject-Datei '#{@pn_reject.to_s}'"
|
|
186
|
-
reject_file = @pn_reject.open('w', encoding: ENC)
|
|
187
|
-
|
|
188
|
-
# Alle Zeilen der Quelldatei verarbeiten
|
|
189
|
-
fail_msg = "Fehler beim öffnen der Wörterbuch-Quelldatei '#{@pn_source.to_s}'"
|
|
190
|
-
|
|
191
|
-
@pn_source.each_line($/, encoding: ENC) do |raw_line|
|
|
192
|
-
@position += raw_line.size # Position innerhalb der Datei aktualisieren
|
|
193
|
-
line = raw_line.chomp.downcase # Zeile normieren
|
|
194
|
-
|
|
195
|
-
next if line =~ /^\s*\043/ || line.strip == '' # Kommentarzeilen und leere Zeilen überspringen
|
|
196
|
-
|
|
197
|
-
# Ungültige Zeilen protokollieren
|
|
198
|
-
unless line.length < 4096 && line =~ @line_pattern
|
|
199
|
-
fail_msg = "Fehler beim schreiben der Reject-Datei '#{@pn_reject.to_s}'"
|
|
200
|
-
reject_file.puts line
|
|
201
|
-
next
|
|
202
|
-
end
|
|
203
|
-
|
|
204
|
-
# Zeile in Werte konvertieren
|
|
205
|
-
yield convert_line(line, $1, $2)
|
|
72
|
+
begin
|
|
73
|
+
@dbm_name = Lingo.find(:store, @src_file)
|
|
74
|
+
FileUtils.mkdir_p(File.dirname(@dbm_name))
|
|
75
|
+
rescue NoWritableStoreError
|
|
76
|
+
@backend = HashStore
|
|
206
77
|
end
|
|
207
78
|
|
|
208
|
-
|
|
209
|
-
reject_file.close
|
|
210
|
-
@pn_reject.delete if @pn_reject.size == 0
|
|
211
|
-
|
|
212
|
-
self
|
|
213
|
-
rescue RuntimeError
|
|
214
|
-
Lingo.error(fail_msg)
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
end
|
|
218
|
-
|
|
219
|
-
# Abgeleitet von TxtFile behandelt die Klasse Dateien mit dem Format <tt>SingleWord</tt>.
|
|
220
|
-
# Eine Zeile <tt>"Fachbegriff\n"</tt> wird gewandelt in <tt>[ 'fachbegriff', ['#s'] ]</tt>.
|
|
221
|
-
# Die Wortklasse kann über den Parameter <tt>def-wc</tt> beeinflusst werden.
|
|
222
|
-
|
|
223
|
-
class TxtFile_Singleword < TxtFile
|
|
224
|
-
|
|
225
|
-
def initialize(id, lingo)
|
|
226
|
-
super
|
|
227
|
-
|
|
228
|
-
@wc = @config.fetch('def-wc', 's').downcase
|
|
229
|
-
@mul_wc = @config.fetch('def-mul-wc', @wc).downcase
|
|
79
|
+
extend(backend)
|
|
230
80
|
|
|
231
|
-
@
|
|
232
|
-
end
|
|
233
|
-
|
|
234
|
-
private
|
|
81
|
+
@dbm_name << store_ext if respond_to?(:store_ext, true)
|
|
235
82
|
|
|
236
|
-
|
|
237
|
-
|
|
83
|
+
init_cachable
|
|
84
|
+
convert unless uptodate?
|
|
238
85
|
end
|
|
239
86
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
# Die Wortklasse kann über den Parameter <tt>def-wc</tt> beeinflusst werden.
|
|
245
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
|
246
|
-
|
|
247
|
-
class TxtFile_Keyvalue < TxtFile
|
|
248
|
-
|
|
249
|
-
def initialize(id, lingo)
|
|
250
|
-
super
|
|
251
|
-
|
|
252
|
-
@separator = @config.fetch('separator', '*')
|
|
253
|
-
@line_pattern = Regexp.new('^(' + @legal_word + ')' + Regexp.escape(@separator) + '(' + @legal_word + ')$')
|
|
87
|
+
def backend
|
|
88
|
+
@backend ||= BACKENDS.find { |mod|
|
|
89
|
+
break self.class.const_get("#{mod}Store") if Object.const_defined?(mod)
|
|
90
|
+
} || HashStore
|
|
254
91
|
end
|
|
255
92
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
def convert_line(line, key, val)
|
|
259
|
-
key, val = key.strip, val.strip
|
|
260
|
-
val = '' if key == val
|
|
261
|
-
val = [val + '#' + @wordclass]
|
|
262
|
-
[key, val]
|
|
93
|
+
def closed?
|
|
94
|
+
@db.nil? || _closed?
|
|
263
95
|
end
|
|
264
96
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
class TxtFile_Wordclass < TxtFile
|
|
272
|
-
|
|
273
|
-
def initialize(id, lingo)
|
|
274
|
-
super
|
|
275
|
-
|
|
276
|
-
@separator = @config.fetch('separator', ',')
|
|
277
|
-
@line_pattern = Regexp.new('^(' + @legal_word + ')' + Regexp.escape(@separator) + '((?:' + @legal_word + '\043\w)+)$')
|
|
97
|
+
def open
|
|
98
|
+
@db = _open if closed?
|
|
99
|
+
block_given? ? yield(self) : self
|
|
100
|
+
ensure
|
|
101
|
+
close if @db && block_given?
|
|
278
102
|
end
|
|
279
103
|
|
|
280
|
-
|
|
104
|
+
def close
|
|
105
|
+
@db.close unless closed?
|
|
106
|
+
@db = nil
|
|
281
107
|
|
|
282
|
-
|
|
283
|
-
key, valstr = key.strip, val.strip
|
|
284
|
-
val = valstr.gsub(/\s+\043/, '#').scan(/\S.+?\s*\043\w/)
|
|
285
|
-
val = val.map do |str|
|
|
286
|
-
str =~ /^(.+)\043(.)/
|
|
287
|
-
($1 == key ? '' : $1) + '#' + $2
|
|
288
|
-
end
|
|
289
|
-
[key, val]
|
|
108
|
+
self
|
|
290
109
|
end
|
|
291
110
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
|
297
|
-
|
|
298
|
-
class TxtFile_Multivalue < TxtFile
|
|
299
|
-
|
|
300
|
-
def initialize(id, lingo)
|
|
301
|
-
super
|
|
302
|
-
|
|
303
|
-
@separator = @config.fetch('separator', ';')
|
|
304
|
-
@line_pattern = Regexp.new('^' + @legal_word + '(?:' + Regexp.escape(@separator) + @legal_word + ')*$')
|
|
111
|
+
def to_h
|
|
112
|
+
{}.tap { |hash| @db.each { |key, val|
|
|
113
|
+
hash[key.force_encoding(ENC).freeze] = val.force_encoding(ENC)
|
|
114
|
+
} unless closed? }
|
|
305
115
|
end
|
|
306
116
|
|
|
307
|
-
|
|
117
|
+
def [](key)
|
|
118
|
+
val = _val(key) unless closed?
|
|
119
|
+
return unless val
|
|
308
120
|
|
|
309
|
-
|
|
310
|
-
|
|
121
|
+
# Äquvalenzklassen behandeln
|
|
122
|
+
val.split(FLD_SEP).map { |v|
|
|
123
|
+
v =~ INDEX_PATTERN ? _val(v) : v
|
|
124
|
+
}.compact.join(FLD_SEP).split(FLD_SEP)
|
|
311
125
|
end
|
|
312
126
|
|
|
313
|
-
|
|
127
|
+
def []=(key, val)
|
|
128
|
+
return if closed?
|
|
314
129
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
# Die Sonderbehandlung erfolgt in der Klasse Txt2DbmConverter, wo daraus Schlüssel-Werte-Paare in der Form
|
|
318
|
-
# <tt>[ 'sieg', ['triumph'] ]</tt> und <tt>[ 'erfolg', ['triumph'] ]</tt> erzeugt werden.
|
|
319
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
|
130
|
+
val = val.dup
|
|
131
|
+
val.concat(retrieve(key)) if hit?(key)
|
|
320
132
|
|
|
321
|
-
|
|
133
|
+
val.sort!
|
|
134
|
+
val.uniq!
|
|
135
|
+
store(key, val)
|
|
322
136
|
|
|
323
|
-
|
|
324
|
-
|
|
137
|
+
val = val.join(FLD_SEP)
|
|
138
|
+
key, val = @crypter.encode(key, val) if @crypter
|
|
325
139
|
|
|
326
|
-
|
|
327
|
-
@line_pattern = Regexp.new('^' + @legal_word + '(?:' + Regexp.escape(@separator) + @legal_word + ')*$')
|
|
140
|
+
_set(key, val)
|
|
328
141
|
end
|
|
329
142
|
|
|
330
143
|
private
|
|
331
144
|
|
|
332
|
-
def
|
|
333
|
-
|
|
334
|
-
[
|
|
335
|
-
end
|
|
336
|
-
|
|
337
|
-
end
|
|
145
|
+
def uptodate?(file = @dbm_name)
|
|
146
|
+
src = Pathname.new(@src_file)
|
|
147
|
+
@source_key = lambda { [src.size, src.mtime].join(FLD_SEP) }
|
|
338
148
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
# Sprachkonfigurationsdatei <tt>de.lang</tt> unter <tt>language/dictionary/databases</tt>
|
|
342
|
-
# hinterlegt ist.
|
|
343
|
-
#
|
|
344
|
-
# Das Lesen und Schreiben der Datenbank erfolgt über die Funktionen []() und []=().
|
|
345
|
-
|
|
346
|
-
class DbmFile
|
|
347
|
-
|
|
348
|
-
include Cachable
|
|
349
|
-
|
|
350
|
-
INDEX_PATTERN = %r{\A#{Regexp.escape(IDX_REF)}\d+\z}
|
|
351
|
-
|
|
352
|
-
def self.open(*args)
|
|
353
|
-
dbm = new(*args)
|
|
354
|
-
dbm.open { yield dbm }
|
|
149
|
+
sys_key = open { @db[SYS_KEY] } if File.exist?(file)
|
|
150
|
+
sys_key && (!src.exist? || sys_key == @source_key.call)
|
|
355
151
|
end
|
|
356
152
|
|
|
357
|
-
def
|
|
358
|
-
@
|
|
359
|
-
|
|
360
|
-
init_cachable
|
|
361
|
-
|
|
362
|
-
config = lingo.database_config(id)
|
|
363
|
-
raise "No such database `#{id}'." unless config && config.has_key?('name')
|
|
364
|
-
|
|
365
|
-
@id, @dbm = id, nil
|
|
366
|
-
@src_file = Lingo.find(:dict, config['name'])
|
|
367
|
-
@dbm_name = Lingo.find(:store, @src_file)
|
|
368
|
-
|
|
369
|
-
Txt2DbmConverter.new(id, lingo).convert if read_mode && !uptodate?
|
|
370
|
-
|
|
371
|
-
@crypter = config.has_key?('crypt') ? Crypter.new : nil
|
|
372
|
-
|
|
373
|
-
FileUtils.mkdir_p(File.dirname(@dbm_name))
|
|
153
|
+
def uptodate!
|
|
154
|
+
@db[SYS_KEY] = @source_key.call
|
|
374
155
|
end
|
|
375
156
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
key = open { @dbm[SYS_KEY] }
|
|
380
|
-
rescue RuntimeError
|
|
381
|
-
end if File.exist?("#{@dbm_name}.pag")
|
|
382
|
-
|
|
383
|
-
key && (!(pn = Pathname.new(@src_file)).exist? || key == source_key(pn))
|
|
157
|
+
def create
|
|
158
|
+
_clear
|
|
159
|
+
open { yield }
|
|
384
160
|
end
|
|
385
161
|
|
|
386
|
-
def
|
|
387
|
-
if
|
|
388
|
-
@dbm = SDBM.open(@dbm_name)
|
|
389
|
-
block_given? ? yield : self
|
|
390
|
-
else
|
|
391
|
-
Lingo.error("DbmFile #{@dbm_name} bereits geöffnet")
|
|
392
|
-
end
|
|
393
|
-
ensure
|
|
394
|
-
close if @dbm && block_given?
|
|
162
|
+
def _clear
|
|
163
|
+
File.delete(@dbm_name) if File.exist?(@dbm_name)
|
|
395
164
|
end
|
|
396
165
|
|
|
397
|
-
def
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
@dbm.each { |key, val|
|
|
401
|
-
[key, val].each { |x| x.encode!(ENC) }
|
|
402
|
-
hash[key.freeze] = val
|
|
403
|
-
} unless closed?
|
|
404
|
-
|
|
405
|
-
hash
|
|
166
|
+
def _open
|
|
167
|
+
raise NotImplementedError
|
|
406
168
|
end
|
|
407
169
|
|
|
408
|
-
def
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
if closed?
|
|
412
|
-
files.each { |file| File.delete(file) if File.exist?(file) }
|
|
413
|
-
else
|
|
414
|
-
close
|
|
415
|
-
files.each { |file| File.delete(file) }
|
|
416
|
-
open
|
|
417
|
-
end
|
|
418
|
-
|
|
419
|
-
self
|
|
420
|
-
end
|
|
421
|
-
|
|
422
|
-
def close
|
|
423
|
-
unless closed?
|
|
424
|
-
@dbm.close
|
|
425
|
-
@dbm = nil
|
|
426
|
-
|
|
427
|
-
self
|
|
428
|
-
else
|
|
429
|
-
#Lingo.error("DbmFile #{@dbm_name} nicht geöffnet")
|
|
430
|
-
end
|
|
431
|
-
end
|
|
432
|
-
|
|
433
|
-
def closed?
|
|
434
|
-
@dbm.nil? || @dbm.closed?
|
|
170
|
+
def _closed?
|
|
171
|
+
@db.closed?
|
|
435
172
|
end
|
|
436
173
|
|
|
437
|
-
def
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
if val = _get(key)
|
|
441
|
-
# Äquvalenzklassen behandeln
|
|
442
|
-
val.split(FLD_SEP).map { |v|
|
|
443
|
-
v =~ INDEX_PATTERN ? _get(v) : v
|
|
444
|
-
}.compact.join(FLD_SEP).split(FLD_SEP)
|
|
445
|
-
end
|
|
446
|
-
end
|
|
447
|
-
|
|
448
|
-
def []=(key, val)
|
|
449
|
-
return if closed?
|
|
450
|
-
|
|
451
|
-
val += retrieve(key) if hit?(key)
|
|
452
|
-
|
|
453
|
-
store(key, val = val.sort.uniq)
|
|
454
|
-
_set(key, val.join(FLD_SEP))
|
|
174
|
+
def _set(key, val)
|
|
175
|
+
@db[key] = val
|
|
455
176
|
end
|
|
456
177
|
|
|
457
|
-
def
|
|
458
|
-
|
|
459
|
-
@dbm[SYS_KEY] = source_key(Pathname.new(Lingo.find(:dict, filename)))
|
|
178
|
+
def _get(key)
|
|
179
|
+
@db[key]
|
|
460
180
|
end
|
|
461
181
|
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
if val = @dbm[@crypter ? @crypter.digest(key) : key]
|
|
466
|
-
val.encode!(ENC)
|
|
182
|
+
def _val(key)
|
|
183
|
+
if val = _get(@crypter ? @crypter.digest(key) : key)
|
|
184
|
+
val.force_encoding(ENC)
|
|
467
185
|
@crypter ? @crypter.decode(key, val) : val
|
|
468
186
|
end
|
|
469
187
|
end
|
|
470
188
|
|
|
471
|
-
def
|
|
472
|
-
|
|
473
|
-
@dbm[key] = (val.length < 950) ? val : val[0, 950]
|
|
474
|
-
end
|
|
475
|
-
|
|
476
|
-
def source_key(src)
|
|
477
|
-
[src.size, src.mtime].join(FLD_SEP)
|
|
478
|
-
end
|
|
479
|
-
|
|
480
|
-
end
|
|
481
|
-
|
|
482
|
-
# Die Klasse Txt2DbConverter steuert die Konvertierung von Wörterbuch-Quelldateien in
|
|
483
|
-
# Lingo-Datenbanken. Die Identifizierung der Quelldatei erfolgt über die ID
|
|
484
|
-
# der Datei, so wie sie in der Sprachkonfigurationsdatei <tt>de.lang</tt> unter
|
|
485
|
-
# <tt>language/dictionary/databases</tt> hinterlegt ist.
|
|
486
|
-
|
|
487
|
-
class Txt2DbmConverter
|
|
189
|
+
def convert(verbose = @lingo.config.stderr.tty?)
|
|
190
|
+
src = Source.get(@config.fetch('txt-format', 'KeyValue'), @id, @lingo)
|
|
488
191
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
192
|
+
if lex = @config['use-lex']
|
|
193
|
+
a, s = [{
|
|
194
|
+
'source' => lex.split(STRING_SEPARATOR_RE),
|
|
195
|
+
'mode' => @config['lex-mode']
|
|
196
|
+
}, @lingo], ' '
|
|
492
197
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
@source = case @format
|
|
496
|
-
when 'singleword' then TxtFile_Singleword
|
|
497
|
-
when 'keyvalue' then TxtFile_Keyvalue
|
|
498
|
-
when 'wordclass' then TxtFile_Wordclass
|
|
499
|
-
when 'multivalue' then TxtFile_Multivalue
|
|
500
|
-
when 'multikey' then TxtFile_Multikey
|
|
501
|
-
else
|
|
502
|
-
Lingo.error("Unbekanntes Textformat '#{config['txt-format'].downcase}' bei '#{'language/dictionary/databases/' + id}'")
|
|
503
|
-
end.new(id, lingo)
|
|
198
|
+
dic = Language::Dictionary.new(*a)
|
|
199
|
+
gra = Language::Grammar.new(*a)
|
|
504
200
|
|
|
505
|
-
|
|
506
|
-
|
|
201
|
+
block = lambda { |form|
|
|
202
|
+
res = dic.find_word(form)
|
|
507
203
|
|
|
508
|
-
|
|
509
|
-
|
|
204
|
+
if res.unknown?
|
|
205
|
+
res = gra.find_compositum(form)
|
|
206
|
+
com = res.compo_form
|
|
207
|
+
end
|
|
510
208
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
begin
|
|
516
|
-
@lexicalize = true
|
|
517
|
-
@dictionary = Dictionary.new({ 'source' => lex_dic.split(STRING_SEPERATOR_PATTERN), 'mode' => lex_mod }, lingo)
|
|
518
|
-
@grammar = Grammar.new({ 'source' => lex_dic.split(STRING_SEPERATOR_PATTERN), 'mode' => lex_mod }, lingo)
|
|
519
|
-
rescue RuntimeError
|
|
520
|
-
Lingo.error("Auf das Wörterbuch (#{lex_dic}) für die Lexikalisierung der Mehrwortgruppen in (#{@config['name']}) konnte nicht zugegriffen werden")
|
|
521
|
-
end if lex_dic
|
|
522
|
-
end
|
|
523
|
-
|
|
524
|
-
def convert
|
|
525
|
-
@progress.start('convert', @source.size)
|
|
526
|
-
|
|
527
|
-
@destination.open
|
|
528
|
-
@destination.clear
|
|
209
|
+
com ? com.form : res.norm
|
|
210
|
+
}
|
|
211
|
+
end
|
|
529
212
|
|
|
530
|
-
|
|
531
|
-
|
|
213
|
+
ShowProgress.new(self, src.size, verbose) { |progress| create {
|
|
214
|
+
src.each { |key, val|
|
|
215
|
+
progress[src.position]
|
|
532
216
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
# Schlüssel in Grundform wandeln
|
|
536
|
-
gkey = key.split(' ').map do |form|
|
|
217
|
+
if key
|
|
218
|
+
key.chomp!('.')
|
|
537
219
|
|
|
538
|
-
|
|
539
|
-
|
|
220
|
+
if lex && key.include?(s)
|
|
221
|
+
k = key.split(s).map!(&block).join(s)
|
|
540
222
|
|
|
541
|
-
|
|
542
|
-
|
|
223
|
+
c = k.count(s) + 1
|
|
224
|
+
self[k.split(s)[0, 3].join(s)] = ["#{KEY_REF}#{c}"] if c > 3
|
|
543
225
|
|
|
544
|
-
|
|
545
|
-
if result.attr == WA_UNKNOWN
|
|
546
|
-
result = @grammar.find_compositum(wordform)
|
|
547
|
-
compo = result.compo_form
|
|
226
|
+
key, val = k, val.map { |v| v.start_with?('#') ? key + v : v }
|
|
548
227
|
end
|
|
228
|
+
end
|
|
549
229
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
skey = gkey.split
|
|
554
|
-
# Zusatzschlüssel einfügen, wenn Anzahl Wörter > 3
|
|
555
|
-
@destination[skey[0...3].join(' ')] = [KEY_REF + skey.size.to_s] if skey.size > 3
|
|
556
|
-
|
|
557
|
-
value = value.map { |v| v =~ /^\043/ ? key + v : v }
|
|
558
|
-
key = gkey
|
|
559
|
-
end
|
|
560
|
-
|
|
561
|
-
# Format Sonderbehandlungen
|
|
562
|
-
key.gsub!(/\.$/, '') if key
|
|
563
|
-
case @format
|
|
564
|
-
when 'multivalue' # Äquvalenzklassen behandeln
|
|
565
|
-
key = IDX_REF + @index.to_s
|
|
566
|
-
@index += 1
|
|
567
|
-
@destination[key] = value
|
|
568
|
-
value.each { |v| @destination[v] = [key] }
|
|
569
|
-
when 'multikey' # Äquvalenzklassen behandeln
|
|
570
|
-
value.each { |v| @destination[v] = [key] }
|
|
571
|
-
else
|
|
572
|
-
@destination[key] = value
|
|
573
|
-
end
|
|
230
|
+
src.set(self, key, val)
|
|
231
|
+
}
|
|
574
232
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
@destination.set_source_file(@config['name'])
|
|
578
|
-
@destination.close
|
|
579
|
-
|
|
580
|
-
@progress.stop('ok')
|
|
581
|
-
|
|
582
|
-
self
|
|
233
|
+
uptodate!
|
|
234
|
+
} }
|
|
583
235
|
end
|
|
584
236
|
|
|
585
237
|
end
|