lingo 1.8.0 → 1.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
data/lib/lingo/database.rb
CHANGED
@@ -1,585 +1,237 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
#--
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# Copyright (C) 2007
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
25
|
-
# welcomeATlex-lingoDOTde near 50°55'N+6°55'E.
|
26
|
-
#
|
27
|
-
# Lex Lingo rules from here on
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
28
25
|
#++
|
29
26
|
|
30
|
-
require 'sdbm'
|
31
27
|
require 'pathname'
|
32
28
|
require 'fileutils'
|
33
29
|
require 'digest/sha1'
|
34
30
|
|
35
|
-
require_relative '
|
36
|
-
require_relative '
|
37
|
-
require_relative '
|
38
|
-
require_relative '
|
31
|
+
require_relative 'database/show_progress'
|
32
|
+
require_relative 'database/crypter'
|
33
|
+
require_relative 'database/source'
|
34
|
+
require_relative 'database/hash_store'
|
35
|
+
require_relative 'database/sdbm_store'
|
36
|
+
require_relative 'database/gdbm_store'
|
37
|
+
require_relative 'database/libcdb_store'
|
39
38
|
|
40
39
|
class Lingo
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
# to actually substitute the placeholder.
|
49
|
-
length = (format % 0).length
|
50
|
-
|
51
|
-
# Now we know how far to "go back" to
|
52
|
-
# overwrite the formatted string...
|
53
|
-
back = "\b" * length
|
54
|
-
|
55
|
-
@format = format + back
|
56
|
-
@clear = ' ' * length + back
|
57
|
-
|
58
|
-
print msg, ': '
|
59
|
-
end
|
60
|
-
|
61
|
-
def start(msg, max)
|
62
|
-
@ratio, @count, @next_step = max / 100.0, 0, 0
|
63
|
-
print msg, ' '
|
64
|
-
step
|
65
|
-
end
|
66
|
-
|
67
|
-
def stop(msg)
|
68
|
-
print @clear
|
69
|
-
print msg, "\n"
|
70
|
-
end
|
71
|
-
|
72
|
-
def tick(value)
|
73
|
-
@count = value
|
74
|
-
step if @count >= @next_step
|
75
|
-
end
|
76
|
-
|
77
|
-
private
|
78
|
-
|
79
|
-
def step
|
80
|
-
percent = @count / @ratio
|
81
|
-
@next_step = (percent + 1) * @ratio
|
82
|
-
|
83
|
-
print @format % percent
|
84
|
-
end
|
85
|
-
|
86
|
-
def print(*args)
|
87
|
-
@out.print(*args) if @active
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
# Crypter ermöglicht die Ver- und Entschlüsselung von Wörterbüchern
|
93
|
-
|
94
|
-
class Crypter
|
95
|
-
|
96
|
-
HEX_CHARS = '0123456789abcdef'.freeze
|
97
|
-
|
98
|
-
def digest(key)
|
99
|
-
Digest::SHA1.hexdigest(key)
|
100
|
-
end
|
101
|
-
|
102
|
-
def encode(key, val)
|
103
|
-
hex = ''
|
104
|
-
|
105
|
-
crypt(key, val).each_byte { |byte|
|
106
|
-
# To get a hex representation for a char we just utilize
|
107
|
-
# the quotient and the remainder of division by base 16.
|
108
|
-
q, r = byte.divmod(16)
|
109
|
-
hex << HEX_CHARS[q] << HEX_CHARS[r]
|
110
|
-
}
|
41
|
+
# Die Klasse Database stellt eine einheitliche Schnittstelle auf Lingo-Datenbanken bereit.
|
42
|
+
# Die Identifizierung der Datenbank erfolgt über die ID der Datenbank, so wie sie in der
|
43
|
+
# Sprachkonfigurationsdatei <tt>de.lang</tt> unter <tt>language/dictionary/databases</tt>
|
44
|
+
# hinterlegt ist.
|
45
|
+
#
|
46
|
+
# Das Lesen und Schreiben der Datenbank erfolgt über die Funktionen []() und []=().
|
111
47
|
|
112
|
-
|
113
|
-
end
|
48
|
+
class Database
|
114
49
|
|
115
|
-
|
116
|
-
str, q, first = '', 0, false
|
50
|
+
include Cachable
|
117
51
|
|
118
|
-
|
119
|
-
byte = byte.chr(ENC)
|
52
|
+
BACKENDS = %w[LibCDB SDBM GDBM].unshift(ENV['LINGO_BACKEND']).compact.uniq
|
120
53
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
else
|
126
|
-
# Now we got both parts, so let's revert the divmod(16)
|
127
|
-
str << q * 16 + HEX_CHARS.index(byte)
|
128
|
-
end
|
129
|
-
}
|
54
|
+
FLD_SEP = '|'
|
55
|
+
IDX_REF = '^'
|
56
|
+
KEY_REF = '*'
|
57
|
+
SYS_KEY = '~'
|
130
58
|
|
131
|
-
|
132
|
-
end
|
133
|
-
|
134
|
-
private
|
59
|
+
INDEX_PATTERN = %r{\A#{Regexp.escape(IDX_REF)}\d+\z}
|
135
60
|
|
136
|
-
def
|
137
|
-
|
138
|
-
v.each_codepoint { |x| c << (x ^ y.next).chr(ENC) }
|
139
|
-
c
|
61
|
+
def self.open(*args, &block)
|
62
|
+
new(*args).open(&block)
|
140
63
|
end
|
141
64
|
|
142
|
-
end
|
143
|
-
|
144
|
-
# Die Klasse TxtFile stellt eine einheitliche Schnittstelle auf die unterschiedlichen Formate
|
145
|
-
# von Wörterbuch-Quelldateien bereit. Die Identifizierung der Quelldatei erfolgt über die ID
|
146
|
-
# der Datei, so wie sie in der Sprachkonfigurationsdatei <tt>de.lang</tt> unter
|
147
|
-
# <tt>language/dictionary/databases</tt> hinterlegt ist.
|
148
|
-
#
|
149
|
-
# Die Verarbeitung der Wörterbücher erfolgt mittels des Iterators <b>each</b>, der für jede
|
150
|
-
# Zeile der Quelldatei ein Array bereitstellt in der Form <tt>[ key, [val1, val2, ...] ]</tt>.
|
151
|
-
#
|
152
|
-
# Nicht korrekt erkannte Zeilen werden abgewiesen und in eine Revoke-Datei gespeichert, die
|
153
|
-
# an der Dateiendung <tt>.rev</tt> zu erkennen ist.
|
154
|
-
|
155
|
-
class TxtFile
|
156
|
-
|
157
|
-
attr_reader :position
|
158
|
-
|
159
65
|
def initialize(id, lingo)
|
160
|
-
# Konfiguration der Datenbank auslesen
|
161
66
|
@config = lingo.database_config(id)
|
162
67
|
|
163
|
-
|
164
|
-
|
165
|
-
@
|
166
|
-
@pn_reject = Pathname.new(Lingo.find(:store, source_file) << '.rev')
|
68
|
+
@id, @lingo = id, lingo
|
69
|
+
@src_file = Lingo.find(:dict, @config['name'])
|
70
|
+
@crypter = Crypter.new if @config.has_key?('crypt')
|
167
71
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
@legal_word = '(?:' + PRINTABLE_CHAR + '|[' + Regexp.escape('- /&()[].,') + '])+' # TODO: v1.60 - ',' bei TxtFile zulassen; in const.rb einbauen
|
174
|
-
@line_pattern = Regexp.new('^'+@legal_word+'$')
|
175
|
-
|
176
|
-
@position = 0
|
177
|
-
end
|
178
|
-
|
179
|
-
def size
|
180
|
-
@pn_source.size
|
181
|
-
end
|
182
|
-
|
183
|
-
def each
|
184
|
-
# Reject-Datei öffnen
|
185
|
-
fail_msg = "Fehler beim öffnen der Reject-Datei '#{@pn_reject.to_s}'"
|
186
|
-
reject_file = @pn_reject.open('w', encoding: ENC)
|
187
|
-
|
188
|
-
# Alle Zeilen der Quelldatei verarbeiten
|
189
|
-
fail_msg = "Fehler beim öffnen der Wörterbuch-Quelldatei '#{@pn_source.to_s}'"
|
190
|
-
|
191
|
-
@pn_source.each_line($/, encoding: ENC) do |raw_line|
|
192
|
-
@position += raw_line.size # Position innerhalb der Datei aktualisieren
|
193
|
-
line = raw_line.chomp.downcase # Zeile normieren
|
194
|
-
|
195
|
-
next if line =~ /^\s*\043/ || line.strip == '' # Kommentarzeilen und leere Zeilen überspringen
|
196
|
-
|
197
|
-
# Ungültige Zeilen protokollieren
|
198
|
-
unless line.length < 4096 && line =~ @line_pattern
|
199
|
-
fail_msg = "Fehler beim schreiben der Reject-Datei '#{@pn_reject.to_s}'"
|
200
|
-
reject_file.puts line
|
201
|
-
next
|
202
|
-
end
|
203
|
-
|
204
|
-
# Zeile in Werte konvertieren
|
205
|
-
yield convert_line(line, $1, $2)
|
72
|
+
begin
|
73
|
+
@dbm_name = Lingo.find(:store, @src_file)
|
74
|
+
FileUtils.mkdir_p(File.dirname(@dbm_name))
|
75
|
+
rescue NoWritableStoreError
|
76
|
+
@backend = HashStore
|
206
77
|
end
|
207
78
|
|
208
|
-
|
209
|
-
reject_file.close
|
210
|
-
@pn_reject.delete if @pn_reject.size == 0
|
211
|
-
|
212
|
-
self
|
213
|
-
rescue RuntimeError
|
214
|
-
Lingo.error(fail_msg)
|
215
|
-
end
|
216
|
-
|
217
|
-
end
|
218
|
-
|
219
|
-
# Abgeleitet von TxtFile behandelt die Klasse Dateien mit dem Format <tt>SingleWord</tt>.
|
220
|
-
# Eine Zeile <tt>"Fachbegriff\n"</tt> wird gewandelt in <tt>[ 'fachbegriff', ['#s'] ]</tt>.
|
221
|
-
# Die Wortklasse kann über den Parameter <tt>def-wc</tt> beeinflusst werden.
|
222
|
-
|
223
|
-
class TxtFile_Singleword < TxtFile
|
224
|
-
|
225
|
-
def initialize(id, lingo)
|
226
|
-
super
|
227
|
-
|
228
|
-
@wc = @config.fetch('def-wc', 's').downcase
|
229
|
-
@mul_wc = @config.fetch('def-mul-wc', @wc).downcase
|
79
|
+
extend(backend)
|
230
80
|
|
231
|
-
@
|
232
|
-
end
|
233
|
-
|
234
|
-
private
|
81
|
+
@dbm_name << store_ext if respond_to?(:store_ext, true)
|
235
82
|
|
236
|
-
|
237
|
-
|
83
|
+
init_cachable
|
84
|
+
convert unless uptodate?
|
238
85
|
end
|
239
86
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
# Die Wortklasse kann über den Parameter <tt>def-wc</tt> beeinflusst werden.
|
245
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
246
|
-
|
247
|
-
class TxtFile_Keyvalue < TxtFile
|
248
|
-
|
249
|
-
def initialize(id, lingo)
|
250
|
-
super
|
251
|
-
|
252
|
-
@separator = @config.fetch('separator', '*')
|
253
|
-
@line_pattern = Regexp.new('^(' + @legal_word + ')' + Regexp.escape(@separator) + '(' + @legal_word + ')$')
|
87
|
+
def backend
|
88
|
+
@backend ||= BACKENDS.find { |mod|
|
89
|
+
break self.class.const_get("#{mod}Store") if Object.const_defined?(mod)
|
90
|
+
} || HashStore
|
254
91
|
end
|
255
92
|
|
256
|
-
|
257
|
-
|
258
|
-
def convert_line(line, key, val)
|
259
|
-
key, val = key.strip, val.strip
|
260
|
-
val = '' if key == val
|
261
|
-
val = [val + '#' + @wordclass]
|
262
|
-
[key, val]
|
93
|
+
def closed?
|
94
|
+
@db.nil? || _closed?
|
263
95
|
end
|
264
96
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
class TxtFile_Wordclass < TxtFile
|
272
|
-
|
273
|
-
def initialize(id, lingo)
|
274
|
-
super
|
275
|
-
|
276
|
-
@separator = @config.fetch('separator', ',')
|
277
|
-
@line_pattern = Regexp.new('^(' + @legal_word + ')' + Regexp.escape(@separator) + '((?:' + @legal_word + '\043\w)+)$')
|
97
|
+
def open
|
98
|
+
@db = _open if closed?
|
99
|
+
block_given? ? yield(self) : self
|
100
|
+
ensure
|
101
|
+
close if @db && block_given?
|
278
102
|
end
|
279
103
|
|
280
|
-
|
104
|
+
def close
|
105
|
+
@db.close unless closed?
|
106
|
+
@db = nil
|
281
107
|
|
282
|
-
|
283
|
-
key, valstr = key.strip, val.strip
|
284
|
-
val = valstr.gsub(/\s+\043/, '#').scan(/\S.+?\s*\043\w/)
|
285
|
-
val = val.map do |str|
|
286
|
-
str =~ /^(.+)\043(.)/
|
287
|
-
($1 == key ? '' : $1) + '#' + $2
|
288
|
-
end
|
289
|
-
[key, val]
|
108
|
+
self
|
290
109
|
end
|
291
110
|
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
297
|
-
|
298
|
-
class TxtFile_Multivalue < TxtFile
|
299
|
-
|
300
|
-
def initialize(id, lingo)
|
301
|
-
super
|
302
|
-
|
303
|
-
@separator = @config.fetch('separator', ';')
|
304
|
-
@line_pattern = Regexp.new('^' + @legal_word + '(?:' + Regexp.escape(@separator) + @legal_word + ')*$')
|
111
|
+
def to_h
|
112
|
+
{}.tap { |hash| @db.each { |key, val|
|
113
|
+
hash[key.force_encoding(ENC).freeze] = val.force_encoding(ENC)
|
114
|
+
} unless closed? }
|
305
115
|
end
|
306
116
|
|
307
|
-
|
117
|
+
def [](key)
|
118
|
+
val = _val(key) unless closed?
|
119
|
+
return unless val
|
308
120
|
|
309
|
-
|
310
|
-
|
121
|
+
# Äquvalenzklassen behandeln
|
122
|
+
val.split(FLD_SEP).map { |v|
|
123
|
+
v =~ INDEX_PATTERN ? _val(v) : v
|
124
|
+
}.compact.join(FLD_SEP).split(FLD_SEP)
|
311
125
|
end
|
312
126
|
|
313
|
-
|
127
|
+
def []=(key, val)
|
128
|
+
return if closed?
|
314
129
|
|
315
|
-
|
316
|
-
|
317
|
-
# Die Sonderbehandlung erfolgt in der Klasse Txt2DbmConverter, wo daraus Schlüssel-Werte-Paare in der Form
|
318
|
-
# <tt>[ 'sieg', ['triumph'] ]</tt> und <tt>[ 'erfolg', ['triumph'] ]</tt> erzeugt werden.
|
319
|
-
# Der Trenner zwischen Schlüssel und Projektion kann über den Parameter <tt>separator</tt> geändert werden.
|
130
|
+
val = val.dup
|
131
|
+
val.concat(retrieve(key)) if hit?(key)
|
320
132
|
|
321
|
-
|
133
|
+
val.sort!
|
134
|
+
val.uniq!
|
135
|
+
store(key, val)
|
322
136
|
|
323
|
-
|
324
|
-
|
137
|
+
val = val.join(FLD_SEP)
|
138
|
+
key, val = @crypter.encode(key, val) if @crypter
|
325
139
|
|
326
|
-
|
327
|
-
@line_pattern = Regexp.new('^' + @legal_word + '(?:' + Regexp.escape(@separator) + @legal_word + ')*$')
|
140
|
+
_set(key, val)
|
328
141
|
end
|
329
142
|
|
330
143
|
private
|
331
144
|
|
332
|
-
def
|
333
|
-
|
334
|
-
[
|
335
|
-
end
|
336
|
-
|
337
|
-
end
|
145
|
+
def uptodate?(file = @dbm_name)
|
146
|
+
src = Pathname.new(@src_file)
|
147
|
+
@source_key = lambda { [src.size, src.mtime].join(FLD_SEP) }
|
338
148
|
|
339
|
-
|
340
|
-
|
341
|
-
# Sprachkonfigurationsdatei <tt>de.lang</tt> unter <tt>language/dictionary/databases</tt>
|
342
|
-
# hinterlegt ist.
|
343
|
-
#
|
344
|
-
# Das Lesen und Schreiben der Datenbank erfolgt über die Funktionen []() und []=().
|
345
|
-
|
346
|
-
class DbmFile
|
347
|
-
|
348
|
-
include Cachable
|
349
|
-
|
350
|
-
INDEX_PATTERN = %r{\A#{Regexp.escape(IDX_REF)}\d+\z}
|
351
|
-
|
352
|
-
def self.open(*args)
|
353
|
-
dbm = new(*args)
|
354
|
-
dbm.open { yield dbm }
|
149
|
+
sys_key = open { @db[SYS_KEY] } if File.exist?(file)
|
150
|
+
sys_key && (!src.exist? || sys_key == @source_key.call)
|
355
151
|
end
|
356
152
|
|
357
|
-
def
|
358
|
-
@
|
359
|
-
|
360
|
-
init_cachable
|
361
|
-
|
362
|
-
config = lingo.database_config(id)
|
363
|
-
raise "No such database `#{id}'." unless config && config.has_key?('name')
|
364
|
-
|
365
|
-
@id, @dbm = id, nil
|
366
|
-
@src_file = Lingo.find(:dict, config['name'])
|
367
|
-
@dbm_name = Lingo.find(:store, @src_file)
|
368
|
-
|
369
|
-
Txt2DbmConverter.new(id, lingo).convert if read_mode && !uptodate?
|
370
|
-
|
371
|
-
@crypter = config.has_key?('crypt') ? Crypter.new : nil
|
372
|
-
|
373
|
-
FileUtils.mkdir_p(File.dirname(@dbm_name))
|
153
|
+
def uptodate!
|
154
|
+
@db[SYS_KEY] = @source_key.call
|
374
155
|
end
|
375
156
|
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
key = open { @dbm[SYS_KEY] }
|
380
|
-
rescue RuntimeError
|
381
|
-
end if File.exist?("#{@dbm_name}.pag")
|
382
|
-
|
383
|
-
key && (!(pn = Pathname.new(@src_file)).exist? || key == source_key(pn))
|
157
|
+
def create
|
158
|
+
_clear
|
159
|
+
open { yield }
|
384
160
|
end
|
385
161
|
|
386
|
-
def
|
387
|
-
if
|
388
|
-
@dbm = SDBM.open(@dbm_name)
|
389
|
-
block_given? ? yield : self
|
390
|
-
else
|
391
|
-
Lingo.error("DbmFile #{@dbm_name} bereits geöffnet")
|
392
|
-
end
|
393
|
-
ensure
|
394
|
-
close if @dbm && block_given?
|
162
|
+
def _clear
|
163
|
+
File.delete(@dbm_name) if File.exist?(@dbm_name)
|
395
164
|
end
|
396
165
|
|
397
|
-
def
|
398
|
-
|
399
|
-
|
400
|
-
@dbm.each { |key, val|
|
401
|
-
[key, val].each { |x| x.encode!(ENC) }
|
402
|
-
hash[key.freeze] = val
|
403
|
-
} unless closed?
|
404
|
-
|
405
|
-
hash
|
166
|
+
def _open
|
167
|
+
raise NotImplementedError
|
406
168
|
end
|
407
169
|
|
408
|
-
def
|
409
|
-
|
410
|
-
|
411
|
-
if closed?
|
412
|
-
files.each { |file| File.delete(file) if File.exist?(file) }
|
413
|
-
else
|
414
|
-
close
|
415
|
-
files.each { |file| File.delete(file) }
|
416
|
-
open
|
417
|
-
end
|
418
|
-
|
419
|
-
self
|
420
|
-
end
|
421
|
-
|
422
|
-
def close
|
423
|
-
unless closed?
|
424
|
-
@dbm.close
|
425
|
-
@dbm = nil
|
426
|
-
|
427
|
-
self
|
428
|
-
else
|
429
|
-
#Lingo.error("DbmFile #{@dbm_name} nicht geöffnet")
|
430
|
-
end
|
431
|
-
end
|
432
|
-
|
433
|
-
def closed?
|
434
|
-
@dbm.nil? || @dbm.closed?
|
170
|
+
def _closed?
|
171
|
+
@db.closed?
|
435
172
|
end
|
436
173
|
|
437
|
-
def
|
438
|
-
|
439
|
-
|
440
|
-
if val = _get(key)
|
441
|
-
# Äquvalenzklassen behandeln
|
442
|
-
val.split(FLD_SEP).map { |v|
|
443
|
-
v =~ INDEX_PATTERN ? _get(v) : v
|
444
|
-
}.compact.join(FLD_SEP).split(FLD_SEP)
|
445
|
-
end
|
446
|
-
end
|
447
|
-
|
448
|
-
def []=(key, val)
|
449
|
-
return if closed?
|
450
|
-
|
451
|
-
val += retrieve(key) if hit?(key)
|
452
|
-
|
453
|
-
store(key, val = val.sort.uniq)
|
454
|
-
_set(key, val.join(FLD_SEP))
|
174
|
+
def _set(key, val)
|
175
|
+
@db[key] = val
|
455
176
|
end
|
456
177
|
|
457
|
-
def
|
458
|
-
|
459
|
-
@dbm[SYS_KEY] = source_key(Pathname.new(Lingo.find(:dict, filename)))
|
178
|
+
def _get(key)
|
179
|
+
@db[key]
|
460
180
|
end
|
461
181
|
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
if val = @dbm[@crypter ? @crypter.digest(key) : key]
|
466
|
-
val.encode!(ENC)
|
182
|
+
def _val(key)
|
183
|
+
if val = _get(@crypter ? @crypter.digest(key) : key)
|
184
|
+
val.force_encoding(ENC)
|
467
185
|
@crypter ? @crypter.decode(key, val) : val
|
468
186
|
end
|
469
187
|
end
|
470
188
|
|
471
|
-
def
|
472
|
-
|
473
|
-
@dbm[key] = (val.length < 950) ? val : val[0, 950]
|
474
|
-
end
|
475
|
-
|
476
|
-
def source_key(src)
|
477
|
-
[src.size, src.mtime].join(FLD_SEP)
|
478
|
-
end
|
479
|
-
|
480
|
-
end
|
481
|
-
|
482
|
-
# Die Klasse Txt2DbConverter steuert die Konvertierung von Wörterbuch-Quelldateien in
|
483
|
-
# Lingo-Datenbanken. Die Identifizierung der Quelldatei erfolgt über die ID
|
484
|
-
# der Datei, so wie sie in der Sprachkonfigurationsdatei <tt>de.lang</tt> unter
|
485
|
-
# <tt>language/dictionary/databases</tt> hinterlegt ist.
|
486
|
-
|
487
|
-
class Txt2DbmConverter
|
189
|
+
def convert(verbose = @lingo.config.stderr.tty?)
|
190
|
+
src = Source.get(@config.fetch('txt-format', 'KeyValue'), @id, @lingo)
|
488
191
|
|
489
|
-
|
490
|
-
|
491
|
-
|
192
|
+
if lex = @config['use-lex']
|
193
|
+
a, s = [{
|
194
|
+
'source' => lex.split(STRING_SEPARATOR_RE),
|
195
|
+
'mode' => @config['lex-mode']
|
196
|
+
}, @lingo], ' '
|
492
197
|
|
493
|
-
|
494
|
-
|
495
|
-
@source = case @format
|
496
|
-
when 'singleword' then TxtFile_Singleword
|
497
|
-
when 'keyvalue' then TxtFile_Keyvalue
|
498
|
-
when 'wordclass' then TxtFile_Wordclass
|
499
|
-
when 'multivalue' then TxtFile_Multivalue
|
500
|
-
when 'multikey' then TxtFile_Multikey
|
501
|
-
else
|
502
|
-
Lingo.error("Unbekanntes Textformat '#{config['txt-format'].downcase}' bei '#{'language/dictionary/databases/' + id}'")
|
503
|
-
end.new(id, lingo)
|
198
|
+
dic = Language::Dictionary.new(*a)
|
199
|
+
gra = Language::Grammar.new(*a)
|
504
200
|
|
505
|
-
|
506
|
-
|
201
|
+
block = lambda { |form|
|
202
|
+
res = dic.find_word(form)
|
507
203
|
|
508
|
-
|
509
|
-
|
204
|
+
if res.unknown?
|
205
|
+
res = gra.find_compositum(form)
|
206
|
+
com = res.compo_form
|
207
|
+
end
|
510
208
|
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
begin
|
516
|
-
@lexicalize = true
|
517
|
-
@dictionary = Dictionary.new({ 'source' => lex_dic.split(STRING_SEPERATOR_PATTERN), 'mode' => lex_mod }, lingo)
|
518
|
-
@grammar = Grammar.new({ 'source' => lex_dic.split(STRING_SEPERATOR_PATTERN), 'mode' => lex_mod }, lingo)
|
519
|
-
rescue RuntimeError
|
520
|
-
Lingo.error("Auf das Wörterbuch (#{lex_dic}) für die Lexikalisierung der Mehrwortgruppen in (#{@config['name']}) konnte nicht zugegriffen werden")
|
521
|
-
end if lex_dic
|
522
|
-
end
|
523
|
-
|
524
|
-
def convert
|
525
|
-
@progress.start('convert', @source.size)
|
526
|
-
|
527
|
-
@destination.open
|
528
|
-
@destination.clear
|
209
|
+
com ? com.form : res.norm
|
210
|
+
}
|
211
|
+
end
|
529
212
|
|
530
|
-
|
531
|
-
|
213
|
+
ShowProgress.new(self, src.size, verbose) { |progress| create {
|
214
|
+
src.each { |key, val|
|
215
|
+
progress[src.position]
|
532
216
|
|
533
|
-
|
534
|
-
|
535
|
-
# Schlüssel in Grundform wandeln
|
536
|
-
gkey = key.split(' ').map do |form|
|
217
|
+
if key
|
218
|
+
key.chomp!('.')
|
537
219
|
|
538
|
-
|
539
|
-
|
220
|
+
if lex && key.include?(s)
|
221
|
+
k = key.split(s).map!(&block).join(s)
|
540
222
|
|
541
|
-
|
542
|
-
|
223
|
+
c = k.count(s) + 1
|
224
|
+
self[k.split(s)[0, 3].join(s)] = ["#{KEY_REF}#{c}"] if c > 3
|
543
225
|
|
544
|
-
|
545
|
-
if result.attr == WA_UNKNOWN
|
546
|
-
result = @grammar.find_compositum(wordform)
|
547
|
-
compo = result.compo_form
|
226
|
+
key, val = k, val.map { |v| v.start_with?('#') ? key + v : v }
|
548
227
|
end
|
228
|
+
end
|
549
229
|
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
skey = gkey.split
|
554
|
-
# Zusatzschlüssel einfügen, wenn Anzahl Wörter > 3
|
555
|
-
@destination[skey[0...3].join(' ')] = [KEY_REF + skey.size.to_s] if skey.size > 3
|
556
|
-
|
557
|
-
value = value.map { |v| v =~ /^\043/ ? key + v : v }
|
558
|
-
key = gkey
|
559
|
-
end
|
560
|
-
|
561
|
-
# Format Sonderbehandlungen
|
562
|
-
key.gsub!(/\.$/, '') if key
|
563
|
-
case @format
|
564
|
-
when 'multivalue' # Äquvalenzklassen behandeln
|
565
|
-
key = IDX_REF + @index.to_s
|
566
|
-
@index += 1
|
567
|
-
@destination[key] = value
|
568
|
-
value.each { |v| @destination[v] = [key] }
|
569
|
-
when 'multikey' # Äquvalenzklassen behandeln
|
570
|
-
value.each { |v| @destination[v] = [key] }
|
571
|
-
else
|
572
|
-
@destination[key] = value
|
573
|
-
end
|
230
|
+
src.set(self, key, val)
|
231
|
+
}
|
574
232
|
|
575
|
-
|
576
|
-
|
577
|
-
@destination.set_source_file(@config['name'])
|
578
|
-
@destination.close
|
579
|
-
|
580
|
-
@progress.stop('ok')
|
581
|
-
|
582
|
-
self
|
233
|
+
uptodate!
|
234
|
+
} }
|
583
235
|
end
|
584
236
|
|
585
237
|
end
|