lingo 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +13 -0
- data/README +49 -29
- data/Rakefile +28 -4
- data/TODO +2 -9
- data/bin/lingo +24 -0
- data/bin/lingoctl +24 -0
- data/de/lingo-dic.txt +559 -74
- data/info/gpl-hdr.txt +21 -24
- data/lib/lingo.rb +83 -112
- data/lib/lingo/agenda_item.rb +53 -0
- data/lib/lingo/attendee.rb +261 -0
- data/lib/lingo/attendee/abbreviator.rb +95 -97
- data/lib/lingo/attendee/debugger.rb +94 -93
- data/lib/lingo/attendee/decomposer.rb +76 -83
- data/lib/lingo/attendee/dehyphenizer.rb +141 -144
- data/lib/lingo/attendee/formatter.rb +65 -0
- data/lib/lingo/attendee/multi_worder.rb +302 -0
- data/lib/lingo/attendee/noneword_filter.rb +89 -84
- data/lib/lingo/attendee/object_filter.rb +91 -0
- data/lib/lingo/attendee/sequencer.rb +159 -158
- data/lib/lingo/attendee/synonymer.rb +81 -84
- data/lib/lingo/attendee/text_reader.rb +242 -0
- data/lib/lingo/attendee/text_writer.rb +169 -0
- data/lib/lingo/attendee/tokenizer.rb +192 -191
- data/lib/lingo/attendee/variator.rb +152 -156
- data/lib/lingo/attendee/vector_filter.rb +140 -135
- data/lib/lingo/attendee/word_searcher.rb +98 -0
- data/lib/lingo/buffered_attendee.rb +69 -0
- data/lib/lingo/cachable.rb +58 -0
- data/lib/lingo/call.rb +72 -0
- data/lib/lingo/cli.rb +26 -0
- data/lib/lingo/config.rb +23 -26
- data/lib/lingo/core_ext.rb +42 -0
- data/lib/lingo/ctl.rb +239 -173
- data/lib/lingo/database.rb +148 -496
- data/lib/lingo/database/crypter.rb +85 -0
- data/lib/lingo/database/gdbm_store.rb +49 -0
- data/lib/lingo/database/hash_store.rb +67 -0
- data/lib/lingo/database/libcdb_store.rb +58 -0
- data/lib/lingo/database/sdbm_store.rb +64 -0
- data/lib/lingo/database/show_progress.rb +81 -0
- data/lib/lingo/database/source.rb +134 -0
- data/lib/lingo/database/source/key_value.rb +62 -0
- data/lib/lingo/database/source/multi_key.rb +65 -0
- data/lib/lingo/database/source/multi_value.rb +65 -0
- data/lib/lingo/database/source/single_word.rb +60 -0
- data/lib/lingo/database/source/word_class.rb +64 -0
- data/lib/lingo/error.rb +122 -0
- data/lib/lingo/language.rb +78 -518
- data/lib/lingo/language/dictionary.rb +173 -0
- data/lib/lingo/language/grammar.rb +211 -0
- data/lib/lingo/language/lexical.rb +66 -0
- data/lib/lingo/language/lexical_hash.rb +88 -0
- data/lib/lingo/language/token.rb +48 -0
- data/lib/lingo/language/word.rb +130 -0
- data/lib/lingo/language/word_form.rb +83 -0
- data/lib/lingo/reportable.rb +59 -0
- data/lib/lingo/version.rb +1 -1
- data/lingo-all.cfg +14 -10
- data/lingo-call.cfg +5 -5
- data/lingo.cfg +14 -12
- data/lingo.rb +26 -0
- data/lir.cfg +13 -9
- data/spec/spec_helper.rb +1 -0
- data/test.cfg +11 -11
- data/test/attendee/ts_abbreviator.rb +0 -6
- data/test/attendee/ts_decomposer.rb +0 -6
- data/test/attendee/{ts_multiworder.rb → ts_multi_worder.rb} +1 -7
- data/test/attendee/ts_noneword_filter.rb +1 -7
- data/test/attendee/{ts_objectfilter.rb → ts_object_filter.rb} +1 -7
- data/test/attendee/ts_sequencer.rb +0 -6
- data/test/attendee/ts_synonymer.rb +0 -6
- data/test/attendee/{ts_textreader.rb → ts_text_reader.rb} +1 -7
- data/test/attendee/{ts_textwriter.rb → ts_text_writer.rb} +1 -7
- data/test/attendee/ts_tokenizer.rb +0 -6
- data/test/attendee/ts_variator.rb +0 -6
- data/test/attendee/ts_vector_filter.rb +1 -7
- data/test/attendee/{ts_wordsearcher.rb → ts_word_searcher.rb} +1 -7
- data/test/ref/artikel.non +2 -29
- data/test/ref/artikel.seq +13 -8
- data/test/ref/artikel.vec +30 -15
- data/test/ref/artikel.ven +29 -14
- data/test/ref/artikel.ver +58 -43
- data/test/ref/lir.csv +146 -145
- data/test/ref/lir.non +186 -210
- data/test/ref/lir.seq +54 -50
- data/test/test_helper.rb +41 -36
- data/test/ts_database.rb +12 -11
- data/test/ts_language.rb +118 -68
- metadata +67 -29
- data/lib/lingo/attendee/multiworder.rb +0 -301
- data/lib/lingo/attendee/objectfilter.rb +0 -86
- data/lib/lingo/attendee/textreader.rb +0 -237
- data/lib/lingo/attendee/textwriter.rb +0 -196
- data/lib/lingo/attendee/wordsearcher.rb +0 -96
- data/lib/lingo/attendees.rb +0 -289
- data/lib/lingo/const.rb +0 -131
- data/lib/lingo/modules.rb +0 -98
- data/lib/lingo/types.rb +0 -285
- data/lib/lingo/utilities.rb +0 -40
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Database
|
|
30
|
+
|
|
31
|
+
# Crypter provides encryption and decryption of dictionary entries.
#
# The scheme visible here is a symmetric XOR of the value's codepoints with
# the (reversed, endlessly repeated) codepoints of the key; the result is
# transported as a lowercase hex string of its bytes.
class Crypter

  # Lookup table used to map a nibble (0..15) to its hex digit and back.
  HEX_CHARS = '0123456789abcdef'.freeze

  # Returns the SHA1 hex digest of +key+.
  def digest(key)
    Digest::SHA1.hexdigest(key)
  end

  # Encrypts +val+ with +key+.
  #
  # Returns a two-element array: the digest of +key+ and the hex
  # representation of the encrypted bytes of +val+.
  def encode(key, val)
    hex = ''

    crypt(key, val).each_byte { |byte|
      # To get a hex representation for a byte we just utilize
      # the quotient and the remainder of division by base 16.
      q, r = byte.divmod(16)
      hex << HEX_CHARS[q] << HEX_CHARS[r]
    }

    [digest(key), hex]
  end

  # Reverts #encode: turns the hex string +val+ back into bytes and
  # decrypts them with +key+. Returns the decrypted string.
  #
  # A trailing unpaired hex digit is ignored.
  def decode(key, val)
    # Collect raw bytes in a binary buffer. Appending an Integer to a UTF-8
    # string would treat it as a codepoint and re-encode every byte >= 0x80
    # as two bytes, corrupting any non-ASCII content.
    str, q, first = ''.dup.force_encoding(Encoding::BINARY), 0, false

    val.each_byte { |byte|
      byte = byte.chr(ENC)

      # Each byte is represented by two hex digits, so we have to keep
      # track of whether it's the first or the second of the two.
      if first = !first
        q = HEX_CHARS.index(byte)
      else
        # Now we got both parts, so let's revert the divmod(16)
        str << (q * 16 + HEX_CHARS.index(byte))
      end
    }

    # Reinterpret the collected bytes in the encoding #encode read them from.
    crypt(key, str.force_encoding(ENC))
  end

  private

  # XORs every codepoint of +v+ with the next codepoint taken from the
  # reversed, cycled key +k+. Applying it twice with the same key restores
  # the input.
  def crypt(k, v)
    c, y = '', k.codepoints.reverse_each.cycle
    v.each_codepoint { |x| c << (x ^ y.next).chr(ENC) }
    c
  end

end
|
|
82
|
+
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
require_optional 'gdbm'
|
|
30
|
+
|
|
31
|
+
class Database
|
|
32
|
+
|
|
33
|
+
# Store backend mixin that opens its database through GDBM.
module GDBMStore

  # File extension used for this kind of store.
  def store_ext
    '.db'
  end

  # Opens the GDBM database named by @dbm_name and returns the handle.
  def _open
    GDBM.open(@dbm_name)
  end

  # Both methods are internal hooks, not part of the public interface.
  private :store_ext, :_open

end
|
|
46
|
+
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Database
|
|
30
|
+
|
|
31
|
+
# Store backend mixin that keeps all entries in a plain in-memory Hash.
module HashStore

  # Returns a shallow copy of the underlying store (@db).
  def to_h
    @db.dup
  end

  # Nothing to release for a Hash; simply returns the receiver.
  def close
    self
  end

  # Always false for this backend.
  def uptodate?
    false
  end

  # No-op; always returns nil.
  def uptodate!
    nil
  end

  # Empties the store, if there is one yet.
  def _clear
    if @db
      @db.clear
    end
  end

  # "Opening" the store just hands out a fresh, empty Hash.
  def _open
    {}
  end

  # Always false for this backend.
  def _closed?
    false
  end

  # Everything except #to_h and #close is an internal hook.
  private :uptodate?, :uptodate!, :_clear, :_open, :_closed?

end
|
|
64
|
+
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
require_optional 'libcdb'
|
|
30
|
+
|
|
31
|
+
class Database
|
|
32
|
+
|
|
33
|
+
# Store backend mixin that reads and writes its database through LibCDB.
module LibCDBStore

  # File extension used for this kind of store.
  def store_ext
    '.cdb'
  end

  # Opens the database named by @dbm_name in 'w' mode, exposes the handle
  # as @db for the duration of the given block, and always resets @db to
  # nil afterwards, even if the block raises.
  def create
    LibCDB::CDB.open(@dbm_name, 'w') do |handle|
      @db = handle
      yield
    end
  ensure
    @db = nil
  end

  # Opens the database named by @dbm_name with LibCDB's default mode and
  # returns the handle.
  def _open
    LibCDB::CDB.open(@dbm_name)
  end

  # All three methods are internal hooks, not part of the public interface.
  private :store_ext, :create, :_open

end
|
|
55
|
+
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
require_optional 'sdbm'
|
|
30
|
+
|
|
31
|
+
class Database
|
|
32
|
+
|
|
33
|
+
# Store backend mixin that keeps its database in SDBM files
# (<tt>.pag</tt> / <tt>.dir</tt> next to @dbm_name).
module SDBMStore

  private

  # Delegates the freshness check to the including class, pointing it at
  # the <tt>.pag</tt> file that belongs to @dbm_name.
  def uptodate?
    super(@dbm_name + '.pag')
  end

  # Removes whichever of the <tt>.pag</tt> and <tt>.dir</tt> files exist.
  def _clear
    File.delete(*Dir["#{@dbm_name}.{pag,dir}"])
  end

  # Opens the SDBM database named by @dbm_name and returns the handle.
  def _open
    SDBM.open(@dbm_name)
  end

  # Stores +val+ under +key+ via the including class, truncating values
  # longer than 950 characters and emitting a warning when that happens.
  #
  # NOTE(review): the limit is applied to the character count, not the
  # byte size -- confirm this is sufficient for multi-byte values.
  def _set(key, val)
    if val.length > 950
      val = val[0, 950]

      # The two message parts used to be joined without a separating space
      # ("(file)too long"); keep them as one properly spaced sentence.
      @lingo.warn "Warning: Entry `#{key}' (#{@src_file}) too long for SDBM. Truncating..."
    end

    # Bare super forwards the current (possibly truncated) +val+.
    super
  end

end
|
|
61
|
+
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
class Lingo
|
|
28
|
+
|
|
29
|
+
class Database
|
|
30
|
+
|
|
31
|
+
# Prints a self-overwriting percentage indicator while a source is being
# converted. The whole lifecycle happens inside the constructor: it prints
# the header, yields itself so the caller can report progress through #[],
# and prints "ok" once the block returns.
class ShowProgress

  # +src+:: object carrying <tt>@lingo</tt> (whose <tt>config.stderr</tt>
  #         is used as the output stream) and <tt>@config</tt> (whose
  #         <tt>'name'</tt> entry is printed as the label).
  # +max+:: value of the counter that corresponds to 100%.
  # +act+:: when false, nothing is printed at all.
  #
  # Requires a block, which receives the ShowProgress instance.
  def initialize(src, max, act = true)
    @out, @act = src.instance_variable_get(:@lingo).config.stderr, act

    # To get the length of the formatted string we have
    # to actually substitute the placeholder.
    fmt = ' [%3d%%]'
    len = (fmt % 0).length

    # Now we know how far to "go back" to
    # overwrite the formatted string...
    back = "\b" * len

    @fmt = fmt + back
    @clr = ' ' * len + back

    print src.instance_variable_get(:@config)['name'], ': '

    # @rat is the counter increment that corresponds to one percent.
    @rat, @cnt, @next = max / 100.0, 0, 0
    print 'convert '
    step

    yield self

    print "#{@clr}ok\n"
  end

  # Reports the current counter +value+; the display is only refreshed
  # once the counter has reached the next threshold.
  def [](value)
    @cnt = value
    step if @cnt >= @next
  end

  private

  # Prints the current percentage and computes the counter value at which
  # the display has to be refreshed next.
  def step
    # With max == 0 the ratio is 0.0 and the division would yield NaN,
    # which "%d" rejects with a FloatDomainError. There is nothing to
    # convert in that case, so treat it as already complete.
    percent = @rat.zero? ? 100 : @cnt / @rat
    @next = (percent + 1) * @rat

    print @fmt % percent
  end

  # Forwards to the output stream unless output has been disabled.
  def print(*args)
    @out.print(*args) if @act
  end

end
|
|
78
|
+
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
end
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
#--
|
|
4
|
+
###############################################################################
|
|
5
|
+
# #
|
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
|
7
|
+
# #
|
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
|
10
|
+
# #
|
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
|
14
|
+
# any later version. #
|
|
15
|
+
# #
|
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
|
19
|
+
# more details. #
|
|
20
|
+
# #
|
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
|
23
|
+
# #
|
|
24
|
+
###############################################################################
|
|
25
|
+
#++
|
|
26
|
+
|
|
27
|
+
require_relative 'source/key_value'
|
|
28
|
+
require_relative 'source/multi_key'
|
|
29
|
+
require_relative 'source/multi_value'
|
|
30
|
+
require_relative 'source/single_word'
|
|
31
|
+
require_relative 'source/word_class'
|
|
32
|
+
|
|
33
|
+
class Lingo
|
|
34
|
+
|
|
35
|
+
class Database
|
|
36
|
+
|
|
37
|
+
# The Source class provides a uniform interface to the different formats of
# dictionary source files. The source file is identified by its ID, as given
# in the language configuration file <tt>de.lang</tt> under
# <tt>language/dictionary/databases</tt>.
#
# Dictionaries are processed through the iterator <b>each</b>, which hands
# every accepted line of the source file to +convert_line+ and yields the
# result (documented as an array of the form
# <tt>[ key, [val1, val2, ...] ]</tt>). +convert_line+ is not defined here
# and is expected to be supplied by the concrete source classes.
#
# Lines that are not recognized are rejected and stored in a reject file,
# which can be identified by its <tt>.rev</tt> extension.
class Source

  # Define printable characters for tokenizer for UTF-8 encoding
  UTF8_DIGIT = '[0-9]'
  # Define Basic Latin letters (within U+0000 to U+007f)
  UTF8_BASLAT = '[A-Za-z]'
  # Define Latin-1 Supplement letters U+00c0 to U+00ff,
  # leaving out U+00d7 and U+00f7
  UTF8_LAT1SP = '[\xc3\x80-\xc3\x96\xc3\x98-\xc3\xb6\xc3\xb8-\xc3\xbf]'
  # Define Latin Extended-A printable characters for UTF-8 encoding from U+0100 to U+017f
  UTF8_LATEXA = '[\xc4\x80-\xc4\xbf\xc5\x80-\xc5\xbf]'
  # Define Latin Extended-B printable characters for UTF-8 encoding from U+0180 to U+024f
  UTF8_LATEXB = '[\xc6\x80-\xc6\xbf\xc7\x80-\xc7\xbf\xc8\x80-\xc8\xbf\xc9\x80-\xc9\x8f]'
  # Define IPA Extension printable characters for UTF-8 encoding from U+0250 to U+02af
  # (the class used to start at \xc9\xa0 and \xca\xa0, which silently left
  # out U+0250-U+025f and U+0280-U+029f)
  UTF8_IPAEXT = '[\xc9\x90-\xc9\xbf\xca\x80-\xca\xaf]'
  # Collect all UTF-8 printable characters in Unicode range U+0000 to U+02af
  UTF8_CHAR = "#{UTF8_DIGIT}|#{UTF8_BASLAT}|#{UTF8_LAT1SP}|#{UTF8_LATEXA}|#{UTF8_LATEXB}|#{UTF8_IPAEXT}"

  PRINTABLE_CHAR = "#{UTF8_CHAR}|[<>-]"

  # Instantiates the source class whose constant name is the camel-cased
  # +name+, passing +args+ on to its constructor.
  def self.get(name, *args)
    const_get(name.camelcase).new(*args)
  end

  # Number of bytes of the source file consumed by #each so far.
  attr_reader :position

  # +id+::    ID of the database whose configuration is looked up.
  # +lingo+:: object answering +database_config+.
  #
  # Raises SourceFileNotFoundError if the source file does not exist.
  def initialize(id, lingo)
    @config = lingo.database_config(id)

    source_file = Lingo.find(:dict, name = @config['name'])
    # Without a writable store there is simply no reject file.
    reject_file = begin
      Lingo.find(:store, source_file) << '.rev'
    rescue NoWritableStoreError
    end

    @pn_source = Pathname.new(source_file)
    @pn_reject = Pathname.new(reject_file) if reject_file

    raise SourceFileNotFoundError.new(name, id) unless @pn_source.exist?

    @wordclass = @config.fetch('def-wc', '?').downcase
    @separator = @config['separator']

    @legal_word = '(?:' + PRINTABLE_CHAR + '|[' + Regexp.escape('- /&()[].,') + '])+' # TODO: v1.60 - allow ',' in Source; integrate into const.rb
    @line_pattern = Regexp.new('^'+@legal_word+'$')

    @position = 0
  end

  # Size of the source file in bytes.
  def size
    @pn_source.size
  end

  # Reads the source file line by line. Comment lines (starting with
  # <tt>#</tt>) and blank lines are skipped; every other line is chomped and
  # downcased. Lines shorter than 4096 bytes that match @line_pattern are
  # passed to +convert_line+ (together with the first two match groups) and
  # the result is yielded; all other lines go to the reject file, if any.
  #
  # Returns self. An empty reject file is removed afterwards.
  def each
    reject_file = @pn_reject.open('w', encoding: ENC) if @pn_reject

    @pn_source.each_line($/, encoding: ENC) { |line|
      # Track progress on the raw line, before it gets chomped.
      @position += length = line.bytesize

      next if line =~ /\A\s*#/ || line.strip.empty?

      line.chomp!
      line.downcase!

      if length < 4096 && line =~ @line_pattern
        yield convert_line(line, $1, $2)
      else
        reject_file.puts(line) if reject_file
      end
    }

    self
  ensure
    if reject_file
      reject_file.close
      @pn_reject.delete if @pn_reject.size == 0
    end
  end

  # Stores +val+ under +key+ in +db+.
  def set(db, key, val)
    db[key] = val
  end

end
|
|
131
|
+
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
end
|