lingo 1.8.4.2 → 1.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +413 -325
- data/README +380 -131
- data/Rakefile +19 -21
- data/de/lingo-abk.txt +15 -17
- data/de/lingo-dic.txt +20210 -20659
- data/de/lingo-mul.txt +5 -13
- data/de/lingo-syn.txt +5 -8
- data/de/test_dic.txt +2 -0
- data/de/test_gen.txt +8 -0
- data/de/{test_mul2.txt → test_mu2.txt} +0 -0
- data/de/{test_singleword.txt → test_sgw.txt} +0 -0
- data/de/user-dic.txt +5 -7
- data/de.lang +64 -49
- data/en/lingo-dic.txt +6398 -6404
- data/en/lingo-irr.txt +2 -3
- data/en/lingo-mul.txt +6 -7
- data/en/lingo-wdn.txt +881 -1762
- data/en/user-dic.txt +2 -5
- data/en.lang +39 -39
- data/lib/lingo/app.rb +10 -6
- data/lib/lingo/attendee/abbreviator.rb +1 -0
- data/lib/lingo/attendee/decomposer.rb +2 -1
- data/lib/lingo/attendee/multi_worder.rb +5 -6
- data/lib/lingo/attendee/stemmer.rb +1 -1
- data/lib/lingo/attendee/synonymer.rb +4 -2
- data/lib/lingo/attendee/text_reader.rb +77 -57
- data/lib/lingo/attendee/text_writer.rb +1 -1
- data/lib/lingo/attendee/tokenizer.rb +101 -50
- data/lib/lingo/attendee/variator.rb +2 -1
- data/lib/lingo/attendee/vector_filter.rb +28 -6
- data/lib/lingo/attendee/word_searcher.rb +2 -1
- data/lib/lingo/attendee.rb +8 -4
- data/lib/lingo/call.rb +7 -3
- data/lib/lingo/cli.rb +8 -16
- data/lib/lingo/config.rb +11 -6
- data/lib/lingo/ctl.rb +54 -3
- data/lib/lingo/database/crypter.rb +8 -14
- data/lib/lingo/database/hash_store.rb +1 -1
- data/lib/lingo/database/{show_progress.rb → progress.rb} +7 -8
- data/lib/lingo/database/source/key_value.rb +6 -5
- data/lib/lingo/database/source/multi_key.rb +5 -2
- data/lib/lingo/database/source/multi_value.rb +6 -4
- data/lib/lingo/database/source/single_word.rb +2 -3
- data/lib/lingo/database/source/word_class.rb +24 -5
- data/lib/lingo/database/source.rb +5 -3
- data/lib/lingo/database.rb +102 -41
- data/lib/lingo/error.rb +24 -2
- data/lib/lingo/language/dictionary.rb +26 -54
- data/lib/lingo/language/grammar.rb +19 -23
- data/lib/lingo/language/lexical.rb +5 -1
- data/lib/lingo/language/lexical_hash.rb +7 -12
- data/lib/lingo/language/token.rb +10 -1
- data/lib/lingo/language/word.rb +35 -23
- data/lib/lingo/language/word_form.rb +5 -4
- data/lib/lingo/{show_progress.rb → progress.rb} +43 -30
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/srv/public/.gitkeep +0 -0
- data/lib/lingo/srv.rb +11 -6
- data/lib/lingo/version.rb +2 -2
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/lib/lingo/web/views/index.erb +4 -4
- data/lib/lingo/web.rb +4 -6
- data/lib/lingo.rb +4 -12
- data/lingo.cfg +1 -1
- data/lir.cfg +1 -1
- data/ru/lingo-dic.txt +33473 -2113
- data/ru/lingo-mul.txt +8430 -1913
- data/ru/lingo-syn.txt +1634 -0
- data/ru/user-dic.txt +6 -0
- data/ru.lang +49 -47
- data/spec/spec_helper.rb +4 -0
- data/test/attendee/ts_decomposer.rb +2 -2
- data/test/attendee/ts_synonymer.rb +3 -3
- data/test/attendee/ts_tokenizer.rb +215 -2
- data/test/attendee/ts_variator.rb +2 -2
- data/test/attendee/ts_word_searcher.rb +10 -6
- data/test/ref/artikel.seq +2 -2
- data/test/ref/artikel.vec +5 -5
- data/test/ref/artikel.ven +11 -11
- data/test/ref/artikel.ver +11 -11
- data/test/ref/lir.seq +13 -13
- data/test/ref/lir.vec +31 -31
- data/test/test_helper.rb +19 -5
- data/test/ts_database.rb +206 -77
- data/test/ts_language.rb +86 -26
- metadata +93 -49
- data/.rspec +0 -1
- data/de/test_syn2.txt +0 -1
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,16 +37,17 @@ class Lingo
|
|
37
37
|
|
38
38
|
class KeyValue < self
|
39
39
|
|
40
|
+
DEFAULT_SEPARATOR = '*'
|
41
|
+
|
40
42
|
def initialize(id, lingo)
|
41
|
-
super
|
42
|
-
@pat = /^(#{@wrd})#{Regexp.escape(@sep ||=
|
43
|
+
super(id, lingo, Language::LA_UNKNOWN)
|
44
|
+
@pat = /^(#{@wrd})#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}(#{@wrd})$/
|
43
45
|
end
|
44
46
|
|
45
47
|
private
|
46
48
|
|
47
49
|
def convert_line(line, key, val)
|
48
|
-
key
|
49
|
-
[key, %W[#{val unless key == val}##{@def}]]
|
50
|
+
[key.strip, %W[#{val.strip}##{@def}]]
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,12 +38,15 @@ class Lingo
|
|
38
38
|
|
39
39
|
class MultiKey < self
|
40
40
|
|
41
|
+
DEFAULT_SEPARATOR = ';'
|
42
|
+
|
41
43
|
def initialize(id, lingo)
|
42
44
|
super
|
43
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||=
|
45
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
|
44
46
|
end
|
45
47
|
|
46
48
|
def set(db, key, val)
|
49
|
+
key += "##{@def}" if @def
|
47
50
|
val.each { |v| db[v] = [key] }
|
48
51
|
end
|
49
52
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,14 +36,16 @@ class Lingo
|
|
36
36
|
|
37
37
|
class MultiValue < self
|
38
38
|
|
39
|
+
DEFAULT_SEPARATOR = ';'
|
40
|
+
|
39
41
|
def initialize(id, lingo)
|
40
42
|
super
|
41
|
-
@pat
|
43
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
|
42
44
|
end
|
43
45
|
|
44
46
|
def set(db, key, val)
|
45
|
-
|
46
|
-
val.each { |v| db[v] =
|
47
|
+
values = val.map { |v| @def ? "#{v}##{@def}" : v }
|
48
|
+
val.each { |v| db[v] = values }
|
47
49
|
end
|
48
50
|
|
49
51
|
private
|
@@ -37,16 +37,15 @@ class Lingo
|
|
37
37
|
class SingleWord < self
|
38
38
|
|
39
39
|
def initialize(id, lingo)
|
40
|
-
super
|
40
|
+
super(id, lingo, Language::LA_NOUN)
|
41
41
|
@pat = /^(#{@wrd})$/
|
42
|
-
@def = @config.fetch('def-wc', 's').downcase
|
43
42
|
@mul = @config.fetch('def-mul-wc', @def).downcase
|
44
43
|
end
|
45
44
|
|
46
45
|
private
|
47
46
|
|
48
47
|
def convert_line(line, key, val)
|
49
|
-
[
|
48
|
+
[k = key.strip, %W[#{k}##{k.include?(' ') ? @mul : @def}]]
|
50
49
|
end
|
51
50
|
|
52
51
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,17 +36,36 @@ class Lingo
|
|
36
36
|
|
37
37
|
class WordClass < self
|
38
38
|
|
39
|
+
DEFAULT_SEPARATOR = ','
|
40
|
+
|
41
|
+
GENDER_SEPARATOR = '.'
|
42
|
+
|
39
43
|
def initialize(id, lingo)
|
40
44
|
super
|
41
|
-
|
45
|
+
|
46
|
+
gen = Regexp.escape(GENDER_SEPARATOR)
|
47
|
+
sep = Regexp.escape(@sep ||= DEFAULT_SEPARATOR)
|
48
|
+
|
49
|
+
w, a = '\w%1$s(?:\|\w%1$s)*', '[+]?'
|
50
|
+
wc = "##{w % a}(?:#{gen}#{w % ''})?"
|
51
|
+
|
52
|
+
@pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
|
42
53
|
end
|
43
54
|
|
44
55
|
private
|
45
56
|
|
46
57
|
def convert_line(line, key, val)
|
47
|
-
|
48
|
-
|
49
|
-
|
58
|
+
values = []
|
59
|
+
|
60
|
+
val.strip.scan(/(\S.*?)\s*#(\S+)/) { |k, v|
|
61
|
+
v, f = v.split('.')
|
62
|
+
|
63
|
+
v.split('|').product(f ? f.split('|') : [nil]) { |w, g|
|
64
|
+
values << "#{k}##{w}##{g}"
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
[key.strip, values]
|
50
69
|
end
|
51
70
|
|
52
71
|
end
|
@@ -53,7 +53,7 @@ class Lingo
|
|
53
53
|
|
54
54
|
attr_reader :pos
|
55
55
|
|
56
|
-
def initialize(id, lingo)
|
56
|
+
def initialize(id, lingo, def_wc_default = nil)
|
57
57
|
@config = lingo.database_config(id)
|
58
58
|
|
59
59
|
source_file = Lingo.find(:dict, name = @config['name'], relax: true)
|
@@ -68,7 +68,8 @@ class Lingo
|
|
68
68
|
|
69
69
|
raise SourceFileNotFoundError.new(name, id) unless @src.exist?
|
70
70
|
|
71
|
-
@def = @config.fetch('def-wc',
|
71
|
+
@def = @config.fetch('def-wc', def_wc_default)
|
72
|
+
@def = @def.downcase if @def
|
72
73
|
@sep = @config['separator']
|
73
74
|
|
74
75
|
@wrd = "(?:#{Language::Char::ANY})+"
|
@@ -87,7 +88,8 @@ class Lingo
|
|
87
88
|
@src.each_line($/, encoding: ENC) { |line|
|
88
89
|
@pos += length = line.bytesize
|
89
90
|
|
90
|
-
|
91
|
+
line.strip!
|
92
|
+
next if line.empty? || line.start_with?('#')
|
91
93
|
|
92
94
|
line.chomp!
|
93
95
|
line.replace(Unicode.downcase(line))
|
data/lib/lingo/database.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -24,7 +24,7 @@
|
|
24
24
|
###############################################################################
|
25
25
|
#++
|
26
26
|
|
27
|
-
require_relative 'database/
|
27
|
+
require_relative 'database/progress'
|
28
28
|
require_relative 'database/crypter'
|
29
29
|
require_relative 'database/source'
|
30
30
|
|
@@ -40,15 +40,9 @@ class Lingo
|
|
40
40
|
class Database
|
41
41
|
|
42
42
|
FLD_SEP = '|'
|
43
|
-
IDX_REF = '^'
|
44
43
|
KEY_REF = '*'
|
45
44
|
SYS_KEY = '~'
|
46
45
|
|
47
|
-
IDX_REF_ESC = Regexp.escape(IDX_REF)
|
48
|
-
KEY_REF_ESC = Regexp.escape(KEY_REF)
|
49
|
-
|
50
|
-
INDEX_PATTERN = %r{\A#{IDX_REF_ESC}\d+\z}
|
51
|
-
|
52
46
|
BACKENDS = []
|
53
47
|
BACKEND_BY_EXT = {}
|
54
48
|
|
@@ -68,13 +62,11 @@ class Lingo
|
|
68
62
|
|
69
63
|
end
|
70
64
|
|
71
|
-
attr_reader :backend
|
72
|
-
|
73
65
|
def initialize(id, lingo)
|
74
66
|
@id, @lingo, @config, @db = id, lingo, lingo.database_config(id), nil
|
75
67
|
|
76
|
-
@srcfile = Lingo.find(:dict,
|
77
|
-
@crypter =
|
68
|
+
@srcfile = Lingo.find(:dict, config['name'], relax: true)
|
69
|
+
@crypter = config.key?('crypt') && Crypter.new
|
78
70
|
|
79
71
|
@val = Hash.new { |h, k| h[k] = [] }
|
80
72
|
|
@@ -93,6 +85,8 @@ class Lingo
|
|
93
85
|
convert unless uptodate?
|
94
86
|
end
|
95
87
|
|
88
|
+
attr_reader :lingo, :config, :backend
|
89
|
+
|
96
90
|
def closed?
|
97
91
|
!@db || _closed?
|
98
92
|
end
|
@@ -125,18 +119,13 @@ class Lingo
|
|
125
119
|
|
126
120
|
def [](key)
|
127
121
|
val = _val(key) unless closed?
|
128
|
-
|
129
|
-
|
130
|
-
# Äquvalenzklassen behandeln
|
131
|
-
val.split(FLD_SEP).map { |v|
|
132
|
-
v =~ INDEX_PATTERN ? _val(v) : v
|
133
|
-
}.compact.join(FLD_SEP).split(FLD_SEP)
|
122
|
+
val.split(FLD_SEP) if val
|
134
123
|
end
|
135
124
|
|
136
125
|
def []=(key, val)
|
137
126
|
return if closed?
|
138
127
|
|
139
|
-
val = @val[key].concat(val)
|
128
|
+
val = @val[key].concat(val)
|
140
129
|
val.uniq!
|
141
130
|
|
142
131
|
val = val.join(FLD_SEP)
|
@@ -144,7 +133,7 @@ class Lingo
|
|
144
133
|
end
|
145
134
|
|
146
135
|
def warn(*msg)
|
147
|
-
|
136
|
+
lingo.warn(*msg)
|
148
137
|
end
|
149
138
|
|
150
139
|
private
|
@@ -171,9 +160,24 @@ class Lingo
|
|
171
160
|
get_backend(mod) or raise BackendNotAvailableError.new(mod, file)
|
172
161
|
end
|
173
162
|
|
163
|
+
def config_hash
|
164
|
+
hashes = [config]
|
165
|
+
|
166
|
+
if use_lex = config['use-lex']
|
167
|
+
hashes.concat(lingo.
|
168
|
+
dictionary_config['databases'].
|
169
|
+
values_at(*use_lex.split(SEP_RE)))
|
170
|
+
end
|
171
|
+
|
172
|
+
Crypter.digest(hashes.inspect)
|
173
|
+
end
|
174
|
+
|
174
175
|
def uptodate?(file = @stofile)
|
175
176
|
src = Pathname.new(@srcfile)
|
176
|
-
|
177
|
+
|
178
|
+
@source_key = lambda {
|
179
|
+
[src.size, src.mtime, VERSION, config_hash].join(FLD_SEP)
|
180
|
+
}
|
177
181
|
|
178
182
|
sys_key = open { @db[SYS_KEY] } if File.exist?(file)
|
179
183
|
sys_key && (!src.exist? || sys_key == @source_key.call)
|
@@ -217,7 +221,7 @@ class Lingo
|
|
217
221
|
end
|
218
222
|
|
219
223
|
def _val(key)
|
220
|
-
if val = _get(@crypter ?
|
224
|
+
if val = _get(@crypter ? Crypter.digest(key) : key)
|
221
225
|
_encode!(val)
|
222
226
|
@crypter ? @crypter.decode(key, val) : val
|
223
227
|
end
|
@@ -227,33 +231,25 @@ class Lingo
|
|
227
231
|
str.force_encoding(ENC)
|
228
232
|
end
|
229
233
|
|
230
|
-
def convert(verbose =
|
231
|
-
src = Source.get(
|
232
|
-
|
233
|
-
if lex = @config['use-lex']
|
234
|
-
a = [{ 'source' => lex.split(SEP_RE), 'mode' => @config['lex-mode'] }, @lingo]
|
235
|
-
d, g = Language::Dictionary.new(*a), Language::Grammar.new(*a); a = nil
|
234
|
+
def convert(verbose = lingo.config.stderr.tty?)
|
235
|
+
src = Source.get(config.fetch('txt-format', 'key_value'), @id, lingo)
|
236
236
|
|
237
|
-
|
238
|
-
(r = d.find_word(f)).unknown? &&
|
239
|
-
(c = (r = g.find_compound(f)).compo_form) ? c.form : r.norm
|
240
|
-
}
|
241
|
-
end
|
237
|
+
sep, key_map, val_map = prepare_lex
|
242
238
|
|
243
|
-
|
239
|
+
Progress.new(self, src, verbose) { |progress| create {
|
244
240
|
src.each { |key, val|
|
245
|
-
progress
|
241
|
+
progress << src.pos
|
246
242
|
|
247
243
|
if key
|
248
244
|
key.chomp!('.')
|
249
245
|
|
250
|
-
if
|
251
|
-
|
246
|
+
if sep && key.include?(sep)
|
247
|
+
key = key.split(sep).map!(&key_map).join(sep)
|
248
|
+
val = val.map { |v| val_map[v.split(sep)].join(sep) } if val_map
|
252
249
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
key, val = k, val.map { |v| v.start_with?('#') ? key + v : v }
|
250
|
+
if (cnt = key.count(sep)) > 2
|
251
|
+
self[key.split(sep)[0, 3].join(sep)] = ["#{KEY_REF}#{cnt + 1}"]
|
252
|
+
end
|
257
253
|
end
|
258
254
|
end
|
259
255
|
|
@@ -264,6 +260,71 @@ class Lingo
|
|
264
260
|
} }
|
265
261
|
end
|
266
262
|
|
263
|
+
def prepare_lex
|
264
|
+
use_lex = config['use-lex'] or return
|
265
|
+
|
266
|
+
args = [{
|
267
|
+
'source' => use_lex.split(SEP_RE),
|
268
|
+
'mode' => config['lex-mode']
|
269
|
+
}, lingo]
|
270
|
+
|
271
|
+
dic = Language::Dictionary.new(*args)
|
272
|
+
gra = Language::Grammar.new(*args)
|
273
|
+
|
274
|
+
args = nil
|
275
|
+
|
276
|
+
if inflect = config['inflect']
|
277
|
+
inflect, wc = inflect == true ? %w[s e] : inflect.split(SEP_RE), 'a'
|
278
|
+
|
279
|
+
if cfg = lingo.dictionary_config['inflect'] and suffixes = cfg[wc]
|
280
|
+
wc, re = /#{wc}/, /\A[^#]+/
|
281
|
+
else
|
282
|
+
warn "#{self.class}: No suffixes to inflect ##{wc}: #{@id}"
|
283
|
+
inflect = false
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
287
|
+
[' ', lambda { |form|
|
288
|
+
word = dic.find_word(form)
|
289
|
+
|
290
|
+
if word.unknown?
|
291
|
+
compo = gra.find_compound(form)
|
292
|
+
|
293
|
+
if compo_form = compo.compo_form
|
294
|
+
compo_form.form
|
295
|
+
else
|
296
|
+
compo.norm
|
297
|
+
end
|
298
|
+
else
|
299
|
+
word.norm
|
300
|
+
end
|
301
|
+
}, inflect && lambda { |forms|
|
302
|
+
inflectables = []
|
303
|
+
|
304
|
+
forms.each { |form|
|
305
|
+
word = dic.find_word(word_form = form[re])
|
306
|
+
|
307
|
+
if word.identified? and lexical = word.get_class(wc).first
|
308
|
+
inflectables << form if form == lexical.form
|
309
|
+
else
|
310
|
+
unless inflectables.empty?
|
311
|
+
comp = gra.find_compound(word_form) if word.unknown?
|
312
|
+
word = comp.head || comp if comp && !comp.unknown?
|
313
|
+
|
314
|
+
if word.attr?(*inflect)
|
315
|
+
suffix = suffixes[word.genders.compact.first]
|
316
|
+
inflectables.each { |lex_form| lex_form << suffix } if suffix
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
break
|
321
|
+
end
|
322
|
+
}
|
323
|
+
|
324
|
+
forms
|
325
|
+
}]
|
326
|
+
end
|
327
|
+
|
267
328
|
end
|
268
329
|
|
269
330
|
end
|
data/lib/lingo/error.rb
CHANGED
@@ -89,7 +89,7 @@ class Lingo
|
|
89
89
|
end
|
90
90
|
|
91
91
|
def to_s
|
92
|
-
error("An error occured
|
92
|
+
error("An error occured while trying to #{action} `#{file}'")
|
93
93
|
end
|
94
94
|
|
95
95
|
end
|
@@ -194,7 +194,29 @@ class Lingo
|
|
194
194
|
end
|
195
195
|
|
196
196
|
def to_s
|
197
|
-
error("#{class_name}: An error occured while trying to load
|
197
|
+
error("#{class_name}: An error occured while trying to load `#{lib}'")
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
class TokenizeError < LingoError
|
203
|
+
|
204
|
+
attr_reader :line, :file, :num, :err
|
205
|
+
|
206
|
+
def initialize(line, file, num, err)
|
207
|
+
@line, @file, @num, @err = line, file, num, err
|
208
|
+
end
|
209
|
+
|
210
|
+
def to_s
|
211
|
+
line, file = self.line, self.file
|
212
|
+
|
213
|
+
if line.is_a?(String) && line.length > 48
|
214
|
+
line = line[0, 45] + '...'
|
215
|
+
end
|
216
|
+
|
217
|
+
file &&= "#{file}:#{num}: "
|
218
|
+
|
219
|
+
error("An error occured while trying to tokenize #{file}#{line.inspect}")
|
198
220
|
end
|
199
221
|
|
200
222
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -30,8 +30,6 @@ class Lingo
|
|
30
30
|
|
31
31
|
class Dictionary
|
32
32
|
|
33
|
-
KEY_REF_RE = %r{\A#{Database::KEY_REF_ESC}\d+}
|
34
|
-
|
35
33
|
def self.open(*args)
|
36
34
|
yield dictionary = new(*args)
|
37
35
|
ensure
|
@@ -39,7 +37,7 @@ class Lingo
|
|
39
37
|
end
|
40
38
|
|
41
39
|
def initialize(config, lingo)
|
42
|
-
unless config.
|
40
|
+
unless config.key?('source')
|
43
41
|
raise ArgumentError, "Required parameter `source' missing."
|
44
42
|
end
|
45
43
|
|
@@ -70,22 +68,18 @@ class Lingo
|
|
70
68
|
#
|
71
69
|
# Erstellt aus dem String ein Wort und sucht nach diesem im Wörterbuch.
|
72
70
|
def find_word(str)
|
73
|
-
(@_word ||= {})[str] ||=
|
74
|
-
|
75
|
-
w.lexicals = lexicals
|
76
|
-
w.attr = WA_IDENTIFIED
|
77
|
-
end
|
78
|
-
}
|
71
|
+
(@_word ||= {})[str] ||=
|
72
|
+
Word.new(str, WA_UNKNOWN).identify(select_with_suffix(str))
|
79
73
|
end
|
80
74
|
|
81
|
-
def find_synonyms(obj, syn = [])
|
75
|
+
def find_synonyms(obj, syn = [], com = true)
|
82
76
|
lex = obj.lexicals
|
83
77
|
lex = [obj] if lex.empty? && obj.unknown?
|
84
78
|
|
85
|
-
com
|
79
|
+
com &&= obj.attr == WA_COMPOUND
|
86
80
|
|
87
81
|
lex.each { |l|
|
88
|
-
select(l.form, syn)
|
82
|
+
select(l.form, syn) unless com &&
|
89
83
|
l.attr != LA_COMPOUND || l.attr == LA_SYNONYM
|
90
84
|
}
|
91
85
|
|
@@ -97,14 +91,11 @@ class Lingo
|
|
97
91
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+)
|
98
92
|
def select(str, lex = [])
|
99
93
|
@src.each { |src|
|
100
|
-
|
101
|
-
lex.concat(block_given? ? l.delete_if { |i| yield i } : l)
|
94
|
+
lex.concat(src[str] || next)
|
102
95
|
break unless @all
|
103
96
|
}
|
104
97
|
|
105
|
-
lex.
|
106
|
-
lex.uniq!
|
107
|
-
|
98
|
+
lex.empty? && block_given? ? yield(lex) : lex.uniq!
|
108
99
|
lex
|
109
100
|
end
|
110
101
|
|
@@ -113,7 +104,17 @@ class Lingo
|
|
113
104
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
114
105
|
# Sucht dabei auch Wörter, die um wortklassenspezifische Suffixe bereinigt wurden.
|
115
106
|
def select_with_suffix(str)
|
116
|
-
|
107
|
+
select(str) { |lex|
|
108
|
+
each_affix(str) { |form, attr|
|
109
|
+
unless (selected = select(form)).empty?
|
110
|
+
if selected.first.attr == LA_COMPOUND
|
111
|
+
lex.concat(selected) if selected.last.attr?(attr)
|
112
|
+
else
|
113
|
+
selected.each { |l| lex << l if l.attr?(attr) }
|
114
|
+
end
|
115
|
+
end
|
116
|
+
}
|
117
|
+
}
|
117
118
|
end
|
118
119
|
|
119
120
|
# _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
|
@@ -121,44 +122,15 @@ class Lingo
|
|
121
122
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
122
123
|
# Sucht dabei auch Wörter, die eine Fugung am Ende haben.
|
123
124
|
def select_with_infix(str)
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
# _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
128
|
-
#
|
129
|
-
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
130
|
-
#
|
131
|
-
# dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
|
132
|
-
def suffix_lexicals(str)
|
133
|
-
affix_lexicals(:suffix, str)
|
134
|
-
end
|
135
|
-
|
136
|
-
# _dic_.gap_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
137
|
-
#
|
138
|
-
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
139
|
-
def infix_lexicals(str)
|
140
|
-
affix_lexicals(:infix, str)
|
141
|
-
end
|
142
|
-
|
143
|
-
private
|
144
|
-
|
145
|
-
def select_with_affix(affix, str)
|
146
|
-
lex = select(str)
|
147
|
-
|
148
|
-
affix_lexicals(affix, str).each { |a| select(a.form, lex) { |b|
|
149
|
-
affix == :suffix && a.attr != b.attr
|
150
|
-
} } if lex.empty?
|
151
|
-
|
152
|
-
lex
|
125
|
+
select(str) { |lex|
|
126
|
+
each_affix(str, :infix) { |form, _| select(form, lex) }
|
127
|
+
}
|
153
128
|
end
|
154
129
|
|
155
|
-
def
|
156
|
-
|
157
|
-
|
130
|
+
def each_affix(str, affix = :suffix)
|
131
|
+
instance_variable_get("@#{affix}es").each { |r, e, t|
|
132
|
+
yield "#{$`}#{e == '*' ? '' : e}#{$'}", t if str =~ r
|
158
133
|
}
|
159
|
-
|
160
|
-
lex.compact!
|
161
|
-
lex
|
162
134
|
end
|
163
135
|
|
164
136
|
end
|