lingo 1.8.4.2 → 1.8.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +413 -325
- data/README +380 -131
- data/Rakefile +19 -21
- data/de/lingo-abk.txt +15 -17
- data/de/lingo-dic.txt +20210 -20659
- data/de/lingo-mul.txt +5 -13
- data/de/lingo-syn.txt +5 -8
- data/de/test_dic.txt +2 -0
- data/de/test_gen.txt +8 -0
- data/de/{test_mul2.txt → test_mu2.txt} +0 -0
- data/de/{test_singleword.txt → test_sgw.txt} +0 -0
- data/de/user-dic.txt +5 -7
- data/de.lang +64 -49
- data/en/lingo-dic.txt +6398 -6404
- data/en/lingo-irr.txt +2 -3
- data/en/lingo-mul.txt +6 -7
- data/en/lingo-wdn.txt +881 -1762
- data/en/user-dic.txt +2 -5
- data/en.lang +39 -39
- data/lib/lingo/app.rb +10 -6
- data/lib/lingo/attendee/abbreviator.rb +1 -0
- data/lib/lingo/attendee/decomposer.rb +2 -1
- data/lib/lingo/attendee/multi_worder.rb +5 -6
- data/lib/lingo/attendee/stemmer.rb +1 -1
- data/lib/lingo/attendee/synonymer.rb +4 -2
- data/lib/lingo/attendee/text_reader.rb +77 -57
- data/lib/lingo/attendee/text_writer.rb +1 -1
- data/lib/lingo/attendee/tokenizer.rb +101 -50
- data/lib/lingo/attendee/variator.rb +2 -1
- data/lib/lingo/attendee/vector_filter.rb +28 -6
- data/lib/lingo/attendee/word_searcher.rb +2 -1
- data/lib/lingo/attendee.rb +8 -4
- data/lib/lingo/call.rb +7 -3
- data/lib/lingo/cli.rb +8 -16
- data/lib/lingo/config.rb +11 -6
- data/lib/lingo/ctl.rb +54 -3
- data/lib/lingo/database/crypter.rb +8 -14
- data/lib/lingo/database/hash_store.rb +1 -1
- data/lib/lingo/database/{show_progress.rb → progress.rb} +7 -8
- data/lib/lingo/database/source/key_value.rb +6 -5
- data/lib/lingo/database/source/multi_key.rb +5 -2
- data/lib/lingo/database/source/multi_value.rb +6 -4
- data/lib/lingo/database/source/single_word.rb +2 -3
- data/lib/lingo/database/source/word_class.rb +24 -5
- data/lib/lingo/database/source.rb +5 -3
- data/lib/lingo/database.rb +102 -41
- data/lib/lingo/error.rb +24 -2
- data/lib/lingo/language/dictionary.rb +26 -54
- data/lib/lingo/language/grammar.rb +19 -23
- data/lib/lingo/language/lexical.rb +5 -1
- data/lib/lingo/language/lexical_hash.rb +7 -12
- data/lib/lingo/language/token.rb +10 -1
- data/lib/lingo/language/word.rb +35 -23
- data/lib/lingo/language/word_form.rb +5 -4
- data/lib/lingo/{show_progress.rb → progress.rb} +43 -30
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/srv/public/.gitkeep +0 -0
- data/lib/lingo/srv.rb +11 -6
- data/lib/lingo/version.rb +2 -2
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/lib/lingo/web/views/index.erb +4 -4
- data/lib/lingo/web.rb +4 -6
- data/lib/lingo.rb +4 -12
- data/lingo.cfg +1 -1
- data/lir.cfg +1 -1
- data/ru/lingo-dic.txt +33473 -2113
- data/ru/lingo-mul.txt +8430 -1913
- data/ru/lingo-syn.txt +1634 -0
- data/ru/user-dic.txt +6 -0
- data/ru.lang +49 -47
- data/spec/spec_helper.rb +4 -0
- data/test/attendee/ts_decomposer.rb +2 -2
- data/test/attendee/ts_synonymer.rb +3 -3
- data/test/attendee/ts_tokenizer.rb +215 -2
- data/test/attendee/ts_variator.rb +2 -2
- data/test/attendee/ts_word_searcher.rb +10 -6
- data/test/ref/artikel.seq +2 -2
- data/test/ref/artikel.vec +5 -5
- data/test/ref/artikel.ven +11 -11
- data/test/ref/artikel.ver +11 -11
- data/test/ref/lir.seq +13 -13
- data/test/ref/lir.vec +31 -31
- data/test/test_helper.rb +19 -5
- data/test/ts_database.rb +206 -77
- data/test/ts_language.rb +86 -26
- metadata +93 -49
- data/.rspec +0 -1
- data/de/test_syn2.txt +0 -1
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,16 +37,17 @@ class Lingo
|
|
37
37
|
|
38
38
|
class KeyValue < self
|
39
39
|
|
40
|
+
DEFAULT_SEPARATOR = '*'
|
41
|
+
|
40
42
|
def initialize(id, lingo)
|
41
|
-
super
|
42
|
-
@pat = /^(#{@wrd})#{Regexp.escape(@sep ||=
|
43
|
+
super(id, lingo, Language::LA_UNKNOWN)
|
44
|
+
@pat = /^(#{@wrd})#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}(#{@wrd})$/
|
43
45
|
end
|
44
46
|
|
45
47
|
private
|
46
48
|
|
47
49
|
def convert_line(line, key, val)
|
48
|
-
key
|
49
|
-
[key, %W[#{val unless key == val}##{@def}]]
|
50
|
+
[key.strip, %W[#{val.strip}##{@def}]]
|
50
51
|
end
|
51
52
|
|
52
53
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,12 +38,15 @@ class Lingo
|
|
38
38
|
|
39
39
|
class MultiKey < self
|
40
40
|
|
41
|
+
DEFAULT_SEPARATOR = ';'
|
42
|
+
|
41
43
|
def initialize(id, lingo)
|
42
44
|
super
|
43
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||=
|
45
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
|
44
46
|
end
|
45
47
|
|
46
48
|
def set(db, key, val)
|
49
|
+
key += "##{@def}" if @def
|
47
50
|
val.each { |v| db[v] = [key] }
|
48
51
|
end
|
49
52
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,14 +36,16 @@ class Lingo
|
|
36
36
|
|
37
37
|
class MultiValue < self
|
38
38
|
|
39
|
+
DEFAULT_SEPARATOR = ';'
|
40
|
+
|
39
41
|
def initialize(id, lingo)
|
40
42
|
super
|
41
|
-
@pat
|
43
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
|
42
44
|
end
|
43
45
|
|
44
46
|
def set(db, key, val)
|
45
|
-
|
46
|
-
val.each { |v| db[v] =
|
47
|
+
values = val.map { |v| @def ? "#{v}##{@def}" : v }
|
48
|
+
val.each { |v| db[v] = values }
|
47
49
|
end
|
48
50
|
|
49
51
|
private
|
@@ -37,16 +37,15 @@ class Lingo
|
|
37
37
|
class SingleWord < self
|
38
38
|
|
39
39
|
def initialize(id, lingo)
|
40
|
-
super
|
40
|
+
super(id, lingo, Language::LA_NOUN)
|
41
41
|
@pat = /^(#{@wrd})$/
|
42
|
-
@def = @config.fetch('def-wc', 's').downcase
|
43
42
|
@mul = @config.fetch('def-mul-wc', @def).downcase
|
44
43
|
end
|
45
44
|
|
46
45
|
private
|
47
46
|
|
48
47
|
def convert_line(line, key, val)
|
49
|
-
[
|
48
|
+
[k = key.strip, %W[#{k}##{k.include?(' ') ? @mul : @def}]]
|
50
49
|
end
|
51
50
|
|
52
51
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,17 +36,36 @@ class Lingo
|
|
36
36
|
|
37
37
|
class WordClass < self
|
38
38
|
|
39
|
+
DEFAULT_SEPARATOR = ','
|
40
|
+
|
41
|
+
GENDER_SEPARATOR = '.'
|
42
|
+
|
39
43
|
def initialize(id, lingo)
|
40
44
|
super
|
41
|
-
|
45
|
+
|
46
|
+
gen = Regexp.escape(GENDER_SEPARATOR)
|
47
|
+
sep = Regexp.escape(@sep ||= DEFAULT_SEPARATOR)
|
48
|
+
|
49
|
+
w, a = '\w%1$s(?:\|\w%1$s)*', '[+]?'
|
50
|
+
wc = "##{w % a}(?:#{gen}#{w % ''})?"
|
51
|
+
|
52
|
+
@pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
|
42
53
|
end
|
43
54
|
|
44
55
|
private
|
45
56
|
|
46
57
|
def convert_line(line, key, val)
|
47
|
-
|
48
|
-
|
49
|
-
|
58
|
+
values = []
|
59
|
+
|
60
|
+
val.strip.scan(/(\S.*?)\s*#(\S+)/) { |k, v|
|
61
|
+
v, f = v.split('.')
|
62
|
+
|
63
|
+
v.split('|').product(f ? f.split('|') : [nil]) { |w, g|
|
64
|
+
values << "#{k}##{w}##{g}"
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
[key.strip, values]
|
50
69
|
end
|
51
70
|
|
52
71
|
end
|
@@ -53,7 +53,7 @@ class Lingo
|
|
53
53
|
|
54
54
|
attr_reader :pos
|
55
55
|
|
56
|
-
def initialize(id, lingo)
|
56
|
+
def initialize(id, lingo, def_wc_default = nil)
|
57
57
|
@config = lingo.database_config(id)
|
58
58
|
|
59
59
|
source_file = Lingo.find(:dict, name = @config['name'], relax: true)
|
@@ -68,7 +68,8 @@ class Lingo
|
|
68
68
|
|
69
69
|
raise SourceFileNotFoundError.new(name, id) unless @src.exist?
|
70
70
|
|
71
|
-
@def = @config.fetch('def-wc',
|
71
|
+
@def = @config.fetch('def-wc', def_wc_default)
|
72
|
+
@def = @def.downcase if @def
|
72
73
|
@sep = @config['separator']
|
73
74
|
|
74
75
|
@wrd = "(?:#{Language::Char::ANY})+"
|
@@ -87,7 +88,8 @@ class Lingo
|
|
87
88
|
@src.each_line($/, encoding: ENC) { |line|
|
88
89
|
@pos += length = line.bytesize
|
89
90
|
|
90
|
-
|
91
|
+
line.strip!
|
92
|
+
next if line.empty? || line.start_with?('#')
|
91
93
|
|
92
94
|
line.chomp!
|
93
95
|
line.replace(Unicode.downcase(line))
|
data/lib/lingo/database.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -24,7 +24,7 @@
|
|
24
24
|
###############################################################################
|
25
25
|
#++
|
26
26
|
|
27
|
-
require_relative 'database/
|
27
|
+
require_relative 'database/progress'
|
28
28
|
require_relative 'database/crypter'
|
29
29
|
require_relative 'database/source'
|
30
30
|
|
@@ -40,15 +40,9 @@ class Lingo
|
|
40
40
|
class Database
|
41
41
|
|
42
42
|
FLD_SEP = '|'
|
43
|
-
IDX_REF = '^'
|
44
43
|
KEY_REF = '*'
|
45
44
|
SYS_KEY = '~'
|
46
45
|
|
47
|
-
IDX_REF_ESC = Regexp.escape(IDX_REF)
|
48
|
-
KEY_REF_ESC = Regexp.escape(KEY_REF)
|
49
|
-
|
50
|
-
INDEX_PATTERN = %r{\A#{IDX_REF_ESC}\d+\z}
|
51
|
-
|
52
46
|
BACKENDS = []
|
53
47
|
BACKEND_BY_EXT = {}
|
54
48
|
|
@@ -68,13 +62,11 @@ class Lingo
|
|
68
62
|
|
69
63
|
end
|
70
64
|
|
71
|
-
attr_reader :backend
|
72
|
-
|
73
65
|
def initialize(id, lingo)
|
74
66
|
@id, @lingo, @config, @db = id, lingo, lingo.database_config(id), nil
|
75
67
|
|
76
|
-
@srcfile = Lingo.find(:dict,
|
77
|
-
@crypter =
|
68
|
+
@srcfile = Lingo.find(:dict, config['name'], relax: true)
|
69
|
+
@crypter = config.key?('crypt') && Crypter.new
|
78
70
|
|
79
71
|
@val = Hash.new { |h, k| h[k] = [] }
|
80
72
|
|
@@ -93,6 +85,8 @@ class Lingo
|
|
93
85
|
convert unless uptodate?
|
94
86
|
end
|
95
87
|
|
88
|
+
attr_reader :lingo, :config, :backend
|
89
|
+
|
96
90
|
def closed?
|
97
91
|
!@db || _closed?
|
98
92
|
end
|
@@ -125,18 +119,13 @@ class Lingo
|
|
125
119
|
|
126
120
|
def [](key)
|
127
121
|
val = _val(key) unless closed?
|
128
|
-
|
129
|
-
|
130
|
-
# Äquvalenzklassen behandeln
|
131
|
-
val.split(FLD_SEP).map { |v|
|
132
|
-
v =~ INDEX_PATTERN ? _val(v) : v
|
133
|
-
}.compact.join(FLD_SEP).split(FLD_SEP)
|
122
|
+
val.split(FLD_SEP) if val
|
134
123
|
end
|
135
124
|
|
136
125
|
def []=(key, val)
|
137
126
|
return if closed?
|
138
127
|
|
139
|
-
val = @val[key].concat(val)
|
128
|
+
val = @val[key].concat(val)
|
140
129
|
val.uniq!
|
141
130
|
|
142
131
|
val = val.join(FLD_SEP)
|
@@ -144,7 +133,7 @@ class Lingo
|
|
144
133
|
end
|
145
134
|
|
146
135
|
def warn(*msg)
|
147
|
-
|
136
|
+
lingo.warn(*msg)
|
148
137
|
end
|
149
138
|
|
150
139
|
private
|
@@ -171,9 +160,24 @@ class Lingo
|
|
171
160
|
get_backend(mod) or raise BackendNotAvailableError.new(mod, file)
|
172
161
|
end
|
173
162
|
|
163
|
+
def config_hash
|
164
|
+
hashes = [config]
|
165
|
+
|
166
|
+
if use_lex = config['use-lex']
|
167
|
+
hashes.concat(lingo.
|
168
|
+
dictionary_config['databases'].
|
169
|
+
values_at(*use_lex.split(SEP_RE)))
|
170
|
+
end
|
171
|
+
|
172
|
+
Crypter.digest(hashes.inspect)
|
173
|
+
end
|
174
|
+
|
174
175
|
def uptodate?(file = @stofile)
|
175
176
|
src = Pathname.new(@srcfile)
|
176
|
-
|
177
|
+
|
178
|
+
@source_key = lambda {
|
179
|
+
[src.size, src.mtime, VERSION, config_hash].join(FLD_SEP)
|
180
|
+
}
|
177
181
|
|
178
182
|
sys_key = open { @db[SYS_KEY] } if File.exist?(file)
|
179
183
|
sys_key && (!src.exist? || sys_key == @source_key.call)
|
@@ -217,7 +221,7 @@ class Lingo
|
|
217
221
|
end
|
218
222
|
|
219
223
|
def _val(key)
|
220
|
-
if val = _get(@crypter ?
|
224
|
+
if val = _get(@crypter ? Crypter.digest(key) : key)
|
221
225
|
_encode!(val)
|
222
226
|
@crypter ? @crypter.decode(key, val) : val
|
223
227
|
end
|
@@ -227,33 +231,25 @@ class Lingo
|
|
227
231
|
str.force_encoding(ENC)
|
228
232
|
end
|
229
233
|
|
230
|
-
def convert(verbose =
|
231
|
-
src = Source.get(
|
232
|
-
|
233
|
-
if lex = @config['use-lex']
|
234
|
-
a = [{ 'source' => lex.split(SEP_RE), 'mode' => @config['lex-mode'] }, @lingo]
|
235
|
-
d, g = Language::Dictionary.new(*a), Language::Grammar.new(*a); a = nil
|
234
|
+
def convert(verbose = lingo.config.stderr.tty?)
|
235
|
+
src = Source.get(config.fetch('txt-format', 'key_value'), @id, lingo)
|
236
236
|
|
237
|
-
|
238
|
-
(r = d.find_word(f)).unknown? &&
|
239
|
-
(c = (r = g.find_compound(f)).compo_form) ? c.form : r.norm
|
240
|
-
}
|
241
|
-
end
|
237
|
+
sep, key_map, val_map = prepare_lex
|
242
238
|
|
243
|
-
|
239
|
+
Progress.new(self, src, verbose) { |progress| create {
|
244
240
|
src.each { |key, val|
|
245
|
-
progress
|
241
|
+
progress << src.pos
|
246
242
|
|
247
243
|
if key
|
248
244
|
key.chomp!('.')
|
249
245
|
|
250
|
-
if
|
251
|
-
|
246
|
+
if sep && key.include?(sep)
|
247
|
+
key = key.split(sep).map!(&key_map).join(sep)
|
248
|
+
val = val.map { |v| val_map[v.split(sep)].join(sep) } if val_map
|
252
249
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
key, val = k, val.map { |v| v.start_with?('#') ? key + v : v }
|
250
|
+
if (cnt = key.count(sep)) > 2
|
251
|
+
self[key.split(sep)[0, 3].join(sep)] = ["#{KEY_REF}#{cnt + 1}"]
|
252
|
+
end
|
257
253
|
end
|
258
254
|
end
|
259
255
|
|
@@ -264,6 +260,71 @@ class Lingo
|
|
264
260
|
} }
|
265
261
|
end
|
266
262
|
|
263
|
+
def prepare_lex
|
264
|
+
use_lex = config['use-lex'] or return
|
265
|
+
|
266
|
+
args = [{
|
267
|
+
'source' => use_lex.split(SEP_RE),
|
268
|
+
'mode' => config['lex-mode']
|
269
|
+
}, lingo]
|
270
|
+
|
271
|
+
dic = Language::Dictionary.new(*args)
|
272
|
+
gra = Language::Grammar.new(*args)
|
273
|
+
|
274
|
+
args = nil
|
275
|
+
|
276
|
+
if inflect = config['inflect']
|
277
|
+
inflect, wc = inflect == true ? %w[s e] : inflect.split(SEP_RE), 'a'
|
278
|
+
|
279
|
+
if cfg = lingo.dictionary_config['inflect'] and suffixes = cfg[wc]
|
280
|
+
wc, re = /#{wc}/, /\A[^#]+/
|
281
|
+
else
|
282
|
+
warn "#{self.class}: No suffixes to inflect ##{wc}: #{@id}"
|
283
|
+
inflect = false
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
287
|
+
[' ', lambda { |form|
|
288
|
+
word = dic.find_word(form)
|
289
|
+
|
290
|
+
if word.unknown?
|
291
|
+
compo = gra.find_compound(form)
|
292
|
+
|
293
|
+
if compo_form = compo.compo_form
|
294
|
+
compo_form.form
|
295
|
+
else
|
296
|
+
compo.norm
|
297
|
+
end
|
298
|
+
else
|
299
|
+
word.norm
|
300
|
+
end
|
301
|
+
}, inflect && lambda { |forms|
|
302
|
+
inflectables = []
|
303
|
+
|
304
|
+
forms.each { |form|
|
305
|
+
word = dic.find_word(word_form = form[re])
|
306
|
+
|
307
|
+
if word.identified? and lexical = word.get_class(wc).first
|
308
|
+
inflectables << form if form == lexical.form
|
309
|
+
else
|
310
|
+
unless inflectables.empty?
|
311
|
+
comp = gra.find_compound(word_form) if word.unknown?
|
312
|
+
word = comp.head || comp if comp && !comp.unknown?
|
313
|
+
|
314
|
+
if word.attr?(*inflect)
|
315
|
+
suffix = suffixes[word.genders.compact.first]
|
316
|
+
inflectables.each { |lex_form| lex_form << suffix } if suffix
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
break
|
321
|
+
end
|
322
|
+
}
|
323
|
+
|
324
|
+
forms
|
325
|
+
}]
|
326
|
+
end
|
327
|
+
|
267
328
|
end
|
268
329
|
|
269
330
|
end
|
data/lib/lingo/error.rb
CHANGED
@@ -89,7 +89,7 @@ class Lingo
|
|
89
89
|
end
|
90
90
|
|
91
91
|
def to_s
|
92
|
-
error("An error occured
|
92
|
+
error("An error occured while trying to #{action} `#{file}'")
|
93
93
|
end
|
94
94
|
|
95
95
|
end
|
@@ -194,7 +194,29 @@ class Lingo
|
|
194
194
|
end
|
195
195
|
|
196
196
|
def to_s
|
197
|
-
error("#{class_name}: An error occured while trying to load
|
197
|
+
error("#{class_name}: An error occured while trying to load `#{lib}'")
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
class TokenizeError < LingoError
|
203
|
+
|
204
|
+
attr_reader :line, :file, :num, :err
|
205
|
+
|
206
|
+
def initialize(line, file, num, err)
|
207
|
+
@line, @file, @num, @err = line, file, num, err
|
208
|
+
end
|
209
|
+
|
210
|
+
def to_s
|
211
|
+
line, file = self.line, self.file
|
212
|
+
|
213
|
+
if line.is_a?(String) && line.length > 48
|
214
|
+
line = line[0, 45] + '...'
|
215
|
+
end
|
216
|
+
|
217
|
+
file &&= "#{file}:#{num}: "
|
218
|
+
|
219
|
+
error("An error occured while trying to tokenize #{file}#{line.inspect}")
|
198
220
|
end
|
199
221
|
|
200
222
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -30,8 +30,6 @@ class Lingo
|
|
30
30
|
|
31
31
|
class Dictionary
|
32
32
|
|
33
|
-
KEY_REF_RE = %r{\A#{Database::KEY_REF_ESC}\d+}
|
34
|
-
|
35
33
|
def self.open(*args)
|
36
34
|
yield dictionary = new(*args)
|
37
35
|
ensure
|
@@ -39,7 +37,7 @@ class Lingo
|
|
39
37
|
end
|
40
38
|
|
41
39
|
def initialize(config, lingo)
|
42
|
-
unless config.
|
40
|
+
unless config.key?('source')
|
43
41
|
raise ArgumentError, "Required parameter `source' missing."
|
44
42
|
end
|
45
43
|
|
@@ -70,22 +68,18 @@ class Lingo
|
|
70
68
|
#
|
71
69
|
# Erstellt aus dem String ein Wort und sucht nach diesem im Wörterbuch.
|
72
70
|
def find_word(str)
|
73
|
-
(@_word ||= {})[str] ||=
|
74
|
-
|
75
|
-
w.lexicals = lexicals
|
76
|
-
w.attr = WA_IDENTIFIED
|
77
|
-
end
|
78
|
-
}
|
71
|
+
(@_word ||= {})[str] ||=
|
72
|
+
Word.new(str, WA_UNKNOWN).identify(select_with_suffix(str))
|
79
73
|
end
|
80
74
|
|
81
|
-
def find_synonyms(obj, syn = [])
|
75
|
+
def find_synonyms(obj, syn = [], com = true)
|
82
76
|
lex = obj.lexicals
|
83
77
|
lex = [obj] if lex.empty? && obj.unknown?
|
84
78
|
|
85
|
-
com
|
79
|
+
com &&= obj.attr == WA_COMPOUND
|
86
80
|
|
87
81
|
lex.each { |l|
|
88
|
-
select(l.form, syn)
|
82
|
+
select(l.form, syn) unless com &&
|
89
83
|
l.attr != LA_COMPOUND || l.attr == LA_SYNONYM
|
90
84
|
}
|
91
85
|
|
@@ -97,14 +91,11 @@ class Lingo
|
|
97
91
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+)
|
98
92
|
def select(str, lex = [])
|
99
93
|
@src.each { |src|
|
100
|
-
|
101
|
-
lex.concat(block_given? ? l.delete_if { |i| yield i } : l)
|
94
|
+
lex.concat(src[str] || next)
|
102
95
|
break unless @all
|
103
96
|
}
|
104
97
|
|
105
|
-
lex.
|
106
|
-
lex.uniq!
|
107
|
-
|
98
|
+
lex.empty? && block_given? ? yield(lex) : lex.uniq!
|
108
99
|
lex
|
109
100
|
end
|
110
101
|
|
@@ -113,7 +104,17 @@ class Lingo
|
|
113
104
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
114
105
|
# Sucht dabei auch Wörter, die um wortklassenspezifische Suffixe bereinigt wurden.
|
115
106
|
def select_with_suffix(str)
|
116
|
-
|
107
|
+
select(str) { |lex|
|
108
|
+
each_affix(str) { |form, attr|
|
109
|
+
unless (selected = select(form)).empty?
|
110
|
+
if selected.first.attr == LA_COMPOUND
|
111
|
+
lex.concat(selected) if selected.last.attr?(attr)
|
112
|
+
else
|
113
|
+
selected.each { |l| lex << l if l.attr?(attr) }
|
114
|
+
end
|
115
|
+
end
|
116
|
+
}
|
117
|
+
}
|
117
118
|
end
|
118
119
|
|
119
120
|
# _dic_.select_with_infix( _aString_ ) -> _ArrayOfLexicals_
|
@@ -121,44 +122,15 @@ class Lingo
|
|
121
122
|
# Sucht alle Wörterbücher durch und gibt den ersten Treffer zurück (+mode = first+), oder alle Treffer (+mode = all+).
|
122
123
|
# Sucht dabei auch Wörter, die eine Fugung am Ende haben.
|
123
124
|
def select_with_infix(str)
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
# _dic_.suffix_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
128
|
-
#
|
129
|
-
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
130
|
-
#
|
131
|
-
# dic.suffix_lexicals("Hasens") -> [(hasen/s), (hasen/e), (has/e)]
|
132
|
-
def suffix_lexicals(str)
|
133
|
-
affix_lexicals(:suffix, str)
|
134
|
-
end
|
135
|
-
|
136
|
-
# _dic_.gap_lexicals( _aString_ ) -> _ArrayOfLexicals_
|
137
|
-
#
|
138
|
-
# Gibt alle möglichen Lexicals zurück, die von der Endung her auf den String anwendbar sind:
|
139
|
-
def infix_lexicals(str)
|
140
|
-
affix_lexicals(:infix, str)
|
141
|
-
end
|
142
|
-
|
143
|
-
private
|
144
|
-
|
145
|
-
def select_with_affix(affix, str)
|
146
|
-
lex = select(str)
|
147
|
-
|
148
|
-
affix_lexicals(affix, str).each { |a| select(a.form, lex) { |b|
|
149
|
-
affix == :suffix && a.attr != b.attr
|
150
|
-
} } if lex.empty?
|
151
|
-
|
152
|
-
lex
|
125
|
+
select(str) { |lex|
|
126
|
+
each_affix(str, :infix) { |form, _| select(form, lex) }
|
127
|
+
}
|
153
128
|
end
|
154
129
|
|
155
|
-
def
|
156
|
-
|
157
|
-
|
130
|
+
def each_affix(str, affix = :suffix)
|
131
|
+
instance_variable_get("@#{affix}es").each { |r, e, t|
|
132
|
+
yield "#{$`}#{e == '*' ? '' : e}#{$'}", t if str =~ r
|
158
133
|
}
|
159
|
-
|
160
|
-
lex.compact!
|
161
|
-
lex
|
162
134
|
end
|
163
135
|
|
164
136
|
end
|