lingo 1.9.0.pre1 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +18 -7
- data/README +6 -8
- data/Rakefile +5 -5
- data/dict/en/lingo-dic.txt +52625 -15693
- data/lang/en.lang +2 -2
- data/lib/lingo.rb +15 -3
- data/lib/lingo/array_utils.rb +39 -0
- data/lib/lingo/attendee.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +4 -2
- data/lib/lingo/attendee/sequencer.rb +122 -73
- data/lib/lingo/attendee/text_writer.rb +4 -6
- data/lib/lingo/attendee/vector_filter.rb +5 -5
- data/lib/lingo/cli.rb +20 -2
- data/lib/lingo/config.rb +4 -3
- data/lib/lingo/ctl.rb +2 -20
- data/lib/lingo/ctl/analysis.rb +3 -5
- data/lib/lingo/ctl/files.rb +3 -3
- data/lib/lingo/database.rb +26 -25
- data/lib/lingo/database/crypter.rb +10 -6
- data/lib/lingo/database/source.rb +72 -25
- data/lib/lingo/database/source/key_value.rb +12 -8
- data/lib/lingo/database/source/multi_key.rb +11 -9
- data/lib/lingo/database/source/multi_value.rb +10 -8
- data/lib/lingo/database/source/single_word.rb +10 -6
- data/lib/lingo/database/source/word_class.rb +43 -14
- data/lib/lingo/debug.rb +2 -2
- data/lib/lingo/error.rb +21 -5
- data/lib/lingo/filter.rb +1 -1
- data/lib/lingo/language.rb +21 -21
- data/lib/lingo/language/grammar.rb +4 -2
- data/lib/lingo/language/lexical_hash.rb +2 -14
- data/lib/lingo/language/word.rb +1 -5
- data/lib/lingo/text_utils.rb +113 -20
- data/lib/lingo/version.rb +1 -1
- data/test/attendee/ts_sequencer.rb +286 -32
- data/test/attendee/ts_text_reader.rb +4 -4
- data/test/attendee/ts_text_writer.rb +19 -5
- data/test/test_helper.rb +2 -0
- data/test/ts_database.rb +213 -14
- metadata +36 -24
data/lang/en.lang
CHANGED
@@ -68,9 +68,9 @@ language:
|
|
68
68
|
# Suffixliste, Stand: *****ENGLISCH****
|
69
69
|
# Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
|
70
70
|
# Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
|
71
|
-
- [s, 'es s ves/f ves/fe ies/y']
|
71
|
+
- [s, 'es es/is s ves/f ves/fe ies/y']
|
72
72
|
- [a, 'er est r st ier/y iest/y ly al ally']
|
73
|
-
- [v, 'd ed en es ing s ing/e']
|
73
|
+
- [v, 'd ed en es ies/y ing s ing/e']
|
74
74
|
- [e, 's']
|
75
75
|
- [f, '']
|
76
76
|
|
data/lib/lingo.rb
CHANGED
@@ -32,6 +32,7 @@ require 'nuggets/file/ext'
|
|
32
32
|
require 'nuggets/hash/nest'
|
33
33
|
require 'nuggets/hash/seen'
|
34
34
|
require 'nuggets/env/user_home'
|
35
|
+
require 'nuggets/object/silence'
|
35
36
|
require 'nuggets/string/camelscore'
|
36
37
|
|
37
38
|
class Lingo
|
@@ -61,8 +62,12 @@ class Lingo
|
|
61
62
|
}
|
62
63
|
|
63
64
|
# Default encoding
|
64
|
-
|
65
|
-
|
65
|
+
ENCODING = 'UTF-8'.freeze
|
66
|
+
|
67
|
+
silence {
|
68
|
+
Encoding.default_external = ENCODING
|
69
|
+
Encoding.default_internal = ENCODING unless RUBY_ENGINE == 'jruby'
|
70
|
+
}
|
66
71
|
|
67
72
|
SEP_RE = %r{[; ,|]}
|
68
73
|
|
@@ -298,6 +303,11 @@ class Lingo
|
|
298
303
|
} }
|
299
304
|
end
|
300
305
|
|
306
|
+
def attendees(arg = Object)
|
307
|
+
@attendees.grep(arg.is_a?(Class) ? arg :
|
308
|
+
Attendee.const_get(arg.to_s.camelcase))
|
309
|
+
end
|
310
|
+
|
301
311
|
def start
|
302
312
|
@attendees.first.control(:TALK)
|
303
313
|
end
|
@@ -319,9 +329,11 @@ require_relative 'lingo/error'
|
|
319
329
|
require_relative 'lingo/debug'
|
320
330
|
require_relative 'lingo/config'
|
321
331
|
require_relative 'lingo/filter'
|
332
|
+
require_relative 'lingo/array_utils'
|
333
|
+
require_relative 'lingo/text_utils'
|
334
|
+
require_relative 'lingo/language'
|
322
335
|
require_relative 'lingo/progress'
|
323
336
|
require_relative 'lingo/database'
|
324
|
-
require_relative 'lingo/language'
|
325
337
|
require_relative 'lingo/attendee'
|
326
338
|
require_relative 'lingo/version'
|
327
339
|
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
module ArrayUtils
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def combinations(first, *rest, &block)
|
34
|
+
first.product(*rest, &block)
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
data/lib/lingo/attendee.rb
CHANGED
@@ -146,7 +146,7 @@ class Lingo
|
|
146
146
|
end
|
147
147
|
end
|
148
148
|
|
149
|
-
def get_enc(key = 'encoding', default =
|
149
|
+
def get_enc(key = 'encoding', default = ENCODING)
|
150
150
|
Encoding.find(get_key(key, default))
|
151
151
|
rescue ArgumentError => err
|
152
152
|
raise ConfigLoadError.new(err)
|
@@ -182,8 +182,6 @@ class Lingo
|
|
182
182
|
|
183
183
|
end
|
184
184
|
|
185
|
-
require_relative 'text_utils'
|
186
|
-
|
187
185
|
require_relative 'buffered_attendee'
|
188
186
|
require_relative 'deferred_attendee'
|
189
187
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -79,6 +79,8 @@ class Lingo
|
|
79
79
|
|
80
80
|
class MultiWorder < BufferedAttendee
|
81
81
|
|
82
|
+
include ArrayUtils
|
83
|
+
|
82
84
|
def init
|
83
85
|
# combine lexical variants?
|
84
86
|
#
|
@@ -220,7 +222,7 @@ class Lingo
|
|
220
222
|
}
|
221
223
|
|
222
224
|
if @combine
|
223
|
-
|
225
|
+
combinations(*seq) { |key|
|
224
226
|
@mul_dic.select(key.join(sep), mul)
|
225
227
|
break unless @all || mul.empty?
|
226
228
|
} && mul.uniq!
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -97,23 +97,53 @@ class Lingo
|
|
97
97
|
|
98
98
|
class Sequencer < BufferedAttendee
|
99
99
|
|
100
|
+
include ArrayUtils
|
101
|
+
|
100
102
|
UNK = %w[#]
|
101
103
|
NUM = %w[0]
|
102
104
|
|
103
|
-
|
105
|
+
CLASS_RE = %r{[a-z#{NUM.join}]}o
|
106
|
+
|
107
|
+
REGEX_RE = %r{
|
108
|
+
( #{CLASS_RE}+ )
|
109
|
+
|
|
110
|
+
\[
|
111
|
+
( #{CLASS_RE}+ )
|
112
|
+
\]
|
113
|
+
|
|
114
|
+
\(
|
115
|
+
(?: \?: )?
|
116
|
+
( #{CLASS_RE}+ (?: \| #{CLASS_RE}+ )* )
|
117
|
+
\)
|
118
|
+
}xo
|
119
|
+
|
120
|
+
FULL_CLASS_RE = %r{\A(?:#{CLASS_RE})+\z}o
|
121
|
+
FULL_REGEX_RE = %r{\A(?:#{REGEX_RE})+\z}o
|
104
122
|
|
105
123
|
def init
|
106
124
|
@stopper = get_ary('stopper', DEFAULT_SKIP)
|
107
125
|
.push(WA_UNKNOWN, WA_UNKMULPART)
|
108
126
|
|
109
127
|
@mwc = get_key('multiword', LA_MULTIWORD)
|
110
|
-
@cls = []
|
111
128
|
|
112
|
-
@seq =
|
113
|
-
|
129
|
+
@cls, @seq = [], []
|
130
|
+
|
131
|
+
get_key('sequences').each { |str, fmt|
|
132
|
+
seq, fmt = lambda { |*a| @seq << (a << fmt) },
|
133
|
+
fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil
|
134
|
+
|
135
|
+
@cls.concat(cls = (str = str.downcase).scan(CLASS_RE))
|
114
136
|
|
115
|
-
|
116
|
-
|
137
|
+
case str
|
138
|
+
when FULL_CLASS_RE then seq[str, cls]
|
139
|
+
when FULL_REGEX_RE then m = []
|
140
|
+
str.scan(REGEX_RE) { |m1, m2, m3|
|
141
|
+
m1 ? m1.each_char { |c| m << [c] } : m << (
|
142
|
+
m2 ? m2.chars : m3.split('|').map(&:chars)) }
|
143
|
+
|
144
|
+
combinations(*m) { |q| seq[q.join, q.flatten] }
|
145
|
+
else seq[Regexp.new(str), nil]
|
146
|
+
end
|
117
147
|
}
|
118
148
|
|
119
149
|
@cls.uniq!
|
@@ -130,90 +160,109 @@ class Lingo
|
|
130
160
|
end
|
131
161
|
|
132
162
|
def process_buffer
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
137
|
-
iter.rewind; skip.times { iter.next }; skip = 0
|
138
|
-
}
|
139
|
-
|
140
|
-
loop {
|
141
|
-
obj, idx = begin
|
142
|
-
iter.next
|
143
|
-
rescue StopIteration
|
144
|
-
raise unless skip > 0
|
145
|
-
|
146
|
-
buf.slice!(0, skip)
|
147
|
-
map.slice!(0, skip)
|
148
|
-
|
149
|
-
rewind.call
|
150
|
-
end
|
151
|
-
|
152
|
-
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
153
|
-
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
154
|
-
|
155
|
-
if (att &= @cls).empty?
|
156
|
-
find_seq(*arg)
|
157
|
-
rewind.call if skip > 0
|
158
|
-
else
|
159
|
-
obj.each_lex(@mwc) { |lex|
|
160
|
-
lex.form.count(' ').succ.times { iter.next }
|
161
|
-
break skip = idx + 1
|
162
|
-
} unless tok
|
163
|
-
|
164
|
-
buf << obj
|
165
|
-
map << att
|
166
|
-
end
|
167
|
-
}
|
168
|
-
|
169
|
-
@buffer.concat(find_seq(*arg))
|
170
|
-
end)
|
163
|
+
process_seq if @buffer.size > 1
|
164
|
+
flush(@buffer)
|
171
165
|
end
|
172
166
|
|
173
167
|
private
|
174
168
|
|
175
|
-
def
|
176
|
-
|
169
|
+
def process_seq
|
170
|
+
buf, map = [], []
|
177
171
|
|
178
|
-
|
172
|
+
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
173
|
+
iter.rewind; skip.times { iter.next }; skip = 0
|
174
|
+
}
|
179
175
|
|
180
|
-
|
176
|
+
loop {
|
177
|
+
obj, idx = begin
|
178
|
+
iter.next
|
179
|
+
rescue StopIteration
|
180
|
+
raise unless skip > 0
|
181
|
+
|
182
|
+
buf.slice!(0, skip)
|
183
|
+
map.slice!(0, skip)
|
184
|
+
|
185
|
+
rewind.call
|
186
|
+
end
|
187
|
+
|
188
|
+
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
189
|
+
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
190
|
+
|
191
|
+
if (att &= @cls).empty?
|
192
|
+
find_seq(buf, map)
|
193
|
+
rewind.call if skip > 0
|
194
|
+
else
|
195
|
+
obj.each_lex(@mwc) { |lex|
|
196
|
+
lex.form.count(' ').succ.times { iter.next }
|
197
|
+
break skip = idx + 1
|
198
|
+
} unless tok
|
199
|
+
|
200
|
+
buf << obj
|
201
|
+
map << att
|
202
|
+
end
|
203
|
+
}
|
181
204
|
|
182
|
-
map
|
183
|
-
|
184
|
-
_str, _cls = [str, cls]
|
205
|
+
find_seq(buf, map)
|
206
|
+
end
|
185
207
|
|
186
|
-
|
187
|
-
|
208
|
+
def find_seq(buf, map)
|
209
|
+
return if buf.empty?
|
188
210
|
|
189
|
-
|
211
|
+
objs, args = [], []
|
190
212
|
|
191
|
-
|
192
|
-
|
213
|
+
@seq.each { |str, cls, fmt|
|
214
|
+
if cls
|
215
|
+
len = cls.size
|
193
216
|
|
194
|
-
|
195
|
-
|
196
|
-
} or next
|
217
|
+
buf.each_cons(len).zip(map.each_cons(len)) { |_buf, _map|
|
218
|
+
obj = _buf.each; objs.clear; args.clear
|
197
219
|
|
198
|
-
|
199
|
-
|
200
|
-
|
220
|
+
next if _map.zip(cls) { |_wc, wc|
|
221
|
+
break true unless _wc.include?(wc) &&
|
222
|
+
find_form(obj.next, wc, objs, args)
|
223
|
+
}
|
201
224
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
225
|
+
forward_seq(fmt, str, objs, args)
|
226
|
+
}
|
227
|
+
else
|
228
|
+
combinations(*map) { |q|
|
229
|
+
q, pos = q.join, -1
|
206
230
|
|
207
|
-
|
231
|
+
while pos = q.index(str, pos += 1)
|
232
|
+
objs.clear; args.clear
|
208
233
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
234
|
+
next unless $&.each_char.with_index { |wc, i|
|
235
|
+
find_form(buf[pos + i], wc, objs, args) or break
|
236
|
+
}
|
237
|
+
|
238
|
+
forward_seq(fmt, $&, objs, args)
|
239
|
+
end
|
240
|
+
}
|
241
|
+
end
|
213
242
|
}
|
214
243
|
|
215
244
|
buf.clear
|
216
|
-
|
245
|
+
map.clear
|
246
|
+
end
|
247
|
+
|
248
|
+
def find_form(obj, wc, objs, args)
|
249
|
+
form = obj.is_a?(Word) ? obj.lexicals.find { |lex|
|
250
|
+
break lex.form if lex.attr == wc } : obj.form or return
|
251
|
+
|
252
|
+
objs << obj
|
253
|
+
args << form
|
254
|
+
end
|
255
|
+
|
256
|
+
def forward_seq(fmt, str, objs, args)
|
257
|
+
wrd_form, form = objs.map(&:form).join(' '),
|
258
|
+
fmt =~ /\d/ ? fmt.gsub('%0$s', str) % args :
|
259
|
+
fmt ? "#{str}:#{args.join(fmt)}" : args.join(' ')
|
260
|
+
|
261
|
+
wrd = Word.new(wrd_form, WA_SEQUENCE)
|
262
|
+
wrd << Lexical.new(form, LA_SEQUENCE)
|
263
|
+
wrd.pattern, wrd.token = str, objs.first.token
|
264
|
+
|
265
|
+
@buffer << wrd
|
217
266
|
end
|
218
267
|
|
219
268
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -99,10 +99,8 @@ class Lingo
|
|
99
99
|
when :LIR
|
100
100
|
@lir = true unless @lir.nil?
|
101
101
|
when :FILE
|
102
|
-
@no_sep = true
|
103
|
-
|
104
|
-
@io = stdout?(@ext) ? (@path = @ext; open_stdout) :
|
105
|
-
open_path(@path = set_ext(param, @ext), 'w')
|
102
|
+
@no_sep, @io = true, (@stdout = stdout?(@ext)) ?
|
103
|
+
open_stdout : open_path(get_path(param, @ext), 'w')
|
106
104
|
|
107
105
|
@lir_rec_no, @lir_rec_buf = '', []
|
108
106
|
when :RECORD
|
@@ -117,7 +115,7 @@ class Lingo
|
|
117
115
|
@io.puts unless @lir || @no_puts
|
118
116
|
when :EOF
|
119
117
|
flush_lir_buffer if @lir
|
120
|
-
@io.close unless stdout
|
118
|
+
@io.close unless @stdout
|
121
119
|
end
|
122
120
|
end
|
123
121
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -24,8 +24,6 @@
|
|
24
24
|
###############################################################################
|
25
25
|
#++
|
26
26
|
|
27
|
-
require 'csv'
|
28
|
-
|
29
27
|
class Lingo
|
30
28
|
|
31
29
|
class Attendee
|
@@ -83,6 +81,8 @@ class Lingo
|
|
83
81
|
|
84
82
|
class VectorFilter < self
|
85
83
|
|
84
|
+
include TextUtils
|
85
|
+
|
86
86
|
DEFAULT_SRC_SEPARATOR = '|'
|
87
87
|
DEFAULT_POS_SEPARATOR = '@'
|
88
88
|
|
@@ -186,7 +186,7 @@ class Lingo
|
|
186
186
|
vec = vec.form if vec.is_a?(WordForm)
|
187
187
|
|
188
188
|
vec = Unicode.downcase(vec)
|
189
|
-
vec << @src << src if @src && src
|
189
|
+
vec << @src << src.form if @src && src
|
190
190
|
|
191
191
|
@sort_fmt ? vectors[vec] << pos : forward(vec_pos(vec, [pos]))
|
192
192
|
end
|
@@ -198,7 +198,7 @@ class Lingo
|
|
198
198
|
@vectors.each_value { |w| w.each_key { |v| df[v] += 1 } }
|
199
199
|
|
200
200
|
if @tfidf.is_a?(String)
|
201
|
-
|
201
|
+
open_csv(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
|
202
202
|
end
|
203
203
|
|
204
204
|
yield lambda { |docnum|
|