lingo 1.9.0.pre1 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +18 -7
- data/README +6 -8
- data/Rakefile +5 -5
- data/dict/en/lingo-dic.txt +52625 -15693
- data/lang/en.lang +2 -2
- data/lib/lingo.rb +15 -3
- data/lib/lingo/array_utils.rb +39 -0
- data/lib/lingo/attendee.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +4 -2
- data/lib/lingo/attendee/sequencer.rb +122 -73
- data/lib/lingo/attendee/text_writer.rb +4 -6
- data/lib/lingo/attendee/vector_filter.rb +5 -5
- data/lib/lingo/cli.rb +20 -2
- data/lib/lingo/config.rb +4 -3
- data/lib/lingo/ctl.rb +2 -20
- data/lib/lingo/ctl/analysis.rb +3 -5
- data/lib/lingo/ctl/files.rb +3 -3
- data/lib/lingo/database.rb +26 -25
- data/lib/lingo/database/crypter.rb +10 -6
- data/lib/lingo/database/source.rb +72 -25
- data/lib/lingo/database/source/key_value.rb +12 -8
- data/lib/lingo/database/source/multi_key.rb +11 -9
- data/lib/lingo/database/source/multi_value.rb +10 -8
- data/lib/lingo/database/source/single_word.rb +10 -6
- data/lib/lingo/database/source/word_class.rb +43 -14
- data/lib/lingo/debug.rb +2 -2
- data/lib/lingo/error.rb +21 -5
- data/lib/lingo/filter.rb +1 -1
- data/lib/lingo/language.rb +21 -21
- data/lib/lingo/language/grammar.rb +4 -2
- data/lib/lingo/language/lexical_hash.rb +2 -14
- data/lib/lingo/language/word.rb +1 -5
- data/lib/lingo/text_utils.rb +113 -20
- data/lib/lingo/version.rb +1 -1
- data/test/attendee/ts_sequencer.rb +286 -32
- data/test/attendee/ts_text_reader.rb +4 -4
- data/test/attendee/ts_text_writer.rb +19 -5
- data/test/test_helper.rb +2 -0
- data/test/ts_database.rb +213 -14
- metadata +36 -24
data/lang/en.lang
CHANGED
@@ -68,9 +68,9 @@ language:
|
|
68
68
|
# Suffixliste, Stand: *****ENGLISCH****
|
69
69
|
# Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
|
70
70
|
# Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
|
71
|
-
- [s, 'es s ves/f ves/fe ies/y']
|
71
|
+
- [s, 'es es/is s ves/f ves/fe ies/y']
|
72
72
|
- [a, 'er est r st ier/y iest/y ly al ally']
|
73
|
-
- [v, 'd ed en es ing s ing/e']
|
73
|
+
- [v, 'd ed en es ies/y ing s ing/e']
|
74
74
|
- [e, 's']
|
75
75
|
- [f, '']
|
76
76
|
|
data/lib/lingo.rb
CHANGED
@@ -32,6 +32,7 @@ require 'nuggets/file/ext'
|
|
32
32
|
require 'nuggets/hash/nest'
|
33
33
|
require 'nuggets/hash/seen'
|
34
34
|
require 'nuggets/env/user_home'
|
35
|
+
require 'nuggets/object/silence'
|
35
36
|
require 'nuggets/string/camelscore'
|
36
37
|
|
37
38
|
class Lingo
|
@@ -61,8 +62,12 @@ class Lingo
|
|
61
62
|
}
|
62
63
|
|
63
64
|
# Default encoding
|
64
|
-
|
65
|
-
|
65
|
+
ENCODING = 'UTF-8'.freeze
|
66
|
+
|
67
|
+
silence {
|
68
|
+
Encoding.default_external = ENCODING
|
69
|
+
Encoding.default_internal = ENCODING unless RUBY_ENGINE == 'jruby'
|
70
|
+
}
|
66
71
|
|
67
72
|
SEP_RE = %r{[; ,|]}
|
68
73
|
|
@@ -298,6 +303,11 @@ class Lingo
|
|
298
303
|
} }
|
299
304
|
end
|
300
305
|
|
306
|
+
def attendees(arg = Object)
|
307
|
+
@attendees.grep(arg.is_a?(Class) ? arg :
|
308
|
+
Attendee.const_get(arg.to_s.camelcase))
|
309
|
+
end
|
310
|
+
|
301
311
|
def start
|
302
312
|
@attendees.first.control(:TALK)
|
303
313
|
end
|
@@ -319,9 +329,11 @@ require_relative 'lingo/error'
|
|
319
329
|
require_relative 'lingo/debug'
|
320
330
|
require_relative 'lingo/config'
|
321
331
|
require_relative 'lingo/filter'
|
332
|
+
require_relative 'lingo/array_utils'
|
333
|
+
require_relative 'lingo/text_utils'
|
334
|
+
require_relative 'lingo/language'
|
322
335
|
require_relative 'lingo/progress'
|
323
336
|
require_relative 'lingo/database'
|
324
|
-
require_relative 'lingo/language'
|
325
337
|
require_relative 'lingo/attendee'
|
326
338
|
require_relative 'lingo/version'
|
327
339
|
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
module ArrayUtils
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def combinations(first, *rest, &block)
|
34
|
+
first.product(*rest, &block)
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
data/lib/lingo/attendee.rb
CHANGED
@@ -146,7 +146,7 @@ class Lingo
|
|
146
146
|
end
|
147
147
|
end
|
148
148
|
|
149
|
-
def get_enc(key = 'encoding', default =
|
149
|
+
def get_enc(key = 'encoding', default = ENCODING)
|
150
150
|
Encoding.find(get_key(key, default))
|
151
151
|
rescue ArgumentError => err
|
152
152
|
raise ConfigLoadError.new(err)
|
@@ -182,8 +182,6 @@ class Lingo
|
|
182
182
|
|
183
183
|
end
|
184
184
|
|
185
|
-
require_relative 'text_utils'
|
186
|
-
|
187
185
|
require_relative 'buffered_attendee'
|
188
186
|
require_relative 'deferred_attendee'
|
189
187
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -79,6 +79,8 @@ class Lingo
|
|
79
79
|
|
80
80
|
class MultiWorder < BufferedAttendee
|
81
81
|
|
82
|
+
include ArrayUtils
|
83
|
+
|
82
84
|
def init
|
83
85
|
# combine lexical variants?
|
84
86
|
#
|
@@ -220,7 +222,7 @@ class Lingo
|
|
220
222
|
}
|
221
223
|
|
222
224
|
if @combine
|
223
|
-
|
225
|
+
combinations(*seq) { |key|
|
224
226
|
@mul_dic.select(key.join(sep), mul)
|
225
227
|
break unless @all || mul.empty?
|
226
228
|
} && mul.uniq!
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -97,23 +97,53 @@ class Lingo
|
|
97
97
|
|
98
98
|
class Sequencer < BufferedAttendee
|
99
99
|
|
100
|
+
include ArrayUtils
|
101
|
+
|
100
102
|
UNK = %w[#]
|
101
103
|
NUM = %w[0]
|
102
104
|
|
103
|
-
|
105
|
+
CLASS_RE = %r{[a-z#{NUM.join}]}o
|
106
|
+
|
107
|
+
REGEX_RE = %r{
|
108
|
+
( #{CLASS_RE}+ )
|
109
|
+
|
|
110
|
+
\[
|
111
|
+
( #{CLASS_RE}+ )
|
112
|
+
\]
|
113
|
+
|
|
114
|
+
\(
|
115
|
+
(?: \?: )?
|
116
|
+
( #{CLASS_RE}+ (?: \| #{CLASS_RE}+ )* )
|
117
|
+
\)
|
118
|
+
}xo
|
119
|
+
|
120
|
+
FULL_CLASS_RE = %r{\A(?:#{CLASS_RE})+\z}o
|
121
|
+
FULL_REGEX_RE = %r{\A(?:#{REGEX_RE})+\z}o
|
104
122
|
|
105
123
|
def init
|
106
124
|
@stopper = get_ary('stopper', DEFAULT_SKIP)
|
107
125
|
.push(WA_UNKNOWN, WA_UNKMULPART)
|
108
126
|
|
109
127
|
@mwc = get_key('multiword', LA_MULTIWORD)
|
110
|
-
@cls = []
|
111
128
|
|
112
|
-
@seq =
|
113
|
-
|
129
|
+
@cls, @seq = [], []
|
130
|
+
|
131
|
+
get_key('sequences').each { |str, fmt|
|
132
|
+
seq, fmt = lambda { |*a| @seq << (a << fmt) },
|
133
|
+
fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil
|
134
|
+
|
135
|
+
@cls.concat(cls = (str = str.downcase).scan(CLASS_RE))
|
114
136
|
|
115
|
-
|
116
|
-
|
137
|
+
case str
|
138
|
+
when FULL_CLASS_RE then seq[str, cls]
|
139
|
+
when FULL_REGEX_RE then m = []
|
140
|
+
str.scan(REGEX_RE) { |m1, m2, m3|
|
141
|
+
m1 ? m1.each_char { |c| m << [c] } : m << (
|
142
|
+
m2 ? m2.chars : m3.split('|').map(&:chars)) }
|
143
|
+
|
144
|
+
combinations(*m) { |q| seq[q.join, q.flatten] }
|
145
|
+
else seq[Regexp.new(str), nil]
|
146
|
+
end
|
117
147
|
}
|
118
148
|
|
119
149
|
@cls.uniq!
|
@@ -130,90 +160,109 @@ class Lingo
|
|
130
160
|
end
|
131
161
|
|
132
162
|
def process_buffer
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
137
|
-
iter.rewind; skip.times { iter.next }; skip = 0
|
138
|
-
}
|
139
|
-
|
140
|
-
loop {
|
141
|
-
obj, idx = begin
|
142
|
-
iter.next
|
143
|
-
rescue StopIteration
|
144
|
-
raise unless skip > 0
|
145
|
-
|
146
|
-
buf.slice!(0, skip)
|
147
|
-
map.slice!(0, skip)
|
148
|
-
|
149
|
-
rewind.call
|
150
|
-
end
|
151
|
-
|
152
|
-
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
153
|
-
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
154
|
-
|
155
|
-
if (att &= @cls).empty?
|
156
|
-
find_seq(*arg)
|
157
|
-
rewind.call if skip > 0
|
158
|
-
else
|
159
|
-
obj.each_lex(@mwc) { |lex|
|
160
|
-
lex.form.count(' ').succ.times { iter.next }
|
161
|
-
break skip = idx + 1
|
162
|
-
} unless tok
|
163
|
-
|
164
|
-
buf << obj
|
165
|
-
map << att
|
166
|
-
end
|
167
|
-
}
|
168
|
-
|
169
|
-
@buffer.concat(find_seq(*arg))
|
170
|
-
end)
|
163
|
+
process_seq if @buffer.size > 1
|
164
|
+
flush(@buffer)
|
171
165
|
end
|
172
166
|
|
173
167
|
private
|
174
168
|
|
175
|
-
def
|
176
|
-
|
169
|
+
def process_seq
|
170
|
+
buf, map = [], []
|
177
171
|
|
178
|
-
|
172
|
+
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
173
|
+
iter.rewind; skip.times { iter.next }; skip = 0
|
174
|
+
}
|
179
175
|
|
180
|
-
|
176
|
+
loop {
|
177
|
+
obj, idx = begin
|
178
|
+
iter.next
|
179
|
+
rescue StopIteration
|
180
|
+
raise unless skip > 0
|
181
|
+
|
182
|
+
buf.slice!(0, skip)
|
183
|
+
map.slice!(0, skip)
|
184
|
+
|
185
|
+
rewind.call
|
186
|
+
end
|
187
|
+
|
188
|
+
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
189
|
+
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
190
|
+
|
191
|
+
if (att &= @cls).empty?
|
192
|
+
find_seq(buf, map)
|
193
|
+
rewind.call if skip > 0
|
194
|
+
else
|
195
|
+
obj.each_lex(@mwc) { |lex|
|
196
|
+
lex.form.count(' ').succ.times { iter.next }
|
197
|
+
break skip = idx + 1
|
198
|
+
} unless tok
|
199
|
+
|
200
|
+
buf << obj
|
201
|
+
map << att
|
202
|
+
end
|
203
|
+
}
|
181
204
|
|
182
|
-
map
|
183
|
-
|
184
|
-
_str, _cls = [str, cls]
|
205
|
+
find_seq(buf, map)
|
206
|
+
end
|
185
207
|
|
186
|
-
|
187
|
-
|
208
|
+
def find_seq(buf, map)
|
209
|
+
return if buf.empty?
|
188
210
|
|
189
|
-
|
211
|
+
objs, args = [], []
|
190
212
|
|
191
|
-
|
192
|
-
|
213
|
+
@seq.each { |str, cls, fmt|
|
214
|
+
if cls
|
215
|
+
len = cls.size
|
193
216
|
|
194
|
-
|
195
|
-
|
196
|
-
} or next
|
217
|
+
buf.each_cons(len).zip(map.each_cons(len)) { |_buf, _map|
|
218
|
+
obj = _buf.each; objs.clear; args.clear
|
197
219
|
|
198
|
-
|
199
|
-
|
200
|
-
|
220
|
+
next if _map.zip(cls) { |_wc, wc|
|
221
|
+
break true unless _wc.include?(wc) &&
|
222
|
+
find_form(obj.next, wc, objs, args)
|
223
|
+
}
|
201
224
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
225
|
+
forward_seq(fmt, str, objs, args)
|
226
|
+
}
|
227
|
+
else
|
228
|
+
combinations(*map) { |q|
|
229
|
+
q, pos = q.join, -1
|
206
230
|
|
207
|
-
|
231
|
+
while pos = q.index(str, pos += 1)
|
232
|
+
objs.clear; args.clear
|
208
233
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
234
|
+
next unless $&.each_char.with_index { |wc, i|
|
235
|
+
find_form(buf[pos + i], wc, objs, args) or break
|
236
|
+
}
|
237
|
+
|
238
|
+
forward_seq(fmt, $&, objs, args)
|
239
|
+
end
|
240
|
+
}
|
241
|
+
end
|
213
242
|
}
|
214
243
|
|
215
244
|
buf.clear
|
216
|
-
|
245
|
+
map.clear
|
246
|
+
end
|
247
|
+
|
248
|
+
def find_form(obj, wc, objs, args)
|
249
|
+
form = obj.is_a?(Word) ? obj.lexicals.find { |lex|
|
250
|
+
break lex.form if lex.attr == wc } : obj.form or return
|
251
|
+
|
252
|
+
objs << obj
|
253
|
+
args << form
|
254
|
+
end
|
255
|
+
|
256
|
+
def forward_seq(fmt, str, objs, args)
|
257
|
+
wrd_form, form = objs.map(&:form).join(' '),
|
258
|
+
fmt =~ /\d/ ? fmt.gsub('%0$s', str) % args :
|
259
|
+
fmt ? "#{str}:#{args.join(fmt)}" : args.join(' ')
|
260
|
+
|
261
|
+
wrd = Word.new(wrd_form, WA_SEQUENCE)
|
262
|
+
wrd << Lexical.new(form, LA_SEQUENCE)
|
263
|
+
wrd.pattern, wrd.token = str, objs.first.token
|
264
|
+
|
265
|
+
@buffer << wrd
|
217
266
|
end
|
218
267
|
|
219
268
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -99,10 +99,8 @@ class Lingo
|
|
99
99
|
when :LIR
|
100
100
|
@lir = true unless @lir.nil?
|
101
101
|
when :FILE
|
102
|
-
@no_sep = true
|
103
|
-
|
104
|
-
@io = stdout?(@ext) ? (@path = @ext; open_stdout) :
|
105
|
-
open_path(@path = set_ext(param, @ext), 'w')
|
102
|
+
@no_sep, @io = true, (@stdout = stdout?(@ext)) ?
|
103
|
+
open_stdout : open_path(get_path(param, @ext), 'w')
|
106
104
|
|
107
105
|
@lir_rec_no, @lir_rec_buf = '', []
|
108
106
|
when :RECORD
|
@@ -117,7 +115,7 @@ class Lingo
|
|
117
115
|
@io.puts unless @lir || @no_puts
|
118
116
|
when :EOF
|
119
117
|
flush_lir_buffer if @lir
|
120
|
-
@io.close unless stdout
|
118
|
+
@io.close unless @stdout
|
121
119
|
end
|
122
120
|
end
|
123
121
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -24,8 +24,6 @@
|
|
24
24
|
###############################################################################
|
25
25
|
#++
|
26
26
|
|
27
|
-
require 'csv'
|
28
|
-
|
29
27
|
class Lingo
|
30
28
|
|
31
29
|
class Attendee
|
@@ -83,6 +81,8 @@ class Lingo
|
|
83
81
|
|
84
82
|
class VectorFilter < self
|
85
83
|
|
84
|
+
include TextUtils
|
85
|
+
|
86
86
|
DEFAULT_SRC_SEPARATOR = '|'
|
87
87
|
DEFAULT_POS_SEPARATOR = '@'
|
88
88
|
|
@@ -186,7 +186,7 @@ class Lingo
|
|
186
186
|
vec = vec.form if vec.is_a?(WordForm)
|
187
187
|
|
188
188
|
vec = Unicode.downcase(vec)
|
189
|
-
vec << @src << src if @src && src
|
189
|
+
vec << @src << src.form if @src && src
|
190
190
|
|
191
191
|
@sort_fmt ? vectors[vec] << pos : forward(vec_pos(vec, [pos]))
|
192
192
|
end
|
@@ -198,7 +198,7 @@ class Lingo
|
|
198
198
|
@vectors.each_value { |w| w.each_key { |v| df[v] += 1 } }
|
199
199
|
|
200
200
|
if @tfidf.is_a?(String)
|
201
|
-
|
201
|
+
open_csv(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
|
202
202
|
end
|
203
203
|
|
204
204
|
yield lambda { |docnum|
|