lingo 1.9.0.pre1 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -68,9 +68,9 @@ language:
68
68
  # Suffixliste, Stand: *****ENGLISCH****
69
69
  # Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
70
70
  # Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
71
- - [s, 'es s ves/f ves/fe ies/y']
71
+ - [s, 'es es/is s ves/f ves/fe ies/y']
72
72
  - [a, 'er est r st ier/y iest/y ly al ally']
73
- - [v, 'd ed en es ing s ing/e']
73
+ - [v, 'd ed en es ies/y ing s ing/e']
74
74
  - [e, 's']
75
75
  - [f, '']
76
76
 
@@ -32,6 +32,7 @@ require 'nuggets/file/ext'
32
32
  require 'nuggets/hash/nest'
33
33
  require 'nuggets/hash/seen'
34
34
  require 'nuggets/env/user_home'
35
+ require 'nuggets/object/silence'
35
36
  require 'nuggets/string/camelscore'
36
37
 
37
38
  class Lingo
@@ -61,8 +62,12 @@ class Lingo
61
62
  }
62
63
 
63
64
  # Default encoding
64
- Encoding.default_external = ENC = 'UTF-8'.freeze
65
- Encoding.default_internal = ENC unless RUBY_ENGINE == 'jruby'
65
+ ENCODING = 'UTF-8'.freeze
66
+
67
+ silence {
68
+ Encoding.default_external = ENCODING
69
+ Encoding.default_internal = ENCODING unless RUBY_ENGINE == 'jruby'
70
+ }
66
71
 
67
72
  SEP_RE = %r{[; ,|]}
68
73
 
@@ -298,6 +303,11 @@ class Lingo
298
303
  } }
299
304
  end
300
305
 
306
+ def attendees(arg = Object)
307
+ @attendees.grep(arg.is_a?(Class) ? arg :
308
+ Attendee.const_get(arg.to_s.camelcase))
309
+ end
310
+
301
311
  def start
302
312
  @attendees.first.control(:TALK)
303
313
  end
@@ -319,9 +329,11 @@ require_relative 'lingo/error'
319
329
  require_relative 'lingo/debug'
320
330
  require_relative 'lingo/config'
321
331
  require_relative 'lingo/filter'
332
+ require_relative 'lingo/array_utils'
333
+ require_relative 'lingo/text_utils'
334
+ require_relative 'lingo/language'
322
335
  require_relative 'lingo/progress'
323
336
  require_relative 'lingo/database'
324
- require_relative 'lingo/language'
325
337
  require_relative 'lingo/attendee'
326
338
  require_relative 'lingo/version'
327
339
 
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module ArrayUtils
30
+
31
+ private
32
+
33
+ def combinations(first, *rest, &block)
34
+ first.product(*rest, &block)
35
+ end
36
+
37
+ end
38
+
39
+ end
@@ -146,7 +146,7 @@ class Lingo
146
146
  end
147
147
  end
148
148
 
149
- def get_enc(key = 'encoding', default = ENC)
149
+ def get_enc(key = 'encoding', default = ENCODING)
150
150
  Encoding.find(get_key(key, default))
151
151
  rescue ArgumentError => err
152
152
  raise ConfigLoadError.new(err)
@@ -182,8 +182,6 @@ class Lingo
182
182
 
183
183
  end
184
184
 
185
- require_relative 'text_utils'
186
-
187
185
  require_relative 'buffered_attendee'
188
186
  require_relative 'deferred_attendee'
189
187
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -79,6 +79,8 @@ class Lingo
79
79
 
80
80
  class MultiWorder < BufferedAttendee
81
81
 
82
+ include ArrayUtils
83
+
82
84
  def init
83
85
  # combine lexical variants?
84
86
  #
@@ -220,7 +222,7 @@ class Lingo
220
222
  }
221
223
 
222
224
  if @combine
223
- seq.shift.product(*seq) { |key|
225
+ combinations(*seq) { |key|
224
226
  @mul_dic.select(key.join(sep), mul)
225
227
  break unless @all || mul.empty?
226
228
  } && mul.uniq!
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -97,23 +97,53 @@ class Lingo
97
97
 
98
98
  class Sequencer < BufferedAttendee
99
99
 
100
+ include ArrayUtils
101
+
100
102
  UNK = %w[#]
101
103
  NUM = %w[0]
102
104
 
103
- CLS = /[[:alpha:]#{NUM.join}]/o
105
+ CLASS_RE = %r{[a-z#{NUM.join}]}o
106
+
107
+ REGEX_RE = %r{
108
+ ( #{CLASS_RE}+ )
109
+ |
110
+ \[
111
+ ( #{CLASS_RE}+ )
112
+ \]
113
+ |
114
+ \(
115
+ (?: \?: )?
116
+ ( #{CLASS_RE}+ (?: \| #{CLASS_RE}+ )* )
117
+ \)
118
+ }xo
119
+
120
+ FULL_CLASS_RE = %r{\A(?:#{CLASS_RE})+\z}o
121
+ FULL_REGEX_RE = %r{\A(?:#{REGEX_RE})+\z}o
104
122
 
105
123
  def init
106
124
  @stopper = get_ary('stopper', DEFAULT_SKIP)
107
125
  .push(WA_UNKNOWN, WA_UNKMULPART)
108
126
 
109
127
  @mwc = get_key('multiword', LA_MULTIWORD)
110
- @cls = []
111
128
 
112
- @seq = get_key('sequences').map { |str, fmt|
113
- @cls.concat(cls = (str = str.downcase).scan(CLS))
129
+ @cls, @seq = [], []
130
+
131
+ get_key('sequences').each { |str, fmt|
132
+ seq, fmt = lambda { |*a| @seq << (a << fmt) },
133
+ fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil
134
+
135
+ @cls.concat(cls = (str = str.downcase).scan(CLASS_RE))
114
136
 
115
- (str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
116
- fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
137
+ case str
138
+ when FULL_CLASS_RE then seq[str, cls]
139
+ when FULL_REGEX_RE then m = []
140
+ str.scan(REGEX_RE) { |m1, m2, m3|
141
+ m1 ? m1.each_char { |c| m << [c] } : m << (
142
+ m2 ? m2.chars : m3.split('|').map(&:chars)) }
143
+
144
+ combinations(*m) { |q| seq[q.join, q.flatten] }
145
+ else seq[Regexp.new(str), nil]
146
+ end
117
147
  }
118
148
 
119
149
  @cls.uniq!
@@ -130,90 +160,109 @@ class Lingo
130
160
  end
131
161
 
132
162
  def process_buffer
133
- flush(@buffer.size < 2 ? @buffer : begin
134
- arg = [[], buf = [], map = [], @seq]
135
-
136
- iter, skip, rewind = @buffer.each_with_index, 0, lambda {
137
- iter.rewind; skip.times { iter.next }; skip = 0
138
- }
139
-
140
- loop {
141
- obj, idx = begin
142
- iter.next
143
- rescue StopIteration
144
- raise unless skip > 0
145
-
146
- buf.slice!(0, skip)
147
- map.slice!(0, skip)
148
-
149
- rewind.call
150
- end
151
-
152
- att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
153
- obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
154
-
155
- if (att &= @cls).empty?
156
- find_seq(*arg)
157
- rewind.call if skip > 0
158
- else
159
- obj.each_lex(@mwc) { |lex|
160
- lex.form.count(' ').succ.times { iter.next }
161
- break skip = idx + 1
162
- } unless tok
163
-
164
- buf << obj
165
- map << att
166
- end
167
- }
168
-
169
- @buffer.concat(find_seq(*arg))
170
- end)
163
+ process_seq if @buffer.size > 1
164
+ flush(@buffer)
171
165
  end
172
166
 
173
167
  private
174
168
 
175
- def find_seq(mat, buf, map, seq)
176
- return mat if buf.empty?
169
+ def process_seq
170
+ buf, map = [], []
177
171
 
178
- forms, args = [], []
172
+ iter, skip, rewind = @buffer.each_with_index, 0, lambda {
173
+ iter.rewind; skip.times { iter.next }; skip = 0
174
+ }
179
175
 
180
- map.replace(map.shift.product(*map)).map! { |i| i.join }.uniq!
176
+ loop {
177
+ obj, idx = begin
178
+ iter.next
179
+ rescue StopIteration
180
+ raise unless skip > 0
181
+
182
+ buf.slice!(0, skip)
183
+ map.slice!(0, skip)
184
+
185
+ rewind.call
186
+ end
187
+
188
+ att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
189
+ obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
190
+
191
+ if (att &= @cls).empty?
192
+ find_seq(buf, map)
193
+ rewind.call if skip > 0
194
+ else
195
+ obj.each_lex(@mwc) { |lex|
196
+ lex.form.count(' ').succ.times { iter.next }
197
+ break skip = idx + 1
198
+ } unless tok
199
+
200
+ buf << obj
201
+ map << att
202
+ end
203
+ }
181
204
 
182
- map.each { |q|
183
- seq.each { |str, cls, fmt|
184
- _str, _cls = [str, cls]
205
+ find_seq(buf, map)
206
+ end
185
207
 
186
- while pos = q.index(str, pos || 0)
187
- _str, _cls = [$&, $&.chars] unless cls
208
+ def find_seq(buf, map)
209
+ return if buf.empty?
188
210
 
189
- _tok = nil; args.clear
211
+ objs, args = [], []
190
212
 
191
- _cls.each_with_index { |wc, i|
192
- obj = buf[pos + i];_tok ||= obj.token
213
+ @seq.each { |str, cls, fmt|
214
+ if cls
215
+ len = cls.size
193
216
 
194
- args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
195
- break lex.form if lex.attr == wc } : obj.form or break
196
- } or next
217
+ buf.each_cons(len).zip(map.each_cons(len)) { |_buf, _map|
218
+ obj = _buf.each; objs.clear; args.clear
197
219
 
198
- forms << [_str, _tok,
199
- fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
200
- fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
220
+ next if _map.zip(cls) { |_wc, wc|
221
+ break true unless _wc.include?(wc) &&
222
+ find_form(obj.next, wc, objs, args)
223
+ }
201
224
 
202
- pos += 1
203
- end
204
- }
205
- }.clear
225
+ forward_seq(fmt, str, objs, args)
226
+ }
227
+ else
228
+ combinations(*map) { |q|
229
+ q, pos = q.join, -1
206
230
 
207
- forms.uniq!
231
+ while pos = q.index(str, pos += 1)
232
+ objs.clear; args.clear
208
233
 
209
- forms.each { |s, t, f|
210
- wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
211
- wrd.pattern, wrd.token = s, t
212
- mat << wrd
234
+ next unless $&.each_char.with_index { |wc, i|
235
+ find_form(buf[pos + i], wc, objs, args) or break
236
+ }
237
+
238
+ forward_seq(fmt, $&, objs, args)
239
+ end
240
+ }
241
+ end
213
242
  }
214
243
 
215
244
  buf.clear
216
- mat
245
+ map.clear
246
+ end
247
+
248
+ def find_form(obj, wc, objs, args)
249
+ form = obj.is_a?(Word) ? obj.lexicals.find { |lex|
250
+ break lex.form if lex.attr == wc } : obj.form or return
251
+
252
+ objs << obj
253
+ args << form
254
+ end
255
+
256
+ def forward_seq(fmt, str, objs, args)
257
+ wrd_form, form = objs.map(&:form).join(' '),
258
+ fmt =~ /\d/ ? fmt.gsub('%0$s', str) % args :
259
+ fmt ? "#{str}:#{args.join(fmt)}" : args.join(' ')
260
+
261
+ wrd = Word.new(wrd_form, WA_SEQUENCE)
262
+ wrd << Lexical.new(form, LA_SEQUENCE)
263
+ wrd.pattern, wrd.token = str, objs.first.token
264
+
265
+ @buffer << wrd
217
266
  end
218
267
 
219
268
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -99,10 +99,8 @@ class Lingo
99
99
  when :LIR
100
100
  @lir = true unless @lir.nil?
101
101
  when :FILE
102
- @no_sep = true
103
-
104
- @io = stdout?(@ext) ? (@path = @ext; open_stdout) :
105
- open_path(@path = set_ext(param, @ext), 'w')
102
+ @no_sep, @io = true, (@stdout = stdout?(@ext)) ?
103
+ open_stdout : open_path(get_path(param, @ext), 'w')
106
104
 
107
105
  @lir_rec_no, @lir_rec_buf = '', []
108
106
  when :RECORD
@@ -117,7 +115,7 @@ class Lingo
117
115
  @io.puts unless @lir || @no_puts
118
116
  when :EOF
119
117
  flush_lir_buffer if @lir
120
- @io.close unless stdout?(@path)
118
+ @io.close unless @stdout
121
119
  end
122
120
  end
123
121
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -24,8 +24,6 @@
24
24
  ###############################################################################
25
25
  #++
26
26
 
27
- require 'csv'
28
-
29
27
  class Lingo
30
28
 
31
29
  class Attendee
@@ -83,6 +81,8 @@ class Lingo
83
81
 
84
82
  class VectorFilter < self
85
83
 
84
+ include TextUtils
85
+
86
86
  DEFAULT_SRC_SEPARATOR = '|'
87
87
  DEFAULT_POS_SEPARATOR = '@'
88
88
 
@@ -186,7 +186,7 @@ class Lingo
186
186
  vec = vec.form if vec.is_a?(WordForm)
187
187
 
188
188
  vec = Unicode.downcase(vec)
189
- vec << @src << src if @src && src
189
+ vec << @src << src.form if @src && src
190
190
 
191
191
  @sort_fmt ? vectors[vec] << pos : forward(vec_pos(vec, [pos]))
192
192
  end
@@ -198,7 +198,7 @@ class Lingo
198
198
  @vectors.each_value { |w| w.each_key { |v| df[v] += 1 } }
199
199
 
200
200
  if @tfidf.is_a?(String)
201
- CSV.open(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
201
+ open_csv(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
202
202
  end
203
203
 
204
204
  yield lambda { |docnum|