lingo 1.9.0.pre1 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41):
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -68,9 +68,9 @@ language:
68
68
  # Suffixliste, Stand: *****ENGLISCH****
69
69
  # Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
70
70
  # Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
71
- - [s, 'es s ves/f ves/fe ies/y']
71
+ - [s, 'es es/is s ves/f ves/fe ies/y']
72
72
  - [a, 'er est r st ier/y iest/y ly al ally']
73
- - [v, 'd ed en es ing s ing/e']
73
+ - [v, 'd ed en es ies/y ing s ing/e']
74
74
  - [e, 's']
75
75
  - [f, '']
76
76
 
@@ -32,6 +32,7 @@ require 'nuggets/file/ext'
32
32
  require 'nuggets/hash/nest'
33
33
  require 'nuggets/hash/seen'
34
34
  require 'nuggets/env/user_home'
35
+ require 'nuggets/object/silence'
35
36
  require 'nuggets/string/camelscore'
36
37
 
37
38
  class Lingo
@@ -61,8 +62,12 @@ class Lingo
61
62
  }
62
63
 
63
64
  # Default encoding
64
- Encoding.default_external = ENC = 'UTF-8'.freeze
65
- Encoding.default_internal = ENC unless RUBY_ENGINE == 'jruby'
65
+ ENCODING = 'UTF-8'.freeze
66
+
67
+ silence {
68
+ Encoding.default_external = ENCODING
69
+ Encoding.default_internal = ENCODING unless RUBY_ENGINE == 'jruby'
70
+ }
66
71
 
67
72
  SEP_RE = %r{[; ,|]}
68
73
 
@@ -298,6 +303,11 @@ class Lingo
298
303
  } }
299
304
  end
300
305
 
306
+ def attendees(arg = Object)
307
+ @attendees.grep(arg.is_a?(Class) ? arg :
308
+ Attendee.const_get(arg.to_s.camelcase))
309
+ end
310
+
301
311
  def start
302
312
  @attendees.first.control(:TALK)
303
313
  end
@@ -319,9 +329,11 @@ require_relative 'lingo/error'
319
329
  require_relative 'lingo/debug'
320
330
  require_relative 'lingo/config'
321
331
  require_relative 'lingo/filter'
332
+ require_relative 'lingo/array_utils'
333
+ require_relative 'lingo/text_utils'
334
+ require_relative 'lingo/language'
322
335
  require_relative 'lingo/progress'
323
336
  require_relative 'lingo/database'
324
- require_relative 'lingo/language'
325
337
  require_relative 'lingo/attendee'
326
338
  require_relative 'lingo/version'
327
339
 
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ module ArrayUtils
30
+
31
+ private
32
+
33
+ def combinations(first, *rest, &block)
34
+ first.product(*rest, &block)
35
+ end
36
+
37
+ end
38
+
39
+ end
@@ -146,7 +146,7 @@ class Lingo
146
146
  end
147
147
  end
148
148
 
149
- def get_enc(key = 'encoding', default = ENC)
149
+ def get_enc(key = 'encoding', default = ENCODING)
150
150
  Encoding.find(get_key(key, default))
151
151
  rescue ArgumentError => err
152
152
  raise ConfigLoadError.new(err)
@@ -182,8 +182,6 @@ class Lingo
182
182
 
183
183
  end
184
184
 
185
- require_relative 'text_utils'
186
-
187
185
  require_relative 'buffered_attendee'
188
186
  require_relative 'deferred_attendee'
189
187
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -79,6 +79,8 @@ class Lingo
79
79
 
80
80
  class MultiWorder < BufferedAttendee
81
81
 
82
+ include ArrayUtils
83
+
82
84
  def init
83
85
  # combine lexical variants?
84
86
  #
@@ -220,7 +222,7 @@ class Lingo
220
222
  }
221
223
 
222
224
  if @combine
223
- seq.shift.product(*seq) { |key|
225
+ combinations(*seq) { |key|
224
226
  @mul_dic.select(key.join(sep), mul)
225
227
  break unless @all || mul.empty?
226
228
  } && mul.uniq!
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -97,23 +97,53 @@ class Lingo
97
97
 
98
98
  class Sequencer < BufferedAttendee
99
99
 
100
+ include ArrayUtils
101
+
100
102
  UNK = %w[#]
101
103
  NUM = %w[0]
102
104
 
103
- CLS = /[[:alpha:]#{NUM.join}]/o
105
+ CLASS_RE = %r{[a-z#{NUM.join}]}o
106
+
107
+ REGEX_RE = %r{
108
+ ( #{CLASS_RE}+ )
109
+ |
110
+ \[
111
+ ( #{CLASS_RE}+ )
112
+ \]
113
+ |
114
+ \(
115
+ (?: \?: )?
116
+ ( #{CLASS_RE}+ (?: \| #{CLASS_RE}+ )* )
117
+ \)
118
+ }xo
119
+
120
+ FULL_CLASS_RE = %r{\A(?:#{CLASS_RE})+\z}o
121
+ FULL_REGEX_RE = %r{\A(?:#{REGEX_RE})+\z}o
104
122
 
105
123
  def init
106
124
  @stopper = get_ary('stopper', DEFAULT_SKIP)
107
125
  .push(WA_UNKNOWN, WA_UNKMULPART)
108
126
 
109
127
  @mwc = get_key('multiword', LA_MULTIWORD)
110
- @cls = []
111
128
 
112
- @seq = get_key('sequences').map { |str, fmt|
113
- @cls.concat(cls = (str = str.downcase).scan(CLS))
129
+ @cls, @seq = [], []
130
+
131
+ get_key('sequences').each { |str, fmt|
132
+ seq, fmt = lambda { |*a| @seq << (a << fmt) },
133
+ fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil
134
+
135
+ @cls.concat(cls = (str = str.downcase).scan(CLASS_RE))
114
136
 
115
- (str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
116
- fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
137
+ case str
138
+ when FULL_CLASS_RE then seq[str, cls]
139
+ when FULL_REGEX_RE then m = []
140
+ str.scan(REGEX_RE) { |m1, m2, m3|
141
+ m1 ? m1.each_char { |c| m << [c] } : m << (
142
+ m2 ? m2.chars : m3.split('|').map(&:chars)) }
143
+
144
+ combinations(*m) { |q| seq[q.join, q.flatten] }
145
+ else seq[Regexp.new(str), nil]
146
+ end
117
147
  }
118
148
 
119
149
  @cls.uniq!
@@ -130,90 +160,109 @@ class Lingo
130
160
  end
131
161
 
132
162
  def process_buffer
133
- flush(@buffer.size < 2 ? @buffer : begin
134
- arg = [[], buf = [], map = [], @seq]
135
-
136
- iter, skip, rewind = @buffer.each_with_index, 0, lambda {
137
- iter.rewind; skip.times { iter.next }; skip = 0
138
- }
139
-
140
- loop {
141
- obj, idx = begin
142
- iter.next
143
- rescue StopIteration
144
- raise unless skip > 0
145
-
146
- buf.slice!(0, skip)
147
- map.slice!(0, skip)
148
-
149
- rewind.call
150
- end
151
-
152
- att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
153
- obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
154
-
155
- if (att &= @cls).empty?
156
- find_seq(*arg)
157
- rewind.call if skip > 0
158
- else
159
- obj.each_lex(@mwc) { |lex|
160
- lex.form.count(' ').succ.times { iter.next }
161
- break skip = idx + 1
162
- } unless tok
163
-
164
- buf << obj
165
- map << att
166
- end
167
- }
168
-
169
- @buffer.concat(find_seq(*arg))
170
- end)
163
+ process_seq if @buffer.size > 1
164
+ flush(@buffer)
171
165
  end
172
166
 
173
167
  private
174
168
 
175
- def find_seq(mat, buf, map, seq)
176
- return mat if buf.empty?
169
+ def process_seq
170
+ buf, map = [], []
177
171
 
178
- forms, args = [], []
172
+ iter, skip, rewind = @buffer.each_with_index, 0, lambda {
173
+ iter.rewind; skip.times { iter.next }; skip = 0
174
+ }
179
175
 
180
- map.replace(map.shift.product(*map)).map! { |i| i.join }.uniq!
176
+ loop {
177
+ obj, idx = begin
178
+ iter.next
179
+ rescue StopIteration
180
+ raise unless skip > 0
181
+
182
+ buf.slice!(0, skip)
183
+ map.slice!(0, skip)
184
+
185
+ rewind.call
186
+ end
187
+
188
+ att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
189
+ obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
190
+
191
+ if (att &= @cls).empty?
192
+ find_seq(buf, map)
193
+ rewind.call if skip > 0
194
+ else
195
+ obj.each_lex(@mwc) { |lex|
196
+ lex.form.count(' ').succ.times { iter.next }
197
+ break skip = idx + 1
198
+ } unless tok
199
+
200
+ buf << obj
201
+ map << att
202
+ end
203
+ }
181
204
 
182
- map.each { |q|
183
- seq.each { |str, cls, fmt|
184
- _str, _cls = [str, cls]
205
+ find_seq(buf, map)
206
+ end
185
207
 
186
- while pos = q.index(str, pos || 0)
187
- _str, _cls = [$&, $&.chars] unless cls
208
+ def find_seq(buf, map)
209
+ return if buf.empty?
188
210
 
189
- _tok = nil; args.clear
211
+ objs, args = [], []
190
212
 
191
- _cls.each_with_index { |wc, i|
192
- obj = buf[pos + i];_tok ||= obj.token
213
+ @seq.each { |str, cls, fmt|
214
+ if cls
215
+ len = cls.size
193
216
 
194
- args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
195
- break lex.form if lex.attr == wc } : obj.form or break
196
- } or next
217
+ buf.each_cons(len).zip(map.each_cons(len)) { |_buf, _map|
218
+ obj = _buf.each; objs.clear; args.clear
197
219
 
198
- forms << [_str, _tok,
199
- fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
200
- fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
220
+ next if _map.zip(cls) { |_wc, wc|
221
+ break true unless _wc.include?(wc) &&
222
+ find_form(obj.next, wc, objs, args)
223
+ }
201
224
 
202
- pos += 1
203
- end
204
- }
205
- }.clear
225
+ forward_seq(fmt, str, objs, args)
226
+ }
227
+ else
228
+ combinations(*map) { |q|
229
+ q, pos = q.join, -1
206
230
 
207
- forms.uniq!
231
+ while pos = q.index(str, pos += 1)
232
+ objs.clear; args.clear
208
233
 
209
- forms.each { |s, t, f|
210
- wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
211
- wrd.pattern, wrd.token = s, t
212
- mat << wrd
234
+ next unless $&.each_char.with_index { |wc, i|
235
+ find_form(buf[pos + i], wc, objs, args) or break
236
+ }
237
+
238
+ forward_seq(fmt, $&, objs, args)
239
+ end
240
+ }
241
+ end
213
242
  }
214
243
 
215
244
  buf.clear
216
- mat
245
+ map.clear
246
+ end
247
+
248
+ def find_form(obj, wc, objs, args)
249
+ form = obj.is_a?(Word) ? obj.lexicals.find { |lex|
250
+ break lex.form if lex.attr == wc } : obj.form or return
251
+
252
+ objs << obj
253
+ args << form
254
+ end
255
+
256
+ def forward_seq(fmt, str, objs, args)
257
+ wrd_form, form = objs.map(&:form).join(' '),
258
+ fmt =~ /\d/ ? fmt.gsub('%0$s', str) % args :
259
+ fmt ? "#{str}:#{args.join(fmt)}" : args.join(' ')
260
+
261
+ wrd = Word.new(wrd_form, WA_SEQUENCE)
262
+ wrd << Lexical.new(form, LA_SEQUENCE)
263
+ wrd.pattern, wrd.token = str, objs.first.token
264
+
265
+ @buffer << wrd
217
266
  end
218
267
 
219
268
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -99,10 +99,8 @@ class Lingo
99
99
  when :LIR
100
100
  @lir = true unless @lir.nil?
101
101
  when :FILE
102
- @no_sep = true
103
-
104
- @io = stdout?(@ext) ? (@path = @ext; open_stdout) :
105
- open_path(@path = set_ext(param, @ext), 'w')
102
+ @no_sep, @io = true, (@stdout = stdout?(@ext)) ?
103
+ open_stdout : open_path(get_path(param, @ext), 'w')
106
104
 
107
105
  @lir_rec_no, @lir_rec_buf = '', []
108
106
  when :RECORD
@@ -117,7 +115,7 @@ class Lingo
117
115
  @io.puts unless @lir || @no_puts
118
116
  when :EOF
119
117
  flush_lir_buffer if @lir
120
- @io.close unless stdout?(@path)
118
+ @io.close unless @stdout
121
119
  end
122
120
  end
123
121
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -24,8 +24,6 @@
24
24
  ###############################################################################
25
25
  #++
26
26
 
27
- require 'csv'
28
-
29
27
  class Lingo
30
28
 
31
29
  class Attendee
@@ -83,6 +81,8 @@ class Lingo
83
81
 
84
82
  class VectorFilter < self
85
83
 
84
+ include TextUtils
85
+
86
86
  DEFAULT_SRC_SEPARATOR = '|'
87
87
  DEFAULT_POS_SEPARATOR = '@'
88
88
 
@@ -186,7 +186,7 @@ class Lingo
186
186
  vec = vec.form if vec.is_a?(WordForm)
187
187
 
188
188
  vec = Unicode.downcase(vec)
189
- vec << @src << src if @src && src
189
+ vec << @src << src.form if @src && src
190
190
 
191
191
  @sort_fmt ? vectors[vec] << pos : forward(vec_pos(vec, [pos]))
192
192
  end
@@ -198,7 +198,7 @@ class Lingo
198
198
  @vectors.each_value { |w| w.each_key { |v| df[v] += 1 } }
199
199
 
200
200
  if @tfidf.is_a?(String)
201
- CSV.open(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
201
+ open_csv(@tfidf, 'wb') { |c| df.sort.each { |v| c << v } }
202
202
  end
203
203
 
204
204
  yield lambda { |docnum|