lingo 1.9.0.pre1 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -40,25 +40,27 @@ class Lingo
40
40
 
41
41
  class MultiKey < self
42
42
 
43
- DEFAULT_SEPARATOR = ';'
43
+ DEFAULT_SEPARATOR = ';'.freeze
44
44
 
45
- def initialize(id, lingo)
45
+ def initialize(*)
46
46
  super
47
- @pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
47
+ @pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
48
48
  end
49
49
 
50
50
  def set(db, key, val)
51
- key += "##{@def}" if @def
51
+ key = lexical(key, @def) if @def
52
52
  val.each { |v| db[v] = [key] }
53
53
  end
54
54
 
55
- private
56
-
57
- def convert_line(line, key, val)
58
- values = line.split(@sep).each { |i| i.strip! }
55
+ def parse_line(line, key, val)
56
+ values = line.split(@sep).each(&:strip!)
59
57
  [values.shift, values]
60
58
  end
61
59
 
60
+ def dump_line(key, val, sep = @sep, *)
61
+ val.map(&:form).unshift(key).join(sep)
62
+ end
63
+
62
64
  end
63
65
 
64
66
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,22 +38,24 @@ class Lingo
38
38
 
39
39
  class MultiValue < self
40
40
 
41
- DEFAULT_SEPARATOR = ';'
41
+ DEFAULT_SEPARATOR = ';'.freeze
42
42
 
43
- def initialize(id, lingo)
43
+ def initialize(*)
44
44
  super
45
- @pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
45
+ @pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
46
46
  end
47
47
 
48
48
  def set(db, key, val)
49
- values = val.map { |v| @def ? "#{v}##{@def}" : v }
49
+ values = val.map { |v| @def ? lexical(v, @def) : v }
50
50
  val.each { |v| db[v] = values }
51
51
  end
52
52
 
53
- private
53
+ def parse_line(line, key, val)
54
+ [nil, line.split(@sep).each(&:strip!)]
55
+ end
54
56
 
55
- def convert_line(line, key, val)
56
- [nil, line.split(@sep).each { |i| i.strip! }]
57
+ def dump_line(key, val, sep = @sep, *)
58
+ val.map(&:form).join(sep)
57
59
  end
58
60
 
59
61
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,16 +38,20 @@ class Lingo
38
38
 
39
39
  class SingleWord < self
40
40
 
41
- def initialize(id, lingo)
42
- super(id, lingo, Language::LA_NOUN)
41
+ DEFAULT_DEF_WC = Language::LA_NOUN
42
+
43
+ def initialize(*)
44
+ super
43
45
  @pat = /^(#{@wrd})$/
44
46
  @mul = @config.fetch('def-mul-wc', @def).downcase
45
47
  end
46
48
 
47
- private
49
+ def parse_line(line, key, val)
50
+ [k = key.strip, [lexical(k, k.include?(' ') ? @mul : @def)]]
51
+ end
48
52
 
49
- def convert_line(line, key, val)
50
- [k = key.strip, %W[#{k}##{k.include?(' ') ? @mul : @def}]]
53
+ def dump_line(key, val, *)
54
+ key
51
55
  end
52
56
 
53
57
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,38 +38,67 @@ class Lingo
38
38
 
39
39
  class WordClass < self
40
40
 
41
- DEFAULT_SEPARATOR = ','
41
+ include ArrayUtils
42
42
 
43
- GENDER_SEPARATOR = '.'
43
+ DEFAULT_SEPARATOR = ','.freeze
44
44
 
45
- def initialize(id, lingo)
45
+ GENDER_SEPARATOR = '.'.freeze
46
+
47
+ VALUE_SEPARATOR = '|'.freeze
48
+
49
+ WC_SEPARATOR = '#'.freeze
50
+
51
+ SCAN_RE = /(\S.*?)\s*#{WC_SEPARATOR}(\S+)/o
52
+
53
+ def initialize(*)
46
54
  super
47
55
 
48
56
  gen = Regexp.escape(GENDER_SEPARATOR)
49
- sep = Regexp.escape(@sep ||= DEFAULT_SEPARATOR)
57
+ val = Regexp.escape(VALUE_SEPARATOR)
58
+ sep = Regexp.escape(@sep)
50
59
 
51
- w, a = '\w%1$s(?:\|\w%1$s)*', '[+]?'
52
- wc = "##{w % a}(?:#{gen}#{w % ''})?"
60
+ w, a = "\\w%1$s(?:#{val}\\w%1$s)*", '[+]?'
61
+ wc = "#{WC_SEPARATOR}#{w % a}(?:#{gen}#{w % ''})?"
53
62
 
54
63
  @pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
55
64
  end
56
65
 
57
- private
58
-
59
- def convert_line(line, key, val)
66
+ def parse_line(line, key, val)
60
67
  values = []
61
68
 
62
- val.strip.scan(/(\S.*?)\s*#(\S+)/) { |k, v|
63
- v, f = v.split('.')
69
+ val.strip.scan(SCAN_RE) { |k, v|
70
+ v, f = v.split(GENDER_SEPARATOR)
71
+ f = f ? f.split(VALUE_SEPARATOR) : [nil]
64
72
 
65
- v.split('|').product(f ? f.split('|') : [nil]) { |w, g|
66
- values << "#{k}##{w}##{g}"
73
+ combinations(v.split(VALUE_SEPARATOR), f) { |w, g|
74
+ values << lexical(k, w, g)
67
75
  }
68
76
  }
69
77
 
70
78
  [key.strip, values]
71
79
  end
72
80
 
81
+ def dump_line(key, val, key_sep = nil, val_sep = nil, compact = true, *)
82
+ "#{key}#{key_sep || @sep}#{dump_values(val, compact).join(val_sep || ' ')}"
83
+ end
84
+
85
+ def dump_values(val, compact = true)
86
+ join = lambda { |v|
87
+ v.compact!; v.uniq!; v.sort!; v.join(VALUE_SEPARATOR) }
88
+
89
+ if compact
90
+ values = Hash.new { |h, k| h[k] = [[], []] }; val.each { |lex|
91
+ a, g = values[lex.form]; a << lex.attr; g << lex.gender }
92
+ else
93
+ values = val.map { |lex| [lex.form, [[lex.attr], [lex.gender]]] }
94
+ end
95
+
96
+ values.sort.map { |form, (attrs, genders)|
97
+ res = "#{form} #{WC_SEPARATOR}#{join[attrs]}"
98
+ genders.any? ? "#{res}#{GENDER_SEPARATOR}#{join[genders]}" : res
99
+ }
100
+ end
101
+
73
102
  end
74
103
 
75
104
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -67,7 +67,7 @@ class Lingo
67
67
  html: :GraphHtmlPrinter,
68
68
  stack: :CallStackPrinter
69
69
  }.each { |ext, name|
70
- File.open("#{base}.#{ext}", 'a+', encoding: ENC) { |f|
70
+ File.open("#{base}.#{ext}", 'a+', encoding: ENCODING) { |f|
71
71
  RubyProf.const_get(name).new(result).print(f)
72
72
  }
73
73
  }
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,7 +38,9 @@ class Lingo
38
38
 
39
39
  end
40
40
 
41
- class NoWritableStoreError < LingoError
41
+ class StoreError < LingoError; end
42
+
43
+ class NoWritableStoreError < StoreError
42
44
 
43
45
  attr_reader :file, :path
44
46
 
@@ -52,7 +54,9 @@ class Lingo
52
54
 
53
55
  end
54
56
 
55
- class BackendNotFoundError < LingoError
57
+ class BackendError < LingoError; end
58
+
59
+ class BackendNotFoundError < BackendError
56
60
 
57
61
  attr_reader :file
58
62
 
@@ -66,7 +70,7 @@ class Lingo
66
70
 
67
71
  end
68
72
 
69
- class BackendNotAvailableError < LingoError
73
+ class BackendNotAvailableError < BackendError
70
74
 
71
75
  attr_reader :name, :file, :err
72
76
 
@@ -143,7 +147,7 @@ class Lingo
143
147
 
144
148
  end
145
149
 
146
- class FileNotFoundError < LingoError
150
+ class FileError < LingoError
147
151
 
148
152
  attr_reader :name
149
153
 
@@ -151,6 +155,10 @@ class Lingo
151
155
  @name = name
152
156
  end
153
157
 
158
+ end
159
+
160
+ class FileNotFoundError < FileError
161
+
154
162
  def to_s
155
163
  "No such file `#{name}'."
156
164
  end
@@ -172,6 +180,14 @@ class Lingo
172
180
 
173
181
  end
174
182
 
183
+ class FileExistsError < FileError
184
+
185
+ def to_s
186
+ "File `#{name}' already exists."
187
+ end
188
+
189
+ end
190
+
175
191
  class NameNotFoundError < LingoError
176
192
 
177
193
  attr_reader :klass, :name
@@ -28,7 +28,7 @@ class Lingo
28
28
 
29
29
  class Filter
30
30
 
31
- def initialize(io, encoding = ENC)
31
+ def initialize(io, encoding = ENCODING)
32
32
  @io, @encoding = io, encoding
33
33
  end
34
34
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,27 +37,27 @@ class Lingo
37
37
 
38
38
  module Language
39
39
 
40
- CHAR_PUNCT = '.'
40
+ CHAR_PUNCT = '.'.freeze
41
41
 
42
- TA_ABBREVIATION = 'ABRV'
43
- TA_HELP = 'HELP'
44
- TA_HTML = 'HTML'
45
- TA_NUMBER = 'NUMS'
46
- TA_OTHER = 'OTHR'
47
- TA_PUNCTUATION = 'PUNC'
48
- TA_SKIP = 'SKIP'
49
- TA_SPACE = 'SPAC'
50
- TA_URL = 'URLS'
51
- TA_WIKI = 'WIKI'
52
- TA_WORD = 'WORD'
42
+ TA_ABBREVIATION = 'ABRV'.freeze
43
+ TA_HELP = 'HELP'.freeze
44
+ TA_HTML = 'HTML'.freeze
45
+ TA_NUMBER = 'NUMS'.freeze
46
+ TA_OTHER = 'OTHR'.freeze
47
+ TA_PUNCTUATION = 'PUNC'.freeze
48
+ TA_SKIP = 'SKIP'.freeze
49
+ TA_SPACE = 'SPAC'.freeze
50
+ TA_URL = 'URLS'.freeze
51
+ TA_WIKI = 'WIKI'.freeze
52
+ TA_WORD = 'WORD'.freeze
53
53
 
54
- WA_UNSET = '-'
55
- WA_IDENTIFIED = 'IDF'
56
- WA_UNKNOWN = '?'
57
- WA_COMPOUND = 'COM'
58
- WA_MULTIWORD = 'MUL'
59
- WA_SEQUENCE = 'SEQ'
60
- WA_UNKMULPART = 'MU?'
54
+ WA_UNSET = '-'.freeze
55
+ WA_IDENTIFIED = 'IDF'.freeze
56
+ WA_UNKNOWN = '?'.freeze
57
+ WA_COMPOUND = 'COM'.freeze
58
+ WA_MULTIWORD = 'MUL'.freeze
59
+ WA_SEQUENCE = 'SEQ'.freeze
60
+ WA_UNKMULPART = 'MU?'.freeze
61
61
 
62
62
  LA_SORTORDER = [
63
63
  LA_SEQUENCE = 'q',
@@ -73,7 +73,7 @@ class Lingo
73
73
  LA_SYNONYM = 'y',
74
74
  LA_STEM = 'z',
75
75
  LA_UNKNOWN = '?'
76
- ].each_with_index.inject({}) { |h, (i, j)| h[i] = j; h }
76
+ ].each_with_index.inject({}) { |h, (i, j)| h[i.freeze] = j; h }
77
77
 
78
78
  end
79
79
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,6 +37,8 @@ class Lingo
37
37
 
38
38
  class Grammar
39
39
 
40
+ include ArrayUtils
41
+
40
42
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
41
43
 
42
44
  DEFAULTS = {
@@ -170,7 +172,7 @@ class Lingo
170
172
 
171
173
  flex.concat(blex).delete_if { |lex| lex.attr == LA_COMPOUND }
172
174
 
173
- [forms.shift.product(*forms).map { |front, back|
175
+ [combinations(*forms).map { |front, back|
174
176
  Lexical.new(front + infix + back, LA_COMPOUND)
175
177
  }.concat(flex), sta, seq.join]
176
178
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -36,8 +36,6 @@ class Lingo
36
36
 
37
37
  class LexicalHash
38
38
 
39
- KEY_REF_RE = %r{\A#{Regexp.escape(Database::KEY_REF)}(\d+)\z}o
40
-
41
39
  def self.open(*args)
42
40
  yield lexical_hash = new(*args)
43
41
  ensure
@@ -53,17 +51,7 @@ class Lingo
53
51
  end
54
52
 
55
53
  def [](key)
56
- rec = @src[Unicode.downcase(key)] or return
57
-
58
- res = rec.map { |str|
59
- str =~ KEY_REF_RE ? $1.to_i : begin
60
- k, *w = str.split('#')
61
- Lexical.new(k.strip, w)
62
- end
63
- }
64
-
65
- res.uniq!
66
- res
54
+ Database::Source.lexicals(@src[Unicode.downcase(key)])
67
55
  end
68
56
 
69
57
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -41,10 +41,6 @@ class Lingo
41
41
  new(form, attr) << lex
42
42
  end
43
43
 
44
- def new_lexical(form, attr, lex_attr)
45
- new_lexicals(form, attr, Lexical.new(form, lex_attr))
46
- end
47
-
48
44
  def new_compound_head(lex, attr = WA_UNSET)
49
45
  form, head_lex = nil, []
50
46