lingo 1.9.0.pre1 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -40,25 +40,27 @@ class Lingo
40
40
 
41
41
  class MultiKey < self
42
42
 
43
- DEFAULT_SEPARATOR = ';'
43
+ DEFAULT_SEPARATOR = ';'.freeze
44
44
 
45
- def initialize(id, lingo)
45
+ def initialize(*)
46
46
  super
47
- @pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
47
+ @pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
48
48
  end
49
49
 
50
50
  def set(db, key, val)
51
- key += "##{@def}" if @def
51
+ key = lexical(key, @def) if @def
52
52
  val.each { |v| db[v] = [key] }
53
53
  end
54
54
 
55
- private
56
-
57
- def convert_line(line, key, val)
58
- values = line.split(@sep).each { |i| i.strip! }
55
+ def parse_line(line, key, val)
56
+ values = line.split(@sep).each(&:strip!)
59
57
  [values.shift, values]
60
58
  end
61
59
 
60
+ def dump_line(key, val, sep = @sep, *)
61
+ val.map(&:form).unshift(key).join(sep)
62
+ end
63
+
62
64
  end
63
65
 
64
66
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,22 +38,24 @@ class Lingo
38
38
 
39
39
  class MultiValue < self
40
40
 
41
- DEFAULT_SEPARATOR = ';'
41
+ DEFAULT_SEPARATOR = ';'.freeze
42
42
 
43
- def initialize(id, lingo)
43
+ def initialize(*)
44
44
  super
45
- @pat = /^#{@wrd}(?:#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}#{@wrd})*$/
45
+ @pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
46
46
  end
47
47
 
48
48
  def set(db, key, val)
49
- values = val.map { |v| @def ? "#{v}##{@def}" : v }
49
+ values = val.map { |v| @def ? lexical(v, @def) : v }
50
50
  val.each { |v| db[v] = values }
51
51
  end
52
52
 
53
- private
53
+ def parse_line(line, key, val)
54
+ [nil, line.split(@sep).each(&:strip!)]
55
+ end
54
56
 
55
- def convert_line(line, key, val)
56
- [nil, line.split(@sep).each { |i| i.strip! }]
57
+ def dump_line(key, val, sep = @sep, *)
58
+ val.map(&:form).join(sep)
57
59
  end
58
60
 
59
61
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,16 +38,20 @@ class Lingo
38
38
 
39
39
  class SingleWord < self
40
40
 
41
- def initialize(id, lingo)
42
- super(id, lingo, Language::LA_NOUN)
41
+ DEFAULT_DEF_WC = Language::LA_NOUN
42
+
43
+ def initialize(*)
44
+ super
43
45
  @pat = /^(#{@wrd})$/
44
46
  @mul = @config.fetch('def-mul-wc', @def).downcase
45
47
  end
46
48
 
47
- private
49
+ def parse_line(line, key, val)
50
+ [k = key.strip, [lexical(k, k.include?(' ') ? @mul : @def)]]
51
+ end
48
52
 
49
- def convert_line(line, key, val)
50
- [k = key.strip, %W[#{k}##{k.include?(' ') ? @mul : @def}]]
53
+ def dump_line(key, val, *)
54
+ key
51
55
  end
52
56
 
53
57
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,38 +38,67 @@ class Lingo
38
38
 
39
39
  class WordClass < self
40
40
 
41
- DEFAULT_SEPARATOR = ','
41
+ include ArrayUtils
42
42
 
43
- GENDER_SEPARATOR = '.'
43
+ DEFAULT_SEPARATOR = ','.freeze
44
44
 
45
- def initialize(id, lingo)
45
+ GENDER_SEPARATOR = '.'.freeze
46
+
47
+ VALUE_SEPARATOR = '|'.freeze
48
+
49
+ WC_SEPARATOR = '#'.freeze
50
+
51
+ SCAN_RE = /(\S.*?)\s*#{WC_SEPARATOR}(\S+)/o
52
+
53
+ def initialize(*)
46
54
  super
47
55
 
48
56
  gen = Regexp.escape(GENDER_SEPARATOR)
49
- sep = Regexp.escape(@sep ||= DEFAULT_SEPARATOR)
57
+ val = Regexp.escape(VALUE_SEPARATOR)
58
+ sep = Regexp.escape(@sep)
50
59
 
51
- w, a = '\w%1$s(?:\|\w%1$s)*', '[+]?'
52
- wc = "##{w % a}(?:#{gen}#{w % ''})?"
60
+ w, a = "\\w%1$s(?:#{val}\\w%1$s)*", '[+]?'
61
+ wc = "#{WC_SEPARATOR}#{w % a}(?:#{gen}#{w % ''})?"
53
62
 
54
63
  @pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
55
64
  end
56
65
 
57
- private
58
-
59
- def convert_line(line, key, val)
66
+ def parse_line(line, key, val)
60
67
  values = []
61
68
 
62
- val.strip.scan(/(\S.*?)\s*#(\S+)/) { |k, v|
63
- v, f = v.split('.')
69
+ val.strip.scan(SCAN_RE) { |k, v|
70
+ v, f = v.split(GENDER_SEPARATOR)
71
+ f = f ? f.split(VALUE_SEPARATOR) : [nil]
64
72
 
65
- v.split('|').product(f ? f.split('|') : [nil]) { |w, g|
66
- values << "#{k}##{w}##{g}"
73
+ combinations(v.split(VALUE_SEPARATOR), f) { |w, g|
74
+ values << lexical(k, w, g)
67
75
  }
68
76
  }
69
77
 
70
78
  [key.strip, values]
71
79
  end
72
80
 
81
+ def dump_line(key, val, key_sep = nil, val_sep = nil, compact = true, *)
82
+ "#{key}#{key_sep || @sep}#{dump_values(val, compact).join(val_sep || ' ')}"
83
+ end
84
+
85
+ def dump_values(val, compact = true)
86
+ join = lambda { |v|
87
+ v.compact!; v.uniq!; v.sort!; v.join(VALUE_SEPARATOR) }
88
+
89
+ if compact
90
+ values = Hash.new { |h, k| h[k] = [[], []] }; val.each { |lex|
91
+ a, g = values[lex.form]; a << lex.attr; g << lex.gender }
92
+ else
93
+ values = val.map { |lex| [lex.form, [[lex.attr], [lex.gender]]] }
94
+ end
95
+
96
+ values.sort.map { |form, (attrs, genders)|
97
+ res = "#{form} #{WC_SEPARATOR}#{join[attrs]}"
98
+ genders.any? ? "#{res}#{GENDER_SEPARATOR}#{join[genders]}" : res
99
+ }
100
+ end
101
+
73
102
  end
74
103
 
75
104
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -67,7 +67,7 @@ class Lingo
67
67
  html: :GraphHtmlPrinter,
68
68
  stack: :CallStackPrinter
69
69
  }.each { |ext, name|
70
- File.open("#{base}.#{ext}", 'a+', encoding: ENC) { |f|
70
+ File.open("#{base}.#{ext}", 'a+', encoding: ENCODING) { |f|
71
71
  RubyProf.const_get(name).new(result).print(f)
72
72
  }
73
73
  }
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -38,7 +38,9 @@ class Lingo
38
38
 
39
39
  end
40
40
 
41
- class NoWritableStoreError < LingoError
41
+ class StoreError < LingoError; end
42
+
43
+ class NoWritableStoreError < StoreError
42
44
 
43
45
  attr_reader :file, :path
44
46
 
@@ -52,7 +54,9 @@ class Lingo
52
54
 
53
55
  end
54
56
 
55
- class BackendNotFoundError < LingoError
57
+ class BackendError < LingoError; end
58
+
59
+ class BackendNotFoundError < BackendError
56
60
 
57
61
  attr_reader :file
58
62
 
@@ -66,7 +70,7 @@ class Lingo
66
70
 
67
71
  end
68
72
 
69
- class BackendNotAvailableError < LingoError
73
+ class BackendNotAvailableError < BackendError
70
74
 
71
75
  attr_reader :name, :file, :err
72
76
 
@@ -143,7 +147,7 @@ class Lingo
143
147
 
144
148
  end
145
149
 
146
- class FileNotFoundError < LingoError
150
+ class FileError < LingoError
147
151
 
148
152
  attr_reader :name
149
153
 
@@ -151,6 +155,10 @@ class Lingo
151
155
  @name = name
152
156
  end
153
157
 
158
+ end
159
+
160
+ class FileNotFoundError < FileError
161
+
154
162
  def to_s
155
163
  "No such file `#{name}'."
156
164
  end
@@ -172,6 +180,14 @@ class Lingo
172
180
 
173
181
  end
174
182
 
183
+ class FileExistsError < FileError
184
+
185
+ def to_s
186
+ "File `#{name}' already exists."
187
+ end
188
+
189
+ end
190
+
175
191
  class NameNotFoundError < LingoError
176
192
 
177
193
  attr_reader :klass, :name
@@ -28,7 +28,7 @@ class Lingo
28
28
 
29
29
  class Filter
30
30
 
31
- def initialize(io, encoding = ENC)
31
+ def initialize(io, encoding = ENCODING)
32
32
  @io, @encoding = io, encoding
33
33
  end
34
34
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,27 +37,27 @@ class Lingo
37
37
 
38
38
  module Language
39
39
 
40
- CHAR_PUNCT = '.'
40
+ CHAR_PUNCT = '.'.freeze
41
41
 
42
- TA_ABBREVIATION = 'ABRV'
43
- TA_HELP = 'HELP'
44
- TA_HTML = 'HTML'
45
- TA_NUMBER = 'NUMS'
46
- TA_OTHER = 'OTHR'
47
- TA_PUNCTUATION = 'PUNC'
48
- TA_SKIP = 'SKIP'
49
- TA_SPACE = 'SPAC'
50
- TA_URL = 'URLS'
51
- TA_WIKI = 'WIKI'
52
- TA_WORD = 'WORD'
42
+ TA_ABBREVIATION = 'ABRV'.freeze
43
+ TA_HELP = 'HELP'.freeze
44
+ TA_HTML = 'HTML'.freeze
45
+ TA_NUMBER = 'NUMS'.freeze
46
+ TA_OTHER = 'OTHR'.freeze
47
+ TA_PUNCTUATION = 'PUNC'.freeze
48
+ TA_SKIP = 'SKIP'.freeze
49
+ TA_SPACE = 'SPAC'.freeze
50
+ TA_URL = 'URLS'.freeze
51
+ TA_WIKI = 'WIKI'.freeze
52
+ TA_WORD = 'WORD'.freeze
53
53
 
54
- WA_UNSET = '-'
55
- WA_IDENTIFIED = 'IDF'
56
- WA_UNKNOWN = '?'
57
- WA_COMPOUND = 'COM'
58
- WA_MULTIWORD = 'MUL'
59
- WA_SEQUENCE = 'SEQ'
60
- WA_UNKMULPART = 'MU?'
54
+ WA_UNSET = '-'.freeze
55
+ WA_IDENTIFIED = 'IDF'.freeze
56
+ WA_UNKNOWN = '?'.freeze
57
+ WA_COMPOUND = 'COM'.freeze
58
+ WA_MULTIWORD = 'MUL'.freeze
59
+ WA_SEQUENCE = 'SEQ'.freeze
60
+ WA_UNKMULPART = 'MU?'.freeze
61
61
 
62
62
  LA_SORTORDER = [
63
63
  LA_SEQUENCE = 'q',
@@ -73,7 +73,7 @@ class Lingo
73
73
  LA_SYNONYM = 'y',
74
74
  LA_STEM = 'z',
75
75
  LA_UNKNOWN = '?'
76
- ].each_with_index.inject({}) { |h, (i, j)| h[i] = j; h }
76
+ ].each_with_index.inject({}) { |h, (i, j)| h[i.freeze] = j; h }
77
77
 
78
78
  end
79
79
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,6 +37,8 @@ class Lingo
37
37
 
38
38
  class Grammar
39
39
 
40
+ include ArrayUtils
41
+
40
42
  HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
41
43
 
42
44
  DEFAULTS = {
@@ -170,7 +172,7 @@ class Lingo
170
172
 
171
173
  flex.concat(blex).delete_if { |lex| lex.attr == LA_COMPOUND }
172
174
 
173
- [forms.shift.product(*forms).map { |front, back|
175
+ [combinations(*forms).map { |front, back|
174
176
  Lexical.new(front + infix + back, LA_COMPOUND)
175
177
  }.concat(flex), sta, seq.join]
176
178
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -36,8 +36,6 @@ class Lingo
36
36
 
37
37
  class LexicalHash
38
38
 
39
- KEY_REF_RE = %r{\A#{Regexp.escape(Database::KEY_REF)}(\d+)\z}o
40
-
41
39
  def self.open(*args)
42
40
  yield lexical_hash = new(*args)
43
41
  ensure
@@ -53,17 +51,7 @@ class Lingo
53
51
  end
54
52
 
55
53
  def [](key)
56
- rec = @src[Unicode.downcase(key)] or return
57
-
58
- res = rec.map { |str|
59
- str =~ KEY_REF_RE ? $1.to_i : begin
60
- k, *w = str.split('#')
61
- Lexical.new(k.strip, w)
62
- end
63
- }
64
-
65
- res.uniq!
66
- res
54
+ Database::Source.lexicals(@src[Unicode.downcase(key)])
67
55
  end
68
56
 
69
57
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -41,10 +41,6 @@ class Lingo
41
41
  new(form, attr) << lex
42
42
  end
43
43
 
44
- def new_lexical(form, attr, lex_attr)
45
- new_lexicals(form, attr, Lexical.new(form, lex_attr))
46
- end
47
-
48
44
  def new_compound_head(lex, attr = WA_UNSET)
49
45
  form, head_lex = nil, []
50
46