lingo 1.9.0.pre1 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +18 -7
- data/README +6 -8
- data/Rakefile +5 -5
- data/dict/en/lingo-dic.txt +52625 -15693
- data/lang/en.lang +2 -2
- data/lib/lingo.rb +15 -3
- data/lib/lingo/array_utils.rb +39 -0
- data/lib/lingo/attendee.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +4 -2
- data/lib/lingo/attendee/sequencer.rb +122 -73
- data/lib/lingo/attendee/text_writer.rb +4 -6
- data/lib/lingo/attendee/vector_filter.rb +5 -5
- data/lib/lingo/cli.rb +20 -2
- data/lib/lingo/config.rb +4 -3
- data/lib/lingo/ctl.rb +2 -20
- data/lib/lingo/ctl/analysis.rb +3 -5
- data/lib/lingo/ctl/files.rb +3 -3
- data/lib/lingo/database.rb +26 -25
- data/lib/lingo/database/crypter.rb +10 -6
- data/lib/lingo/database/source.rb +72 -25
- data/lib/lingo/database/source/key_value.rb +12 -8
- data/lib/lingo/database/source/multi_key.rb +11 -9
- data/lib/lingo/database/source/multi_value.rb +10 -8
- data/lib/lingo/database/source/single_word.rb +10 -6
- data/lib/lingo/database/source/word_class.rb +43 -14
- data/lib/lingo/debug.rb +2 -2
- data/lib/lingo/error.rb +21 -5
- data/lib/lingo/filter.rb +1 -1
- data/lib/lingo/language.rb +21 -21
- data/lib/lingo/language/grammar.rb +4 -2
- data/lib/lingo/language/lexical_hash.rb +2 -14
- data/lib/lingo/language/word.rb +1 -5
- data/lib/lingo/text_utils.rb +113 -20
- data/lib/lingo/version.rb +1 -1
- data/test/attendee/ts_sequencer.rb +286 -32
- data/test/attendee/ts_text_reader.rb +4 -4
- data/test/attendee/ts_text_writer.rb +19 -5
- data/test/test_helper.rb +2 -0
- data/test/ts_database.rb +213 -14
- metadata +36 -24
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -40,25 +40,27 @@ class Lingo
|
|
40
40
|
|
41
41
|
class MultiKey < self
|
42
42
|
|
43
|
-
DEFAULT_SEPARATOR = ';'
|
43
|
+
DEFAULT_SEPARATOR = ';'.freeze
|
44
44
|
|
45
|
-
def initialize(
|
45
|
+
def initialize(*)
|
46
46
|
super
|
47
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep
|
47
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
|
48
48
|
end
|
49
49
|
|
50
50
|
def set(db, key, val)
|
51
|
-
key
|
51
|
+
key = lexical(key, @def) if @def
|
52
52
|
val.each { |v| db[v] = [key] }
|
53
53
|
end
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
def convert_line(line, key, val)
|
58
|
-
values = line.split(@sep).each { |i| i.strip! }
|
55
|
+
def parse_line(line, key, val)
|
56
|
+
values = line.split(@sep).each(&:strip!)
|
59
57
|
[values.shift, values]
|
60
58
|
end
|
61
59
|
|
60
|
+
def dump_line(key, val, sep = @sep, *)
|
61
|
+
val.map(&:form).unshift(key).join(sep)
|
62
|
+
end
|
63
|
+
|
62
64
|
end
|
63
65
|
|
64
66
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,22 +38,24 @@ class Lingo
|
|
38
38
|
|
39
39
|
class MultiValue < self
|
40
40
|
|
41
|
-
DEFAULT_SEPARATOR = ';'
|
41
|
+
DEFAULT_SEPARATOR = ';'.freeze
|
42
42
|
|
43
|
-
def initialize(
|
43
|
+
def initialize(*)
|
44
44
|
super
|
45
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep
|
45
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
|
46
46
|
end
|
47
47
|
|
48
48
|
def set(db, key, val)
|
49
|
-
values = val.map { |v| @def ?
|
49
|
+
values = val.map { |v| @def ? lexical(v, @def) : v }
|
50
50
|
val.each { |v| db[v] = values }
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
def parse_line(line, key, val)
|
54
|
+
[nil, line.split(@sep).each(&:strip!)]
|
55
|
+
end
|
54
56
|
|
55
|
-
def
|
56
|
-
|
57
|
+
def dump_line(key, val, sep = @sep, *)
|
58
|
+
val.map(&:form).join(sep)
|
57
59
|
end
|
58
60
|
|
59
61
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,16 +38,20 @@ class Lingo
|
|
38
38
|
|
39
39
|
class SingleWord < self
|
40
40
|
|
41
|
-
|
42
|
-
|
41
|
+
DEFAULT_DEF_WC = Language::LA_NOUN
|
42
|
+
|
43
|
+
def initialize(*)
|
44
|
+
super
|
43
45
|
@pat = /^(#{@wrd})$/
|
44
46
|
@mul = @config.fetch('def-mul-wc', @def).downcase
|
45
47
|
end
|
46
48
|
|
47
|
-
|
49
|
+
def parse_line(line, key, val)
|
50
|
+
[k = key.strip, [lexical(k, k.include?(' ') ? @mul : @def)]]
|
51
|
+
end
|
48
52
|
|
49
|
-
def
|
50
|
-
|
53
|
+
def dump_line(key, val, *)
|
54
|
+
key
|
51
55
|
end
|
52
56
|
|
53
57
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,38 +38,67 @@ class Lingo
|
|
38
38
|
|
39
39
|
class WordClass < self
|
40
40
|
|
41
|
-
|
41
|
+
include ArrayUtils
|
42
42
|
|
43
|
-
|
43
|
+
DEFAULT_SEPARATOR = ','.freeze
|
44
44
|
|
45
|
-
|
45
|
+
GENDER_SEPARATOR = '.'.freeze
|
46
|
+
|
47
|
+
VALUE_SEPARATOR = '|'.freeze
|
48
|
+
|
49
|
+
WC_SEPARATOR = '#'.freeze
|
50
|
+
|
51
|
+
SCAN_RE = /(\S.*?)\s*#{WC_SEPARATOR}(\S+)/o
|
52
|
+
|
53
|
+
def initialize(*)
|
46
54
|
super
|
47
55
|
|
48
56
|
gen = Regexp.escape(GENDER_SEPARATOR)
|
49
|
-
|
57
|
+
val = Regexp.escape(VALUE_SEPARATOR)
|
58
|
+
sep = Regexp.escape(@sep)
|
50
59
|
|
51
|
-
w, a =
|
52
|
-
wc = "
|
60
|
+
w, a = "\\w%1$s(?:#{val}\\w%1$s)*", '[+]?'
|
61
|
+
wc = "#{WC_SEPARATOR}#{w % a}(?:#{gen}#{w % ''})?"
|
53
62
|
|
54
63
|
@pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
|
55
64
|
end
|
56
65
|
|
57
|
-
|
58
|
-
|
59
|
-
def convert_line(line, key, val)
|
66
|
+
def parse_line(line, key, val)
|
60
67
|
values = []
|
61
68
|
|
62
|
-
val.strip.scan(
|
63
|
-
v, f = v.split(
|
69
|
+
val.strip.scan(SCAN_RE) { |k, v|
|
70
|
+
v, f = v.split(GENDER_SEPARATOR)
|
71
|
+
f = f ? f.split(VALUE_SEPARATOR) : [nil]
|
64
72
|
|
65
|
-
v.split(
|
66
|
-
values <<
|
73
|
+
combinations(v.split(VALUE_SEPARATOR), f) { |w, g|
|
74
|
+
values << lexical(k, w, g)
|
67
75
|
}
|
68
76
|
}
|
69
77
|
|
70
78
|
[key.strip, values]
|
71
79
|
end
|
72
80
|
|
81
|
+
def dump_line(key, val, key_sep = nil, val_sep = nil, compact = true, *)
|
82
|
+
"#{key}#{key_sep || @sep}#{dump_values(val, compact).join(val_sep || ' ')}"
|
83
|
+
end
|
84
|
+
|
85
|
+
def dump_values(val, compact = true)
|
86
|
+
join = lambda { |v|
|
87
|
+
v.compact!; v.uniq!; v.sort!; v.join(VALUE_SEPARATOR) }
|
88
|
+
|
89
|
+
if compact
|
90
|
+
values = Hash.new { |h, k| h[k] = [[], []] }; val.each { |lex|
|
91
|
+
a, g = values[lex.form]; a << lex.attr; g << lex.gender }
|
92
|
+
else
|
93
|
+
values = val.map { |lex| [lex.form, [[lex.attr], [lex.gender]]] }
|
94
|
+
end
|
95
|
+
|
96
|
+
values.sort.map { |form, (attrs, genders)|
|
97
|
+
res = "#{form} #{WC_SEPARATOR}#{join[attrs]}"
|
98
|
+
genders.any? ? "#{res}#{GENDER_SEPARATOR}#{join[genders]}" : res
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
73
102
|
end
|
74
103
|
|
75
104
|
end
|
data/lib/lingo/debug.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -67,7 +67,7 @@ class Lingo
|
|
67
67
|
html: :GraphHtmlPrinter,
|
68
68
|
stack: :CallStackPrinter
|
69
69
|
}.each { |ext, name|
|
70
|
-
File.open("#{base}.#{ext}", 'a+', encoding:
|
70
|
+
File.open("#{base}.#{ext}", 'a+', encoding: ENCODING) { |f|
|
71
71
|
RubyProf.const_get(name).new(result).print(f)
|
72
72
|
}
|
73
73
|
}
|
data/lib/lingo/error.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,7 +38,9 @@ class Lingo
|
|
38
38
|
|
39
39
|
end
|
40
40
|
|
41
|
-
class
|
41
|
+
class StoreError < LingoError; end
|
42
|
+
|
43
|
+
class NoWritableStoreError < StoreError
|
42
44
|
|
43
45
|
attr_reader :file, :path
|
44
46
|
|
@@ -52,7 +54,9 @@ class Lingo
|
|
52
54
|
|
53
55
|
end
|
54
56
|
|
55
|
-
class
|
57
|
+
class BackendError < LingoError; end
|
58
|
+
|
59
|
+
class BackendNotFoundError < BackendError
|
56
60
|
|
57
61
|
attr_reader :file
|
58
62
|
|
@@ -66,7 +70,7 @@ class Lingo
|
|
66
70
|
|
67
71
|
end
|
68
72
|
|
69
|
-
class BackendNotAvailableError <
|
73
|
+
class BackendNotAvailableError < BackendError
|
70
74
|
|
71
75
|
attr_reader :name, :file, :err
|
72
76
|
|
@@ -143,7 +147,7 @@ class Lingo
|
|
143
147
|
|
144
148
|
end
|
145
149
|
|
146
|
-
class
|
150
|
+
class FileError < LingoError
|
147
151
|
|
148
152
|
attr_reader :name
|
149
153
|
|
@@ -151,6 +155,10 @@ class Lingo
|
|
151
155
|
@name = name
|
152
156
|
end
|
153
157
|
|
158
|
+
end
|
159
|
+
|
160
|
+
class FileNotFoundError < FileError
|
161
|
+
|
154
162
|
def to_s
|
155
163
|
"No such file `#{name}'."
|
156
164
|
end
|
@@ -172,6 +180,14 @@ class Lingo
|
|
172
180
|
|
173
181
|
end
|
174
182
|
|
183
|
+
class FileExistsError < FileError
|
184
|
+
|
185
|
+
def to_s
|
186
|
+
"File `#{name}' already exists."
|
187
|
+
end
|
188
|
+
|
189
|
+
end
|
190
|
+
|
175
191
|
class NameNotFoundError < LingoError
|
176
192
|
|
177
193
|
attr_reader :klass, :name
|
data/lib/lingo/filter.rb
CHANGED
data/lib/lingo/language.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,27 +37,27 @@ class Lingo
|
|
37
37
|
|
38
38
|
module Language
|
39
39
|
|
40
|
-
CHAR_PUNCT = '.'
|
40
|
+
CHAR_PUNCT = '.'.freeze
|
41
41
|
|
42
|
-
TA_ABBREVIATION = 'ABRV'
|
43
|
-
TA_HELP = 'HELP'
|
44
|
-
TA_HTML = 'HTML'
|
45
|
-
TA_NUMBER = 'NUMS'
|
46
|
-
TA_OTHER = 'OTHR'
|
47
|
-
TA_PUNCTUATION = 'PUNC'
|
48
|
-
TA_SKIP = 'SKIP'
|
49
|
-
TA_SPACE = 'SPAC'
|
50
|
-
TA_URL = 'URLS'
|
51
|
-
TA_WIKI = 'WIKI'
|
52
|
-
TA_WORD = 'WORD'
|
42
|
+
TA_ABBREVIATION = 'ABRV'.freeze
|
43
|
+
TA_HELP = 'HELP'.freeze
|
44
|
+
TA_HTML = 'HTML'.freeze
|
45
|
+
TA_NUMBER = 'NUMS'.freeze
|
46
|
+
TA_OTHER = 'OTHR'.freeze
|
47
|
+
TA_PUNCTUATION = 'PUNC'.freeze
|
48
|
+
TA_SKIP = 'SKIP'.freeze
|
49
|
+
TA_SPACE = 'SPAC'.freeze
|
50
|
+
TA_URL = 'URLS'.freeze
|
51
|
+
TA_WIKI = 'WIKI'.freeze
|
52
|
+
TA_WORD = 'WORD'.freeze
|
53
53
|
|
54
|
-
WA_UNSET = '-'
|
55
|
-
WA_IDENTIFIED = 'IDF'
|
56
|
-
WA_UNKNOWN = '?'
|
57
|
-
WA_COMPOUND = 'COM'
|
58
|
-
WA_MULTIWORD = 'MUL'
|
59
|
-
WA_SEQUENCE = 'SEQ'
|
60
|
-
WA_UNKMULPART = 'MU?'
|
54
|
+
WA_UNSET = '-'.freeze
|
55
|
+
WA_IDENTIFIED = 'IDF'.freeze
|
56
|
+
WA_UNKNOWN = '?'.freeze
|
57
|
+
WA_COMPOUND = 'COM'.freeze
|
58
|
+
WA_MULTIWORD = 'MUL'.freeze
|
59
|
+
WA_SEQUENCE = 'SEQ'.freeze
|
60
|
+
WA_UNKMULPART = 'MU?'.freeze
|
61
61
|
|
62
62
|
LA_SORTORDER = [
|
63
63
|
LA_SEQUENCE = 'q',
|
@@ -73,7 +73,7 @@ class Lingo
|
|
73
73
|
LA_SYNONYM = 'y',
|
74
74
|
LA_STEM = 'z',
|
75
75
|
LA_UNKNOWN = '?'
|
76
|
-
].each_with_index.inject({}) { |h, (i, j)| h[i] = j; h }
|
76
|
+
].each_with_index.inject({}) { |h, (i, j)| h[i.freeze] = j; h }
|
77
77
|
|
78
78
|
end
|
79
79
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,6 +37,8 @@ class Lingo
|
|
37
37
|
|
38
38
|
class Grammar
|
39
39
|
|
40
|
+
include ArrayUtils
|
41
|
+
|
40
42
|
HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
|
41
43
|
|
42
44
|
DEFAULTS = {
|
@@ -170,7 +172,7 @@ class Lingo
|
|
170
172
|
|
171
173
|
flex.concat(blex).delete_if { |lex| lex.attr == LA_COMPOUND }
|
172
174
|
|
173
|
-
[
|
175
|
+
[combinations(*forms).map { |front, back|
|
174
176
|
Lexical.new(front + infix + back, LA_COMPOUND)
|
175
177
|
}.concat(flex), sta, seq.join]
|
176
178
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,8 +36,6 @@ class Lingo
|
|
36
36
|
|
37
37
|
class LexicalHash
|
38
38
|
|
39
|
-
KEY_REF_RE = %r{\A#{Regexp.escape(Database::KEY_REF)}(\d+)\z}o
|
40
|
-
|
41
39
|
def self.open(*args)
|
42
40
|
yield lexical_hash = new(*args)
|
43
41
|
ensure
|
@@ -53,17 +51,7 @@ class Lingo
|
|
53
51
|
end
|
54
52
|
|
55
53
|
def [](key)
|
56
|
-
|
57
|
-
|
58
|
-
res = rec.map { |str|
|
59
|
-
str =~ KEY_REF_RE ? $1.to_i : begin
|
60
|
-
k, *w = str.split('#')
|
61
|
-
Lexical.new(k.strip, w)
|
62
|
-
end
|
63
|
-
}
|
64
|
-
|
65
|
-
res.uniq!
|
66
|
-
res
|
54
|
+
Database::Source.lexicals(@src[Unicode.downcase(key)])
|
67
55
|
end
|
68
56
|
|
69
57
|
end
|
data/lib/lingo/language/word.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -41,10 +41,6 @@ class Lingo
|
|
41
41
|
new(form, attr) << lex
|
42
42
|
end
|
43
43
|
|
44
|
-
def new_lexical(form, attr, lex_attr)
|
45
|
-
new_lexicals(form, attr, Lexical.new(form, lex_attr))
|
46
|
-
end
|
47
|
-
|
48
44
|
def new_compound_head(lex, attr = WA_UNSET)
|
49
45
|
form, head_lex = nil, []
|
50
46
|
|