lingo 1.9.0.pre1 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +18 -7
- data/README +6 -8
- data/Rakefile +5 -5
- data/dict/en/lingo-dic.txt +52625 -15693
- data/lang/en.lang +2 -2
- data/lib/lingo.rb +15 -3
- data/lib/lingo/array_utils.rb +39 -0
- data/lib/lingo/attendee.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +4 -2
- data/lib/lingo/attendee/sequencer.rb +122 -73
- data/lib/lingo/attendee/text_writer.rb +4 -6
- data/lib/lingo/attendee/vector_filter.rb +5 -5
- data/lib/lingo/cli.rb +20 -2
- data/lib/lingo/config.rb +4 -3
- data/lib/lingo/ctl.rb +2 -20
- data/lib/lingo/ctl/analysis.rb +3 -5
- data/lib/lingo/ctl/files.rb +3 -3
- data/lib/lingo/database.rb +26 -25
- data/lib/lingo/database/crypter.rb +10 -6
- data/lib/lingo/database/source.rb +72 -25
- data/lib/lingo/database/source/key_value.rb +12 -8
- data/lib/lingo/database/source/multi_key.rb +11 -9
- data/lib/lingo/database/source/multi_value.rb +10 -8
- data/lib/lingo/database/source/single_word.rb +10 -6
- data/lib/lingo/database/source/word_class.rb +43 -14
- data/lib/lingo/debug.rb +2 -2
- data/lib/lingo/error.rb +21 -5
- data/lib/lingo/filter.rb +1 -1
- data/lib/lingo/language.rb +21 -21
- data/lib/lingo/language/grammar.rb +4 -2
- data/lib/lingo/language/lexical_hash.rb +2 -14
- data/lib/lingo/language/word.rb +1 -5
- data/lib/lingo/text_utils.rb +113 -20
- data/lib/lingo/version.rb +1 -1
- data/test/attendee/ts_sequencer.rb +286 -32
- data/test/attendee/ts_text_reader.rb +4 -4
- data/test/attendee/ts_text_writer.rb +19 -5
- data/test/test_helper.rb +2 -0
- data/test/ts_database.rb +213 -14
- metadata +36 -24
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -40,25 +40,27 @@ class Lingo
|
|
40
40
|
|
41
41
|
class MultiKey < self
|
42
42
|
|
43
|
-
DEFAULT_SEPARATOR = ';'
|
43
|
+
DEFAULT_SEPARATOR = ';'.freeze
|
44
44
|
|
45
|
-
def initialize(
|
45
|
+
def initialize(*)
|
46
46
|
super
|
47
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep
|
47
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
|
48
48
|
end
|
49
49
|
|
50
50
|
def set(db, key, val)
|
51
|
-
key
|
51
|
+
key = lexical(key, @def) if @def
|
52
52
|
val.each { |v| db[v] = [key] }
|
53
53
|
end
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
def convert_line(line, key, val)
|
58
|
-
values = line.split(@sep).each { |i| i.strip! }
|
55
|
+
def parse_line(line, key, val)
|
56
|
+
values = line.split(@sep).each(&:strip!)
|
59
57
|
[values.shift, values]
|
60
58
|
end
|
61
59
|
|
60
|
+
def dump_line(key, val, sep = @sep, *)
|
61
|
+
val.map(&:form).unshift(key).join(sep)
|
62
|
+
end
|
63
|
+
|
62
64
|
end
|
63
65
|
|
64
66
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,22 +38,24 @@ class Lingo
|
|
38
38
|
|
39
39
|
class MultiValue < self
|
40
40
|
|
41
|
-
DEFAULT_SEPARATOR = ';'
|
41
|
+
DEFAULT_SEPARATOR = ';'.freeze
|
42
42
|
|
43
|
-
def initialize(
|
43
|
+
def initialize(*)
|
44
44
|
super
|
45
|
-
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep
|
45
|
+
@pat = /^#{@wrd}(?:#{Regexp.escape(@sep)}#{@wrd})*$/
|
46
46
|
end
|
47
47
|
|
48
48
|
def set(db, key, val)
|
49
|
-
values = val.map { |v| @def ?
|
49
|
+
values = val.map { |v| @def ? lexical(v, @def) : v }
|
50
50
|
val.each { |v| db[v] = values }
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
def parse_line(line, key, val)
|
54
|
+
[nil, line.split(@sep).each(&:strip!)]
|
55
|
+
end
|
54
56
|
|
55
|
-
def
|
56
|
-
|
57
|
+
def dump_line(key, val, sep = @sep, *)
|
58
|
+
val.map(&:form).join(sep)
|
57
59
|
end
|
58
60
|
|
59
61
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,16 +38,20 @@ class Lingo
|
|
38
38
|
|
39
39
|
class SingleWord < self
|
40
40
|
|
41
|
-
|
42
|
-
|
41
|
+
DEFAULT_DEF_WC = Language::LA_NOUN
|
42
|
+
|
43
|
+
def initialize(*)
|
44
|
+
super
|
43
45
|
@pat = /^(#{@wrd})$/
|
44
46
|
@mul = @config.fetch('def-mul-wc', @def).downcase
|
45
47
|
end
|
46
48
|
|
47
|
-
|
49
|
+
def parse_line(line, key, val)
|
50
|
+
[k = key.strip, [lexical(k, k.include?(' ') ? @mul : @def)]]
|
51
|
+
end
|
48
52
|
|
49
|
-
def
|
50
|
-
|
53
|
+
def dump_line(key, val, *)
|
54
|
+
key
|
51
55
|
end
|
52
56
|
|
53
57
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,38 +38,67 @@ class Lingo
|
|
38
38
|
|
39
39
|
class WordClass < self
|
40
40
|
|
41
|
-
|
41
|
+
include ArrayUtils
|
42
42
|
|
43
|
-
|
43
|
+
DEFAULT_SEPARATOR = ','.freeze
|
44
44
|
|
45
|
-
|
45
|
+
GENDER_SEPARATOR = '.'.freeze
|
46
|
+
|
47
|
+
VALUE_SEPARATOR = '|'.freeze
|
48
|
+
|
49
|
+
WC_SEPARATOR = '#'.freeze
|
50
|
+
|
51
|
+
SCAN_RE = /(\S.*?)\s*#{WC_SEPARATOR}(\S+)/o
|
52
|
+
|
53
|
+
def initialize(*)
|
46
54
|
super
|
47
55
|
|
48
56
|
gen = Regexp.escape(GENDER_SEPARATOR)
|
49
|
-
|
57
|
+
val = Regexp.escape(VALUE_SEPARATOR)
|
58
|
+
sep = Regexp.escape(@sep)
|
50
59
|
|
51
|
-
w, a =
|
52
|
-
wc = "
|
60
|
+
w, a = "\\w%1$s(?:#{val}\\w%1$s)*", '[+]?'
|
61
|
+
wc = "#{WC_SEPARATOR}#{w % a}(?:#{gen}#{w % ''})?"
|
53
62
|
|
54
63
|
@pat = /^(#{@wrd})#{sep}((?:#{@wrd}#{wc})+)$/
|
55
64
|
end
|
56
65
|
|
57
|
-
|
58
|
-
|
59
|
-
def convert_line(line, key, val)
|
66
|
+
def parse_line(line, key, val)
|
60
67
|
values = []
|
61
68
|
|
62
|
-
val.strip.scan(
|
63
|
-
v, f = v.split(
|
69
|
+
val.strip.scan(SCAN_RE) { |k, v|
|
70
|
+
v, f = v.split(GENDER_SEPARATOR)
|
71
|
+
f = f ? f.split(VALUE_SEPARATOR) : [nil]
|
64
72
|
|
65
|
-
v.split(
|
66
|
-
values <<
|
73
|
+
combinations(v.split(VALUE_SEPARATOR), f) { |w, g|
|
74
|
+
values << lexical(k, w, g)
|
67
75
|
}
|
68
76
|
}
|
69
77
|
|
70
78
|
[key.strip, values]
|
71
79
|
end
|
72
80
|
|
81
|
+
def dump_line(key, val, key_sep = nil, val_sep = nil, compact = true, *)
|
82
|
+
"#{key}#{key_sep || @sep}#{dump_values(val, compact).join(val_sep || ' ')}"
|
83
|
+
end
|
84
|
+
|
85
|
+
def dump_values(val, compact = true)
|
86
|
+
join = lambda { |v|
|
87
|
+
v.compact!; v.uniq!; v.sort!; v.join(VALUE_SEPARATOR) }
|
88
|
+
|
89
|
+
if compact
|
90
|
+
values = Hash.new { |h, k| h[k] = [[], []] }; val.each { |lex|
|
91
|
+
a, g = values[lex.form]; a << lex.attr; g << lex.gender }
|
92
|
+
else
|
93
|
+
values = val.map { |lex| [lex.form, [[lex.attr], [lex.gender]]] }
|
94
|
+
end
|
95
|
+
|
96
|
+
values.sort.map { |form, (attrs, genders)|
|
97
|
+
res = "#{form} #{WC_SEPARATOR}#{join[attrs]}"
|
98
|
+
genders.any? ? "#{res}#{GENDER_SEPARATOR}#{join[genders]}" : res
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
73
102
|
end
|
74
103
|
|
75
104
|
end
|
data/lib/lingo/debug.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -67,7 +67,7 @@ class Lingo
|
|
67
67
|
html: :GraphHtmlPrinter,
|
68
68
|
stack: :CallStackPrinter
|
69
69
|
}.each { |ext, name|
|
70
|
-
File.open("#{base}.#{ext}", 'a+', encoding:
|
70
|
+
File.open("#{base}.#{ext}", 'a+', encoding: ENCODING) { |f|
|
71
71
|
RubyProf.const_get(name).new(result).print(f)
|
72
72
|
}
|
73
73
|
}
|
data/lib/lingo/error.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -38,7 +38,9 @@ class Lingo
|
|
38
38
|
|
39
39
|
end
|
40
40
|
|
41
|
-
class
|
41
|
+
class StoreError < LingoError; end
|
42
|
+
|
43
|
+
class NoWritableStoreError < StoreError
|
42
44
|
|
43
45
|
attr_reader :file, :path
|
44
46
|
|
@@ -52,7 +54,9 @@ class Lingo
|
|
52
54
|
|
53
55
|
end
|
54
56
|
|
55
|
-
class
|
57
|
+
class BackendError < LingoError; end
|
58
|
+
|
59
|
+
class BackendNotFoundError < BackendError
|
56
60
|
|
57
61
|
attr_reader :file
|
58
62
|
|
@@ -66,7 +70,7 @@ class Lingo
|
|
66
70
|
|
67
71
|
end
|
68
72
|
|
69
|
-
class BackendNotAvailableError <
|
73
|
+
class BackendNotAvailableError < BackendError
|
70
74
|
|
71
75
|
attr_reader :name, :file, :err
|
72
76
|
|
@@ -143,7 +147,7 @@ class Lingo
|
|
143
147
|
|
144
148
|
end
|
145
149
|
|
146
|
-
class
|
150
|
+
class FileError < LingoError
|
147
151
|
|
148
152
|
attr_reader :name
|
149
153
|
|
@@ -151,6 +155,10 @@ class Lingo
|
|
151
155
|
@name = name
|
152
156
|
end
|
153
157
|
|
158
|
+
end
|
159
|
+
|
160
|
+
class FileNotFoundError < FileError
|
161
|
+
|
154
162
|
def to_s
|
155
163
|
"No such file `#{name}'."
|
156
164
|
end
|
@@ -172,6 +180,14 @@ class Lingo
|
|
172
180
|
|
173
181
|
end
|
174
182
|
|
183
|
+
class FileExistsError < FileError
|
184
|
+
|
185
|
+
def to_s
|
186
|
+
"File `#{name}' already exists."
|
187
|
+
end
|
188
|
+
|
189
|
+
end
|
190
|
+
|
175
191
|
class NameNotFoundError < LingoError
|
176
192
|
|
177
193
|
attr_reader :klass, :name
|
data/lib/lingo/filter.rb
CHANGED
data/lib/lingo/language.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,27 +37,27 @@ class Lingo
|
|
37
37
|
|
38
38
|
module Language
|
39
39
|
|
40
|
-
CHAR_PUNCT = '.'
|
40
|
+
CHAR_PUNCT = '.'.freeze
|
41
41
|
|
42
|
-
TA_ABBREVIATION = 'ABRV'
|
43
|
-
TA_HELP = 'HELP'
|
44
|
-
TA_HTML = 'HTML'
|
45
|
-
TA_NUMBER = 'NUMS'
|
46
|
-
TA_OTHER = 'OTHR'
|
47
|
-
TA_PUNCTUATION = 'PUNC'
|
48
|
-
TA_SKIP = 'SKIP'
|
49
|
-
TA_SPACE = 'SPAC'
|
50
|
-
TA_URL = 'URLS'
|
51
|
-
TA_WIKI = 'WIKI'
|
52
|
-
TA_WORD = 'WORD'
|
42
|
+
TA_ABBREVIATION = 'ABRV'.freeze
|
43
|
+
TA_HELP = 'HELP'.freeze
|
44
|
+
TA_HTML = 'HTML'.freeze
|
45
|
+
TA_NUMBER = 'NUMS'.freeze
|
46
|
+
TA_OTHER = 'OTHR'.freeze
|
47
|
+
TA_PUNCTUATION = 'PUNC'.freeze
|
48
|
+
TA_SKIP = 'SKIP'.freeze
|
49
|
+
TA_SPACE = 'SPAC'.freeze
|
50
|
+
TA_URL = 'URLS'.freeze
|
51
|
+
TA_WIKI = 'WIKI'.freeze
|
52
|
+
TA_WORD = 'WORD'.freeze
|
53
53
|
|
54
|
-
WA_UNSET = '-'
|
55
|
-
WA_IDENTIFIED = 'IDF'
|
56
|
-
WA_UNKNOWN = '?'
|
57
|
-
WA_COMPOUND = 'COM'
|
58
|
-
WA_MULTIWORD = 'MUL'
|
59
|
-
WA_SEQUENCE = 'SEQ'
|
60
|
-
WA_UNKMULPART = 'MU?'
|
54
|
+
WA_UNSET = '-'.freeze
|
55
|
+
WA_IDENTIFIED = 'IDF'.freeze
|
56
|
+
WA_UNKNOWN = '?'.freeze
|
57
|
+
WA_COMPOUND = 'COM'.freeze
|
58
|
+
WA_MULTIWORD = 'MUL'.freeze
|
59
|
+
WA_SEQUENCE = 'SEQ'.freeze
|
60
|
+
WA_UNKMULPART = 'MU?'.freeze
|
61
61
|
|
62
62
|
LA_SORTORDER = [
|
63
63
|
LA_SEQUENCE = 'q',
|
@@ -73,7 +73,7 @@ class Lingo
|
|
73
73
|
LA_SYNONYM = 'y',
|
74
74
|
LA_STEM = 'z',
|
75
75
|
LA_UNKNOWN = '?'
|
76
|
-
].each_with_index.inject({}) { |h, (i, j)| h[i] = j; h }
|
76
|
+
].each_with_index.inject({}) { |h, (i, j)| h[i.freeze] = j; h }
|
77
77
|
|
78
78
|
end
|
79
79
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -37,6 +37,8 @@ class Lingo
|
|
37
37
|
|
38
38
|
class Grammar
|
39
39
|
|
40
|
+
include ArrayUtils
|
41
|
+
|
40
42
|
HYPHEN_RE = %r{\A(.+)-([^-]+)\z}
|
41
43
|
|
42
44
|
DEFAULTS = {
|
@@ -170,7 +172,7 @@ class Lingo
|
|
170
172
|
|
171
173
|
flex.concat(blex).delete_if { |lex| lex.attr == LA_COMPOUND }
|
172
174
|
|
173
|
-
[
|
175
|
+
[combinations(*forms).map { |front, back|
|
174
176
|
Lexical.new(front + infix + back, LA_COMPOUND)
|
175
177
|
}.concat(flex), sta, seq.join]
|
176
178
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -36,8 +36,6 @@ class Lingo
|
|
36
36
|
|
37
37
|
class LexicalHash
|
38
38
|
|
39
|
-
KEY_REF_RE = %r{\A#{Regexp.escape(Database::KEY_REF)}(\d+)\z}o
|
40
|
-
|
41
39
|
def self.open(*args)
|
42
40
|
yield lexical_hash = new(*args)
|
43
41
|
ensure
|
@@ -53,17 +51,7 @@ class Lingo
|
|
53
51
|
end
|
54
52
|
|
55
53
|
def [](key)
|
56
|
-
|
57
|
-
|
58
|
-
res = rec.map { |str|
|
59
|
-
str =~ KEY_REF_RE ? $1.to_i : begin
|
60
|
-
k, *w = str.split('#')
|
61
|
-
Lexical.new(k.strip, w)
|
62
|
-
end
|
63
|
-
}
|
64
|
-
|
65
|
-
res.uniq!
|
66
|
-
res
|
54
|
+
Database::Source.lexicals(@src[Unicode.downcase(key)])
|
67
55
|
end
|
68
56
|
|
69
57
|
end
|
data/lib/lingo/language/word.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -41,10 +41,6 @@ class Lingo
|
|
41
41
|
new(form, attr) << lex
|
42
42
|
end
|
43
43
|
|
44
|
-
def new_lexical(form, attr, lex_attr)
|
45
|
-
new_lexicals(form, attr, Lexical.new(form, lex_attr))
|
46
|
-
end
|
47
|
-
|
48
44
|
def new_compound_head(lex, attr = WA_UNSET)
|
49
45
|
form, head_lex = nil, []
|
50
46
|
|