lingo 1.8.6 → 1.8.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
data/dict/en/lingo-dic.txt
CHANGED
@@ -19913,7 +19913,7 @@ fodder=fodder #s|v
|
|
19913
19913
|
foe=foe #s
|
19914
19914
|
foederatus=foederatus #s
|
19915
19915
|
foetal=foetal #a
|
19916
|
-
|
19916
|
+
foetid=foetid #a
|
19917
19917
|
foetidness=foetidness #s
|
19918
19918
|
foetus=foetus #s
|
19919
19919
|
fog=fog #s|v
|
@@ -53175,7 +53175,6 @@ vedette=vedette #s
|
|
53175
53175
|
veejay=veejay #s
|
53176
53176
|
veel=veel #v
|
53177
53177
|
veer=veer #s|v
|
53178
|
-
veg*n=veg*n #s|a
|
53179
53178
|
vega=vega #s
|
53180
53179
|
vegan=vegan #s|a
|
53181
53180
|
veganism=veganism #s
|
@@ -55392,7 +55391,7 @@ zony=zony #s
|
|
55392
55391
|
zoo=zoo #s
|
55393
55392
|
zooarchaeology=zooarchaeology #s
|
55394
55393
|
zoobie=zoobie #s
|
55395
|
-
|
55394
|
+
zooecium=zooecium #s
|
55396
55395
|
zoogeography=zoogeography #s
|
55397
55396
|
zoolater=zoolater #s
|
55398
55397
|
zoological=zoological #a
|
data/lang/de.lang
CHANGED
@@ -56,12 +56,13 @@ language:
|
|
56
56
|
usr-dic: { name: de/user-dic.txt, txt-format: WordClass, separator: '=' }
|
57
57
|
|
58
58
|
# Test dictionaries
|
59
|
-
tst-dic: { name: de/test_dic.txt, txt-format: WordClass }
|
60
|
-
tst-syn: { name: de/test_syn.txt, txt-format: MultiValue, def-wc: y }
|
61
|
-
tst-mul: { name: de/test_mul.txt, use-lex: sys-dic, def-wc: m }
|
62
|
-
tst-mu2: { name: de/test_mu2.txt, use-lex: sys-dic, def-wc: m }
|
63
|
-
tst-sgw: { name: de/test_sgw.txt, txt-format: SingleWord }
|
64
|
-
tst-gen: { name: de/test_gen.txt, txt-format: WordClass }
|
59
|
+
tst-dic: { name: de/test_dic.txt, txt-format: WordClass }
|
60
|
+
tst-syn: { name: de/test_syn.txt, txt-format: MultiValue, def-wc: y }
|
61
|
+
tst-mul: { name: de/test_mul.txt, use-lex: sys-dic, def-wc: m }
|
62
|
+
tst-mu2: { name: de/test_mu2.txt, use-lex: sys-dic, def-wc: m }
|
63
|
+
tst-muh: { name: de/test_muh.txt, txt-format: SingleWord, use-lex: sys-dic, def-wc: m, hyphenate: true }
|
64
|
+
tst-sgw: { name: de/test_sgw.txt, txt-format: SingleWord }
|
65
|
+
tst-gen: { name: de/test_gen.txt, txt-format: WordClass }
|
65
66
|
|
66
67
|
compound:
|
67
68
|
min-word-size: '7'
|
@@ -118,8 +119,8 @@ language:
|
|
118
119
|
# SPAC = \s+
|
119
120
|
# NUMS = [+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)
|
120
121
|
# URLS = (?:www\.|mailto:|(?:news|https?|ftps?)://|\S+?[._]\S+?@\S+?\.)\S+
|
121
|
-
# ABRV = (?:(?:(
|
122
|
-
# WORD = (
|
122
|
+
# ABRV = (?:(?:(?:CHAR)+\.)+)(?:CHAR)+
|
123
|
+
# WORD = ALNUM(?:-*ALNUM)*
|
123
124
|
# PUNC = [!,.:;?¡¿]
|
124
|
-
# OTHR = ["$#%&'()*+\-/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]
|
125
|
+
# OTHR = [-"$#%&'()*+\-/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]
|
125
126
|
# HELP = \S*
|
data/lang/en.lang
CHANGED
@@ -69,7 +69,7 @@ language:
|
|
69
69
|
# Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
|
70
70
|
# Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
|
71
71
|
- [s, 'es s ves/f ves/fe ies/y']
|
72
|
-
- [a, 'er est r st ier/y iest/y']
|
72
|
+
- [a, 'er est r st ier/y iest/y ly al ally']
|
73
73
|
- [v, 'd ed en es ing s ing/e']
|
74
74
|
- [e, 's']
|
75
75
|
- [f, '']
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -61,7 +61,8 @@ class Lingo
|
|
61
61
|
}
|
62
62
|
|
63
63
|
# Default encoding
|
64
|
-
Encoding.default_external =
|
64
|
+
Encoding.default_external = ENC = 'UTF-8'.freeze
|
65
|
+
Encoding.default_internal = ENC unless RUBY_ENGINE == 'jruby'
|
65
66
|
|
66
67
|
SEP_RE = %r{[; ,|]}
|
67
68
|
|
@@ -265,8 +266,7 @@ class Lingo
|
|
265
266
|
end
|
266
267
|
|
267
268
|
def invite(list = config['meeting/attendees'])
|
268
|
-
supplier = Hash.nest { [] }
|
269
|
-
subscriber = Hash.nest { [] }
|
269
|
+
supplier, subscriber = Hash.array, Hash.array
|
270
270
|
|
271
271
|
last_link, auto_link = '', 0
|
272
272
|
|
data/lib/lingo/attendee.rb
CHANGED
@@ -70,6 +70,8 @@ class Lingo
|
|
70
70
|
|
71
71
|
include Language
|
72
72
|
|
73
|
+
TERMINALS = [:FILE, :RECORD, :EOF]
|
74
|
+
|
73
75
|
DEFAULT_SKIP = [TA_PUNCTUATION, TA_OTHER].join(',')
|
74
76
|
|
75
77
|
def initialize(config, lingo)
|
@@ -124,7 +126,15 @@ class Lingo
|
|
124
126
|
@config.fetch(key, default)
|
125
127
|
end
|
126
128
|
|
127
|
-
def
|
129
|
+
def get_int(*args)
|
130
|
+
Integer(get_key(*args))
|
131
|
+
end
|
132
|
+
|
133
|
+
def get_flo(*args)
|
134
|
+
((val = get_key(*args)) && val.respond_to?(:to_f)) ? val.to_f : val
|
135
|
+
end
|
136
|
+
|
137
|
+
def get_ary(key, default = nil, method = nil)
|
128
138
|
ary = get_key(key, default).split(SEP_RE)
|
129
139
|
ary.map!(&method) if method
|
130
140
|
ary
|
@@ -136,6 +146,12 @@ class Lingo
|
|
136
146
|
end
|
137
147
|
end
|
138
148
|
|
149
|
+
def get_enc(key = 'encoding', default = ENC)
|
150
|
+
Encoding.find(get_key(key, default))
|
151
|
+
rescue ArgumentError => err
|
152
|
+
raise ConfigLoadError.new(err)
|
153
|
+
end
|
154
|
+
|
139
155
|
def dictionary(src, mod)
|
140
156
|
Language::Dictionary.new({ 'source' => src, 'mode' => mod }, lingo)
|
141
157
|
end
|
@@ -145,11 +161,11 @@ class Lingo
|
|
145
161
|
end
|
146
162
|
|
147
163
|
def set_dic
|
148
|
-
@dic = dictionary(
|
164
|
+
@dic = dictionary(get_ary('source'), get_key('mode', 'all'))
|
149
165
|
end
|
150
166
|
|
151
167
|
def set_gra
|
152
|
-
@gra = grammar(
|
168
|
+
@gra = grammar(get_ary('source'), get_key('mode', 'all'))
|
153
169
|
end
|
154
170
|
|
155
171
|
def warn(*msg)
|
@@ -166,23 +182,27 @@ class Lingo
|
|
166
182
|
|
167
183
|
end
|
168
184
|
|
185
|
+
require_relative 'text_utils'
|
186
|
+
|
169
187
|
require_relative 'buffered_attendee'
|
170
188
|
require_relative 'deferred_attendee'
|
171
189
|
|
172
190
|
require_relative 'attendee/abbreviator'
|
191
|
+
require_relative 'attendee/analysis_filter'
|
173
192
|
require_relative 'attendee/debugger'
|
193
|
+
require_relative 'attendee/debug_filter' # < Debugger
|
174
194
|
require_relative 'attendee/decomposer'
|
175
|
-
require_relative 'attendee/dehyphenizer'
|
195
|
+
require_relative 'attendee/hal_filter'
|
196
|
+
require_relative 'attendee/lsi_filter'
|
176
197
|
require_relative 'attendee/multi_worder'
|
177
|
-
require_relative 'attendee/noneword_filter'
|
178
198
|
require_relative 'attendee/object_filter'
|
179
|
-
require_relative 'attendee/variator'
|
180
199
|
require_relative 'attendee/sequencer'
|
181
200
|
require_relative 'attendee/stemmer'
|
182
201
|
require_relative 'attendee/synonymer'
|
183
202
|
require_relative 'attendee/text_reader'
|
184
203
|
require_relative 'attendee/text_writer'
|
185
|
-
require_relative 'attendee/formatter'
|
204
|
+
require_relative 'attendee/formatter' # < TextWriter
|
186
205
|
require_relative 'attendee/tokenizer'
|
206
|
+
require_relative 'attendee/variator'
|
187
207
|
require_relative 'attendee/vector_filter'
|
188
208
|
require_relative 'attendee/word_searcher'
|
data/lib/lingo/attendee/analysis_filter.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'csv'
|
28
|
+
|
29
|
+
class Lingo
|
30
|
+
|
31
|
+
class Attendee
|
32
|
+
|
33
|
+
class AnalysisFilter < self
|
34
|
+
|
35
|
+
FIELDS = {
|
36
|
+
string: :form,
|
37
|
+
token: :attr,
|
38
|
+
position: :position,
|
39
|
+
offset: :offset,
|
40
|
+
word: :attr,
|
41
|
+
pattern: :pattern
|
42
|
+
}
|
43
|
+
|
44
|
+
def init
|
45
|
+
@csv, @header = CSV.new('', row_sep: ''), FIELDS.keys
|
46
|
+
end
|
47
|
+
|
48
|
+
def control(cmd, *)
|
49
|
+
:skip_command if cmd == :EOL
|
50
|
+
end
|
51
|
+
|
52
|
+
def process(obj, *)
|
53
|
+
forward_row(@header.tap { @header = nil }) if @header
|
54
|
+
|
55
|
+
obj.is_a?(Token) ?
|
56
|
+
forward_obj(obj, obj, obj, obj) : begin
|
57
|
+
tok = obj.token
|
58
|
+
forward_obj(obj, nil, tok, tok, obj, obj)
|
59
|
+
obj.lexicals.each { |lex|
|
60
|
+
forward_obj(lex, nil, tok, tok, lex, obj) }
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def forward_obj(*args)
|
67
|
+
forward_row(FIELDS.map.with_index { |(_, method), index|
|
68
|
+
arg = args[index] and arg.send(method) })
|
69
|
+
end
|
70
|
+
|
71
|
+
def forward_row(row)
|
72
|
+
forward(@csv.add_row(row).string.dup)
|
73
|
+
@csv.string.clear
|
74
|
+
@csv.rewind
|
75
|
+
end
|
76
|
+
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
data/lib/lingo/attendee/debug_filter.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class DebugFilter < Debugger
|
32
|
+
|
33
|
+
def init
|
34
|
+
@filter = true
|
35
|
+
super('')
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
data/lib/lingo/attendee/debugger.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -112,7 +112,7 @@ class Lingo
|
|
112
112
|
end
|
113
113
|
end
|
114
114
|
|
115
|
-
def process(obj)
|
115
|
+
def process(obj, *)
|
116
116
|
debug(eval(@obj_eval)) { obj.inspect }
|
117
117
|
forward(obj) unless @filter
|
118
118
|
end
|
@@ -130,15 +130,6 @@ class Lingo
|
|
130
130
|
|
131
131
|
end
|
132
132
|
|
133
|
-
class DebugFilter < Debugger
|
134
|
-
|
135
|
-
def init
|
136
|
-
@filter = true
|
137
|
-
super('')
|
138
|
-
end
|
139
|
-
|
140
|
-
end
|
141
|
-
|
142
133
|
end
|
143
134
|
|
144
135
|
end
|
data/lib/lingo/attendee/decomposer.rb
CHANGED
@@ -40,7 +40,8 @@ class Lingo
|
|
40
40
|
#
|
41
41
|
# === Mögliche Verlinkung
|
42
42
|
# Erwartet:: Daten vom Typ *Word* (andere werden einfach durchgereicht) z.B. von Wordsearcher
|
43
|
-
# Erzeugt:: Daten vom Typ *Word* (erkannte Komposita werden entsprechend erweitert) z.B. für
|
43
|
+
# Erzeugt:: Daten vom Typ *Word* (erkannte Komposita werden entsprechend erweitert) z.B. für
|
44
|
+
# Synonymer, Ocr_variator, Multiworder, Sequencer, Vector_filter
|
44
45
|
#
|
45
46
|
# === Parameter
|
46
47
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -65,9 +66,9 @@ class Lingo
|
|
65
66
|
# out> <Lingo|?>
|
66
67
|
# out> :,/PUNC:
|
67
68
|
# out> <ein = [(ein/w)]>
|
68
|
-
# out> <Indexierungssystem|
|
69
|
+
# out> <Indexierungssystem|COM = [(indexierungssystem/k), (indexierung/s), (system/s)]>
|
69
70
|
# out> <mit = [(mit/w)]>
|
70
|
-
# out> <Kompositumerkennung|
|
71
|
+
# out> <Kompositumerkennung|COM = [(kompositumerkennung/k), (erkennung/s), (kompositum/s)]>
|
71
72
|
# out> :./PUNC:
|
72
73
|
# out> *EOL('test.txt')
|
73
74
|
# out> *EOF('test.txt')
|
@@ -85,6 +86,8 @@ class Lingo
|
|
85
86
|
def process(obj)
|
86
87
|
if obj.is_a?(Word) && obj.unknown?
|
87
88
|
com = @gra.find_compound(obj.form)
|
89
|
+
com.token = obj.token
|
90
|
+
|
88
91
|
obj = com unless com.unknown?
|
89
92
|
end
|
90
93
|
|
data/lib/lingo/attendee/formatter.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -35,7 +35,7 @@ class Lingo
|
|
35
35
|
|
36
36
|
@ext = get_key('ext', '-')
|
37
37
|
@format = get_key('format', '%s')
|
38
|
-
@map = get_key('map', Hash.
|
38
|
+
@map = get_key('map', Hash.identity)
|
39
39
|
|
40
40
|
@no_puts = true
|
41
41
|
end
|
@@ -44,10 +44,10 @@ class Lingo
|
|
44
44
|
if obj.is_a?(WordForm)
|
45
45
|
str = obj.form
|
46
46
|
|
47
|
-
if obj.
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
if obj.is_a?(Word)
|
48
|
+
# TODO: which lexical to select? (currently: first)
|
49
|
+
obj.each_lex { |lex|
|
50
|
+
att = @map[lex.attr] and str = @format % [str, lex.form, att] }
|
51
51
|
end
|
52
52
|
else
|
53
53
|
str = obj.to_s
|
data/lib/lingo/attendee/hal_filter.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class HalFilter < self
|
32
|
+
|
33
|
+
def init
|
34
|
+
require_lib('hal4r')
|
35
|
+
|
36
|
+
@lex = get_re('lexicals', '[sy]')
|
37
|
+
@skip = get_ary('skip', DEFAULT_SKIP, :upcase)
|
38
|
+
|
39
|
+
@norm = get_key('norm', true)
|
40
|
+
@sep = get_key('sep', '^')
|
41
|
+
@min = get_flo('min', false)
|
42
|
+
@dim = get_int('dim', 2)
|
43
|
+
|
44
|
+
@sort = get_key('sort', false)
|
45
|
+
@sort.downcase! if @sort.respond_to?(:downcase!)
|
46
|
+
|
47
|
+
@hal = Hal4R.new([], get_int('window-size', Hal4R::DEFAULT_WINDOW_SIZE))
|
48
|
+
end
|
49
|
+
|
50
|
+
def control(cmd, *)
|
51
|
+
case cmd
|
52
|
+
when :EOL then :skip_command
|
53
|
+
when *TERMINALS then send_vectors unless @hal.empty?
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def process(obj)
|
58
|
+
obj.is_a?(Word) && !@skip.include?(obj.attr) &&
|
59
|
+
# TODO: which lexical to select? (currently: first)
|
60
|
+
obj.lex_form(@lex) { |form| @hal << Unicode.downcase(form) }
|
61
|
+
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
def send_vectors
|
66
|
+
vec = []
|
67
|
+
|
68
|
+
fmt = @sort ? @sort == 'sto' ?
|
69
|
+
'%s {%.5f}' : '%2$.5f %1$s' : '%s %.5f' unless @sort == 'normal'
|
70
|
+
|
71
|
+
unless @sort
|
72
|
+
each_vector { |v| forward(fmt % v) }
|
73
|
+
else
|
74
|
+
each_vector { |v| vec << v }
|
75
|
+
|
76
|
+
!fmt ? vec.sort!.each { |v, _| forward(v) } :
|
77
|
+
vec.sort_by { |v, w| [-w, v] }.each { |v| forward(fmt % v) }
|
78
|
+
|
79
|
+
vec.clear
|
80
|
+
end
|
81
|
+
|
82
|
+
@hal.reset
|
83
|
+
end
|
84
|
+
|
85
|
+
def each_vector
|
86
|
+
@hal.each_distance(@norm, @dim) { |*t, v| v = 1 / v
|
87
|
+
yield [t.join(@sep), v] unless v.nan? || (@min && v < @min) }
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|