lingo 1.8.6 → 1.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
data/dict/en/lingo-dic.txt
CHANGED
@@ -19913,7 +19913,7 @@ fodder=fodder #s|v
|
|
19913
19913
|
foe=foe #s
|
19914
19914
|
foederatus=foederatus #s
|
19915
19915
|
foetal=foetal #a
|
19916
|
-
|
19916
|
+
foetid=foetid #a
|
19917
19917
|
foetidness=foetidness #s
|
19918
19918
|
foetus=foetus #s
|
19919
19919
|
fog=fog #s|v
|
@@ -53175,7 +53175,6 @@ vedette=vedette #s
|
|
53175
53175
|
veejay=veejay #s
|
53176
53176
|
veel=veel #v
|
53177
53177
|
veer=veer #s|v
|
53178
|
-
veg*n=veg*n #s|a
|
53179
53178
|
vega=vega #s
|
53180
53179
|
vegan=vegan #s|a
|
53181
53180
|
veganism=veganism #s
|
@@ -55392,7 +55391,7 @@ zony=zony #s
|
|
55392
55391
|
zoo=zoo #s
|
55393
55392
|
zooarchaeology=zooarchaeology #s
|
55394
55393
|
zoobie=zoobie #s
|
55395
|
-
|
55394
|
+
zooecium=zooecium #s
|
55396
55395
|
zoogeography=zoogeography #s
|
55397
55396
|
zoolater=zoolater #s
|
55398
55397
|
zoological=zoological #a
|
data/lang/de.lang
CHANGED
@@ -56,12 +56,13 @@ language:
|
|
56
56
|
usr-dic: { name: de/user-dic.txt, txt-format: WordClass, separator: '=' }
|
57
57
|
|
58
58
|
# Test dictionaries
|
59
|
-
tst-dic: { name: de/test_dic.txt, txt-format: WordClass }
|
60
|
-
tst-syn: { name: de/test_syn.txt, txt-format: MultiValue, def-wc: y }
|
61
|
-
tst-mul: { name: de/test_mul.txt, use-lex: sys-dic, def-wc: m }
|
62
|
-
tst-mu2: { name: de/test_mu2.txt, use-lex: sys-dic, def-wc: m }
|
63
|
-
tst-
|
64
|
-
tst-
|
59
|
+
tst-dic: { name: de/test_dic.txt, txt-format: WordClass }
|
60
|
+
tst-syn: { name: de/test_syn.txt, txt-format: MultiValue, def-wc: y }
|
61
|
+
tst-mul: { name: de/test_mul.txt, use-lex: sys-dic, def-wc: m }
|
62
|
+
tst-mu2: { name: de/test_mu2.txt, use-lex: sys-dic, def-wc: m }
|
63
|
+
tst-muh: { name: de/test_muh.txt, txt-format: SingleWord, use-lex: sys-dic, def-wc: m, hyphenate: true }
|
64
|
+
tst-sgw: { name: de/test_sgw.txt, txt-format: SingleWord }
|
65
|
+
tst-gen: { name: de/test_gen.txt, txt-format: WordClass }
|
65
66
|
|
66
67
|
compound:
|
67
68
|
min-word-size: '7'
|
@@ -118,8 +119,8 @@ language:
|
|
118
119
|
# SPAC = \s+
|
119
120
|
# NUMS = [+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)
|
120
121
|
# URLS = (?:www\.|mailto:|(?:news|https?|ftps?)://|\S+?[._]\S+?@\S+?\.)\S+
|
121
|
-
# ABRV = (?:(?:(
|
122
|
-
# WORD = (
|
122
|
+
# ABRV = (?:(?:(?:CHAR)+\.)+)(?:CHAR)+
|
123
|
+
# WORD = ALNUM(?:-*ALNUM)*
|
123
124
|
# PUNC = [!,.:;?¡¿]
|
124
|
-
# OTHR = ["$#%&'()*+\-/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]
|
125
|
+
# OTHR = [-"$#%&'()*+\-/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]
|
125
126
|
# HELP = \S*
|
data/lang/en.lang
CHANGED
@@ -69,7 +69,7 @@ language:
|
|
69
69
|
# Suffixklasse: s = Substantiv, a = Adjektiv, v = Verb, e = Eigenwort, f = Fugung
|
70
70
|
# Suffixe je Klasse: "<suffix>['/'<ersetzung>][ <suffix>['/'<ersetzung>]]"
|
71
71
|
- [s, 'es s ves/f ves/fe ies/y']
|
72
|
-
- [a, 'er est r st ier/y iest/y']
|
72
|
+
- [a, 'er est r st ier/y iest/y ly al ally']
|
73
73
|
- [v, 'd ed en es ing s ing/e']
|
74
74
|
- [e, 's']
|
75
75
|
- [f, '']
|
data/lib/lingo.rb
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -61,7 +61,8 @@ class Lingo
|
|
61
61
|
}
|
62
62
|
|
63
63
|
# Default encoding
|
64
|
-
Encoding.default_external =
|
64
|
+
Encoding.default_external = ENC = 'UTF-8'.freeze
|
65
|
+
Encoding.default_internal = ENC unless RUBY_ENGINE == 'jruby'
|
65
66
|
|
66
67
|
SEP_RE = %r{[; ,|]}
|
67
68
|
|
@@ -265,8 +266,7 @@ class Lingo
|
|
265
266
|
end
|
266
267
|
|
267
268
|
def invite(list = config['meeting/attendees'])
|
268
|
-
supplier
|
269
|
-
subscriber = Hash.nest { [] }
|
269
|
+
supplier, subscriber = Hash.array, Hash.array
|
270
270
|
|
271
271
|
last_link, auto_link = '', 0
|
272
272
|
|
data/lib/lingo/attendee.rb
CHANGED
@@ -70,6 +70,8 @@ class Lingo
|
|
70
70
|
|
71
71
|
include Language
|
72
72
|
|
73
|
+
TERMINALS = [:FILE, :RECORD, :EOF]
|
74
|
+
|
73
75
|
DEFAULT_SKIP = [TA_PUNCTUATION, TA_OTHER].join(',')
|
74
76
|
|
75
77
|
def initialize(config, lingo)
|
@@ -124,7 +126,15 @@ class Lingo
|
|
124
126
|
@config.fetch(key, default)
|
125
127
|
end
|
126
128
|
|
127
|
-
def
|
129
|
+
def get_int(*args)
|
130
|
+
Integer(get_key(*args))
|
131
|
+
end
|
132
|
+
|
133
|
+
def get_flo(*args)
|
134
|
+
((val = get_key(*args)) && val.respond_to?(:to_f)) ? val.to_f : val
|
135
|
+
end
|
136
|
+
|
137
|
+
def get_ary(key, default = nil, method = nil)
|
128
138
|
ary = get_key(key, default).split(SEP_RE)
|
129
139
|
ary.map!(&method) if method
|
130
140
|
ary
|
@@ -136,6 +146,12 @@ class Lingo
|
|
136
146
|
end
|
137
147
|
end
|
138
148
|
|
149
|
+
def get_enc(key = 'encoding', default = ENC)
|
150
|
+
Encoding.find(get_key(key, default))
|
151
|
+
rescue ArgumentError => err
|
152
|
+
raise ConfigLoadError.new(err)
|
153
|
+
end
|
154
|
+
|
139
155
|
def dictionary(src, mod)
|
140
156
|
Language::Dictionary.new({ 'source' => src, 'mode' => mod }, lingo)
|
141
157
|
end
|
@@ -145,11 +161,11 @@ class Lingo
|
|
145
161
|
end
|
146
162
|
|
147
163
|
def set_dic
|
148
|
-
@dic = dictionary(
|
164
|
+
@dic = dictionary(get_ary('source'), get_key('mode', 'all'))
|
149
165
|
end
|
150
166
|
|
151
167
|
def set_gra
|
152
|
-
@gra = grammar(
|
168
|
+
@gra = grammar(get_ary('source'), get_key('mode', 'all'))
|
153
169
|
end
|
154
170
|
|
155
171
|
def warn(*msg)
|
@@ -166,23 +182,27 @@ class Lingo
|
|
166
182
|
|
167
183
|
end
|
168
184
|
|
185
|
+
require_relative 'text_utils'
|
186
|
+
|
169
187
|
require_relative 'buffered_attendee'
|
170
188
|
require_relative 'deferred_attendee'
|
171
189
|
|
172
190
|
require_relative 'attendee/abbreviator'
|
191
|
+
require_relative 'attendee/analysis_filter'
|
173
192
|
require_relative 'attendee/debugger'
|
193
|
+
require_relative 'attendee/debug_filter' # < Debugger
|
174
194
|
require_relative 'attendee/decomposer'
|
175
|
-
require_relative 'attendee/
|
195
|
+
require_relative 'attendee/hal_filter'
|
196
|
+
require_relative 'attendee/lsi_filter'
|
176
197
|
require_relative 'attendee/multi_worder'
|
177
|
-
require_relative 'attendee/noneword_filter'
|
178
198
|
require_relative 'attendee/object_filter'
|
179
|
-
require_relative 'attendee/variator'
|
180
199
|
require_relative 'attendee/sequencer'
|
181
200
|
require_relative 'attendee/stemmer'
|
182
201
|
require_relative 'attendee/synonymer'
|
183
202
|
require_relative 'attendee/text_reader'
|
184
203
|
require_relative 'attendee/text_writer'
|
185
|
-
require_relative 'attendee/formatter'
|
204
|
+
require_relative 'attendee/formatter' # < TextWriter
|
186
205
|
require_relative 'attendee/tokenizer'
|
206
|
+
require_relative 'attendee/variator'
|
187
207
|
require_relative 'attendee/vector_filter'
|
188
208
|
require_relative 'attendee/word_searcher'
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'csv'
|
28
|
+
|
29
|
+
class Lingo
|
30
|
+
|
31
|
+
class Attendee
|
32
|
+
|
33
|
+
class AnalysisFilter < self
|
34
|
+
|
35
|
+
FIELDS = {
|
36
|
+
string: :form,
|
37
|
+
token: :attr,
|
38
|
+
position: :position,
|
39
|
+
offset: :offset,
|
40
|
+
word: :attr,
|
41
|
+
pattern: :pattern
|
42
|
+
}
|
43
|
+
|
44
|
+
def init
|
45
|
+
@csv, @header = CSV.new('', row_sep: ''), FIELDS.keys
|
46
|
+
end
|
47
|
+
|
48
|
+
def control(cmd, *)
|
49
|
+
:skip_command if cmd == :EOL
|
50
|
+
end
|
51
|
+
|
52
|
+
def process(obj, *)
|
53
|
+
forward_row(@header.tap { @header = nil }) if @header
|
54
|
+
|
55
|
+
obj.is_a?(Token) ?
|
56
|
+
forward_obj(obj, obj, obj, obj) : begin
|
57
|
+
tok = obj.token
|
58
|
+
forward_obj(obj, nil, tok, tok, obj, obj)
|
59
|
+
obj.lexicals.each { |lex|
|
60
|
+
forward_obj(lex, nil, tok, tok, lex, obj) }
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def forward_obj(*args)
|
67
|
+
forward_row(FIELDS.map.with_index { |(_, method), index|
|
68
|
+
arg = args[index] and arg.send(method) })
|
69
|
+
end
|
70
|
+
|
71
|
+
def forward_row(row)
|
72
|
+
forward(@csv.add_row(row).string.dup)
|
73
|
+
@csv.string.clear
|
74
|
+
@csv.rewind
|
75
|
+
end
|
76
|
+
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class DebugFilter < Debugger
|
32
|
+
|
33
|
+
def init
|
34
|
+
@filter = true
|
35
|
+
super('')
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -112,7 +112,7 @@ class Lingo
|
|
112
112
|
end
|
113
113
|
end
|
114
114
|
|
115
|
-
def process(obj)
|
115
|
+
def process(obj, *)
|
116
116
|
debug(eval(@obj_eval)) { obj.inspect }
|
117
117
|
forward(obj) unless @filter
|
118
118
|
end
|
@@ -130,15 +130,6 @@ class Lingo
|
|
130
130
|
|
131
131
|
end
|
132
132
|
|
133
|
-
class DebugFilter < Debugger
|
134
|
-
|
135
|
-
def init
|
136
|
-
@filter = true
|
137
|
-
super('')
|
138
|
-
end
|
139
|
-
|
140
|
-
end
|
141
|
-
|
142
133
|
end
|
143
134
|
|
144
135
|
end
|
@@ -40,7 +40,8 @@ class Lingo
|
|
40
40
|
#
|
41
41
|
# === Mögliche Verlinkung
|
42
42
|
# Erwartet:: Daten vom Typ *Word* (andere werden einfach durchgereicht) z.B. von Wordsearcher
|
43
|
-
# Erzeugt:: Daten vom Typ *Word* (erkannte Komposita werden entsprechend erweitert) z.B. für
|
43
|
+
# Erzeugt:: Daten vom Typ *Word* (erkannte Komposita werden entsprechend erweitert) z.B. für
|
44
|
+
# Synonymer, Ocr_variator, Multiworder, Sequencer, Vector_filter
|
44
45
|
#
|
45
46
|
# === Parameter
|
46
47
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -65,9 +66,9 @@ class Lingo
|
|
65
66
|
# out> <Lingo|?>
|
66
67
|
# out> :,/PUNC:
|
67
68
|
# out> <ein = [(ein/w)]>
|
68
|
-
# out> <Indexierungssystem|
|
69
|
+
# out> <Indexierungssystem|COM = [(indexierungssystem/k), (indexierung/s), (system/s)]>
|
69
70
|
# out> <mit = [(mit/w)]>
|
70
|
-
# out> <Kompositumerkennung|
|
71
|
+
# out> <Kompositumerkennung|COM = [(kompositumerkennung/k), (erkennung/s), (kompositum/s)]>
|
71
72
|
# out> :./PUNC:
|
72
73
|
# out> *EOL('test.txt')
|
73
74
|
# out> *EOF('test.txt')
|
@@ -85,6 +86,8 @@ class Lingo
|
|
85
86
|
def process(obj)
|
86
87
|
if obj.is_a?(Word) && obj.unknown?
|
87
88
|
com = @gra.find_compound(obj.form)
|
89
|
+
com.token = obj.token
|
90
|
+
|
88
91
|
obj = com unless com.unknown?
|
89
92
|
end
|
90
93
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -35,7 +35,7 @@ class Lingo
|
|
35
35
|
|
36
36
|
@ext = get_key('ext', '-')
|
37
37
|
@format = get_key('format', '%s')
|
38
|
-
@map = get_key('map', Hash.
|
38
|
+
@map = get_key('map', Hash.identity)
|
39
39
|
|
40
40
|
@no_puts = true
|
41
41
|
end
|
@@ -44,10 +44,10 @@ class Lingo
|
|
44
44
|
if obj.is_a?(WordForm)
|
45
45
|
str = obj.form
|
46
46
|
|
47
|
-
if obj.
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
if obj.is_a?(Word)
|
48
|
+
# TODO: which lexical to select? (currently: first)
|
49
|
+
obj.each_lex { |lex|
|
50
|
+
att = @map[lex.attr] and str = @format % [str, lex.form, att] }
|
51
51
|
end
|
52
52
|
else
|
53
53
|
str = obj.to_s
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class HalFilter < self
|
32
|
+
|
33
|
+
def init
|
34
|
+
require_lib('hal4r')
|
35
|
+
|
36
|
+
@lex = get_re('lexicals', '[sy]')
|
37
|
+
@skip = get_ary('skip', DEFAULT_SKIP, :upcase)
|
38
|
+
|
39
|
+
@norm = get_key('norm', true)
|
40
|
+
@sep = get_key('sep', '^')
|
41
|
+
@min = get_flo('min', false)
|
42
|
+
@dim = get_int('dim', 2)
|
43
|
+
|
44
|
+
@sort = get_key('sort', false)
|
45
|
+
@sort.downcase! if @sort.respond_to?(:downcase!)
|
46
|
+
|
47
|
+
@hal = Hal4R.new([], get_int('window-size', Hal4R::DEFAULT_WINDOW_SIZE))
|
48
|
+
end
|
49
|
+
|
50
|
+
def control(cmd, *)
|
51
|
+
case cmd
|
52
|
+
when :EOL then :skip_command
|
53
|
+
when *TERMINALS then send_vectors unless @hal.empty?
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def process(obj)
|
58
|
+
obj.is_a?(Word) && !@skip.include?(obj.attr) &&
|
59
|
+
# TODO: which lexical to select? (currently: first)
|
60
|
+
obj.lex_form(@lex) { |form| @hal << Unicode.downcase(form) }
|
61
|
+
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
def send_vectors
|
66
|
+
vec = []
|
67
|
+
|
68
|
+
fmt = @sort ? @sort == 'sto' ?
|
69
|
+
'%s {%.5f}' : '%2$.5f %1$s' : '%s %.5f' unless @sort == 'normal'
|
70
|
+
|
71
|
+
unless @sort
|
72
|
+
each_vector { |v| forward(fmt % v) }
|
73
|
+
else
|
74
|
+
each_vector { |v| vec << v }
|
75
|
+
|
76
|
+
!fmt ? vec.sort!.each { |v, _| forward(v) } :
|
77
|
+
vec.sort_by { |v, w| [-w, v] }.each { |v| forward(fmt % v) }
|
78
|
+
|
79
|
+
vec.clear
|
80
|
+
end
|
81
|
+
|
82
|
+
@hal.reset
|
83
|
+
end
|
84
|
+
|
85
|
+
def each_vector
|
86
|
+
@hal.each_distance(@norm, @dim) { |*t, v| v = 1 / v
|
87
|
+
yield [t.join(@sep), v] unless v.nan? || (@min && v < @min) }
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|