lingo 1.8.6 → 1.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -0,0 +1,99 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class LsiFilter < DeferredAttendee
|
32
|
+
|
33
|
+
def init
|
34
|
+
require_lib('lsi4r')
|
35
|
+
|
36
|
+
@lex = get_re('lexicals', '[sy]')
|
37
|
+
@skip = get_ary('skip', DEFAULT_SKIP, :upcase)
|
38
|
+
|
39
|
+
@transform = get_key('transform', Lsi4R::DEFAULT_TRANSFORM)
|
40
|
+
@cutoff = get_flo('cut', Lsi4R::DEFAULT_CUTOFF)
|
41
|
+
|
42
|
+
@min = get_flo('min', false)
|
43
|
+
@abs = get_flo('abs', false)
|
44
|
+
@nul = get_flo('nul', false)
|
45
|
+
@new = get_key('new', true)
|
46
|
+
|
47
|
+
@sort = get_key('sort', false)
|
48
|
+
@sort.downcase! if @sort.respond_to?(:downcase!)
|
49
|
+
|
50
|
+
@docnum, @vectors = 0, Hash.new { |h, k| h[k] = [] }
|
51
|
+
end
|
52
|
+
|
53
|
+
def control(cmd, *)
|
54
|
+
:skip_command if cmd == :EOL
|
55
|
+
end
|
56
|
+
|
57
|
+
def control_deferred(cmd, *)
|
58
|
+
@docnum += 1 if TERMINALS.include?(cmd)
|
59
|
+
end
|
60
|
+
|
61
|
+
def process(obj)
|
62
|
+
if obj.is_a?(Word) && !@skip.include?(obj.attr)
|
63
|
+
vec = []
|
64
|
+
obj.each_lex(@lex) { |lex| vec << Unicode.downcase(lex.form) }
|
65
|
+
@vectors[@docnum].concat(vec) unless vec.empty?
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def send_lsi
|
72
|
+
lsi = Lsi4R.new(@vectors); @vectors.clear
|
73
|
+
|
74
|
+
if lsi.build(transform: @transform, cutoff: @cutoff)
|
75
|
+
options, vec = { min: @min, abs: @abs, nul: @nul, new: @new }, []
|
76
|
+
|
77
|
+
fmt = @sort ? @sort == 'sto' ?
|
78
|
+
'%s {%.5f}' : '%2$.5f %1$s' : '%s %.5f' unless @sort == 'normal'
|
79
|
+
|
80
|
+
yield !@sort ? lambda { |docnum|
|
81
|
+
lsi.each_norm(docnum, options) { |_, *v| forward(fmt % v) }
|
82
|
+
} : lambda { |docnum|
|
83
|
+
lsi.each_norm(docnum, options) { |_, *v| vec << v }
|
84
|
+
|
85
|
+
!fmt ? vec.sort!.each { |v, _| forward(v) } :
|
86
|
+
vec.sort_by { |v, w| [-w, v] }.each { |v| forward(fmt % v) }
|
87
|
+
|
88
|
+
vec.clear
|
89
|
+
}
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
alias_method :flush_deferred, :send_lsi
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -40,7 +40,9 @@ class Lingo
|
|
40
40
|
#
|
41
41
|
# === Mögliche Verlinkung
|
42
42
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, MultiWorder
|
43
|
-
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
|
43
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
|
44
|
+
# ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
|
45
|
+
# Vector_filter
|
44
46
|
#
|
45
47
|
# === Parameter
|
46
48
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -88,7 +90,7 @@ class Lingo
|
|
88
90
|
|
89
91
|
lex_src, lex_mod, d = nil, nil, lingo.dictionary_config['databases']
|
90
92
|
|
91
|
-
(mul_src =
|
93
|
+
(mul_src = get_ary('source')).each { |src|
|
92
94
|
s, m = d[src].values_at('use-lex', 'lex-mode')
|
93
95
|
|
94
96
|
if lex_src.nil? || lex_src == s
|
@@ -106,76 +108,99 @@ class Lingo
|
|
106
108
|
@lex_gra = grammar(lex_src, lex_mod)
|
107
109
|
|
108
110
|
@syn_dic = if @combine && has_key?('use-syn')
|
109
|
-
dictionary(
|
111
|
+
dictionary(get_ary('use-syn'), get_key('syn-mode', 'all'))
|
110
112
|
end
|
111
113
|
|
112
114
|
@expected_tokens_in_buffer, @eof_handling = 3, false
|
113
115
|
end
|
114
116
|
|
115
117
|
def control(cmd, *)
|
116
|
-
|
118
|
+
if [:RECORD, :EOF].include?(cmd)
|
119
|
+
@eof_handling = true
|
120
|
+
|
121
|
+
while process_buffer?(2)
|
122
|
+
process_buffer
|
123
|
+
end
|
124
|
+
|
125
|
+
forward_number_of_token
|
126
|
+
|
127
|
+
@eof_handling = false
|
128
|
+
end
|
117
129
|
end
|
118
130
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
131
|
+
private
|
132
|
+
|
133
|
+
def form_at(index)
|
134
|
+
obj = @buffer[index]
|
135
|
+
obj.form if obj.is_a?(WordForm) && obj.form != CHAR_PUNCT
|
136
|
+
end
|
137
|
+
|
138
|
+
def forward_number_of_token(len = default = @buffer.size, punct = !default)
|
139
|
+
begin
|
140
|
+
unless @buffer.empty?
|
141
|
+
forward(item = @buffer.delete_at(0))
|
142
|
+
len -= 1 unless punct && item.form == CHAR_PUNCT
|
143
|
+
end
|
144
|
+
end while len > 0
|
145
|
+
end
|
146
|
+
|
147
|
+
def process_buffer?(num = @expected_tokens_in_buffer)
|
148
|
+
@buffer.count { |item| item.form != CHAR_PUNCT } >= num
|
149
|
+
end
|
124
150
|
|
125
|
-
|
151
|
+
def process_buffer
|
152
|
+
if form_at(0)
|
153
|
+
if res = check_multiword(3, len = [])
|
154
|
+
if (max = len.first) <= 3
|
126
155
|
create_and_forward_multiword(3, res)
|
127
156
|
forward_number_of_token(3)
|
157
|
+
elsif !@eof_handling && @buffer.size < max
|
158
|
+
@expected_tokens_in_buffer = max
|
128
159
|
else
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
create_and_forward_multiword(l, r) unless r.empty?
|
135
|
-
} || 1)
|
136
|
-
|
137
|
-
@expected_tokens_in_buffer = 3
|
138
|
-
process_buffer if process_buffer?
|
139
|
-
end
|
160
|
+
forward_number_of_token(len.find { |l|
|
161
|
+
create_and_forward_multiword(l) } || 1)
|
162
|
+
|
163
|
+
@expected_tokens_in_buffer = 3
|
164
|
+
process_buffer if process_buffer?
|
140
165
|
end
|
141
166
|
|
142
167
|
return
|
143
168
|
end
|
144
169
|
|
145
|
-
|
146
|
-
create_and_forward_multiword(2, res)
|
147
|
-
forward_number_of_token(1)
|
148
|
-
end
|
170
|
+
create_and_forward_multiword(2) && forward_number_of_token(1)
|
149
171
|
end
|
150
172
|
|
151
173
|
forward_number_of_token(1, false)
|
152
174
|
@expected_tokens_in_buffer = 3
|
153
175
|
end
|
154
176
|
|
155
|
-
|
177
|
+
def create_and_forward_multiword(len, lex = check_multiword(len))
|
178
|
+
return unless lex
|
156
179
|
|
157
|
-
def create_and_forward_multiword(len, lex)
|
158
180
|
pos, parts = 0, []
|
159
181
|
|
160
182
|
begin
|
161
|
-
if
|
162
|
-
@buffer.delete_at(pos)
|
163
|
-
parts[-1] += CHAR_PUNCT
|
164
|
-
else
|
183
|
+
if form = form_at(pos)
|
165
184
|
@buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
|
166
185
|
parts << form
|
167
186
|
pos += 1
|
187
|
+
else
|
188
|
+
@buffer.delete_at(pos)
|
189
|
+
parts[-1] += CHAR_PUNCT
|
168
190
|
end
|
169
191
|
end while pos < len
|
170
192
|
|
171
|
-
|
172
|
-
WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
|
193
|
+
wrd = Word.new_lexicals(parts.join(' '),
|
194
|
+
WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
|
195
|
+
wrd.token = @buffer[pos - 1].token
|
196
|
+
|
197
|
+
forward(wrd)
|
173
198
|
end
|
174
199
|
|
175
|
-
def
|
176
|
-
return
|
200
|
+
def check_multiword(len, lst = nil)
|
201
|
+
return unless process_buffer?(len)
|
177
202
|
|
178
|
-
seq = []
|
203
|
+
seq, mul, sep = [], [], ' '
|
179
204
|
|
180
205
|
@buffer.each { |obj|
|
181
206
|
next seq << [obj] unless obj.is_a?(WordForm)
|
@@ -195,17 +220,18 @@ class Lingo
|
|
195
220
|
}
|
196
221
|
|
197
222
|
if @combine
|
198
|
-
mul = []
|
199
|
-
|
200
223
|
seq.shift.product(*seq) { |key|
|
201
|
-
@mul_dic.select(key.join(
|
224
|
+
@mul_dic.select(key.join(sep), mul)
|
202
225
|
break unless @all || mul.empty?
|
203
226
|
} && mul.uniq!
|
204
|
-
|
205
|
-
mul
|
206
227
|
else
|
207
|
-
@mul_dic.select(seq.map! { |i,| i }.join(
|
228
|
+
@mul_dic.select(seq.map! { |i,| i }.join(sep), mul)
|
208
229
|
end
|
230
|
+
|
231
|
+
lst.push(seq.size).concat(mul.map { |r| r.is_a?(Lexical) ?
|
232
|
+
r.form.count(sep) + 1 : r }).sort!.reverse!.uniq! if lst
|
233
|
+
|
234
|
+
mul unless mul.empty?
|
209
235
|
end
|
210
236
|
|
211
237
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -43,7 +43,9 @@ class Lingo
|
|
43
43
|
#
|
44
44
|
# === Mögliche Verlinkung
|
45
45
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
46
|
-
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
|
46
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
|
47
|
+
# ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
|
48
|
+
# Vector_filter
|
47
49
|
#
|
48
50
|
# === Parameter
|
49
51
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -95,15 +97,20 @@ class Lingo
|
|
95
97
|
|
96
98
|
class Sequencer < BufferedAttendee
|
97
99
|
|
100
|
+
UNK = %w[#]
|
101
|
+
NUM = %w[0]
|
102
|
+
|
103
|
+
CLS = /[[:alpha:]#{NUM.join}]/o
|
104
|
+
|
98
105
|
def init
|
99
|
-
@stopper =
|
106
|
+
@stopper = get_ary('stopper', DEFAULT_SKIP)
|
100
107
|
.push(WA_UNKNOWN, WA_UNKMULPART)
|
101
108
|
|
102
109
|
@mwc = get_key('multiword', LA_MULTIWORD)
|
103
110
|
@cls = []
|
104
111
|
|
105
112
|
@seq = get_key('sequences').map { |str, fmt|
|
106
|
-
@cls.concat(cls = (str = str.downcase).scan(
|
113
|
+
@cls.concat(cls = (str = str.downcase).scan(CLS))
|
107
114
|
|
108
115
|
(str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
|
109
116
|
fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
|
@@ -124,7 +131,7 @@ class Lingo
|
|
124
131
|
|
125
132
|
def process_buffer
|
126
133
|
flush(@buffer.size < 2 ? @buffer : begin
|
127
|
-
arg
|
134
|
+
arg = [[], buf = [], map = [], @seq]
|
128
135
|
|
129
136
|
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
130
137
|
iter.rewind; skip.times { iter.next }; skip = 0
|
@@ -142,16 +149,17 @@ class Lingo
|
|
142
149
|
rewind.call
|
143
150
|
end
|
144
151
|
|
145
|
-
att = obj.is_a?(
|
152
|
+
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
153
|
+
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
146
154
|
|
147
|
-
if (att &= cls).empty?
|
155
|
+
if (att &= @cls).empty?
|
148
156
|
find_seq(*arg)
|
149
157
|
rewind.call if skip > 0
|
150
158
|
else
|
151
|
-
|
152
|
-
|
153
|
-
skip = idx + 1
|
154
|
-
|
159
|
+
obj.each_lex(@mwc) { |lex|
|
160
|
+
lex.form.count(' ').succ.times { iter.next }
|
161
|
+
break skip = idx + 1
|
162
|
+
} unless tok
|
155
163
|
|
156
164
|
buf << obj
|
157
165
|
map << att
|
@@ -178,18 +186,18 @@ class Lingo
|
|
178
186
|
while pos = q.index(str, pos || 0)
|
179
187
|
_str, _cls = [$&, $&.chars] unless cls
|
180
188
|
|
181
|
-
args.clear
|
189
|
+
_tok = nil; args.clear
|
182
190
|
|
183
191
|
_cls.each_with_index { |wc, i|
|
184
|
-
buf[pos + i]
|
185
|
-
|
186
|
-
|
192
|
+
obj = buf[pos + i];_tok ||= obj.token
|
193
|
+
|
194
|
+
args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
|
195
|
+
break lex.form if lex.attr == wc } : obj.form or break
|
187
196
|
} or next
|
188
197
|
|
189
|
-
forms <<
|
198
|
+
forms << [_str, _tok,
|
190
199
|
fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
|
191
|
-
fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')
|
192
|
-
)
|
200
|
+
fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
|
193
201
|
|
194
202
|
pos += 1
|
195
203
|
end
|
@@ -197,7 +205,12 @@ class Lingo
|
|
197
205
|
}.clear
|
198
206
|
|
199
207
|
forms.uniq!
|
200
|
-
|
208
|
+
|
209
|
+
forms.each { |s, t, f|
|
210
|
+
wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
|
211
|
+
wrd.pattern, wrd.token = s, t
|
212
|
+
mat << wrd
|
213
|
+
}
|
201
214
|
|
202
215
|
buf.clear
|
203
216
|
mat
|
@@ -36,7 +36,7 @@ class Lingo
|
|
36
36
|
#
|
37
37
|
# === Mögliche Verlinkung
|
38
38
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
39
|
-
# Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer,
|
39
|
+
# Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Vector_filter
|
40
40
|
#
|
41
41
|
# === Parameter
|
42
42
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -75,7 +75,7 @@ class Lingo
|
|
75
75
|
def init
|
76
76
|
set_dic
|
77
77
|
@com = !get_key('compound-parts', false)
|
78
|
-
@skip =
|
78
|
+
@skip = get_ary('skip', WA_UNKNOWN, :upcase)
|
79
79
|
end
|
80
80
|
|
81
81
|
def control(*)
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -107,10 +107,14 @@ class Lingo
|
|
107
107
|
|
108
108
|
class TextReader < self
|
109
109
|
|
110
|
+
include TextUtils
|
111
|
+
|
110
112
|
# TODO: FILE/LIR-FILE (?)
|
111
113
|
def init
|
112
114
|
get_files
|
113
115
|
|
116
|
+
@encoding = get_enc
|
117
|
+
|
114
118
|
@filter = get_key('filter', false)
|
115
119
|
@progress = get_key('progress', false)
|
116
120
|
|
@@ -124,42 +128,40 @@ class Lingo
|
|
124
128
|
end
|
125
129
|
|
126
130
|
def control(cmd, *)
|
127
|
-
|
128
|
-
command(:LIR) if @lir
|
129
|
-
@files.each { |i| spool(i) }
|
130
|
-
|
131
|
-
command(:EOT)
|
132
|
-
:skip_command
|
133
|
-
end
|
134
|
-
end
|
131
|
+
return unless cmd == :TALK
|
135
132
|
|
136
|
-
|
133
|
+
command(:LIR) if @lir
|
137
134
|
|
138
|
-
|
139
|
-
|
135
|
+
@files.each { |path|
|
136
|
+
command(:FILE, path)
|
140
137
|
|
141
|
-
|
142
|
-
stdin = lingo.config.stdin.set_encoding(ENC)
|
143
|
-
@progress ? StringIO.new(stdin.read) : stdin
|
144
|
-
end
|
138
|
+
io = stdin?(path) ? open_stdin : open_path(name = path)
|
145
139
|
|
146
|
-
|
147
|
-
|
140
|
+
Progress.new(self, @progress && io.size, name) { |progress|
|
141
|
+
pos = 0 unless pos?(io = filter(io, path, progress))
|
148
142
|
|
149
|
-
|
150
|
-
|
143
|
+
io.each { |line|
|
144
|
+
progress << offset = pos ? pos += line.bytesize : io.pos
|
151
145
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
146
|
+
line =~ @skip ? nil : line =~ @lir ?
|
147
|
+
command(:RECORD, $1 || $&) : begin
|
148
|
+
line.sub!(@cut, '') if @cut
|
149
|
+
forward(line, offset) unless line.empty?
|
150
|
+
end
|
151
|
+
}
|
157
152
|
}
|
153
|
+
|
154
|
+
io.close unless stdin?(path)
|
155
|
+
|
156
|
+
command(:EOF, path)
|
158
157
|
}
|
159
158
|
|
160
|
-
command(:
|
159
|
+
command(:EOT)
|
160
|
+
:skip_command
|
161
161
|
end
|
162
162
|
|
163
|
+
private
|
164
|
+
|
163
165
|
def filter(io, path, progress)
|
164
166
|
case @filter == true ? file_type(io, path) : @filter.to_s
|
165
167
|
when 'pdftotext' then filter_pdftotext(io, path, progress)
|
@@ -170,64 +172,51 @@ class Lingo
|
|
170
172
|
end
|
171
173
|
end
|
172
174
|
|
173
|
-
def filter_pdftotext(io, path, progress)
|
174
|
-
|
175
|
-
with_tempfile(name) { |tempfile|
|
176
|
-
pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
|
177
|
-
system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
|
175
|
+
def filter_pdftotext(io, path, progress, name = 'pdftotext')
|
176
|
+
cancel_filter(:PDF, name, :command) unless cmd = File.which(name)
|
178
177
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
178
|
+
with_tempfile(name) { |tempfile|
|
179
|
+
pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
|
180
|
+
system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
|
181
|
+
|
182
|
+
progress.init(File.size(txt_path)) if @progress
|
183
|
+
open_path(txt_path)
|
184
|
+
}
|
185
185
|
end
|
186
186
|
|
187
187
|
def filter_pdf(io)
|
188
|
-
|
189
|
-
|
190
|
-
else
|
191
|
-
cancel_filter(:PDF, 'pdf-reader')
|
192
|
-
end
|
188
|
+
Object.const_defined?(:PDF) && PDF.const_defined?(:Reader) ? text_enum(
|
189
|
+
PDF::Reader.new(io).pages) : cancel_filter(:PDF, 'pdf-reader')
|
193
190
|
end
|
194
191
|
|
195
|
-
def filter_html(io, xml = false)
|
196
|
-
|
197
|
-
|
198
|
-
if Object.const_defined?(:Nokogiri)
|
199
|
-
text_enum(Nokogiri.send(type, io, nil, ENC).children)
|
200
|
-
else
|
201
|
-
cancel_filter(type, :nokogiri)
|
202
|
-
end
|
192
|
+
def filter_html(io, xml = false, type = xml ? :XML : :HTML)
|
193
|
+
Object.const_defined?(:Nokogiri) ? text_enum(Nokogiri.send(type,
|
194
|
+
io, nil, @encoding).children) : cancel_filter(type, :nokogiri)
|
203
195
|
end
|
204
196
|
|
205
197
|
def file_type(io, path)
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
type
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
else
|
217
|
-
cancel("Filters not available. Please install the `ruby-filemagic' or `mime-types' gem.")
|
218
|
-
end
|
198
|
+
Object.const_defined?(:FileMagic) && io.respond_to?(:pos=) ?
|
199
|
+
FileMagic.fm(:mime, simplified: true).io(io, 256, true) :
|
200
|
+
Object.const_defined?(:MIME) && MIME.const_defined?(:Types) ?
|
201
|
+
(type = MIME::Types.of(path).first) ? type.content_type :
|
202
|
+
cancel_filters('File type could not be determined.') :
|
203
|
+
cancel_filters(please_install(:gem, 'ruby-filemagic', 'mime-types'))
|
204
|
+
end
|
205
|
+
|
206
|
+
def cancel_filters(msg)
|
207
|
+
cancel("Filters not available. #{msg}")
|
219
208
|
end
|
220
209
|
|
221
210
|
def cancel_filter(type, name, what = :gem)
|
222
|
-
cancel("#{type} filter not available.
|
211
|
+
cancel("#{type} filter not available. #{please_install(what, name)}")
|
223
212
|
end
|
224
213
|
|
225
|
-
def
|
226
|
-
|
214
|
+
def please_install(what, *names)
|
215
|
+
"Please install the `#{names.join("' or `")}' #{what}."
|
227
216
|
end
|
228
217
|
|
229
|
-
def
|
230
|
-
|
218
|
+
def cancel(msg)
|
219
|
+
throw(:cancel, msg)
|
231
220
|
end
|
232
221
|
|
233
222
|
def pos?(io)
|
@@ -235,10 +224,6 @@ class Lingo
|
|
235
224
|
rescue Errno::ESPIPE
|
236
225
|
end
|
237
226
|
|
238
|
-
def open_file(path)
|
239
|
-
File.open(path, 'rb', encoding: ENC)
|
240
|
-
end
|
241
|
-
|
242
227
|
def with_tempfile(name)
|
243
228
|
require 'tempfile'
|
244
229
|
|
@@ -263,31 +248,17 @@ class Lingo
|
|
263
248
|
|
264
249
|
@files = []
|
265
250
|
|
266
|
-
Array(get_key('files', '-')).each { |path|
|
267
|
-
|
268
|
-
add_files(File.expand_path(path), *args)
|
269
|
-
}
|
251
|
+
Array(get_key('files', '-')).each { |path| stdin?(path) ?
|
252
|
+
@files << path : add_files(File.expand_path(path), *args) }
|
270
253
|
end
|
271
254
|
|
272
255
|
def add_files(path, glob, recursive = false)
|
273
|
-
entries = Dir[path].sort
|
274
|
-
raise FileNotFoundError.new(path) if entries.empty?
|
256
|
+
raise FileNotFoundError.new(path) if (entries = Dir[path]).sort!.empty?
|
275
257
|
|
276
258
|
entries.each { |entry|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if File.file?(match) && File.fnmatch?(glob, match)
|
281
|
-
@files << match
|
282
|
-
end
|
283
|
-
}
|
284
|
-
else
|
285
|
-
add_files(File.join(entry, glob), glob)
|
286
|
-
end
|
287
|
-
else
|
288
|
-
@files << entry
|
289
|
-
end
|
290
|
-
}
|
259
|
+
!File.directory?(entry) ? @files << entry : !recursive ?
|
260
|
+
add_files(File.join(entry, glob), glob) : Find.find(entry) { |match|
|
261
|
+
@files << match if File.file?(match) && File.fnmatch?(glob, match) } }
|
291
262
|
end
|
292
263
|
|
293
264
|
end
|