lingo 1.8.6 → 1.8.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +40 -4
- data/README +22 -51
- data/Rakefile +3 -17
- data/config/lingo.cfg +24 -15
- data/config/lir.cfg +25 -16
- data/dict/de/test_muh.txt +6 -0
- data/dict/en/lingo-dic.txt +2 -3
- data/lang/de.lang +10 -9
- data/lang/en.lang +1 -1
- data/lib/lingo.rb +4 -4
- data/lib/lingo/attendee.rb +27 -7
- data/lib/lingo/attendee/analysis_filter.rb +81 -0
- data/lib/lingo/attendee/debug_filter.rb +42 -0
- data/lib/lingo/attendee/debugger.rb +2 -11
- data/lib/lingo/attendee/decomposer.rb +6 -3
- data/lib/lingo/attendee/formatter.rb +6 -6
- data/lib/lingo/attendee/hal_filter.rb +94 -0
- data/lib/lingo/attendee/lsi_filter.rb +99 -0
- data/lib/lingo/attendee/multi_worder.rb +69 -43
- data/lib/lingo/attendee/sequencer.rb +32 -19
- data/lib/lingo/attendee/synonymer.rb +2 -2
- data/lib/lingo/attendee/text_reader.rb +63 -92
- data/lib/lingo/attendee/text_writer.rb +12 -21
- data/lib/lingo/attendee/tokenizer.rb +32 -21
- data/lib/lingo/attendee/variator.rb +3 -3
- data/lib/lingo/attendee/vector_filter.rb +7 -9
- data/lib/lingo/attendee/word_searcher.rb +3 -3
- data/lib/lingo/buffered_attendee.rb +3 -36
- data/lib/lingo/config.rb +1 -1
- data/lib/lingo/ctl.rb +7 -155
- data/lib/lingo/ctl/analysis.rb +136 -0
- data/lib/lingo/ctl/files.rb +86 -0
- data/lib/lingo/ctl/other.rb +140 -0
- data/lib/lingo/database.rb +64 -60
- data/lib/lingo/database/crypter.rb +7 -5
- data/lib/lingo/error.rb +5 -4
- data/lib/lingo/language.rb +13 -5
- data/lib/lingo/language/grammar.rb +13 -7
- data/lib/lingo/language/token.rb +6 -0
- data/lib/lingo/language/word.rb +23 -36
- data/lib/lingo/language/word_form.rb +5 -1
- data/lib/lingo/srv.rb +2 -2
- data/lib/lingo/text_utils.rb +96 -0
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/views/index.erb +1 -1
- data/test/attendee/ts_decomposer.rb +23 -5
- data/test/attendee/ts_multi_worder.rb +66 -0
- data/test/attendee/ts_sequencer.rb +28 -4
- data/test/attendee/ts_text_reader.rb +20 -0
- data/test/attendee/ts_tokenizer.rb +20 -0
- data/test/attendee/ts_variator.rb +1 -1
- data/test/attendee/ts_word_searcher.rb +39 -3
- data/test/lir3.txt +12 -0
- data/test/ref/artikel.non +1 -12
- data/test/ref/artikel.seq +3 -1
- data/test/ref/artikel.vec +1 -0
- data/test/ref/artikel.vef +35 -34
- data/test/ref/artikel.ven +8 -7
- data/test/ref/artikel.ver +34 -33
- data/test/ref/artikel.vet +2573 -2563
- data/test/ref/lir.non +77 -78
- data/test/ref/lir.seq +9 -7
- data/test/ref/lir.syn +1 -1
- data/test/ref/lir.vec +41 -41
- data/test/ref/lir.vef +210 -210
- data/test/ref/lir.ven +46 -46
- data/test/ref/lir.ver +72 -72
- data/test/ref/lir.vet +329 -329
- data/test/ts_database.rb +166 -62
- data/test/ts_language.rb +23 -23
- metadata +53 -34
- data/lib/lingo/attendee/dehyphenizer.rb +0 -120
- data/lib/lingo/attendee/noneword_filter.rb +0 -115
- data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -0,0 +1,99 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class Lingo
|
28
|
+
|
29
|
+
class Attendee
|
30
|
+
|
31
|
+
class LsiFilter < DeferredAttendee
|
32
|
+
|
33
|
+
def init
|
34
|
+
require_lib('lsi4r')
|
35
|
+
|
36
|
+
@lex = get_re('lexicals', '[sy]')
|
37
|
+
@skip = get_ary('skip', DEFAULT_SKIP, :upcase)
|
38
|
+
|
39
|
+
@transform = get_key('transform', Lsi4R::DEFAULT_TRANSFORM)
|
40
|
+
@cutoff = get_flo('cut', Lsi4R::DEFAULT_CUTOFF)
|
41
|
+
|
42
|
+
@min = get_flo('min', false)
|
43
|
+
@abs = get_flo('abs', false)
|
44
|
+
@nul = get_flo('nul', false)
|
45
|
+
@new = get_key('new', true)
|
46
|
+
|
47
|
+
@sort = get_key('sort', false)
|
48
|
+
@sort.downcase! if @sort.respond_to?(:downcase!)
|
49
|
+
|
50
|
+
@docnum, @vectors = 0, Hash.new { |h, k| h[k] = [] }
|
51
|
+
end
|
52
|
+
|
53
|
+
def control(cmd, *)
|
54
|
+
:skip_command if cmd == :EOL
|
55
|
+
end
|
56
|
+
|
57
|
+
def control_deferred(cmd, *)
|
58
|
+
@docnum += 1 if TERMINALS.include?(cmd)
|
59
|
+
end
|
60
|
+
|
61
|
+
def process(obj)
|
62
|
+
if obj.is_a?(Word) && !@skip.include?(obj.attr)
|
63
|
+
vec = []
|
64
|
+
obj.each_lex(@lex) { |lex| vec << Unicode.downcase(lex.form) }
|
65
|
+
@vectors[@docnum].concat(vec) unless vec.empty?
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def send_lsi
|
72
|
+
lsi = Lsi4R.new(@vectors); @vectors.clear
|
73
|
+
|
74
|
+
if lsi.build(transform: @transform, cutoff: @cutoff)
|
75
|
+
options, vec = { min: @min, abs: @abs, nul: @nul, new: @new }, []
|
76
|
+
|
77
|
+
fmt = @sort ? @sort == 'sto' ?
|
78
|
+
'%s {%.5f}' : '%2$.5f %1$s' : '%s %.5f' unless @sort == 'normal'
|
79
|
+
|
80
|
+
yield !@sort ? lambda { |docnum|
|
81
|
+
lsi.each_norm(docnum, options) { |_, *v| forward(fmt % v) }
|
82
|
+
} : lambda { |docnum|
|
83
|
+
lsi.each_norm(docnum, options) { |_, *v| vec << v }
|
84
|
+
|
85
|
+
!fmt ? vec.sort!.each { |v, _| forward(v) } :
|
86
|
+
vec.sort_by { |v, w| [-w, v] }.each { |v| forward(fmt % v) }
|
87
|
+
|
88
|
+
vec.clear
|
89
|
+
}
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
alias_method :flush_deferred, :send_lsi
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -40,7 +40,9 @@ class Lingo
|
|
40
40
|
#
|
41
41
|
# === Mögliche Verlinkung
|
42
42
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, MultiWorder
|
43
|
-
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
|
43
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
|
44
|
+
# ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
|
45
|
+
# Vector_filter
|
44
46
|
#
|
45
47
|
# === Parameter
|
46
48
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -88,7 +90,7 @@ class Lingo
|
|
88
90
|
|
89
91
|
lex_src, lex_mod, d = nil, nil, lingo.dictionary_config['databases']
|
90
92
|
|
91
|
-
(mul_src =
|
93
|
+
(mul_src = get_ary('source')).each { |src|
|
92
94
|
s, m = d[src].values_at('use-lex', 'lex-mode')
|
93
95
|
|
94
96
|
if lex_src.nil? || lex_src == s
|
@@ -106,76 +108,99 @@ class Lingo
|
|
106
108
|
@lex_gra = grammar(lex_src, lex_mod)
|
107
109
|
|
108
110
|
@syn_dic = if @combine && has_key?('use-syn')
|
109
|
-
dictionary(
|
111
|
+
dictionary(get_ary('use-syn'), get_key('syn-mode', 'all'))
|
110
112
|
end
|
111
113
|
|
112
114
|
@expected_tokens_in_buffer, @eof_handling = 3, false
|
113
115
|
end
|
114
116
|
|
115
117
|
def control(cmd, *)
|
116
|
-
|
118
|
+
if [:RECORD, :EOF].include?(cmd)
|
119
|
+
@eof_handling = true
|
120
|
+
|
121
|
+
while process_buffer?(2)
|
122
|
+
process_buffer
|
123
|
+
end
|
124
|
+
|
125
|
+
forward_number_of_token
|
126
|
+
|
127
|
+
@eof_handling = false
|
128
|
+
end
|
117
129
|
end
|
118
130
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
131
|
+
private
|
132
|
+
|
133
|
+
def form_at(index)
|
134
|
+
obj = @buffer[index]
|
135
|
+
obj.form if obj.is_a?(WordForm) && obj.form != CHAR_PUNCT
|
136
|
+
end
|
137
|
+
|
138
|
+
def forward_number_of_token(len = default = @buffer.size, punct = !default)
|
139
|
+
begin
|
140
|
+
unless @buffer.empty?
|
141
|
+
forward(item = @buffer.delete_at(0))
|
142
|
+
len -= 1 unless punct && item.form == CHAR_PUNCT
|
143
|
+
end
|
144
|
+
end while len > 0
|
145
|
+
end
|
146
|
+
|
147
|
+
def process_buffer?(num = @expected_tokens_in_buffer)
|
148
|
+
@buffer.count { |item| item.form != CHAR_PUNCT } >= num
|
149
|
+
end
|
124
150
|
|
125
|
-
|
151
|
+
def process_buffer
|
152
|
+
if form_at(0)
|
153
|
+
if res = check_multiword(3, len = [])
|
154
|
+
if (max = len.first) <= 3
|
126
155
|
create_and_forward_multiword(3, res)
|
127
156
|
forward_number_of_token(3)
|
157
|
+
elsif !@eof_handling && @buffer.size < max
|
158
|
+
@expected_tokens_in_buffer = max
|
128
159
|
else
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
create_and_forward_multiword(l, r) unless r.empty?
|
135
|
-
} || 1)
|
136
|
-
|
137
|
-
@expected_tokens_in_buffer = 3
|
138
|
-
process_buffer if process_buffer?
|
139
|
-
end
|
160
|
+
forward_number_of_token(len.find { |l|
|
161
|
+
create_and_forward_multiword(l) } || 1)
|
162
|
+
|
163
|
+
@expected_tokens_in_buffer = 3
|
164
|
+
process_buffer if process_buffer?
|
140
165
|
end
|
141
166
|
|
142
167
|
return
|
143
168
|
end
|
144
169
|
|
145
|
-
|
146
|
-
create_and_forward_multiword(2, res)
|
147
|
-
forward_number_of_token(1)
|
148
|
-
end
|
170
|
+
create_and_forward_multiword(2) && forward_number_of_token(1)
|
149
171
|
end
|
150
172
|
|
151
173
|
forward_number_of_token(1, false)
|
152
174
|
@expected_tokens_in_buffer = 3
|
153
175
|
end
|
154
176
|
|
155
|
-
|
177
|
+
def create_and_forward_multiword(len, lex = check_multiword(len))
|
178
|
+
return unless lex
|
156
179
|
|
157
|
-
def create_and_forward_multiword(len, lex)
|
158
180
|
pos, parts = 0, []
|
159
181
|
|
160
182
|
begin
|
161
|
-
if
|
162
|
-
@buffer.delete_at(pos)
|
163
|
-
parts[-1] += CHAR_PUNCT
|
164
|
-
else
|
183
|
+
if form = form_at(pos)
|
165
184
|
@buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
|
166
185
|
parts << form
|
167
186
|
pos += 1
|
187
|
+
else
|
188
|
+
@buffer.delete_at(pos)
|
189
|
+
parts[-1] += CHAR_PUNCT
|
168
190
|
end
|
169
191
|
end while pos < len
|
170
192
|
|
171
|
-
|
172
|
-
WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
|
193
|
+
wrd = Word.new_lexicals(parts.join(' '),
|
194
|
+
WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
|
195
|
+
wrd.token = @buffer[pos - 1].token
|
196
|
+
|
197
|
+
forward(wrd)
|
173
198
|
end
|
174
199
|
|
175
|
-
def
|
176
|
-
return
|
200
|
+
def check_multiword(len, lst = nil)
|
201
|
+
return unless process_buffer?(len)
|
177
202
|
|
178
|
-
seq = []
|
203
|
+
seq, mul, sep = [], [], ' '
|
179
204
|
|
180
205
|
@buffer.each { |obj|
|
181
206
|
next seq << [obj] unless obj.is_a?(WordForm)
|
@@ -195,17 +220,18 @@ class Lingo
|
|
195
220
|
}
|
196
221
|
|
197
222
|
if @combine
|
198
|
-
mul = []
|
199
|
-
|
200
223
|
seq.shift.product(*seq) { |key|
|
201
|
-
@mul_dic.select(key.join(
|
224
|
+
@mul_dic.select(key.join(sep), mul)
|
202
225
|
break unless @all || mul.empty?
|
203
226
|
} && mul.uniq!
|
204
|
-
|
205
|
-
mul
|
206
227
|
else
|
207
|
-
@mul_dic.select(seq.map! { |i,| i }.join(
|
228
|
+
@mul_dic.select(seq.map! { |i,| i }.join(sep), mul)
|
208
229
|
end
|
230
|
+
|
231
|
+
lst.push(seq.size).concat(mul.map { |r| r.is_a?(Lexical) ?
|
232
|
+
r.form.count(sep) + 1 : r }).sort!.reverse!.uniq! if lst
|
233
|
+
|
234
|
+
mul unless mul.empty?
|
209
235
|
end
|
210
236
|
|
211
237
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -43,7 +43,9 @@ class Lingo
|
|
43
43
|
#
|
44
44
|
# === Mögliche Verlinkung
|
45
45
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
46
|
-
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
|
46
|
+
# Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
|
47
|
+
# ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
|
48
|
+
# Vector_filter
|
47
49
|
#
|
48
50
|
# === Parameter
|
49
51
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -95,15 +97,20 @@ class Lingo
|
|
95
97
|
|
96
98
|
class Sequencer < BufferedAttendee
|
97
99
|
|
100
|
+
UNK = %w[#]
|
101
|
+
NUM = %w[0]
|
102
|
+
|
103
|
+
CLS = /[[:alpha:]#{NUM.join}]/o
|
104
|
+
|
98
105
|
def init
|
99
|
-
@stopper =
|
106
|
+
@stopper = get_ary('stopper', DEFAULT_SKIP)
|
100
107
|
.push(WA_UNKNOWN, WA_UNKMULPART)
|
101
108
|
|
102
109
|
@mwc = get_key('multiword', LA_MULTIWORD)
|
103
110
|
@cls = []
|
104
111
|
|
105
112
|
@seq = get_key('sequences').map { |str, fmt|
|
106
|
-
@cls.concat(cls = (str = str.downcase).scan(
|
113
|
+
@cls.concat(cls = (str = str.downcase).scan(CLS))
|
107
114
|
|
108
115
|
(str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
|
109
116
|
fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
|
@@ -124,7 +131,7 @@ class Lingo
|
|
124
131
|
|
125
132
|
def process_buffer
|
126
133
|
flush(@buffer.size < 2 ? @buffer : begin
|
127
|
-
arg
|
134
|
+
arg = [[], buf = [], map = [], @seq]
|
128
135
|
|
129
136
|
iter, skip, rewind = @buffer.each_with_index, 0, lambda {
|
130
137
|
iter.rewind; skip.times { iter.next }; skip = 0
|
@@ -142,16 +149,17 @@ class Lingo
|
|
142
149
|
rewind.call
|
143
150
|
end
|
144
151
|
|
145
|
-
att = obj.is_a?(
|
152
|
+
att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
|
153
|
+
obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
|
146
154
|
|
147
|
-
if (att &= cls).empty?
|
155
|
+
if (att &= @cls).empty?
|
148
156
|
find_seq(*arg)
|
149
157
|
rewind.call if skip > 0
|
150
158
|
else
|
151
|
-
|
152
|
-
|
153
|
-
skip = idx + 1
|
154
|
-
|
159
|
+
obj.each_lex(@mwc) { |lex|
|
160
|
+
lex.form.count(' ').succ.times { iter.next }
|
161
|
+
break skip = idx + 1
|
162
|
+
} unless tok
|
155
163
|
|
156
164
|
buf << obj
|
157
165
|
map << att
|
@@ -178,18 +186,18 @@ class Lingo
|
|
178
186
|
while pos = q.index(str, pos || 0)
|
179
187
|
_str, _cls = [$&, $&.chars] unless cls
|
180
188
|
|
181
|
-
args.clear
|
189
|
+
_tok = nil; args.clear
|
182
190
|
|
183
191
|
_cls.each_with_index { |wc, i|
|
184
|
-
buf[pos + i]
|
185
|
-
|
186
|
-
|
192
|
+
obj = buf[pos + i];_tok ||= obj.token
|
193
|
+
|
194
|
+
args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
|
195
|
+
break lex.form if lex.attr == wc } : obj.form or break
|
187
196
|
} or next
|
188
197
|
|
189
|
-
forms <<
|
198
|
+
forms << [_str, _tok,
|
190
199
|
fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
|
191
|
-
fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')
|
192
|
-
)
|
200
|
+
fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
|
193
201
|
|
194
202
|
pos += 1
|
195
203
|
end
|
@@ -197,7 +205,12 @@ class Lingo
|
|
197
205
|
}.clear
|
198
206
|
|
199
207
|
forms.uniq!
|
200
|
-
|
208
|
+
|
209
|
+
forms.each { |s, t, f|
|
210
|
+
wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
|
211
|
+
wrd.pattern, wrd.token = s, t
|
212
|
+
mat << wrd
|
213
|
+
}
|
201
214
|
|
202
215
|
buf.clear
|
203
216
|
mat
|
@@ -36,7 +36,7 @@ class Lingo
|
|
36
36
|
#
|
37
37
|
# === Mögliche Verlinkung
|
38
38
|
# Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
|
39
|
-
# Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer,
|
39
|
+
# Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Vector_filter
|
40
40
|
#
|
41
41
|
# === Parameter
|
42
42
|
# Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
|
@@ -75,7 +75,7 @@ class Lingo
|
|
75
75
|
def init
|
76
76
|
set_dic
|
77
77
|
@com = !get_key('compound-parts', false)
|
78
|
-
@skip =
|
78
|
+
@skip = get_ary('skip', WA_UNKNOWN, :upcase)
|
79
79
|
end
|
80
80
|
|
81
81
|
def control(*)
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Lingo -- A full-featured automatic indexing system #
|
7
7
|
# #
|
8
8
|
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
-
# Copyright (C) 2007-
|
9
|
+
# Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
|
10
10
|
# #
|
11
11
|
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
12
|
# terms of the GNU Affero General Public License as published by the Free #
|
@@ -107,10 +107,14 @@ class Lingo
|
|
107
107
|
|
108
108
|
class TextReader < self
|
109
109
|
|
110
|
+
include TextUtils
|
111
|
+
|
110
112
|
# TODO: FILE/LIR-FILE (?)
|
111
113
|
def init
|
112
114
|
get_files
|
113
115
|
|
116
|
+
@encoding = get_enc
|
117
|
+
|
114
118
|
@filter = get_key('filter', false)
|
115
119
|
@progress = get_key('progress', false)
|
116
120
|
|
@@ -124,42 +128,40 @@ class Lingo
|
|
124
128
|
end
|
125
129
|
|
126
130
|
def control(cmd, *)
|
127
|
-
|
128
|
-
command(:LIR) if @lir
|
129
|
-
@files.each { |i| spool(i) }
|
130
|
-
|
131
|
-
command(:EOT)
|
132
|
-
:skip_command
|
133
|
-
end
|
134
|
-
end
|
131
|
+
return unless cmd == :TALK
|
135
132
|
|
136
|
-
|
133
|
+
command(:LIR) if @lir
|
137
134
|
|
138
|
-
|
139
|
-
|
135
|
+
@files.each { |path|
|
136
|
+
command(:FILE, path)
|
140
137
|
|
141
|
-
|
142
|
-
stdin = lingo.config.stdin.set_encoding(ENC)
|
143
|
-
@progress ? StringIO.new(stdin.read) : stdin
|
144
|
-
end
|
138
|
+
io = stdin?(path) ? open_stdin : open_path(name = path)
|
145
139
|
|
146
|
-
|
147
|
-
|
140
|
+
Progress.new(self, @progress && io.size, name) { |progress|
|
141
|
+
pos = 0 unless pos?(io = filter(io, path, progress))
|
148
142
|
|
149
|
-
|
150
|
-
|
143
|
+
io.each { |line|
|
144
|
+
progress << offset = pos ? pos += line.bytesize : io.pos
|
151
145
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
146
|
+
line =~ @skip ? nil : line =~ @lir ?
|
147
|
+
command(:RECORD, $1 || $&) : begin
|
148
|
+
line.sub!(@cut, '') if @cut
|
149
|
+
forward(line, offset) unless line.empty?
|
150
|
+
end
|
151
|
+
}
|
157
152
|
}
|
153
|
+
|
154
|
+
io.close unless stdin?(path)
|
155
|
+
|
156
|
+
command(:EOF, path)
|
158
157
|
}
|
159
158
|
|
160
|
-
command(:
|
159
|
+
command(:EOT)
|
160
|
+
:skip_command
|
161
161
|
end
|
162
162
|
|
163
|
+
private
|
164
|
+
|
163
165
|
def filter(io, path, progress)
|
164
166
|
case @filter == true ? file_type(io, path) : @filter.to_s
|
165
167
|
when 'pdftotext' then filter_pdftotext(io, path, progress)
|
@@ -170,64 +172,51 @@ class Lingo
|
|
170
172
|
end
|
171
173
|
end
|
172
174
|
|
173
|
-
def filter_pdftotext(io, path, progress)
|
174
|
-
|
175
|
-
with_tempfile(name) { |tempfile|
|
176
|
-
pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
|
177
|
-
system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
|
175
|
+
def filter_pdftotext(io, path, progress, name = 'pdftotext')
|
176
|
+
cancel_filter(:PDF, name, :command) unless cmd = File.which(name)
|
178
177
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
178
|
+
with_tempfile(name) { |tempfile|
|
179
|
+
pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
|
180
|
+
system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
|
181
|
+
|
182
|
+
progress.init(File.size(txt_path)) if @progress
|
183
|
+
open_path(txt_path)
|
184
|
+
}
|
185
185
|
end
|
186
186
|
|
187
187
|
def filter_pdf(io)
|
188
|
-
|
189
|
-
|
190
|
-
else
|
191
|
-
cancel_filter(:PDF, 'pdf-reader')
|
192
|
-
end
|
188
|
+
Object.const_defined?(:PDF) && PDF.const_defined?(:Reader) ? text_enum(
|
189
|
+
PDF::Reader.new(io).pages) : cancel_filter(:PDF, 'pdf-reader')
|
193
190
|
end
|
194
191
|
|
195
|
-
def filter_html(io, xml = false)
|
196
|
-
|
197
|
-
|
198
|
-
if Object.const_defined?(:Nokogiri)
|
199
|
-
text_enum(Nokogiri.send(type, io, nil, ENC).children)
|
200
|
-
else
|
201
|
-
cancel_filter(type, :nokogiri)
|
202
|
-
end
|
192
|
+
def filter_html(io, xml = false, type = xml ? :XML : :HTML)
|
193
|
+
Object.const_defined?(:Nokogiri) ? text_enum(Nokogiri.send(type,
|
194
|
+
io, nil, @encoding).children) : cancel_filter(type, :nokogiri)
|
203
195
|
end
|
204
196
|
|
205
197
|
def file_type(io, path)
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
type
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
else
|
217
|
-
cancel("Filters not available. Please install the `ruby-filemagic' or `mime-types' gem.")
|
218
|
-
end
|
198
|
+
Object.const_defined?(:FileMagic) && io.respond_to?(:pos=) ?
|
199
|
+
FileMagic.fm(:mime, simplified: true).io(io, 256, true) :
|
200
|
+
Object.const_defined?(:MIME) && MIME.const_defined?(:Types) ?
|
201
|
+
(type = MIME::Types.of(path).first) ? type.content_type :
|
202
|
+
cancel_filters('File type could not be determined.') :
|
203
|
+
cancel_filters(please_install(:gem, 'ruby-filemagic', 'mime-types'))
|
204
|
+
end
|
205
|
+
|
206
|
+
def cancel_filters(msg)
|
207
|
+
cancel("Filters not available. #{msg}")
|
219
208
|
end
|
220
209
|
|
221
210
|
def cancel_filter(type, name, what = :gem)
|
222
|
-
cancel("#{type} filter not available.
|
211
|
+
cancel("#{type} filter not available. #{please_install(what, name)}")
|
223
212
|
end
|
224
213
|
|
225
|
-
def
|
226
|
-
|
214
|
+
def please_install(what, *names)
|
215
|
+
"Please install the `#{names.join("' or `")}' #{what}."
|
227
216
|
end
|
228
217
|
|
229
|
-
def
|
230
|
-
|
218
|
+
def cancel(msg)
|
219
|
+
throw(:cancel, msg)
|
231
220
|
end
|
232
221
|
|
233
222
|
def pos?(io)
|
@@ -235,10 +224,6 @@ class Lingo
|
|
235
224
|
rescue Errno::ESPIPE
|
236
225
|
end
|
237
226
|
|
238
|
-
def open_file(path)
|
239
|
-
File.open(path, 'rb', encoding: ENC)
|
240
|
-
end
|
241
|
-
|
242
227
|
def with_tempfile(name)
|
243
228
|
require 'tempfile'
|
244
229
|
|
@@ -263,31 +248,17 @@ class Lingo
|
|
263
248
|
|
264
249
|
@files = []
|
265
250
|
|
266
|
-
Array(get_key('files', '-')).each { |path|
|
267
|
-
|
268
|
-
add_files(File.expand_path(path), *args)
|
269
|
-
}
|
251
|
+
Array(get_key('files', '-')).each { |path| stdin?(path) ?
|
252
|
+
@files << path : add_files(File.expand_path(path), *args) }
|
270
253
|
end
|
271
254
|
|
272
255
|
def add_files(path, glob, recursive = false)
|
273
|
-
entries = Dir[path].sort
|
274
|
-
raise FileNotFoundError.new(path) if entries.empty?
|
256
|
+
raise FileNotFoundError.new(path) if (entries = Dir[path]).sort!.empty?
|
275
257
|
|
276
258
|
entries.each { |entry|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if File.file?(match) && File.fnmatch?(glob, match)
|
281
|
-
@files << match
|
282
|
-
end
|
283
|
-
}
|
284
|
-
else
|
285
|
-
add_files(File.join(entry, glob), glob)
|
286
|
-
end
|
287
|
-
else
|
288
|
-
@files << entry
|
289
|
-
end
|
290
|
-
}
|
259
|
+
!File.directory?(entry) ? @files << entry : !recursive ?
|
260
|
+
add_files(File.join(entry, glob), glob) : Find.find(entry) { |match|
|
261
|
+
@files << match if File.file?(match) && File.fnmatch?(glob, match) } }
|
291
262
|
end
|
292
263
|
|
293
264
|
end
|