lingo 1.8.2 → 1.8.3
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +33 -0
- data/README +6 -5
- data/Rakefile +6 -4
- data/{lib/lingo/cachable.rb → bin/lingosrv} +30 -58
- data/bin/lingoweb +30 -0
- data/de.lang +2 -13
- data/en/lingo-irr.txt +266 -0
- data/en/lingo-wdn.txt +37319 -0
- data/en.lang +2 -15
- data/lib/lingo/app.rb +82 -0
- data/lib/lingo/attendee/abbreviator.rb +22 -26
- data/lib/lingo/attendee/debugger.rb +8 -4
- data/lib/lingo/attendee/decomposer.rb +0 -1
- data/lib/lingo/attendee/dehyphenizer.rb +2 -2
- data/lib/lingo/attendee/multi_worder.rb +20 -13
- data/lib/lingo/attendee/noneword_filter.rb +2 -7
- data/lib/lingo/attendee/sequencer.rb +43 -19
- data/lib/lingo/attendee/stemmer/porter.rb +2 -2
- data/lib/lingo/attendee/stemmer.rb +1 -1
- data/lib/lingo/attendee/synonymer.rb +1 -9
- data/lib/lingo/attendee/text_reader.rb +42 -29
- data/lib/lingo/attendee/text_writer.rb +3 -6
- data/lib/lingo/attendee/tokenizer.rb +87 -69
- data/lib/lingo/attendee/variator.rb +7 -5
- data/lib/lingo/attendee/vector_filter.rb +11 -11
- data/lib/lingo/attendee/word_searcher.rb +1 -9
- data/lib/lingo/attendee.rb +24 -105
- data/lib/lingo/buffered_attendee.rb +2 -9
- data/lib/lingo/call.rb +18 -13
- data/lib/lingo/cli.rb +5 -10
- data/lib/lingo/config.rb +40 -7
- data/lib/lingo/ctl.rb +69 -57
- data/lib/lingo/database/hash_store.rb +9 -4
- data/lib/lingo/database/sdbm_store.rb +4 -7
- data/lib/lingo/database/source/multi_key.rb +1 -1
- data/lib/lingo/database/source/multi_value.rb +1 -1
- data/lib/lingo/database/source.rb +2 -20
- data/lib/lingo/database.rb +30 -19
- data/lib/lingo/debug.rb +79 -0
- data/lib/lingo/{core_ext.rb → language/char.rb} +43 -42
- data/lib/lingo/language/dictionary.rb +38 -46
- data/lib/lingo/language/grammar.rb +40 -57
- data/lib/lingo/language/lexical.rb +4 -7
- data/lib/lingo/language/lexical_hash.rb +17 -35
- data/lib/lingo/language/token.rb +4 -0
- data/lib/lingo/language/word.rb +7 -8
- data/lib/lingo/language/word_form.rb +4 -4
- data/lib/lingo/language.rb +2 -1
- data/lib/lingo/srv/config.ru +4 -0
- data/lib/lingo/srv/lingosrv.cfg +14 -0
- data/lib/lingo/{reportable.rb → srv.rb} +59 -61
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/config.ru +4 -0
- data/lib/lingo/web/lingoweb.cfg +14 -0
- data/lib/lingo/web/public/lingo.png +0 -0
- data/lib/lingo/web/public/lingoweb.css +74 -0
- data/lib/lingo/web/views/index.erb +92 -0
- data/lib/lingo/web.rb +94 -0
- data/lib/lingo.rb +27 -29
- data/lingo.cfg +1 -1
- data/lir.cfg +24 -0
- data/ru/lingo-dic.txt +22342 -0
- data/ru/lingo-mul.txt +5151 -0
- data/ru/lingo-syn.txt +0 -0
- data/ru.lang +99 -0
- data/test/attendee/ts_sequencer.rb +2 -2
- data/test/attendee/ts_text_reader.rb +36 -2
- data/test/attendee/ts_text_writer.rb +6 -6
- data/test/lir.vec +3 -3
- data/test/test_helper.rb +104 -102
- data/test/ts_database.rb +1 -1
- data/test/ts_language.rb +55 -96
- data/txt/artikel-ru.txt +45 -0
- data/txt/lir.txt +1 -3
- metadata +143 -83
- data/TODO +0 -23
data/en.lang
CHANGED
@@ -43,18 +43,16 @@
|
|
43
43
|
# lingo language definition
|
44
44
|
---
|
45
45
|
language:
|
46
|
-
|
47
46
|
name: 'Englisch'
|
48
47
|
|
49
48
|
dictionary:
|
50
|
-
|
51
49
|
databases:
|
52
|
-
|
53
50
|
# Systemwörterbücher
|
54
51
|
sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
|
55
52
|
sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
|
56
53
|
sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
|
57
|
-
|
54
|
+
sys-irr: { name: en/lingo-irr.txt, txt-format: WordClass, separator: '=' }
|
55
|
+
sys-wdn: { name: en/lingo-wdn.txt, txt-format: WordClass, separator: '=' }
|
58
56
|
# Benutzerwörterbücher
|
59
57
|
usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
|
60
58
|
|
@@ -77,17 +75,6 @@ language:
|
|
77
75
|
- [f, ""]
|
78
76
|
|
79
77
|
attendees:
|
80
|
-
tokenizer:
|
81
|
-
regulars:
|
82
|
-
- _char_: '_baslat_|_lat1sp_|_latexa_|_latexb_|_ipaext_'
|
83
|
-
- NUMS: '[+-]?(\d{4,}|\d{1,3}(\.\d{3,3})*)(\.|(,\d+)?%?)'
|
84
|
-
- URLS: '((mailto:|(news|http|https|ftp|ftps)://)\S+|^(www(\.\S+)+)|[^\s.]+([\._]\S+)+@\S+(\.\S+)+)'
|
85
|
-
- ABRV: '(((_char_)+\.)+)(_char_)+'
|
86
|
-
- WORD: '(_char_|_digit_|\-)+'
|
87
|
-
- PUNC: '([!,\.:;?]|\xc2\xa1|\xc2\xbf)'
|
88
|
-
- OTHR: '([\"#$%&\x27()*\+\-/<=>@\[\\\]^_{|}~]|\xc2\xa2|\xc2\xa3|\xc2\xa4|\xc2\xa5|\xc2\xa6|\xc2\xa7|\xc2\xa8|\xc2\xa9|\xc2\xaa|\xc2\xab|\xc2\xac|\xc2\xae|\xc2\xaf|\xc2\xb0|\xc2\xb1|\xc2\xb2|\xc2\xb3|\xc2\xb4|\xc2\xb5|\xc2\xb6|\xc2\xb7|\xc2\xb8|\xc2\xb9|\xc2\xba|\xc2\xbb|\xc2\xbc|\xc2\xbd|\xc2\xbe|\xc3\x97|\xc3\xb7)'
|
89
|
-
- HELP: '[^ ]*'
|
90
|
-
|
91
78
|
variator:
|
92
79
|
variations:
|
93
80
|
- [ ieh, sch ]
|
data/lib/lingo/app.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'optparse'
|
28
|
+
require 'shellwords'
|
29
|
+
require 'sinatra/base'
|
30
|
+
|
31
|
+
class Lingo
|
32
|
+
|
33
|
+
class App < Sinatra::Base
|
34
|
+
|
35
|
+
class << self
|
36
|
+
|
37
|
+
def init_app(file, *args, &block)
|
38
|
+
set :root, File.chomp_ext(file)
|
39
|
+
parse_options(*args, &block)
|
40
|
+
end
|
41
|
+
|
42
|
+
def parse_options(lingo_options = false)
|
43
|
+
argv, banner = [], "Usage: #{$0} [-h|--help] [sinatra-options]"
|
44
|
+
while arg = ARGV.shift and arg != '--'; argv << arg; end
|
45
|
+
|
46
|
+
if lingo_options || block_given?
|
47
|
+
banner << ' [-- lingo-options]'
|
48
|
+
|
49
|
+
opts = ENV["LINGO_#{name.split('::').last.upcase}_OPTS"]
|
50
|
+
ARGV.unshift(*Shellwords.shellsplit(opts)) if opts
|
51
|
+
|
52
|
+
ARGV.unshift(*lingo_options) if lingo_options.is_a?(Array)
|
53
|
+
end
|
54
|
+
|
55
|
+
OptionParser.new(banner, 16) { |o|
|
56
|
+
o.on('-p port', 'set the port (default is 4567)') { |v| set :port, Integer(v) }
|
57
|
+
o.on('-o addr', 'set the host (default is 0.0.0.0)') { |v| set :bind, v }
|
58
|
+
o.on('-e env', 'set the environment (default is development)') { |v| set :environment, v.to_sym }
|
59
|
+
o.on('-s server', 'specify rack server/handler (default is thin)') { |v| set :server, v }
|
60
|
+
o.on('-x', 'turn on the mutex lock (default is off)') { set :lock, true }
|
61
|
+
}.parse!(argv)
|
62
|
+
|
63
|
+
ARGV.unshift(*yield) if block_given?
|
64
|
+
end
|
65
|
+
|
66
|
+
def rackup(name)
|
67
|
+
file = File.join(File.dirname(__FILE__), name, 'config.ru')
|
68
|
+
file if File.readable?(file)
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
72
|
+
|
73
|
+
def to_json(q, r)
|
74
|
+
q, r = 'q', 'Required parameter -- Input string' unless q
|
75
|
+
|
76
|
+
content_type :json
|
77
|
+
{ q => r }.to_json
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
@@ -68,7 +68,7 @@ class Lingo
|
|
68
68
|
# out> *EOL('test.txt')
|
69
69
|
# out> *EOF('test.txt')
|
70
70
|
|
71
|
-
class Abbreviator <
|
71
|
+
class Abbreviator < self
|
72
72
|
|
73
73
|
protected
|
74
74
|
|
@@ -77,36 +77,32 @@ class Lingo
|
|
77
77
|
end
|
78
78
|
|
79
79
|
def control(cmd, param)
|
80
|
-
|
81
|
-
process_buffer
|
80
|
+
send_abbr(nil) if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
|
82
81
|
end
|
83
82
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if form = form_at(-2, Token)
|
97
|
-
inc('Anzahl gesuchter Abkürzungen')
|
98
|
-
|
99
|
-
if (abbr = find_word(form)).identified?
|
100
|
-
inc('Anzahl gefundener Abkürzungen')
|
101
|
-
|
102
|
-
abbr.form += CHAR_PUNCT
|
103
|
-
|
104
|
-
@buffer[-2] = abbr
|
105
|
-
@buffer.delete_at(-1)
|
83
|
+
def process(obj)
|
84
|
+
if obj.is_a?(Token)
|
85
|
+
if obj.form == CHAR_PUNCT
|
86
|
+
if @abbr && (abbr = find_word(form = @abbr.form)).identified?
|
87
|
+
form << CHAR_PUNCT unless form.end_with?(CHAR_PUNCT)
|
88
|
+
send_abbr(abbr)
|
89
|
+
else
|
90
|
+
send_abbr(@abbr)
|
91
|
+
forward(obj)
|
92
|
+
end
|
93
|
+
else
|
94
|
+
send_abbr(@abbr, obj)
|
106
95
|
end
|
96
|
+
else
|
97
|
+
send_abbr(obj)
|
107
98
|
end
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
108
102
|
|
109
|
-
|
103
|
+
def send_abbr(abbr, obj = nil)
|
104
|
+
@abbr = obj
|
105
|
+
forward(abbr) if abbr
|
110
106
|
end
|
111
107
|
|
112
108
|
end
|
@@ -96,16 +96,20 @@ class Lingo
|
|
96
96
|
end
|
97
97
|
|
98
98
|
def control(cmd, param)
|
99
|
-
|
100
|
-
warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
|
101
|
-
end
|
99
|
+
debug(AgendaItem.new(cmd, param), @cmd_eval)
|
102
100
|
end
|
103
101
|
|
104
102
|
def process(obj)
|
105
|
-
|
103
|
+
debug(obj, @obj_eval)
|
106
104
|
forward(obj)
|
107
105
|
end
|
108
106
|
|
107
|
+
private
|
108
|
+
|
109
|
+
def debug(obj, cond)
|
110
|
+
warn "#{@prompt} #{obj.inspect}" if eval(cond)
|
111
|
+
end
|
112
|
+
|
109
113
|
end
|
110
114
|
|
111
115
|
end
|
@@ -89,7 +89,7 @@ class Lingo
|
|
89
89
|
if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
|
90
90
|
(c = b.get_class(/./).first) && @skip.include?(c.attr)
|
91
91
|
)
|
92
|
-
a, b = ab.map!
|
92
|
+
a, b = ab.map! { |i| i.form }
|
93
93
|
|
94
94
|
word = dehyphenize(a.chomp(h) + b)
|
95
95
|
word = dehyphenize(a + b) unless dehyphenized?(word)
|
@@ -106,7 +106,7 @@ class Lingo
|
|
106
106
|
private
|
107
107
|
|
108
108
|
def dehyphenize(form)
|
109
|
-
find_word(form
|
109
|
+
find_word(form) { |i| i.identified? }
|
110
110
|
end
|
111
111
|
|
112
112
|
def dehyphenized?(word)
|
@@ -113,7 +113,7 @@ class Lingo
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def control(cmd, param)
|
116
|
-
control_multi(cmd
|
116
|
+
control_multi(cmd)
|
117
117
|
end
|
118
118
|
|
119
119
|
def process_buffer
|
@@ -177,29 +177,36 @@ class Lingo
|
|
177
177
|
def check_multiword_key(len)
|
178
178
|
return [] if valid_tokens_in_buffer < len
|
179
179
|
|
180
|
-
seq =
|
180
|
+
seq = []
|
181
|
+
|
182
|
+
@buffer.each { |obj|
|
181
183
|
next [obj] unless obj.is_a?(WordForm)
|
182
184
|
next if (form = obj.form) == CHAR_PUNCT
|
183
185
|
|
184
186
|
w = find_word(form, @lex_dic, @lex_gra)
|
185
187
|
l = w.lexicals
|
186
188
|
|
187
|
-
|
188
|
-
i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
|
189
|
-
i.map! { |j| j.form.downcase }.uniq!
|
190
|
-
}
|
191
|
-
}
|
189
|
+
i = w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup
|
192
190
|
|
193
|
-
|
194
|
-
|
191
|
+
@syn_dic.find_synonyms(w, i) if @syn_dic
|
192
|
+
i.map! { |j| Unicode.downcase(j.form) }.uniq!
|
193
|
+
|
194
|
+
seq << i
|
195
|
+
|
196
|
+
break unless seq.length < len
|
197
|
+
}
|
195
198
|
|
196
199
|
if @combine
|
197
|
-
|
198
|
-
|
200
|
+
mul = []
|
201
|
+
|
202
|
+
seq.shift.product(*seq) { |key|
|
203
|
+
@mul_dic.select(key.join(' '), mul)
|
199
204
|
break unless @all_keys || mul.empty?
|
200
|
-
} && mul.uniq!
|
205
|
+
} && mul.uniq!
|
206
|
+
|
207
|
+
mul
|
201
208
|
else
|
202
|
-
@mul_dic.select(seq.map!
|
209
|
+
@mul_dic.select(seq.map! { |i,| i }.join(' '))
|
203
210
|
end
|
204
211
|
end
|
205
212
|
|
@@ -87,9 +87,7 @@ class Lingo
|
|
87
87
|
|
88
88
|
def process(obj)
|
89
89
|
if obj.is_a?(Word) && obj.unknown?
|
90
|
-
|
91
|
-
|
92
|
-
non = obj.form.downcase
|
90
|
+
non = Unicode.downcase(obj.form)
|
93
91
|
@sort ? @nonewords << non : forward(non)
|
94
92
|
end
|
95
93
|
end
|
@@ -97,11 +95,8 @@ class Lingo
|
|
97
95
|
private
|
98
96
|
|
99
97
|
def send_nonewords
|
100
|
-
@nonewords.sort!
|
101
98
|
@nonewords.uniq!
|
102
|
-
|
103
|
-
add('Objekte gefiltert', @nonewords.size)
|
104
|
-
@nonewords.each(&method(:forward)).clear
|
99
|
+
flush(@nonewords.sort!)
|
105
100
|
end
|
106
101
|
|
107
102
|
end
|
@@ -97,11 +97,15 @@ class Lingo
|
|
97
97
|
|
98
98
|
def init
|
99
99
|
@stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
|
100
|
+
@classes = []
|
100
101
|
|
101
102
|
@seq = get_key('sequences').map { |string, format|
|
102
|
-
|
103
|
+
@classes.concat(classes = string.downcase!.chars.to_a)
|
104
|
+
[string, classes, format]
|
103
105
|
}
|
104
106
|
|
107
|
+
@classes.uniq!
|
108
|
+
|
105
109
|
raise MissingConfigError.new(:sequences) if @seq.empty?
|
106
110
|
end
|
107
111
|
|
@@ -115,42 +119,62 @@ class Lingo
|
|
115
119
|
end
|
116
120
|
|
117
121
|
def process_buffer
|
118
|
-
|
119
|
-
|
122
|
+
matches = []
|
123
|
+
|
124
|
+
if @buffer.size > 1
|
125
|
+
buf, map, seq, cls, unk = [], [], @seq, @classes, %w[#]
|
126
|
+
|
127
|
+
@buffer.each { |obj|
|
128
|
+
att = obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : unk
|
129
|
+
|
130
|
+
(att &= cls).empty? ? find_seq(buf, map, seq, matches) : begin
|
131
|
+
buf << obj
|
132
|
+
map << att
|
133
|
+
end
|
134
|
+
}
|
135
|
+
|
136
|
+
find_seq(buf, map, seq, matches)
|
137
|
+
end
|
138
|
+
|
139
|
+
flush(@buffer.concat(matches))
|
120
140
|
end
|
121
141
|
|
122
142
|
private
|
123
143
|
|
124
|
-
def
|
125
|
-
|
144
|
+
def find_seq(buf, map, seq, matches)
|
145
|
+
return if buf.empty?
|
126
146
|
|
127
|
-
|
128
|
-
|
129
|
-
|
147
|
+
match = Hash.new { |h, k| h[k] = [] }
|
148
|
+
|
149
|
+
map.replace(map.shift.product(*map))
|
150
|
+
map.map! { |i| i.join }
|
151
|
+
map.uniq!
|
130
152
|
|
131
|
-
map.
|
153
|
+
map.each { |q|
|
132
154
|
seq.each { |string, classes, format|
|
133
155
|
while pos = q.index(string, pos || 0)
|
134
|
-
|
135
|
-
|
136
|
-
fmt = format.dup
|
156
|
+
form = format.dup
|
137
157
|
|
138
158
|
classes.each_with_index { |wc, i|
|
139
159
|
buf[pos + i].lexicals.find { |l|
|
140
|
-
|
160
|
+
form.gsub!(i.succ.to_s, l.form) if l.attr == wc
|
141
161
|
} or break
|
142
162
|
} or next
|
143
163
|
|
144
|
-
|
145
|
-
|
146
|
-
pos += 1
|
164
|
+
match[pos += 1] << form
|
147
165
|
end
|
148
166
|
}
|
149
167
|
}
|
150
168
|
|
151
|
-
|
152
|
-
|
153
|
-
|
169
|
+
match.each_value { |forms|
|
170
|
+
forms.uniq!
|
171
|
+
forms.each { |form|
|
172
|
+
matches << Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
buf.clear
|
177
|
+
map.clear
|
154
178
|
end
|
155
179
|
|
156
180
|
end
|
@@ -290,7 +290,7 @@ class Lingo
|
|
290
290
|
case rule
|
291
291
|
when RULE_RE
|
292
292
|
cond, repl, goto = $1, $3, $4
|
293
|
-
stem = word[/(.+)#{$2
|
293
|
+
stem = word[/(.+)#{Unicode.downcase($2)}$/, 1] or next
|
294
294
|
when GOTO_RE
|
295
295
|
goto = $1
|
296
296
|
break
|
@@ -324,7 +324,7 @@ class Lingo
|
|
324
324
|
found, word = true, begin
|
325
325
|
stem[0...Integer(repl)]
|
326
326
|
rescue ArgumentError
|
327
|
-
stem <<
|
327
|
+
stem << Unicode.downcase(repl)
|
328
328
|
end
|
329
329
|
|
330
330
|
break
|
@@ -78,19 +78,11 @@ class Lingo
|
|
78
78
|
end
|
79
79
|
|
80
80
|
def control(cmd, param)
|
81
|
-
report_on(cmd, @dic)
|
82
81
|
end
|
83
82
|
|
84
83
|
def process(obj)
|
85
84
|
if obj.is_a?(Word) && !@skip.include?(obj.attr)
|
86
|
-
|
87
|
-
|
88
|
-
unless (syn = @dic.find_synonyms(obj)).empty?
|
89
|
-
inc('Anzahl erweiteter Wörter')
|
90
|
-
|
91
|
-
obj.add_lexicals(syn.tap(&:uniq!))
|
92
|
-
add('Anzahl gefundener Synonyme', syn.size)
|
93
|
-
end
|
85
|
+
obj.add_lexicals(@dic.find_synonyms(obj))
|
94
86
|
end
|
95
87
|
|
96
88
|
forward(obj)
|
@@ -115,15 +115,17 @@ class Lingo
|
|
115
115
|
@filter = get_key('filter', false)
|
116
116
|
@progress = get_key('progress', false)
|
117
117
|
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
@lingo.deprecate('lir-record-pattern', :records, self) if has_key?('lir-record-pattern')
|
119
|
+
|
120
|
+
@lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
|
121
|
+
@cut = get_re('fields', !!@lir, %r{^.+?:\s*})
|
122
|
+
@skip = get_re('skip', nil)
|
121
123
|
end
|
122
124
|
|
123
125
|
def control(cmd, param)
|
124
126
|
if cmd == STR_CMD_TALK
|
125
127
|
forward(STR_CMD_LIR, '') if @lir
|
126
|
-
@files.each(
|
128
|
+
@files.each { |i| spool(i) }
|
127
129
|
end
|
128
130
|
end
|
129
131
|
|
@@ -132,24 +134,22 @@ class Lingo
|
|
132
134
|
# Gibt eine Datei zeilenweise in den Ausgabekanal
|
133
135
|
def spool(path)
|
134
136
|
unless stdin = stdin?(path)
|
135
|
-
|
136
|
-
add('Anzahl Bytes', size = File.size(path))
|
137
|
-
|
138
|
-
size = nil unless @progress
|
137
|
+
size = File.size(path) if @progress
|
139
138
|
end
|
140
139
|
|
141
140
|
forward(STR_CMD_FILE, path)
|
142
141
|
|
143
142
|
ShowProgress.new(self, size, path) { |progress|
|
144
143
|
filter(path, stdin) { |line, pos|
|
145
|
-
inc('Anzahl Zeilen')
|
146
144
|
progress[pos]
|
147
145
|
|
148
146
|
line.chomp! if @chomp
|
147
|
+
next if line =~ @skip
|
149
148
|
|
150
149
|
if line =~ @lir
|
151
150
|
forward(STR_CMD_RECORD, $1)
|
152
151
|
else
|
152
|
+
line.sub!(@cut, '') if @cut
|
153
153
|
forward(line) unless line.empty?
|
154
154
|
end
|
155
155
|
}
|
@@ -159,13 +159,13 @@ class Lingo
|
|
159
159
|
end
|
160
160
|
|
161
161
|
def filter(path, stdin = stdin?(path))
|
162
|
-
io
|
163
|
-
@lingo.config.stdin.set_encoding(ENC)
|
164
|
-
|
165
|
-
|
166
|
-
|
162
|
+
io = stdin ?
|
163
|
+
@lingo.config.stdin.set_encoding(ENC) :
|
164
|
+
File.open(path, 'rb', encoding: ENC)
|
165
|
+
|
166
|
+
block = stdin || !@progress ?
|
167
|
+
lambda { |line| yield line, 0 } :
|
167
168
|
lambda { |line| yield line, io.pos }
|
168
|
-
]
|
169
169
|
|
170
170
|
case @filter == true ? file_type(path, io) : @filter.to_s
|
171
171
|
when /html/i then io = filter_html(io)
|
@@ -195,13 +195,16 @@ class Lingo
|
|
195
195
|
|
196
196
|
def file_type(path, io)
|
197
197
|
if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
|
198
|
-
FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
|
199
|
-
|
200
|
-
|
198
|
+
type = FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
|
199
|
+
io.rewind
|
200
|
+
type
|
201
201
|
elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
|
202
|
-
MIME::Types.of(path).first
|
203
|
-
|
204
|
-
|
202
|
+
if type = MIME::Types.of(path).first
|
203
|
+
type.content_type
|
204
|
+
else
|
205
|
+
warn 'Filters not available. File type could not be determined.'
|
206
|
+
nil
|
207
|
+
end
|
205
208
|
else
|
206
209
|
warn "Filters not available. Please install `ruby-filemagic' or `mime-types'."
|
207
210
|
nil
|
@@ -220,17 +223,27 @@ class Lingo
|
|
220
223
|
Array(get_key('files', '-')).each { |path|
|
221
224
|
stdin?(path) ? @files << path : add_files(path, *args)
|
222
225
|
}
|
223
|
-
|
224
|
-
@files.map!(&File.method(:expand_path))
|
225
|
-
@files.uniq!
|
226
226
|
end
|
227
227
|
|
228
228
|
def add_files(path, glob, recursive = false)
|
229
|
-
Dir[path].sort
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
229
|
+
entries = Dir[path].sort!
|
230
|
+
raise FileNotFoundError.new(path) if entries.empty?
|
231
|
+
|
232
|
+
entries.each { |entry|
|
233
|
+
if File.directory?(entry)
|
234
|
+
if recursive
|
235
|
+
Find.find(entry) { |match|
|
236
|
+
if File.file?(match) && File.fnmatch?(glob, match)
|
237
|
+
@files << File.expand_path(match)
|
238
|
+
end
|
239
|
+
}
|
240
|
+
else
|
241
|
+
add_files(File.join(entry, glob), glob)
|
242
|
+
end
|
243
|
+
else
|
244
|
+
@files << File.expand_path(entry)
|
245
|
+
end
|
246
|
+
}
|
234
247
|
end
|
235
248
|
|
236
249
|
class PDFFilter
|
@@ -100,7 +100,6 @@ class Lingo
|
|
100
100
|
if stdout?(@ext)
|
101
101
|
@filename, @file = @ext, @lingo.config.stdout
|
102
102
|
else
|
103
|
-
inc('Anzahl Dateien')
|
104
103
|
@file = File.open(@filename = File.set_ext(param, ".#{@ext}"), 'w')
|
105
104
|
end
|
106
105
|
|
@@ -116,14 +115,12 @@ class Lingo
|
|
116
115
|
@no_sep = true
|
117
116
|
|
118
117
|
unless @lir
|
119
|
-
inc('Anzahl Zeilen')
|
120
118
|
@file.puts unless @no_puts
|
121
119
|
end
|
122
120
|
when STR_CMD_EOF
|
123
121
|
flush_lir_buffer if @lir
|
124
122
|
|
125
123
|
unless stdout?(@filename)
|
126
|
-
add('Anzahl Bytes', @file.size)
|
127
124
|
@file.close
|
128
125
|
end
|
129
126
|
end
|
@@ -142,9 +139,9 @@ class Lingo
|
|
142
139
|
|
143
140
|
def flush_lir_buffer
|
144
141
|
unless @lir_rec_no.empty? || @lir_rec_buf.empty?
|
145
|
-
|
146
|
-
|
147
|
-
|
142
|
+
buf = [@lir_rec_no, @lir_rec_buf.join(@sep), "\n"]
|
143
|
+
@sep =~ /\n/ ? buf.insert(1, "\n").unshift('*') : buf.insert(1, '*')
|
144
|
+
@file.print(*buf)
|
148
145
|
end
|
149
146
|
|
150
147
|
@lir_rec_no = ''
|