lingo 1.8.2 → 1.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +33 -0
- data/README +6 -5
- data/Rakefile +6 -4
- data/{lib/lingo/cachable.rb → bin/lingosrv} +30 -58
- data/bin/lingoweb +30 -0
- data/de.lang +2 -13
- data/en/lingo-irr.txt +266 -0
- data/en/lingo-wdn.txt +37319 -0
- data/en.lang +2 -15
- data/lib/lingo/app.rb +82 -0
- data/lib/lingo/attendee/abbreviator.rb +22 -26
- data/lib/lingo/attendee/debugger.rb +8 -4
- data/lib/lingo/attendee/decomposer.rb +0 -1
- data/lib/lingo/attendee/dehyphenizer.rb +2 -2
- data/lib/lingo/attendee/multi_worder.rb +20 -13
- data/lib/lingo/attendee/noneword_filter.rb +2 -7
- data/lib/lingo/attendee/sequencer.rb +43 -19
- data/lib/lingo/attendee/stemmer/porter.rb +2 -2
- data/lib/lingo/attendee/stemmer.rb +1 -1
- data/lib/lingo/attendee/synonymer.rb +1 -9
- data/lib/lingo/attendee/text_reader.rb +42 -29
- data/lib/lingo/attendee/text_writer.rb +3 -6
- data/lib/lingo/attendee/tokenizer.rb +87 -69
- data/lib/lingo/attendee/variator.rb +7 -5
- data/lib/lingo/attendee/vector_filter.rb +11 -11
- data/lib/lingo/attendee/word_searcher.rb +1 -9
- data/lib/lingo/attendee.rb +24 -105
- data/lib/lingo/buffered_attendee.rb +2 -9
- data/lib/lingo/call.rb +18 -13
- data/lib/lingo/cli.rb +5 -10
- data/lib/lingo/config.rb +40 -7
- data/lib/lingo/ctl.rb +69 -57
- data/lib/lingo/database/hash_store.rb +9 -4
- data/lib/lingo/database/sdbm_store.rb +4 -7
- data/lib/lingo/database/source/multi_key.rb +1 -1
- data/lib/lingo/database/source/multi_value.rb +1 -1
- data/lib/lingo/database/source.rb +2 -20
- data/lib/lingo/database.rb +30 -19
- data/lib/lingo/debug.rb +79 -0
- data/lib/lingo/{core_ext.rb → language/char.rb} +43 -42
- data/lib/lingo/language/dictionary.rb +38 -46
- data/lib/lingo/language/grammar.rb +40 -57
- data/lib/lingo/language/lexical.rb +4 -7
- data/lib/lingo/language/lexical_hash.rb +17 -35
- data/lib/lingo/language/token.rb +4 -0
- data/lib/lingo/language/word.rb +7 -8
- data/lib/lingo/language/word_form.rb +4 -4
- data/lib/lingo/language.rb +2 -1
- data/lib/lingo/srv/config.ru +4 -0
- data/lib/lingo/srv/lingosrv.cfg +14 -0
- data/lib/lingo/{reportable.rb → srv.rb} +59 -61
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web/config.ru +4 -0
- data/lib/lingo/web/lingoweb.cfg +14 -0
- data/lib/lingo/web/public/lingo.png +0 -0
- data/lib/lingo/web/public/lingoweb.css +74 -0
- data/lib/lingo/web/views/index.erb +92 -0
- data/lib/lingo/web.rb +94 -0
- data/lib/lingo.rb +27 -29
- data/lingo.cfg +1 -1
- data/lir.cfg +24 -0
- data/ru/lingo-dic.txt +22342 -0
- data/ru/lingo-mul.txt +5151 -0
- data/ru/lingo-syn.txt +0 -0
- data/ru.lang +99 -0
- data/test/attendee/ts_sequencer.rb +2 -2
- data/test/attendee/ts_text_reader.rb +36 -2
- data/test/attendee/ts_text_writer.rb +6 -6
- data/test/lir.vec +3 -3
- data/test/test_helper.rb +104 -102
- data/test/ts_database.rb +1 -1
- data/test/ts_language.rb +55 -96
- data/txt/artikel-ru.txt +45 -0
- data/txt/lir.txt +1 -3
- metadata +143 -83
- data/TODO +0 -23
data/en.lang
CHANGED
@@ -43,18 +43,16 @@
|
|
43
43
|
# lingo language definition
|
44
44
|
---
|
45
45
|
language:
|
46
|
-
|
47
46
|
name: 'Englisch'
|
48
47
|
|
49
48
|
dictionary:
|
50
|
-
|
51
49
|
databases:
|
52
|
-
|
53
50
|
# Systemwörterbücher
|
54
51
|
sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
|
55
52
|
sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
|
56
53
|
sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
|
57
|
-
|
54
|
+
sys-irr: { name: en/lingo-irr.txt, txt-format: WordClass, separator: '=' }
|
55
|
+
sys-wdn: { name: en/lingo-wdn.txt, txt-format: WordClass, separator: '=' }
|
58
56
|
# Benutzerwörterbücher
|
59
57
|
usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
|
60
58
|
|
@@ -77,17 +75,6 @@ language:
|
|
77
75
|
- [f, ""]
|
78
76
|
|
79
77
|
attendees:
|
80
|
-
tokenizer:
|
81
|
-
regulars:
|
82
|
-
- _char_: '_baslat_|_lat1sp_|_latexa_|_latexb_|_ipaext_'
|
83
|
-
- NUMS: '[+-]?(\d{4,}|\d{1,3}(\.\d{3,3})*)(\.|(,\d+)?%?)'
|
84
|
-
- URLS: '((mailto:|(news|http|https|ftp|ftps)://)\S+|^(www(\.\S+)+)|[^\s.]+([\._]\S+)+@\S+(\.\S+)+)'
|
85
|
-
- ABRV: '(((_char_)+\.)+)(_char_)+'
|
86
|
-
- WORD: '(_char_|_digit_|\-)+'
|
87
|
-
- PUNC: '([!,\.:;?]|\xc2\xa1|\xc2\xbf)'
|
88
|
-
- OTHR: '([\"#$%&\x27()*\+\-/<=>@\[\\\]^_{|}~]|\xc2\xa2|\xc2\xa3|\xc2\xa4|\xc2\xa5|\xc2\xa6|\xc2\xa7|\xc2\xa8|\xc2\xa9|\xc2\xaa|\xc2\xab|\xc2\xac|\xc2\xae|\xc2\xaf|\xc2\xb0|\xc2\xb1|\xc2\xb2|\xc2\xb3|\xc2\xb4|\xc2\xb5|\xc2\xb6|\xc2\xb7|\xc2\xb8|\xc2\xb9|\xc2\xba|\xc2\xbb|\xc2\xbc|\xc2\xbd|\xc2\xbe|\xc3\x97|\xc3\xb7)'
|
89
|
-
- HELP: '[^ ]*'
|
90
|
-
|
91
78
|
variator:
|
92
79
|
variations:
|
93
80
|
- [ ieh, sch ]
|
data/lib/lingo/app.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# Lingo -- A full-featured automatic indexing system #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2005-2007 John Vorhauer #
|
9
|
+
# Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
|
10
|
+
# #
|
11
|
+
# Lingo is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU Affero General Public License as published by the Free #
|
13
|
+
# Software Foundation; either version 3 of the License, or (at your option) #
|
14
|
+
# any later version. #
|
15
|
+
# #
|
16
|
+
# Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
|
19
|
+
# more details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU Affero General Public License #
|
22
|
+
# along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'optparse'
|
28
|
+
require 'shellwords'
|
29
|
+
require 'sinatra/base'
|
30
|
+
|
31
|
+
class Lingo
|
32
|
+
|
33
|
+
class App < Sinatra::Base
|
34
|
+
|
35
|
+
class << self
|
36
|
+
|
37
|
+
def init_app(file, *args, &block)
|
38
|
+
set :root, File.chomp_ext(file)
|
39
|
+
parse_options(*args, &block)
|
40
|
+
end
|
41
|
+
|
42
|
+
def parse_options(lingo_options = false)
|
43
|
+
argv, banner = [], "Usage: #{$0} [-h|--help] [sinatra-options]"
|
44
|
+
while arg = ARGV.shift and arg != '--'; argv << arg; end
|
45
|
+
|
46
|
+
if lingo_options || block_given?
|
47
|
+
banner << ' [-- lingo-options]'
|
48
|
+
|
49
|
+
opts = ENV["LINGO_#{name.split('::').last.upcase}_OPTS"]
|
50
|
+
ARGV.unshift(*Shellwords.shellsplit(opts)) if opts
|
51
|
+
|
52
|
+
ARGV.unshift(*lingo_options) if lingo_options.is_a?(Array)
|
53
|
+
end
|
54
|
+
|
55
|
+
OptionParser.new(banner, 16) { |o|
|
56
|
+
o.on('-p port', 'set the port (default is 4567)') { |v| set :port, Integer(v) }
|
57
|
+
o.on('-o addr', 'set the host (default is 0.0.0.0)') { |v| set :bind, v }
|
58
|
+
o.on('-e env', 'set the environment (default is development)') { |v| set :environment, v.to_sym }
|
59
|
+
o.on('-s server', 'specify rack server/handler (default is thin)') { |v| set :server, v }
|
60
|
+
o.on('-x', 'turn on the mutex lock (default is off)') { set :lock, true }
|
61
|
+
}.parse!(argv)
|
62
|
+
|
63
|
+
ARGV.unshift(*yield) if block_given?
|
64
|
+
end
|
65
|
+
|
66
|
+
def rackup(name)
|
67
|
+
file = File.join(File.dirname(__FILE__), name, 'config.ru')
|
68
|
+
file if File.readable?(file)
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
72
|
+
|
73
|
+
def to_json(q, r)
|
74
|
+
q, r = 'q', 'Required parameter -- Input string' unless q
|
75
|
+
|
76
|
+
content_type :json
|
77
|
+
{ q => r }.to_json
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
@@ -68,7 +68,7 @@ class Lingo
|
|
68
68
|
# out> *EOL('test.txt')
|
69
69
|
# out> *EOF('test.txt')
|
70
70
|
|
71
|
-
class Abbreviator <
|
71
|
+
class Abbreviator < self
|
72
72
|
|
73
73
|
protected
|
74
74
|
|
@@ -77,36 +77,32 @@ class Lingo
|
|
77
77
|
end
|
78
78
|
|
79
79
|
def control(cmd, param)
|
80
|
-
|
81
|
-
process_buffer
|
80
|
+
send_abbr(nil) if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
|
82
81
|
end
|
83
82
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if form = form_at(-2, Token)
|
97
|
-
inc('Anzahl gesuchter Abkürzungen')
|
98
|
-
|
99
|
-
if (abbr = find_word(form)).identified?
|
100
|
-
inc('Anzahl gefundener Abkürzungen')
|
101
|
-
|
102
|
-
abbr.form += CHAR_PUNCT
|
103
|
-
|
104
|
-
@buffer[-2] = abbr
|
105
|
-
@buffer.delete_at(-1)
|
83
|
+
def process(obj)
|
84
|
+
if obj.is_a?(Token)
|
85
|
+
if obj.form == CHAR_PUNCT
|
86
|
+
if @abbr && (abbr = find_word(form = @abbr.form)).identified?
|
87
|
+
form << CHAR_PUNCT unless form.end_with?(CHAR_PUNCT)
|
88
|
+
send_abbr(abbr)
|
89
|
+
else
|
90
|
+
send_abbr(@abbr)
|
91
|
+
forward(obj)
|
92
|
+
end
|
93
|
+
else
|
94
|
+
send_abbr(@abbr, obj)
|
106
95
|
end
|
96
|
+
else
|
97
|
+
send_abbr(obj)
|
107
98
|
end
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
108
102
|
|
109
|
-
|
103
|
+
def send_abbr(abbr, obj = nil)
|
104
|
+
@abbr = obj
|
105
|
+
forward(abbr) if abbr
|
110
106
|
end
|
111
107
|
|
112
108
|
end
|
@@ -96,16 +96,20 @@ class Lingo
|
|
96
96
|
end
|
97
97
|
|
98
98
|
def control(cmd, param)
|
99
|
-
|
100
|
-
warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
|
101
|
-
end
|
99
|
+
debug(AgendaItem.new(cmd, param), @cmd_eval)
|
102
100
|
end
|
103
101
|
|
104
102
|
def process(obj)
|
105
|
-
|
103
|
+
debug(obj, @obj_eval)
|
106
104
|
forward(obj)
|
107
105
|
end
|
108
106
|
|
107
|
+
private
|
108
|
+
|
109
|
+
def debug(obj, cond)
|
110
|
+
warn "#{@prompt} #{obj.inspect}" if eval(cond)
|
111
|
+
end
|
112
|
+
|
109
113
|
end
|
110
114
|
|
111
115
|
end
|
@@ -89,7 +89,7 @@ class Lingo
|
|
89
89
|
if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
|
90
90
|
(c = b.get_class(/./).first) && @skip.include?(c.attr)
|
91
91
|
)
|
92
|
-
a, b = ab.map!
|
92
|
+
a, b = ab.map! { |i| i.form }
|
93
93
|
|
94
94
|
word = dehyphenize(a.chomp(h) + b)
|
95
95
|
word = dehyphenize(a + b) unless dehyphenized?(word)
|
@@ -106,7 +106,7 @@ class Lingo
|
|
106
106
|
private
|
107
107
|
|
108
108
|
def dehyphenize(form)
|
109
|
-
find_word(form
|
109
|
+
find_word(form) { |i| i.identified? }
|
110
110
|
end
|
111
111
|
|
112
112
|
def dehyphenized?(word)
|
@@ -113,7 +113,7 @@ class Lingo
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def control(cmd, param)
|
116
|
-
control_multi(cmd
|
116
|
+
control_multi(cmd)
|
117
117
|
end
|
118
118
|
|
119
119
|
def process_buffer
|
@@ -177,29 +177,36 @@ class Lingo
|
|
177
177
|
def check_multiword_key(len)
|
178
178
|
return [] if valid_tokens_in_buffer < len
|
179
179
|
|
180
|
-
seq =
|
180
|
+
seq = []
|
181
|
+
|
182
|
+
@buffer.each { |obj|
|
181
183
|
next [obj] unless obj.is_a?(WordForm)
|
182
184
|
next if (form = obj.form) == CHAR_PUNCT
|
183
185
|
|
184
186
|
w = find_word(form, @lex_dic, @lex_gra)
|
185
187
|
l = w.lexicals
|
186
188
|
|
187
|
-
|
188
|
-
i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
|
189
|
-
i.map! { |j| j.form.downcase }.uniq!
|
190
|
-
}
|
191
|
-
}
|
189
|
+
i = w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup
|
192
190
|
|
193
|
-
|
194
|
-
|
191
|
+
@syn_dic.find_synonyms(w, i) if @syn_dic
|
192
|
+
i.map! { |j| Unicode.downcase(j.form) }.uniq!
|
193
|
+
|
194
|
+
seq << i
|
195
|
+
|
196
|
+
break unless seq.length < len
|
197
|
+
}
|
195
198
|
|
196
199
|
if @combine
|
197
|
-
|
198
|
-
|
200
|
+
mul = []
|
201
|
+
|
202
|
+
seq.shift.product(*seq) { |key|
|
203
|
+
@mul_dic.select(key.join(' '), mul)
|
199
204
|
break unless @all_keys || mul.empty?
|
200
|
-
} && mul.uniq!
|
205
|
+
} && mul.uniq!
|
206
|
+
|
207
|
+
mul
|
201
208
|
else
|
202
|
-
@mul_dic.select(seq.map!
|
209
|
+
@mul_dic.select(seq.map! { |i,| i }.join(' '))
|
203
210
|
end
|
204
211
|
end
|
205
212
|
|
@@ -87,9 +87,7 @@ class Lingo
|
|
87
87
|
|
88
88
|
def process(obj)
|
89
89
|
if obj.is_a?(Word) && obj.unknown?
|
90
|
-
|
91
|
-
|
92
|
-
non = obj.form.downcase
|
90
|
+
non = Unicode.downcase(obj.form)
|
93
91
|
@sort ? @nonewords << non : forward(non)
|
94
92
|
end
|
95
93
|
end
|
@@ -97,11 +95,8 @@ class Lingo
|
|
97
95
|
private
|
98
96
|
|
99
97
|
def send_nonewords
|
100
|
-
@nonewords.sort!
|
101
98
|
@nonewords.uniq!
|
102
|
-
|
103
|
-
add('Objekte gefiltert', @nonewords.size)
|
104
|
-
@nonewords.each(&method(:forward)).clear
|
99
|
+
flush(@nonewords.sort!)
|
105
100
|
end
|
106
101
|
|
107
102
|
end
|
@@ -97,11 +97,15 @@ class Lingo
|
|
97
97
|
|
98
98
|
def init
|
99
99
|
@stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
|
100
|
+
@classes = []
|
100
101
|
|
101
102
|
@seq = get_key('sequences').map { |string, format|
|
102
|
-
|
103
|
+
@classes.concat(classes = string.downcase!.chars.to_a)
|
104
|
+
[string, classes, format]
|
103
105
|
}
|
104
106
|
|
107
|
+
@classes.uniq!
|
108
|
+
|
105
109
|
raise MissingConfigError.new(:sequences) if @seq.empty?
|
106
110
|
end
|
107
111
|
|
@@ -115,42 +119,62 @@ class Lingo
|
|
115
119
|
end
|
116
120
|
|
117
121
|
def process_buffer
|
118
|
-
|
119
|
-
|
122
|
+
matches = []
|
123
|
+
|
124
|
+
if @buffer.size > 1
|
125
|
+
buf, map, seq, cls, unk = [], [], @seq, @classes, %w[#]
|
126
|
+
|
127
|
+
@buffer.each { |obj|
|
128
|
+
att = obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : unk
|
129
|
+
|
130
|
+
(att &= cls).empty? ? find_seq(buf, map, seq, matches) : begin
|
131
|
+
buf << obj
|
132
|
+
map << att
|
133
|
+
end
|
134
|
+
}
|
135
|
+
|
136
|
+
find_seq(buf, map, seq, matches)
|
137
|
+
end
|
138
|
+
|
139
|
+
flush(@buffer.concat(matches))
|
120
140
|
end
|
121
141
|
|
122
142
|
private
|
123
143
|
|
124
|
-
def
|
125
|
-
|
144
|
+
def find_seq(buf, map, seq, matches)
|
145
|
+
return if buf.empty?
|
126
146
|
|
127
|
-
|
128
|
-
|
129
|
-
|
147
|
+
match = Hash.new { |h, k| h[k] = [] }
|
148
|
+
|
149
|
+
map.replace(map.shift.product(*map))
|
150
|
+
map.map! { |i| i.join }
|
151
|
+
map.uniq!
|
130
152
|
|
131
|
-
map.
|
153
|
+
map.each { |q|
|
132
154
|
seq.each { |string, classes, format|
|
133
155
|
while pos = q.index(string, pos || 0)
|
134
|
-
|
135
|
-
|
136
|
-
fmt = format.dup
|
156
|
+
form = format.dup
|
137
157
|
|
138
158
|
classes.each_with_index { |wc, i|
|
139
159
|
buf[pos + i].lexicals.find { |l|
|
140
|
-
|
160
|
+
form.gsub!(i.succ.to_s, l.form) if l.attr == wc
|
141
161
|
} or break
|
142
162
|
} or next
|
143
163
|
|
144
|
-
|
145
|
-
|
146
|
-
pos += 1
|
164
|
+
match[pos += 1] << form
|
147
165
|
end
|
148
166
|
}
|
149
167
|
}
|
150
168
|
|
151
|
-
|
152
|
-
|
153
|
-
|
169
|
+
match.each_value { |forms|
|
170
|
+
forms.uniq!
|
171
|
+
forms.each { |form|
|
172
|
+
matches << Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
buf.clear
|
177
|
+
map.clear
|
154
178
|
end
|
155
179
|
|
156
180
|
end
|
@@ -290,7 +290,7 @@ class Lingo
|
|
290
290
|
case rule
|
291
291
|
when RULE_RE
|
292
292
|
cond, repl, goto = $1, $3, $4
|
293
|
-
stem = word[/(.+)#{$2
|
293
|
+
stem = word[/(.+)#{Unicode.downcase($2)}$/, 1] or next
|
294
294
|
when GOTO_RE
|
295
295
|
goto = $1
|
296
296
|
break
|
@@ -324,7 +324,7 @@ class Lingo
|
|
324
324
|
found, word = true, begin
|
325
325
|
stem[0...Integer(repl)]
|
326
326
|
rescue ArgumentError
|
327
|
-
stem <<
|
327
|
+
stem << Unicode.downcase(repl)
|
328
328
|
end
|
329
329
|
|
330
330
|
break
|
@@ -78,19 +78,11 @@ class Lingo
|
|
78
78
|
end
|
79
79
|
|
80
80
|
def control(cmd, param)
|
81
|
-
report_on(cmd, @dic)
|
82
81
|
end
|
83
82
|
|
84
83
|
def process(obj)
|
85
84
|
if obj.is_a?(Word) && !@skip.include?(obj.attr)
|
86
|
-
|
87
|
-
|
88
|
-
unless (syn = @dic.find_synonyms(obj)).empty?
|
89
|
-
inc('Anzahl erweiteter Wörter')
|
90
|
-
|
91
|
-
obj.add_lexicals(syn.tap(&:uniq!))
|
92
|
-
add('Anzahl gefundener Synonyme', syn.size)
|
93
|
-
end
|
85
|
+
obj.add_lexicals(@dic.find_synonyms(obj))
|
94
86
|
end
|
95
87
|
|
96
88
|
forward(obj)
|
@@ -115,15 +115,17 @@ class Lingo
|
|
115
115
|
@filter = get_key('filter', false)
|
116
116
|
@progress = get_key('progress', false)
|
117
117
|
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
@lingo.deprecate('lir-record-pattern', :records, self) if has_key?('lir-record-pattern')
|
119
|
+
|
120
|
+
@lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
|
121
|
+
@cut = get_re('fields', !!@lir, %r{^.+?:\s*})
|
122
|
+
@skip = get_re('skip', nil)
|
121
123
|
end
|
122
124
|
|
123
125
|
def control(cmd, param)
|
124
126
|
if cmd == STR_CMD_TALK
|
125
127
|
forward(STR_CMD_LIR, '') if @lir
|
126
|
-
@files.each(
|
128
|
+
@files.each { |i| spool(i) }
|
127
129
|
end
|
128
130
|
end
|
129
131
|
|
@@ -132,24 +134,22 @@ class Lingo
|
|
132
134
|
# Gibt eine Datei zeilenweise in den Ausgabekanal
|
133
135
|
def spool(path)
|
134
136
|
unless stdin = stdin?(path)
|
135
|
-
|
136
|
-
add('Anzahl Bytes', size = File.size(path))
|
137
|
-
|
138
|
-
size = nil unless @progress
|
137
|
+
size = File.size(path) if @progress
|
139
138
|
end
|
140
139
|
|
141
140
|
forward(STR_CMD_FILE, path)
|
142
141
|
|
143
142
|
ShowProgress.new(self, size, path) { |progress|
|
144
143
|
filter(path, stdin) { |line, pos|
|
145
|
-
inc('Anzahl Zeilen')
|
146
144
|
progress[pos]
|
147
145
|
|
148
146
|
line.chomp! if @chomp
|
147
|
+
next if line =~ @skip
|
149
148
|
|
150
149
|
if line =~ @lir
|
151
150
|
forward(STR_CMD_RECORD, $1)
|
152
151
|
else
|
152
|
+
line.sub!(@cut, '') if @cut
|
153
153
|
forward(line) unless line.empty?
|
154
154
|
end
|
155
155
|
}
|
@@ -159,13 +159,13 @@ class Lingo
|
|
159
159
|
end
|
160
160
|
|
161
161
|
def filter(path, stdin = stdin?(path))
|
162
|
-
io
|
163
|
-
@lingo.config.stdin.set_encoding(ENC)
|
164
|
-
|
165
|
-
|
166
|
-
|
162
|
+
io = stdin ?
|
163
|
+
@lingo.config.stdin.set_encoding(ENC) :
|
164
|
+
File.open(path, 'rb', encoding: ENC)
|
165
|
+
|
166
|
+
block = stdin || !@progress ?
|
167
|
+
lambda { |line| yield line, 0 } :
|
167
168
|
lambda { |line| yield line, io.pos }
|
168
|
-
]
|
169
169
|
|
170
170
|
case @filter == true ? file_type(path, io) : @filter.to_s
|
171
171
|
when /html/i then io = filter_html(io)
|
@@ -195,13 +195,16 @@ class Lingo
|
|
195
195
|
|
196
196
|
def file_type(path, io)
|
197
197
|
if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
|
198
|
-
FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
|
199
|
-
|
200
|
-
|
198
|
+
type = FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
|
199
|
+
io.rewind
|
200
|
+
type
|
201
201
|
elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
|
202
|
-
MIME::Types.of(path).first
|
203
|
-
|
204
|
-
|
202
|
+
if type = MIME::Types.of(path).first
|
203
|
+
type.content_type
|
204
|
+
else
|
205
|
+
warn 'Filters not available. File type could not be determined.'
|
206
|
+
nil
|
207
|
+
end
|
205
208
|
else
|
206
209
|
warn "Filters not available. Please install `ruby-filemagic' or `mime-types'."
|
207
210
|
nil
|
@@ -220,17 +223,27 @@ class Lingo
|
|
220
223
|
Array(get_key('files', '-')).each { |path|
|
221
224
|
stdin?(path) ? @files << path : add_files(path, *args)
|
222
225
|
}
|
223
|
-
|
224
|
-
@files.map!(&File.method(:expand_path))
|
225
|
-
@files.uniq!
|
226
226
|
end
|
227
227
|
|
228
228
|
def add_files(path, glob, recursive = false)
|
229
|
-
Dir[path].sort
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
229
|
+
entries = Dir[path].sort!
|
230
|
+
raise FileNotFoundError.new(path) if entries.empty?
|
231
|
+
|
232
|
+
entries.each { |entry|
|
233
|
+
if File.directory?(entry)
|
234
|
+
if recursive
|
235
|
+
Find.find(entry) { |match|
|
236
|
+
if File.file?(match) && File.fnmatch?(glob, match)
|
237
|
+
@files << File.expand_path(match)
|
238
|
+
end
|
239
|
+
}
|
240
|
+
else
|
241
|
+
add_files(File.join(entry, glob), glob)
|
242
|
+
end
|
243
|
+
else
|
244
|
+
@files << File.expand_path(entry)
|
245
|
+
end
|
246
|
+
}
|
234
247
|
end
|
235
248
|
|
236
249
|
class PDFFilter
|
@@ -100,7 +100,6 @@ class Lingo
|
|
100
100
|
if stdout?(@ext)
|
101
101
|
@filename, @file = @ext, @lingo.config.stdout
|
102
102
|
else
|
103
|
-
inc('Anzahl Dateien')
|
104
103
|
@file = File.open(@filename = File.set_ext(param, ".#{@ext}"), 'w')
|
105
104
|
end
|
106
105
|
|
@@ -116,14 +115,12 @@ class Lingo
|
|
116
115
|
@no_sep = true
|
117
116
|
|
118
117
|
unless @lir
|
119
|
-
inc('Anzahl Zeilen')
|
120
118
|
@file.puts unless @no_puts
|
121
119
|
end
|
122
120
|
when STR_CMD_EOF
|
123
121
|
flush_lir_buffer if @lir
|
124
122
|
|
125
123
|
unless stdout?(@filename)
|
126
|
-
add('Anzahl Bytes', @file.size)
|
127
124
|
@file.close
|
128
125
|
end
|
129
126
|
end
|
@@ -142,9 +139,9 @@ class Lingo
|
|
142
139
|
|
143
140
|
def flush_lir_buffer
|
144
141
|
unless @lir_rec_no.empty? || @lir_rec_buf.empty?
|
145
|
-
|
146
|
-
|
147
|
-
|
142
|
+
buf = [@lir_rec_no, @lir_rec_buf.join(@sep), "\n"]
|
143
|
+
@sep =~ /\n/ ? buf.insert(1, "\n").unshift('*') : buf.insert(1, '*')
|
144
|
+
@file.print(*buf)
|
148
145
|
end
|
149
146
|
|
150
147
|
@lir_rec_no = ''
|