lingo 1.8.2 → 1.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. data/ChangeLog +33 -0
  2. data/README +6 -5
  3. data/Rakefile +6 -4
  4. data/{lib/lingo/cachable.rb → bin/lingosrv} +30 -58
  5. data/bin/lingoweb +30 -0
  6. data/de.lang +2 -13
  7. data/en/lingo-irr.txt +266 -0
  8. data/en/lingo-wdn.txt +37319 -0
  9. data/en.lang +2 -15
  10. data/lib/lingo/app.rb +82 -0
  11. data/lib/lingo/attendee/abbreviator.rb +22 -26
  12. data/lib/lingo/attendee/debugger.rb +8 -4
  13. data/lib/lingo/attendee/decomposer.rb +0 -1
  14. data/lib/lingo/attendee/dehyphenizer.rb +2 -2
  15. data/lib/lingo/attendee/multi_worder.rb +20 -13
  16. data/lib/lingo/attendee/noneword_filter.rb +2 -7
  17. data/lib/lingo/attendee/sequencer.rb +43 -19
  18. data/lib/lingo/attendee/stemmer/porter.rb +2 -2
  19. data/lib/lingo/attendee/stemmer.rb +1 -1
  20. data/lib/lingo/attendee/synonymer.rb +1 -9
  21. data/lib/lingo/attendee/text_reader.rb +42 -29
  22. data/lib/lingo/attendee/text_writer.rb +3 -6
  23. data/lib/lingo/attendee/tokenizer.rb +87 -69
  24. data/lib/lingo/attendee/variator.rb +7 -5
  25. data/lib/lingo/attendee/vector_filter.rb +11 -11
  26. data/lib/lingo/attendee/word_searcher.rb +1 -9
  27. data/lib/lingo/attendee.rb +24 -105
  28. data/lib/lingo/buffered_attendee.rb +2 -9
  29. data/lib/lingo/call.rb +18 -13
  30. data/lib/lingo/cli.rb +5 -10
  31. data/lib/lingo/config.rb +40 -7
  32. data/lib/lingo/ctl.rb +69 -57
  33. data/lib/lingo/database/hash_store.rb +9 -4
  34. data/lib/lingo/database/sdbm_store.rb +4 -7
  35. data/lib/lingo/database/source/multi_key.rb +1 -1
  36. data/lib/lingo/database/source/multi_value.rb +1 -1
  37. data/lib/lingo/database/source.rb +2 -20
  38. data/lib/lingo/database.rb +30 -19
  39. data/lib/lingo/debug.rb +79 -0
  40. data/lib/lingo/{core_ext.rb → language/char.rb} +43 -42
  41. data/lib/lingo/language/dictionary.rb +38 -46
  42. data/lib/lingo/language/grammar.rb +40 -57
  43. data/lib/lingo/language/lexical.rb +4 -7
  44. data/lib/lingo/language/lexical_hash.rb +17 -35
  45. data/lib/lingo/language/token.rb +4 -0
  46. data/lib/lingo/language/word.rb +7 -8
  47. data/lib/lingo/language/word_form.rb +4 -4
  48. data/lib/lingo/language.rb +2 -1
  49. data/lib/lingo/srv/config.ru +4 -0
  50. data/lib/lingo/srv/lingosrv.cfg +14 -0
  51. data/lib/lingo/{reportable.rb → srv.rb} +59 -61
  52. data/lib/lingo/version.rb +1 -1
  53. data/lib/lingo/web/config.ru +4 -0
  54. data/lib/lingo/web/lingoweb.cfg +14 -0
  55. data/lib/lingo/web/public/lingo.png +0 -0
  56. data/lib/lingo/web/public/lingoweb.css +74 -0
  57. data/lib/lingo/web/views/index.erb +92 -0
  58. data/lib/lingo/web.rb +94 -0
  59. data/lib/lingo.rb +27 -29
  60. data/lingo.cfg +1 -1
  61. data/lir.cfg +24 -0
  62. data/ru/lingo-dic.txt +22342 -0
  63. data/ru/lingo-mul.txt +5151 -0
  64. data/ru/lingo-syn.txt +0 -0
  65. data/ru.lang +99 -0
  66. data/test/attendee/ts_sequencer.rb +2 -2
  67. data/test/attendee/ts_text_reader.rb +36 -2
  68. data/test/attendee/ts_text_writer.rb +6 -6
  69. data/test/lir.vec +3 -3
  70. data/test/test_helper.rb +104 -102
  71. data/test/ts_database.rb +1 -1
  72. data/test/ts_language.rb +55 -96
  73. data/txt/artikel-ru.txt +45 -0
  74. data/txt/lir.txt +1 -3
  75. metadata +143 -83
  76. data/TODO +0 -23
data/en.lang CHANGED
@@ -43,18 +43,16 @@
43
43
  # lingo language definition
44
44
  ---
45
45
  language:
46
-
47
46
  name: 'Englisch'
48
47
 
49
48
  dictionary:
50
-
51
49
  databases:
52
-
53
50
  # Systemwörterbücher
54
51
  sys-dic: { name: en/lingo-dic.txt, txt-format: WordClass, separator: '=' }
55
52
  sys-syn: { name: en/lingo-syn.txt, txt-format: KeyValue, separator: '=', def-wc: y }
56
53
  sys-mul: { name: en/lingo-mul.txt, txt-format: SingleWord, use-lex: 'sys-dic', def-wc: m }
57
-
54
+ sys-irr: { name: en/lingo-irr.txt, txt-format: WordClass, separator: '=' }
55
+ sys-wdn: { name: en/lingo-wdn.txt, txt-format: WordClass, separator: '=' }
58
56
  # Benutzerwörterbücher
59
57
  usr-dic: { name: en/user-dic.txt, txt-format: WordClass, separator: '=' }
60
58
 
@@ -77,17 +75,6 @@ language:
77
75
  - [f, ""]
78
76
 
79
77
  attendees:
80
- tokenizer:
81
- regulars:
82
- - _char_: '_baslat_|_lat1sp_|_latexa_|_latexb_|_ipaext_'
83
- - NUMS: '[+-]?(\d{4,}|\d{1,3}(\.\d{3,3})*)(\.|(,\d+)?%?)'
84
- - URLS: '((mailto:|(news|http|https|ftp|ftps)://)\S+|^(www(\.\S+)+)|[^\s.]+([\._]\S+)+@\S+(\.\S+)+)'
85
- - ABRV: '(((_char_)+\.)+)(_char_)+'
86
- - WORD: '(_char_|_digit_|\-)+'
87
- - PUNC: '([!,\.:;?]|\xc2\xa1|\xc2\xbf)'
88
- - OTHR: '([\"#$%&\x27()*\+\-/<=>@\[\\\]^_{|}~]|\xc2\xa2|\xc2\xa3|\xc2\xa4|\xc2\xa5|\xc2\xa6|\xc2\xa7|\xc2\xa8|\xc2\xa9|\xc2\xaa|\xc2\xab|\xc2\xac|\xc2\xae|\xc2\xaf|\xc2\xb0|\xc2\xb1|\xc2\xb2|\xc2\xb3|\xc2\xb4|\xc2\xb5|\xc2\xb6|\xc2\xb7|\xc2\xb8|\xc2\xb9|\xc2\xba|\xc2\xbb|\xc2\xbc|\xc2\xbd|\xc2\xbe|\xc3\x97|\xc3\xb7)'
89
- - HELP: '[^ ]*'
90
-
91
78
  variator:
92
79
  variations:
93
80
  - [ ieh, sch ]
data/lib/lingo/app.rb ADDED
@@ -0,0 +1,82 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ require 'optparse'
28
+ require 'shellwords'
29
+ require 'sinatra/base'
30
+
31
+ class Lingo
32
+
33
+ class App < Sinatra::Base
34
+
35
+ class << self
36
+
37
+ def init_app(file, *args, &block)
38
+ set :root, File.chomp_ext(file)
39
+ parse_options(*args, &block)
40
+ end
41
+
42
+ def parse_options(lingo_options = false)
43
+ argv, banner = [], "Usage: #{$0} [-h|--help] [sinatra-options]"
44
+ while arg = ARGV.shift and arg != '--'; argv << arg; end
45
+
46
+ if lingo_options || block_given?
47
+ banner << ' [-- lingo-options]'
48
+
49
+ opts = ENV["LINGO_#{name.split('::').last.upcase}_OPTS"]
50
+ ARGV.unshift(*Shellwords.shellsplit(opts)) if opts
51
+
52
+ ARGV.unshift(*lingo_options) if lingo_options.is_a?(Array)
53
+ end
54
+
55
+ OptionParser.new(banner, 16) { |o|
56
+ o.on('-p port', 'set the port (default is 4567)') { |v| set :port, Integer(v) }
57
+ o.on('-o addr', 'set the host (default is 0.0.0.0)') { |v| set :bind, v }
58
+ o.on('-e env', 'set the environment (default is development)') { |v| set :environment, v.to_sym }
59
+ o.on('-s server', 'specify rack server/handler (default is thin)') { |v| set :server, v }
60
+ o.on('-x', 'turn on the mutex lock (default is off)') { set :lock, true }
61
+ }.parse!(argv)
62
+
63
+ ARGV.unshift(*yield) if block_given?
64
+ end
65
+
66
+ def rackup(name)
67
+ file = File.join(File.dirname(__FILE__), name, 'config.ru')
68
+ file if File.readable?(file)
69
+ end
70
+
71
+ end
72
+
73
+ def to_json(q, r)
74
+ q, r = 'q', 'Required parameter -- Input string' unless q
75
+
76
+ content_type :json
77
+ { q => r }.to_json
78
+ end
79
+
80
+ end
81
+
82
+ end
@@ -68,7 +68,7 @@ class Lingo
68
68
  # out> *EOL('test.txt')
69
69
  # out> *EOF('test.txt')
70
70
 
71
- class Abbreviator < BufferedAttendee
71
+ class Abbreviator < self
72
72
 
73
73
  protected
74
74
 
@@ -77,36 +77,32 @@ class Lingo
77
77
  end
78
78
 
79
79
  def control(cmd, param)
80
- report_on(cmd, @dic)
81
- process_buffer
80
+ send_abbr(nil) if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
82
81
  end
83
82
 
84
- private
85
-
86
- def process_buffer?
87
- form_at(-1, Token) == CHAR_PUNCT
88
- end
89
-
90
- def process_buffer
91
- if @buffer.size < 2
92
- forward_buffer
93
- return
94
- end
95
-
96
- if form = form_at(-2, Token)
97
- inc('Anzahl gesuchter Abkürzungen')
98
-
99
- if (abbr = find_word(form)).identified?
100
- inc('Anzahl gefundener Abkürzungen')
101
-
102
- abbr.form += CHAR_PUNCT
103
-
104
- @buffer[-2] = abbr
105
- @buffer.delete_at(-1)
83
+ def process(obj)
84
+ if obj.is_a?(Token)
85
+ if obj.form == CHAR_PUNCT
86
+ if @abbr && (abbr = find_word(form = @abbr.form)).identified?
87
+ form << CHAR_PUNCT unless form.end_with?(CHAR_PUNCT)
88
+ send_abbr(abbr)
89
+ else
90
+ send_abbr(@abbr)
91
+ forward(obj)
92
+ end
93
+ else
94
+ send_abbr(@abbr, obj)
106
95
  end
96
+ else
97
+ send_abbr(obj)
107
98
  end
99
+ end
100
+
101
+ private
108
102
 
109
- forward_buffer
103
+ def send_abbr(abbr, obj = nil)
104
+ @abbr = obj
105
+ forward(abbr) if abbr
110
106
  end
111
107
 
112
108
  end
@@ -96,16 +96,20 @@ class Lingo
96
96
  end
97
97
 
98
98
  def control(cmd, param)
99
- if cmd != STR_CMD_STATUS && eval(@cmd_eval)
100
- warn "#{@prompt} #{AgendaItem.new(cmd, param).inspect}"
101
- end
99
+ debug(AgendaItem.new(cmd, param), @cmd_eval)
102
100
  end
103
101
 
104
102
  def process(obj)
105
- warn "#{@prompt} #{obj.inspect}" if eval(@obj_eval)
103
+ debug(obj, @obj_eval)
106
104
  forward(obj)
107
105
  end
108
106
 
107
+ private
108
+
109
+ def debug(obj, cond)
110
+ warn "#{@prompt} #{obj.inspect}" if eval(cond)
111
+ end
112
+
109
113
  end
110
114
 
111
115
  end
@@ -80,7 +80,6 @@ class Lingo
80
80
  end
81
81
 
82
82
  def control(cmd, param)
83
- report_on(cmd, @gra)
84
83
  end
85
84
 
86
85
  def process(obj)
@@ -89,7 +89,7 @@ class Lingo
89
89
  if ab.all? { |i| i.is_a?(Word) } && a.form[-1, 1] == h && !(
90
90
  (c = b.get_class(/./).first) && @skip.include?(c.attr)
91
91
  )
92
- a, b = ab.map!(&:form)
92
+ a, b = ab.map! { |i| i.form }
93
93
 
94
94
  word = dehyphenize(a.chomp(h) + b)
95
95
  word = dehyphenize(a + b) unless dehyphenized?(word)
@@ -106,7 +106,7 @@ class Lingo
106
106
  private
107
107
 
108
108
  def dehyphenize(form)
109
- find_word(form, &:identified?)
109
+ find_word(form) { |i| i.identified? }
110
110
  end
111
111
 
112
112
  def dehyphenized?(word)
@@ -113,7 +113,7 @@ class Lingo
113
113
  end
114
114
 
115
115
  def control(cmd, param)
116
- control_multi(cmd, @mul_dic)
116
+ control_multi(cmd)
117
117
  end
118
118
 
119
119
  def process_buffer
@@ -177,29 +177,36 @@ class Lingo
177
177
  def check_multiword_key(len)
178
178
  return [] if valid_tokens_in_buffer < len
179
179
 
180
- seq = @buffer.map { |obj|
180
+ seq = []
181
+
182
+ @buffer.each { |obj|
181
183
  next [obj] unless obj.is_a?(WordForm)
182
184
  next if (form = obj.form) == CHAR_PUNCT
183
185
 
184
186
  w = find_word(form, @lex_dic, @lex_gra)
185
187
  l = w.lexicals
186
188
 
187
- (w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup).tap { |i|
188
- i.concat(@syn_dic.find_synonyms(w)) if @syn_dic
189
- i.map! { |j| j.form.downcase }.uniq!
190
- }
191
- }
189
+ i = w.attr == WA_COMPOUND ? [l.first] : l.empty? ? [w] : l.dup
192
190
 
193
- seq.compact!
194
- seq.slice!(len..-1)
191
+ @syn_dic.find_synonyms(w, i) if @syn_dic
192
+ i.map! { |j| Unicode.downcase(j.form) }.uniq!
193
+
194
+ seq << i
195
+
196
+ break unless seq.length < len
197
+ }
195
198
 
196
199
  if @combine
197
- [].tap { |mul| seq.shift.product(*seq) { |key|
198
- mul.concat(@mul_dic.select(key.join(' ')))
200
+ mul = []
201
+
202
+ seq.shift.product(*seq) { |key|
203
+ @mul_dic.select(key.join(' '), mul)
199
204
  break unless @all_keys || mul.empty?
200
- } && mul.uniq! }
205
+ } && mul.uniq!
206
+
207
+ mul
201
208
  else
202
- @mul_dic.select(seq.map!(&:first).join(' '))
209
+ @mul_dic.select(seq.map! { |i,| i }.join(' '))
203
210
  end
204
211
  end
205
212
 
@@ -87,9 +87,7 @@ class Lingo
87
87
 
88
88
  def process(obj)
89
89
  if obj.is_a?(Word) && obj.unknown?
90
- inc('Anzahl nicht erkannter Wörter')
91
-
92
- non = obj.form.downcase
90
+ non = Unicode.downcase(obj.form)
93
91
  @sort ? @nonewords << non : forward(non)
94
92
  end
95
93
  end
@@ -97,11 +95,8 @@ class Lingo
97
95
  private
98
96
 
99
97
  def send_nonewords
100
- @nonewords.sort!
101
98
  @nonewords.uniq!
102
-
103
- add('Objekte gefiltert', @nonewords.size)
104
- @nonewords.each(&method(:forward)).clear
99
+ flush(@nonewords.sort!)
105
100
  end
106
101
 
107
102
  end
@@ -97,11 +97,15 @@ class Lingo
97
97
 
98
98
  def init
99
99
  @stopper = get_array('stopper', DEFAULT_SKIP, :upcase)
100
+ @classes = []
100
101
 
101
102
  @seq = get_key('sequences').map { |string, format|
102
- [string = string.downcase, string.split(//), format]
103
+ @classes.concat(classes = string.downcase!.chars.to_a)
104
+ [string, classes, format]
103
105
  }
104
106
 
107
+ @classes.uniq!
108
+
105
109
  raise MissingConfigError.new(:sequences) if @seq.empty?
106
110
  end
107
111
 
@@ -115,42 +119,62 @@ class Lingo
115
119
  end
116
120
 
117
121
  def process_buffer
118
- insert_sequences if @buffer.size > 1
119
- forward_buffer
122
+ matches = []
123
+
124
+ if @buffer.size > 1
125
+ buf, map, seq, cls, unk = [], [], @seq, @classes, %w[#]
126
+
127
+ @buffer.each { |obj|
128
+ att = obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : unk
129
+
130
+ (att &= cls).empty? ? find_seq(buf, map, seq, matches) : begin
131
+ buf << obj
132
+ map << att
133
+ end
134
+ }
135
+
136
+ find_seq(buf, map, seq, matches)
137
+ end
138
+
139
+ flush(@buffer.concat(matches))
120
140
  end
121
141
 
122
142
  private
123
143
 
124
- def insert_sequences
125
- matches, buf, seq = Hash.new { |h, k| h[k] = [] }, @buffer, @seq
144
+ def find_seq(buf, map, seq, matches)
145
+ return if buf.empty?
126
146
 
127
- map = buf.map { |obj|
128
- obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : ['#']
129
- }
147
+ match = Hash.new { |h, k| h[k] = [] }
148
+
149
+ map.replace(map.shift.product(*map))
150
+ map.map! { |i| i.join }
151
+ map.uniq!
130
152
 
131
- map.shift.product(*map).map!(&:join).tap(&:uniq!).each { |q|
153
+ map.each { |q|
132
154
  seq.each { |string, classes, format|
133
155
  while pos = q.index(string, pos || 0)
134
- inc('Anzahl erkannter Sequenzen')
135
-
136
- fmt = format.dup
156
+ form = format.dup
137
157
 
138
158
  classes.each_with_index { |wc, i|
139
159
  buf[pos + i].lexicals.find { |l|
140
- fmt.gsub!(i.succ.to_s, l.form) if l.attr == wc
160
+ form.gsub!(i.succ.to_s, l.form) if l.attr == wc
141
161
  } or break
142
162
  } or next
143
163
 
144
- matches[pos] << fmt
145
-
146
- pos += 1
164
+ match[pos += 1] << form
147
165
  end
148
166
  }
149
167
  }
150
168
 
151
- matches.sort.each { |pos, forms| forms.tap(&:uniq!).each { |form|
152
- @inserts << [pos, Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)]
153
- } }
169
+ match.each_value { |forms|
170
+ forms.uniq!
171
+ forms.each { |form|
172
+ matches << Word.new_lexical(form, WA_SEQUENCE, LA_SEQUENCE)
173
+ }
174
+ }
175
+
176
+ buf.clear
177
+ map.clear
154
178
  end
155
179
 
156
180
  end
@@ -290,7 +290,7 @@ class Lingo
290
290
  case rule
291
291
  when RULE_RE
292
292
  cond, repl, goto = $1, $3, $4
293
- stem = word[/(.+)#{$2.downcase}$/, 1] or next
293
+ stem = word[/(.+)#{Unicode.downcase($2)}$/, 1] or next
294
294
  when GOTO_RE
295
295
  goto = $1
296
296
  break
@@ -324,7 +324,7 @@ class Lingo
324
324
  found, word = true, begin
325
325
  stem[0...Integer(repl)]
326
326
  rescue ArgumentError
327
- stem << repl.downcase
327
+ stem << Unicode.downcase(repl)
328
328
  end
329
329
 
330
330
  break
@@ -41,7 +41,7 @@ class Lingo
41
41
 
42
42
  def process(obj)
43
43
  if obj.is_a?(Word) && obj.unknown?
44
- stem = stem(obj.form.downcase, @all)
44
+ stem = stem(Unicode.downcase(obj.form), @all)
45
45
  obj.add_lexicals([Lexical.new(stem, @wc)]) if stem
46
46
  end
47
47
 
@@ -78,19 +78,11 @@ class Lingo
78
78
  end
79
79
 
80
80
  def control(cmd, param)
81
- report_on(cmd, @dic)
82
81
  end
83
82
 
84
83
  def process(obj)
85
84
  if obj.is_a?(Word) && !@skip.include?(obj.attr)
86
- inc('Anzahl gesuchter Wörter')
87
-
88
- unless (syn = @dic.find_synonyms(obj)).empty?
89
- inc('Anzahl erweiteter Wörter')
90
-
91
- obj.add_lexicals(syn.tap(&:uniq!))
92
- add('Anzahl gefundener Synonyme', syn.size)
93
- end
85
+ obj.add_lexicals(@dic.find_synonyms(obj))
94
86
  end
95
87
 
96
88
  forward(obj)
@@ -115,15 +115,17 @@ class Lingo
115
115
  @filter = get_key('filter', false)
116
116
  @progress = get_key('progress', false)
117
117
 
118
- if @lir = get_key('records', get_key('lir-record-pattern', nil)) # DEPRECATE lir-record-pattern
119
- @lir = @lir == true ? %r{^\[(\d+)\.\]} : Regexp.new(@lir)
120
- end
118
+ @lingo.deprecate('lir-record-pattern', :records, self) if has_key?('lir-record-pattern')
119
+
120
+ @lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
121
+ @cut = get_re('fields', !!@lir, %r{^.+?:\s*})
122
+ @skip = get_re('skip', nil)
121
123
  end
122
124
 
123
125
  def control(cmd, param)
124
126
  if cmd == STR_CMD_TALK
125
127
  forward(STR_CMD_LIR, '') if @lir
126
- @files.each(&method(:spool))
128
+ @files.each { |i| spool(i) }
127
129
  end
128
130
  end
129
131
 
@@ -132,24 +134,22 @@ class Lingo
132
134
  # Gibt eine Datei zeilenweise in den Ausgabekanal
133
135
  def spool(path)
134
136
  unless stdin = stdin?(path)
135
- inc('Anzahl Dateien')
136
- add('Anzahl Bytes', size = File.size(path))
137
-
138
- size = nil unless @progress
137
+ size = File.size(path) if @progress
139
138
  end
140
139
 
141
140
  forward(STR_CMD_FILE, path)
142
141
 
143
142
  ShowProgress.new(self, size, path) { |progress|
144
143
  filter(path, stdin) { |line, pos|
145
- inc('Anzahl Zeilen')
146
144
  progress[pos]
147
145
 
148
146
  line.chomp! if @chomp
147
+ next if line =~ @skip
149
148
 
150
149
  if line =~ @lir
151
150
  forward(STR_CMD_RECORD, $1)
152
151
  else
152
+ line.sub!(@cut, '') if @cut
153
153
  forward(line) unless line.empty?
154
154
  end
155
155
  }
@@ -159,13 +159,13 @@ class Lingo
159
159
  end
160
160
 
161
161
  def filter(path, stdin = stdin?(path))
162
- io, block = stdin ? [
163
- @lingo.config.stdin.set_encoding(ENC),
164
- lambda { |line| yield line, 0 }
165
- ] : [
166
- File.open(path, 'rb', encoding: ENC),
162
+ io = stdin ?
163
+ @lingo.config.stdin.set_encoding(ENC) :
164
+ File.open(path, 'rb', encoding: ENC)
165
+
166
+ block = stdin || !@progress ?
167
+ lambda { |line| yield line, 0 } :
167
168
  lambda { |line| yield line, io.pos }
168
- ]
169
169
 
170
170
  case @filter == true ? file_type(path, io) : @filter.to_s
171
171
  when /html/i then io = filter_html(io)
@@ -195,13 +195,16 @@ class Lingo
195
195
 
196
196
  def file_type(path, io)
197
197
  if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
198
- FileMagic.fm(:mime, simplified: true).buffer(io.read(256)).tap {
199
- io.rewind
200
- }
198
+ type = FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
199
+ io.rewind
200
+ type
201
201
  elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
202
- MIME::Types.of(path).first.tap { |type| type ? type.content_type :
203
- warn('Filters not available. File type could not be determined.')
204
- }
202
+ if type = MIME::Types.of(path).first
203
+ type.content_type
204
+ else
205
+ warn 'Filters not available. File type could not be determined.'
206
+ nil
207
+ end
205
208
  else
206
209
  warn "Filters not available. Please install `ruby-filemagic' or `mime-types'."
207
210
  nil
@@ -220,17 +223,27 @@ class Lingo
220
223
  Array(get_key('files', '-')).each { |path|
221
224
  stdin?(path) ? @files << path : add_files(path, *args)
222
225
  }
223
-
224
- @files.map!(&File.method(:expand_path))
225
- @files.uniq!
226
226
  end
227
227
 
228
228
  def add_files(path, glob, recursive = false)
229
- Dir[path].sort!.each { |match|
230
- File.directory?(match) ? recursive ? Find.find(match) { |entry|
231
- @files << entry if File.file?(entry) && File.fnmatch?(glob, entry)
232
- } : add_files(File.join(match, glob), glob) : @files << match
233
- }.empty? and raise FileNotFoundError.new(path)
229
+ entries = Dir[path].sort!
230
+ raise FileNotFoundError.new(path) if entries.empty?
231
+
232
+ entries.each { |entry|
233
+ if File.directory?(entry)
234
+ if recursive
235
+ Find.find(entry) { |match|
236
+ if File.file?(match) && File.fnmatch?(glob, match)
237
+ @files << File.expand_path(match)
238
+ end
239
+ }
240
+ else
241
+ add_files(File.join(entry, glob), glob)
242
+ end
243
+ else
244
+ @files << File.expand_path(entry)
245
+ end
246
+ }
234
247
  end
235
248
 
236
249
  class PDFFilter
@@ -100,7 +100,6 @@ class Lingo
100
100
  if stdout?(@ext)
101
101
  @filename, @file = @ext, @lingo.config.stdout
102
102
  else
103
- inc('Anzahl Dateien')
104
103
  @file = File.open(@filename = File.set_ext(param, ".#{@ext}"), 'w')
105
104
  end
106
105
 
@@ -116,14 +115,12 @@ class Lingo
116
115
  @no_sep = true
117
116
 
118
117
  unless @lir
119
- inc('Anzahl Zeilen')
120
118
  @file.puts unless @no_puts
121
119
  end
122
120
  when STR_CMD_EOF
123
121
  flush_lir_buffer if @lir
124
122
 
125
123
  unless stdout?(@filename)
126
- add('Anzahl Bytes', @file.size)
127
124
  @file.close
128
125
  end
129
126
  end
@@ -142,9 +139,9 @@ class Lingo
142
139
 
143
140
  def flush_lir_buffer
144
141
  unless @lir_rec_no.empty? || @lir_rec_buf.empty?
145
- @file.print(*[@lir_rec_no, @lir_rec_buf.join(@sep), "\n"].tap { |buf|
146
- @sep =~ /\n/ ? buf.insert(1, "\n").unshift('*') : buf.insert(1, '*')
147
- })
142
+ buf = [@lir_rec_no, @lir_rec_buf.join(@sep), "\n"]
143
+ @sep =~ /\n/ ? buf.insert(1, "\n").unshift('*') : buf.insert(1, '*')
144
+ @file.print(*buf)
148
145
  end
149
146
 
150
147
  @lir_rec_no = ''