lingo 1.8.6 → 1.8.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -0,0 +1,99 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class Lingo
28
+
29
+ class Attendee
30
+
31
+ class LsiFilter < DeferredAttendee
32
+
33
+ def init
34
+ require_lib('lsi4r')
35
+
36
+ @lex = get_re('lexicals', '[sy]')
37
+ @skip = get_ary('skip', DEFAULT_SKIP, :upcase)
38
+
39
+ @transform = get_key('transform', Lsi4R::DEFAULT_TRANSFORM)
40
+ @cutoff = get_flo('cut', Lsi4R::DEFAULT_CUTOFF)
41
+
42
+ @min = get_flo('min', false)
43
+ @abs = get_flo('abs', false)
44
+ @nul = get_flo('nul', false)
45
+ @new = get_key('new', true)
46
+
47
+ @sort = get_key('sort', false)
48
+ @sort.downcase! if @sort.respond_to?(:downcase!)
49
+
50
+ @docnum, @vectors = 0, Hash.new { |h, k| h[k] = [] }
51
+ end
52
+
53
+ def control(cmd, *)
54
+ :skip_command if cmd == :EOL
55
+ end
56
+
57
+ def control_deferred(cmd, *)
58
+ @docnum += 1 if TERMINALS.include?(cmd)
59
+ end
60
+
61
+ def process(obj)
62
+ if obj.is_a?(Word) && !@skip.include?(obj.attr)
63
+ vec = []
64
+ obj.each_lex(@lex) { |lex| vec << Unicode.downcase(lex.form) }
65
+ @vectors[@docnum].concat(vec) unless vec.empty?
66
+ end
67
+ end
68
+
69
+ private
70
+
71
+ def send_lsi
72
+ lsi = Lsi4R.new(@vectors); @vectors.clear
73
+
74
+ if lsi.build(transform: @transform, cutoff: @cutoff)
75
+ options, vec = { min: @min, abs: @abs, nul: @nul, new: @new }, []
76
+
77
+ fmt = @sort ? @sort == 'sto' ?
78
+ '%s {%.5f}' : '%2$.5f %1$s' : '%s %.5f' unless @sort == 'normal'
79
+
80
+ yield !@sort ? lambda { |docnum|
81
+ lsi.each_norm(docnum, options) { |_, *v| forward(fmt % v) }
82
+ } : lambda { |docnum|
83
+ lsi.each_norm(docnum, options) { |_, *v| vec << v }
84
+
85
+ !fmt ? vec.sort!.each { |v, _| forward(v) } :
86
+ vec.sort_by { |v, w| [-w, v] }.each { |v| forward(fmt % v) }
87
+
88
+ vec.clear
89
+ }
90
+ end
91
+ end
92
+
93
+ alias_method :flush_deferred, :send_lsi
94
+
95
+ end
96
+
97
+ end
98
+
99
+ end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -40,7 +40,9 @@ class Lingo
40
40
  #
41
41
  # === Mögliche Verlinkung
42
42
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, MultiWorder
43
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
43
+ # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
44
+ # ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
45
+ # Vector_filter
44
46
  #
45
47
  # === Parameter
46
48
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -88,7 +90,7 @@ class Lingo
88
90
 
89
91
  lex_src, lex_mod, d = nil, nil, lingo.dictionary_config['databases']
90
92
 
91
- (mul_src = get_array('source')).each { |src|
93
+ (mul_src = get_ary('source')).each { |src|
92
94
  s, m = d[src].values_at('use-lex', 'lex-mode')
93
95
 
94
96
  if lex_src.nil? || lex_src == s
@@ -106,76 +108,99 @@ class Lingo
106
108
  @lex_gra = grammar(lex_src, lex_mod)
107
109
 
108
110
  @syn_dic = if @combine && has_key?('use-syn')
109
- dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
111
+ dictionary(get_ary('use-syn'), get_key('syn-mode', 'all'))
110
112
  end
111
113
 
112
114
  @expected_tokens_in_buffer, @eof_handling = 3, false
113
115
  end
114
116
 
115
117
  def control(cmd, *)
116
- control_multi(cmd)
118
+ if [:RECORD, :EOF].include?(cmd)
119
+ @eof_handling = true
120
+
121
+ while process_buffer?(2)
122
+ process_buffer
123
+ end
124
+
125
+ forward_number_of_token
126
+
127
+ @eof_handling = false
128
+ end
117
129
  end
118
130
 
119
- def process_buffer
120
- unless form_at(0) == CHAR_PUNCT
121
- unless (res = check_multiword_key(3)).empty?
122
- len = res.map { |r| r.is_a?(Lexical) ? r.form.count(' ') + 1 : r }
123
- len.sort!.reverse!
131
+ private
132
+
133
+ def form_at(index)
134
+ obj = @buffer[index]
135
+ obj.form if obj.is_a?(WordForm) && obj.form != CHAR_PUNCT
136
+ end
137
+
138
+ def forward_number_of_token(len = default = @buffer.size, punct = !default)
139
+ begin
140
+ unless @buffer.empty?
141
+ forward(item = @buffer.delete_at(0))
142
+ len -= 1 unless punct && item.form == CHAR_PUNCT
143
+ end
144
+ end while len > 0
145
+ end
146
+
147
+ def process_buffer?(num = @expected_tokens_in_buffer)
148
+ @buffer.count { |item| item.form != CHAR_PUNCT } >= num
149
+ end
124
150
 
125
- unless (max = len.first) > 3
151
+ def process_buffer
152
+ if form_at(0)
153
+ if res = check_multiword(3, len = [])
154
+ if (max = len.first) <= 3
126
155
  create_and_forward_multiword(3, res)
127
156
  forward_number_of_token(3)
157
+ elsif !@eof_handling && @buffer.size < max
158
+ @expected_tokens_in_buffer = max
128
159
  else
129
- unless @eof_handling || @buffer.size >= max
130
- @expected_tokens_in_buffer = max
131
- else
132
- forward_number_of_token(len.find { |l|
133
- r = check_multiword_key(l)
134
- create_and_forward_multiword(l, r) unless r.empty?
135
- } || 1)
136
-
137
- @expected_tokens_in_buffer = 3
138
- process_buffer if process_buffer?
139
- end
160
+ forward_number_of_token(len.find { |l|
161
+ create_and_forward_multiword(l) } || 1)
162
+
163
+ @expected_tokens_in_buffer = 3
164
+ process_buffer if process_buffer?
140
165
  end
141
166
 
142
167
  return
143
168
  end
144
169
 
145
- unless (res = check_multiword_key(2)).empty?
146
- create_and_forward_multiword(2, res)
147
- forward_number_of_token(1)
148
- end
170
+ create_and_forward_multiword(2) && forward_number_of_token(1)
149
171
  end
150
172
 
151
173
  forward_number_of_token(1, false)
152
174
  @expected_tokens_in_buffer = 3
153
175
  end
154
176
 
155
- private
177
+ def create_and_forward_multiword(len, lex = check_multiword(len))
178
+ return unless lex
156
179
 
157
- def create_and_forward_multiword(len, lex)
158
180
  pos, parts = 0, []
159
181
 
160
182
  begin
161
- if (form = form_at(pos)) == CHAR_PUNCT
162
- @buffer.delete_at(pos)
163
- parts[-1] += CHAR_PUNCT
164
- else
183
+ if form = form_at(pos)
165
184
  @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
166
185
  parts << form
167
186
  pos += 1
187
+ else
188
+ @buffer.delete_at(pos)
189
+ parts[-1] += CHAR_PUNCT
168
190
  end
169
191
  end while pos < len
170
192
 
171
- forward(Word.new_lexicals(parts.join(' '),
172
- WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) }))
193
+ wrd = Word.new_lexicals(parts.join(' '),
194
+ WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
195
+ wrd.token = @buffer[pos - 1].token
196
+
197
+ forward(wrd)
173
198
  end
174
199
 
175
- def check_multiword_key(len)
176
- return [] if valid_tokens_in_buffer < len
200
+ def check_multiword(len, lst = nil)
201
+ return unless process_buffer?(len)
177
202
 
178
- seq = []
203
+ seq, mul, sep = [], [], ' '
179
204
 
180
205
  @buffer.each { |obj|
181
206
  next seq << [obj] unless obj.is_a?(WordForm)
@@ -195,17 +220,18 @@ class Lingo
195
220
  }
196
221
 
197
222
  if @combine
198
- mul = []
199
-
200
223
  seq.shift.product(*seq) { |key|
201
- @mul_dic.select(key.join(' '), mul)
224
+ @mul_dic.select(key.join(sep), mul)
202
225
  break unless @all || mul.empty?
203
226
  } && mul.uniq!
204
-
205
- mul
206
227
  else
207
- @mul_dic.select(seq.map! { |i,| i }.join(' '))
228
+ @mul_dic.select(seq.map! { |i,| i }.join(sep), mul)
208
229
  end
230
+
231
+ lst.push(seq.size).concat(mul.map { |r| r.is_a?(Lexical) ?
232
+ r.form.count(sep) + 1 : r }).sort!.reverse!.uniq! if lst
233
+
234
+ mul unless mul.empty?
209
235
  end
210
236
 
211
237
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -43,7 +43,9 @@ class Lingo
43
43
  #
44
44
  # === Mögliche Verlinkung
45
45
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
46
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
46
+ # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
47
+ # ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
48
+ # Vector_filter
47
49
  #
48
50
  # === Parameter
49
51
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -95,15 +97,20 @@ class Lingo
95
97
 
96
98
  class Sequencer < BufferedAttendee
97
99
 
100
+ UNK = %w[#]
101
+ NUM = %w[0]
102
+
103
+ CLS = /[[:alpha:]#{NUM.join}]/o
104
+
98
105
  def init
99
- @stopper = get_array('stopper', DEFAULT_SKIP)
106
+ @stopper = get_ary('stopper', DEFAULT_SKIP)
100
107
  .push(WA_UNKNOWN, WA_UNKMULPART)
101
108
 
102
109
  @mwc = get_key('multiword', LA_MULTIWORD)
103
110
  @cls = []
104
111
 
105
112
  @seq = get_key('sequences').map { |str, fmt|
106
- @cls.concat(cls = (str = str.downcase).scan(/[[:alpha:]]/))
113
+ @cls.concat(cls = (str = str.downcase).scan(CLS))
107
114
 
108
115
  (str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
109
116
  fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
@@ -124,7 +131,7 @@ class Lingo
124
131
 
125
132
  def process_buffer
126
133
  flush(@buffer.size < 2 ? @buffer : begin
127
- arg, cls, mwc, unk = [[], buf = [], map = [], @seq], @cls, @mwc, %w[#]
134
+ arg = [[], buf = [], map = [], @seq]
128
135
 
129
136
  iter, skip, rewind = @buffer.each_with_index, 0, lambda {
130
137
  iter.rewind; skip.times { iter.next }; skip = 0
@@ -142,16 +149,17 @@ class Lingo
142
149
  rewind.call
143
150
  end
144
151
 
145
- att = obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : unk
152
+ att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
153
+ obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
146
154
 
147
- if (att &= cls).empty?
155
+ if (att &= @cls).empty?
148
156
  find_seq(*arg)
149
157
  rewind.call if skip > 0
150
158
  else
151
- if n = obj.multiword_size(mwc)
152
- n.times { iter.next }
153
- skip = idx + 1
154
- end
159
+ obj.each_lex(@mwc) { |lex|
160
+ lex.form.count(' ').succ.times { iter.next }
161
+ break skip = idx + 1
162
+ } unless tok
155
163
 
156
164
  buf << obj
157
165
  map << att
@@ -178,18 +186,18 @@ class Lingo
178
186
  while pos = q.index(str, pos || 0)
179
187
  _str, _cls = [$&, $&.chars] unless cls
180
188
 
181
- args.clear
189
+ _tok = nil; args.clear
182
190
 
183
191
  _cls.each_with_index { |wc, i|
184
- buf[pos + i].lexicals.find { |l|
185
- args[i] = l.form if l.attr == wc
186
- } or break
192
+ obj = buf[pos + i];_tok ||= obj.token
193
+
194
+ args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
195
+ break lex.form if lex.attr == wc } : obj.form or break
187
196
  } or next
188
197
 
189
- forms << (
198
+ forms << [_str, _tok,
190
199
  fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
191
- fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')
192
- )
200
+ fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
193
201
 
194
202
  pos += 1
195
203
  end
@@ -197,7 +205,12 @@ class Lingo
197
205
  }.clear
198
206
 
199
207
  forms.uniq!
200
- forms.each { |f| mat << Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE) }
208
+
209
+ forms.each { |s, t, f|
210
+ wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
211
+ wrd.pattern, wrd.token = s, t
212
+ mat << wrd
213
+ }
201
214
 
202
215
  buf.clear
203
216
  mat
@@ -36,7 +36,7 @@ class Lingo
36
36
  #
37
37
  # === Mögliche Verlinkung
38
38
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
39
- # Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Noneword_filter, Vector_filter
39
+ # Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Vector_filter
40
40
  #
41
41
  # === Parameter
42
42
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -75,7 +75,7 @@ class Lingo
75
75
  def init
76
76
  set_dic
77
77
  @com = !get_key('compound-parts', false)
78
- @skip = get_array('skip', WA_UNKNOWN, :upcase)
78
+ @skip = get_ary('skip', WA_UNKNOWN, :upcase)
79
79
  end
80
80
 
81
81
  def control(*)
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -107,10 +107,14 @@ class Lingo
107
107
 
108
108
  class TextReader < self
109
109
 
110
+ include TextUtils
111
+
110
112
  # TODO: FILE/LIR-FILE (?)
111
113
  def init
112
114
  get_files
113
115
 
116
+ @encoding = get_enc
117
+
114
118
  @filter = get_key('filter', false)
115
119
  @progress = get_key('progress', false)
116
120
 
@@ -124,42 +128,40 @@ class Lingo
124
128
  end
125
129
 
126
130
  def control(cmd, *)
127
- if cmd == :TALK
128
- command(:LIR) if @lir
129
- @files.each { |i| spool(i) }
130
-
131
- command(:EOT)
132
- :skip_command
133
- end
134
- end
131
+ return unless cmd == :TALK
135
132
 
136
- private
133
+ command(:LIR) if @lir
137
134
 
138
- def spool(path)
139
- command(:FILE, path)
135
+ @files.each { |path|
136
+ command(:FILE, path)
140
137
 
141
- io = !stdin?(path) ? open_file(name = path) : begin
142
- stdin = lingo.config.stdin.set_encoding(ENC)
143
- @progress ? StringIO.new(stdin.read) : stdin
144
- end
138
+ io = stdin?(path) ? open_stdin : open_path(name = path)
145
139
 
146
- Progress.new(self, @progress && io.size, name) { |progress|
147
- pos = 0 unless pos?(io = filter(io, path, progress))
140
+ Progress.new(self, @progress && io.size, name) { |progress|
141
+ pos = 0 unless pos?(io = filter(io, path, progress))
148
142
 
149
- io.each { |line|
150
- progress << offset = pos ? pos += line.bytesize : io.pos
143
+ io.each { |line|
144
+ progress << offset = pos ? pos += line.bytesize : io.pos
151
145
 
152
- line =~ @skip ? nil : line =~ @lir ?
153
- command(:RECORD, $1 || $&) : begin
154
- line.sub!(@cut, '') if @cut
155
- forward(line, offset) unless line.empty?
156
- end
146
+ line =~ @skip ? nil : line =~ @lir ?
147
+ command(:RECORD, $1 || $&) : begin
148
+ line.sub!(@cut, '') if @cut
149
+ forward(line, offset) unless line.empty?
150
+ end
151
+ }
157
152
  }
153
+
154
+ io.close unless stdin?(path)
155
+
156
+ command(:EOF, path)
158
157
  }
159
158
 
160
- command(:EOF, path)
159
+ command(:EOT)
160
+ :skip_command
161
161
  end
162
162
 
163
+ private
164
+
163
165
  def filter(io, path, progress)
164
166
  case @filter == true ? file_type(io, path) : @filter.to_s
165
167
  when 'pdftotext' then filter_pdftotext(io, path, progress)
@@ -170,64 +172,51 @@ class Lingo
170
172
  end
171
173
  end
172
174
 
173
- def filter_pdftotext(io, path, progress)
174
- if cmd = File.which(name = 'pdftotext')
175
- with_tempfile(name) { |tempfile|
176
- pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
177
- system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
175
+ def filter_pdftotext(io, path, progress, name = 'pdftotext')
176
+ cancel_filter(:PDF, name, :command) unless cmd = File.which(name)
178
177
 
179
- progress.init(File.size(txt_path)) if @progress
180
- open_file(txt_path)
181
- }
182
- else
183
- cancel_filter(:PDF, name, :command)
184
- end
178
+ with_tempfile(name) { |tempfile|
179
+ pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
180
+ system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
181
+
182
+ progress.init(File.size(txt_path)) if @progress
183
+ open_path(txt_path)
184
+ }
185
185
  end
186
186
 
187
187
  def filter_pdf(io)
188
- if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
189
- text_enum(PDF::Reader.new(io).pages)
190
- else
191
- cancel_filter(:PDF, 'pdf-reader')
192
- end
188
+ Object.const_defined?(:PDF) && PDF.const_defined?(:Reader) ? text_enum(
189
+ PDF::Reader.new(io).pages) : cancel_filter(:PDF, 'pdf-reader')
193
190
  end
194
191
 
195
- def filter_html(io, xml = false)
196
- type = xml ? :XML : :HTML
197
-
198
- if Object.const_defined?(:Nokogiri)
199
- text_enum(Nokogiri.send(type, io, nil, ENC).children)
200
- else
201
- cancel_filter(type, :nokogiri)
202
- end
192
+ def filter_html(io, xml = false, type = xml ? :XML : :HTML)
193
+ Object.const_defined?(:Nokogiri) ? text_enum(Nokogiri.send(type,
194
+ io, nil, @encoding).children) : cancel_filter(type, :nokogiri)
203
195
  end
204
196
 
205
197
  def file_type(io, path)
206
- if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
207
- type = FileMagic.fm(:mime, simplified: true).io(io, 256)
208
- io.rewind
209
- type
210
- elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
211
- if type = MIME::Types.of(path).first
212
- type.content_type
213
- else
214
- cancel('Filters not available. File type could not be determined.')
215
- end
216
- else
217
- cancel("Filters not available. Please install the `ruby-filemagic' or `mime-types' gem.")
218
- end
198
+ Object.const_defined?(:FileMagic) && io.respond_to?(:pos=) ?
199
+ FileMagic.fm(:mime, simplified: true).io(io, 256, true) :
200
+ Object.const_defined?(:MIME) && MIME.const_defined?(:Types) ?
201
+ (type = MIME::Types.of(path).first) ? type.content_type :
202
+ cancel_filters('File type could not be determined.') :
203
+ cancel_filters(please_install(:gem, 'ruby-filemagic', 'mime-types'))
204
+ end
205
+
206
+ def cancel_filters(msg)
207
+ cancel("Filters not available. #{msg}")
219
208
  end
220
209
 
221
210
  def cancel_filter(type, name, what = :gem)
222
- cancel("#{type} filter not available. Please install the `#{name}' #{what}.")
211
+ cancel("#{type} filter not available. #{please_install(what, name)}")
223
212
  end
224
213
 
225
- def cancel(msg)
226
- throw(:cancel, msg)
214
+ def please_install(what, *names)
215
+ "Please install the `#{names.join("' or `")}' #{what}."
227
216
  end
228
217
 
229
- def stdin?(path)
230
- %w[STDIN -].include?(path)
218
+ def cancel(msg)
219
+ throw(:cancel, msg)
231
220
  end
232
221
 
233
222
  def pos?(io)
@@ -235,10 +224,6 @@ class Lingo
235
224
  rescue Errno::ESPIPE
236
225
  end
237
226
 
238
- def open_file(path)
239
- File.open(path, 'rb', encoding: ENC)
240
- end
241
-
242
227
  def with_tempfile(name)
243
228
  require 'tempfile'
244
229
 
@@ -263,31 +248,17 @@ class Lingo
263
248
 
264
249
  @files = []
265
250
 
266
- Array(get_key('files', '-')).each { |path|
267
- stdin?(path) ? @files << path :
268
- add_files(File.expand_path(path), *args)
269
- }
251
+ Array(get_key('files', '-')).each { |path| stdin?(path) ?
252
+ @files << path : add_files(File.expand_path(path), *args) }
270
253
  end
271
254
 
272
255
  def add_files(path, glob, recursive = false)
273
- entries = Dir[path].sort!
274
- raise FileNotFoundError.new(path) if entries.empty?
256
+ raise FileNotFoundError.new(path) if (entries = Dir[path]).sort!.empty?
275
257
 
276
258
  entries.each { |entry|
277
- if File.directory?(entry)
278
- if recursive
279
- Find.find(entry) { |match|
280
- if File.file?(match) && File.fnmatch?(glob, match)
281
- @files << match
282
- end
283
- }
284
- else
285
- add_files(File.join(entry, glob), glob)
286
- end
287
- else
288
- @files << entry
289
- end
290
- }
259
+ !File.directory?(entry) ? @files << entry : !recursive ?
260
+ add_files(File.join(entry, glob), glob) : Find.find(entry) { |match|
261
+ @files << match if File.file?(match) && File.fnmatch?(glob, match) } }
291
262
  end
292
263
 
293
264
  end