lingo 1.8.6 → 1.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +40 -4
  3. data/README +22 -51
  4. data/Rakefile +3 -17
  5. data/config/lingo.cfg +24 -15
  6. data/config/lir.cfg +25 -16
  7. data/dict/de/test_muh.txt +6 -0
  8. data/dict/en/lingo-dic.txt +2 -3
  9. data/lang/de.lang +10 -9
  10. data/lang/en.lang +1 -1
  11. data/lib/lingo.rb +4 -4
  12. data/lib/lingo/attendee.rb +27 -7
  13. data/lib/lingo/attendee/analysis_filter.rb +81 -0
  14. data/lib/lingo/attendee/debug_filter.rb +42 -0
  15. data/lib/lingo/attendee/debugger.rb +2 -11
  16. data/lib/lingo/attendee/decomposer.rb +6 -3
  17. data/lib/lingo/attendee/formatter.rb +6 -6
  18. data/lib/lingo/attendee/hal_filter.rb +94 -0
  19. data/lib/lingo/attendee/lsi_filter.rb +99 -0
  20. data/lib/lingo/attendee/multi_worder.rb +69 -43
  21. data/lib/lingo/attendee/sequencer.rb +32 -19
  22. data/lib/lingo/attendee/synonymer.rb +2 -2
  23. data/lib/lingo/attendee/text_reader.rb +63 -92
  24. data/lib/lingo/attendee/text_writer.rb +12 -21
  25. data/lib/lingo/attendee/tokenizer.rb +32 -21
  26. data/lib/lingo/attendee/variator.rb +3 -3
  27. data/lib/lingo/attendee/vector_filter.rb +7 -9
  28. data/lib/lingo/attendee/word_searcher.rb +3 -3
  29. data/lib/lingo/buffered_attendee.rb +3 -36
  30. data/lib/lingo/config.rb +1 -1
  31. data/lib/lingo/ctl.rb +7 -155
  32. data/lib/lingo/ctl/analysis.rb +136 -0
  33. data/lib/lingo/ctl/files.rb +86 -0
  34. data/lib/lingo/ctl/other.rb +140 -0
  35. data/lib/lingo/database.rb +64 -60
  36. data/lib/lingo/database/crypter.rb +7 -5
  37. data/lib/lingo/error.rb +5 -4
  38. data/lib/lingo/language.rb +13 -5
  39. data/lib/lingo/language/grammar.rb +13 -7
  40. data/lib/lingo/language/token.rb +6 -0
  41. data/lib/lingo/language/word.rb +23 -36
  42. data/lib/lingo/language/word_form.rb +5 -1
  43. data/lib/lingo/srv.rb +2 -2
  44. data/lib/lingo/text_utils.rb +96 -0
  45. data/lib/lingo/version.rb +1 -1
  46. data/lib/lingo/web/views/index.erb +1 -1
  47. data/test/attendee/ts_decomposer.rb +23 -5
  48. data/test/attendee/ts_multi_worder.rb +66 -0
  49. data/test/attendee/ts_sequencer.rb +28 -4
  50. data/test/attendee/ts_text_reader.rb +20 -0
  51. data/test/attendee/ts_tokenizer.rb +20 -0
  52. data/test/attendee/ts_variator.rb +1 -1
  53. data/test/attendee/ts_word_searcher.rb +39 -3
  54. data/test/lir3.txt +12 -0
  55. data/test/ref/artikel.non +1 -12
  56. data/test/ref/artikel.seq +3 -1
  57. data/test/ref/artikel.vec +1 -0
  58. data/test/ref/artikel.vef +35 -34
  59. data/test/ref/artikel.ven +8 -7
  60. data/test/ref/artikel.ver +34 -33
  61. data/test/ref/artikel.vet +2573 -2563
  62. data/test/ref/lir.non +77 -78
  63. data/test/ref/lir.seq +9 -7
  64. data/test/ref/lir.syn +1 -1
  65. data/test/ref/lir.vec +41 -41
  66. data/test/ref/lir.vef +210 -210
  67. data/test/ref/lir.ven +46 -46
  68. data/test/ref/lir.ver +72 -72
  69. data/test/ref/lir.vet +329 -329
  70. data/test/ts_database.rb +166 -62
  71. data/test/ts_language.rb +23 -23
  72. metadata +53 -34
  73. data/lib/lingo/attendee/dehyphenizer.rb +0 -120
  74. data/lib/lingo/attendee/noneword_filter.rb +0 -115
  75. data/test/attendee/ts_noneword_filter.rb +0 -15
@@ -0,0 +1,99 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # Lingo -- A full-featured automatic indexing system #
7
+ # #
8
+ # Copyright (C) 2005-2007 John Vorhauer #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
+ # #
11
+ # Lingo is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU Affero General Public License as published by the Free #
13
+ # Software Foundation; either version 3 of the License, or (at your option) #
14
+ # any later version. #
15
+ # #
16
+ # Lingo is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
19
+ # more details. #
20
+ # #
21
+ # You should have received a copy of the GNU Affero General Public License #
22
+ # along with Lingo. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
class Lingo

  class Attendee

    # Deferred attendee that gathers, per document, the down-cased forms of
    # the lexicals of incoming Word objects and, once flushed, feeds them to
    # Lsi4R and forwards the resulting values downstream.
    #
    # Configuration keys read in #init:
    # lexicals::  pattern handed to Word#each_lex (default <tt>[sy]</tt>)
    # skip::      word attributes to ignore (default DEFAULT_SKIP, upcased)
    # transform:: passed to Lsi4R#build (default Lsi4R::DEFAULT_TRANSFORM)
    # cut::       passed to Lsi4R#build as +cutoff+ (default Lsi4R::DEFAULT_CUTOFF)
    # min, abs, nul, new:: passed through to Lsi4R#each_norm as options
    # sort::      output ordering/format selector; +false+ (default), 'normal',
    #             'sto', or any other truthy value (see #send_lsi)
    class LsiFilter < DeferredAttendee

      # Loads the lsi4r library, reads the configuration and resets the
      # per-document collection state.
      def init
        require_lib('lsi4r')

        @lex  = get_re('lexicals', '[sy]')
        @skip = get_ary('skip', DEFAULT_SKIP, :upcase)

        @transform = get_key('transform', Lsi4R::DEFAULT_TRANSFORM)
        @cutoff    = get_flo('cut', Lsi4R::DEFAULT_CUTOFF)

        @min = get_flo('min', false)
        @abs = get_flo('abs', false)
        @nul = get_flo('nul', false)
        @new = get_key('new', true)

        # The sort selector is compared case-insensitively when it is a string.
        @sort = get_key('sort', false)
        @sort.downcase! if @sort.respond_to?(:downcase!)

        # Document counter and the collected forms, keyed by document number.
        @docnum  = 0
        @vectors = Hash.new { |hash, key| hash[key] = [] }
      end

      # Answers +:skip_command+ for +:EOL+ and +nil+ for every other command.
      def control(cmd, *)
        cmd == :EOL ? :skip_command : nil
      end

      # Advances the document counter whenever +cmd+ is one of TERMINALS.
      def control_deferred(cmd, *)
        return unless TERMINALS.include?(cmd)

        @docnum += 1
      end

      # Records the down-cased forms of +obj+'s lexicals matching the
      # +lexicals+ pattern under the current document number. Objects that
      # are not a Word, or whose attribute is listed in +skip+, are ignored.
      def process(obj)
        return if !obj.is_a?(Word) || @skip.include?(obj.attr)

        forms = []
        obj.each_lex(@lex) { |lex| forms << Unicode.downcase(lex.form) }

        # Only touch the hash when there is something to add, so that no
        # empty entry is created for the current document.
        @vectors[@docnum].concat(forms) unless forms.empty?
      end

      private

      # Builds an Lsi4R instance from the collected forms (clearing the
      # collection afterwards) and, if the build succeeds, yields a lambda
      # that takes a document number and forwards that document's
      # Lsi4R#each_norm results.
      #
      # Output depends on +sort+:
      # false::    each result is forwarded as <tt>'%s %.5f'</tt>, in the
      #            order each_norm yields them
      # 'normal':: results are sorted and only the first value of each is
      #            forwarded
      # 'sto'::    results are ordered by descending second value, then by
      #            first value, and forwarded as <tt>'%s {%.5f}'</tt>
      # other::    same ordering, forwarded as <tt>'%2$.5f %1$s'</tt>
      #
      # NOTE(review): the two values are presumably term and weight --
      # confirm against Lsi4R#each_norm.
      def send_lsi
        lsi = Lsi4R.new(@vectors)
        @vectors.clear

        return unless lsi.build(transform: @transform, cutoff: @cutoff)

        options = { min: @min, abs: @abs, nul: @nul, new: @new }

        fmt =
          if @sort == 'normal'
            nil
          elsif !@sort
            '%s %.5f'
          elsif @sort == 'sto'
            '%s {%.5f}'
          else
            '%2$.5f %1$s'
          end

        emitter =
          if @sort
            # Scratch buffer shared between invocations; emptied after each.
            entries = []

            lambda { |docnum|
              lsi.each_norm(docnum, options) { |_, *pair| entries << pair }

              if fmt
                ranked = entries.sort_by { |first, second| [-second, first] }
                ranked.each { |pair| forward(fmt % pair) }
              else
                entries.sort!.each { |first, _| forward(first) }
              end

              entries.clear
            }
          else
            lambda { |docnum|
              lsi.each_norm(docnum, options) { |_, *pair| forward(fmt % pair) }
            }
          end

        yield emitter
      end

      alias_method :flush_deferred, :send_lsi

    end

  end

end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -40,7 +40,9 @@ class Lingo
40
40
  #
41
41
  # === Mögliche Verlinkung
42
42
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, MultiWorder
43
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
43
+ # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_MULTIWORD). Je erkannter Mehrwortgruppe wird
44
+ # ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
45
+ # Vector_filter
44
46
  #
45
47
  # === Parameter
46
48
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -88,7 +90,7 @@ class Lingo
88
90
 
89
91
  lex_src, lex_mod, d = nil, nil, lingo.dictionary_config['databases']
90
92
 
91
- (mul_src = get_array('source')).each { |src|
93
+ (mul_src = get_ary('source')).each { |src|
92
94
  s, m = d[src].values_at('use-lex', 'lex-mode')
93
95
 
94
96
  if lex_src.nil? || lex_src == s
@@ -106,76 +108,99 @@ class Lingo
106
108
  @lex_gra = grammar(lex_src, lex_mod)
107
109
 
108
110
  @syn_dic = if @combine && has_key?('use-syn')
109
- dictionary(get_array('use-syn'), get_key('syn-mode', 'all'))
111
+ dictionary(get_ary('use-syn'), get_key('syn-mode', 'all'))
110
112
  end
111
113
 
112
114
  @expected_tokens_in_buffer, @eof_handling = 3, false
113
115
  end
114
116
 
115
117
  def control(cmd, *)
116
- control_multi(cmd)
118
+ if [:RECORD, :EOF].include?(cmd)
119
+ @eof_handling = true
120
+
121
+ while process_buffer?(2)
122
+ process_buffer
123
+ end
124
+
125
+ forward_number_of_token
126
+
127
+ @eof_handling = false
128
+ end
117
129
  end
118
130
 
119
- def process_buffer
120
- unless form_at(0) == CHAR_PUNCT
121
- unless (res = check_multiword_key(3)).empty?
122
- len = res.map { |r| r.is_a?(Lexical) ? r.form.count(' ') + 1 : r }
123
- len.sort!.reverse!
131
+ private
132
+
133
+ def form_at(index)
134
+ obj = @buffer[index]
135
+ obj.form if obj.is_a?(WordForm) && obj.form != CHAR_PUNCT
136
+ end
137
+
138
+ def forward_number_of_token(len = default = @buffer.size, punct = !default)
139
+ begin
140
+ unless @buffer.empty?
141
+ forward(item = @buffer.delete_at(0))
142
+ len -= 1 unless punct && item.form == CHAR_PUNCT
143
+ end
144
+ end while len > 0
145
+ end
146
+
147
+ def process_buffer?(num = @expected_tokens_in_buffer)
148
+ @buffer.count { |item| item.form != CHAR_PUNCT } >= num
149
+ end
124
150
 
125
- unless (max = len.first) > 3
151
+ def process_buffer
152
+ if form_at(0)
153
+ if res = check_multiword(3, len = [])
154
+ if (max = len.first) <= 3
126
155
  create_and_forward_multiword(3, res)
127
156
  forward_number_of_token(3)
157
+ elsif !@eof_handling && @buffer.size < max
158
+ @expected_tokens_in_buffer = max
128
159
  else
129
- unless @eof_handling || @buffer.size >= max
130
- @expected_tokens_in_buffer = max
131
- else
132
- forward_number_of_token(len.find { |l|
133
- r = check_multiword_key(l)
134
- create_and_forward_multiword(l, r) unless r.empty?
135
- } || 1)
136
-
137
- @expected_tokens_in_buffer = 3
138
- process_buffer if process_buffer?
139
- end
160
+ forward_number_of_token(len.find { |l|
161
+ create_and_forward_multiword(l) } || 1)
162
+
163
+ @expected_tokens_in_buffer = 3
164
+ process_buffer if process_buffer?
140
165
  end
141
166
 
142
167
  return
143
168
  end
144
169
 
145
- unless (res = check_multiword_key(2)).empty?
146
- create_and_forward_multiword(2, res)
147
- forward_number_of_token(1)
148
- end
170
+ create_and_forward_multiword(2) && forward_number_of_token(1)
149
171
  end
150
172
 
151
173
  forward_number_of_token(1, false)
152
174
  @expected_tokens_in_buffer = 3
153
175
  end
154
176
 
155
- private
177
+ def create_and_forward_multiword(len, lex = check_multiword(len))
178
+ return unless lex
156
179
 
157
- def create_and_forward_multiword(len, lex)
158
180
  pos, parts = 0, []
159
181
 
160
182
  begin
161
- if (form = form_at(pos)) == CHAR_PUNCT
162
- @buffer.delete_at(pos)
163
- parts[-1] += CHAR_PUNCT
164
- else
183
+ if form = form_at(pos)
165
184
  @buffer[pos].attr = WA_UNKMULPART if @buffer[pos].unknown?
166
185
  parts << form
167
186
  pos += 1
187
+ else
188
+ @buffer.delete_at(pos)
189
+ parts[-1] += CHAR_PUNCT
168
190
  end
169
191
  end while pos < len
170
192
 
171
- forward(Word.new_lexicals(parts.join(' '),
172
- WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) }))
193
+ wrd = Word.new_lexicals(parts.join(' '),
194
+ WA_MULTIWORD, lex.select { |l| l.is_a?(Lexical) })
195
+ wrd.token = @buffer[pos - 1].token
196
+
197
+ forward(wrd)
173
198
  end
174
199
 
175
- def check_multiword_key(len)
176
- return [] if valid_tokens_in_buffer < len
200
+ def check_multiword(len, lst = nil)
201
+ return unless process_buffer?(len)
177
202
 
178
- seq = []
203
+ seq, mul, sep = [], [], ' '
179
204
 
180
205
  @buffer.each { |obj|
181
206
  next seq << [obj] unless obj.is_a?(WordForm)
@@ -195,17 +220,18 @@ class Lingo
195
220
  }
196
221
 
197
222
  if @combine
198
- mul = []
199
-
200
223
  seq.shift.product(*seq) { |key|
201
- @mul_dic.select(key.join(' '), mul)
224
+ @mul_dic.select(key.join(sep), mul)
202
225
  break unless @all || mul.empty?
203
226
  } && mul.uniq!
204
-
205
- mul
206
227
  else
207
- @mul_dic.select(seq.map! { |i,| i }.join(' '))
228
+ @mul_dic.select(seq.map! { |i,| i }.join(sep), mul)
208
229
  end
230
+
231
+ lst.push(seq.size).concat(mul.map { |r| r.is_a?(Lexical) ?
232
+ r.form.count(sep) + 1 : r }).sort!.reverse!.uniq! if lst
233
+
234
+ mul unless mul.empty?
209
235
  end
210
236
 
211
237
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -43,7 +43,9 @@ class Lingo
43
43
  #
44
44
  # === Mögliche Verlinkung
45
45
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
46
- # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer, Noneword_filter, Vector_filter
46
+ # Erzeugt:: Daten vom Typ *Word* (mit Attribut WA_SEQUENCE). Je erkannter Mehrwortgruppe wird
47
+ # ein zusätzliches Word-Objekt in den Datenstrom eingefügt. Z.B. für Ocr_variator, Sequencer,
48
+ # Vector_filter
47
49
  #
48
50
  # === Parameter
49
51
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -95,15 +97,20 @@ class Lingo
95
97
 
96
98
  class Sequencer < BufferedAttendee
97
99
 
100
+ UNK = %w[#]
101
+ NUM = %w[0]
102
+
103
+ CLS = /[[:alpha:]#{NUM.join}]/o
104
+
98
105
  def init
99
- @stopper = get_array('stopper', DEFAULT_SKIP)
106
+ @stopper = get_ary('stopper', DEFAULT_SKIP)
100
107
  .push(WA_UNKNOWN, WA_UNKMULPART)
101
108
 
102
109
  @mwc = get_key('multiword', LA_MULTIWORD)
103
110
  @cls = []
104
111
 
105
112
  @seq = get_key('sequences').map { |str, fmt|
106
- @cls.concat(cls = (str = str.downcase).scan(/[[:alpha:]]/))
113
+ @cls.concat(cls = (str = str.downcase).scan(CLS))
107
114
 
108
115
  (str =~ /\W/ ? [Regexp.new(str), nil] : [str, cls]).push(
109
116
  fmt == true ? '|' : fmt ? fmt.gsub(/\d+/, '%\&$s') : nil)
@@ -124,7 +131,7 @@ class Lingo
124
131
 
125
132
  def process_buffer
126
133
  flush(@buffer.size < 2 ? @buffer : begin
127
- arg, cls, mwc, unk = [[], buf = [], map = [], @seq], @cls, @mwc, %w[#]
134
+ arg = [[], buf = [], map = [], @seq]
128
135
 
129
136
  iter, skip, rewind = @buffer.each_with_index, 0, lambda {
130
137
  iter.rewind; skip.times { iter.next }; skip = 0
@@ -142,16 +149,17 @@ class Lingo
142
149
  rewind.call
143
150
  end
144
151
 
145
- att = obj.is_a?(Word) && !obj.unknown? ? obj.attrs(false) : unk
152
+ att = (tok = obj.is_a?(Token)) ? obj.number? ? NUM : UNK :
153
+ obj.is_a?(Word) && !obj.unknown? ? obj.compound_attrs : UNK
146
154
 
147
- if (att &= cls).empty?
155
+ if (att &= @cls).empty?
148
156
  find_seq(*arg)
149
157
  rewind.call if skip > 0
150
158
  else
151
- if n = obj.multiword_size(mwc)
152
- n.times { iter.next }
153
- skip = idx + 1
154
- end
159
+ obj.each_lex(@mwc) { |lex|
160
+ lex.form.count(' ').succ.times { iter.next }
161
+ break skip = idx + 1
162
+ } unless tok
155
163
 
156
164
  buf << obj
157
165
  map << att
@@ -178,18 +186,18 @@ class Lingo
178
186
  while pos = q.index(str, pos || 0)
179
187
  _str, _cls = [$&, $&.chars] unless cls
180
188
 
181
- args.clear
189
+ _tok = nil; args.clear
182
190
 
183
191
  _cls.each_with_index { |wc, i|
184
- buf[pos + i].lexicals.find { |l|
185
- args[i] = l.form if l.attr == wc
186
- } or break
192
+ obj = buf[pos + i];_tok ||= obj.token
193
+
194
+ args[i] = obj.is_a?(Word) ? obj.lexicals.find { |lex|
195
+ break lex.form if lex.attr == wc } : obj.form or break
187
196
  } or next
188
197
 
189
- forms << (
198
+ forms << [_str, _tok,
190
199
  fmt =~ /\d/ ? fmt.gsub('%0$s', _str) % args :
191
- fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')
192
- )
200
+ fmt ? "#{_str}:#{args.join(fmt)}" : args.join(' ')]
193
201
 
194
202
  pos += 1
195
203
  end
@@ -197,7 +205,12 @@ class Lingo
197
205
  }.clear
198
206
 
199
207
  forms.uniq!
200
- forms.each { |f| mat << Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE) }
208
+
209
+ forms.each { |s, t, f|
210
+ wrd = Word.new_lexical(f, WA_SEQUENCE, LA_SEQUENCE)
211
+ wrd.pattern, wrd.token = s, t
212
+ mat << wrd
213
+ }
201
214
 
202
215
  buf.clear
203
216
  mat
@@ -36,7 +36,7 @@ class Lingo
36
36
  #
37
37
  # === Mögliche Verlinkung
38
38
  # Erwartet:: Daten vom Typ *Word* z.B. von Wordsearcher, Decomposer, Ocr_variator, Multiworder
39
- # Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Noneword_filter, Vector_filter
39
+ # Erzeugt:: Daten vom Typ *Word* (ggf. um Relationen ergänzt) z.B. für Decomposer, Ocr_variator, Multiworder, Sequencer, Vector_filter
40
40
  #
41
41
  # === Parameter
42
42
  # Kursiv dargestellte Parameter sind optional (ggf. mit Angabe der Voreinstellung).
@@ -75,7 +75,7 @@ class Lingo
75
75
  def init
76
76
  set_dic
77
77
  @com = !get_key('compound-parts', false)
78
- @skip = get_array('skip', WA_UNKNOWN, :upcase)
78
+ @skip = get_ary('skip', WA_UNKNOWN, :upcase)
79
79
  end
80
80
 
81
81
  def control(*)
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -107,10 +107,14 @@ class Lingo
107
107
 
108
108
  class TextReader < self
109
109
 
110
+ include TextUtils
111
+
110
112
  # TODO: FILE/LIR-FILE (?)
111
113
  def init
112
114
  get_files
113
115
 
116
+ @encoding = get_enc
117
+
114
118
  @filter = get_key('filter', false)
115
119
  @progress = get_key('progress', false)
116
120
 
@@ -124,42 +128,40 @@ class Lingo
124
128
  end
125
129
 
126
130
  def control(cmd, *)
127
- if cmd == :TALK
128
- command(:LIR) if @lir
129
- @files.each { |i| spool(i) }
130
-
131
- command(:EOT)
132
- :skip_command
133
- end
134
- end
131
+ return unless cmd == :TALK
135
132
 
136
- private
133
+ command(:LIR) if @lir
137
134
 
138
- def spool(path)
139
- command(:FILE, path)
135
+ @files.each { |path|
136
+ command(:FILE, path)
140
137
 
141
- io = !stdin?(path) ? open_file(name = path) : begin
142
- stdin = lingo.config.stdin.set_encoding(ENC)
143
- @progress ? StringIO.new(stdin.read) : stdin
144
- end
138
+ io = stdin?(path) ? open_stdin : open_path(name = path)
145
139
 
146
- Progress.new(self, @progress && io.size, name) { |progress|
147
- pos = 0 unless pos?(io = filter(io, path, progress))
140
+ Progress.new(self, @progress && io.size, name) { |progress|
141
+ pos = 0 unless pos?(io = filter(io, path, progress))
148
142
 
149
- io.each { |line|
150
- progress << offset = pos ? pos += line.bytesize : io.pos
143
+ io.each { |line|
144
+ progress << offset = pos ? pos += line.bytesize : io.pos
151
145
 
152
- line =~ @skip ? nil : line =~ @lir ?
153
- command(:RECORD, $1 || $&) : begin
154
- line.sub!(@cut, '') if @cut
155
- forward(line, offset) unless line.empty?
156
- end
146
+ line =~ @skip ? nil : line =~ @lir ?
147
+ command(:RECORD, $1 || $&) : begin
148
+ line.sub!(@cut, '') if @cut
149
+ forward(line, offset) unless line.empty?
150
+ end
151
+ }
157
152
  }
153
+
154
+ io.close unless stdin?(path)
155
+
156
+ command(:EOF, path)
158
157
  }
159
158
 
160
- command(:EOF, path)
159
+ command(:EOT)
160
+ :skip_command
161
161
  end
162
162
 
163
+ private
164
+
163
165
  def filter(io, path, progress)
164
166
  case @filter == true ? file_type(io, path) : @filter.to_s
165
167
  when 'pdftotext' then filter_pdftotext(io, path, progress)
@@ -170,64 +172,51 @@ class Lingo
170
172
  end
171
173
  end
172
174
 
173
- def filter_pdftotext(io, path, progress)
174
- if cmd = File.which(name = 'pdftotext')
175
- with_tempfile(name) { |tempfile|
176
- pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
177
- system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
175
+ def filter_pdftotext(io, path, progress, name = 'pdftotext')
176
+ cancel_filter(:PDF, name, :command) unless cmd = File.which(name)
178
177
 
179
- progress.init(File.size(txt_path)) if @progress
180
- open_file(txt_path)
181
- }
182
- else
183
- cancel_filter(:PDF, name, :command)
184
- end
178
+ with_tempfile(name) { |tempfile|
179
+ pdf_path = stdin?(path) ? tempfile[:pdf, io] : path
180
+ system(cmd, '-q', pdf_path, txt_path = tempfile[:txt])
181
+
182
+ progress.init(File.size(txt_path)) if @progress
183
+ open_path(txt_path)
184
+ }
185
185
  end
186
186
 
187
187
  def filter_pdf(io)
188
- if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
189
- text_enum(PDF::Reader.new(io).pages)
190
- else
191
- cancel_filter(:PDF, 'pdf-reader')
192
- end
188
+ Object.const_defined?(:PDF) && PDF.const_defined?(:Reader) ? text_enum(
189
+ PDF::Reader.new(io).pages) : cancel_filter(:PDF, 'pdf-reader')
193
190
  end
194
191
 
195
- def filter_html(io, xml = false)
196
- type = xml ? :XML : :HTML
197
-
198
- if Object.const_defined?(:Nokogiri)
199
- text_enum(Nokogiri.send(type, io, nil, ENC).children)
200
- else
201
- cancel_filter(type, :nokogiri)
202
- end
192
+ def filter_html(io, xml = false, type = xml ? :XML : :HTML)
193
+ Object.const_defined?(:Nokogiri) ? text_enum(Nokogiri.send(type,
194
+ io, nil, @encoding).children) : cancel_filter(type, :nokogiri)
203
195
  end
204
196
 
205
197
  def file_type(io, path)
206
- if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
207
- type = FileMagic.fm(:mime, simplified: true).io(io, 256)
208
- io.rewind
209
- type
210
- elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
211
- if type = MIME::Types.of(path).first
212
- type.content_type
213
- else
214
- cancel('Filters not available. File type could not be determined.')
215
- end
216
- else
217
- cancel("Filters not available. Please install the `ruby-filemagic' or `mime-types' gem.")
218
- end
198
+ Object.const_defined?(:FileMagic) && io.respond_to?(:pos=) ?
199
+ FileMagic.fm(:mime, simplified: true).io(io, 256, true) :
200
+ Object.const_defined?(:MIME) && MIME.const_defined?(:Types) ?
201
+ (type = MIME::Types.of(path).first) ? type.content_type :
202
+ cancel_filters('File type could not be determined.') :
203
+ cancel_filters(please_install(:gem, 'ruby-filemagic', 'mime-types'))
204
+ end
205
+
206
+ def cancel_filters(msg)
207
+ cancel("Filters not available. #{msg}")
219
208
  end
220
209
 
221
210
  def cancel_filter(type, name, what = :gem)
222
- cancel("#{type} filter not available. Please install the `#{name}' #{what}.")
211
+ cancel("#{type} filter not available. #{please_install(what, name)}")
223
212
  end
224
213
 
225
- def cancel(msg)
226
- throw(:cancel, msg)
214
+ def please_install(what, *names)
215
+ "Please install the `#{names.join("' or `")}' #{what}."
227
216
  end
228
217
 
229
- def stdin?(path)
230
- %w[STDIN -].include?(path)
218
+ def cancel(msg)
219
+ throw(:cancel, msg)
231
220
  end
232
221
 
233
222
  def pos?(io)
@@ -235,10 +224,6 @@ class Lingo
235
224
  rescue Errno::ESPIPE
236
225
  end
237
226
 
238
- def open_file(path)
239
- File.open(path, 'rb', encoding: ENC)
240
- end
241
-
242
227
  def with_tempfile(name)
243
228
  require 'tempfile'
244
229
 
@@ -263,31 +248,17 @@ class Lingo
263
248
 
264
249
  @files = []
265
250
 
266
- Array(get_key('files', '-')).each { |path|
267
- stdin?(path) ? @files << path :
268
- add_files(File.expand_path(path), *args)
269
- }
251
+ Array(get_key('files', '-')).each { |path| stdin?(path) ?
252
+ @files << path : add_files(File.expand_path(path), *args) }
270
253
  end
271
254
 
272
255
  def add_files(path, glob, recursive = false)
273
- entries = Dir[path].sort!
274
- raise FileNotFoundError.new(path) if entries.empty?
256
+ raise FileNotFoundError.new(path) if (entries = Dir[path]).sort!.empty?
275
257
 
276
258
  entries.each { |entry|
277
- if File.directory?(entry)
278
- if recursive
279
- Find.find(entry) { |match|
280
- if File.file?(match) && File.fnmatch?(glob, match)
281
- @files << match
282
- end
283
- }
284
- else
285
- add_files(File.join(entry, glob), glob)
286
- end
287
- else
288
- @files << entry
289
- end
290
- }
259
+ !File.directory?(entry) ? @files << entry : !recursive ?
260
+ add_files(File.join(entry, glob), glob) : Find.find(entry) { |match|
261
+ @files << match if File.file?(match) && File.fnmatch?(glob, match) } }
291
262
  end
292
263
 
293
264
  end