lingo 1.8.5 → 1.8.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (109) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +25 -0
  3. data/README +7 -5
  4. data/Rakefile +58 -55
  5. data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
  6. data/{lingo.cfg → config/lingo.cfg} +10 -2
  7. data/{lir.cfg → config/lir.cfg} +10 -2
  8. data/{de → dict/de}/lingo-abk.txt +0 -0
  9. data/{de → dict/de}/lingo-dic.txt +0 -0
  10. data/{de → dict/de}/lingo-mul.txt +0 -0
  11. data/{de → dict/de}/lingo-syn.txt +0 -0
  12. data/{de → dict/de}/test_dic.txt +0 -0
  13. data/{de → dict/de}/test_gen.txt +0 -0
  14. data/{de → dict/de}/test_mu2.txt +0 -0
  15. data/{de → dict/de}/test_mul.txt +0 -0
  16. data/{de → dict/de}/test_sgw.txt +0 -0
  17. data/{de → dict/de}/test_syn.txt +0 -0
  18. data/{de → dict/de}/user-dic.txt +0 -0
  19. data/{en → dict/en}/lingo-dic.txt +0 -0
  20. data/{en → dict/en}/lingo-irr.txt +0 -0
  21. data/{en → dict/en}/lingo-mul.txt +0 -0
  22. data/{en → dict/en}/lingo-syn.txt +0 -0
  23. data/{en → dict/en}/lingo-wdn.txt +0 -0
  24. data/{en → dict/en}/user-dic.txt +0 -0
  25. data/{ru → dict/ru}/lingo-dic.txt +0 -0
  26. data/{ru → dict/ru}/lingo-mul.txt +0 -0
  27. data/{ru → dict/ru}/lingo-syn.txt +0 -0
  28. data/{ru → dict/ru}/user-dic.txt +0 -0
  29. data/{de.lang → lang/de.lang} +1 -1
  30. data/{en.lang → lang/en.lang} +0 -0
  31. data/{ru.lang → lang/ru.lang} +0 -0
  32. data/lib/lingo.rb +14 -15
  33. data/lib/lingo/app.rb +4 -2
  34. data/lib/lingo/attendee.rb +23 -43
  35. data/lib/lingo/attendee/abbreviator.rb +5 -5
  36. data/lib/lingo/attendee/debugger.rb +39 -12
  37. data/lib/lingo/attendee/decomposer.rb +3 -4
  38. data/lib/lingo/attendee/dehyphenizer.rb +4 -4
  39. data/lib/lingo/attendee/formatter.rb +1 -3
  40. data/lib/lingo/attendee/multi_worder.rb +3 -4
  41. data/lib/lingo/attendee/noneword_filter.rb +8 -12
  42. data/lib/lingo/attendee/object_filter.rb +6 -3
  43. data/lib/lingo/attendee/sequencer.rb +5 -5
  44. data/lib/lingo/attendee/stemmer.rb +3 -2
  45. data/lib/lingo/attendee/synonymer.rb +3 -4
  46. data/lib/lingo/attendee/text_reader.rb +39 -38
  47. data/lib/lingo/attendee/text_writer.rb +10 -10
  48. data/lib/lingo/attendee/tokenizer.rb +63 -33
  49. data/lib/lingo/attendee/variator.rb +3 -7
  50. data/lib/lingo/attendee/vector_filter.rb +132 -65
  51. data/lib/lingo/attendee/word_searcher.rb +5 -3
  52. data/lib/lingo/buffered_attendee.rb +1 -3
  53. data/lib/lingo/call.rb +4 -3
  54. data/lib/lingo/cli.rb +5 -1
  55. data/lib/lingo/config.rb +11 -5
  56. data/lib/lingo/ctl.rb +3 -3
  57. data/lib/lingo/database.rb +3 -1
  58. data/lib/lingo/database/crypter.rb +1 -3
  59. data/lib/lingo/database/source.rb +3 -1
  60. data/lib/lingo/database/source/key_value.rb +3 -1
  61. data/lib/lingo/database/source/multi_key.rb +3 -1
  62. data/lib/lingo/database/source/multi_value.rb +3 -1
  63. data/lib/lingo/database/source/single_word.rb +3 -1
  64. data/lib/lingo/database/source/word_class.rb +3 -1
  65. data/lib/lingo/debug.rb +5 -5
  66. data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
  67. data/lib/lingo/error.rb +1 -1
  68. data/lib/lingo/language.rb +1 -9
  69. data/lib/lingo/language/dictionary.rb +2 -17
  70. data/lib/lingo/language/grammar.rb +10 -10
  71. data/lib/lingo/language/lexical.rb +2 -0
  72. data/lib/lingo/language/lexical_hash.rb +2 -0
  73. data/lib/lingo/language/token.rb +17 -3
  74. data/lib/lingo/language/word.rb +13 -5
  75. data/lib/lingo/language/word_form.rb +5 -3
  76. data/lib/lingo/progress.rb +2 -2
  77. data/lib/lingo/srv.rb +1 -1
  78. data/lib/lingo/srv/lingosrv.cfg +1 -1
  79. data/lib/lingo/version.rb +1 -1
  80. data/lib/lingo/web.rb +1 -1
  81. data/lib/lingo/web/lingoweb.cfg +1 -1
  82. data/test/attendee/ts_abbreviator.rb +4 -2
  83. data/test/attendee/ts_multi_worder.rb +81 -88
  84. data/test/attendee/ts_noneword_filter.rb +2 -2
  85. data/test/attendee/ts_object_filter.rb +2 -2
  86. data/test/attendee/ts_sequencer.rb +40 -20
  87. data/test/attendee/ts_stemmer.rb +52 -26
  88. data/test/attendee/ts_text_reader.rb +75 -56
  89. data/test/attendee/ts_text_writer.rb +6 -4
  90. data/test/attendee/ts_tokenizer.rb +304 -193
  91. data/test/attendee/ts_vector_filter.rb +242 -9
  92. data/test/ref/artikel.non +3 -0
  93. data/test/ref/artikel.vec +1 -4
  94. data/test/ref/artikel.vef +940 -0
  95. data/test/ref/artikel.ven +0 -3
  96. data/test/ref/artikel.ver +0 -3
  97. data/test/ref/artikel.vet +2580 -0
  98. data/test/ref/lir.non +34 -31
  99. data/test/ref/lir.seq +14 -15
  100. data/test/ref/lir.vec +37 -37
  101. data/test/ref/lir.vef +329 -0
  102. data/test/ref/lir.ven +329 -0
  103. data/test/ref/lir.ver +329 -0
  104. data/test/ref/lir.vet +329 -0
  105. data/test/test_helper.rb +29 -16
  106. data/test/ts_language.rb +6 -47
  107. metadata +74 -87
  108. data/lingo.rb +0 -29
  109. data/spec/spec_helper.rb +0 -5
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Sequencer ist von seiner Funktion her ähnlich dem Multiworder. Der Multiworder
32
33
  # nutzt zur Erkennung von Mehrwortgruppen spezielle Wörterbücher, der Sequencer hingegen
33
34
  # definierte Folgen von Wortklassen. Mit dem Sequencer können Indexterme generiert werden,
@@ -90,11 +91,10 @@ class Lingo
90
91
  # out> :./PUNC:
91
92
  # out> *EOL('test.txt')
92
93
  # out> *EOF('test.txt')
94
+ #++
93
95
 
94
96
  class Sequencer < BufferedAttendee
95
97
 
96
- protected
97
-
98
98
  def init
99
99
  @stopper = get_array('stopper', DEFAULT_SKIP)
100
100
  .push(WA_UNKNOWN, WA_UNKMULPART)
@@ -114,8 +114,8 @@ class Lingo
114
114
  raise MissingConfigError.new(:sequences) if @seq.empty?
115
115
  end
116
116
 
117
- def control(cmd, param)
118
- process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
117
+ def control(cmd, *)
118
+ process_buffer if [:RECORD, :EOF].include?(cmd)
119
119
  end
120
120
 
121
121
  def process_buffer?
@@ -30,8 +30,6 @@ class Lingo
30
30
 
31
31
  class Stemmer < self
32
32
 
33
- protected
34
-
35
33
  def init
36
34
  extend(Lingo.get_const(get_key('type', 'porter'), self.class))
37
35
 
@@ -39,6 +37,9 @@ class Lingo
39
37
  @all = get_key('mode', '').downcase == 'all'
40
38
  end
41
39
 
40
+ def control(*)
41
+ end
42
+
42
43
  def process(obj)
43
44
  if obj.is_a?(Word) && obj.unknown?
44
45
  stem = stem(Unicode.downcase(obj.form), @all)
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Synonymer untersucht die von anderen Attendees ermittelten Grundformen eines Wortes
32
33
  # und sucht in den angegebenen Wörterbüchern nach Relationen zu anderen Grundformen.
33
34
  # Gefundene Relationen erweitern die Liste des Word-Objektes und werden zur späteren
@@ -67,19 +68,17 @@ class Lingo
67
68
  # out> :./PUNC:
68
69
  # out> *EOL('test.txt')
69
70
  # out> *EOF('test.txt')
71
+ #++
70
72
 
71
73
  class Synonymer < self
72
74
 
73
- protected
74
-
75
75
  def init
76
76
  set_dic
77
77
  @com = !get_key('compound-parts', false)
78
78
  @skip = get_array('skip', WA_UNKNOWN, :upcase)
79
79
  end
80
80
 
81
- def control(cmd, param)
82
- # can control
81
+ def control(*)
83
82
  end
84
83
 
85
84
  def process(obj)
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,6 +37,7 @@ class Lingo
37
37
 
38
38
  class Attendee
39
39
 
40
+ #--
40
41
  # Der TextReader ist eine klassische Datenquelle. Er liest eine oder mehrere Dateien
41
42
  # und gibt sie Zeilenweise in den Ausgabekanal. Der Start bzw. Wechsel einer Datei
42
43
  # wird dabei über den Kommandokanal angekündigt, ebenso wie das Ende.
@@ -102,75 +103,71 @@ class Lingo
102
103
  # out> *RECORD('00002')
103
104
  # out> "020: Nicht-konventionelle Thesaurusrelationen als Orientierungshilfen."
104
105
  # out> *EOF('lir.txt')
106
+ #++
105
107
 
106
108
  class TextReader < self
107
109
 
108
- protected
109
-
110
- # TODO: FILE und LIR-FILE (?)
110
+ # TODO: FILE/LIR-FILE (?)
111
111
  def init
112
112
  get_files
113
113
 
114
- @chomp = get_key('chomp', true)
115
114
  @filter = get_key('filter', false)
116
115
  @progress = get_key('progress', false)
117
116
 
118
- lingo.deprecate('lir-record-pattern', :records, self) if has_key?('lir-record-pattern')
117
+ if has_key?('lir-record-pattern')
118
+ lingo.config.deprecate('lir-record-pattern', :records, self)
119
+ end
119
120
 
120
121
  @lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
121
122
  @cut = get_re('fields', !!@lir, %r{^.+?:\s*})
122
123
  @skip = get_re('skip', nil)
123
124
  end
124
125
 
125
- def control(cmd, param)
126
- if cmd == STR_CMD_TALK
127
- forward(STR_CMD_LIR, '') if @lir
126
+ def control(cmd, *)
127
+ if cmd == :TALK
128
+ command(:LIR) if @lir
128
129
  @files.each { |i| spool(i) }
130
+
131
+ command(:EOT)
132
+ :skip_command
129
133
  end
130
134
  end
131
135
 
132
136
  private
133
137
 
134
- # Gibt eine Datei zeilenweise in den Ausgabekanal
135
138
  def spool(path)
136
- forward(STR_CMD_FILE, path)
139
+ command(:FILE, path)
137
140
 
138
- io = !stdin?(path) ? open_file(name = path) :
139
- string_or_io(lingo.config.stdin.set_encoding(ENC))
141
+ io = !stdin?(path) ? open_file(name = path) : begin
142
+ stdin = lingo.config.stdin.set_encoding(ENC)
143
+ @progress ? StringIO.new(stdin.read) : stdin
144
+ end
140
145
 
141
146
  Progress.new(self, @progress && io.size, name) { |progress|
142
- filter(io, path, progress) { |line, pos|
143
- progress << pos
147
+ pos = 0 unless pos?(io = filter(io, path, progress))
144
148
 
145
- line.chomp! if @chomp
146
- next if line =~ @skip
149
+ io.each { |line|
150
+ progress << offset = pos ? pos += line.bytesize : io.pos
147
151
 
148
- if line =~ @lir
149
- forward(STR_CMD_RECORD, $1 || $&)
150
- else
151
- line.sub!(@cut, '') if @cut
152
- forward(line) unless line.empty?
153
- end
152
+ line =~ @skip ? nil : line =~ @lir ?
153
+ command(:RECORD, $1 || $&) : begin
154
+ line.sub!(@cut, '') if @cut
155
+ forward(line, offset) unless line.empty?
156
+ end
154
157
  }
155
158
  }
156
159
 
157
- forward(STR_CMD_EOF, path)
160
+ command(:EOF, path)
158
161
  end
159
162
 
160
163
  def filter(io, path, progress)
161
- block = @progress ?
162
- lambda { |line| yield line, io.pos } :
163
- lambda { |line| yield line, 0 }
164
-
165
- io = case @filter == true ? file_type(io, path) : @filter.to_s
164
+ case @filter == true ? file_type(io, path) : @filter.to_s
166
165
  when 'pdftotext' then filter_pdftotext(io, path, progress)
167
166
  when /html/i then filter_html(io)
168
167
  when /xml/i then filter_html(io, true)
169
- when /pdf/i then filter_pdf(io, &block); return
168
+ when /pdf/i then filter_pdf(io)
170
169
  else io
171
170
  end
172
-
173
- io.each_line(&block)
174
171
  end
175
172
 
176
173
  def filter_pdftotext(io, path, progress)
@@ -189,7 +186,7 @@ class Lingo
189
186
 
190
187
  def filter_pdf(io)
191
188
  if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
192
- PDF::Reader.new(io).pages.each { |page| yield page.text }
189
+ text_enum(PDF::Reader.new(io).pages)
193
190
  else
194
191
  cancel_filter(:PDF, 'pdf-reader')
195
192
  end
@@ -199,8 +196,7 @@ class Lingo
199
196
  type = xml ? :XML : :HTML
200
197
 
201
198
  if Object.const_defined?(:Nokogiri)
202
- doc = Nokogiri.send(type, io, nil, ENC)
203
- string_or_io(doc.children.map { |x| x.inner_text }.join)
199
+ text_enum(Nokogiri.send(type, io, nil, ENC).children)
204
200
  else
205
201
  cancel_filter(type, :nokogiri)
206
202
  end
@@ -208,7 +204,7 @@ class Lingo
208
204
 
209
205
  def file_type(io, path)
210
206
  if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
211
- type = FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
207
+ type = FileMagic.fm(:mime, simplified: true).io(io, 256)
212
208
  io.rewind
213
209
  type
214
210
  elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
@@ -234,8 +230,9 @@ class Lingo
234
230
  %w[STDIN -].include?(path)
235
231
  end
236
232
 
237
- def string_or_io(io)
238
- @progress ? StringIO.new(io.is_a?(String) ? io : io.read) : io
233
+ def pos?(io)
234
+ io.pos if io.respond_to?(:pos)
235
+ rescue Errno::ESPIPE
239
236
  end
240
237
 
241
238
  def open_file(path)
@@ -257,6 +254,10 @@ class Lingo
257
254
  tempfiles.each(&:unlink)
258
255
  end
259
256
 
257
+ def text_enum(collection)
258
+ Enumerator.new { |y| collection.each { |x| y << x.text } }
259
+ end
260
+
260
261
  def get_files
261
262
  args = [get_key('glob', '*.txt'), get_key('recursive', false)]
262
263
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der TextWriter ermöglicht die Umleitung des Datenstroms in eine Textdatei. Dabei werden
32
33
  # Objekte, die nicht vom Typ String sind in eine sinnvolle Textrepresentation gewandelt.
33
34
  # Der Name der Ausgabedatei wird durch den Namen der Eingabedatei (des Textreaders) bestimmt.
@@ -74,27 +75,26 @@ class Lingo
74
75
  # 0.01923 textdatei
75
76
  # 0.01923 typ
76
77
  # 0.01923 umleitung
78
+ #++
77
79
 
78
80
  class TextWriter < self
79
81
 
80
- protected
81
-
82
82
  def init
83
83
  @ext = get_key('ext', 'txt2')
84
84
  @lir = get_key('lir-format', false)
85
85
 
86
- @sep = @config['sep'] unless @lir
86
+ @sep = get_key('sep', nil) unless @lir
87
87
  @sep &&= @sep.evaluate
88
88
  @sep ||= ' '
89
89
 
90
90
  @no_sep, @no_puts = true, false
91
91
  end
92
92
 
93
- def control(cmd, param)
93
+ def control(cmd, param = nil, *)
94
94
  case cmd
95
- when STR_CMD_LIR
95
+ when :LIR
96
96
  @lir = true unless @lir.nil?
97
- when STR_CMD_FILE
97
+ when :FILE
98
98
  @no_sep = true
99
99
 
100
100
  if stdout?(@ext)
@@ -104,20 +104,20 @@ class Lingo
104
104
  end
105
105
 
106
106
  @lir_rec_no, @lir_rec_buf = '', []
107
- when STR_CMD_RECORD
107
+ when :RECORD
108
108
  if @lir
109
109
  @no_sep = true
110
110
 
111
111
  flush_lir_buffer
112
112
  @lir_rec_no = param
113
113
  end
114
- when STR_CMD_EOL
114
+ when :EOL
115
115
  @no_sep = true
116
116
 
117
117
  unless @lir
118
118
  @file.puts unless @no_puts
119
119
  end
120
- when STR_CMD_EOF
120
+ when :EOF
121
121
  flush_lir_buffer if @lir
122
122
 
123
123
  unless stdout?(@filename)
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Tokenizer zerlegt eine Textzeile in einzelne Token. Dies ist notwendig,
32
33
  # damit nachfolgende Attendees die Textdatei häppchenweise verarbeiten können.
33
34
  #
@@ -77,6 +78,7 @@ class Lingo
77
78
  # out> :./PUNC:
78
79
  # out> *EOL('test.txt')
79
80
  # out> *EOF('test.txt')
81
+ #++
80
82
 
81
83
  class Tokenizer < self
82
84
 
@@ -88,14 +90,14 @@ class Lingo
88
90
  ['SPAC', /^\s+/],
89
91
  ['WIKI', /^=+.+=+|^__[A-Z]+__/],
90
92
  ['NUMS', /^[+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)/],
91
- ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)\S+/],
93
+ ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)[^\s<>]+/],
92
94
  ['ABRV', /^(?:(?:(?:#{CHAR})+\.)+)(?:#{CHAR})+/],
93
95
  ['WORD', /^(?:#{CHAR}|#{DIGIT}|-)+/],
94
96
  ['PUNC', /^[!,.:;?¡¿]+/]
95
97
  ]
96
98
 
97
99
  OTHER = [
98
- ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]/],
100
+ ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷„“–]/],
99
101
  ['HELP', /^\S+/]
100
102
  ]
101
103
 
@@ -113,8 +115,8 @@ class Lingo
113
115
  RULES.assoc(name)
114
116
  end
115
117
 
116
- def rules(name)
117
- RULES.select { |rule,| rule == name }
118
+ def rules(name = nil)
119
+ name ? RULES.select { |rule,| rule == name } : RULES.map(&:first)
118
120
  end
119
121
 
120
122
  def delete(*names)
@@ -155,13 +157,14 @@ class Lingo
155
157
 
156
158
  end
157
159
 
158
- protected
159
-
160
160
  def init
161
161
  @space = get_key('space', false)
162
162
  @tags = get_key('tags', false)
163
163
  @wiki = get_key('wiki', false)
164
164
 
165
+ @skip_tags = get_array('skip-tags', '', :downcase)
166
+ @tags = true unless @skip_tags.empty?
167
+
165
168
  skip = []
166
169
  skip << 'HTML' unless @tags
167
170
  skip << 'WIKI' unless @wiki
@@ -170,7 +173,7 @@ class Lingo
170
173
  hash.delete_if { |name, _| skip.include?(Token.clean(name)) }
171
174
  }
172
175
 
173
- @nest, nest_re = [], []
176
+ @override, @nest, nest_re = [], [], []
174
177
 
175
178
  @nests.each { |name, re|
176
179
  re.map!.with_index { |r, i| r.is_a?(Regexp) ?
@@ -182,30 +185,30 @@ class Lingo
182
185
 
183
186
  @nest_re = /^(?<_>.*?)(?:#{nest_re.join('|')})/
184
187
 
185
- @filename = @linenum = nil
188
+ reset
186
189
  end
187
190
 
188
- def control(cmd, param)
191
+ def control(cmd, filename = nil, *)
189
192
  case cmd
190
- when STR_CMD_FILE then @filename, @linenum = param, 1
191
- when STR_CMD_LIR then @filename, @linenum = nil, nil
192
- when STR_CMD_EOL then @linenum += 1 if @linenum
193
- when STR_CMD_EOF then @nest.clear
193
+ when :FILE then reset(filename)
194
+ when :LIR then reset(nil, nil)
195
+ when :EOL then @linenum += 1 if @linenum
196
+ when :EOF then @override.clear; @nest.clear
194
197
  end
195
198
  end
196
199
 
197
- def process(obj)
198
- if obj.is_a?(String)
199
- tokenize(obj)
200
- forward(STR_CMD_EOL, @filename) if @filename
201
- else
202
- forward(obj)
203
- end
200
+ def process(line, offset)
201
+ @offset = offset
202
+ tokenize(line)
203
+ command(:EOL, @filename) if @filename
204
204
  end
205
205
 
206
206
  private
207
207
 
208
- # tokenize("Eine Zeile.") -> [:Eine/WORD:, :Zeile/WORD:, :./PUNC:]
208
+ def reset(filename = nil, linenum = 1)
209
+ @filename, @linenum, @position, @offset = filename, linenum, -1, 0
210
+ end
211
+
209
212
  def tokenize(line)
210
213
  @nest.empty? ? tokenize_line(line) : tokenize_nest(line)
211
214
  rescue => err
@@ -225,8 +228,11 @@ class Lingo
225
228
  def tokenize_rule(line, rules = @rules)
226
229
  rules.find { |name, expr|
227
230
  next unless line =~ expr
228
- forward_token($&, name) if name != 'SPAC' || @space
229
- yield $'
231
+
232
+ rest = $'
233
+ forward_token($&, name, rest) if name != 'SPAC' || @space
234
+
235
+ yield rest
230
236
  }
231
237
  end
232
238
 
@@ -235,13 +241,26 @@ class Lingo
235
241
  mdc = @nests[@nest.last].last.match(line)
236
242
 
237
243
  if mdo && (!mdc || mdo[0].length < mdc[0].length)
238
- forward_token(mdo[:_], @nest.last) unless mdo[:_].empty?
239
-
244
+ rest = mdo.post_match
240
245
  nest = @nests.keys.find { |name| mdo[name] }
241
- forward_nest(mdo[nest], mdo.post_match, nest)
246
+ text = mdo[nest]
247
+ lead = mdo[:_]
248
+
249
+ forward_token(lead, @nest.last, text + rest) unless lead.empty?
250
+
251
+ forward_nest(text, nest, rest)
242
252
  elsif mdc
243
- forward_token(mdc[0], @nest.pop)
244
- tokenize(mdc.post_match)
253
+ rest = mdc.post_match
254
+ nest = @nest.pop
255
+ text = mdc[0]
256
+
257
+ forward_token(text, nest, rest)
258
+
259
+ if overriding?(nest)
260
+ @override.pop if text.downcase.end_with?("/#{@override.last}>")
261
+ end
262
+
263
+ tokenize(rest)
245
264
  else
246
265
  forward_token(line, @nest.last)
247
266
  end
@@ -250,21 +269,32 @@ class Lingo
250
269
  def tokenize_open(line)
251
270
  @nests.each { |nest, (open_re, _)|
252
271
  next unless line =~ open_re
253
- return forward_nest($&, $', nest)
272
+ return forward_nest($&, nest, $')
254
273
  }
255
274
 
256
275
  tokenize_rule(line, OTHER) { |rest| line = rest }
257
276
  tokenize(line)
258
277
  end
259
278
 
260
- def forward_nest(match, rest, nest)
261
- forward_token(match, nest)
279
+ def forward_nest(match, nest, rest)
280
+ if overriding?(nest)
281
+ tag = rest[/^[^\s>]*/].downcase
282
+ @override << tag if @skip_tags.include?(tag)
283
+ end
284
+
285
+ forward_token(match, nest, rest)
286
+
262
287
  @nest << nest
263
288
  tokenize(rest)
264
289
  end
265
290
 
266
- def forward_token(*args)
267
- forward(Token.new(*args))
291
+ def forward_token(form, attr, rest = '')
292
+ forward(Token.new(form, @override.empty? ? attr : 'SKIP',
293
+ @position += 1, @offset - form.bytesize - rest.bytesize))
294
+ end
295
+
296
+ def overriding?(nest)
297
+ nest == 'HTML' && !@skip_tags.empty?
268
298
  end
269
299
 
270
300
  end