lingo 1.8.5 → 1.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +25 -0
  3. data/README +7 -5
  4. data/Rakefile +58 -55
  5. data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
  6. data/{lingo.cfg → config/lingo.cfg} +10 -2
  7. data/{lir.cfg → config/lir.cfg} +10 -2
  8. data/{de → dict/de}/lingo-abk.txt +0 -0
  9. data/{de → dict/de}/lingo-dic.txt +0 -0
  10. data/{de → dict/de}/lingo-mul.txt +0 -0
  11. data/{de → dict/de}/lingo-syn.txt +0 -0
  12. data/{de → dict/de}/test_dic.txt +0 -0
  13. data/{de → dict/de}/test_gen.txt +0 -0
  14. data/{de → dict/de}/test_mu2.txt +0 -0
  15. data/{de → dict/de}/test_mul.txt +0 -0
  16. data/{de → dict/de}/test_sgw.txt +0 -0
  17. data/{de → dict/de}/test_syn.txt +0 -0
  18. data/{de → dict/de}/user-dic.txt +0 -0
  19. data/{en → dict/en}/lingo-dic.txt +0 -0
  20. data/{en → dict/en}/lingo-irr.txt +0 -0
  21. data/{en → dict/en}/lingo-mul.txt +0 -0
  22. data/{en → dict/en}/lingo-syn.txt +0 -0
  23. data/{en → dict/en}/lingo-wdn.txt +0 -0
  24. data/{en → dict/en}/user-dic.txt +0 -0
  25. data/{ru → dict/ru}/lingo-dic.txt +0 -0
  26. data/{ru → dict/ru}/lingo-mul.txt +0 -0
  27. data/{ru → dict/ru}/lingo-syn.txt +0 -0
  28. data/{ru → dict/ru}/user-dic.txt +0 -0
  29. data/{de.lang → lang/de.lang} +1 -1
  30. data/{en.lang → lang/en.lang} +0 -0
  31. data/{ru.lang → lang/ru.lang} +0 -0
  32. data/lib/lingo.rb +14 -15
  33. data/lib/lingo/app.rb +4 -2
  34. data/lib/lingo/attendee.rb +23 -43
  35. data/lib/lingo/attendee/abbreviator.rb +5 -5
  36. data/lib/lingo/attendee/debugger.rb +39 -12
  37. data/lib/lingo/attendee/decomposer.rb +3 -4
  38. data/lib/lingo/attendee/dehyphenizer.rb +4 -4
  39. data/lib/lingo/attendee/formatter.rb +1 -3
  40. data/lib/lingo/attendee/multi_worder.rb +3 -4
  41. data/lib/lingo/attendee/noneword_filter.rb +8 -12
  42. data/lib/lingo/attendee/object_filter.rb +6 -3
  43. data/lib/lingo/attendee/sequencer.rb +5 -5
  44. data/lib/lingo/attendee/stemmer.rb +3 -2
  45. data/lib/lingo/attendee/synonymer.rb +3 -4
  46. data/lib/lingo/attendee/text_reader.rb +39 -38
  47. data/lib/lingo/attendee/text_writer.rb +10 -10
  48. data/lib/lingo/attendee/tokenizer.rb +63 -33
  49. data/lib/lingo/attendee/variator.rb +3 -7
  50. data/lib/lingo/attendee/vector_filter.rb +132 -65
  51. data/lib/lingo/attendee/word_searcher.rb +5 -3
  52. data/lib/lingo/buffered_attendee.rb +1 -3
  53. data/lib/lingo/call.rb +4 -3
  54. data/lib/lingo/cli.rb +5 -1
  55. data/lib/lingo/config.rb +11 -5
  56. data/lib/lingo/ctl.rb +3 -3
  57. data/lib/lingo/database.rb +3 -1
  58. data/lib/lingo/database/crypter.rb +1 -3
  59. data/lib/lingo/database/source.rb +3 -1
  60. data/lib/lingo/database/source/key_value.rb +3 -1
  61. data/lib/lingo/database/source/multi_key.rb +3 -1
  62. data/lib/lingo/database/source/multi_value.rb +3 -1
  63. data/lib/lingo/database/source/single_word.rb +3 -1
  64. data/lib/lingo/database/source/word_class.rb +3 -1
  65. data/lib/lingo/debug.rb +5 -5
  66. data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
  67. data/lib/lingo/error.rb +1 -1
  68. data/lib/lingo/language.rb +1 -9
  69. data/lib/lingo/language/dictionary.rb +2 -17
  70. data/lib/lingo/language/grammar.rb +10 -10
  71. data/lib/lingo/language/lexical.rb +2 -0
  72. data/lib/lingo/language/lexical_hash.rb +2 -0
  73. data/lib/lingo/language/token.rb +17 -3
  74. data/lib/lingo/language/word.rb +13 -5
  75. data/lib/lingo/language/word_form.rb +5 -3
  76. data/lib/lingo/progress.rb +2 -2
  77. data/lib/lingo/srv.rb +1 -1
  78. data/lib/lingo/srv/lingosrv.cfg +1 -1
  79. data/lib/lingo/version.rb +1 -1
  80. data/lib/lingo/web.rb +1 -1
  81. data/lib/lingo/web/lingoweb.cfg +1 -1
  82. data/test/attendee/ts_abbreviator.rb +4 -2
  83. data/test/attendee/ts_multi_worder.rb +81 -88
  84. data/test/attendee/ts_noneword_filter.rb +2 -2
  85. data/test/attendee/ts_object_filter.rb +2 -2
  86. data/test/attendee/ts_sequencer.rb +40 -20
  87. data/test/attendee/ts_stemmer.rb +52 -26
  88. data/test/attendee/ts_text_reader.rb +75 -56
  89. data/test/attendee/ts_text_writer.rb +6 -4
  90. data/test/attendee/ts_tokenizer.rb +304 -193
  91. data/test/attendee/ts_vector_filter.rb +242 -9
  92. data/test/ref/artikel.non +3 -0
  93. data/test/ref/artikel.vec +1 -4
  94. data/test/ref/artikel.vef +940 -0
  95. data/test/ref/artikel.ven +0 -3
  96. data/test/ref/artikel.ver +0 -3
  97. data/test/ref/artikel.vet +2580 -0
  98. data/test/ref/lir.non +34 -31
  99. data/test/ref/lir.seq +14 -15
  100. data/test/ref/lir.vec +37 -37
  101. data/test/ref/lir.vef +329 -0
  102. data/test/ref/lir.ven +329 -0
  103. data/test/ref/lir.ver +329 -0
  104. data/test/ref/lir.vet +329 -0
  105. data/test/test_helper.rb +29 -16
  106. data/test/ts_language.rb +6 -47
  107. metadata +74 -87
  108. data/lingo.rb +0 -29
  109. data/spec/spec_helper.rb +0 -5
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Sequencer ist von seiner Funktion her ähnlich dem Multiworder. Der Multiworder
32
33
  # nutzt zur Erkennung von Mehrwortgruppen spezielle Wörterbücher, der Sequencer hingegen
33
34
  # definierte Folgen von Wortklassen. Mit dem Sequencer können Indexterme generiert werden,
@@ -90,11 +91,10 @@ class Lingo
90
91
  # out> :./PUNC:
91
92
  # out> *EOL('test.txt')
92
93
  # out> *EOF('test.txt')
94
+ #++
93
95
 
94
96
  class Sequencer < BufferedAttendee
95
97
 
96
- protected
97
-
98
98
  def init
99
99
  @stopper = get_array('stopper', DEFAULT_SKIP)
100
100
  .push(WA_UNKNOWN, WA_UNKMULPART)
@@ -114,8 +114,8 @@ class Lingo
114
114
  raise MissingConfigError.new(:sequences) if @seq.empty?
115
115
  end
116
116
 
117
- def control(cmd, param)
118
- process_buffer if [STR_CMD_RECORD, STR_CMD_EOF].include?(cmd)
117
+ def control(cmd, *)
118
+ process_buffer if [:RECORD, :EOF].include?(cmd)
119
119
  end
120
120
 
121
121
  def process_buffer?
@@ -30,8 +30,6 @@ class Lingo
30
30
 
31
31
  class Stemmer < self
32
32
 
33
- protected
34
-
35
33
  def init
36
34
  extend(Lingo.get_const(get_key('type', 'porter'), self.class))
37
35
 
@@ -39,6 +37,9 @@ class Lingo
39
37
  @all = get_key('mode', '').downcase == 'all'
40
38
  end
41
39
 
40
+ def control(*)
41
+ end
42
+
42
43
  def process(obj)
43
44
  if obj.is_a?(Word) && obj.unknown?
44
45
  stem = stem(Unicode.downcase(obj.form), @all)
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Synonymer untersucht die von anderen Attendees ermittelten Grundformen eines Wortes
32
33
  # und sucht in den angegebenen Wörterbüchern nach Relationen zu anderen Grundformen.
33
34
  # Gefundene Relationen erweitern die Liste des Word-Objektes und werden zur späteren
@@ -67,19 +68,17 @@ class Lingo
67
68
  # out> :./PUNC:
68
69
  # out> *EOL('test.txt')
69
70
  # out> *EOF('test.txt')
71
+ #++
70
72
 
71
73
  class Synonymer < self
72
74
 
73
- protected
74
-
75
75
  def init
76
76
  set_dic
77
77
  @com = !get_key('compound-parts', false)
78
78
  @skip = get_array('skip', WA_UNKNOWN, :upcase)
79
79
  end
80
80
 
81
- def control(cmd, param)
82
- # can control
81
+ def control(*)
83
82
  end
84
83
 
85
84
  def process(obj)
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2013 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -37,6 +37,7 @@ class Lingo
37
37
 
38
38
  class Attendee
39
39
 
40
+ #--
40
41
  # Der TextReader ist eine klassische Datenquelle. Er liest eine oder mehrere Dateien
41
42
  # und gibt sie Zeilenweise in den Ausgabekanal. Der Start bzw. Wechsel einer Datei
42
43
  # wird dabei über den Kommandokanal angekündigt, ebenso wie das Ende.
@@ -102,75 +103,71 @@ class Lingo
102
103
  # out> *RECORD('00002')
103
104
  # out> "020: Nicht-konventionelle Thesaurusrelationen als Orientierungshilfen."
104
105
  # out> *EOF('lir.txt')
106
+ #++
105
107
 
106
108
  class TextReader < self
107
109
 
108
- protected
109
-
110
- # TODO: FILE und LIR-FILE (?)
110
+ # TODO: FILE/LIR-FILE (?)
111
111
  def init
112
112
  get_files
113
113
 
114
- @chomp = get_key('chomp', true)
115
114
  @filter = get_key('filter', false)
116
115
  @progress = get_key('progress', false)
117
116
 
118
- lingo.deprecate('lir-record-pattern', :records, self) if has_key?('lir-record-pattern')
117
+ if has_key?('lir-record-pattern')
118
+ lingo.config.deprecate('lir-record-pattern', :records, self)
119
+ end
119
120
 
120
121
  @lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
121
122
  @cut = get_re('fields', !!@lir, %r{^.+?:\s*})
122
123
  @skip = get_re('skip', nil)
123
124
  end
124
125
 
125
- def control(cmd, param)
126
- if cmd == STR_CMD_TALK
127
- forward(STR_CMD_LIR, '') if @lir
126
+ def control(cmd, *)
127
+ if cmd == :TALK
128
+ command(:LIR) if @lir
128
129
  @files.each { |i| spool(i) }
130
+
131
+ command(:EOT)
132
+ :skip_command
129
133
  end
130
134
  end
131
135
 
132
136
  private
133
137
 
134
- # Gibt eine Datei zeilenweise in den Ausgabekanal
135
138
  def spool(path)
136
- forward(STR_CMD_FILE, path)
139
+ command(:FILE, path)
137
140
 
138
- io = !stdin?(path) ? open_file(name = path) :
139
- string_or_io(lingo.config.stdin.set_encoding(ENC))
141
+ io = !stdin?(path) ? open_file(name = path) : begin
142
+ stdin = lingo.config.stdin.set_encoding(ENC)
143
+ @progress ? StringIO.new(stdin.read) : stdin
144
+ end
140
145
 
141
146
  Progress.new(self, @progress && io.size, name) { |progress|
142
- filter(io, path, progress) { |line, pos|
143
- progress << pos
147
+ pos = 0 unless pos?(io = filter(io, path, progress))
144
148
 
145
- line.chomp! if @chomp
146
- next if line =~ @skip
149
+ io.each { |line|
150
+ progress << offset = pos ? pos += line.bytesize : io.pos
147
151
 
148
- if line =~ @lir
149
- forward(STR_CMD_RECORD, $1 || $&)
150
- else
151
- line.sub!(@cut, '') if @cut
152
- forward(line) unless line.empty?
153
- end
152
+ line =~ @skip ? nil : line =~ @lir ?
153
+ command(:RECORD, $1 || $&) : begin
154
+ line.sub!(@cut, '') if @cut
155
+ forward(line, offset) unless line.empty?
156
+ end
154
157
  }
155
158
  }
156
159
 
157
- forward(STR_CMD_EOF, path)
160
+ command(:EOF, path)
158
161
  end
159
162
 
160
163
  def filter(io, path, progress)
161
- block = @progress ?
162
- lambda { |line| yield line, io.pos } :
163
- lambda { |line| yield line, 0 }
164
-
165
- io = case @filter == true ? file_type(io, path) : @filter.to_s
164
+ case @filter == true ? file_type(io, path) : @filter.to_s
166
165
  when 'pdftotext' then filter_pdftotext(io, path, progress)
167
166
  when /html/i then filter_html(io)
168
167
  when /xml/i then filter_html(io, true)
169
- when /pdf/i then filter_pdf(io, &block); return
168
+ when /pdf/i then filter_pdf(io)
170
169
  else io
171
170
  end
172
-
173
- io.each_line(&block)
174
171
  end
175
172
 
176
173
  def filter_pdftotext(io, path, progress)
@@ -189,7 +186,7 @@ class Lingo
189
186
 
190
187
  def filter_pdf(io)
191
188
  if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
192
- PDF::Reader.new(io).pages.each { |page| yield page.text }
189
+ text_enum(PDF::Reader.new(io).pages)
193
190
  else
194
191
  cancel_filter(:PDF, 'pdf-reader')
195
192
  end
@@ -199,8 +196,7 @@ class Lingo
199
196
  type = xml ? :XML : :HTML
200
197
 
201
198
  if Object.const_defined?(:Nokogiri)
202
- doc = Nokogiri.send(type, io, nil, ENC)
203
- string_or_io(doc.children.map { |x| x.inner_text }.join)
199
+ text_enum(Nokogiri.send(type, io, nil, ENC).children)
204
200
  else
205
201
  cancel_filter(type, :nokogiri)
206
202
  end
@@ -208,7 +204,7 @@ class Lingo
208
204
 
209
205
  def file_type(io, path)
210
206
  if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
211
- type = FileMagic.fm(:mime, simplified: true).buffer(io.read(256))
207
+ type = FileMagic.fm(:mime, simplified: true).io(io, 256)
212
208
  io.rewind
213
209
  type
214
210
  elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
@@ -234,8 +230,9 @@ class Lingo
234
230
  %w[STDIN -].include?(path)
235
231
  end
236
232
 
237
- def string_or_io(io)
238
- @progress ? StringIO.new(io.is_a?(String) ? io : io.read) : io
233
+ def pos?(io)
234
+ io.pos if io.respond_to?(:pos)
235
+ rescue Errno::ESPIPE
239
236
  end
240
237
 
241
238
  def open_file(path)
@@ -257,6 +254,10 @@ class Lingo
257
254
  tempfiles.each(&:unlink)
258
255
  end
259
256
 
257
+ def text_enum(collection)
258
+ Enumerator.new { |y| collection.each { |x| y << x.text } }
259
+ end
260
+
260
261
  def get_files
261
262
  args = [get_key('glob', '*.txt'), get_key('recursive', false)]
262
263
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2012 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der TextWriter ermöglicht die Umleitung des Datenstroms in eine Textdatei. Dabei werden
32
33
  # Objekte, die nicht vom Typ String sind in eine sinnvolle Textrepresentation gewandelt.
33
34
  # Der Name der Ausgabedatei wird durch den Namen der Eingabedatei (des Textreaders) bestimmt.
@@ -74,27 +75,26 @@ class Lingo
74
75
  # 0.01923 textdatei
75
76
  # 0.01923 typ
76
77
  # 0.01923 umleitung
78
+ #++
77
79
 
78
80
  class TextWriter < self
79
81
 
80
- protected
81
-
82
82
  def init
83
83
  @ext = get_key('ext', 'txt2')
84
84
  @lir = get_key('lir-format', false)
85
85
 
86
- @sep = @config['sep'] unless @lir
86
+ @sep = get_key('sep', nil) unless @lir
87
87
  @sep &&= @sep.evaluate
88
88
  @sep ||= ' '
89
89
 
90
90
  @no_sep, @no_puts = true, false
91
91
  end
92
92
 
93
- def control(cmd, param)
93
+ def control(cmd, param = nil, *)
94
94
  case cmd
95
- when STR_CMD_LIR
95
+ when :LIR
96
96
  @lir = true unless @lir.nil?
97
- when STR_CMD_FILE
97
+ when :FILE
98
98
  @no_sep = true
99
99
 
100
100
  if stdout?(@ext)
@@ -104,20 +104,20 @@ class Lingo
104
104
  end
105
105
 
106
106
  @lir_rec_no, @lir_rec_buf = '', []
107
- when STR_CMD_RECORD
107
+ when :RECORD
108
108
  if @lir
109
109
  @no_sep = true
110
110
 
111
111
  flush_lir_buffer
112
112
  @lir_rec_no = param
113
113
  end
114
- when STR_CMD_EOL
114
+ when :EOL
115
115
  @no_sep = true
116
116
 
117
117
  unless @lir
118
118
  @file.puts unless @no_puts
119
119
  end
120
- when STR_CMD_EOF
120
+ when :EOF
121
121
  flush_lir_buffer if @lir
122
122
 
123
123
  unless stdout?(@filename)
@@ -28,6 +28,7 @@ class Lingo
28
28
 
29
29
  class Attendee
30
30
 
31
+ #--
31
32
  # Der Tokenizer zerlegt eine Textzeile in einzelne Token. Dies ist notwendig,
32
33
  # damit nachfolgende Attendees die Textdatei häppchenweise verarbeiten können.
33
34
  #
@@ -77,6 +78,7 @@ class Lingo
77
78
  # out> :./PUNC:
78
79
  # out> *EOL('test.txt')
79
80
  # out> *EOF('test.txt')
81
+ #++
80
82
 
81
83
  class Tokenizer < self
82
84
 
@@ -88,14 +90,14 @@ class Lingo
88
90
  ['SPAC', /^\s+/],
89
91
  ['WIKI', /^=+.+=+|^__[A-Z]+__/],
90
92
  ['NUMS', /^[+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)/],
91
- ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)\S+/],
93
+ ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)[^\s<>]+/],
92
94
  ['ABRV', /^(?:(?:(?:#{CHAR})+\.)+)(?:#{CHAR})+/],
93
95
  ['WORD', /^(?:#{CHAR}|#{DIGIT}|-)+/],
94
96
  ['PUNC', /^[!,.:;?¡¿]+/]
95
97
  ]
96
98
 
97
99
  OTHER = [
98
- ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷]/],
100
+ ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷„“–]/],
99
101
  ['HELP', /^\S+/]
100
102
  ]
101
103
 
@@ -113,8 +115,8 @@ class Lingo
113
115
  RULES.assoc(name)
114
116
  end
115
117
 
116
- def rules(name)
117
- RULES.select { |rule,| rule == name }
118
+ def rules(name = nil)
119
+ name ? RULES.select { |rule,| rule == name } : RULES.map(&:first)
118
120
  end
119
121
 
120
122
  def delete(*names)
@@ -155,13 +157,14 @@ class Lingo
155
157
 
156
158
  end
157
159
 
158
- protected
159
-
160
160
  def init
161
161
  @space = get_key('space', false)
162
162
  @tags = get_key('tags', false)
163
163
  @wiki = get_key('wiki', false)
164
164
 
165
+ @skip_tags = get_array('skip-tags', '', :downcase)
166
+ @tags = true unless @skip_tags.empty?
167
+
165
168
  skip = []
166
169
  skip << 'HTML' unless @tags
167
170
  skip << 'WIKI' unless @wiki
@@ -170,7 +173,7 @@ class Lingo
170
173
  hash.delete_if { |name, _| skip.include?(Token.clean(name)) }
171
174
  }
172
175
 
173
- @nest, nest_re = [], []
176
+ @override, @nest, nest_re = [], [], []
174
177
 
175
178
  @nests.each { |name, re|
176
179
  re.map!.with_index { |r, i| r.is_a?(Regexp) ?
@@ -182,30 +185,30 @@ class Lingo
182
185
 
183
186
  @nest_re = /^(?<_>.*?)(?:#{nest_re.join('|')})/
184
187
 
185
- @filename = @linenum = nil
188
+ reset
186
189
  end
187
190
 
188
- def control(cmd, param)
191
+ def control(cmd, filename = nil, *)
189
192
  case cmd
190
- when STR_CMD_FILE then @filename, @linenum = param, 1
191
- when STR_CMD_LIR then @filename, @linenum = nil, nil
192
- when STR_CMD_EOL then @linenum += 1 if @linenum
193
- when STR_CMD_EOF then @nest.clear
193
+ when :FILE then reset(filename)
194
+ when :LIR then reset(nil, nil)
195
+ when :EOL then @linenum += 1 if @linenum
196
+ when :EOF then @override.clear; @nest.clear
194
197
  end
195
198
  end
196
199
 
197
- def process(obj)
198
- if obj.is_a?(String)
199
- tokenize(obj)
200
- forward(STR_CMD_EOL, @filename) if @filename
201
- else
202
- forward(obj)
203
- end
200
+ def process(line, offset)
201
+ @offset = offset
202
+ tokenize(line)
203
+ command(:EOL, @filename) if @filename
204
204
  end
205
205
 
206
206
  private
207
207
 
208
- # tokenize("Eine Zeile.") -> [:Eine/WORD:, :Zeile/WORD:, :./PUNC:]
208
+ def reset(filename = nil, linenum = 1)
209
+ @filename, @linenum, @position, @offset = filename, linenum, -1, 0
210
+ end
211
+
209
212
  def tokenize(line)
210
213
  @nest.empty? ? tokenize_line(line) : tokenize_nest(line)
211
214
  rescue => err
@@ -225,8 +228,11 @@ class Lingo
225
228
  def tokenize_rule(line, rules = @rules)
226
229
  rules.find { |name, expr|
227
230
  next unless line =~ expr
228
- forward_token($&, name) if name != 'SPAC' || @space
229
- yield $'
231
+
232
+ rest = $'
233
+ forward_token($&, name, rest) if name != 'SPAC' || @space
234
+
235
+ yield rest
230
236
  }
231
237
  end
232
238
 
@@ -235,13 +241,26 @@ class Lingo
235
241
  mdc = @nests[@nest.last].last.match(line)
236
242
 
237
243
  if mdo && (!mdc || mdo[0].length < mdc[0].length)
238
- forward_token(mdo[:_], @nest.last) unless mdo[:_].empty?
239
-
244
+ rest = mdo.post_match
240
245
  nest = @nests.keys.find { |name| mdo[name] }
241
- forward_nest(mdo[nest], mdo.post_match, nest)
246
+ text = mdo[nest]
247
+ lead = mdo[:_]
248
+
249
+ forward_token(lead, @nest.last, text + rest) unless lead.empty?
250
+
251
+ forward_nest(text, nest, rest)
242
252
  elsif mdc
243
- forward_token(mdc[0], @nest.pop)
244
- tokenize(mdc.post_match)
253
+ rest = mdc.post_match
254
+ nest = @nest.pop
255
+ text = mdc[0]
256
+
257
+ forward_token(text, nest, rest)
258
+
259
+ if overriding?(nest)
260
+ @override.pop if text.downcase.end_with?("/#{@override.last}>")
261
+ end
262
+
263
+ tokenize(rest)
245
264
  else
246
265
  forward_token(line, @nest.last)
247
266
  end
@@ -250,21 +269,32 @@ class Lingo
250
269
  def tokenize_open(line)
251
270
  @nests.each { |nest, (open_re, _)|
252
271
  next unless line =~ open_re
253
- return forward_nest($&, $', nest)
272
+ return forward_nest($&, nest, $')
254
273
  }
255
274
 
256
275
  tokenize_rule(line, OTHER) { |rest| line = rest }
257
276
  tokenize(line)
258
277
  end
259
278
 
260
- def forward_nest(match, rest, nest)
261
- forward_token(match, nest)
279
+ def forward_nest(match, nest, rest)
280
+ if overriding?(nest)
281
+ tag = rest[/^[^\s>]*/].downcase
282
+ @override << tag if @skip_tags.include?(tag)
283
+ end
284
+
285
+ forward_token(match, nest, rest)
286
+
262
287
  @nest << nest
263
288
  tokenize(rest)
264
289
  end
265
290
 
266
- def forward_token(*args)
267
- forward(Token.new(*args))
291
+ def forward_token(form, attr, rest = '')
292
+ forward(Token.new(form, @override.empty? ? attr : 'SKIP',
293
+ @position += 1, @offset - form.bytesize - rest.bytesize))
294
+ end
295
+
296
+ def overriding?(nest)
297
+ nest == 'HTML' && !@skip_tags.empty?
268
298
  end
269
299
 
270
300
  end