lingo 1.8.5 → 1.8.6
- checksums.yaml +4 -4
- data/ChangeLog +25 -0
- data/README +7 -5
- data/Rakefile +58 -55
- data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
- data/{lingo.cfg → config/lingo.cfg} +10 -2
- data/{lir.cfg → config/lir.cfg} +10 -2
- data/{de → dict/de}/lingo-abk.txt +0 -0
- data/{de → dict/de}/lingo-dic.txt +0 -0
- data/{de → dict/de}/lingo-mul.txt +0 -0
- data/{de → dict/de}/lingo-syn.txt +0 -0
- data/{de → dict/de}/test_dic.txt +0 -0
- data/{de → dict/de}/test_gen.txt +0 -0
- data/{de → dict/de}/test_mu2.txt +0 -0
- data/{de → dict/de}/test_mul.txt +0 -0
- data/{de → dict/de}/test_sgw.txt +0 -0
- data/{de → dict/de}/test_syn.txt +0 -0
- data/{de → dict/de}/user-dic.txt +0 -0
- data/{en → dict/en}/lingo-dic.txt +0 -0
- data/{en → dict/en}/lingo-irr.txt +0 -0
- data/{en → dict/en}/lingo-mul.txt +0 -0
- data/{en → dict/en}/lingo-syn.txt +0 -0
- data/{en → dict/en}/lingo-wdn.txt +0 -0
- data/{en → dict/en}/user-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-mul.txt +0 -0
- data/{ru → dict/ru}/lingo-syn.txt +0 -0
- data/{ru → dict/ru}/user-dic.txt +0 -0
- data/{de.lang → lang/de.lang} +1 -1
- data/{en.lang → lang/en.lang} +0 -0
- data/{ru.lang → lang/ru.lang} +0 -0
- data/lib/lingo.rb +14 -15
- data/lib/lingo/app.rb +4 -2
- data/lib/lingo/attendee.rb +23 -43
- data/lib/lingo/attendee/abbreviator.rb +5 -5
- data/lib/lingo/attendee/debugger.rb +39 -12
- data/lib/lingo/attendee/decomposer.rb +3 -4
- data/lib/lingo/attendee/dehyphenizer.rb +4 -4
- data/lib/lingo/attendee/formatter.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +3 -4
- data/lib/lingo/attendee/noneword_filter.rb +8 -12
- data/lib/lingo/attendee/object_filter.rb +6 -3
- data/lib/lingo/attendee/sequencer.rb +5 -5
- data/lib/lingo/attendee/stemmer.rb +3 -2
- data/lib/lingo/attendee/synonymer.rb +3 -4
- data/lib/lingo/attendee/text_reader.rb +39 -38
- data/lib/lingo/attendee/text_writer.rb +10 -10
- data/lib/lingo/attendee/tokenizer.rb +63 -33
- data/lib/lingo/attendee/variator.rb +3 -7
- data/lib/lingo/attendee/vector_filter.rb +132 -65
- data/lib/lingo/attendee/word_searcher.rb +5 -3
- data/lib/lingo/buffered_attendee.rb +1 -3
- data/lib/lingo/call.rb +4 -3
- data/lib/lingo/cli.rb +5 -1
- data/lib/lingo/config.rb +11 -5
- data/lib/lingo/ctl.rb +3 -3
- data/lib/lingo/database.rb +3 -1
- data/lib/lingo/database/crypter.rb +1 -3
- data/lib/lingo/database/source.rb +3 -1
- data/lib/lingo/database/source/key_value.rb +3 -1
- data/lib/lingo/database/source/multi_key.rb +3 -1
- data/lib/lingo/database/source/multi_value.rb +3 -1
- data/lib/lingo/database/source/single_word.rb +3 -1
- data/lib/lingo/database/source/word_class.rb +3 -1
- data/lib/lingo/debug.rb +5 -5
- data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
- data/lib/lingo/error.rb +1 -1
- data/lib/lingo/language.rb +1 -9
- data/lib/lingo/language/dictionary.rb +2 -17
- data/lib/lingo/language/grammar.rb +10 -10
- data/lib/lingo/language/lexical.rb +2 -0
- data/lib/lingo/language/lexical_hash.rb +2 -0
- data/lib/lingo/language/token.rb +17 -3
- data/lib/lingo/language/word.rb +13 -5
- data/lib/lingo/language/word_form.rb +5 -3
- data/lib/lingo/progress.rb +2 -2
- data/lib/lingo/srv.rb +1 -1
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web.rb +1 -1
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/test/attendee/ts_abbreviator.rb +4 -2
- data/test/attendee/ts_multi_worder.rb +81 -88
- data/test/attendee/ts_noneword_filter.rb +2 -2
- data/test/attendee/ts_object_filter.rb +2 -2
- data/test/attendee/ts_sequencer.rb +40 -20
- data/test/attendee/ts_stemmer.rb +52 -26
- data/test/attendee/ts_text_reader.rb +75 -56
- data/test/attendee/ts_text_writer.rb +6 -4
- data/test/attendee/ts_tokenizer.rb +304 -193
- data/test/attendee/ts_vector_filter.rb +242 -9
- data/test/ref/artikel.non +3 -0
- data/test/ref/artikel.vec +1 -4
- data/test/ref/artikel.vef +940 -0
- data/test/ref/artikel.ven +0 -3
- data/test/ref/artikel.ver +0 -3
- data/test/ref/artikel.vet +2580 -0
- data/test/ref/lir.non +34 -31
- data/test/ref/lir.seq +14 -15
- data/test/ref/lir.vec +37 -37
- data/test/ref/lir.vef +329 -0
- data/test/ref/lir.ven +329 -0
- data/test/ref/lir.ver +329 -0
- data/test/ref/lir.vet +329 -0
- data/test/test_helper.rb +29 -16
- data/test/ts_language.rb +6 -47
- metadata +74 -87
- data/lingo.rb +0 -29
- data/spec/spec_helper.rb +0 -5

data/lib/lingo/attendee/sequencer.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
 
 class Attendee
 
+#--
 # Der Sequencer ist von seiner Funktion her ähnlich dem Multiworder. Der Multiworder
 # nutzt zur Erkennung von Mehrwortgruppen spezielle Wörterbücher, der Sequencer hingegen
 # definierte Folgen von Wortklassen. Mit dem Sequencer können Indexterme generiert werden,
@@ -90,11 +91,10 @@ class Lingo
 # out> :./PUNC:
 # out> *EOL('test.txt')
 # out> *EOF('test.txt')
+#++
 
 class Sequencer < BufferedAttendee
 
-protected
-
 def init
 @stopper = get_array('stopper', DEFAULT_SKIP)
 .push(WA_UNKNOWN, WA_UNKMULPART)
@@ -114,8 +114,8 @@
 raise MissingConfigError.new(:sequences) if @seq.empty?
 end
 
-def control(cmd,
-process_buffer if [
+def control(cmd, *)
+process_buffer if [:RECORD, :EOF].include?(cmd)
 end
 
 def process_buffer?
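
Across the attendees, `control(cmd, param)` becomes `control(cmd, *)` and the string command constants give way to plain symbols such as :RECORD and :EOF, so a handler can ignore whatever extra arguments a command carries. A minimal, self-contained sketch of that calling convention (MiniSequencer and its puts-based process_buffer are invented for illustration, not part of Lingo):

    # Hypothetical stand-in for a buffered attendee; only the signature
    # pattern from the diff is real, everything else is simplified.
    class MiniSequencer
      def control(cmd, *)
        # extra arguments (e.g. a filename passed with :EOF) are swallowed by the splat
        process_buffer if [:RECORD, :EOF].include?(cmd)
      end

      def process_buffer
        puts 'flushing buffered sequence candidates'
      end
    end

    seq = MiniSequencer.new
    seq.control(:EOF, 'test.txt')  # flushes the buffer
    seq.control(:EOL, 'test.txt')  # ignored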

data/lib/lingo/attendee/stemmer.rb

@@ -30,8 +30,6 @@ class Lingo
 
 class Stemmer < self
 
-protected
-
 def init
 extend(Lingo.get_const(get_key('type', 'porter'), self.class))
 
@@ -39,6 +37,9 @@ class Lingo
 @all = get_key('mode', '').downcase == 'all'
 end
 
+def control(*)
+end
+
 def process(obj)
 if obj.is_a?(Word) && obj.unknown?
 stem = stem(Unicode.downcase(obj.form), @all)

data/lib/lingo/attendee/synonymer.rb

@@ -28,6 +28,7 @@ class Lingo
 
 class Attendee
 
+#--
 # Der Synonymer untersucht die von anderen Attendees ermittelten Grundformen eines Wortes
 # und sucht in den angegebenen Wörterbüchern nach Relationen zu anderen Grundformen.
 # Gefundene Relationen erweitern die Liste des Word-Objektes und werden zur späteren
@@ -67,19 +68,17 @@ class Lingo
 # out> :./PUNC:
 # out> *EOL('test.txt')
 # out> *EOF('test.txt')
+#++
 
 class Synonymer < self
 
-protected
-
 def init
 set_dic
 @com = !get_key('compound-parts', false)
 @skip = get_array('skip', WA_UNKNOWN, :upcase)
 end
 
-def control(
-# can control
+def control(*)
 end
 
 def process(obj)

data/lib/lingo/attendee/text_reader.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #
@@ -37,6 +37,7 @@ class Lingo
 
 class Attendee
 
+#--
 # Der TextReader ist eine klassische Datenquelle. Er liest eine oder mehrere Dateien
 # und gibt sie Zeilenweise in den Ausgabekanal. Der Start bzw. Wechsel einer Datei
 # wird dabei über den Kommandokanal angekündigt, ebenso wie das Ende.
@@ -102,75 +103,71 @@ class Lingo
 # out> *RECORD('00002')
 # out> "020: Nicht-konventionelle Thesaurusrelationen als Orientierungshilfen."
 # out> *EOF('lir.txt')
+#++
 
 class TextReader < self
 
-
-
-# TODO: FILE und LIR-FILE (?)
+# TODO: FILE/LIR-FILE (?)
 def init
 get_files
 
-@chomp = get_key('chomp', true)
 @filter = get_key('filter', false)
 @progress = get_key('progress', false)
 
-
+if has_key?('lir-record-pattern')
+lingo.config.deprecate('lir-record-pattern', :records, self)
+end
 
 @lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
 @cut = get_re('fields', !!@lir, %r{^.+?:\s*})
 @skip = get_re('skip', nil)
 end
 
-def control(cmd,
-if cmd ==
-
+def control(cmd, *)
+if cmd == :TALK
+command(:LIR) if @lir
 @files.each { |i| spool(i) }
+
+command(:EOT)
+:skip_command
 end
 end
 
 private
 
-# Gibt eine Datei zeilenweise in den Ausgabekanal
 def spool(path)
-
+command(:FILE, path)
 
-io = !stdin?(path) ? open_file(name = path) :
-
+io = !stdin?(path) ? open_file(name = path) : begin
+stdin = lingo.config.stdin.set_encoding(ENC)
+@progress ? StringIO.new(stdin.read) : stdin
+end
 
 Progress.new(self, @progress && io.size, name) { |progress|
-filter(io, path, progress)
-progress << pos
+pos = 0 unless pos?(io = filter(io, path, progress))
 
-
-
+io.each { |line|
+progress << offset = pos ? pos += line.bytesize : io.pos
 
-
-
-
-
-
-end
+line =~ @skip ? nil : line =~ @lir ?
+command(:RECORD, $1 || $&) : begin
+line.sub!(@cut, '') if @cut
+forward(line, offset) unless line.empty?
+end
 }
 }
 
-
+command(:EOF, path)
 end
 
 def filter(io, path, progress)
-
-lambda { |line| yield line, io.pos } :
-lambda { |line| yield line, 0 }
-
-io = case @filter == true ? file_type(io, path) : @filter.to_s
+case @filter == true ? file_type(io, path) : @filter.to_s
 when 'pdftotext' then filter_pdftotext(io, path, progress)
 when /html/i then filter_html(io)
 when /xml/i then filter_html(io, true)
-when /pdf/i then filter_pdf(io
+when /pdf/i then filter_pdf(io)
 else io
 end
-
-io.each_line(&block)
 end
 
 def filter_pdftotext(io, path, progress)
@@ -189,7 +186,7 @@ class Lingo
 
 def filter_pdf(io)
 if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
-PDF::Reader.new(io).pages
+text_enum(PDF::Reader.new(io).pages)
 else
 cancel_filter(:PDF, 'pdf-reader')
 end
@@ -199,8 +196,7 @@ class Lingo
 type = xml ? :XML : :HTML
 
 if Object.const_defined?(:Nokogiri)
-
-string_or_io(doc.children.map { |x| x.inner_text }.join)
+text_enum(Nokogiri.send(type, io, nil, ENC).children)
 else
 cancel_filter(type, :nokogiri)
 end
@@ -208,7 +204,7 @@ class Lingo
 
 def file_type(io, path)
 if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
-type = FileMagic.fm(:mime, simplified: true).
+type = FileMagic.fm(:mime, simplified: true).io(io, 256)
 io.rewind
 type
 elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)
@@ -234,8 +230,9 @@ class Lingo
 %w[STDIN -].include?(path)
 end
 
-def
-
+def pos?(io)
+io.pos if io.respond_to?(:pos)
+rescue Errno::ESPIPE
 end
 
 def open_file(path)
@@ -257,6 +254,10 @@ class Lingo
 tempfiles.each(&:unlink)
 end
 
+def text_enum(collection)
+Enumerator.new { |y| collection.each { |x| y << x.text } }
+end
+
 def get_files
 args = [get_key('glob', '*.txt'), get_key('recursive', false)]
 
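
The new `text_enum` helper in the TextReader wraps a collection of PDF pages or Nokogiri nodes in a lazy Enumerator that yields each element's text, so `spool` can iterate the filtered input exactly like the lines of a plain file. A rough sketch of the idea, with an invented Page struct standing in for PDF::Reader pages or Nokogiri nodes:

    # Page is a stand-in for any object that responds to #text.
    Page = Struct.new(:text)

    def text_enum(collection)
      # same shape as the helper added in the diff: yield each element's text lazily
      Enumerator.new { |y| collection.each { |x| y << x.text } }
    end

    pages = [Page.new("first page\n"), Page.new("second page\n")]
    text_enum(pages).each { |chunk| print chunk }
    # prints "first page" and "second page" on separate lines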

data/lib/lingo/attendee/text_writer.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #
@@ -28,6 +28,7 @@ class Lingo
 
 class Attendee
 
+#--
 # Der TextWriter ermöglicht die Umleitung des Datenstroms in eine Textdatei. Dabei werden
 # Objekte, die nicht vom Typ String sind in eine sinnvolle Textrepresentation gewandelt.
 # Der Name der Ausgabedatei wird durch den Namen der Eingabedatei (des Textreaders) bestimmt.
@@ -74,27 +75,26 @@ class Lingo
 # 0.01923 textdatei
 # 0.01923 typ
 # 0.01923 umleitung
+#++
 
 class TextWriter < self
 
-protected
-
 def init
 @ext = get_key('ext', 'txt2')
 @lir = get_key('lir-format', false)
 
-@sep
+@sep = get_key('sep', nil) unless @lir
 @sep &&= @sep.evaluate
 @sep ||= ' '
 
 @no_sep, @no_puts = true, false
 end
 
-def control(cmd, param)
+def control(cmd, param = nil, *)
 case cmd
-when
+when :LIR
 @lir = true unless @lir.nil?
-when
+when :FILE
 @no_sep = true
 
 if stdout?(@ext)
@@ -104,20 +104,20 @@ class Lingo
 end
 
 @lir_rec_no, @lir_rec_buf = '', []
-when
+when :RECORD
 if @lir
 @no_sep = true
 
 flush_lir_buffer
 @lir_rec_no = param
 end
-when
+when :EOL
 @no_sep = true
 
 unless @lir
 @file.puts unless @no_puts
 end
-when
+when :EOF
 flush_lir_buffer if @lir
 
 unless stdout?(@filename)
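
In the TextWriter the command parameter becomes optional (`control(cmd, param = nil, *)`), so payload-carrying commands like :RECORD and bare ones like :EOF go through the same dispatcher. A hedged, much-reduced sketch of that shape (MiniWriter and its Hash-based record buffer are illustrative only, not Lingo's implementation):

    # Hypothetical mini writer: only the optional-param dispatch mirrors the diff.
    class MiniWriter
      def initialize
        @records = Hash.new { |h, k| h[k] = [] }
        @current = nil
      end

      def control(cmd, param = nil, *)
        case cmd
        when :RECORD then @current = param          # command with a payload
        when :EOF    then puts @records.inspect     # command without one
        end
      end

      def process(obj)
        @records[@current] << obj
      end
    end

    w = MiniWriter.new
    w.control(:RECORD, '00001')
    w.process('indexterm')
    w.control(:EOF)  # => {"00001"=>["indexterm"]}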

data/lib/lingo/attendee/tokenizer.rb

@@ -28,6 +28,7 @@ class Lingo
 
 class Attendee
 
+#--
 # Der Tokenizer zerlegt eine Textzeile in einzelne Token. Dies ist notwendig,
 # damit nachfolgende Attendees die Textdatei häppchenweise verarbeiten können.
 #
@@ -77,6 +78,7 @@ class Lingo
 # out> :./PUNC:
 # out> *EOL('test.txt')
 # out> *EOF('test.txt')
+#++
 
 class Tokenizer < self
 
@@ -88,14 +90,14 @@ class Lingo
 ['SPAC', /^\s+/],
 ['WIKI', /^=+.+=+|^__[A-Z]+__/],
 ['NUMS', /^[+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)/],
-['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)
+['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)[^\s<>]+/],
 ['ABRV', /^(?:(?:(?:#{CHAR})+\.)+)(?:#{CHAR})+/],
 ['WORD', /^(?:#{CHAR}|#{DIGIT}|-)+/],
 ['PUNC', /^[!,.:;?¡¿]+/]
 ]
 
 OTHER = [
-['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}
+['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷„“–]/],
 ['HELP', /^\S+/]
 ]
 
@@ -113,8 +115,8 @@ class Lingo
 RULES.assoc(name)
 end
 
-def rules(name)
-RULES.select { |rule,| rule == name }
+def rules(name = nil)
+name ? RULES.select { |rule,| rule == name } : RULES.map(&:first)
 end
 
 def delete(*names)
@@ -155,13 +157,14 @@ class Lingo
 
 end
 
-protected
-
 def init
 @space = get_key('space', false)
 @tags = get_key('tags', false)
 @wiki = get_key('wiki', false)
 
+@skip_tags = get_array('skip-tags', '', :downcase)
+@tags = true unless @skip_tags.empty?
+
 skip = []
 skip << 'HTML' unless @tags
 skip << 'WIKI' unless @wiki
@@ -170,7 +173,7 @@ class Lingo
 hash.delete_if { |name, _| skip.include?(Token.clean(name)) }
 }
 
-@nest, nest_re = [], []
+@override, @nest, nest_re = [], [], []
 
 @nests.each { |name, re|
 re.map!.with_index { |r, i| r.is_a?(Regexp) ?
@@ -182,30 +185,30 @@ class Lingo
 
 @nest_re = /^(?<_>.*?)(?:#{nest_re.join('|')})/
 
-
+reset
 end
 
-def control(cmd,
+def control(cmd, filename = nil, *)
 case cmd
-when
-when
-when
-when
+when :FILE then reset(filename)
+when :LIR then reset(nil, nil)
+when :EOL then @linenum += 1 if @linenum
+when :EOF then @override.clear; @nest.clear
 end
 end
 
-def process(
-
-
-
-else
-forward(obj)
-end
+def process(line, offset)
+@offset = offset
+tokenize(line)
+command(:EOL, @filename) if @filename
 end
 
 private
 
-
+def reset(filename = nil, linenum = 1)
+@filename, @linenum, @position, @offset = filename, linenum, -1, 0
+end
+
 def tokenize(line)
 @nest.empty? ? tokenize_line(line) : tokenize_nest(line)
 rescue => err
@@ -225,8 +228,11 @@ class Lingo
 def tokenize_rule(line, rules = @rules)
 rules.find { |name, expr|
 next unless line =~ expr
-
-
+
+rest = $'
+forward_token($&, name, rest) if name != 'SPAC' || @space
+
+yield rest
 }
 end
 
@@ -235,13 +241,26 @@ class Lingo
 mdc = @nests[@nest.last].last.match(line)
 
 if mdo && (!mdc || mdo[0].length < mdc[0].length)
-
-
+rest = mdo.post_match
 nest = @nests.keys.find { |name| mdo[name] }
-
+text = mdo[nest]
+lead = mdo[:_]
+
+forward_token(lead, @nest.last, text + rest) unless lead.empty?
+
+forward_nest(text, nest, rest)
 elsif mdc
-
-
+rest = mdc.post_match
+nest = @nest.pop
+text = mdc[0]
+
+forward_token(text, nest, rest)
+
+if overriding?(nest)
+@override.pop if text.downcase.end_with?("/#{@override.last}>")
+end
+
+tokenize(rest)
 else
 forward_token(line, @nest.last)
 end
@@ -250,21 +269,32 @@ class Lingo
 def tokenize_open(line)
 @nests.each { |nest, (open_re, _)|
 next unless line =~ open_re
-return forward_nest($&, $'
+return forward_nest($&, nest, $')
 }
 
 tokenize_rule(line, OTHER) { |rest| line = rest }
 tokenize(line)
 end
 
-def forward_nest(match,
-
+def forward_nest(match, nest, rest)
+if overriding?(nest)
+tag = rest[/^[^\s>]*/].downcase
+@override << tag if @skip_tags.include?(tag)
+end
+
+forward_token(match, nest, rest)
+
 @nest << nest
 tokenize(rest)
 end
 
-def forward_token(
-forward(Token.new(
+def forward_token(form, attr, rest = '')
+forward(Token.new(form, @override.empty? ? attr : 'SKIP',
+@position += 1, @offset - form.bytesize - rest.bytesize))
+end
+
+def overriding?(nest)
+nest == 'HTML' && !@skip_tags.empty?
 end
 
 end
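
The rewritten Tokenizer threads a byte offset through `process` and `forward_token`: the offset handed in by the TextReader points just past the current line, so a token's start position is that offset minus the bytes of the token itself and of the still-unconsumed rest. A simplified, self-contained sketch of that arithmetic (the Struct-based Token and MiniTokenizer are illustrative stand-ins, not Lingo's classes):

    # Token here is a plain Struct; Lingo's real Token carries more state.
    Token = Struct.new(:form, :attr, :position, :offset)

    class MiniTokenizer
      RULES = [['WORD', /^\w+/], ['SPAC', /^\s+/], ['HELP', /^\S+/]]

      attr_reader :tokens

      def initialize
        @position, @offset, @tokens = -1, 0, []
      end

      def process(line, offset)
        @offset = offset  # byte position just past the current line
        until line.empty?
          RULES.find { |name, expr|
            next unless line =~ expr
            rest = $'
            forward_token($&, name, rest) unless name == 'SPAC'
            line = rest
          }
        end
      end

      private

      # start of the token = end-of-line offset minus token bytes minus leftover bytes
      def forward_token(form, attr, rest = '')
        @tokens << Token.new(form, attr, @position += 1,
          @offset - form.bytesize - rest.bytesize)
      end
    end

    t = MiniTokenizer.new
    line = "Lingo rocks\n"
    t.process(line, line.bytesize)
    p t.tokens.map { |tok| [tok.form, tok.offset] }  # => [["Lingo", 0], ["rocks", 6]]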