lingo 1.8.5 → 1.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +25 -0
- data/README +7 -5
- data/Rakefile +58 -55
- data/{lingo-call.cfg → config/lingo-call.cfg} +1 -1
- data/{lingo.cfg → config/lingo.cfg} +10 -2
- data/{lir.cfg → config/lir.cfg} +10 -2
- data/{de → dict/de}/lingo-abk.txt +0 -0
- data/{de → dict/de}/lingo-dic.txt +0 -0
- data/{de → dict/de}/lingo-mul.txt +0 -0
- data/{de → dict/de}/lingo-syn.txt +0 -0
- data/{de → dict/de}/test_dic.txt +0 -0
- data/{de → dict/de}/test_gen.txt +0 -0
- data/{de → dict/de}/test_mu2.txt +0 -0
- data/{de → dict/de}/test_mul.txt +0 -0
- data/{de → dict/de}/test_sgw.txt +0 -0
- data/{de → dict/de}/test_syn.txt +0 -0
- data/{de → dict/de}/user-dic.txt +0 -0
- data/{en → dict/en}/lingo-dic.txt +0 -0
- data/{en → dict/en}/lingo-irr.txt +0 -0
- data/{en → dict/en}/lingo-mul.txt +0 -0
- data/{en → dict/en}/lingo-syn.txt +0 -0
- data/{en → dict/en}/lingo-wdn.txt +0 -0
- data/{en → dict/en}/user-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-dic.txt +0 -0
- data/{ru → dict/ru}/lingo-mul.txt +0 -0
- data/{ru → dict/ru}/lingo-syn.txt +0 -0
- data/{ru → dict/ru}/user-dic.txt +0 -0
- data/{de.lang → lang/de.lang} +1 -1
- data/{en.lang → lang/en.lang} +0 -0
- data/{ru.lang → lang/ru.lang} +0 -0
- data/lib/lingo.rb +14 -15
- data/lib/lingo/app.rb +4 -2
- data/lib/lingo/attendee.rb +23 -43
- data/lib/lingo/attendee/abbreviator.rb +5 -5
- data/lib/lingo/attendee/debugger.rb +39 -12
- data/lib/lingo/attendee/decomposer.rb +3 -4
- data/lib/lingo/attendee/dehyphenizer.rb +4 -4
- data/lib/lingo/attendee/formatter.rb +1 -3
- data/lib/lingo/attendee/multi_worder.rb +3 -4
- data/lib/lingo/attendee/noneword_filter.rb +8 -12
- data/lib/lingo/attendee/object_filter.rb +6 -3
- data/lib/lingo/attendee/sequencer.rb +5 -5
- data/lib/lingo/attendee/stemmer.rb +3 -2
- data/lib/lingo/attendee/synonymer.rb +3 -4
- data/lib/lingo/attendee/text_reader.rb +39 -38
- data/lib/lingo/attendee/text_writer.rb +10 -10
- data/lib/lingo/attendee/tokenizer.rb +63 -33
- data/lib/lingo/attendee/variator.rb +3 -7
- data/lib/lingo/attendee/vector_filter.rb +132 -65
- data/lib/lingo/attendee/word_searcher.rb +5 -3
- data/lib/lingo/buffered_attendee.rb +1 -3
- data/lib/lingo/call.rb +4 -3
- data/lib/lingo/cli.rb +5 -1
- data/lib/lingo/config.rb +11 -5
- data/lib/lingo/ctl.rb +3 -3
- data/lib/lingo/database.rb +3 -1
- data/lib/lingo/database/crypter.rb +1 -3
- data/lib/lingo/database/source.rb +3 -1
- data/lib/lingo/database/source/key_value.rb +3 -1
- data/lib/lingo/database/source/multi_key.rb +3 -1
- data/lib/lingo/database/source/multi_value.rb +3 -1
- data/lib/lingo/database/source/single_word.rb +3 -1
- data/lib/lingo/database/source/word_class.rb +3 -1
- data/lib/lingo/debug.rb +5 -5
- data/lib/lingo/{agenda_item.rb → deferred_attendee.rb} +21 -12
- data/lib/lingo/error.rb +1 -1
- data/lib/lingo/language.rb +1 -9
- data/lib/lingo/language/dictionary.rb +2 -17
- data/lib/lingo/language/grammar.rb +10 -10
- data/lib/lingo/language/lexical.rb +2 -0
- data/lib/lingo/language/lexical_hash.rb +2 -0
- data/lib/lingo/language/token.rb +17 -3
- data/lib/lingo/language/word.rb +13 -5
- data/lib/lingo/language/word_form.rb +5 -3
- data/lib/lingo/progress.rb +2 -2
- data/lib/lingo/srv.rb +1 -1
- data/lib/lingo/srv/lingosrv.cfg +1 -1
- data/lib/lingo/version.rb +1 -1
- data/lib/lingo/web.rb +1 -1
- data/lib/lingo/web/lingoweb.cfg +1 -1
- data/test/attendee/ts_abbreviator.rb +4 -2
- data/test/attendee/ts_multi_worder.rb +81 -88
- data/test/attendee/ts_noneword_filter.rb +2 -2
- data/test/attendee/ts_object_filter.rb +2 -2
- data/test/attendee/ts_sequencer.rb +40 -20
- data/test/attendee/ts_stemmer.rb +52 -26
- data/test/attendee/ts_text_reader.rb +75 -56
- data/test/attendee/ts_text_writer.rb +6 -4
- data/test/attendee/ts_tokenizer.rb +304 -193
- data/test/attendee/ts_vector_filter.rb +242 -9
- data/test/ref/artikel.non +3 -0
- data/test/ref/artikel.vec +1 -4
- data/test/ref/artikel.vef +940 -0
- data/test/ref/artikel.ven +0 -3
- data/test/ref/artikel.ver +0 -3
- data/test/ref/artikel.vet +2580 -0
- data/test/ref/lir.non +34 -31
- data/test/ref/lir.seq +14 -15
- data/test/ref/lir.vec +37 -37
- data/test/ref/lir.vef +329 -0
- data/test/ref/lir.ven +329 -0
- data/test/ref/lir.ver +329 -0
- data/test/ref/lir.vet +329 -0
- data/test/test_helper.rb +29 -16
- data/test/ts_language.rb +6 -47
- metadata +74 -87
- data/lingo.rb +0 -29
- data/spec/spec_helper.rb +0 -5
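
Most of the churn in the file listing above is a reorganisation of the bundled language data: dictionaries move from data/&lt;lang&gt;/ to data/dict/&lt;lang&gt;/, the *.lang definitions move to data/lang/, and the sample configurations move to data/config/. As a rough illustration only — the relocate helper below is hypothetical and not part of lingo — a local script that still refers to the 1.8.5 layout could translate its paths like this:

    # Hypothetical helper: maps 1.8.5-style data paths to the 1.8.6 layout
    # implied by the renames listed above. Not part of the lingo API.
    def relocate(path)
      case path
      when %r{\A(?:de|en|ru)/}                   then "dict/#{path}"    # dictionaries
      when %r{\A(?:de|en|ru)\.lang\z}            then "lang/#{path}"    # language definitions
      when %r{\A(?:lingo|lingo-call|lir)\.cfg\z} then "config/#{path}"  # sample configs
      else path
      end
    end

    relocate('de/lingo-dic.txt')  # => "dict/de/lingo-dic.txt"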

data/lib/lingo/attendee/sequencer.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #

@@ -28,6 +28,7 @@ class Lingo

   class Attendee

+    #--
     # Der Sequencer ist von seiner Funktion her ähnlich dem Multiworder. Der Multiworder
     # nutzt zur Erkennung von Mehrwortgruppen spezielle Wörterbücher, der Sequencer hingegen
     # definierte Folgen von Wortklassen. Mit dem Sequencer können Indexterme generiert werden,

@@ -90,11 +91,10 @@ class Lingo
     # out> :./PUNC:
     # out> *EOL('test.txt')
     # out> *EOF('test.txt')
+    #++

     class Sequencer < BufferedAttendee

-      protected
-
       def init
         @stopper = get_array('stopper', DEFAULT_SKIP)
           .push(WA_UNKNOWN, WA_UNKMULPART)

@@ -114,8 +114,8 @@
         raise MissingConfigError.new(:sequences) if @seq.empty?
       end

-      def control(cmd,
-        process_buffer if [
+      def control(cmd, *)
+        process_buffer if [:RECORD, :EOF].include?(cmd)
       end

       def process_buffer?
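
The recurring pattern in the attendee diffs is the new control(cmd, *) signature: pipeline commands now arrive as plain symbols such as :RECORD and :EOF, the splat absorbs any extra payload, and the protected keyword in front of the hook methods is dropped. A standalone sketch of that convention — BufferingStage is a made-up class for illustration, not part of lingo — might look like this:

    # Sketch only: a buffering stage reacting to symbol commands, in the
    # same style as the Sequencer's control(cmd, *) above.
    class BufferingStage
      def initialize
        @buffer = []
      end

      def control(cmd, *)
        flush if [:RECORD, :EOF].include?(cmd)  # same trigger as the Sequencer
      end

      def process(obj)
        @buffer << obj
      end

      private

      def flush
        puts @buffer.join(' ') unless @buffer.empty?
        @buffer.clear
      end
    end

    stage = BufferingStage.new
    %w[quick brown fox].each { |w| stage.process(w) }
    stage.control(:EOF, 'test.txt')  # the extra payload is absorbed by the splat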

data/lib/lingo/attendee/stemmer.rb

@@ -30,8 +30,6 @@

     class Stemmer < self

-      protected
-
       def init
         extend(Lingo.get_const(get_key('type', 'porter'), self.class))

@@ -39,6 +37,9 @@
         @all = get_key('mode', '').downcase == 'all'
       end

+      def control(*)
+      end
+
       def process(obj)
         if obj.is_a?(Word) && obj.unknown?
           stem = stem(Unicode.downcase(obj.form), @all)

data/lib/lingo/attendee/synonymer.rb

@@ -28,6 +28,7 @@ class Lingo

   class Attendee

+    #--
     # Der Synonymer untersucht die von anderen Attendees ermittelten Grundformen eines Wortes
     # und sucht in den angegebenen Wörterbüchern nach Relationen zu anderen Grundformen.
     # Gefundene Relationen erweitern die Liste des Word-Objektes und werden zur späteren

@@ -67,19 +68,17 @@
     # out> :./PUNC:
     # out> *EOL('test.txt')
     # out> *EOF('test.txt')
+    #++

     class Synonymer < self

-      protected
-
       def init
         set_dic
         @com = !get_key('compound-parts', false)
         @skip = get_array('skip', WA_UNKNOWN, :upcase)
       end

-      def control(
-        # can control
+      def control(*)
       end

       def process(obj)

data/lib/lingo/attendee/text_reader.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #

@@ -37,6 +37,7 @@ class Lingo

   class Attendee

+    #--
     # Der TextReader ist eine klassische Datenquelle. Er liest eine oder mehrere Dateien
     # und gibt sie Zeilenweise in den Ausgabekanal. Der Start bzw. Wechsel einer Datei
     # wird dabei über den Kommandokanal angekündigt, ebenso wie das Ende.

@@ -102,75 +103,71 @@
     # out> *RECORD('00002')
     # out> "020: Nicht-konventionelle Thesaurusrelationen als Orientierungshilfen."
     # out> *EOF('lir.txt')
+    #++

     class TextReader < self

-
-
-      # TODO: FILE und LIR-FILE (?)
+      # TODO: FILE/LIR-FILE (?)
       def init
         get_files

-        @chomp = get_key('chomp', true)
         @filter = get_key('filter', false)
         @progress = get_key('progress', false)

-
+        if has_key?('lir-record-pattern')
+          lingo.config.deprecate('lir-record-pattern', :records, self)
+        end

         @lir = get_re('records', get_key('lir-record-pattern', nil), %r{^\[(\d+)\.\]}) # DEPRECATE lir-record-pattern
         @cut = get_re('fields', !!@lir, %r{^.+?:\s*})
         @skip = get_re('skip', nil)
       end

-      def control(cmd,
-        if cmd ==
-
+      def control(cmd, *)
+        if cmd == :TALK
+          command(:LIR) if @lir
           @files.each { |i| spool(i) }
+
+          command(:EOT)
+          :skip_command
         end
       end

       private

-      # Gibt eine Datei zeilenweise in den Ausgabekanal
       def spool(path)
-
+        command(:FILE, path)

-        io = !stdin?(path) ? open_file(name = path) :
-
+        io = !stdin?(path) ? open_file(name = path) : begin
+          stdin = lingo.config.stdin.set_encoding(ENC)
+          @progress ? StringIO.new(stdin.read) : stdin
+        end

         Progress.new(self, @progress && io.size, name) { |progress|
-          filter(io, path, progress)
-          progress << pos
+          pos = 0 unless pos?(io = filter(io, path, progress))

-
-
+          io.each { |line|
+            progress << offset = pos ? pos += line.bytesize : io.pos

-
-
-
-
-
-            end
+            line =~ @skip ? nil : line =~ @lir ?
+              command(:RECORD, $1 || $&) : begin
+                line.sub!(@cut, '') if @cut
+                forward(line, offset) unless line.empty?
+              end
           }
         }

-
+        command(:EOF, path)
       end

       def filter(io, path, progress)
-
-          lambda { |line| yield line, io.pos } :
-          lambda { |line| yield line, 0 }
-
-        io = case @filter == true ? file_type(io, path) : @filter.to_s
+        case @filter == true ? file_type(io, path) : @filter.to_s
        when 'pdftotext' then filter_pdftotext(io, path, progress)
        when /html/i then filter_html(io)
        when /xml/i then filter_html(io, true)
-        when /pdf/i then filter_pdf(io
+        when /pdf/i then filter_pdf(io)
        else io
        end
-
-        io.each_line(&block)
      end

      def filter_pdftotext(io, path, progress)

@@ -189,7 +186,7 @@

       def filter_pdf(io)
         if Object.const_defined?(:PDF) && PDF.const_defined?(:Reader)
-          PDF::Reader.new(io).pages
+          text_enum(PDF::Reader.new(io).pages)
         else
           cancel_filter(:PDF, 'pdf-reader')
         end

@@ -199,8 +196,7 @@
         type = xml ? :XML : :HTML

         if Object.const_defined?(:Nokogiri)
-
-          string_or_io(doc.children.map { |x| x.inner_text }.join)
+          text_enum(Nokogiri.send(type, io, nil, ENC).children)
         else
           cancel_filter(type, :nokogiri)
         end

@@ -208,7 +204,7 @@

       def file_type(io, path)
         if Object.const_defined?(:FileMagic) && io.respond_to?(:rewind)
-          type = FileMagic.fm(:mime, simplified: true).
+          type = FileMagic.fm(:mime, simplified: true).io(io, 256)
           io.rewind
           type
         elsif Object.const_defined?(:MIME) && MIME.const_defined?(:Types)

@@ -234,8 +230,9 @@
         %w[STDIN -].include?(path)
       end

-      def
-
+      def pos?(io)
+        io.pos if io.respond_to?(:pos)
+      rescue Errno::ESPIPE
       end

       def open_file(path)

@@ -257,6 +254,10 @@
         tempfiles.each(&:unlink)
       end

+      def text_enum(collection)
+        Enumerator.new { |y| collection.each { |x| y << x.text } }
+      end
+
       def get_files
         args = [get_key('glob', '*.txt'), get_key('recursive', false)]
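
The spool/filter rewrite above replaces the eagerly joined filter output with lazy enumerables: filter_pdf and filter_html now hand back text_enum(...), an Enumerator that yields one page or node text at a time, and pos? guards the progress bookkeeping against IO objects that cannot report a position (a pipe raises Errno::ESPIPE). A standalone sketch of both ideas — the Page struct and top-level method definitions are illustrative only; the real helpers live on TextReader and wrap pdf-reader/Nokogiri objects:

    # Standalone sketch of the two helpers introduced above.
    require 'stringio'

    # text_enum: lazily yield the text of each element in a collection.
    def text_enum(collection)
      Enumerator.new { |y| collection.each { |x| y << x.text } }
    end

    # pos?: report a position only for IOs that support it; non-seekable
    # streams raise Errno::ESPIPE and simply yield nil.
    def pos?(io)
      io.pos if io.respond_to?(:pos)
    rescue Errno::ESPIPE
    end

    Page = Struct.new(:text)
    pages = [Page.new("first page\n"), Page.new("second page\n")]

    text_enum(pages).each { |t| print t }  # pages are fetched one at a time
    p pos?(StringIO.new('abc'))            # => 0
    p pos?($stdin)                         # nil unless $stdin is seekable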

data/lib/lingo/attendee/text_writer.rb

@@ -6,7 +6,7 @@
 # Lingo -- A full-featured automatic indexing system #
 # #
 # Copyright (C) 2005-2007 John Vorhauer #
-# Copyright (C) 2007-
+# Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
 # #
 # Lingo is free software; you can redistribute it and/or modify it under the #
 # terms of the GNU Affero General Public License as published by the Free #

@@ -28,6 +28,7 @@ class Lingo

   class Attendee

+    #--
     # Der TextWriter ermöglicht die Umleitung des Datenstroms in eine Textdatei. Dabei werden
     # Objekte, die nicht vom Typ String sind in eine sinnvolle Textrepresentation gewandelt.
     # Der Name der Ausgabedatei wird durch den Namen der Eingabedatei (des Textreaders) bestimmt.

@@ -74,27 +75,26 @@
     # 0.01923 textdatei
     # 0.01923 typ
     # 0.01923 umleitung
+    #++

     class TextWriter < self

-      protected
-
       def init
         @ext = get_key('ext', 'txt2')
         @lir = get_key('lir-format', false)

-        @sep
+        @sep = get_key('sep', nil) unless @lir
         @sep &&= @sep.evaluate
         @sep ||= ' '

         @no_sep, @no_puts = true, false
       end

-      def control(cmd, param)
+      def control(cmd, param = nil, *)
         case cmd
-        when
+        when :LIR
           @lir = true unless @lir.nil?
-        when
+        when :FILE
           @no_sep = true

           if stdout?(@ext)

@@ -104,20 +104,20 @@
           end

           @lir_rec_no, @lir_rec_buf = '', []
-        when
+        when :RECORD
           if @lir
             @no_sep = true

             flush_lir_buffer
             @lir_rec_no = param
           end
-        when
+        when :EOL
           @no_sep = true

           unless @lir
             @file.puts unless @no_puts
           end
-        when
+        when :EOF
           flush_lir_buffer if @lir

           unless stdout?(@filename)

data/lib/lingo/attendee/tokenizer.rb

@@ -28,6 +28,7 @@ class Lingo

   class Attendee

+    #--
     # Der Tokenizer zerlegt eine Textzeile in einzelne Token. Dies ist notwendig,
     # damit nachfolgende Attendees die Textdatei häppchenweise verarbeiten können.
     #

@@ -77,6 +78,7 @@
     # out> :./PUNC:
     # out> *EOL('test.txt')
     # out> *EOF('test.txt')
+    #++

     class Tokenizer < self

@@ -88,14 +90,14 @@
         ['SPAC', /^\s+/],
         ['WIKI', /^=+.+=+|^__[A-Z]+__/],
         ['NUMS', /^[+-]?(?:\d{4,}|\d{1,3}(?:\.\d{3,3})*)(?:\.|(?:,\d+)?%?)/],
-        ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)
+        ['URLS', /^(?:www\.|mailto:|#{PROTO}|\S+?[._]\S+?@\S+?\.)[^\s<>]+/],
         ['ABRV', /^(?:(?:(?:#{CHAR})+\.)+)(?:#{CHAR})+/],
         ['WORD', /^(?:#{CHAR}|#{DIGIT}|-)+/],
         ['PUNC', /^[!,.:;?¡¿]+/]
       ]

       OTHER = [
-        ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}
+        ['OTHR', /^["$#%&'()*+\/<=>@\[\\\]^_{|}~¢£¤¥¦§¨©«¬®¯°±²³´¶·¸¹»¼½¾×÷„“–]/],
         ['HELP', /^\S+/]
       ]

@@ -113,8 +115,8 @@
           RULES.assoc(name)
         end

-        def rules(name)
-          RULES.select { |rule,| rule == name }
+        def rules(name = nil)
+          name ? RULES.select { |rule,| rule == name } : RULES.map(&:first)
         end

         def delete(*names)

@@ -155,13 +157,14 @@

       end

-      protected
-
       def init
         @space = get_key('space', false)
         @tags = get_key('tags', false)
         @wiki = get_key('wiki', false)

+        @skip_tags = get_array('skip-tags', '', :downcase)
+        @tags = true unless @skip_tags.empty?
+
         skip = []
         skip << 'HTML' unless @tags
         skip << 'WIKI' unless @wiki

@@ -170,7 +173,7 @@
           hash.delete_if { |name, _| skip.include?(Token.clean(name)) }
         }

-        @nest, nest_re = [], []
+        @override, @nest, nest_re = [], [], []

         @nests.each { |name, re|
           re.map!.with_index { |r, i| r.is_a?(Regexp) ?

@@ -182,30 +185,30 @@

         @nest_re = /^(?<_>.*?)(?:#{nest_re.join('|')})/

-
+        reset
       end

-      def control(cmd,
+      def control(cmd, filename = nil, *)
         case cmd
-        when
-        when
-        when
-        when
+        when :FILE then reset(filename)
+        when :LIR  then reset(nil, nil)
+        when :EOL  then @linenum += 1 if @linenum
+        when :EOF  then @override.clear; @nest.clear
         end
       end

-      def process(
-
-
-
-        else
-          forward(obj)
-        end
+      def process(line, offset)
+        @offset = offset
+        tokenize(line)
+        command(:EOL, @filename) if @filename
       end

       private

-
+      def reset(filename = nil, linenum = 1)
+        @filename, @linenum, @position, @offset = filename, linenum, -1, 0
+      end
+
       def tokenize(line)
         @nest.empty? ? tokenize_line(line) : tokenize_nest(line)
       rescue => err

@@ -225,8 +228,11 @@
       def tokenize_rule(line, rules = @rules)
         rules.find { |name, expr|
           next unless line =~ expr
-
-
+
+          rest = $'
+          forward_token($&, name, rest) if name != 'SPAC' || @space
+
+          yield rest
         }
       end

@@ -235,13 +241,26 @@
         mdc = @nests[@nest.last].last.match(line)

         if mdo && (!mdc || mdo[0].length < mdc[0].length)
-
-
+          rest = mdo.post_match
           nest = @nests.keys.find { |name| mdo[name] }
-
+          text = mdo[nest]
+          lead = mdo[:_]
+
+          forward_token(lead, @nest.last, text + rest) unless lead.empty?
+
+          forward_nest(text, nest, rest)
         elsif mdc
-
-
+          rest = mdc.post_match
+          nest = @nest.pop
+          text = mdc[0]
+
+          forward_token(text, nest, rest)
+
+          if overriding?(nest)
+            @override.pop if text.downcase.end_with?("/#{@override.last}>")
+          end
+
+          tokenize(rest)
         else
           forward_token(line, @nest.last)
         end

@@ -250,21 +269,32 @@
       def tokenize_open(line)
         @nests.each { |nest, (open_re, _)|
           next unless line =~ open_re
-          return forward_nest($&, $'
+          return forward_nest($&, nest, $')
         }

         tokenize_rule(line, OTHER) { |rest| line = rest }
         tokenize(line)
       end

-      def forward_nest(match,
-
+      def forward_nest(match, nest, rest)
+        if overriding?(nest)
+          tag = rest[/^[^\s>]*/].downcase
+          @override << tag if @skip_tags.include?(tag)
+        end
+
+        forward_token(match, nest, rest)
+
         @nest << nest
         tokenize(rest)
       end

-      def forward_token(
-        forward(Token.new(
+      def forward_token(form, attr, rest = '')
+        forward(Token.new(form, @override.empty? ? attr : 'SKIP',
+          @position += 1, @offset - form.bytesize - rest.bytesize))
+      end
+
+      def overriding?(nest)
+        nest == 'HTML' && !@skip_tags.empty?
       end

     end
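
Two behavioural changes stand out in the Tokenizer hunks: rules called without an argument now returns just the rule names (RULES.map(&:first)), and the new skip-tags option keeps tag handling enabled while the @override stack downgrades every token inside the listed HTML elements to the 'SKIP' attribute. A rough standalone sketch of the accessor's new dual behaviour — using a toy rule table rather than lingo's real one, and a plain top-level method where lingo defines a class-level helper on Tokenizer:

    # Standalone sketch of the reworked rules accessor, with a toy rule table.
    RULES = [
      ['SPAC', /^\s+/],
      ['WORD', /^\w+/],
      ['PUNC', /^[!,.:;?]+/]
    ]

    def rules(name = nil)
      name ? RULES.select { |rule,| rule == name } : RULES.map(&:first)
    end

    p rules          # => ["SPAC", "WORD", "PUNC"]  (new: all rule names)
    p rules('WORD')  # => [["WORD", /^\w+/]]        (old behaviour, unchanged)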