mediacloth 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/mediacloth/mediawikiast.rb +42 -0
- data/lib/mediacloth/mediawikihtmlgenerator.rb +100 -29
- data/lib/mediacloth/mediawikilexer.rb +292 -37
- data/lib/mediacloth/mediawikilexer.rb~ +491 -0
- data/lib/mediacloth/mediawikiparser.rb +535 -173
- data/lib/mediacloth/mediawikiparser.y +183 -15
- data/lib/mediacloth/mediawikiparser.y~ +210 -0
- data/lib/mediacloth/mediawikiwalker.rb +56 -8
- data/test/data/html1 +1 -1
- data/test/data/html10 +98 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +11 -1
- data/test/data/html5 +5 -1
- data/test/data/html7 +1 -2
- data/test/data/html8 +1 -1
- data/test/data/html9 +6 -0
- data/test/data/input1 +5 -0
- data/test/data/input10 +124 -0
- data/test/data/input4 +50 -1
- data/test/data/input5 +8 -0
- data/test/data/input7 +35 -2
- data/test/data/input9 +14 -0
- data/test/data/lex1 +5 -1
- data/test/data/lex10 +87 -0
- data/test/data/lex4 +47 -1
- data/test/data/lex5 +7 -1
- data/test/data/lex7 +35 -2
- data/test/data/lex9 +14 -0
- data/test/dataproducers/html.rb +2 -2
- data/test/dataproducers/html.rb~ +24 -0
- data/test/dataproducers/lex.rb +3 -3
- data/test/dataproducers/lex.rb~ +15 -0
- data/test/debugwalker.rb +1 -1
- data/test/htmlgenerator.rb +5 -4
- data/test/lexer.rb +40 -3
- data/test/parser.rb +0 -1
- metadata +14 -3
@@ -0,0 +1,491 @@
|
|
1
|
+
#The lexer for MediaWiki language.
|
2
|
+
#
|
3
|
+
#Standalone usage:
|
4
|
+
# file = File.new("somefile", "r")
|
5
|
+
# input = file.read
|
6
|
+
# lexer = MediaWikiLexer.new
|
7
|
+
# lexer.tokenize(input)
|
8
|
+
#
|
9
|
+
#Inside RACC-generated parser:
|
10
|
+
# ...
|
11
|
+
# ---- inner ----
|
12
|
+
# attr_accessor :lexer
|
13
|
+
# def parse(input)
|
14
|
+
# lexer.tokenize(input)
|
15
|
+
# return do_parse
|
16
|
+
# end
|
17
|
+
# def next_token
|
18
|
+
# return @lexer.lex
|
19
|
+
# end
|
20
|
+
# ...
|
21
|
+
# parser = MediaWikiParser.new
|
22
|
+
# parser.lexer = MediaWikiLexer.new
|
23
|
+
# parser.parse(input)
|
24
|
+
class MediaWikiLexer
|
25
|
+
|
26
|
+
#Initialized the lexer with a match table.
|
27
|
+
#
|
28
|
+
#The match table tells the lexer which method to invoke
|
29
|
+
#on given input char during "tokenize" phase.
|
30
|
+
def initialize
    @position = 0
    #stack of opening tokens still waiting for their closing pair
    @pair_stack = [[false, false]]
    @list_stack = []
    #Dispatch table: first character of input -> matcher method.
    #Any character without an entry falls back to match_other (:TEXT).
    @lexer_table = Hash.new(method(:match_other))
    { "'"  => :match_italic_or_bold,
      "="  => :match_section,
      "["  => :match_link_start,
      "]"  => :match_link_end,
      " "  => :match_space,
      "*"  => :match_list,
      "#"  => :match_list,
      ";"  => :match_list,
      ":"  => :match_list,
      "-"  => :match_line,
      "~"  => :match_signature,
      "h"  => :match_inline_link,
      "\n" => :match_newline }.each do |char, matcher|
        @lexer_table[char] = method(matcher)
    end
end
|
49
|
+
|
50
|
+
#Transforms input stream (string) into the stream of tokens.
|
51
|
+
#Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
|
52
|
+
#This array can be given as input token-by token to RACC based parser with no
|
53
|
+
#modification. The last token [false, false] inficates EOF.
|
54
|
+
#Transforms input stream (string) into the stream of tokens.
#Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
#This array can be given as input token-by token to RACC based parser with no
#modification. The last token [false, false] indicates EOF.
def tokenize(input)
    @tokens = []
    start_para
    @cursor = 0
    @text = input
    @next_token = []

    #This tokenizer algorithm assumes that everything that is not
    #matched by the lexer is going to be :TEXT token. Otherwise it's usual
    #lexer algo which call methods from the match table to define next tokens.
    while @cursor < @text.length
        @current_token = [:TEXT, ''] unless @current_token
        @token_start = @cursor
        @char = @text[@cursor, 1]

        if @lexer_table[@char].call == :TEXT
            @current_token[1] += @text[@token_start, 1]
        else
            #flush the accumulated :TEXT token (skip it when empty)
            unless empty_text_token?
                @tokens << @current_token
                if para_breaker?(@next_token[0])
                    #if we already have a paragraph this is the time to close it
                    end_para if @para
                else
                    #if no paragraph was previously started
                    #then we should start it
                    start_para unless @para
                end
            end

            #a para-breaking token makes a pending PARA_START pointless,
            #because no matching PARA_END can follow
            if para_breaker?(@next_token[0]) and
                    @tokens.last and @tokens.last[0] == :PARA_START
                @tokens.pop
                @para = false
            end

            @next_token[1] = @text[@token_start, @cursor - @token_start]
            @tokens << @next_token
            #hack to enable sub-lexing!
            if @sub_tokens
                @tokens += @sub_tokens
                @sub_tokens = nil
            end
            #end of hack!

            #if the next token can start the paragraph, let's try that
            start_para if @tokens.last and para_starter?(@tokens.last[0])

            @current_token = nil
            @next_token = []
        end
    end
    #add the last TEXT token if it exists
    if @current_token and not empty_text_token?
        #if we already have a paragraph this is the time to close it
        end_para if @para and para_breaker?(@current_token[0])
        @tokens << @current_token
    end

    #remove empty para start or finish the paragraph if necessary
    if @tokens.last and @tokens.last[0] == :PARA_START
        @tokens.pop
        @para = false
    else
        end_para if @para
    end
    #RACC wants us to put this to indicate EOF
    @tokens << [false, false]
    @tokens
end
|
133
|
+
|
134
|
+
#Returns the next token from the stream. Useful for RACC parsers.
|
135
|
+
#Returns the next token from the stream. Useful for RACC parsers.
def lex
    @position += 1
    @tokens[@position - 1]
end
|
140
|
+
|
141
|
+
|
142
|
+
private
|
143
|
+
#Returns true if the token breaks the paragraph.
|
144
|
+
def para_breaker?(token)
    case token
    when :SECTION_START, :SECTION_END,
         :UL_START, :UL_END, :OL_START, :OL_END,
         :DL_START, :DL_END, :HLINE, :PRE
        true
    else
        false
    end
end
|
149
|
+
|
150
|
+
#Returns true if the paragraph can be started after the token
|
151
|
+
def para_starter?(token)
    case token
    when :SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE then true
    else false
    end
end
|
154
|
+
|
155
|
+
#-- ================== Match methods ================== ++#
|
156
|
+
|
157
|
+
#Matches anything that was not matched. Returns :TEXT to indicate
|
158
|
+
#that matched characters should go into :TEXT token.
|
159
|
+
#Fallback matcher: consume a single character and report it as :TEXT
#so the tokenize loop appends it to the running text token.
def match_other
    @cursor += 1
    :TEXT
end
|
163
|
+
|
164
|
+
#Matches italic or bold symbols:
|
165
|
+
# "'''" { return :BOLD; }
|
166
|
+
# "''" { return :ITALIC; }
|
167
|
+
def match_italic_or_bold
    #three quotes toggle bold, unless an italic span is currently open
    if @text[@cursor, 3] == "'''" && @pair_stack.last[0] != :ITALICSTART
        matchBold
        @cursor += 3
        nil
    elsif @text[@cursor, 2] == "''"
        matchItalic
        @cursor += 2
        nil
    else
        match_other
    end
end
|
180
|
+
|
181
|
+
#Toggles bold state via the pair stack.
#NOTE(review): camelCase name kept for compatibility with existing callers.
def matchBold
    if @pair_stack.last[0] == :BOLDSTART
        @pair_stack.pop
        @next_token[0] = :BOLDEND
    else
        @pair_stack.push @next_token
        @next_token[0] = :BOLDSTART
    end
end
|
190
|
+
|
191
|
+
#Toggles italic state via the pair stack.
#NOTE(review): camelCase name kept for compatibility with existing callers.
def matchItalic
    if @pair_stack.last[0] == :ITALICSTART
        @pair_stack.pop
        @next_token[0] = :ITALICEND
    else
        @pair_stack.push @next_token
        @next_token[0] = :ITALICSTART
    end
end
|
200
|
+
|
201
|
+
#Matches sections
|
202
|
+
def match_section
    #section markers count only at line start, or while a section heading is open
    unless at_start_of_line? || @pair_stack.last[0] == :SECTION_START
        return match_other
    end

    #consume the whole run of "=" characters
    run = 0
    run += 1 while @text[@cursor + run, 1] == "="
    @cursor += run

    if @pair_stack.last[0] == :SECTION_START
        @pair_stack.pop
        @next_token[0] = :SECTION_END
    else
        @pair_stack.push @next_token
        @next_token[0] = :SECTION_START
    end
end
|
219
|
+
|
220
|
+
#Matches start of the hyperlinks
|
221
|
+
# "[[" { return INTLINKSTART; }
|
222
|
+
# "[" { return LINKSTART; }
|
223
|
+
def match_link_start
    #decide which opener applies: "[[" internal link, "[" + http:// external link
    token, width =
        if @text[@cursor, 2] == "[["
            [:INTLINKSTART, 2]
        elsif @text[@cursor, 1] == "[" && html_link?(@cursor + 1)
            [:LINKSTART, 1]
        end
    if token
        @next_token[0] = token
        @pair_stack.push @next_token
        @cursor += width
    else
        match_other
    end
end
|
236
|
+
|
237
|
+
#Matches end of the hyperlinks
|
238
|
+
# "]]" { return INTLINKEND; }
|
239
|
+
# "]" { return LINKEND; }
|
240
|
+
def match_link_end
    #only close a link that is actually open on the pair stack
    if @pair_stack.last[0] == :INTLINKSTART && @text[@cursor, 2] == "]]"
        @pair_stack.pop
        @next_token[0] = :INTLINKEND
        @cursor += 2
    elsif @pair_stack.last[0] == :LINKSTART && @text[@cursor, 1] == "]"
        @pair_stack.pop
        @next_token[0] = :LINKEND
        @cursor += 1
    else
        match_other
    end
end
|
253
|
+
|
254
|
+
#Matches inlined unformatted html link
|
255
|
+
# "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
|
256
|
+
def match_inline_link
    #A bare http:// URL outside any bracketed link is expanded into
    #LINKSTART TEXT LINKEND via the sub-token hack.
    inside_link = [:INTLINKSTART, :LINKSTART].include?(@pair_stack.last[0])
    if html_link?(@cursor) && !inside_link
        url = extract_till_whitespace
        @next_token[0] = :LINKSTART
        @sub_tokens = [[:TEXT, url], [:LINKEND, ']']]
        @cursor += url.length
        @token_start = @cursor
    else
        match_other
    end
end
|
272
|
+
|
273
|
+
#Matches space to find preformatted areas which start with a space after a newline
|
274
|
+
# "\n\s[^\n]*" { return PRE; }
|
275
|
+
def match_space
    #a space at line start marks a preformatted line
    return match_other unless at_start_of_line?
    match_untill_eol
    @next_token[0] = :PRE
    strip_ws_from_token_start
end
|
284
|
+
|
285
|
+
#Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
|
286
|
+
#therefore we need to do some special processing with lists. The idea here is to strip
|
287
|
+
#the leftmost symbol indicating the list from the group of input lines and use separate
|
288
|
+
#lexer to process extracted fragment.
|
289
|
+
def match_list
    if at_start_of_line?
        list_id = @text[@cursor, 1]
        #lines of the list with one level of list markers stripped
        sub_text = extract_list_contents(list_id)
        extracted = 0

        #hack to tokenize everything inside the list
        @sub_tokens = []
        sub_lines = ""
        @sub_tokens << [:LI_START, ""]
        #NOTE(review): relies on String#each iterating lines (Ruby 1.8) — confirm
        sub_text.each do |t|
            #counts processed lines; also compensates below for the one
            #marker character stripped from each line
            extracted += 1
            if text_is_list? t
                #nested list line: batch it up for one recursive sub-lex
                sub_lines += t
            else
                #plain item line: first flush any batched nested list
                if not sub_lines.empty?
                    @sub_tokens += sub_lex(sub_lines)
                    sub_lines = ""
                end
                #close the previous item unless we just opened one
                if @sub_tokens.last[0] != :LI_START
                    @sub_tokens << [:LI_END, ""]
                    @sub_tokens << [:LI_START, ""]
                end
                @sub_tokens += sub_lex(t.lstrip)
            end
        end
        #flush a trailing nested list, then close the last item
        if not sub_lines.empty?
            @sub_tokens += sub_lex(sub_lines)
            @sub_tokens << [:LI_END, ""]
        else
            @sub_tokens << [:LI_END, ""]
        end

        #end of hack
        #skip the consumed region: stripped contents plus one marker per line
        @cursor += sub_text.length + extracted
        @token_start = @cursor

        #choose the wrapper token pair from the list marker character
        case
        when list_id == "*"
            @next_token[0] = :UL_START
            @sub_tokens << [:UL_END, ""]
        when list_id == "#"
            @next_token[0] = :OL_START
            @sub_tokens << [:OL_END, ""]
        when list_id == ";", list_id == ":"
            @next_token[0] = :DL_START
            @sub_tokens << [:DL_END, ""]
        end

    else
        match_other
    end
end
|
342
|
+
|
343
|
+
#Matches the line until \n
|
344
|
+
#Advances the cursor past the rest of the current line, consuming the
#trailing newline as well.
#NOTE(review): misspelled name kept — sibling matchers call it by this name.
def match_untill_eol
    loop do
        char = @text[@cursor, 1]
        break if char == "\n" || char.nil?
        @cursor += 1
    end
    @cursor += 1
end
|
352
|
+
|
353
|
+
#Matches hline tag that start with "-"
|
354
|
+
# "\n----" { return HLINE; }
|
355
|
+
def match_line
    #"----" at line start produces a horizontal rule
    return match_other unless at_start_of_line? && @text[@cursor, 4] == "----"
    @next_token[0] = :HLINE
    @cursor += 4
end
|
363
|
+
|
364
|
+
#Matches signature
|
365
|
+
# "~~~~~" { return SIGNATURE_DATE; }
|
366
|
+
# "~~~~" { return SIGNATURE_FULL; }
|
367
|
+
# "~~~" { return SIGNATURE_NAME; }
|
368
|
+
def match_signature
    #longest tilde run wins: 5 -> date, 4 -> full signature, 3 -> name
    kinds = { 5 => :SIGNATURE_DATE, 4 => :SIGNATURE_FULL, 3 => :SIGNATURE_NAME }
    length = kinds.keys.find { |n| @text[@cursor, n] == "~" * n }
    if length
        @next_token[0] = kinds[length]
        @cursor += length
    else
        match_other
    end
end
|
382
|
+
|
383
|
+
#Matches new line and breaks the paragraph if two newlines are met
|
384
|
+
#A blank line (two newlines) ends the open paragraph and queues the
#start of the next one; a single newline is plain text.
def match_newline
    if @para && @text[@cursor, 2] == "\n\n"
        @next_token[0] = :PARA_END
        @sub_tokens = [[:PARA_START, ""]]
        @cursor += 2
    else
        match_other
    end
end
|
396
|
+
|
397
|
+
#-- ================== Helper methods ================== ++#
|
398
|
+
|
399
|
+
#Checks if the token is placed at the start of the line.
|
400
|
+
def at_start_of_line?
    @cursor.zero? || @text[@cursor - 1, 1] == "\n"
end
|
407
|
+
|
408
|
+
#Checks if the text at position contains the start of the html link
|
409
|
+
def html_link?(position)
    'http://' == @text[position, 7]
end
|
412
|
+
|
413
|
+
#Adjusts @token_start to skip leading whitespaces
|
414
|
+
def strip_ws_from_token_start
    while @text[@token_start, 1] == " "
        @token_start += 1
    end
end
|
417
|
+
|
418
|
+
#Returns true if the TEXT token is empty or contains newline only
|
419
|
+
def empty_text_token?
    [[:TEXT, ''], [:TEXT, "\n"]].include?(@current_token)
end
|
422
|
+
|
423
|
+
#Returns true if the text is a list, i.e. starts with one of #;*: symbols
|
424
|
+
#that indicate a list
|
425
|
+
def text_is_list?(text)
    #returns the match offset (truthy) or nil, same as the =~ operator
    /^[#;*:].*/ =~ text
end
|
428
|
+
|
429
|
+
#Runs sublexer to tokenize sub_text
|
430
|
+
#Tokenizes sub_text with a fresh lexer and returns its token stream,
#minus the EOF marker and (optionally) the outermost paragraph pair.
def sub_lex(sub_text, strip_paragraphs=true)
    tokens = MediaWikiLexer.new.tokenize(sub_text)
    tokens.pop #drop the [false, false] EOF marker
    if strip_paragraphs
        tokens.pop if tokens.last[0] == :PARA_END
        tokens.shift if tokens[0][0] == :PARA_START
    end
    tokens
end
|
442
|
+
|
443
|
+
#Extracts the text from current cursor position till the next whitespace
|
444
|
+
#Collects characters from the cursor up to (not including) the first
#space, tab or newline; the cursor itself is not moved.
def extract_till_whitespace
    result = ""
    pos = @cursor
    while pos < @text.length
        char = @text[pos, 1]
        break if [" ", "\t", "\n"].include?(char)
        result << char
        pos += 1
    end
    result
end
|
457
|
+
|
458
|
+
#Extract list contents of list type set by list_id variable.
|
459
|
+
#Example list:
|
460
|
+
# *a
|
461
|
+
# **a
|
462
|
+
#Extracted list with id "*" will look like:
|
463
|
+
# a
|
464
|
+
# *a
|
465
|
+
def extract_list_contents(list_id)
    contents = ""
    i = @cursor + 1
    while i < @text.length
        char = @text[i, 1]
        #a newline not followed by another marker terminates the list
        if char == "\n" && @text[i + 1, 1] != list_id
            contents << char
            break
        end
        #strip one leading marker from each line, keep everything else
        contents << char unless char == list_id && @text[i - 1, 1] == "\n"
        i += 1
    end
    contents
end
|
479
|
+
|
480
|
+
def start_para
    @para = true
    @tokens.push([:PARA_START, ""])
end
|
484
|
+
|
485
|
+
def end_para
    @para = false
    @tokens.push([:PARA_END, ""])
end
|
489
|
+
|
490
|
+
end
|
491
|
+
|