mediacloth 0.0.3 → 0.5
- data/README.md +36 -0
- data/lib/mediacloth/mediawikiast.rb +58 -1
- data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
- data/lib/mediacloth/mediawikilexer.rb +1030 -656
- data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
- data/lib/mediacloth/mediawikiparams.rb +1 -10
- data/lib/mediacloth/mediawikiparser.rb +939 -409
- data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
- data/lib/mediacloth/mediawikiparser.y +256 -52
- data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
- data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
- data/lib/mediacloth/mediawikiwalker.rb +72 -1
- data/lib/mediacloth.rb +33 -10
- data/test/data/ast1 +68 -0
- data/test/data/ast10 +196 -0
- data/test/data/ast11 +34 -0
- data/test/data/ast12 +39 -0
- data/test/data/ast13 +25 -0
- data/test/data/ast14 +13 -0
- data/test/data/ast15 +25 -0
- data/test/data/ast16 +17 -0
- data/test/data/ast17 +9 -0
- data/test/data/ast18 +21 -0
- data/test/data/ast19 +32 -0
- data/test/data/ast2 +4 -0
- data/test/data/ast20 +10 -0
- data/test/data/ast21 +27 -0
- data/test/data/ast22 +22 -0
- data/test/data/ast23 +5 -0
- data/test/data/ast3 +6 -0
- data/test/data/ast4 +122 -0
- data/test/data/ast5 +122 -0
- data/test/data/ast6 +22 -0
- data/test/data/ast7 +143 -0
- data/test/data/ast8 +3 -0
- data/test/data/ast9 +11 -0
- data/test/data/html1 +33 -5
- data/test/data/html10 +31 -27
- data/test/data/html11 +19 -0
- data/test/data/html12 +32 -0
- data/test/data/html13 +29 -0
- data/test/data/html14 +4 -0
- data/test/data/html15 +29 -0
- data/test/data/html16 +28 -0
- data/test/data/html17 +10 -0
- data/test/data/html18 +8 -0
- data/test/data/html19 +27 -0
- data/test/data/html2 +1 -1
- data/test/data/html20 +7 -0
- data/test/data/html21 +5 -0
- data/test/data/html22 +24 -0
- data/test/data/html23 +7 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +60 -11
- data/test/data/html5 +45 -6
- data/test/data/html6 +5 -5
- data/test/data/html7 +59 -1
- data/test/data/html8 +1 -1
- data/test/data/html9 +10 -2
- data/test/data/input1 +4 -0
- data/test/data/input11 +19 -0
- data/test/data/input12 +32 -0
- data/test/data/input13 +10 -0
- data/test/data/input14 +8 -0
- data/test/data/input15 +10 -0
- data/test/data/input16 +28 -0
- data/test/data/input17 +10 -0
- data/test/data/input18 +16 -0
- data/test/data/input19 +29 -0
- data/test/data/input20 +8 -0
- data/test/data/input21 +18 -0
- data/test/data/input22 +20 -0
- data/test/data/input23 +8 -0
- data/test/data/input4 +13 -1
- data/test/data/input5 +45 -4
- data/test/data/input7 +25 -1
- data/test/data/lex1 +17 -18
- data/test/data/lex10 +57 -87
- data/test/data/lex11 +18 -0
- data/test/data/lex12 +32 -0
- data/test/data/lex13 +3 -0
- data/test/data/lex14 +1 -0
- data/test/data/lex15 +3 -0
- data/test/data/lex16 +27 -0
- data/test/data/lex17 +9 -0
- data/test/data/lex18 +4 -0
- data/test/data/lex19 +27 -0
- data/test/data/lex2 +2 -2
- data/test/data/lex20 +7 -0
- data/test/data/lex21 +4 -0
- data/test/data/lex22 +3 -0
- data/test/data/lex23 +7 -0
- data/test/data/lex3 +1 -1
- data/test/data/lex4 +35 -29
- data/test/data/lex5 +57 -18
- data/test/data/lex6 +7 -7
- data/test/data/lex7 +42 -18
- data/test/data/lex8 +1 -1
- data/test/data/lex9 +6 -6
- data/test/dataproducers/ast.rb +24 -0
- data/test/dataproducers/html.rb +11 -12
- data/test/dataproducers/lex.rb +9 -4
- data/test/debugwalker.rb +25 -11
- data/test/htmlgenerator.rb +170 -13
- data/test/lexer.rb +626 -83
- data/test/linkhandler.rb +39 -0
- data/test/parser.rb +176 -9
- data/test/signedwikigenerator.rb +113 -0
- metadata +158 -79
- data/README +0 -37
- data/lib/mediacloth/mediawikilexer.rb~ +0 -491
- data/lib/mediacloth/mediawikiparser.y~ +0 -210
- data/test/data/result1 +0 -48
- data/test/dataproducers/html.rb~ +0 -24
- data/test/dataproducers/lex.rb~ +0 -15
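
The headline changes in this release are a rewritten lexer and parser, pluggable link and template handling (mediawikilinkhandler.rb, mediawikitemplatehandler.rb), and a new signed-wiki generator. For orientation, the gem is normally driven through the facade in data/lib/mediacloth.rb; the following is a minimal sketch, assuming the MediaCloth::wiki_to_html entry point described in the gem's README (details may differ in 0.5):

    require 'mediacloth'

    # One-call conversion: tokenizes the markup, parses it to an AST,
    # and walks the AST with the HTML generator.
    puts MediaCloth::wiki_to_html("'''Hello''' ''world''!")

Below are the two deleted editor-backup files, reproduced as unified diffs.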
data/lib/mediacloth/mediawikilexer.rb~
@@ -1,491 +0,0 @@
-#The lexer for MediaWiki language.
-#
-#Standalone usage:
-# file = File.new("somefile", "r")
-# input = file.read
-# lexer = MediaWikiLexer.new
-# lexer.tokenize(input)
-#
-#Inside RACC-generated parser:
-# ...
-# ---- inner ----
-# attr_accessor :lexer
-# def parse(input)
-#     lexer.tokenize(input)
-#     return do_parse
-# end
-# def next_token
-#     return @lexer.lex
-# end
-# ...
-# parser = MediaWikiParser.new
-# parser.lexer = MediaWikiLexer.new
-# parser.parse(input)
-class MediaWikiLexer
-
-    #Initialized the lexer with a match table.
-    #
-    #The match table tells the lexer which method to invoke
-    #on given input char during "tokenize" phase.
-    def initialize
-        @position = 0
-        @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
-        @list_stack = []
-        @lexer_table = Hash.new(method(:match_other))
-        @lexer_table["'"] = method(:match_italic_or_bold)
-        @lexer_table["="] = method(:match_section)
-        @lexer_table["["] = method(:match_link_start)
-        @lexer_table["]"] = method(:match_link_end)
-        @lexer_table[" "] = method(:match_space)
-        @lexer_table["*"] = method(:match_list)
-        @lexer_table["#"] = method(:match_list)
-        @lexer_table[";"] = method(:match_list)
-        @lexer_table[":"] = method(:match_list)
-        @lexer_table["-"] = method(:match_line)
-        @lexer_table["~"] = method(:match_signature)
-        @lexer_table["h"] = method(:match_inline_link)
-        @lexer_table["\n"] = method(:match_newline)
-    end
-
-    #Transforms input stream (string) into the stream of tokens.
-    #Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
-    #This array can be given as input token-by token to RACC based parser with no
-    #modification. The last token [false, false] inficates EOF.
-    def tokenize(input)
-        @tokens = []
-        start_para
-        @cursor = 0
-        @text = input
-        @next_token = []
-
-        #This tokenizer algorithm assumes that everything that is not
-        #matched by the lexer is going to be :TEXT token. Otherwise it's usual
-        #lexer algo which call methods from the match table to define next tokens.
-        while (@cursor < @text.length)
-            @current_token = [:TEXT, ''] unless @current_token
-            @token_start = @cursor
-            @char = @text[@cursor, 1]
-
-            if @lexer_table[@char].call == :TEXT
-                @current_token[1] += @text[@token_start, 1]
-            else
-                #skip empty :TEXT tokens
-                puts "not a text: #{@next_token[0]}"
-                unless empty_text_token?
-                    @tokens << @current_token
-                    puts "chance to break para before #{@next_token[0]}"
-                    unless para_breaker?(@next_token[0])
-                        #if no paragraph was previously started
-                        #then we should start it
-                        start_para if !@para
-                    else
-                        #if we already have a paragraph this is the time to close it
-                        end_para if @para
-                    end
-                end
-
-                if para_breaker?(@next_token[0])
-                    if @tokens.last and @tokens.last[0] == :PARA_START
-                        #we need to remove para start token because no para end is possible
-                        @tokens.pop
-                        @para = false
-                    end
-                end
-
-                @next_token[1] = @text[@token_start, @cursor - @token_start]
-                @tokens << @next_token
-                #hack to enable sub-lexing!
-                if @sub_tokens
-                    @tokens += @sub_tokens
-                    @sub_tokens = nil
-                end
-                #end of hack!
-
-                #if the next token can start the paragraph, let's try that
-                start_para if @tokens.last and para_starter?(@tokens.last[0])
-
-                @current_token = nil
-                @next_token = []
-            end
-        end
-        #add the last TEXT token if it exists
-        puts @current_token
-        if @current_token and not empty_text_token?
-            puts "here"
-            if para_breaker?(@current_token[0])
-                #if we already have a paragraph this is the time to close it
-                end_para if @para
-            end
-            @tokens << @current_token
-        end
-
-        #remove empty para start or finish the paragraph if necessary
-        if @tokens.last and @tokens.last[0] == :PARA_START
-            @tokens.pop
-            @para = false
-        else
-            end_para if @para
-        end
-        #RACC wants us to put this to indicate EOF
-        @tokens << [false, false]
-        @tokens
-    end
-
-    #Returns the next token from the stream. Useful for RACC parsers.
-    def lex
-        token = @tokens[@position]
-        @position += 1
-        return token
-    end
-
-
-    private
-    #Returns true if the token breaks the paragraph.
-    def para_breaker?(token)
-        [:SECTION_START, :SECTION_END,
-         :UL_START, :UL_END, :OL_START, :OL_END,
-         :DL_START, :DL_END, :HLINE, :PRE].include?(token)
-    end
-
-    #Returns true if the paragraph can be started after the token
-    def para_starter?(token)
-        [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
-    end
-
-    #-- ================== Match methods ================== ++#
-
-    #Matches anything that was not matched. Returns :TEXT to indicate
-    #that matched characters should go into :TEXT token.
-    def match_other
-        @cursor += 1
-        return :TEXT
-    end
-
-    #Matches italic or bold symbols:
-    # "'''" { return :BOLD; }
-    # "''" { return :ITALIC; }
-    def match_italic_or_bold
-        if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
-            matchBold
-            @cursor += 3
-            return
-        end
-        if @text[@cursor, 2] == "''"
-            matchItalic
-            @cursor += 2
-            return
-        end
-        match_other
-    end
-
-    def matchBold
-        if @pair_stack.last[0] == :BOLDSTART
-            @next_token[0] = :BOLDEND
-            @pair_stack.pop
-        else
-            @next_token[0] = :BOLDSTART
-            @pair_stack.push @next_token
-        end
-    end
-
-    def matchItalic
-        if @pair_stack.last[0] == :ITALICSTART
-            @next_token[0] = :ITALICEND
-            @pair_stack.pop
-        else
-            @next_token[0] = :ITALICSTART
-            @pair_stack.push @next_token
-        end
-    end
-
-    #Matches sections
-    def match_section
-        if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
-            i = 0
-            i += 1 while @text[@cursor+i, 1] == "="
-            @cursor += i
-
-            if @pair_stack.last[0] == :SECTION_START
-                @next_token[0] = :SECTION_END
-                @pair_stack.pop
-            else
-                @next_token[0] = :SECTION_START
-                @pair_stack.push @next_token
-            end
-        else
-            match_other
-        end
-    end
-
-    #Matches start of the hyperlinks
-    # "[[" { return INTLINKSTART; }
-    # "[" { return LINKSTART; }
-    def match_link_start
-        if @text[@cursor, 2] == "[["
-            @next_token[0] = :INTLINKSTART
-            @pair_stack.push @next_token
-            @cursor += 2
-        elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
-            @next_token[0] = :LINKSTART
-            @pair_stack.push @next_token
-            @cursor += 1
-        else
-            match_other
-        end
-    end
-
-    #Matches end of the hyperlinks
-    # "]]" { return INTLINKEND; }
-    # "]" { return LINKEND; }
-    def match_link_end
-        if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
-            @next_token[0] = :INTLINKEND
-            @pair_stack.pop
-            @cursor += 2
-        elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
-            @next_token[0] = :LINKEND
-            @pair_stack.pop
-            @cursor += 1
-        else
-            match_other
-        end
-    end
-
-    #Matches inlined unformatted html link
-    # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
-    def match_inline_link
-        #if no link start token was detected and the text starts with http://
-        #then it's the inlined unformatted html link
-        if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
-                @pair_stack.last[0] != :LINKSTART
-            @next_token[0] = :LINKSTART
-            linkText = extract_till_whitespace
-            @sub_tokens = []
-            @sub_tokens << [:TEXT, linkText]
-            @sub_tokens << [:LINKEND, ']']
-            @cursor += linkText.length
-            @token_start = @cursor
-        else
-            match_other
-        end
-    end
-
-    #Matches space to find preformatted areas which start with a space after a newline
-    # "\n\s[^\n]*" { return PRE; }
-    def match_space
-        if at_start_of_line?
-            match_untill_eol
-            @next_token[0] = :PRE
-            strip_ws_from_token_start
-        else
-            match_other
-        end
-    end
-
-    #Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
-    #therefore we need to do some special processing with lists. The idea here is to strip
-    #the leftmost symbol indicating the list from the group of input lines and use separate
-    #lexer to process extracted fragment.
-    def match_list
-        if at_start_of_line?
-            list_id = @text[@cursor, 1]
-            sub_text = extract_list_contents(list_id)
-            extracted = 0
-
-            #hack to tokenize everything inside the list
-            @sub_tokens = []
-            sub_lines = ""
-            @sub_tokens << [:LI_START, ""]
-            sub_text.each do |t|
-                extracted += 1
-                if text_is_list? t
-                    sub_lines += t
-                else
-                    if not sub_lines.empty?
-                        @sub_tokens += sub_lex(sub_lines)
-                        sub_lines = ""
-                    end
-                    if @sub_tokens.last[0] != :LI_START
-                        @sub_tokens << [:LI_END, ""]
-                        @sub_tokens << [:LI_START, ""]
-                    end
-                    @sub_tokens += sub_lex(t.lstrip)
-                end
-            end
-            if not sub_lines.empty?
-                @sub_tokens += sub_lex(sub_lines)
-                @sub_tokens << [:LI_END, ""]
-            else
-                @sub_tokens << [:LI_END, ""]
-            end
-
-            #end of hack
-            @cursor += sub_text.length + extracted
-            @token_start = @cursor
-
-            case
-            when list_id == "*"
-                @next_token[0] = :UL_START
-                @sub_tokens << [:UL_END, ""]
-            when list_id == "#"
-                @next_token[0] = :OL_START
-                @sub_tokens << [:OL_END, ""]
-            when list_id == ";", list_id == ":"
-                @next_token[0] = :DL_START
-                @sub_tokens << [:DL_END, ""]
-            end
-
-        else
-            match_other
-        end
-    end
-
-    #Matches the line until \n
-    def match_untill_eol
-        val = @text[@cursor, 1]
-        while (val != "\n") and (!val.nil?)
-            @cursor += 1
-            val = @text[@cursor, 1]
-        end
-        @cursor += 1
-    end
-
-    #Matches hline tag that start with "-"
-    # "\n----" { return HLINE; }
-    def match_line
-        if at_start_of_line? and @text[@cursor, 4] == "----"
-            @next_token[0] = :HLINE
-            @cursor += 4
-        else
-            match_other
-        end
-    end
-
-    #Matches signature
-    # "~~~~~" { return SIGNATURE_DATE; }
-    # "~~~~" { return SIGNATURE_FULL; }
-    # "~~~" { return SIGNATURE_NAME; }
-    def match_signature
-        if @text[@cursor, 5] == "~~~~~"
-            @next_token[0] = :SIGNATURE_DATE
-            @cursor += 5
-        elsif @text[@cursor, 4] == "~~~~"
-            @next_token[0] = :SIGNATURE_FULL
-            @cursor += 4
-        elsif @text[@cursor, 3] == "~~~"
-            @next_token[0] = :SIGNATURE_NAME
-            @cursor += 3
-        else
-            match_other
-        end
-    end
-
-    #Matches new line and breaks the paragraph if two newlines are met
-    def match_newline
-        if @text[@cursor, 2] == "\n\n"
-            if @para
-                @next_token[0] = :PARA_END
-                # @para = false
-                @sub_tokens = [[:PARA_START, ""]]
-                @cursor += 2
-                return
-            end
-        end
-        match_other
-    end
-
-    #-- ================== Helper methods ================== ++#
-
-    #Checks if the token is placed at the start of the line.
-    def at_start_of_line?
-        if @cursor == 0 or @text[@cursor-1, 1] == "\n"
-            true
-        else
-            false
-        end
-    end
-
-    #Checks if the text at position contains the start of the html link
-    def html_link?(position)
-        return @text[position, 7] == 'http://'
-    end
-
-    #Adjusts @token_start to skip leading whitespaces
-    def strip_ws_from_token_start
-        @token_start += 1 while @text[@token_start, 1] == " "
-    end
-
-    #Returns true if the TEXT token is empty or contains newline only
-    def empty_text_token?
-        @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
-    end
-
-    #Returns true if the text is a list, i.e. starts with one of #;*: symbols
-    #that indicate a list
-    def text_is_list?(text)
-        return text =~ /^[#;*:].*/
-    end
-
-    #Runs sublexer to tokenize sub_text
-    def sub_lex(sub_text, strip_paragraphs=true)
-        sub_lexer = MediaWikiLexer.new
-        sub_tokens = sub_lexer.tokenize(sub_text)
-        sub_tokens.pop #false token
-        if strip_paragraphs
-            #the last PARA_END token
-            sub_tokens.pop if sub_tokens.last[0] == :PARA_END
-            #the first PARA_START token
-            sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
-        end
-        sub_tokens
-    end
-
-    #Extracts the text from current cursor position till the next whitespace
-    def extract_till_whitespace
-        i = @cursor
-        text = ""
-        while i < @text.length
-            curr = @text[i, 1]
-            if (curr == "\n") or (curr == "\t") or (curr == " ")
-                break
-            end
-            text += curr
-            i += 1
-        end
-        text
-    end
-
-    #Extract list contents of list type set by list_id variable.
-    #Example list:
-    # *a
-    # **a
-    #Extracted list with id "*" will look like:
-    # a
-    # *a
-    def extract_list_contents(list_id)
-        i = @cursor+1
-        list = ""
-        while i < @text.length
-            curr = @text[i, 1]
-            if (curr == "\n") and (@text[i+1, 1] != list_id)
-                list+=curr
-                break
-            end
-            list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
-            i += 1
-        end
-        list
-    end
-
-    def start_para
-        @tokens << [:PARA_START, ""]
-        @para = true
-    end
-
-    def end_para
-        @tokens << [:PARA_END, ""]
-        @para = false
-    end
-
-end
-
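
The deleted backup above still documents the lexer's contract: tokenize turns a markup string into an array of [TOKEN_SYMBOL, TOKEN_VALUE] pairs terminated by [false, false], and lex replays them one at a time for a RACC parser. A standalone-usage sketch following that header comment (the input and the tokens shown are illustrative):

    require 'mediacloth/mediawikilexer'

    lexer = MediaWikiLexer.new
    # Returns e.g. [[:PARA_START, ""], [:BOLDSTART, "'''"], [:TEXT, "bold"], ...,
    # [false, false]], the trailing [false, false] marking EOF for RACC.
    tokens = lexer.tokenize("'''bold''' and http://example.com/")
    tokens.each { |symbol, value| puts "#{symbol.inspect} #{value.inspect}" }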
data/lib/mediacloth/mediawikiparser.y~
@@ -1,210 +0,0 @@
-#The parser for the MediaWiki language.
-#
-#Usage together with a lexer:
-# inputFile = File.new("data/input1", "r")
-# input = inputFile.read
-# parser = MediaWikiParser.new
-# parser.lexer = MediaWikiLexer.new
-# parser.parse(input)
-class MediaWikiParser
-
-token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND
-    INTLINKSTART INTLINKEND SECTION_START SECTION_END TEXT PRE
-    HLINE SIGNATURE_NAME SIGNATURE_DATE SIGNATURE_FULL
-    UL_START UL_END LI_START LI_END OL_START OL_END
-    PARA_START PARA_END
-
-rule
-
-wiki:
-    repeated_contents
-    {
-        @nodes.push WikiAST.new
-        #@nodes.last.children.insert(0, val[0])
-        #puts val[0]
-        @nodes.last.children += val[0]
-    }
-    ;
-
-contents:
-    text
-    {
-        result = val[0]
-    }
-    | bulleted_list
-    {
-        result = val[0]
-    }
-    | numbered_list
-    {
-        result = val[0]
-    }
-    | preformatted
-    {
-        p = PreformattedAST.new
-        p.contents = val[0]
-        result = p
-    }
-    | section
-    {
-        s = SectionAST.new
-        s.contents = val[0][0]
-        s.level = val[0][1]
-        result = s
-    }
-    | PARA_START para_contents PARA_END
-    {
-        if val[1]
-            p = ParagraphAST.new
-            p.children = val[1]
-            result = p
-        end
-    }
-    | error
-    {
-        puts "ERR"
-        yyerrok
-    }
-    ;
-
-#TODO: remove empty paragraphs in lexer
-para_contents:
-    {
-        result = nil
-    }
-    | repeated_contents
-    {
-        result = val[0]
-    }
-
-repeated_contents: contents
-    {
-        result = []
-        result << val[0]
-    }
-    | repeated_contents contents
-    {
-        result = []
-        result += val[0]
-        result << val[1]
-    }
-    ;
-
-text: element
-    {
-        p = TextAST.new
-        p.formatting = val[0][0]
-        p.contents = val[0][1]
-        result = p
-    }
-    | formatted_element
-    {
-        result = val[0]
-    }
-    ;
-
-element: LINKSTART TEXT LINKEND
-    { return [:Link, val[1]] }
-    | INTLINKSTART TEXT INTLINKEND
-    { return [:InternalLink, val[1]] }
-    | TEXT
-    { return [:None, val[0]] }
-    | HLINE
-    { return [:HLine, val[0]] }
-    | SIGNATURE_DATE
-    { return [:SignatureDate, val[0]] }
-    | SIGNATURE_NAME
-    { return [:SignatureName, val[0]] }
-    | SIGNATURE_FULL
-    { return [:SignatureFull, val[0]] }
-    ;
-
-formatted_element: BOLDSTART repeated_contents BOLDEND
-    {
-        p = FormattedAST.new
-        p.formatting = :Bold
-        p.children += val[1]
-        result = p
-    }
-    | ITALICSTART repeated_contents ITALICEND
-    {
-        p = FormattedAST.new
-        p.formatting = :Italic
-        p.children += val[1]
-        result = p
-    }
-    ;
-
-bulleted_list: UL_START list_item list_contents UL_END
-    {
-        list = ListAST.new
-        list.list_type = :Bulleted
-        list.children << val[1]
-        list.children += val[2]
-        result = list
-    }
-    ;
-
-numbered_list: OL_START list_item list_contents OL_END
-    {
-        list = ListAST.new
-        list.list_type = :Numbered
-        list.children << val[1]
-        list.children += val[2]
-        result = list
-    }
-    ;
-
-list_contents:
-    { result = [] }
-    list_item list_contents
-    {
-        result << val[1]
-        result += val[2]
-    }
-    |
-    { result = [] }
-    ;
-
-list_item: LI_START repeated_contents LI_END
-    {
-        li = ListItemAST.new
-        li.children += val[1]
-        result = li
-    }
-    ;
-
-preformatted: PRE
-    { result = val[0] }
-    ;
-
-section: SECTION_START TEXT SECTION_END
-    { result = [val[1], val[0].length] }
-    ;
-
-end
-
----- header ----
-require 'mediacloth/mediawikiast'
-
----- inner ----
-
-attr_accessor :lexer
-
-def initialize
-    @nodes = []
-    super
-end
-
-#Tokenizes input string and parses it.
-def parse(input)
-    @yydebug=true
-    lexer.tokenize(input)
-    do_parse
-    return @nodes.last
-end
-
-#Asks the lexer to return the next token.
-def next_token
-    return @lexer.lex
-end
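
The header comment of the deleted grammar remains the canonical wiring for driving the parser by hand; in sketch form (illustrative input; per the inner section, parse runs do_parse and returns the root WikiAST node):

    require 'mediacloth/mediawikilexer'
    require 'mediacloth/mediawikiparser'

    parser = MediaWikiParser.new
    parser.lexer = MediaWikiLexer.new
    ast = parser.parse("== Heading ==\nSome ''wiki'' text.")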