mediacloth 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,491 @@
#The lexer for the MediaWiki markup language.
#
#Standalone usage:
# file = File.new("somefile", "r")
# input = file.read
# lexer = MediaWikiLexer.new
# lexer.tokenize(input)
#
#Inside a RACC-generated parser:
# ...
# ---- inner ----
# attr_accessor :lexer
# def parse(input)
#     lexer.tokenize(input)
#     return do_parse
# end
# def next_token
#     return @lexer.lex
# end
# ...
# parser = MediaWikiParser.new
# parser.lexer = MediaWikiLexer.new
# parser.parse(input)
class MediaWikiLexer

  #Initializes the lexer with a match table.
  #
  #The match table tells the lexer which method to invoke
  #on a given input char during the "tokenize" phase.
  def initialize
    @position = 0
    @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
    @list_stack = []
    @lexer_table = Hash.new(method(:match_other))
    @lexer_table["'"] = method(:match_italic_or_bold)
    @lexer_table["="] = method(:match_section)
    @lexer_table["["] = method(:match_link_start)
    @lexer_table["]"] = method(:match_link_end)
    @lexer_table[" "] = method(:match_space)
    @lexer_table["*"] = method(:match_list)
    @lexer_table["#"] = method(:match_list)
    @lexer_table[";"] = method(:match_list)
    @lexer_table[":"] = method(:match_list)
    @lexer_table["-"] = method(:match_line)
    @lexer_table["~"] = method(:match_signature)
    @lexer_table["h"] = method(:match_inline_link)
    @lexer_table["\n"] = method(:match_newline)
  end

  #Transforms the input stream (string) into a stream of tokens.
  #Tokens are collected into an array of type
  #[ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
  #This array can be given token-by-token as input to a RACC based parser
  #with no modification. The last token [false, false] indicates EOF.
  def tokenize(input)
    @tokens = []
    start_para
    @cursor = 0
    @text = input
    @next_token = []

    #This tokenizer algorithm assumes that everything that is not
    #matched by the lexer is going to be a :TEXT token. Otherwise it's the
    #usual lexer algo which calls methods from the match table to define
    #the next tokens.
    while @cursor < @text.length
      @current_token = [:TEXT, ''] unless @current_token
      @token_start = @cursor
      @char = @text[@cursor, 1]

      if @lexer_table[@char].call == :TEXT
        @current_token[1] += @text[@token_start, 1]
      else
        #skip empty :TEXT tokens
        unless empty_text_token?
          @tokens << @current_token
          if para_breaker?(@next_token[0])
            #if we already have a paragraph this is the time to close it
            end_para if @para
          else
            #if no paragraph was previously started
            #then we should start it
            start_para unless @para
          end
        end

        if para_breaker?(@next_token[0]) and
            @tokens.last and @tokens.last[0] == :PARA_START
          #we need to remove the para start token because no para end is possible
          @tokens.pop
          @para = false
        end

        @next_token[1] = @text[@token_start, @cursor - @token_start]
        @tokens << @next_token
        #hack to enable sub-lexing!
        if @sub_tokens
          @tokens += @sub_tokens
          @sub_tokens = nil
        end
        #end of hack!

        #if the next token can start the paragraph, let's try that
        start_para if @tokens.last and para_starter?(@tokens.last[0])

        @current_token = nil
        @next_token = []
      end
    end
    #add the last TEXT token if it exists
    if @current_token and not empty_text_token?
      if para_breaker?(@current_token[0])
        #if we already have a paragraph this is the time to close it
        end_para if @para
      end
      @tokens << @current_token
    end

    #remove empty para start or finish the paragraph if necessary
    if @tokens.last and @tokens.last[0] == :PARA_START
      @tokens.pop
      @para = false
    else
      end_para if @para
    end
    #RACC wants us to put this to indicate EOF
    @tokens << [false, false]
    @tokens
  end

  #Returns the next token from the stream. Useful for RACC parsers.
  def lex
    token = @tokens[@position]
    @position += 1
    return token
  end


  private

  #Returns true if the token breaks the paragraph.
  def para_breaker?(token)
    [:SECTION_START, :SECTION_END,
     :UL_START, :UL_END, :OL_START, :OL_END,
     :DL_START, :DL_END, :HLINE, :PRE].include?(token)
  end

  #Returns true if a paragraph can be started after the token.
  def para_starter?(token)
    [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
  end

  #-- ================== Match methods ================== ++#

  #Matches anything that was not matched. Returns :TEXT to indicate
  #that matched characters should go into the :TEXT token.
  def match_other
    @cursor += 1
    return :TEXT
  end

  #Matches italic or bold symbols:
  # "'''" { return :BOLD; }
  # "''" { return :ITALIC; }
  def match_italic_or_bold
    if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
      match_bold
      @cursor += 3
      return
    end
    if @text[@cursor, 2] == "''"
      match_italic
      @cursor += 2
      return
    end
    match_other
  end

  #Emits :BOLDEND if a bold span is open, :BOLDSTART otherwise.
  def match_bold
    if @pair_stack.last[0] == :BOLDSTART
      @next_token[0] = :BOLDEND
      @pair_stack.pop
    else
      @next_token[0] = :BOLDSTART
      @pair_stack.push @next_token
    end
  end

  #Emits :ITALICEND if an italic span is open, :ITALICSTART otherwise.
  def match_italic
    if @pair_stack.last[0] == :ITALICSTART
      @next_token[0] = :ITALICEND
      @pair_stack.pop
    else
      @next_token[0] = :ITALICSTART
      @pair_stack.push @next_token
    end
  end

  #Matches sections
  def match_section
    if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
      i = 0
      i += 1 while @text[@cursor+i, 1] == "="
      @cursor += i

      if @pair_stack.last[0] == :SECTION_START
        @next_token[0] = :SECTION_END
        @pair_stack.pop
      else
        @next_token[0] = :SECTION_START
        @pair_stack.push @next_token
      end
    else
      match_other
    end
  end

  #Matches start of the hyperlinks
  # "[[" { return INTLINKSTART; }
  # "[" { return LINKSTART; }
  def match_link_start
    if @text[@cursor, 2] == "[["
      @next_token[0] = :INTLINKSTART
      @pair_stack.push @next_token
      @cursor += 2
    elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
      @next_token[0] = :LINKSTART
      @pair_stack.push @next_token
      @cursor += 1
    else
      match_other
    end
  end

  #Matches end of the hyperlinks
  # "]]" { return INTLINKEND; }
  # "]" { return LINKEND; }
  def match_link_end
    if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
      @next_token[0] = :INTLINKEND
      @pair_stack.pop
      @cursor += 2
    elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
      @next_token[0] = :LINKEND
      @pair_stack.pop
      @cursor += 1
    else
      match_other
    end
  end

  #Matches inlined unformatted html link
  # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
  def match_inline_link
    #if no link start token was detected and the text starts with http://
    #then it's the inlined unformatted html link
    if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
        @pair_stack.last[0] != :LINKSTART
      @next_token[0] = :LINKSTART
      link_text = extract_till_whitespace
      @sub_tokens = []
      @sub_tokens << [:TEXT, link_text]
      @sub_tokens << [:LINKEND, ']']
      @cursor += link_text.length
      @token_start = @cursor
    else
      match_other
    end
  end

  #Matches space to find preformatted areas which start with a space after a newline
  # "\n\s[^\n]*" { return PRE; }
  def match_space
    if at_start_of_line?
      match_until_eol
      @next_token[0] = :PRE
      strip_ws_from_token_start
    else
      match_other
    end
  end

  #Matches any kind of list by using a sublexing technique. MediaWiki lists are
  #context-sensitive, therefore we need to do some special processing with
  #lists. The idea here is to strip the leftmost symbol indicating the list
  #from the group of input lines and use a separate lexer to process the
  #extracted fragment.
  def match_list
    if at_start_of_line?
      list_id = @text[@cursor, 1]
      sub_text = extract_list_contents(list_id)
      extracted = 0

      #hack to tokenize everything inside the list
      @sub_tokens = []
      sub_lines = ""
      @sub_tokens << [:LI_START, ""]
      #NOTE: each_line (not each) -- String#each iterated lines only in
      #Ruby 1.8 and was removed in 1.9
      sub_text.each_line do |t|
        extracted += 1
        if text_is_list? t
          sub_lines += t
        else
          if not sub_lines.empty?
            @sub_tokens += sub_lex(sub_lines)
            sub_lines = ""
          end
          if @sub_tokens.last[0] != :LI_START
            @sub_tokens << [:LI_END, ""]
            @sub_tokens << [:LI_START, ""]
          end
          @sub_tokens += sub_lex(t.lstrip)
        end
      end
      if not sub_lines.empty?
        @sub_tokens += sub_lex(sub_lines)
        @sub_tokens << [:LI_END, ""]
      else
        @sub_tokens << [:LI_END, ""]
      end

      #end of hack
      @cursor += sub_text.length + extracted
      @token_start = @cursor

      case list_id
      when "*"
        @next_token[0] = :UL_START
        @sub_tokens << [:UL_END, ""]
      when "#"
        @next_token[0] = :OL_START
        @sub_tokens << [:OL_END, ""]
      when ";", ":"
        @next_token[0] = :DL_START
        @sub_tokens << [:DL_END, ""]
      end

    else
      match_other
    end
  end

  #Advances the cursor to just past the next "\n".
  def match_until_eol
    val = @text[@cursor, 1]
    while (val != "\n") and (!val.nil?)
      @cursor += 1
      val = @text[@cursor, 1]
    end
    @cursor += 1
  end

  #Matches hline tag that starts with "-"
  # "\n----" { return HLINE; }
  def match_line
    if at_start_of_line? and @text[@cursor, 4] == "----"
      @next_token[0] = :HLINE
      @cursor += 4
    else
      match_other
    end
  end

  #Matches signature
  # "~~~~~" { return SIGNATURE_DATE; }
  # "~~~~" { return SIGNATURE_FULL; }
  # "~~~" { return SIGNATURE_NAME; }
  def match_signature
    if @text[@cursor, 5] == "~~~~~"
      @next_token[0] = :SIGNATURE_DATE
      @cursor += 5
    elsif @text[@cursor, 4] == "~~~~"
      @next_token[0] = :SIGNATURE_FULL
      @cursor += 4
    elsif @text[@cursor, 3] == "~~~"
      @next_token[0] = :SIGNATURE_NAME
      @cursor += 3
    else
      match_other
    end
  end

  #Matches a new line and breaks the paragraph if two newlines are met
  def match_newline
    if @text[@cursor, 2] == "\n\n"
      if @para
        @next_token[0] = :PARA_END
        @sub_tokens = [[:PARA_START, ""]]
        @cursor += 2
        return
      end
    end
    match_other
  end

  #-- ================== Helper methods ================== ++#

  #Checks if the token is placed at the start of the line.
  def at_start_of_line?
    if @cursor == 0 or @text[@cursor-1, 1] == "\n"
      true
    else
      false
    end
  end

  #Checks if the text at position contains the start of the html link
  def html_link?(position)
    return @text[position, 7] == 'http://'
  end

  #Adjusts @token_start to skip leading whitespaces
  def strip_ws_from_token_start
    @token_start += 1 while @text[@token_start, 1] == " "
  end

  #Returns true if the :TEXT token is empty or contains a newline only
  def empty_text_token?
    @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
  end

  #Returns true if the text is a list, i.e. starts with one of #;*: symbols
  #that indicate a list
  def text_is_list?(text)
    return text =~ /^[#;*:].*/
  end

  #Runs a sublexer to tokenize sub_text
  def sub_lex(sub_text, strip_paragraphs=true)
    sub_lexer = MediaWikiLexer.new
    sub_tokens = sub_lexer.tokenize(sub_text)
    sub_tokens.pop #false token
    if strip_paragraphs
      #the last PARA_END token
      sub_tokens.pop if sub_tokens.last[0] == :PARA_END
      #the first PARA_START token
      sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
    end
    sub_tokens
  end

  #Extracts the text from the current cursor position till the next whitespace
  def extract_till_whitespace
    i = @cursor
    text = ""
    while i < @text.length
      curr = @text[i, 1]
      if (curr == "\n") or (curr == "\t") or (curr == " ")
        break
      end
      text += curr
      i += 1
    end
    text
  end

  #Extracts list contents of the list type set by the list_id variable.
  #Example list:
  # *a
  # **a
  #The extracted list with id "*" will look like:
  # a
  # *a
  def extract_list_contents(list_id)
    i = @cursor+1
    list = ""
    while i < @text.length
      curr = @text[i, 1]
      if (curr == "\n") and (@text[i+1, 1] != list_id)
        list += curr
        break
      end
      list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
      i += 1
    end
    list
  end

  #Opens a paragraph: emits :PARA_START and remembers that a para is open.
  def start_para
    @tokens << [:PARA_START, ""]
    @para = true
  end

  #Closes a paragraph: emits :PARA_END and remembers that no para is open.
  def end_para
    @tokens << [:PARA_END, ""]
    @para = false
  end

end