mediacloth 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,491 @@
1
+ #The lexer for MediaWiki language.
2
+ #
3
+ #Standalone usage:
4
+ # file = File.new("somefile", "r")
5
+ # input = file.read
6
+ # lexer = MediaWikiLexer.new
7
+ # lexer.tokenize(input)
8
+ #
9
+ #Inside RACC-generated parser:
10
+ # ...
11
+ # ---- inner ----
12
+ # attr_accessor :lexer
13
+ # def parse(input)
14
+ # lexer.tokenize(input)
15
+ # return do_parse
16
+ # end
17
+ # def next_token
18
+ # return @lexer.lex
19
+ # end
20
+ # ...
21
+ # parser = MediaWikiParser.new
22
+ # parser.lexer = MediaWikiLexer.new
23
+ # parser.parse(input)
24
+ class MediaWikiLexer
25
+
26
#Initializes the lexer with a match table.
#
#The match table tells the lexer which method to invoke
#on a given input char during the "tokenize" phase; any
#character without an entry falls back to match_other.
def initialize
  @position = 0
  @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
  @list_stack = []
  #default handler: anything unmatched becomes plain text
  @lexer_table = Hash.new(method(:match_other))
  {
    "'"  => :match_italic_or_bold,
    "="  => :match_section,
    "["  => :match_link_start,
    "]"  => :match_link_end,
    " "  => :match_space,
    "*"  => :match_list,
    "#"  => :match_list,
    ";"  => :match_list,
    ":"  => :match_list,
    "-"  => :match_line,
    "~"  => :match_signature,
    "h"  => :match_inline_link,
    "\n" => :match_newline
  }.each do |char, handler|
    @lexer_table[char] = method(handler)
  end
end
49
+
50
#Transforms the input stream (string) into a stream of tokens.
#Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
#This array can be given as input token-by-token to a RACC based parser with no
#modification. The last token [false, false] indicates EOF.
#
#Fix: removed four leftover debug "puts" calls that printed diagnostic
#text to stdout on every non-text token and at end of input.
def tokenize(input)
  @tokens = []
  start_para
  @cursor = 0
  @text = input
  @next_token = []

  #This tokenizer algorithm assumes that everything that is not
  #matched by the lexer is going to be a :TEXT token. Otherwise it's the usual
  #lexer algo which calls methods from the match table to define next tokens.
  while @cursor < @text.length
    @current_token = [:TEXT, ''] unless @current_token
    @token_start = @cursor
    @char = @text[@cursor, 1]

    if @lexer_table[@char].call == :TEXT
      @current_token[1] += @text[@token_start, 1]
    else
      #skip empty :TEXT tokens
      unless empty_text_token?
        @tokens << @current_token
        if para_breaker?(@next_token[0])
          #if we already have a paragraph this is the time to close it
          end_para if @para
        else
          #if no paragraph was previously started
          #then we should start it
          start_para if !@para
        end
      end

      if para_breaker?(@next_token[0])
        if @tokens.last && @tokens.last[0] == :PARA_START
          #we need to remove para start token because no para end is possible
          @tokens.pop
          @para = false
        end
      end

      @next_token[1] = @text[@token_start, @cursor - @token_start]
      @tokens << @next_token
      #hack to enable sub-lexing!
      if @sub_tokens
        @tokens += @sub_tokens
        @sub_tokens = nil
      end
      #end of hack!

      #if the next token can start the paragraph, let's try that
      start_para if @tokens.last && para_starter?(@tokens.last[0])

      @current_token = nil
      @next_token = []
    end
  end
  #add the last TEXT token if it exists
  if @current_token && !empty_text_token?
    if para_breaker?(@current_token[0])
      #if we already have a paragraph this is the time to close it
      end_para if @para
    end
    @tokens << @current_token
  end

  #remove empty para start or finish the paragraph if necessary
  if @tokens.last && @tokens.last[0] == :PARA_START
    @tokens.pop
    @para = false
  else
    end_para if @para
  end
  #RACC wants us to put this to indicate EOF
  @tokens << [false, false]
  @tokens
end
133
+
134
#Returns the next token from the pre-tokenized stream and advances the
#read position. Useful as the next_token source for RACC parsers; yields
#[false, false] at EOF and nil past it.
def lex
  current = @tokens[@position]
  @position += 1
  current
end
140
+
141
+
142
+ private
143
#Returns true if the token breaks the current paragraph
#(block-level markup cannot live inside a paragraph).
def para_breaker?(token)
  case token
  when :SECTION_START, :SECTION_END,
       :UL_START, :UL_END, :OL_START, :OL_END,
       :DL_START, :DL_END, :HLINE, :PRE
    true
  else
    false
  end
end
149
+
150
#Returns true if a new paragraph may begin right after the given token
#(i.e. after block-level markup has just been closed).
def para_starter?(token)
  case token
  when :SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE
    true
  else
    false
  end
end
154
+
155
+ #-- ================== Match methods ================== ++#
156
+
157
#Matches anything that was not matched by a specialized handler.
#Consumes one character and returns :TEXT to indicate that the
#character should be folded into the current :TEXT token.
def match_other
  @cursor += 1
  :TEXT
end
163
+
164
#Matches runs of apostrophes:
# "'''" { return :BOLD; }
# "''"  { return :ITALIC; }
#A "'''" is not treated as bold while an italic span is open on the
#pair stack; anything shorter than "''" falls back to plain text.
def match_italic_or_bold
  lookahead = @text[@cursor, 3]
  if lookahead == "'''" && @pair_stack.last[0] != :ITALICSTART
    matchBold
    @cursor += 3
    nil
  elsif lookahead[0, 2] == "''"
    matchItalic
    @cursor += 2
    nil
  else
    match_other
  end
end
180
+
181
#Emits the matching bold token: :BOLDEND when a bold span is currently
#open (its start token sits on top of @pair_stack), :BOLDSTART otherwise.
#The start token is pushed so the closing side can find it later.
def matchBold
  closing = @pair_stack.last[0] == :BOLDSTART
  @next_token[0] = closing ? :BOLDEND : :BOLDSTART
  closing ? @pair_stack.pop : @pair_stack.push(@next_token)
end
190
+
191
#Emits the matching italic token: :ITALICEND when an italic span is
#currently open (its start token sits on top of @pair_stack),
#:ITALICSTART otherwise. The start token is pushed for pairing.
def matchItalic
  closing = @pair_stack.last[0] == :ITALICSTART
  @next_token[0] = closing ? :ITALICEND : :ITALICSTART
  closing ? @pair_stack.pop : @pair_stack.push(@next_token)
end
200
+
201
#Matches runs of "=" as section heading delimiters. A run at the start
#of a line opens a section; while a section is open (tracked on
#@pair_stack) the next run closes it. Elsewhere "=" is plain text.
def match_section
  closing = @pair_stack.last[0] == :SECTION_START
  if at_start_of_line? || closing
    #consume the whole run of "=" characters
    run = 0
    run += 1 while @text[@cursor + run, 1] == "="
    @cursor += run

    if closing
      @next_token[0] = :SECTION_END
      @pair_stack.pop
    else
      @next_token[0] = :SECTION_START
      @pair_stack.push @next_token
    end
  else
    match_other
  end
end
219
+
220
#Matches opening link brackets:
# "[[" { return INTLINKSTART; }
# "["  { return LINKSTART; }
#A single "[" only counts when immediately followed by "http://";
#otherwise the bracket is plain text.
def match_link_start
  pair = @text[@cursor, 2]
  if pair == "[["
    @next_token[0] = :INTLINKSTART
    @pair_stack.push @next_token
    @cursor += 2
  elsif pair[0, 1] == "[" && html_link?(@cursor + 1)
    @next_token[0] = :LINKSTART
    @pair_stack.push @next_token
    @cursor += 1
  else
    match_other
  end
end
236
+
237
#Matches closing link brackets:
# "]]" { return INTLINKEND; }
# "]"  { return LINKEND; }
#Only emits an end token when the corresponding start token is on top
#of @pair_stack; stray brackets are plain text.
def match_link_end
  pair = @text[@cursor, 2]
  if pair == "]]" && @pair_stack.last[0] == :INTLINKSTART
    @next_token[0] = :INTLINKEND
    @pair_stack.pop
    @cursor += 2
  elsif pair[0, 1] == "]" && @pair_stack.last[0] == :LINKSTART
    @next_token[0] = :LINKEND
    @pair_stack.pop
    @cursor += 1
  else
    match_other
  end
end
253
+
254
#Matches an inlined unformatted html link
# "http://[^\s]*" { return [ LINKSTART TEXT LINKEND ]; }
#Only fires when no bracketed link is currently open on @pair_stack;
#emits LINKSTART itself and queues TEXT/LINKEND as sub-tokens.
def match_inline_link
  top = @pair_stack.last[0]
  if html_link?(@cursor) && top != :INTLINKSTART && top != :LINKSTART
    @next_token[0] = :LINKSTART
    url = extract_till_whitespace
    @sub_tokens = [[:TEXT, url], [:LINKEND, ']']]
    @cursor += url.length
    #the LINKSTART token itself carries no text
    @token_start = @cursor
  else
    match_other
  end
end
272
+
273
#Matches a space that opens a preformatted area (a line starting with
#a space after a newline)
# "\n\s[^\n]*" { return PRE; }
def match_space
  return match_other unless at_start_of_line?
  match_untill_eol
  @next_token[0] = :PRE
  strip_ws_from_token_start
end
284
+
285
#Matches any kind of list by using a sublexing technique. MediaWiki lists
#are context-sensitive, therefore we need to do some special processing:
#the leftmost symbol indicating the list is stripped from the group of
#input lines and a separate lexer processes the extracted fragment.
#
#Fix: sub_text.each relied on Ruby 1.8's String#each, which was removed
#in Ruby 1.9; String#each_line behaves identically on both.
def match_list
  if at_start_of_line?
    list_id = @text[@cursor, 1]
    sub_text = extract_list_contents(list_id)
    extracted = 0

    #hack to tokenize everything inside the list
    @sub_tokens = []
    sub_lines = ""
    @sub_tokens << [:LI_START, ""]
    sub_text.each_line do |line|
      extracted += 1
      if text_is_list? line
        #nested list line: accumulate and sub-lex as a single chunk
        sub_lines += line
      else
        if not sub_lines.empty?
          @sub_tokens += sub_lex(sub_lines)
          sub_lines = ""
        end
        #open a fresh list item unless one is already open
        if @sub_tokens.last[0] != :LI_START
          @sub_tokens << [:LI_END, ""]
          @sub_tokens << [:LI_START, ""]
        end
        @sub_tokens += sub_lex(line.lstrip)
      end
    end
    if not sub_lines.empty?
      @sub_tokens += sub_lex(sub_lines)
      @sub_tokens << [:LI_END, ""]
    else
      @sub_tokens << [:LI_END, ""]
    end

    #end of hack
    #skip the consumed fragment plus one stripped marker per line
    @cursor += sub_text.length + extracted
    @token_start = @cursor

    case list_id
    when "*"
      @next_token[0] = :UL_START
      @sub_tokens << [:UL_END, ""]
    when "#"
      @next_token[0] = :OL_START
      @sub_tokens << [:OL_END, ""]
    when ";", ":"
      @next_token[0] = :DL_START
      @sub_tokens << [:DL_END, ""]
    end

  else
    match_other
  end
end
342
+
343
#Advances @cursor one position past the end of the current line
#(past the "\n", or past the end of the text when no newline remains).
#NOTE: name keeps the historical "untill" spelling — siblings call it.
def match_untill_eol
  @cursor += 1 while @text[@cursor, 1] != "\n" && !@text[@cursor, 1].nil?
  @cursor += 1
end
352
+
353
#Matches a horizontal-rule tag: a line beginning with "----"
# "\n----" { return HLINE; }
def match_line
  return match_other unless at_start_of_line? && @text[@cursor, 4] == "----"
  @next_token[0] = :HLINE
  @cursor += 4
end
363
+
364
#Matches signatures by the length of the tilde run:
# "~~~~~" { return SIGNATURE_DATE; }
# "~~~~"  { return SIGNATURE_FULL; }
# "~~~"   { return SIGNATURE_NAME; }
#Shorter runs are plain text.
def match_signature
  run = 0
  run += 1 while run < 5 && @text[@cursor + run, 1] == "~"
  case run
  when 5 then @next_token[0] = :SIGNATURE_DATE
  when 4 then @next_token[0] = :SIGNATURE_FULL
  when 3 then @next_token[0] = :SIGNATURE_NAME
  else return match_other
  end
  @cursor += run
end
382
+
383
#Matches a newline; a blank line ("\n\n") while a paragraph is open
#closes it with :PARA_END and queues a :PARA_START sub-token for the
#following paragraph. A single newline is plain text.
def match_newline
  if @text[@cursor, 2] == "\n\n" && @para
    @next_token[0] = :PARA_END
    @sub_tokens = [[:PARA_START, ""]]
    @cursor += 2
    return
  end
  match_other
end
396
+
397
+ #-- ================== Helper methods ================== ++#
398
+
399
#Checks if the cursor is placed at the start of a line
#(at the very beginning of the text or just after a newline).
def at_start_of_line?
  @cursor == 0 || @text[@cursor - 1, 1] == "\n"
end
407
+
408
#Checks if the text at the given position starts an html link
#(the literal prefix "http://").
def html_link?(position)
  'http://' == @text[position, 7]
end
412
+
413
#Adjusts @token_start forward so it skips any leading space characters.
def strip_ws_from_token_start
  while @text[@token_start, 1] == " "
    @token_start += 1
  end
end
417
+
418
#Returns true if the current TEXT token is empty or holds a lone newline.
def empty_text_token?
  [[:TEXT, ''], [:TEXT, "\n"]].include?(@current_token)
end
422
+
423
#Returns a truthy match offset if the text is a list, i.e. some line in
#it starts with one of the #;*: symbols (nil otherwise). Note the "^"
#anchor matches at any line start, matching the original behavior.
def text_is_list?(text)
  text =~ /^[#;*:]/
end
428
+
429
#Runs a nested lexer to tokenize sub_text. Drops the trailing
#[false, false] EOF marker and, when strip_paragraphs is true, also
#removes the outermost PARA_START/PARA_END pair so the fragment can be
#embedded inside an enclosing token stream.
def sub_lex(sub_text, strip_paragraphs=true)
  nested = MediaWikiLexer.new.tokenize(sub_text)
  nested.pop #the [false, false] EOF token
  if strip_paragraphs
    #the trailing PARA_END token
    nested.pop if nested.last[0] == :PARA_END
    #the leading PARA_START token
    nested.shift if nested[0][0] == :PARA_START
  end
  nested
end
442
+
443
#Extracts the text from the current cursor position up to (not
#including) the next whitespace character (space, tab or newline),
#or to the end of the text when none remains.
def extract_till_whitespace
  stop = @text.index(/[ \t\n]/, @cursor) || @text.length
  @text[@cursor, stop - @cursor]
end
457
+
458
#Extracts list contents of the list type given by list_id, stripping
#one leading list_id marker from each line.
#Example list:
# *a
# **a
#Extracted list with id "*" will look like:
# a
# *a
#Extraction stops after the first newline whose following line does not
#start with list_id.
def extract_list_contents(list_id)
  pos = @cursor + 1
  contents = ""
  while pos < @text.length
    char = @text[pos, 1]
    if char == "\n" && @text[pos + 1, 1] != list_id
      #the list ends here: keep the final newline and stop
      contents << char
      break
    end
    #drop the list marker itself when it opens a new line
    contents << char unless char == list_id && @text[pos - 1, 1] == "\n"
    pos += 1
  end
  contents
end
479
+
480
#Opens a paragraph: appends a PARA_START token and marks it open.
def start_para
  @tokens.push([:PARA_START, ""])
  @para = true
end
484
+
485
#Closes the open paragraph: appends a PARA_END token and clears the flag.
def end_para
  @tokens.push([:PARA_END, ""])
  @para = false
end
489
+
490
+ end
491
+