mediacloth 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,11 +31,13 @@ class MediaWikiLexer
31
31
  @position = 0
32
32
  @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
33
33
  @list_stack = []
34
+ # Default lexer table
34
35
  @lexer_table = Hash.new(method(:match_other))
35
36
  @lexer_table["'"] = method(:match_italic_or_bold)
36
37
  @lexer_table["="] = method(:match_section)
37
38
  @lexer_table["["] = method(:match_link_start)
38
39
  @lexer_table["]"] = method(:match_link_end)
40
+ @lexer_table["|"] = method(:match_link_sep_or_table_cell)
39
41
  @lexer_table[" "] = method(:match_space)
40
42
  @lexer_table["*"] = method(:match_list)
41
43
  @lexer_table["#"] = method(:match_list)
@@ -45,6 +47,15 @@ class MediaWikiLexer
45
47
  @lexer_table["~"] = method(:match_signature)
46
48
  @lexer_table["h"] = method(:match_inline_link)
47
49
  @lexer_table["\n"] = method(:match_newline)
50
+ @lexer_table["\r"] = method(:match_carriagereturn)
51
+ @lexer_table["<"] = method(:match_tag_start)
52
+ @lexer_table["{"] = method(:match_table)
53
+ @lexer_table["!"] = method(:match_table_head)
54
+ # Lexer table used when inside :match_tag_start ... :match_tag_end
55
+ @tag_lexer_table = Hash.new(method(:match_other))
56
+ @tag_lexer_table["<"] = method(:match_tag_end)
57
+ # Begin lexing in default state
58
+ @current_lexer_table = @lexer_table
48
59
  end
49
60
 
50
61
  #Transforms input stream (string) into the stream of tokens.
@@ -66,13 +77,13 @@ class MediaWikiLexer
66
77
  @token_start = @cursor
67
78
  @char = @text[@cursor, 1]
68
79
 
69
- if @lexer_table[@char].call == :TEXT
80
+ if @current_lexer_table[@char].call == :TEXT
70
81
  @current_token[1] += @text[@token_start, 1]
71
82
  else
72
83
  #skip empty :TEXT tokens
73
84
  unless empty_text_token?
74
85
  @tokens << @current_token
75
- unless para_breaker?(@next_token[0])
86
+ unless para_breaker?(@next_token[0]) or in_block?
76
87
  #if no paragraph was previously started
77
88
  #then we should start it
78
89
  start_para if !@para
@@ -88,6 +99,8 @@ class MediaWikiLexer
88
99
  #we need to remove para start token because no para end is possible
89
100
  @tokens.pop
90
101
  @para = false
102
+ elsif @para
103
+ end_para
91
104
  end
92
105
  end
93
106
 
@@ -134,13 +147,19 @@ private
134
147
  #Returns true if the token breaks the paragraph.
135
148
  def para_breaker?(token)
136
149
  [:SECTION_START, :SECTION_END,
150
+ :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
137
151
  :UL_START, :UL_END, :OL_START, :OL_END,
138
152
  :DL_START, :DL_END, :HLINE, :PRE].include?(token)
139
153
  end
140
154
 
141
155
  #Returns true if the paragraph can be started after the token
142
156
  def para_starter?(token)
143
- [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
157
+ [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
158
+ end
159
+
160
+ def in_block?
161
+ @pair_stack.select {|token| para_breaker?(token[0])}.size > 0 or
162
+ (@sub_tokens and @sub_tokens.select {|token| para_breaker?(token[0])}.size > 0)
144
163
  end
145
164
 
146
165
  #-- ================== Match methods ================== ++#
@@ -156,7 +175,17 @@ private
156
175
  # "'''" { return :BOLD; }
157
176
  # "''" { return :ITALIC; }
158
177
  def match_italic_or_bold
159
- if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
178
+ if @text[@cursor, 5] == "'''''"
179
+ if @pair_stack.last[0] == :BOLDSTART
180
+ matchBold
181
+ @cursor += 3
182
+ else
183
+ matchItalic
184
+ @cursor += 2
185
+ end
186
+ return
187
+ end
188
+ if @text[@cursor, 3] == "'''"
160
189
  matchBold
161
190
  @cursor += 3
162
191
  return
@@ -212,11 +241,11 @@ private
212
241
  # "[[" { return INTLINKSTART; }
213
242
  # "[" { return LINKSTART; }
214
243
  def match_link_start
215
- if @text[@cursor, 2] == "[["
244
+ if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
216
245
  @next_token[0] = :INTLINKSTART
217
246
  @pair_stack.push @next_token
218
247
  @cursor += 2
219
- elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
248
+ elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
220
249
  @next_token[0] = :LINKSTART
221
250
  @pair_stack.push @next_token
222
251
  @cursor += 1
@@ -241,16 +270,35 @@ private
241
270
  match_other
242
271
  end
243
272
  end
273
+
274
+ #Matches link separator inside of internal links
275
+ def match_link_sep
276
+ if @tokens[-1][0] == :INTLINKSTART or inside_resource_link
277
+ @next_token[0] = :INTLINKSEP
278
+ @cursor += 1
279
+ else
280
+ match_other
281
+ end
282
+ end
244
283
 
245
284
  #Matches inlined unformatted html link
246
285
  # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
247
286
  def match_inline_link
248
287
  #if no link start token was detected and the text starts with http://
249
288
  #then it's the inlined unformatted html link
250
- if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
251
- @pair_stack.last[0] != :LINKSTART
289
+ last_pair_token = @pair_stack.last[0]
290
+ if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
252
291
  @next_token[0] = :LINKSTART
253
- linkText = extract_till_whitespace
292
+ text = @text[@cursor..-1]
293
+ if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
294
+ linkText = $1
295
+ elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
296
+ linkText = $1
297
+ elsif text =~ /\A([^\s\n]+)[\s\n]/
298
+ linkText = $1
299
+ else
300
+ linkText = text
301
+ end
254
302
  @sub_tokens = []
255
303
  @sub_tokens << [:TEXT, linkText]
256
304
  @sub_tokens << [:LINKEND, ']']
@@ -264,10 +312,14 @@ private
264
312
  #Matches space to find preformatted areas which start with a space after a newline
265
313
  # "\n\s[^\n]*" { return PRE; }
266
314
  def match_space
267
- if at_start_of_line?
315
+ if at_start_of_line? and ! in_table?
268
316
  match_untill_eol
269
317
  @next_token[0] = :PRE
270
318
  strip_ws_from_token_start
319
+ elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
320
+ @next_token[0] = :LINKSEP
321
+ @cursor += 1
322
+ strip_ws_from_token_start
271
323
  else
272
324
  match_other
273
325
  end
@@ -325,7 +377,9 @@ private
325
377
  @next_token[0] = :DL_START
326
378
  @sub_tokens << [:DL_END, ""]
327
379
  end
328
-
380
+ elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
381
+ @next_token[0] = :RESOURCE_SEP
382
+ @cursor += 1
329
383
  else
330
384
  match_other
331
385
  end
@@ -370,14 +424,125 @@ private
370
424
  match_other
371
425
  end
372
426
  end
427
+
428
+ def match_tag_start
429
+ if @text[@cursor, 8] == '<nowiki>'
430
+ @cursor += 8
431
+ @token_start = @cursor
432
+ @current_lexer_table = @tag_lexer_table
433
+ @current_lexer_table[@text[@cursor, 1]].call
434
+ else
435
+ match_other
436
+ end
437
+ end
438
+
439
+ def match_tag_end
440
+ if @text[@cursor, 9] == '</nowiki>'
441
+ @cursor += 9
442
+ @token_start = @cursor
443
+ @current_lexer_table = @lexer_table
444
+ @current_lexer_table[@text[@cursor, 1]].call
445
+ else
446
+ match_other
447
+ end
448
+ end
449
+
450
+ def match_table
451
+ if at_start_of_line? and @text[@cursor + 1, 1] == '|'
452
+ tokens = []
453
+ if @para
454
+ tokens = end_tokens_for_open_pairs
455
+ if @tokens.last and @tokens.last[0] == :PARA_START and empty_text_token?
456
+ tokens.pop
457
+ else
458
+ tokens << [:PARA_END, ""]
459
+ end
460
+ @para = false
461
+ end
462
+ tokens << [:TABLE_START, '']
463
+ @pair_stack.push [:TABLE_START, '']
464
+ @next_token = tokens.shift
465
+ @sub_tokens = tokens
466
+ @cursor += 2
467
+ else
468
+ match_other
469
+ end
470
+ end
471
+
472
+ def match_table_head
473
+ if at_start_of_line? and in_table?
474
+ @cursor += 1
475
+ tokens = []
476
+ if @pair_stack.last[0] == :CELL_START
477
+ tokens << [:CELL_END, '']
478
+ @pair_stack.pop
479
+ elsif @pair_stack.last[0] == :HEAD_START
480
+ tokens << [:HEAD_END, '']
481
+ @pair_stack.pop
482
+ elsif @pair_stack.last[0] != :ROW_START
483
+ tokens << [:ROW_START, '']
484
+ @pair_stack.push [:ROW_START, '']
485
+ end
486
+ tokens << [:HEAD_START, '']
487
+ @pair_stack.push [:HEAD_START, '']
488
+ @next_token = tokens.shift
489
+ @sub_tokens = tokens
490
+ else
491
+ match_other
492
+ end
493
+ end
494
+
495
+ def match_link_sep_or_table_cell
496
+ if in_table?
497
+ tokens = []
498
+ if at_start_of_line?
499
+ @cursor += 1
500
+ close_table_cell(tokens)
501
+ if ['-', '}'].include?(@text[@cursor, 1])
502
+ close_table_row(tokens)
503
+ if @text[@cursor, 1] == '-'
504
+ tokens << [:ROW_START, '']
505
+ @pair_stack.push [:ROW_START, '']
506
+ else
507
+ tokens << [:TABLE_END, '']
508
+ @pair_stack.pop
509
+ end
510
+ @cursor += 1
511
+ else
512
+ if @pair_stack.last[0] != :ROW_START
513
+ tokens << [:ROW_START, '']
514
+ @pair_stack.push [:ROW_START, '']
515
+ end
516
+ tokens << [:CELL_START, '']
517
+ @pair_stack.push [:CELL_START, '']
518
+ end
519
+ @next_token = tokens.shift
520
+ @sub_tokens = tokens
521
+ elsif @text[@cursor + 1, 1] == '|'
522
+ @cursor += 2
523
+ close_table_cell(tokens)
524
+ next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
525
+ tokens << next_token
526
+ @pair_stack.push next_token
527
+ @next_token = tokens.shift
528
+ @sub_tokens = tokens
529
+ else
530
+ match_link_sep
531
+ end
532
+ else
533
+ match_link_sep
534
+ end
535
+ end
373
536
 
374
- #Matches new line and breaks the paragraph if two newlines are met
537
+ #Matches a new line and breaks the paragraph if two newline characters
538
+ #("\n\n") are met.
375
539
  def match_newline
376
540
  if @text[@cursor, 2] == "\n\n"
377
541
  if @para
378
- @next_token[0] = :PARA_END
379
- # @para = false
380
- @sub_tokens = [[:PARA_START, ""]]
542
+ @sub_tokens = end_tokens_for_open_pairs
543
+ @sub_tokens << [:PARA_END, '']
544
+ @sub_tokens << [:PARA_START, '']
545
+ @next_token[0] = @sub_tokens.slice!(0)[0]
381
546
  @cursor += 2
382
547
  return
383
548
  end
@@ -385,8 +550,40 @@ private
385
550
  match_other
386
551
  end
387
552
 
553
+ #Matches a new line and breaks the paragraph if two carriage return - newline
554
+ #sequences ("\r\n\r\n") are met.
555
+ def match_carriagereturn
556
+ if @text[@cursor, 4] == "\r\n\r\n"
557
+ if @para
558
+ @sub_tokens = end_tokens_for_open_pairs
559
+ @sub_tokens << [:PARA_END, '']
560
+ @sub_tokens << [:PARA_START, '']
561
+ @next_token[0] = @sub_tokens.slice!(0)[0]
562
+ @cursor += 4
563
+ return
564
+ end
565
+ end
566
+ match_other
567
+ end
568
+
388
569
  #-- ================== Helper methods ================== ++#
389
570
 
571
+ # Checks if we are lexing inside a resource link like
572
+ # [[Image:example.png|100px|Embedded image]]
573
+ def inside_resource_link
574
+ if @pair_stack.last[0] == :INTLINKSTART
575
+ pos = -1
576
+ while((token = @tokens[pos][0]) != :INTLINKSTART)
577
+ if token == :RESOURCE_SEP
578
+ return true
579
+ else
580
+ pos -= 1
581
+ end
582
+ end
583
+ end
584
+ false
585
+ end
586
+
390
587
  #Checks if the token is placed at the start of the line.
391
588
  def at_start_of_line?
392
589
  if @cursor == 0 or @text[@cursor-1, 1] == "\n"
@@ -395,10 +592,15 @@ private
395
592
  false
396
593
  end
397
594
  end
595
+
596
+ def in_table?
597
+ @pair_stack.include?([:TABLE_START, ''])
598
+ end
398
599
 
399
- #Checks if the text at position contains the start of the html link
400
- def html_link?(position)
401
- return @text[position, 7] == 'http://'
600
+ #Checks if the text at position contains the start of a link using any of
601
+ #HTTP, HTTPS, MAILTO or FILE protocols
602
+ def link_protocol?(position)
603
+ return @text[position, @text.length - position] =~ %r{\A((http|https|file)://|mailto:)}
402
604
  end
403
605
 
404
606
  #Adjusts @token_start to skip leading whitespaces
@@ -408,7 +610,8 @@ private
408
610
 
409
611
  #Returns true if the TEXT token is empty or contains newline only
410
612
  def empty_text_token?
411
- @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
613
+ @current_token[0] == :TEXT and
614
+ (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
412
615
  end
413
616
 
414
617
  #Returns true if the text is a list, i.e. starts with one of #;*: symbols
@@ -422,7 +625,7 @@ private
422
625
  sub_lexer = MediaWikiLexer.new
423
626
  sub_tokens = sub_lexer.tokenize(sub_text)
424
627
  sub_tokens.pop #false token
425
- if strip_paragraphs
628
+ if strip_paragraphs and sub_tokens.size > 0
426
629
  #the last PARA_END token
427
630
  sub_tokens.pop if sub_tokens.last[0] == :PARA_END
428
631
  #the first PARA_START token
@@ -431,21 +634,6 @@ private
431
634
  sub_tokens
432
635
  end
433
636
 
434
- #Extracts the text from current cursor position till the next whitespace
435
- def extract_till_whitespace
436
- i = @cursor
437
- text = ""
438
- while i < @text.length
439
- curr = @text[i, 1]
440
- if (curr == "\n") or (curr == "\t") or (curr == " ")
441
- break
442
- end
443
- text += curr
444
- i += 1
445
- end
446
- text
447
- end
448
-
449
637
  #Extract list contents of list type set by list_id variable.
450
638
  #Example list:
451
639
  # *a
@@ -462,9 +650,13 @@ private
462
650
  list+=curr
463
651
  break
464
652
  end
465
- list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
653
+ if (curr == list_id) and (@text[i-1, 1] == "\n")
654
+ list += "\n" if i + 1 == @text.length
655
+ else
656
+ list += curr
657
+ end
466
658
  i += 1
467
- end
659
+ end
468
660
  list
469
661
  end
470
662
 
@@ -474,9 +666,72 @@ private
474
666
  end
475
667
 
476
668
  def end_para
669
+ @tokens += end_tokens_for_open_pairs
477
670
  @tokens << [:PARA_END, ""]
478
671
  @para = false
479
672
  end
673
+
674
+ def end_tokens_for_open_pairs
675
+ tokens = []
676
+ restore = []
677
+ while(@pair_stack.size > 1) do
678
+ last = @pair_stack.pop
679
+ case last[0]
680
+ when :ITALICSTART
681
+ tokens << [:ITALICEND, '']
682
+ when :BOLDSTART
683
+ tokens << [:BOLDEND, '']
684
+ when :INTLINKSTART
685
+ tokens << [:INTLINKEND, '']
686
+ when :LINKSTART
687
+ tokens << [:LINKEND, '']
688
+ when :TABLE_START
689
+ tokens << [:TABLE_END, '']
690
+ when :ROW_START
691
+ tokens << [:ROW_END, '']
692
+ when :CELL_START
693
+ tokens << [:CELL_END, '']
694
+ when :HEAD_START
695
+ tokens << [:HEAD_END, '']
696
+ else
697
+ restore << last
698
+ end
699
+ end
700
+ @pair_stack += restore.reverse
701
+ tokens
702
+ end
703
+
704
+ def close_table_cell(tokens)
705
+ restore = []
706
+ last = @pair_stack.pop
707
+ while (last[0] != :CELL_START and last[0] != :HEAD_START and last[0] != :ROW_START and last[0] != :TABLE_START) do
708
+ case last[0]
709
+ when :ITALICSTART
710
+ tokens << [:ITALICEND, '']
711
+ when :BOLDSTART
712
+ tokens << [:BOLDEND, '']
713
+ when :INTLINKSTART
714
+ tokens << [:INTLINKEND, '']
715
+ when :LINKSTART
716
+ tokens << [:LINKEND, '']
717
+ end
718
+ last = @pair_stack.pop
719
+ end
720
+ if last[0] == :CELL_START
721
+ tokens << [:CELL_END, '']
722
+ elsif last[0] == :HEAD_START
723
+ tokens << [:HEAD_END, '']
724
+ else
725
+ @pair_stack.push last
726
+ end
727
+ end
728
+
729
+ def close_table_row(tokens)
730
+ if @pair_stack.last[0] == :ROW_START
731
+ @pair_stack.pop
732
+ tokens << [:ROW_END, '']
733
+ end
734
+ end
480
735
 
481
736
  end
482
737