mediacloth 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,11 +31,13 @@ class MediaWikiLexer
31
31
  @position = 0
32
32
  @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
33
33
  @list_stack = []
34
+ # Default lexer table
34
35
  @lexer_table = Hash.new(method(:match_other))
35
36
  @lexer_table["'"] = method(:match_italic_or_bold)
36
37
  @lexer_table["="] = method(:match_section)
37
38
  @lexer_table["["] = method(:match_link_start)
38
39
  @lexer_table["]"] = method(:match_link_end)
40
+ @lexer_table["|"] = method(:match_link_sep_or_table_cell)
39
41
  @lexer_table[" "] = method(:match_space)
40
42
  @lexer_table["*"] = method(:match_list)
41
43
  @lexer_table["#"] = method(:match_list)
@@ -45,6 +47,15 @@ class MediaWikiLexer
45
47
  @lexer_table["~"] = method(:match_signature)
46
48
  @lexer_table["h"] = method(:match_inline_link)
47
49
  @lexer_table["\n"] = method(:match_newline)
50
+ @lexer_table["\r"] = method(:match_carriagereturn)
51
+ @lexer_table["<"] = method(:match_tag_start)
52
+ @lexer_table["{"] = method(:match_table)
53
+ @lexer_table["!"] = method(:match_table_head)
54
+ # Lexer table used when inside :match_tag_start ... :match_tag_end
55
+ @tag_lexer_table = Hash.new(method(:match_other))
56
+ @tag_lexer_table["<"] = method(:match_tag_end)
57
+ # Begin lexing in default state
58
+ @current_lexer_table = @lexer_table
48
59
  end
49
60
 
50
61
  #Transforms input stream (string) into the stream of tokens.
@@ -66,13 +77,13 @@ class MediaWikiLexer
66
77
  @token_start = @cursor
67
78
  @char = @text[@cursor, 1]
68
79
 
69
- if @lexer_table[@char].call == :TEXT
80
+ if @current_lexer_table[@char].call == :TEXT
70
81
  @current_token[1] += @text[@token_start, 1]
71
82
  else
72
83
  #skip empty :TEXT tokens
73
84
  unless empty_text_token?
74
85
  @tokens << @current_token
75
- unless para_breaker?(@next_token[0])
86
+ unless para_breaker?(@next_token[0]) or in_block?
76
87
  #if no paragraph was previously started
77
88
  #then we should start it
78
89
  start_para if !@para
@@ -88,6 +99,8 @@ class MediaWikiLexer
88
99
  #we need to remove para start token because no para end is possible
89
100
  @tokens.pop
90
101
  @para = false
102
+ elsif @para
103
+ end_para
91
104
  end
92
105
  end
93
106
 
@@ -134,13 +147,19 @@ private
134
147
  #Returns true if the token breaks the paragraph.
135
148
  def para_breaker?(token)
136
149
  [:SECTION_START, :SECTION_END,
150
+ :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
137
151
  :UL_START, :UL_END, :OL_START, :OL_END,
138
152
  :DL_START, :DL_END, :HLINE, :PRE].include?(token)
139
153
  end
140
154
 
141
155
  #Returns true if the paragraph can be started after the token
142
156
  def para_starter?(token)
143
- [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
157
+ [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
158
+ end
159
+
160
# Returns a truthy value when a paragraph-breaking token is currently open on
# @pair_stack or is queued in @sub_tokens (when @sub_tokens is set).
# Token types are classified with para_breaker?.
def in_block?
  return true if @pair_stack.any? { |pair| para_breaker?(pair[0]) }
  @sub_tokens && @sub_tokens.any? { |queued| para_breaker?(queued[0]) }
end
145
164
 
146
165
  #-- ================== Match methods ================== ++#
@@ -156,7 +175,17 @@ private
156
175
  # "'''" { return :BOLD; }
157
176
  # "''" { return :ITALIC; }
158
177
  def match_italic_or_bold
159
- if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
178
+ if @text[@cursor, 5] == "'''''"
179
+ if @pair_stack.last[0] == :BOLDSTART
180
+ matchBold
181
+ @cursor += 3
182
+ else
183
+ matchItalic
184
+ @cursor += 2
185
+ end
186
+ return
187
+ end
188
+ if @text[@cursor, 3] == "'''"
160
189
  matchBold
161
190
  @cursor += 3
162
191
  return
@@ -212,11 +241,11 @@ private
212
241
  # "[[" { return INTLINKSTART; }
213
242
  # "[" { return LINKSTART; }
214
243
  def match_link_start
215
- if @text[@cursor, 2] == "[["
244
+ if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
216
245
  @next_token[0] = :INTLINKSTART
217
246
  @pair_stack.push @next_token
218
247
  @cursor += 2
219
- elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
248
+ elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
220
249
  @next_token[0] = :LINKSTART
221
250
  @pair_stack.push @next_token
222
251
  @cursor += 1
@@ -241,16 +270,35 @@ private
241
270
  match_other
242
271
  end
243
272
  end
273
+
274
# Matches the link separator ("|") inside of internal links.
#
# Emits :INTLINKSEP and advances the cursor by one when the most recently
# emitted token is :INTLINKSTART or when inside_resource_link reports that we
# are inside a resource link; otherwise the character is handed to
# match_other and its result is returned.
def match_link_sep
  # @tokens can still be empty when the separator is the first thing lexed,
  # so the previous token has to be checked for nil before indexing it.
  previous = @tokens.last
  if (previous && previous[0] == :INTLINKSTART) || inside_resource_link
    @next_token[0] = :INTLINKSEP
    @cursor += 1
  else
    match_other
  end
end
244
283
 
245
284
  #Matches inlined unformatted html link
246
285
  # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
247
286
  def match_inline_link
248
287
  #if no link start token was detected and the text starts with http://
249
288
  #then it's the inlined unformatted html link
250
- if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
251
- @pair_stack.last[0] != :LINKSTART
289
+ last_pair_token = @pair_stack.last[0]
290
+ if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
252
291
  @next_token[0] = :LINKSTART
253
- linkText = extract_till_whitespace
292
+ text = @text[@cursor..-1]
293
+ if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
294
+ linkText = $1
295
+ elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
296
+ linkText = $1
297
+ elsif text =~ /\A([^\s\n]+)[\s\n]/
298
+ linkText = $1
299
+ else
300
+ linkText = text
301
+ end
254
302
  @sub_tokens = []
255
303
  @sub_tokens << [:TEXT, linkText]
256
304
  @sub_tokens << [:LINKEND, ']']
@@ -264,10 +312,14 @@ private
264
312
  #Matches space to find preformatted areas which start with a space after a newline
265
313
  # "\n\s[^\n]*" { return PRE; }
266
314
  def match_space
267
- if at_start_of_line?
315
+ if at_start_of_line? and ! in_table?
268
316
  match_untill_eol
269
317
  @next_token[0] = :PRE
270
318
  strip_ws_from_token_start
319
+ elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
320
+ @next_token[0] = :LINKSEP
321
+ @cursor += 1
322
+ strip_ws_from_token_start
271
323
  else
272
324
  match_other
273
325
  end
@@ -325,7 +377,9 @@ private
325
377
  @next_token[0] = :DL_START
326
378
  @sub_tokens << [:DL_END, ""]
327
379
  end
328
-
380
+ elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
381
+ @next_token[0] = :RESOURCE_SEP
382
+ @cursor += 1
329
383
  else
330
384
  match_other
331
385
  end
@@ -370,14 +424,125 @@ private
370
424
  match_other
371
425
  end
372
426
  end
427
+
428
# Matches the opening "<nowiki>" tag.
#
# On a match the cursor and @token_start are moved past the tag, the active
# lexer table is switched to @tag_lexer_table, and the handler registered
# there for the next character is invoked; its result is returned.
# Any other text starting with "<" is passed to match_other.
def match_tag_start
  return match_other unless @text[@cursor, 8] == '<nowiki>'

  @cursor += 8
  @token_start = @cursor
  @current_lexer_table = @tag_lexer_table
  handler = @current_lexer_table[@text[@cursor, 1]]
  handler.call
end
438
+
439
# Matches the closing "</nowiki>" tag.
#
# On a match the cursor and @token_start are moved past the tag, the active
# lexer table is switched back to @lexer_table, and the handler registered
# there for the next character is invoked; its result is returned.
# Any other text starting with "<" is passed to match_other.
def match_tag_end
  return match_other unless @text[@cursor, 9] == '</nowiki>'

  @cursor += 9
  @token_start = @cursor
  @current_lexer_table = @lexer_table
  handler = @current_lexer_table[@text[@cursor, 1]]
  handler.call
end
449
+
450
# Matches the table start marker "{|" at the start of a line.
#
# If a paragraph is open, the end tokens for all open pairs are queued first,
# followed by :PARA_END (unless the last emitted token is :PARA_START and the
# current text token is empty), and @para is cleared. A :TABLE_START token is
# then queued and pushed on @pair_stack. The first queued token becomes
# @next_token, the rest go to @sub_tokens, and the cursor moves past "{|".
# Anything else starting with "{" is passed to match_other.
def match_table
  return match_other unless at_start_of_line? && @text[@cursor + 1, 1] == '|'

  queued = []
  if @para
    queued = end_tokens_for_open_pairs
    previous = @tokens.last
    if previous and previous[0] == :PARA_START and empty_text_token?
      # NOTE(review): this drops the last queued end token from the local
      # list, not the :PARA_START in @tokens -- confirm that is intended.
      queued.pop
    else
      queued << [:PARA_END, ""]
    end
    @para = false
  end
  queued << [:TABLE_START, '']
  @pair_stack.push [:TABLE_START, '']
  @next_token = queued.shift
  @sub_tokens = queued
  @cursor += 2
end
471
+
472
# Matches the table header marker "!" at the start of a line inside a table.
#
# Depending on the token type on top of @pair_stack it first closes an open
# cell or header (queuing :CELL_END / :HEAD_END and popping the stack), or
# opens a row when the top is not :ROW_START. It then queues :HEAD_START and
# pushes it on the stack. The first queued token becomes @next_token, the
# rest go to @sub_tokens. Outside a table, or not at the start of a line,
# the character is passed to match_other.
# NOTE(review): unlike close_table_cell, inline pairs still open on top of a
# cell are not closed here -- confirm whether that is intended.
def match_table_head
  return match_other unless at_start_of_line? && in_table?

  @cursor += 1
  queued = []
  case @pair_stack.last[0]
  when :CELL_START
    queued << [:CELL_END, '']
    @pair_stack.pop
  when :HEAD_START
    queued << [:HEAD_END, '']
    @pair_stack.pop
  when :ROW_START
    # a row is already open, nothing to close or open
  else
    queued << [:ROW_START, '']
    @pair_stack.push [:ROW_START, '']
  end
  queued << [:HEAD_START, '']
  @pair_stack.push [:HEAD_START, '']
  @next_token = queued.shift
  @sub_tokens = queued
end
494
+
495
# Handles "|", which is either a table marker or a link separator.
#
# Inside a table:
# * at the start of a line the open cell is closed; "|-" then closes the row
#   and opens a new one, "|}" closes the row and the table, and a plain "|"
#   opens a row if none is open and starts a new cell;
# * "||" elsewhere on a line closes the current cell or header and opens a
#   new one of the same kind;
# * any other "|" is treated as a link separator via match_link_sep.
# Outside a table the character is always passed to match_link_sep.
#
# Queued tokens are delivered through @next_token (first) and @sub_tokens
# (rest).
def match_link_sep_or_table_cell
  return match_link_sep unless in_table?

  tokens = []
  if at_start_of_line?
    @cursor += 1
    close_table_cell(tokens)
    marker = @text[@cursor, 1]
    if ['-', '}'].include?(marker)
      close_table_row(tokens)
      if marker == '-'
        tokens << [:ROW_START, '']
        @pair_stack.push [:ROW_START, '']
      else
        tokens << [:TABLE_END, '']
        @pair_stack.pop
      end
      @cursor += 1
    else
      if @pair_stack.last[0] != :ROW_START
        tokens << [:ROW_START, '']
        @pair_stack.push [:ROW_START, '']
      end
      tokens << [:CELL_START, '']
      @pair_stack.push [:CELL_START, '']
    end
    @next_token = tokens.shift
    @sub_tokens = tokens
  elsif @text[@cursor + 1, 1] == '|'
    @cursor += 2
    close_table_cell(tokens)
    # close_table_cell adds nothing when no cell or header was open (the stack
    # top was a row or the table itself), so the last queued token may be nil.
    # In that case a plain cell is opened.
    closed = tokens.last
    reopened = (closed && closed[0] == :HEAD_END) ? [:HEAD_START, ''] : [:CELL_START, '']
    tokens << reopened
    @pair_stack.push reopened
    @next_token = tokens.shift
    @sub_tokens = tokens
  else
    match_link_sep
  end
end
373
536
 
374
- #Matches new line and breaks the paragraph if two newlines are met
537
+ #Matches a new line and breaks the paragraph if two newline characters
538
+ #("\n\n") are met.
375
539
  def match_newline
376
540
  if @text[@cursor, 2] == "\n\n"
377
541
  if @para
378
- @next_token[0] = :PARA_END
379
- # @para = false
380
- @sub_tokens = [[:PARA_START, ""]]
542
+ @sub_tokens = end_tokens_for_open_pairs
543
+ @sub_tokens << [:PARA_END, '']
544
+ @sub_tokens << [:PARA_START, '']
545
+ @next_token[0] = @sub_tokens.slice!(0)[0]
381
546
  @cursor += 2
382
547
  return
383
548
  end
@@ -385,8 +550,40 @@ private
385
550
  match_other
386
551
  end
387
552
 
553
# Matches a carriage return and breaks the paragraph when two
# carriage return - newline sequences ("\r\n\r\n") are met while a paragraph
# is open.
#
# In that case the end tokens for all open pairs, a :PARA_END and a
# :PARA_START are queued: the type of the first one is written into
# @next_token, the rest go to @sub_tokens, the cursor skips the four
# characters and nil is returned. Otherwise the character is passed to
# match_other.
def match_carriagereturn
  paragraph_break = @text[@cursor, 4] == "\r\n\r\n"
  return match_other unless paragraph_break && @para

  queued = end_tokens_for_open_pairs
  queued.push([:PARA_END, ''], [:PARA_START, ''])
  @sub_tokens = queued
  @next_token[0] = queued.shift[0]
  @cursor += 4
  nil
end
568
+
388
569
  #-- ================== Helper methods ================== ++#
389
570
 
571
# Checks if we are lexing inside a resource link like
# [[Image:example.png|100px|Embedded image]]
#
# Returns true when the innermost open pair is :INTLINKSTART and a
# :RESOURCE_SEP token was emitted after the most recent :INTLINKSTART token;
# returns false otherwise.
def inside_resource_link
  return false unless @pair_stack.last[0] == :INTLINKSTART

  # Walk the emitted tokens from newest to oldest. The walk ends at the
  # front of @tokens, so a link start that has not been emitted yet simply
  # yields false instead of indexing past the first token.
  @tokens.reverse_each do |token|
    case token[0]
    when :RESOURCE_SEP then return true
    when :INTLINKSTART then return false
    end
  end
  false
end
586
+
390
587
  #Checks if the token is placed at the start of the line.
391
588
  def at_start_of_line?
392
589
  if @cursor == 0 or @text[@cursor-1, 1] == "\n"
@@ -395,10 +592,15 @@ private
395
592
  false
396
593
  end
397
594
  end
595
+
596
# Tells whether a table is currently open, i.e. whether @pair_stack holds a
# [:TABLE_START, ''] entry.
def in_table?
  table_opener = [:TABLE_START, '']
  @pair_stack.any? { |pair| pair == table_opener }
end
398
599
 
399
- #Checks if the text at position contains the start of the html link
400
- def html_link?(position)
401
- return @text[position, 7] == 'http://'
600
# Checks if the text at position contains the start of a link using any of
# HTTP, HTTPS, MAILTO or FILE protocols.
#
# Returns 0 when one of "http://", "https://", "file://" or "mailto:" starts
# exactly at position, nil otherwise.
def link_protocol?(position)
  # The longest recognised prefix ("https://") is 8 characters long, so only
  # that many characters are sliced instead of copying the whole remainder
  # of the text on every call.
  @text[position, 8] =~ %r{\A((http|https|file)://|mailto:)}
end
403
605
 
404
606
  #Adjusts @token_start to skip leading whitespaces
@@ -408,7 +610,8 @@ private
408
610
 
409
611
  #Returns true if the TEXT token is empty or contains newline only
410
612
  def empty_text_token?
411
- @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
613
+ @current_token[0] == :TEXT and
614
+ (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
412
615
  end
413
616
 
414
617
  #Returns true if the text is a list, i.e. starts with one of #;*: symbols
@@ -422,7 +625,7 @@ private
422
625
  sub_lexer = MediaWikiLexer.new
423
626
  sub_tokens = sub_lexer.tokenize(sub_text)
424
627
  sub_tokens.pop #false token
425
- if strip_paragraphs
628
+ if strip_paragraphs and sub_tokens.size > 0
426
629
  #the last PARA_END token
427
630
  sub_tokens.pop if sub_tokens.last[0] == :PARA_END
428
631
  #the first PARA_START token
@@ -431,21 +634,6 @@ private
431
634
  sub_tokens
432
635
  end
433
636
 
434
- #Extracts the text from current cursor position till the next whitespace
435
- def extract_till_whitespace
436
- i = @cursor
437
- text = ""
438
- while i < @text.length
439
- curr = @text[i, 1]
440
- if (curr == "\n") or (curr == "\t") or (curr == " ")
441
- break
442
- end
443
- text += curr
444
- i += 1
445
- end
446
- text
447
- end
448
-
449
637
  #Extract list contents of list type set by list_id variable.
450
638
  #Example list:
451
639
  # *a
@@ -462,9 +650,13 @@ private
462
650
  list+=curr
463
651
  break
464
652
  end
465
- list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
653
+ if (curr == list_id) and (@text[i-1, 1] == "\n")
654
+ list += "\n" if i + 1 == @text.length
655
+ else
656
+ list += curr
657
+ end
466
658
  i += 1
467
- end
659
+ end
468
660
  list
469
661
  end
470
662
 
@@ -474,9 +666,72 @@ private
474
666
  end
475
667
 
476
668
  def end_para
669
+ @tokens += end_tokens_for_open_pairs
477
670
  @tokens << [:PARA_END, ""]
478
671
  @para = false
479
672
  end
673
+
674
# Builds the list of end tokens needed to close every open pair.
#
# Entries are popped from @pair_stack (the bottom entry is always left in
# place). Italic, bold, link and table openers are turned into their end
# tokens, innermost first; entries of any other type are put back on the
# stack in their original order. Returns the array of end tokens.
def end_tokens_for_open_pairs
  closers = {
    :ITALICSTART  => :ITALICEND,
    :BOLDSTART    => :BOLDEND,
    :INTLINKSTART => :INTLINKEND,
    :LINKSTART    => :LINKEND,
    :TABLE_START  => :TABLE_END,
    :ROW_START    => :ROW_END,
    :CELL_START   => :CELL_END,
    :HEAD_START   => :HEAD_END
  }
  closed = []
  kept = []
  until @pair_stack.size <= 1
    opener = @pair_stack.pop
    closer = closers[opener[0]]
    if closer
      closed << [closer, '']
    else
      # prepend so that the kept entries end up in their original order
      kept.unshift(opener)
    end
  end
  @pair_stack += kept
  closed
end
703
+
704
# Closes the currently open table cell or header, if any.
#
# Entries are popped from @pair_stack until a cell, header, row or table
# opener is found. Inline openers (italic, bold, links) popped on the way get
# their end tokens appended to +tokens+; other entries are dropped. A cell or
# header opener is closed with :CELL_END / :HEAD_END; a row or table opener
# is pushed back untouched. The bottom stack entry is never popped, so the
# method is safe to call even when no table opener is on the stack.
#
# tokens:: array the generated end tokens are appended to.
# Returns +tokens+.
def close_table_cell(tokens)
  inline_closers = {
    :ITALICSTART  => :ITALICEND,
    :BOLDSTART    => :BOLDEND,
    :INTLINKSTART => :INTLINKEND,
    :LINKSTART    => :LINKEND
  }
  cell_closers = { :CELL_START => :CELL_END, :HEAD_START => :HEAD_END }
  table_parts = [:CELL_START, :HEAD_START, :ROW_START, :TABLE_START]

  while @pair_stack.size > 1
    opener = @pair_stack.pop
    kind = opener[0]
    if table_parts.include?(kind)
      closer = cell_closers[kind]
      if closer
        tokens << [closer, '']
      else
        # rows and tables stay open
        @pair_stack.push opener
      end
      break
    end
    closer = inline_closers[kind]
    tokens << [closer, ''] if closer
  end
  tokens
end
728
+
729
# Closes the open table row when :ROW_START is on top of @pair_stack: the
# entry is popped and a :ROW_END token is appended to +tokens+. Does nothing
# otherwise.
def close_table_row(tokens)
  return unless @pair_stack.last[0] == :ROW_START

  @pair_stack.pop
  tokens << [:ROW_END, '']
end
480
735
 
481
736
  end
482
737