RubyGems - mediacloth - Versions diffs - 0.0.2 → 0.0.3 - Mend

mediacloth 0.0.2 → 0.0.3

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (37)

data/lib/mediacloth/mediawikiast.rb +42 -0
data/lib/mediacloth/mediawikihtmlgenerator.rb +100 -29
data/lib/mediacloth/mediawikilexer.rb +292 -37
data/lib/mediacloth/mediawikilexer.rb~ +491 -0
data/lib/mediacloth/mediawikiparser.rb +535 -173
data/lib/mediacloth/mediawikiparser.y +183 -15
data/lib/mediacloth/mediawikiparser.y~ +210 -0
data/lib/mediacloth/mediawikiwalker.rb +56 -8
data/test/data/html1 +1 -1
data/test/data/html10 +98 -0
data/test/data/html3 +1 -1
data/test/data/html4 +11 -1
data/test/data/html5 +5 -1
data/test/data/html7 +1 -2
data/test/data/html8 +1 -1
data/test/data/html9 +6 -0
data/test/data/input1 +5 -0
data/test/data/input10 +124 -0
data/test/data/input4 +50 -1
data/test/data/input5 +8 -0
data/test/data/input7 +35 -2
data/test/data/input9 +14 -0
data/test/data/lex1 +5 -1
data/test/data/lex10 +87 -0
data/test/data/lex4 +47 -1
data/test/data/lex5 +7 -1
data/test/data/lex7 +35 -2
data/test/data/lex9 +14 -0
data/test/dataproducers/html.rb +2 -2
data/test/dataproducers/html.rb~ +24 -0
data/test/dataproducers/lex.rb +3 -3
data/test/dataproducers/lex.rb~ +15 -0
data/test/debugwalker.rb +1 -1
data/test/htmlgenerator.rb +5 -4
data/test/lexer.rb +40 -3
data/test/parser.rb +0 -1
metadata +14 -3

data/lib/mediacloth/mediawikilexer.rb CHANGED

@@ -31,11 +31,13 @@ class MediaWikiLexer
         @position = 0
         @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
         @list_stack = []
+        # Default lexer table
         @lexer_table = Hash.new(method(:match_other))
         @lexer_table["'"] = method(:match_italic_or_bold)
         @lexer_table["="] = method(:match_section)
         @lexer_table["["] = method(:match_link_start)
         @lexer_table["]"] = method(:match_link_end)
+        @lexer_table["|"] = method(:match_link_sep_or_table_cell)
         @lexer_table[" "] = method(:match_space)
         @lexer_table["*"] = method(:match_list)
         @lexer_table["#"] = method(:match_list)
@@ -45,6 +47,15 @@ class MediaWikiLexer
         @lexer_table["~"] = method(:match_signature)
         @lexer_table["h"] = method(:match_inline_link)
         @lexer_table["\n"] = method(:match_newline)
+        @lexer_table["\r"] = method(:match_carriagereturn)
+        @lexer_table["<"] = method(:match_tag_start)
+        @lexer_table["{"] = method(:match_table)
+        @lexer_table["!"] = method(:match_table_head)
+        # Lexer table used when inside :match_tag_start ... :match_tag_end
+        @tag_lexer_table = Hash.new(method(:match_other))
+        @tag_lexer_table["<"] = method(:match_tag_end)
+        # Begin lexing in default state
+        @current_lexer_table = @lexer_table
     end
     #Transforms input stream (string) into the stream of tokens.
@@ -66,13 +77,13 @@ class MediaWikiLexer
             @token_start = @cursor
             @char = @text[@cursor, 1]
-            if @lexer_table[@char].call == :TEXT
+            if @current_lexer_table[@char].call == :TEXT
                 @current_token[1] += @text[@token_start, 1]
             else
                 #skip empty :TEXT tokens
                 unless empty_text_token?
                     @tokens << @current_token
-                    unless para_breaker?(@next_token[0])
+                    unless para_breaker?(@next_token[0]) or in_block?
                         #if no paragraph was previously started
                         #then we should start it
                         start_para if !@para
@@ -88,6 +99,8 @@ class MediaWikiLexer
                         #we need to remove para start token because no para end is possible
                         @tokens.pop
                         @para = false
+                    elsif @para
+                        end_para
                     end
                 end
@@ -134,13 +147,19 @@ private
     #Returns true if the token breaks the paragraph.
     def para_breaker?(token)
         [:SECTION_START, :SECTION_END,
+        :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
         :UL_START, :UL_END, :OL_START, :OL_END,
         :DL_START, :DL_END, :HLINE, :PRE].include?(token)
     end
     #Returns true if the paragraph can be started after the token
     def para_starter?(token)
-        [:SECTION_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
+        [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
+    end
+    def in_block?
+      @pair_stack.select {|token| para_breaker?(token[0])}.size > 0 or
+        (@sub_tokens and @sub_tokens.select {|token| para_breaker?(token[0])}.size > 0)
     end
     #-- ================== Match methods ================== ++#
@@ -156,7 +175,17 @@ private
     # "'''"     { return :BOLD; }
     # "''"      { return :ITALIC; }
     def match_italic_or_bold
-        if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
+        if @text[@cursor, 5] == "'''''"
+            if @pair_stack.last[0] == :BOLDSTART
+              matchBold
+              @cursor += 3
+            else
+              matchItalic
+              @cursor += 2
+            end
+            return
+        end
+        if @text[@cursor, 3] == "'''"
             matchBold
             @cursor += 3
             return
@@ -212,11 +241,11 @@ private
     # "[["      { return INTLINKSTART; }
     # "["       { return LINKSTART; }
     def match_link_start
-        if @text[@cursor, 2] == "[["
+        if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
             @next_token[0] = :INTLINKSTART
             @pair_stack.push @next_token
             @cursor += 2
-        elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
+        elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
             @next_token[0] = :LINKSTART
             @pair_stack.push @next_token
             @cursor += 1
@@ -241,16 +270,35 @@ private
             match_other
         end
     end
+    #Matches link separator inside of internal links
+    def match_link_sep
+      if @tokens[-1][0] == :INTLINKSTART or inside_resource_link
+        @next_token[0] = :INTLINKSEP
+        @cursor += 1
+      else
+        match_other
+      end
+    end
     #Matches inlined unformatted html link
     # "http://[^\s]*"   { return [ LINKSTART TEXT LINKEND]; }
     def match_inline_link
         #if no link start token was detected and the text starts with http://
         #then it's the inlined unformatted html link
-        if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
-                @pair_stack.last[0] != :LINKSTART
+        last_pair_token = @pair_stack.last[0]
+        if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
             @next_token[0] = :LINKSTART
-            linkText = extract_till_whitespace
+            text = @text[@cursor..-1]
+            if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
+              linkText = $1
+            elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
+              linkText = $1
+            elsif text =~ /\A([^\s\n]+)[\s\n]/
+              linkText = $1
+            else
+              linkText = text
+            end
             @sub_tokens = []
             @sub_tokens << [:TEXT, linkText]
             @sub_tokens << [:LINKEND, ']']
@@ -264,10 +312,14 @@ private
     #Matches space to find preformatted areas which start with a space after a newline
     # "\n\s[^\n]*"     { return PRE; }
     def match_space
-        if at_start_of_line?
+        if at_start_of_line? and ! in_table?
             match_untill_eol
             @next_token[0] = :PRE
             strip_ws_from_token_start
+        elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
+            @next_token[0] = :LINKSEP
+            @cursor += 1
+            strip_ws_from_token_start
         else
             match_other
         end
@@ -325,7 +377,9 @@ private
                     @next_token[0] = :DL_START
                     @sub_tokens << [:DL_END, ""]
             end
+        elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
+            @next_token[0] = :RESOURCE_SEP
+            @cursor += 1
         else
             match_other
         end
@@ -370,14 +424,125 @@ private
             match_other
         end
     end
+    def match_tag_start
+        if @text[@cursor, 8] == '<nowiki>'
+            @cursor += 8
+            @token_start = @cursor
+            @current_lexer_table = @tag_lexer_table
+            @current_lexer_table[@text[@cursor, 1]].call
+        else
+            match_other
+        end
+    end
+    def match_tag_end
+        if @text[@cursor, 9] == '</nowiki>'
+            @cursor += 9
+            @token_start = @cursor
+            @current_lexer_table = @lexer_table
+            @current_lexer_table[@text[@cursor, 1]].call
+        else
+            match_other
+        end
+    end
+    def match_table
+        if at_start_of_line? and @text[@cursor + 1, 1] == '|'
+            tokens = []
+            if @para
+                tokens = end_tokens_for_open_pairs
+                if @tokens.last and @tokens.last[0] == :PARA_START and empty_text_token?
+                    tokens.pop
+                else
+                    tokens << [:PARA_END, ""]
+                end
+                @para = false
+            end
+            tokens << [:TABLE_START, '']
+            @pair_stack.push [:TABLE_START, '']
+            @next_token = tokens.shift
+            @sub_tokens = tokens
+            @cursor += 2
+        else
+            match_other
+        end
+    end
+    def match_table_head
+        if at_start_of_line? and in_table?
+            @cursor += 1
+            tokens = []
+            if @pair_stack.last[0] == :CELL_START
+                tokens << [:CELL_END, '']
+                @pair_stack.pop
+            elsif @pair_stack.last[0] == :HEAD_START
+                tokens << [:HEAD_END, '']
+                @pair_stack.pop
+            elsif @pair_stack.last[0] != :ROW_START
+                tokens << [:ROW_START, '']
+                @pair_stack.push [:ROW_START, '']
+            end
+            tokens << [:HEAD_START, '']
+            @pair_stack.push [:HEAD_START, '']
+            @next_token = tokens.shift
+            @sub_tokens = tokens
+        else
+            match_other
+        end
+    end
+    def match_link_sep_or_table_cell
+        if in_table?
+            tokens = []
+            if at_start_of_line?
+                @cursor += 1
+                close_table_cell(tokens)
+                if ['-', '}'].include?(@text[@cursor, 1])
+                    close_table_row(tokens)
+                    if @text[@cursor, 1] == '-'
+                        tokens << [:ROW_START, '']
+                        @pair_stack.push [:ROW_START, '']
+                    else
+                        tokens << [:TABLE_END, '']
+                        @pair_stack.pop
+                    end
+                    @cursor += 1
+                else
+                    if @pair_stack.last[0] != :ROW_START
+                        tokens << [:ROW_START, '']
+                        @pair_stack.push [:ROW_START, '']
+                    end
+                    tokens << [:CELL_START, '']
+                    @pair_stack.push [:CELL_START, '']
+                end
+                @next_token = tokens.shift
+                @sub_tokens = tokens
+            elsif @text[@cursor + 1, 1] == '|'
+                @cursor += 2
+                close_table_cell(tokens)
+                next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
+                tokens << next_token
+                @pair_stack.push next_token
+                @next_token = tokens.shift
+                @sub_tokens = tokens
+            else
+                match_link_sep
+            end
+        else
+            match_link_sep
+        end
+    end
-    #Matches new line and breaks the paragraph if two newlines are met
+    #Matches a new line and breaks the paragraph if two newline characters
+    #("\n\n") are met.
     def match_newline
         if @text[@cursor, 2] == "\n\n"
             if @para
-                @next_token[0] = :PARA_END
-#                @para = false
-                @sub_tokens = [[:PARA_START, ""]]
+                @sub_tokens = end_tokens_for_open_pairs
+                @sub_tokens << [:PARA_END, '']
+                @sub_tokens << [:PARA_START, '']
+                @next_token[0] = @sub_tokens.slice!(0)[0]
                 @cursor += 2
                 return
             end
@@ -385,8 +550,40 @@ private
         match_other
     end
+    #Matches a new line and breaks the paragraph if two carriage return - newline
+    #sequences ("\r\n\r\n") are met.
+    def match_carriagereturn
+        if @text[@cursor, 4] == "\r\n\r\n"
+            if @para
+                @sub_tokens = end_tokens_for_open_pairs
+                @sub_tokens << [:PARA_END, '']
+                @sub_tokens << [:PARA_START, '']
+                @next_token[0] = @sub_tokens.slice!(0)[0]
+                @cursor += 4
+                return
+            end
+        end
+        match_other
+    end
     #-- ================== Helper methods ================== ++#
+    # Checks if we are lexing inside a resource link like
+    # [[Image:example.png|100px|Embedded image]]
+    def inside_resource_link
+      if @pair_stack.last[0] == :INTLINKSTART
+        pos = -1
+        while((token = @tokens[pos][0])  != :INTLINKSTART)
+          if token == :RESOURCE_SEP
+            return true
+          else
+            pos -= 1
+          end
+        end
+      end
+      false
+    end
     #Checks if the token is placed at the start of the line.
     def at_start_of_line?
         if @cursor == 0 or @text[@cursor-1, 1] == "\n"
@@ -395,10 +592,15 @@ private
             false
         end
     end
+    def in_table?
+        @pair_stack.include?([:TABLE_START, ''])
+    end
-    #Checks if the text at position contains the start of the html link
-    def html_link?(position)
-        return @text[position, 7] == 'http://'
+    #Checks if the text at position contains the start of a link using any of
+    #HTTP, HTTPS, MAILTO or FILE protocols
+    def link_protocol?(position)
+        return @text[position, @text.length - position] =~ %r{\A((http|https|file)://|mailto:)}
     end
     #Adjusts @token_start to skip leading whitespaces
@@ -408,7 +610,8 @@ private
     #Returns true if the TEXT token is empty or contains newline only
     def empty_text_token?
-        @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
+        @current_token[0] == :TEXT and
+            (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
     end
     #Returns true if the text is a list, i.e. starts with one of #;*: symbols
@@ -422,7 +625,7 @@ private
         sub_lexer = MediaWikiLexer.new
         sub_tokens = sub_lexer.tokenize(sub_text)
         sub_tokens.pop #false token
-        if strip_paragraphs
+        if strip_paragraphs and sub_tokens.size > 0
             #the last PARA_END token
             sub_tokens.pop if sub_tokens.last[0] == :PARA_END
             #the first PARA_START token
@@ -431,21 +634,6 @@ private
         sub_tokens
     end
-    #Extracts the text from current cursor position till the next whitespace
-    def extract_till_whitespace
-        i = @cursor
-        text = ""
-        while i < @text.length
-            curr = @text[i, 1]
-            if (curr == "\n") or (curr == "\t") or (curr == " ")
-                break
-            end
-            text += curr
-            i += 1
-        end
-        text
-    end
     #Extract list contents of list type set by list_id variable.
     #Example list:
     # *a
@@ -462,9 +650,13 @@ private
                 list+=curr
                 break
             end
-            list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
+            if (curr == list_id) and (@text[i-1, 1] == "\n")
+                list += "\n" if i + 1 == @text.length
+            else
+                list += curr
+            end
             i += 1
-        end
+        end
         list
     end
@@ -474,9 +666,72 @@ private
     end
     def end_para
+        @tokens += end_tokens_for_open_pairs
         @tokens << [:PARA_END, ""]
         @para = false
     end
+    def end_tokens_for_open_pairs
+        tokens = []
+        restore = []
+        while(@pair_stack.size > 1) do
+          last = @pair_stack.pop
+          case last[0]
+          when :ITALICSTART
+              tokens << [:ITALICEND, '']
+          when :BOLDSTART
+              tokens << [:BOLDEND, '']
+          when :INTLINKSTART
+              tokens << [:INTLINKEND, '']
+          when :LINKSTART
+              tokens << [:LINKEND, '']
+          when :TABLE_START
+              tokens << [:TABLE_END, '']
+          when :ROW_START
+              tokens << [:ROW_END, '']
+          when :CELL_START
+              tokens << [:CELL_END, '']
+          when :HEAD_START
+              tokens << [:HEAD_END, '']
+          else
+              restore << last
+          end
+        end
+        @pair_stack += restore.reverse
+        tokens
+    end
+    def close_table_cell(tokens)
+        restore = []
+        last = @pair_stack.pop
+        while (last[0] != :CELL_START and last[0] != :HEAD_START and last[0] != :ROW_START and last[0] != :TABLE_START) do
+            case last[0]
+            when :ITALICSTART
+                tokens << [:ITALICEND, '']
+            when :BOLDSTART
+                tokens << [:BOLDEND, '']
+            when :INTLINKSTART
+                tokens << [:INTLINKEND, '']
+            when :LINKSTART
+                tokens << [:LINKEND, '']
+            end
+            last = @pair_stack.pop
+        end
+        if last[0] == :CELL_START
+            tokens << [:CELL_END, '']
+        elsif last[0] == :HEAD_START
+            tokens << [:HEAD_END, '']
+        else
+            @pair_stack.push last
+        end
+    end
+    def close_table_row(tokens)
+        if @pair_stack.last[0] == :ROW_START
+            @pair_stack.pop
+            tokens << [:ROW_END, '']
+        end
+    end
 end