mediacloth 0.0.3 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. data/README.md +36 -0
  2. data/lib/mediacloth/mediawikiast.rb +58 -1
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
  4. data/lib/mediacloth/mediawikilexer.rb +1030 -656
  5. data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
  6. data/lib/mediacloth/mediawikiparams.rb +1 -10
  7. data/lib/mediacloth/mediawikiparser.rb +939 -409
  8. data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
  9. data/lib/mediacloth/mediawikiparser.y +256 -52
  10. data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
  11. data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
  12. data/lib/mediacloth/mediawikiwalker.rb +72 -1
  13. data/lib/mediacloth.rb +33 -10
  14. data/test/data/ast1 +68 -0
  15. data/test/data/ast10 +196 -0
  16. data/test/data/ast11 +34 -0
  17. data/test/data/ast12 +39 -0
  18. data/test/data/ast13 +25 -0
  19. data/test/data/ast14 +13 -0
  20. data/test/data/ast15 +25 -0
  21. data/test/data/ast16 +17 -0
  22. data/test/data/ast17 +9 -0
  23. data/test/data/ast18 +21 -0
  24. data/test/data/ast19 +32 -0
  25. data/test/data/ast2 +4 -0
  26. data/test/data/ast20 +10 -0
  27. data/test/data/ast21 +27 -0
  28. data/test/data/ast22 +22 -0
  29. data/test/data/ast23 +5 -0
  30. data/test/data/ast3 +6 -0
  31. data/test/data/ast4 +122 -0
  32. data/test/data/ast5 +122 -0
  33. data/test/data/ast6 +22 -0
  34. data/test/data/ast7 +143 -0
  35. data/test/data/ast8 +3 -0
  36. data/test/data/ast9 +11 -0
  37. data/test/data/html1 +33 -5
  38. data/test/data/html10 +31 -27
  39. data/test/data/html11 +19 -0
  40. data/test/data/html12 +32 -0
  41. data/test/data/html13 +29 -0
  42. data/test/data/html14 +4 -0
  43. data/test/data/html15 +29 -0
  44. data/test/data/html16 +28 -0
  45. data/test/data/html17 +10 -0
  46. data/test/data/html18 +8 -0
  47. data/test/data/html19 +27 -0
  48. data/test/data/html2 +1 -1
  49. data/test/data/html20 +7 -0
  50. data/test/data/html21 +5 -0
  51. data/test/data/html22 +24 -0
  52. data/test/data/html23 +7 -0
  53. data/test/data/html3 +1 -1
  54. data/test/data/html4 +60 -11
  55. data/test/data/html5 +45 -6
  56. data/test/data/html6 +5 -5
  57. data/test/data/html7 +59 -1
  58. data/test/data/html8 +1 -1
  59. data/test/data/html9 +10 -2
  60. data/test/data/input1 +4 -0
  61. data/test/data/input11 +19 -0
  62. data/test/data/input12 +32 -0
  63. data/test/data/input13 +10 -0
  64. data/test/data/input14 +8 -0
  65. data/test/data/input15 +10 -0
  66. data/test/data/input16 +28 -0
  67. data/test/data/input17 +10 -0
  68. data/test/data/input18 +16 -0
  69. data/test/data/input19 +29 -0
  70. data/test/data/input20 +8 -0
  71. data/test/data/input21 +18 -0
  72. data/test/data/input22 +20 -0
  73. data/test/data/input23 +8 -0
  74. data/test/data/input4 +13 -1
  75. data/test/data/input5 +45 -4
  76. data/test/data/input7 +25 -1
  77. data/test/data/lex1 +17 -18
  78. data/test/data/lex10 +57 -87
  79. data/test/data/lex11 +18 -0
  80. data/test/data/lex12 +32 -0
  81. data/test/data/lex13 +3 -0
  82. data/test/data/lex14 +1 -0
  83. data/test/data/lex15 +3 -0
  84. data/test/data/lex16 +27 -0
  85. data/test/data/lex17 +9 -0
  86. data/test/data/lex18 +4 -0
  87. data/test/data/lex19 +27 -0
  88. data/test/data/lex2 +2 -2
  89. data/test/data/lex20 +7 -0
  90. data/test/data/lex21 +4 -0
  91. data/test/data/lex22 +3 -0
  92. data/test/data/lex23 +7 -0
  93. data/test/data/lex3 +1 -1
  94. data/test/data/lex4 +35 -29
  95. data/test/data/lex5 +57 -18
  96. data/test/data/lex6 +7 -7
  97. data/test/data/lex7 +42 -18
  98. data/test/data/lex8 +1 -1
  99. data/test/data/lex9 +6 -6
  100. data/test/dataproducers/ast.rb +24 -0
  101. data/test/dataproducers/html.rb +11 -12
  102. data/test/dataproducers/lex.rb +9 -4
  103. data/test/debugwalker.rb +25 -11
  104. data/test/htmlgenerator.rb +170 -13
  105. data/test/lexer.rb +626 -83
  106. data/test/linkhandler.rb +39 -0
  107. data/test/parser.rb +176 -9
  108. data/test/signedwikigenerator.rb +113 -0
  109. metadata +158 -79
  110. data/README +0 -37
  111. data/lib/mediacloth/mediawikilexer.rb~ +0 -491
  112. data/lib/mediacloth/mediawikiparser.y~ +0 -210
  113. data/test/data/result1 +0 -48
  114. data/test/dataproducers/html.rb~ +0 -24
  115. data/test/dataproducers/lex.rb~ +0 -15
@@ -1,737 +1,1111 @@
1
- #The lexer for MediaWiki language.
2
- #
3
- #Standalone usage:
4
- # file = File.new("somefile", "r")
5
- # input = file.read
6
- # lexer = MediaWikiLexer.new
7
- # lexer.tokenize(input)
8
- #
9
- #Inside RACC-generated parser:
10
- # ...
11
- # ---- inner ----
12
- # attr_accessor :lexer
13
- # def parse(input)
14
- # lexer.tokenize(input)
15
- # return do_parse
16
- # end
17
- # def next_token
18
- # return @lexer.lex
19
- # end
20
- # ...
21
- # parser = MediaWikiParser.new
22
- # parser.lexer = MediaWikiLexer.new
23
- # parser.parse(input)
24
- class MediaWikiLexer
25
-
26
- #Initialized the lexer with a match table.
27
- #
28
- #The match table tells the lexer which method to invoke
29
- #on given input char during "tokenize" phase.
30
- def initialize
31
- @position = 0
32
- @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
33
- @list_stack = []
34
- # Default lexer table
35
- @lexer_table = Hash.new(method(:match_other))
36
- @lexer_table["'"] = method(:match_italic_or_bold)
37
- @lexer_table["="] = method(:match_section)
38
- @lexer_table["["] = method(:match_link_start)
39
- @lexer_table["]"] = method(:match_link_end)
40
- @lexer_table["|"] = method(:match_link_sep_or_table_cell)
41
- @lexer_table[" "] = method(:match_space)
42
- @lexer_table["*"] = method(:match_list)
43
- @lexer_table["#"] = method(:match_list)
44
- @lexer_table[";"] = method(:match_list)
45
- @lexer_table[":"] = method(:match_list)
46
- @lexer_table["-"] = method(:match_line)
47
- @lexer_table["~"] = method(:match_signature)
48
- @lexer_table["h"] = method(:match_inline_link)
49
- @lexer_table["\n"] = method(:match_newline)
50
- @lexer_table["\r"] = method(:match_carriagereturn)
51
- @lexer_table["<"] = method(:match_tag_start)
52
- @lexer_table["{"] = method(:match_table)
53
- @lexer_table["!"] = method(:match_table_head)
54
- # Lexer table used when inside :match_tag_start ... :match_tag_end
55
- @tag_lexer_table = Hash.new(method(:match_other))
56
- @tag_lexer_table["<"] = method(:match_tag_end)
57
- # Begin lexing in default state
58
- @current_lexer_table = @lexer_table
59
- end
60
-
61
- #Transforms input stream (string) into the stream of tokens.
62
- #Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
63
- #This array can be given as input token-by token to RACC based parser with no
64
- #modification. The last token [false, false] inficates EOF.
65
- def tokenize(input)
66
- @tokens = []
67
- start_para
68
- @cursor = 0
69
- @text = input
70
- @next_token = []
71
-
72
- #This tokenizer algorithm assumes that everything that is not
73
- #matched by the lexer is going to be :TEXT token. Otherwise it's usual
74
- #lexer algo which call methods from the match table to define next tokens.
75
- while (@cursor < @text.length)
76
- @current_token = [:TEXT, ''] unless @current_token
77
- @token_start = @cursor
78
- @char = @text[@cursor, 1]
79
-
80
- if @current_lexer_table[@char].call == :TEXT
81
- @current_token[1] += @text[@token_start, 1]
82
- else
83
- #skip empty :TEXT tokens
84
- unless empty_text_token?
85
- @tokens << @current_token
86
- unless para_breaker?(@next_token[0]) or in_block?
87
- #if no paragraph was previously started
88
- #then we should start it
89
- start_para if !@para
90
- else
91
- #if we already have a paragraph this is the time to close it
92
- end_para if @para
93
- end
94
-
95
- end
96
-
97
- if para_breaker?(@next_token[0])
98
- if @tokens.last and @tokens.last[0] == :PARA_START
99
- #we need to remove para start token because no para end is possible
100
- @tokens.pop
101
- @para = false
102
- elsif @para
103
- end_para
104
- end
105
- end
1
+ require 'strscan'
2
+
3
+ class String
4
+ def is_empty_token?
5
+ self.size == 0 or self == "\n" or self == "\r\n"
6
+ end
7
+ end
106
8
 
107
- @next_token[1] = @text[@token_start, @cursor - @token_start]
108
- @tokens << @next_token
109
- #hack to enable sub-lexing!
110
- if @sub_tokens
111
- @tokens += @sub_tokens
112
- @sub_tokens = nil
113
- end
114
- #end of hack!
9
+ # Class for storing text tokens data - index and text
10
+ class TokenString < String
11
+ attr_reader :idx
12
+
13
+ def initialize(lexer, text = '')
14
+ @lexer = lexer
15
+ @idx = 0
16
+ super(text)
17
+ end
18
+
19
+ def <<(pending_text)
20
+ # If TokenString.length is 0 and we are pushing some text
21
+ # than in this moment we can retreive this tokes's index
22
+ if length == 0
23
+ @idx = @lexer.cursor
24
+ end
25
+ super(pending_text)
26
+ end
27
+ end
115
28
 
116
- #if the next token can start the paragraph, let's try that
117
- start_para if @tokens.last and para_starter?(@tokens.last[0])
29
+ class TokenArray < Array
30
+ def initialize(lexer)
31
+ @lexer = lexer
32
+ end
118
33
 
119
- @current_token = nil
120
- @next_token = []
121
- end
34
+ def <<(token)
35
+ if @lexer.tokens.last && (@lexer.tokens.last[3].nil? || @lexer.tokens.last[3] == 0)
36
+ @lexer.tokens.last[3] = @lexer.cursor - @lexer.tokens.last[2]
122
37
  end
123
- #add the last TEXT token if it exists
124
- @tokens << @current_token if @current_token and not empty_text_token?
38
+ token[2] = @lexer.cursor
39
+ super(token)
40
+ end
125
41
 
126
- #remove empty para start or finish the paragraph if necessary
127
- if @tokens.last and @tokens.last[0] == :PARA_START
128
- @tokens.pop
129
- @para = false
130
- else
131
- end_para if @para
42
+ def append_pending(text)
43
+ if @lexer.tokens.last && @lexer.tokens.last[3].nil?
44
+ @lexer.tokens.last[3] = text.idx - @lexer.tokens.last[2]
132
45
  end
133
- #RACC wants us to put this to indicate EOF
134
- @tokens << [false, false]
135
- @tokens
46
+ token = [:TEXT, text, text.idx, text.length]
47
+ push(token)
136
48
  end
137
49
 
138
- #Returns the next token from the stream. Useful for RACC parsers.
139
- def lex
140
- token = @tokens[@position]
141
- @position += 1
142
- return token
50
+ def to_s
51
+ string_copy = ""
52
+ each do |token|
53
+ string_copy << "#{token[0..1]}[#{token[2]}, #{token[3]}]"
54
+ end
55
+ string_copy
143
56
  end
144
57
 
58
+ end
145
59
 
146
- private
147
- #Returns true if the token breaks the paragraph.
148
- def para_breaker?(token)
149
- [:SECTION_START, :SECTION_END,
150
- :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
151
- :UL_START, :UL_END, :OL_START, :OL_END,
152
- :DL_START, :DL_END, :HLINE, :PRE].include?(token)
153
- end
154
60
 
155
- #Returns true if the paragraph can be started after the token
156
- def para_starter?(token)
157
- [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
158
- end
61
+ class MediaWikiLexer
62
+
63
+ INLINE_ELEMENTS = [:LINK, :INTLINK, :BOLD, :ITALIC]
64
+ BLOCK_ELEMENTS = [:PARA, :PRE, :PREINDENT, :UL, :OL, :DL, :LI, :SECTION, :TABLE, :ROW, :CELL, :HEAD]
65
+ PARA_BREAK_ELEMENTS = [:UL, :OL, :DL, :PRE, :PREINDENT, :PASTE_START, :SECTION, :TABLE, :HLINE, :KEYWORD]
66
+
67
+ NAME_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-]/ ? true : false}
68
+ TOKEN_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-.;:?&@~=#%\/]/ ? true : false}
69
+ PUNCTUATION_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[\.,;:\-?]/ ? true : false}
70
+
71
+
72
+ HTML_TAGS = %w{ a abbr acronym address applet area b base basefont bdo big blockquote body br
73
+ button caption center cite code col colgroup dd del dir div dfn dl dt em fieldset font form frame
74
+ frameset h1 h2 h3 h4 h5 h6 head hr html i iframe img input ins isindex kbd label legend li link map
75
+ menu meta noframes noscript object ol optgroup option p param pre q s samp script select small span
76
+ strike strong style sub sup table tbody td textarea tfoot th thead title tr tt u ul var xmp }
77
+ WIKI_TAGS = %w{ nowiki math paste }
78
+ TAGS_WITHOUT_CLOSE_TAG = %w{ br hr img }
79
+
80
+ attr_reader :cursor
81
+ attr_reader :tokens
82
+
83
+
84
+ def initialize
85
+ # Current position in token list
86
+ @position = 0
159
87
 
160
- def in_block?
161
- @pair_stack.select {|token| para_breaker?(token[0])}.size > 0 or
162
- (@sub_tokens and @sub_tokens.select {|token| para_breaker?(token[0])}.size > 0)
163
- end
88
+ # Lexer table of methods that handle only formatting, e.g. bold or italicized
89
+ # text; or spans of XHTML, or wiki-escape, markup
90
+ @formatting_lexer_table = {}
91
+ @formatting_lexer_table["'"] = method(:match_quote)
92
+ @formatting_lexer_table["<"] = method(:match_left_angle)
93
+ @formatting_lexer_table["&"] = method(:match_ampersand)
94
+ @formatting_lexer_table["{"] = method(:match_left_curly)
95
+
96
+ # Lexer table of methods that handle everything that may occur in-line in
97
+ # addition to formatting, i.e. links and signatures
98
+ @inline_lexer_table = @formatting_lexer_table.dup
99
+ @inline_lexer_table["["] = method(:match_left_square)
100
+ @inline_lexer_table["~"] = method(:match_tilde)
101
+ @inline_lexer_table["h"] = method(:match_h_char)
102
+
103
+ # Default lexer table, which includes all in-line formatting and links, plus
104
+ # methods that handle constructs that begin on a newline
105
+ @default_lexer_table = @inline_lexer_table.dup
106
+ @default_lexer_table[" "] = method(:match_space)
107
+ @default_lexer_table["="] = method(:match_equal)
108
+ @default_lexer_table["*"] = method(:match_star)
109
+ @default_lexer_table["#"] = method(:match_hash)
110
+ @default_lexer_table[":"] = method(:match_colon)
111
+ @default_lexer_table[";"] = method(:match_semicolon)
112
+ @default_lexer_table["-"] = method(:match_dash)
113
+ @default_lexer_table["_"] = method(:match_underscore)
114
+ @default_lexer_table["\n"] = method(:match_newline)
115
+ @default_lexer_table["\r"] = method(:match_newline)
116
+
117
+ # Lexer table used inside spans of markup, wherein spans of newlines are not
118
+ # automatically treated as paragraphs.
119
+ @markup_lexer_table = @default_lexer_table.dup
120
+ @markup_lexer_table["\n"] = nil
121
+ @markup_lexer_table["\r"] = nil
122
+
123
+ # Lexer table used inside of headings
124
+ @heading_lexer_table = @inline_lexer_table.dup
125
+ @heading_lexer_table["="] = method(:match_equal_in_heading)
126
+ @heading_lexer_table["\n"] = method(:match_newline_in_heading)
127
+
128
+ # Lexer table used inside the left half of an external link
129
+ @link_lexer_table = {}
130
+ @link_lexer_table["]"] = method(:match_right_square_in_link)
131
+ @link_lexer_table["\n"] = method(:match_newline_in_link)
132
+ @link_lexer_table["\r"] = method(:match_newline_in_link)
133
+ @link_lexer_table[" "] = method(:match_space_in_link)
134
+
135
+ # Lexer table used inside the right half of an external link, or the right
136
+ # half of an internal link
137
+ @link_opt_lexer_table = @inline_lexer_table.dup
138
+ @link_opt_lexer_table["]"] = method(:match_right_square_in_link)
139
+ @link_opt_lexer_table["\n"] = method(:match_newline_in_link)
140
+ @link_opt_lexer_table["\r"] = method(:match_newline_in_link)
141
+
142
+ # Lexer table used inside the left half of an internal link or internal
143
+ # resource link
144
+ @intlink_lexer_table = {}
145
+ @intlink_lexer_table["]"] = method(:match_right_square_in_intlink)
146
+ @intlink_lexer_table["\r"] = method(:match_newline_in_intlink)
147
+ @intlink_lexer_table["\n"] = method(:match_newline_in_intlink)
148
+ @intlink_lexer_table[":"] = method(:match_colon_in_intlink)
149
+ @intlink_lexer_table["|"] = method(:match_pipe_in_intlink)
150
+ @intlink_lexer_table["C"] = method(:match_c_char_in_intlink)
151
+
152
+ # Lexer table used inside the category name of the left half of an
153
+ # internal link
154
+ @intlink_cat_lexer_table = {}
155
+ @intlink_cat_lexer_table["]"] = method(:match_right_square_in_intlink)
156
+ @intlink_cat_lexer_table["\r"] = method(:match_newline_in_intlink)
157
+ @intlink_cat_lexer_table["\n"] = method(:match_newline_in_intlink)
158
+ @intlink_cat_lexer_table["|"] = method(:match_pipe_in_intlink)
159
+
160
+ # Lexer table used inside the right half of an internal link
161
+ @intlink_opt_lexer_table = @formatting_lexer_table.dup
162
+ @intlink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
163
+ @intlink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
164
+ @intlink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
165
+
166
+ # Lexer table used inside the right half of an internal resource link
167
+ @resourcelink_opt_lexer_table = @inline_lexer_table.dup
168
+ @resourcelink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
169
+ @resourcelink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
170
+ @resourcelink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
171
+ @resourcelink_opt_lexer_table["|"] = method(:match_pipe_in_intlink)
172
+
173
+ # Lexer table used to parse tables
174
+ @table_lexer_table = @inline_lexer_table.dup
175
+ @table_lexer_table["*"] = method(:match_star)
176
+ @table_lexer_table["#"] = method(:match_hash)
177
+ @table_lexer_table["|"] = method(:match_pipe_in_table)
178
+ @table_lexer_table["!"] = method(:match_bang_in_table)
179
+ @table_lexer_table["{"] = method(:match_left_curly)
180
+ @table_lexer_table[" "] = method(:match_space)
181
+
182
+ # Lexer table used to parse ordered and unordered list items (which may nest)
183
+ @items_lexer_table = @inline_lexer_table.dup
184
+ @items_lexer_table["\n"] = method(:match_newline_in_items)
185
+
186
+ # Lexer table used to parse entries in a definition list (which may not nest)
187
+ @entries_lexer_table = @inline_lexer_table.dup
188
+ @entries_lexer_table["\n"] = method(:match_newline_in_entries)
189
+ @entries_lexer_table[":"] = method(:match_colon_in_entries)
190
+
191
+ # Lexer table used inside spans of indented text
192
+ @indent_lexer_table = @inline_lexer_table.dup
193
+ @indent_lexer_table["\n"] = method(:match_newline_in_indent)
194
+
195
+ # Lexer table used inside spans of pre-formatted text
196
+ @pre_lexer_table = {}
197
+ @pre_lexer_table["<"] = method(:match_left_angle_in_pre)
198
+
199
+ # Lexer table used inside spans of <code>
200
+ @code_lexer_table = @inline_lexer_table.dup
201
+ @code_lexer_table[" "] = method(:match_space_in_code)
202
+ @code_lexer_table["<"] = method(:match_left_angle_in_code)
203
+
204
+ # Lexer table used when inside spans of wiki-escaped text
205
+ @nowiki_lexer_table = {}
206
+ @nowiki_lexer_table["<"] = method(:match_left_angle_in_nowiki)
164
207
 
165
- #-- ================== Match methods ================== ++#
208
+ @paste_lexer_table = {}
209
+ @paste_lexer_table["<"] = method(:match_left_angle_in_paste)
210
+ @paste_lexer_table["\n"] = method(:match_newline_in_paste)
211
+ @paste_lexer_table["\r"] = method(:match_newline_in_paste)
166
212
 
167
- #Matches anything that was not matched. Returns :TEXT to indicate
168
- #that matched characters should go into :TEXT token.
169
- def match_other
213
+ # Lexer table used when inside spans of math
214
+ @math_lexer_table = {}
215
+ @math_lexer_table["<"] = method(:match_left_angle_in_math)
216
+
217
+ # Lexer table used when inside a wiki template inclusion
218
+ @template_lexer_table = {}
219
+ @template_lexer_table["{"] = method(:match_left_curly_in_template)
220
+ @template_lexer_table["|"] = method(:match_pipe_in_template)
221
+ @template_lexer_table["}"] = method(:match_right_curly_in_template)
222
+
223
+ @template_param_lexer_table = {}
224
+ @template_param_lexer_table["{"] = method(:match_left_curly_in_template)
225
+ @template_param_lexer_table["}"] = method(:match_right_curly_in_template)
226
+ @template_param_lexer_table["|"] = method(:match_pipe_in_template)
227
+
228
+ # Begin lexing in default state
229
+ @lexer_table = LexerTable.new
230
+ @lexer_table.push(@default_lexer_table)
231
+ end
232
+
233
+
234
+ def tokenize(input)
235
+ @text = input
236
+ # Current position in the input text
237
+ @cursor = 0
238
+ # Tokens to be returned
239
+ @tokens = TokenArray.new(self)
240
+ # Stack of open token spans
241
+ @context = []
242
+ # Already lexed character data, not yet added to a TEXT token
243
+ @pending = TokenString.new(self)
244
+ # List symbols from the most recent line item of a list, e.g. '***'
245
+ @list = ''
246
+
247
+ start_span(:PARA)
248
+
249
+ while (@cursor < @text.length)
250
+ @char = @text[@cursor, 1]
251
+ if @lexer_table[@char]
252
+ @lexer_table[@char].call
253
+ else
254
+ @pending << @char
170
255
  @cursor += 1
171
- return :TEXT
256
+ end
172
257
  end
173
-
174
- #Matches italic or bold symbols:
175
- # "'''" { return :BOLD; }
176
- # "''" { return :ITALIC; }
177
- def match_italic_or_bold
178
- if @text[@cursor, 5] == "'''''"
179
- if @pair_stack.last[0] == :BOLDSTART
180
- matchBold
181
- @cursor += 3
182
- else
183
- matchItalic
184
- @cursor += 2
185
- end
186
- return
187
- end
188
- if @text[@cursor, 3] == "'''"
189
- matchBold
190
- @cursor += 3
191
- return
192
- end
193
- if @text[@cursor, 2] == "''"
194
- matchItalic
195
- @cursor += 2
196
- return
197
- end
198
- match_other
258
+
259
+ if @pending.is_empty_token?
260
+ if @context.size > 0 and @tokens.last[0] == :PARA_START
261
+ @context.pop
262
+ @tokens.pop
263
+ end
264
+ else
265
+ @tokens.append_pending(@pending)
266
+ @pending = TokenString.new(self)
199
267
  end
268
+ while(@context.size > 0) do
269
+ @tokens << [(@context.pop.to_s + '_END').to_sym, '']
270
+ end
271
+ @tokens << [false, false, 0, 0]
272
+ @tokens
273
+
274
+ end
200
275
 
201
- def matchBold
202
- if @pair_stack.last[0] == :BOLDSTART
203
- @next_token[0] = :BOLDEND
204
- @pair_stack.pop
205
- else
206
- @next_token[0] = :BOLDSTART
207
- @pair_stack.push @next_token
208
- end
276
+ #Returns the next token from the stream. Useful for RACC parsers.
277
+ def lex
278
+ token = @tokens[@position]
279
+ @position += 1
280
+ return token
281
+ end
282
+
283
+
284
+ private
285
+
286
+ def match_text
287
+ @pending << @char
288
+ @cursor += 1
289
+ end
290
+
291
+ def match_ampersand
292
+ i = @cursor + 1
293
+ i += 1 while i < @text.size and NAME_CHAR_TABLE[@text[i].ord]
294
+ if @text[i, 1] == ';'
295
+ append_to_tokens([:CHAR_ENT, @text[(@cursor + 1) ... i]])
296
+ @cursor = i + 1
297
+ else
298
+ match_text
299
+ end
300
+ end
301
+
302
+ def match_quote
303
+ if @text[@cursor, 5] == "'''''"
304
+ if @context.last == :BOLD
305
+ match_bold
306
+ @cursor += 3
307
+ else
308
+ match_italic
309
+ @cursor += 2
310
+ end
311
+ elsif @text[@cursor, 3] == "'''"
312
+ match_bold
313
+ @cursor += 3
314
+ elsif @text[@cursor, 2] == "''"
315
+ match_italic
316
+ @cursor += 2
317
+ else
318
+ match_text
209
319
  end
320
+ end
210
321
 
211
- def matchItalic
212
- if @pair_stack.last[0] == :ITALICSTART
213
- @next_token[0] = :ITALICEND
214
- @pair_stack.pop
215
- else
216
- @next_token[0] = :ITALICSTART
217
- @pair_stack.push @next_token
218
- end
322
+ def match_bold
323
+ if @context.last == :BOLD
324
+ end_span(:BOLD, "'''")
325
+ else
326
+ start_span(:BOLD, "'''")
219
327
  end
328
+ end
220
329
 
221
- #Matches sections
222
- def match_section
223
- if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
224
- i = 0
225
- i += 1 while @text[@cursor+i, 1] == "="
226
- @cursor += i
330
+ def match_italic
331
+ if @context.last == :ITALIC
332
+ end_span(:ITALIC, "''")
333
+ else
334
+ start_span(:ITALIC, "''")
335
+ end
336
+ end
227
337
 
228
- if @pair_stack.last[0] == :SECTION_START
229
- @next_token[0] = :SECTION_END
230
- @pair_stack.pop
338
+ def match_tilde
339
+ if @text[@cursor, 5] == "~~~~~"
340
+ empty_span(:SIGNATURE_DATE, "~~~~~", 5)
341
+ elsif @text[@cursor, 4] == "~~~~"
342
+ empty_span(:SIGNATURE_FULL, "~~~~", 4)
343
+ elsif @text[@cursor, 3] == "~~~"
344
+ empty_span(:SIGNATURE_NAME, "~~~", 3)
345
+ else
346
+ match_text
347
+ end
348
+ end
349
+
350
+ def match_left_angle
351
+ next_char = @text[@cursor + 1]
352
+ if !next_char
353
+ match_text
354
+ elsif next_char.ord == 47
355
+ # Might be an XHTML end tag
356
+ if @text[@cursor .. -1] =~ %r{</([a-zA-Z][a-zA-Z0-9\-_]*)(\s*)>} and @context.include?(:TAG)
357
+ # Found an XHTML end tag
358
+ tag_name = $1
359
+ end_span(:TAG, $1)
360
+ @lexer_table.pop
361
+ @cursor += $1.length + $2.length + 3
362
+ else
363
+ match_text
364
+ end
365
+ elsif next_char.ord > 64 and next_char.ord < 123
366
+ # Might be an XHTML open or empty tag
367
+ scanner = StringScanner.new(@text[@cursor .. -1])
368
+ if scanner.scan(%r{<([a-zA-Z][a-zA-Z0-9\-_]*)}) and (HTML_TAGS.include?(scanner[1]) or WIKI_TAGS.include?(scanner[1]))
369
+ # Sequence begins with a valid tag name, so check for attributes
370
+ tag_name = scanner[1]
371
+ attrs = {}
372
+ while scanner.scan(%r{\s+([a-zA-Z][a-zA-Z0-9\-_]*)\s*=\s*('([^']+)'|"([^"]+)"|([^>\s]+))}) do
373
+ attrs[scanner[1]] = scanner[3] ? scanner[3] : (scanner[4] ? scanner[4] : scanner[5])
374
+ end
375
+ scanner.scan(%r{\s*})
376
+ if ((c = scanner.get_byte) == '>' or (c == '/' and scanner.get_byte == '>'))
377
+ # Found an XHTML start or empty tag
378
+ if tag_name == 'nowiki'
379
+ @lexer_table.push(@nowiki_lexer_table) unless c == '/'
380
+ elsif tag_name == 'paste'
381
+ unless c == '/'
382
+ maybe_close_para(:PASTE_START, true)
383
+ append_to_tokens([:PASTE_START, ''])
384
+ @cursor += scanner.pos
385
+ @lexer_table.push(@paste_lexer_table)
386
+ #eat newline after <paste> if it exists because otherwise
387
+ #it will be transformed into <br/>
388
+ if @text[@cursor, 1] == "\n"
389
+ @cursor += 1
390
+ elsif @text[@cursor, 2] == "\r\n"
391
+ @cursor += 2
392
+ end
393
+ return
394
+ end
395
+ else
396
+ if tag_name == 'pre'
397
+ table = @pre_lexer_table
398
+ elsif tag_name == 'code'
399
+ table = @code_lexer_table
400
+ elsif tag_name == 'math'
401
+ table = @math_lexer_table
402
+ else
403
+ table = @markup_lexer_table
404
+ end
405
+ start_span(:TAG, tag_name)
406
+ attrs.collect do |(name, value)|
407
+ append_to_tokens([:ATTR_NAME, name])
408
+ append_to_tokens([:ATTR_VALUE, value]) if value
409
+ end
410
+ if c == '/' or TAGS_WITHOUT_CLOSE_TAG.include? tag_name
411
+ end_span(:TAG, tag_name)
231
412
  else
232
- @next_token[0] = :SECTION_START
233
- @pair_stack.push @next_token
413
+ @lexer_table.push(table)
234
414
  end
415
+ end
416
+ @cursor += scanner.pos #FIXME: will break xhtml attribute length calculation
235
417
  else
236
- match_other
418
+ match_text
237
419
  end
420
+ else
421
+ match_text
422
+ end
423
+ else
424
+ match_text
238
425
  end
426
+ end
239
427
 
240
- #Matches start of the hyperlinks
241
- # "[[" { return INTLINKSTART; }
242
- # "[" { return LINKSTART; }
243
- def match_link_start
244
- if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
245
- @next_token[0] = :INTLINKSTART
246
- @pair_stack.push @next_token
247
- @cursor += 2
248
- elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
249
- @next_token[0] = :LINKSTART
250
- @pair_stack.push @next_token
251
- @cursor += 1
428
+ def match_equal
429
+ if at_start_of_line?
430
+ @heading = extract_char_sequence('=')
431
+ @cursor += @heading.length
432
+ if at_end_of_line? or blank_line?
433
+ @cursor -= @heading.length
434
+ #special case - no header text, just "=" signs
435
+ #try to split header into "=" formatting and text with "=":
436
+ # example:
437
+ # ==== should become: = == =
438
+ # ===== should become: == = ==
439
+ if @heading =~ /(={6})(=+)(={6})/ or
440
+ @heading =~ /(={5})(=+)(={5})/ or
441
+ @heading =~ /(={4})(=+)(={4})/ or
442
+ @heading =~ /(={3})(=+)(={3})/ or
443
+ @heading =~ /(={2})(=+)(={2})/ or
444
+ @heading =~ /(=)(=+)(=)/
445
+ start_span(:SECTION, $1)
446
+ @cursor += $1.length
447
+ @tokens << [:TEXT, $2]
448
+ @cursor += $2.length
449
+ end_span(:SECTION, $3)
450
+ @cursor += $3.length
252
451
  else
253
- match_other
452
+ match_text
254
453
  end
454
+ else
455
+ @cursor -= @heading.length
456
+ start_span(:SECTION, @heading)
457
+ @cursor += @heading.length
458
+ @lexer_table.push(@heading_lexer_table)
459
+ end
460
+ else
461
+ match_text
462
+ end
463
+ end
464
+
465
+ def match_equal_in_heading
466
+ heading = extract_char_sequence('=')
467
+ if @heading.length <= heading.length
468
+ end_span(:SECTION, heading)
469
+ @lexer_table.pop
470
+ @cursor += heading.length
471
+ skip_newline
472
+ else
473
+ @pending << heading
474
+ @cursor += heading.length
255
475
  end
476
+ end
477
+
478
+ def match_newline_in_heading
479
+ end_span(:SECTION)
480
+ @lexer_table.pop
481
+ end
256
482
 
257
- #Matches end of the hyperlinks
258
- # "]]" { return INTLINKEND; }
259
- # "]" { return LINKEND; }
260
- def match_link_end
261
- if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
262
- @next_token[0] = :INTLINKEND
263
- @pair_stack.pop
264
- @cursor += 2
265
- elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
266
- @next_token[0] = :LINKEND
267
- @pair_stack.pop
268
- @cursor += 1
269
- else
270
- match_other
271
- end
483
+ def match_left_square
484
+ if @text[@cursor, 2] == "[["
485
+ if @text[@cursor + 2, 1] != "]"
486
+ start_span(:INTLINK, "[[")
487
+ @cursor += 2
488
+ @lexer_table.push(@intlink_lexer_table)
489
+ else
490
+ match_text
491
+ end
492
+ elsif @text[@cursor + 1 .. -1] =~ %r{\A\s*((http|https|file)://|mailto:)}
493
+ start_span(:LINK, "[")
494
+ @cursor += 1
495
+ skip_whitespace
496
+ @lexer_table.push(@link_lexer_table)
497
+ else
498
+ match_text
272
499
  end
500
+ end
501
+
502
+ def match_right_square_in_link
503
+ end_span(:LINK, "]")
504
+ @cursor += 1
505
+ @lexer_table.pop
506
+ end
507
+
508
+ def match_right_square_in_intlink
509
+ if @text[@cursor, 2] == "]]"
510
+ end_span(:INTLINK, "]]")
511
+ @cursor += 2
512
+ @lexer_table.pop
513
+ else
514
+ match_text
515
+ end
516
+ end
517
+
518
+ def match_space_in_link
519
+ spaces = extract_char_sequence(' ')
520
+ append_to_tokens([:LINKSEP, ' ']) unless @text[@cursor, 1] == ']'
521
+ @cursor += spaces.length
522
+ @lexer_table.pop
523
+ @lexer_table.push(@link_opt_lexer_table)
524
+ end
273
525
 
274
- #Matches link separator inside of internal links
275
- def match_link_sep
276
- if @tokens[-1][0] == :INTLINKSTART or inside_resource_link
277
- @next_token[0] = :INTLINKSEP
278
- @cursor += 1
279
- else
280
- match_other
526
+ def match_pipe_in_intlink
527
+ if @tokens.last[0] == :INTLINK_START
528
+ @lexer_table.pop
529
+ @lexer_table.push(@intlink_opt_lexer_table)
530
+ end
531
+ append_to_tokens([:INTLINKSEP, "|"])
532
+ @cursor += 1
533
+ end
534
+
535
+ def match_colon_in_intlink
536
+ if not @pending.is_empty_token?
537
+ @lexer_table.pop
538
+ @lexer_table.push(@resourcelink_opt_lexer_table)
539
+ end
540
+ append_to_tokens([:RESOURCESEP, ":"])
541
+ @cursor += 1
542
+ end
543
+
544
+ def match_c_char_in_intlink
545
+ if @text[@cursor, 9] == 'Category:'
546
+ append_to_tokens([:CATEGORY, 'Category:'])
547
+ @lexer_table.pop
548
+ @lexer_table.push(@intlink_cat_lexer_table)
549
+ @cursor += 9
550
+ else
551
+ match_text
552
+ end
553
+ end
554
+
555
+ def match_newline_in_link
556
+ end_span(:LINK)
557
+ @lexer_table.pop
558
+ end
559
+
560
+ def match_newline_in_intlink
561
+ end_span(:INTLINK)
562
+ @lexer_table.pop
563
+ end
564
+
565
+ def match_h_char
566
+ link = @text[@cursor, 7] if @text[@cursor, 7] == 'http://'
567
+ link = @text[@cursor, 8] if @text[@cursor, 8] == 'https://'
568
+ if link
569
+ start_span(:LINK)
570
+ i = @cursor + link.length
571
+ while i < @text.size and TOKEN_CHAR_TABLE[@text[i].ord] do
572
+ link << @text[i, 1]
573
+ i += 1
574
+ end
575
+
576
+ #exclude punctuation at the end
577
+ while link.length > 0 and PUNCTUATION_CHAR_TABLE[link[-1].ord] do
578
+ link = link[0..-2]
579
+ i -= 1
281
580
  end
581
+
582
+ @pending = TokenString.new(self)
583
+ @pending << link
584
+ @cursor = i
585
+ end_span(:LINK)
586
+ else
587
+ match_text
282
588
  end
589
+ end
283
590
 
284
- #Matches inlined unformatted html link
285
- # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
286
- def match_inline_link
287
- #if no link start token was detected and the text starts with http://
288
- #then it's the inlined unformatted html link
289
- last_pair_token = @pair_stack.last[0]
290
- if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
291
- @next_token[0] = :LINKSTART
292
- text = @text[@cursor..-1]
293
- if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
294
- linkText = $1
295
- elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
296
- linkText = $1
297
- elsif text =~ /\A([^\s\n]+)[\s\n]/
298
- linkText = $1
299
- else
300
- linkText = text
301
- end
302
- @sub_tokens = []
303
- @sub_tokens << [:TEXT, linkText]
304
- @sub_tokens << [:LINKEND, ']']
305
- @cursor += linkText.length
306
- @token_start = @cursor
307
- else
308
- match_other
309
- end
591
+ def match_space
592
+ if at_start_of_line? and !blank_line?
593
+ start_span(:PREINDENT)
594
+ @lexer_table.push(@indent_lexer_table)
595
+ match_text
596
+ else
597
+ match_text
598
+ end
599
+ end
600
+
601
+ def match_newline_in_indent
602
+ match_text
603
+ unless @text[@cursor, 1] == " "
604
+ @tokens.append_pending(@pending)
605
+ @pending = TokenString.new(self)
606
+ end_span(:PREINDENT)
607
+ @lexer_table.pop
310
608
  end
609
+ end
311
610
 
312
- #Matches space to find preformatted areas which start with a space after a newline
313
- # "\n\s[^\n]*" { return PRE; }
314
- def match_space
315
- if at_start_of_line? and ! in_table?
316
- match_untill_eol
317
- @next_token[0] = :PRE
318
- strip_ws_from_token_start
319
- elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
320
- @next_token[0] = :LINKSEP
321
- @cursor += 1
322
- strip_ws_from_token_start
323
- else
324
- match_other
611
+ def match_star
612
+ if at_start_of_line?
613
+ @list = extract_char_sequence('#*')
614
+ open_list(@list)
615
+ @lexer_table.push(@items_lexer_table)
616
+ else
617
+ match_text
618
+ end
619
+ end
620
+
621
+ def match_hash
622
+ if at_start_of_line?
623
+ @list = extract_char_sequence('#*')
624
+ open_list(@list)
625
+ @lexer_table.push(@items_lexer_table)
626
+ else
627
+ match_text
628
+ end
629
+ end
630
+
631
+ def match_underscore
632
+ if @text[@cursor, 7] == '__TOC__'
633
+ empty_span(:KEYWORD, 'TOC', 7)
634
+ elsif @text[@cursor, 9] == '__NOTOC__'
635
+ empty_span(:KEYWORD, 'NOTOC', 9)
636
+ else
637
+ match_text
638
+ end
639
+ end
640
+
641
+ def match_newline_in_items
642
+ if @text[@cursor, 1] == "\n"
643
+ newline = "\n"
644
+ char = @text[@cursor + 1, 1]
645
+ else
646
+ newline = "\r\n"
647
+ char = @text[@cursor + 2, 1]
648
+ end
649
+ @pending << newline
650
+ @cursor += newline.length
651
+ if (char == @list[0, 1])
652
+ list = extract_char_sequence('#*')
653
+ if list == @list
654
+ end_span(:LI)
655
+ start_span(:LI)
656
+ @cursor += list.length
657
+ else
658
+ l = @list.length > list.length ? list.length : @list.length
659
+ i = 0
660
+ i += 1 while (i < l and @list[i] == list[i])
661
+ if i < @list.length
662
+ close_list(@list[i .. -1])
663
+ if @context.last == :LI
664
+ end_span(:LI)
665
+ start_span(:LI)
666
+ end
325
667
  end
668
+ if i < list.length
669
+ start_span(:LI) if @context.last != :LI
670
+ open_list(list[i .. -1])
671
+ end
672
+ @cursor += i
673
+ @list = list
674
+ end
675
+ else
676
+ close_list(@list)
677
+ @lexer_table.pop
326
678
  end
679
+ end
680
+
681
+ def match_dash
682
+ if at_start_of_line? and @text[@cursor, 4] == "----"
683
+ empty_span(:HLINE, "----", 4)
684
+ else
685
+ match_text
686
+ end
687
+ end
688
+
689
+ def match_left_angle_in_nowiki
690
+ if @text[@cursor, 9] == '</nowiki>'
691
+ @cursor += 9
692
+ @lexer_table.pop
693
+ else
694
+ match_text
695
+ end
696
+ end
327
697
 
328
- #Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
329
- #therefore we need to do some special processing with lists. The idea here is to strip
330
- #the leftmost symbol indicating the list from the group of input lines and use separate
331
- #lexer to process extracted fragment.
332
- def match_list
333
- if at_start_of_line?
334
- list_id = @text[@cursor, 1]
335
- sub_text = extract_list_contents(list_id)
336
- extracted = 0
337
-
338
- #hack to tokenize everything inside the list
339
- @sub_tokens = []
340
- sub_lines = ""
341
- @sub_tokens << [:LI_START, ""]
342
- sub_text.each do |t|
343
- extracted += 1
344
- if text_is_list? t
345
- sub_lines += t
346
- else
347
- if not sub_lines.empty?
348
- @sub_tokens += sub_lex(sub_lines)
349
- sub_lines = ""
350
- end
351
- if @sub_tokens.last[0] != :LI_START
352
- @sub_tokens << [:LI_END, ""]
353
- @sub_tokens << [:LI_START, ""]
354
- end
355
- @sub_tokens += sub_lex(t.lstrip)
356
- end
357
- end
358
- if not sub_lines.empty?
359
- @sub_tokens += sub_lex(sub_lines)
360
- @sub_tokens << [:LI_END, ""]
361
- else
362
- @sub_tokens << [:LI_END, ""]
363
- end
698
+ def match_left_angle_in_paste
699
+ if @text[@cursor, 8] == '</paste>'
700
+ @lexer_table.pop
701
+ append_to_tokens([:PASTE_END, ''])
702
+ @cursor += 8
703
+ maybe_open_para(:PASTE_END)
704
+ else
705
+ match_text
706
+ end
707
+ end
364
708
 
365
- #end of hack
366
- @cursor += sub_text.length + extracted
367
- @token_start = @cursor
368
-
369
- case
370
- when list_id == "*"
371
- @next_token[0] = :UL_START
372
- @sub_tokens << [:UL_END, ""]
373
- when list_id == "#"
374
- @next_token[0] = :OL_START
375
- @sub_tokens << [:OL_END, ""]
376
- when list_id == ";", list_id == ":"
377
- @next_token[0] = :DL_START
378
- @sub_tokens << [:DL_END, ""]
379
- end
380
- elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
381
- @next_token[0] = :RESOURCE_SEP
382
- @cursor += 1
383
- else
384
- match_other
385
- end
709
+ def match_newline_in_paste
710
+ append_to_tokens([:TAG_START, 'br'])
711
+ if @text[@cursor, 1] == "\n"
712
+ @cursor += 1
713
+ elsif @text[@cursor, 2] == "\r\n"
714
+ @cursor += 2
386
715
  end
716
+ append_to_tokens([:TAG_END, 'br'])
717
+ end
387
718
 
388
- #Matches the line until \n
389
- def match_untill_eol
390
- val = @text[@cursor, 1]
391
- while (val != "\n") and (!val.nil?)
392
- @cursor += 1
393
- val = @text[@cursor, 1]
394
- end
719
+ def match_left_angle_in_math
720
+ if @text[@cursor, 7] == '</math>'
721
+ end_span(:TAG, 'math')
722
+ @cursor += 7
723
+ @lexer_table.pop
724
+ else
725
+ match_text
726
+ end
727
+ end
728
+
729
+ def match_left_angle_in_pre
730
+ if @text[@cursor, 6] == '</pre>'
731
+ end_span(:TAG, 'pre')
732
+ @cursor += 6
733
+ #eat newline after </pre>
734
+ if @text[@cursor, 1] == "\n"
395
735
  @cursor += 1
736
+ elsif @text[@cursor, 2] == "\r\n"
737
+ @cursor += 2
738
+ end
739
+ @lexer_table.pop
740
+ else
741
+ match_text
396
742
  end
743
+ end
397
744
 
398
- #Matches hline tag that start with "-"
399
- # "\n----" { return HLINE; }
400
- def match_line
401
- if at_start_of_line? and @text[@cursor, 4] == "----"
402
- @next_token[0] = :HLINE
403
- @cursor += 4
404
- else
405
- match_other
406
- end
745
+ def match_space_in_code
746
+ match_text
747
+ end
748
+
749
+ def match_left_angle_in_code
750
+ if @text[@cursor, 7] == '</code>'
751
+ end_span(:TAG, 'code')
752
+ @cursor += 7
753
+ @lexer_table.pop
754
+ else
755
+ match_left_angle
407
756
  end
757
+ end
408
758
 
409
- #Matches signature
410
- # "~~~~~" { return SIGNATURE_DATE; }
411
- # "~~~~" { return SIGNATURE_FULL; }
412
- # "~~~" { return SIGNATURE_NAME; }
413
- def match_signature
414
- if @text[@cursor, 5] == "~~~~~"
415
- @next_token[0] = :SIGNATURE_DATE
416
- @cursor += 5
417
- elsif @text[@cursor, 4] == "~~~~"
418
- @next_token[0] = :SIGNATURE_FULL
419
- @cursor += 4
420
- elsif @text[@cursor, 3] == "~~~"
421
- @next_token[0] = :SIGNATURE_NAME
422
- @cursor += 3
423
- else
424
- match_other
425
- end
759
+ def match_left_curly
760
+ if at_start_of_line? and @text[@cursor + 1, 1] == '|'
761
+ start_span(:TABLE, "{|")
762
+ @cursor += 2
763
+ @lexer_table.push(@table_lexer_table)
764
+ elsif @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
765
+ start_span(:TEMPLATE, "{{")
766
+ @cursor += 2
767
+ @lexer_table.push(@template_lexer_table)
768
+ else
769
+ match_text
426
770
  end
427
-
428
- def match_tag_start
429
- if @text[@cursor, 8] == '<nowiki>'
430
- @cursor += 8
431
- @token_start = @cursor
432
- @current_lexer_table = @tag_lexer_table
433
- @current_lexer_table[@text[@cursor, 1]].call
434
- else
435
- match_other
436
- end
771
+ end
772
+
773
+ def match_left_curly_in_template
774
+ if @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
775
+ start_span(:TEMPLATE, "{{")
776
+ @cursor += 2
777
+ @lexer_table.push(@template_lexer_table)
778
+ else
779
+ match_text
437
780
  end
438
-
439
- def match_tag_end
440
- if @text[@cursor, 9] == '</nowiki>'
441
- @cursor += 9
442
- @token_start = @cursor
443
- @current_lexer_table = @lexer_table
444
- @current_lexer_table[@text[@cursor, 1]].call
445
- else
446
- match_other
447
- end
781
+ end
782
+
783
+ def match_right_curly_in_template
784
+ if @text[@cursor + 1, 1] == '}'
785
+ end_span(:TEMPLATE, "}}")
786
+ @cursor += 2
787
+ @lexer_table.pop
788
+ else
789
+ match_text
448
790
  end
449
-
450
- def match_table
451
- if at_start_of_line? and @text[@cursor + 1, 1] == '|'
452
- tokens = []
453
- if @para
454
- tokens = end_tokens_for_open_pairs
455
- if @tokens.last and @tokens.last[0] == :PARA_START and empty_text_token?
456
- tokens.pop
457
- else
458
- tokens << [:PARA_END, ""]
459
- end
460
- @para = false
461
- end
462
- tokens << [:TABLE_START, '']
463
- @pair_stack.push [:TABLE_START, '']
464
- @next_token = tokens.shift
465
- @sub_tokens = tokens
466
- @cursor += 2
467
- else
468
- match_other
469
- end
791
+ end
792
+
793
+ def match_pipe_in_template
794
+ if @tokens.last[0] == :TEMPLATE_START
795
+ @lexer_table.pop
796
+ @lexer_table.push(@template_param_lexer_table)
470
797
  end
798
+ append_to_tokens([:INTLINKSEP, "|"])
799
+ @cursor += 1
800
+ end
471
801
 
472
- def match_table_head
473
- if at_start_of_line? and in_table?
474
- @cursor += 1
475
- tokens = []
476
- if @pair_stack.last[0] == :CELL_START
477
- tokens << [:CELL_END, '']
478
- @pair_stack.pop
479
- elsif @pair_stack.last[0] == :HEAD_START
480
- tokens << [:HEAD_END, '']
481
- @pair_stack.pop
482
- elsif @pair_stack.last[0] != :ROW_START
483
- tokens << [:ROW_START, '']
484
- @pair_stack.push [:ROW_START, '']
485
- end
486
- tokens << [:HEAD_START, '']
487
- @pair_stack.push [:HEAD_START, '']
488
- @next_token = tokens.shift
489
- @sub_tokens = tokens
490
- else
491
- match_other
492
- end
802
+ def match_bang_in_table
803
+ if at_start_of_line?
804
+ if @context.last == :CELL
805
+ end_span(:CELL)
806
+ elsif @context.last == :HEAD
807
+ end_span(:HEAD)
808
+ elsif @context.last != :ROW
809
+ start_span(:ROW)
810
+ end
811
+ start_span(:HEAD, "!")
812
+ @cursor += 1
813
+ else
814
+ match_text
493
815
  end
816
+ end
494
817
 
495
- def match_link_sep_or_table_cell
496
- if in_table?
497
- tokens = []
498
- if at_start_of_line?
499
- @cursor += 1
500
- close_table_cell(tokens)
501
- if ['-', '}'].include?(@text[@cursor, 1])
502
- close_table_row(tokens)
503
- if @text[@cursor, 1] == '-'
504
- tokens << [:ROW_START, '']
505
- @pair_stack.push [:ROW_START, '']
506
- else
507
- tokens << [:TABLE_END, '']
508
- @pair_stack.pop
509
- end
510
- @cursor += 1
511
- else
512
- if @pair_stack.last[0] != :ROW_START
513
- tokens << [:ROW_START, '']
514
- @pair_stack.push [:ROW_START, '']
515
- end
516
- tokens << [:CELL_START, '']
517
- @pair_stack.push [:CELL_START, '']
518
- end
519
- @next_token = tokens.shift
520
- @sub_tokens = tokens
521
- elsif @text[@cursor + 1, 1] == '|'
522
- @cursor += 2
523
- close_table_cell(tokens)
524
- next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
525
- tokens << next_token
526
- @pair_stack.push next_token
527
- @next_token = tokens.shift
528
- @sub_tokens = tokens
529
- else
530
- match_link_sep
531
- end
532
- else
533
- match_link_sep
818
+ def match_pipe_in_table
819
+ if at_start_of_line?
820
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
821
+ if @text[@cursor+1, 1] == '-'
822
+ end_span(:ROW) if context.include? :ROW
823
+ start_span(:ROW, "|-")
824
+ @cursor += 2
825
+ elsif @text[@cursor+1, 1] == '}'
826
+ end_span(:TABLE, "|}")
827
+ @cursor += 2
828
+ @lexer_table.pop
829
+ skip_newline
830
+ else
831
+ if context.include? :CELL
832
+ end_span(:CELL)
833
+ elsif context.include? :HEAD
834
+ end_span(:HEAD)
534
835
  end
836
+ start_span(:ROW) unless @context.last == :ROW
837
+ start_span(:CELL, "|")
838
+ @cursor += 1
839
+ end
840
+ elsif @text[@cursor + 1, 1] == '|'
841
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
842
+ if context.include?:CELL
843
+ end_span(:CELL)
844
+ start_span(:CELL, "||")
845
+ elsif context.include? :HEAD
846
+ end_span(:HEAD)
847
+ start_span(:HEAD, "||")
848
+ end
849
+ @cursor += 2
850
+ else
851
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
852
+ if context.include? :CELL
853
+ end_span(:CELL, "attributes")
854
+ start_span(:CELL, "|")
855
+ @char = '' #WTF?
856
+ #CHECK: this usecase and cursor increments
857
+ end
858
+ match_text
535
859
  end
860
+ end
536
861
 
537
- #Matches a new line and breaks the paragraph if two newline characters
538
- #("\n\n") are met.
539
- def match_newline
540
- if @text[@cursor, 2] == "\n\n"
541
- if @para
542
- @sub_tokens = end_tokens_for_open_pairs
543
- @sub_tokens << [:PARA_END, '']
544
- @sub_tokens << [:PARA_START, '']
545
- @next_token[0] = @sub_tokens.slice!(0)[0]
546
- @cursor += 2
547
- return
548
- end
549
- end
550
- match_other
551
- end
552
-
553
- #Matches a new line and breaks the paragraph if two carriage return - newline
554
- #sequences ("\r\n\r\n") are met.
555
- def match_carriagereturn
556
- if @text[@cursor, 4] == "\r\n\r\n"
557
- if @para
558
- @sub_tokens = end_tokens_for_open_pairs
559
- @sub_tokens << [:PARA_END, '']
560
- @sub_tokens << [:PARA_START, '']
561
- @next_token[0] = @sub_tokens.slice!(0)[0]
562
- @cursor += 4
563
- return
564
- end
565
- end
566
- match_other
862
+ def match_newline
863
+ if @text[@cursor, 2] == "\n\n"
864
+ @pending << "\n\n"
865
+ @cursor += 2
866
+ end_span(:PARA)
867
+ start_span(:PARA)
868
+ elsif @text[@cursor, 4] == "\r\n\r\n"
869
+ @pending << "\r\n\r\n"
870
+ @cursor += 4
871
+ end_span(:PARA)
872
+ start_span(:PARA)
873
+ else
874
+ match_text
567
875
  end
876
+ end
877
+
878
+ def match_newline_in_table
879
+ if @text[@cursor, 2] == "\n\n"
880
+ start_span(:PARA)
881
+ append_to_tokens([:TEXT, "\n\n"])
882
+ @cursor += 2
883
+ end_span(:PARA)
884
+ elsif @text[@cursor, 4] == "\r\n\r\n"
885
+ start_span(:PARA)
886
+ append_to_tokens([:TEXT, "\r\n\r\n"])
887
+ @cursor += 4
888
+ end_span(:PARA)
889
+ else
890
+ match_text
891
+ end
892
+ end
893
+
894
+ def match_semicolon
895
+ if at_start_of_line?
896
+ start_span(:DL)
897
+ start_span(:DT, ';')
898
+ @lexer_table.push(@entries_lexer_table)
899
+ @cursor += 1
900
+ else
901
+ match_text
902
+ end
903
+ end
904
+
905
+ def match_colon
906
+ if at_start_of_line?
907
+ start_span(:DL)
908
+ start_span(:DD, ':')
909
+ @lexer_table.push(@entries_lexer_table)
910
+ @cursor += 1
911
+ else
912
+ match_text
913
+ end
914
+ end
915
+
916
+ def match_colon_in_entries
917
+ if @context.include? :DD
918
+ end_span(:DD)
919
+ elsif @context.include? :DT
920
+ end_span(:DT)
921
+ end
922
+ start_span(:DD, ':')
923
+ @cursor += 1
924
+ end
925
+
926
+ def match_newline_in_entries
927
+ match_text
928
+ unless @text[@cursor, 1] == ':'
929
+ if @context.include? :DD
930
+ end_span(:DD)
931
+ elsif @context.include? :DT
932
+ end_span(:DT)
933
+ end
934
+ end_span(:DL)
935
+ @lexer_table.pop
936
+ end
937
+ end
938
+
939
+
940
+ #-- ================== Helper methods ================== ++#
941
+
942
+ # Returns true if the text cursor is on the first character of a line
943
+ def at_start_of_line?
944
+ @cursor == 0 or @text[@cursor - 1, 1] == "\n"
945
+ end
568
946
 
569
- #-- ================== Helper methods ================== ++#
947
+ # Returns true if the text cursor is after the last character of a line
948
+ def at_end_of_line?
949
+ @text[@cursor, 1] == "\n" or @text[@cursor, 1].nil?
950
+ end
570
951
 
571
- # Checks if we are lexing inside a resource link like
572
- # [[Image:example.png|100px|Embedded image]]
573
- def inside_resource_link
574
- if @pair_stack.last[0] == :INTLINKSTART
575
- pos = -1
576
- while((token = @tokens[pos][0]) != :INTLINKSTART)
577
- if token == :RESOURCE_SEP
578
- return true
579
- else
580
- pos -= 1
581
- end
582
- end
952
+ def blank_line?
953
+ i = @cursor
954
+ i += 1 while (@text[i,1] == ' ')
955
+ return (@text[i,1] == '' or (@text[i,1] == "\n") or (@text[i,2] == "\r\n"))
956
+ end
957
+
958
+ # Advances the text cursor to the next non-blank character, without appending
959
+ # any of the blank characters to the pending text buffer
960
+ def skip_whitespace
961
+ @cursor += 1 while @text[@cursor, 1] == ' '
962
+ end
963
+
964
+ # Advances the text cursor beyond the next newline sequence, if any. This is
965
+ # used to strip newlines after certain block-level elements, like section
966
+ # headings and tables, to prevent an empty paragraph when the block is followed
967
+ # by an extra newline sequence.
968
+ def skip_newline
969
+ if @text[@cursor, 2] == "\r\n"
970
+ @cursor += 2
971
+ elsif @text[@cursor, 1] == "\n"
972
+ @cursor += 1
973
+ end
974
+ end
975
+
976
+ # Extracts from the input text the sequence of characters consisting of the
977
+ # character or characters specified, and returns the sequence as a string. The
978
+ # text cursor is advanced to point to the next character after the sequence.
979
+ def extract_char_sequence(char)
980
+ sequence = ''
981
+ i = @cursor
982
+ if char.length == 1
983
+ while @text[i, 1] == char do
984
+ sequence << char
985
+ i += 1
986
+ end
987
+ else
988
+ chars = char.split('')
989
+ while chars.include?(@text[i, 1]) do
990
+ sequence << @text[i, 1]
991
+ i += 1
583
992
  end
584
- false
585
993
  end
994
+ sequence
995
+ end
996
+
997
+ # Opens list and list item spans for each item symbol in the string specified.
998
+ def open_list(symbols)
999
+ symbols.split('').each do |symbol|
1000
+ if symbol == '*'
1001
+ start_span(:UL)
1002
+ else
1003
+ start_span(:OL)
1004
+ end
1005
+ start_span(:LI)
1006
+ @cursor += symbol.length
1007
+ end
1008
+ end
1009
+
1010
+ # Closes list and list item spans for each item symbol in the string specified.
1011
+ def close_list(symbols)
1012
+ symbols.split('').reverse.each do |symbol|
1013
+ end_span(:LI)
1014
+ if symbol == '*'
1015
+ end_span(:UL)
1016
+ else
1017
+ end_span(:OL)
1018
+ end
1019
+ end
1020
+ end
1021
+
1022
+ # Open a token span for the symbol specified. This will append a token start
1023
+ # to the list of output tokens, and push the symbol onto the context stack. If
1024
+ # there is an open paragraph, and the symbol is a block element, then the
1025
+ # open paragraph will be closed (or, if empty, removed) before the token start
1026
+ # is appended.
1027
+ def start_span(symbol, text='')
1028
+ maybe_close_para(symbol, ['pre','table','p'].include?(text))
1029
+ @context << symbol
1030
+ append_to_tokens [(symbol.to_s + '_START').to_sym, text]
1031
+ end
586
1032
 
587
- #Checks if the token is placed at the start of the line.
588
- def at_start_of_line?
589
- if @cursor == 0 or @text[@cursor-1, 1] == "\n"
590
- true
1033
+ # Close a token span for the symbol specified. This will append an end token
1034
+ # to the list of output tokens, and pop the symbol from the context stack. Any
1035
+ # unclosed contexts on top of this symbol's context will also be closed (this
1036
+ # generally happens when in-line markup is not terminated before a new block
1037
+ # begins). If the context is empty as a result, a new paragraph will be opened.
1038
+ def end_span(symbol, text='')
1039
+ while(@context.size > 0 and @context.last != symbol) do
1040
+ append_to_tokens [(@context.pop.to_s + '_END').to_sym, '']
1041
+ end
1042
+ @context.pop
1043
+ append_to_tokens [(symbol.to_s + '_END').to_sym, text]
1044
+ maybe_open_para(symbol)
1045
+ end
1046
+
1047
+ def empty_span(symbol, text, cursor_increment)
1048
+ maybe_close_para(symbol)
1049
+ append_to_tokens [symbol, text, @cursor, cursor_increment]
1050
+ @cursor += cursor_increment
1051
+ maybe_open_para(symbol)
1052
+ end
1053
+
1054
+ def maybe_close_para(symbol, force = false)
1055
+ if @context.size > 0 and (PARA_BREAK_ELEMENTS.include?(symbol) or force)
1056
+ i = 1
1057
+ i += 1 while INLINE_ELEMENTS.include?(@context[-i])
1058
+ if @context[-i] == :PARA
1059
+ if @pending.is_empty_token? and @tokens.last[0] == :PARA_START
1060
+ @context.pop
1061
+ @tokens.pop
591
1062
  else
592
- false
1063
+ (1 .. i).each do
1064
+ symbol = @context.pop
1065
+ append_to_tokens [(symbol.to_s + '_END').to_sym, '']
1066
+ end
593
1067
  end
1068
+ end
594
1069
  end
595
-
596
- def in_table?
597
- @pair_stack.include?([:TABLE_START, ''])
598
- end
599
-
600
- #Checks if the text at position contains the start of a link using any of
601
- #HTTP, HTTPS, MAILTO or FILE protocols
602
- def link_protocol?(position)
603
- return @text[position, @text.length - position] =~ %r{\A((http|https|file)://|mailto:)}
1070
+ end
1071
+
1072
+ def maybe_open_para(symbol)
1073
+ if @context.size == 0 and symbol != :PARA
1074
+ @tokens << [:PARA_START, '']
1075
+ @context << :PARA
604
1076
  end
605
-
606
- #Adjusts @token_start to skip leading whitespaces
607
- def strip_ws_from_token_start
608
- @token_start += 1 while @text[@token_start, 1] == " "
1077
+ end
1078
+
1079
+ def append_to_tokens(token)
1080
+ unless @pending.is_empty_token?
1081
+ @tokens.append_pending(@pending)
609
1082
  end
610
-
611
- #Returns true if the TEXT token is empty or contains newline only
612
- def empty_text_token?
613
- @current_token[0] == :TEXT and
614
- (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
1083
+ @pending = TokenString.new(self)
1084
+ @tokens << token
1085
+ end
1086
+
1087
+
1088
+ class LexerTable
1089
+
1090
+ def initialize
1091
+ @tables = []
615
1092
  end
616
-
617
- #Returns true if the text is a list, i.e. starts with one of #;*: symbols
618
- #that indicate a list
619
- def text_is_list?(text)
620
- return text =~ /^[#;*:].*/
1093
+
1094
+ def push(table)
1095
+ @tables << table
1096
+ @table = table
621
1097
  end
622
-
623
- #Runs sublexer to tokenize sub_text
624
- def sub_lex(sub_text, strip_paragraphs=true)
625
- sub_lexer = MediaWikiLexer.new
626
- sub_tokens = sub_lexer.tokenize(sub_text)
627
- sub_tokens.pop #false token
628
- if strip_paragraphs and sub_tokens.size > 0
629
- #the last PARA_END token
630
- sub_tokens.pop if sub_tokens.last[0] == :PARA_END
631
- #the first PARA_START token
632
- sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
633
- end
634
- sub_tokens
635
- end
636
-
637
- #Extract list contents of list type set by list_id variable.
638
- #Example list:
639
- # *a
640
- # **a
641
- #Extracted list with id "*" will look like:
642
- # a
643
- # *a
644
- def extract_list_contents(list_id)
645
- i = @cursor+1
646
- list = ""
647
- while i < @text.length
648
- curr = @text[i, 1]
649
- if (curr == "\n") and (@text[i+1, 1] != list_id)
650
- list+=curr
651
- break
652
- end
653
- if (curr == list_id) and (@text[i-1, 1] == "\n")
654
- list += "\n" if i + 1 == @text.length
655
- else
656
- list += curr
657
- end
658
- i += 1
659
- end
660
- list
661
- end
662
-
663
- def start_para
664
- @tokens << [:PARA_START, ""]
665
- @para = true
666
- end
667
-
668
- def end_para
669
- @tokens += end_tokens_for_open_pairs
670
- @tokens << [:PARA_END, ""]
671
- @para = false
672
- end
673
-
674
- def end_tokens_for_open_pairs
675
- tokens = []
676
- restore = []
677
- while(@pair_stack.size > 1) do
678
- last = @pair_stack.pop
679
- case last[0]
680
- when :ITALICSTART
681
- tokens << [:ITALICEND, '']
682
- when :BOLDSTART
683
- tokens << [:BOLDEND, '']
684
- when :INTLINKSTART
685
- tokens << [:INTLINKEND, '']
686
- when :LINKSTART
687
- tokens << [:LINKEND, '']
688
- when :TABLE_START
689
- tokens << [:TABLE_END, '']
690
- when :ROW_START
691
- tokens << [:ROW_END, '']
692
- when :CELL_START
693
- tokens << [:CELL_END, '']
694
- when :HEAD_START
695
- tokens << [:HEAD_END, '']
696
- else
697
- restore << last
698
- end
699
- end
700
- @pair_stack += restore.reverse
701
- tokens
702
- end
703
-
704
- def close_table_cell(tokens)
705
- restore = []
706
- last = @pair_stack.pop
707
- while (last[0] != :CELL_START and last[0] != :HEAD_START and last[0] != :ROW_START and last[0] != :TABLE_START) do
708
- case last[0]
709
- when :ITALICSTART
710
- tokens << [:ITALICEND, '']
711
- when :BOLDSTART
712
- tokens << [:BOLDEND, '']
713
- when :INTLINKSTART
714
- tokens << [:INTLINKEND, '']
715
- when :LINKSTART
716
- tokens << [:LINKEND, '']
717
- end
718
- last = @pair_stack.pop
719
- end
720
- if last[0] == :CELL_START
721
- tokens << [:CELL_END, '']
722
- elsif last[0] == :HEAD_START
723
- tokens << [:HEAD_END, '']
724
- else
725
- @pair_stack.push last
726
- end
1098
+
1099
+ def pop
1100
+ @tables.pop
1101
+ @table = @tables.last
727
1102
  end
728
1103
 
729
- def close_table_row(tokens)
730
- if @pair_stack.last[0] == :ROW_START
731
- @pair_stack.pop
732
- tokens << [:ROW_END, '']
733
- end
1104
+ def[] (char)
1105
+ @table[char]
734
1106
  end
1107
+
1108
+ end
735
1109
 
736
1110
  end
737
1111