mediacloth 0.0.3 → 0.5

Files changed (115)
  1. data/README.md +36 -0
  2. data/lib/mediacloth/mediawikiast.rb +58 -1
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
  4. data/lib/mediacloth/mediawikilexer.rb +1030 -656
  5. data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
  6. data/lib/mediacloth/mediawikiparams.rb +1 -10
  7. data/lib/mediacloth/mediawikiparser.rb +939 -409
  8. data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
  9. data/lib/mediacloth/mediawikiparser.y +256 -52
  10. data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
  11. data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
  12. data/lib/mediacloth/mediawikiwalker.rb +72 -1
  13. data/lib/mediacloth.rb +33 -10
  14. data/test/data/ast1 +68 -0
  15. data/test/data/ast10 +196 -0
  16. data/test/data/ast11 +34 -0
  17. data/test/data/ast12 +39 -0
  18. data/test/data/ast13 +25 -0
  19. data/test/data/ast14 +13 -0
  20. data/test/data/ast15 +25 -0
  21. data/test/data/ast16 +17 -0
  22. data/test/data/ast17 +9 -0
  23. data/test/data/ast18 +21 -0
  24. data/test/data/ast19 +32 -0
  25. data/test/data/ast2 +4 -0
  26. data/test/data/ast20 +10 -0
  27. data/test/data/ast21 +27 -0
  28. data/test/data/ast22 +22 -0
  29. data/test/data/ast23 +5 -0
  30. data/test/data/ast3 +6 -0
  31. data/test/data/ast4 +122 -0
  32. data/test/data/ast5 +122 -0
  33. data/test/data/ast6 +22 -0
  34. data/test/data/ast7 +143 -0
  35. data/test/data/ast8 +3 -0
  36. data/test/data/ast9 +11 -0
  37. data/test/data/html1 +33 -5
  38. data/test/data/html10 +31 -27
  39. data/test/data/html11 +19 -0
  40. data/test/data/html12 +32 -0
  41. data/test/data/html13 +29 -0
  42. data/test/data/html14 +4 -0
  43. data/test/data/html15 +29 -0
  44. data/test/data/html16 +28 -0
  45. data/test/data/html17 +10 -0
  46. data/test/data/html18 +8 -0
  47. data/test/data/html19 +27 -0
  48. data/test/data/html2 +1 -1
  49. data/test/data/html20 +7 -0
  50. data/test/data/html21 +5 -0
  51. data/test/data/html22 +24 -0
  52. data/test/data/html23 +7 -0
  53. data/test/data/html3 +1 -1
  54. data/test/data/html4 +60 -11
  55. data/test/data/html5 +45 -6
  56. data/test/data/html6 +5 -5
  57. data/test/data/html7 +59 -1
  58. data/test/data/html8 +1 -1
  59. data/test/data/html9 +10 -2
  60. data/test/data/input1 +4 -0
  61. data/test/data/input11 +19 -0
  62. data/test/data/input12 +32 -0
  63. data/test/data/input13 +10 -0
  64. data/test/data/input14 +8 -0
  65. data/test/data/input15 +10 -0
  66. data/test/data/input16 +28 -0
  67. data/test/data/input17 +10 -0
  68. data/test/data/input18 +16 -0
  69. data/test/data/input19 +29 -0
  70. data/test/data/input20 +8 -0
  71. data/test/data/input21 +18 -0
  72. data/test/data/input22 +20 -0
  73. data/test/data/input23 +8 -0
  74. data/test/data/input4 +13 -1
  75. data/test/data/input5 +45 -4
  76. data/test/data/input7 +25 -1
  77. data/test/data/lex1 +17 -18
  78. data/test/data/lex10 +57 -87
  79. data/test/data/lex11 +18 -0
  80. data/test/data/lex12 +32 -0
  81. data/test/data/lex13 +3 -0
  82. data/test/data/lex14 +1 -0
  83. data/test/data/lex15 +3 -0
  84. data/test/data/lex16 +27 -0
  85. data/test/data/lex17 +9 -0
  86. data/test/data/lex18 +4 -0
  87. data/test/data/lex19 +27 -0
  88. data/test/data/lex2 +2 -2
  89. data/test/data/lex20 +7 -0
  90. data/test/data/lex21 +4 -0
  91. data/test/data/lex22 +3 -0
  92. data/test/data/lex23 +7 -0
  93. data/test/data/lex3 +1 -1
  94. data/test/data/lex4 +35 -29
  95. data/test/data/lex5 +57 -18
  96. data/test/data/lex6 +7 -7
  97. data/test/data/lex7 +42 -18
  98. data/test/data/lex8 +1 -1
  99. data/test/data/lex9 +6 -6
  100. data/test/dataproducers/ast.rb +24 -0
  101. data/test/dataproducers/html.rb +11 -12
  102. data/test/dataproducers/lex.rb +9 -4
  103. data/test/debugwalker.rb +25 -11
  104. data/test/htmlgenerator.rb +170 -13
  105. data/test/lexer.rb +626 -83
  106. data/test/linkhandler.rb +39 -0
  107. data/test/parser.rb +176 -9
  108. data/test/signedwikigenerator.rb +113 -0
  109. metadata +158 -79
  110. data/README +0 -37
  111. data/lib/mediacloth/mediawikilexer.rb~ +0 -491
  112. data/lib/mediacloth/mediawikiparser.y~ +0 -210
  113. data/test/data/result1 +0 -48
  114. data/test/dataproducers/html.rb~ +0 -24
  115. data/test/dataproducers/lex.rb~ +0 -15
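
The bulk of the change is a rewrite of the lexer, data/lib/mediacloth/mediawikilexer.rb (+1030 −656); its diff follows. The rewrite drops the usage documentation that lived in the old file header, but the tokenize/lex interface is retained, so a minimal usage sketch (adapted from the removed header comment; in 0.5 the returned tokens also carry index/length fields, as the new TokenArray code below shows) still looks roughly like this:

    require 'mediacloth/mediawikilexer'

    # Standalone tokenization: tokenize returns an array of tokens,
    # terminated by a [false, false, ...] entry for RACC-style parsers.
    input = File.read("somefile")
    lexer = MediaWikiLexer.new
    tokens = lexer.tokenize(input)

    # Inside a RACC-generated parser, tokens are consumed one at a time:
    #   parser.lexer = MediaWikiLexer.new
    #   parser.lexer.tokenize(input)
    #   parser.lexer.lex   # => next token, e.g. [:TEXT, "...", idx, length]
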
@@ -1,737 +1,1111 @@
1
- #The lexer for MediaWiki language.
2
- #
3
- #Standalone usage:
4
- # file = File.new("somefile", "r")
5
- # input = file.read
6
- # lexer = MediaWikiLexer.new
7
- # lexer.tokenize(input)
8
- #
9
- #Inside RACC-generated parser:
10
- # ...
11
- # ---- inner ----
12
- # attr_accessor :lexer
13
- # def parse(input)
14
- # lexer.tokenize(input)
15
- # return do_parse
16
- # end
17
- # def next_token
18
- # return @lexer.lex
19
- # end
20
- # ...
21
- # parser = MediaWikiParser.new
22
- # parser.lexer = MediaWikiLexer.new
23
- # parser.parse(input)
24
- class MediaWikiLexer
25
-
26
- #Initialized the lexer with a match table.
27
- #
28
- #The match table tells the lexer which method to invoke
29
- #on given input char during "tokenize" phase.
30
- def initialize
31
- @position = 0
32
- @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
33
- @list_stack = []
34
- # Default lexer table
35
- @lexer_table = Hash.new(method(:match_other))
36
- @lexer_table["'"] = method(:match_italic_or_bold)
37
- @lexer_table["="] = method(:match_section)
38
- @lexer_table["["] = method(:match_link_start)
39
- @lexer_table["]"] = method(:match_link_end)
40
- @lexer_table["|"] = method(:match_link_sep_or_table_cell)
41
- @lexer_table[" "] = method(:match_space)
42
- @lexer_table["*"] = method(:match_list)
43
- @lexer_table["#"] = method(:match_list)
44
- @lexer_table[";"] = method(:match_list)
45
- @lexer_table[":"] = method(:match_list)
46
- @lexer_table["-"] = method(:match_line)
47
- @lexer_table["~"] = method(:match_signature)
48
- @lexer_table["h"] = method(:match_inline_link)
49
- @lexer_table["\n"] = method(:match_newline)
50
- @lexer_table["\r"] = method(:match_carriagereturn)
51
- @lexer_table["<"] = method(:match_tag_start)
52
- @lexer_table["{"] = method(:match_table)
53
- @lexer_table["!"] = method(:match_table_head)
54
- # Lexer table used when inside :match_tag_start ... :match_tag_end
55
- @tag_lexer_table = Hash.new(method(:match_other))
56
- @tag_lexer_table["<"] = method(:match_tag_end)
57
- # Begin lexing in default state
58
- @current_lexer_table = @lexer_table
59
- end
60
-
61
- #Transforms input stream (string) into the stream of tokens.
62
- #Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
63
- #This array can be given as input token-by token to RACC based parser with no
64
- #modification. The last token [false, false] inficates EOF.
65
- def tokenize(input)
66
- @tokens = []
67
- start_para
68
- @cursor = 0
69
- @text = input
70
- @next_token = []
71
-
72
- #This tokenizer algorithm assumes that everything that is not
73
- #matched by the lexer is going to be :TEXT token. Otherwise it's usual
74
- #lexer algo which call methods from the match table to define next tokens.
75
- while (@cursor < @text.length)
76
- @current_token = [:TEXT, ''] unless @current_token
77
- @token_start = @cursor
78
- @char = @text[@cursor, 1]
79
-
80
- if @current_lexer_table[@char].call == :TEXT
81
- @current_token[1] += @text[@token_start, 1]
82
- else
83
- #skip empty :TEXT tokens
84
- unless empty_text_token?
85
- @tokens << @current_token
86
- unless para_breaker?(@next_token[0]) or in_block?
87
- #if no paragraph was previously started
88
- #then we should start it
89
- start_para if !@para
90
- else
91
- #if we already have a paragraph this is the time to close it
92
- end_para if @para
93
- end
94
-
95
- end
96
-
97
- if para_breaker?(@next_token[0])
98
- if @tokens.last and @tokens.last[0] == :PARA_START
99
- #we need to remove para start token because no para end is possible
100
- @tokens.pop
101
- @para = false
102
- elsif @para
103
- end_para
104
- end
105
- end
1
+ require 'strscan'
2
+
3
+ class String
4
+ def is_empty_token?
5
+ self.size == 0 or self == "\n" or self == "\r\n"
6
+ end
7
+ end
106
8
 
107
- @next_token[1] = @text[@token_start, @cursor - @token_start]
108
- @tokens << @next_token
109
- #hack to enable sub-lexing!
110
- if @sub_tokens
111
- @tokens += @sub_tokens
112
- @sub_tokens = nil
113
- end
114
- #end of hack!
9
+ # Class for storing text tokens data - index and text
10
+ class TokenString < String
11
+ attr_reader :idx
12
+
13
+ def initialize(lexer, text = '')
14
+ @lexer = lexer
15
+ @idx = 0
16
+ super(text)
17
+ end
18
+
19
+ def <<(pending_text)
20
+ # If TokenString.length is 0 and we are pushing some text
21
+ # then at this moment we can retrieve this token's index
22
+ if length == 0
23
+ @idx = @lexer.cursor
24
+ end
25
+ super(pending_text)
26
+ end
27
+ end
115
28
 
116
- #if the next token can start the paragraph, let's try that
117
- start_para if @tokens.last and para_starter?(@tokens.last[0])
29
+ class TokenArray < Array
30
+ def initialize(lexer)
31
+ @lexer = lexer
32
+ end
118
33
 
119
- @current_token = nil
120
- @next_token = []
121
- end
34
+ def <<(token)
35
+ if @lexer.tokens.last && (@lexer.tokens.last[3].nil? || @lexer.tokens.last[3] == 0)
36
+ @lexer.tokens.last[3] = @lexer.cursor - @lexer.tokens.last[2]
122
37
  end
123
- #add the last TEXT token if it exists
124
- @tokens << @current_token if @current_token and not empty_text_token?
38
+ token[2] = @lexer.cursor
39
+ super(token)
40
+ end
125
41
 
126
- #remove empty para start or finish the paragraph if necessary
127
- if @tokens.last and @tokens.last[0] == :PARA_START
128
- @tokens.pop
129
- @para = false
130
- else
131
- end_para if @para
42
+ def append_pending(text)
43
+ if @lexer.tokens.last && @lexer.tokens.last[3].nil?
44
+ @lexer.tokens.last[3] = text.idx - @lexer.tokens.last[2]
132
45
  end
133
- #RACC wants us to put this to indicate EOF
134
- @tokens << [false, false]
135
- @tokens
46
+ token = [:TEXT, text, text.idx, text.length]
47
+ push(token)
136
48
  end
137
49
 
138
- #Returns the next token from the stream. Useful for RACC parsers.
139
- def lex
140
- token = @tokens[@position]
141
- @position += 1
142
- return token
50
+ def to_s
51
+ string_copy = ""
52
+ each do |token|
53
+ string_copy << "#{token[0..1]}[#{token[2]}, #{token[3]}]"
54
+ end
55
+ string_copy
143
56
  end
144
57
 
58
+ end
145
59
 
146
- private
147
- #Returns true if the token breaks the paragraph.
148
- def para_breaker?(token)
149
- [:SECTION_START, :SECTION_END,
150
- :TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
151
- :UL_START, :UL_END, :OL_START, :OL_END,
152
- :DL_START, :DL_END, :HLINE, :PRE].include?(token)
153
- end
154
60
 
155
- #Returns true if the paragraph can be started after the token
156
- def para_starter?(token)
157
- [:SECTION_END, :TABLE_END, :UL_END, :OL_END, :DL_END, :HLINE, :PRE].include?(token)
158
- end
61
+ class MediaWikiLexer
62
+
63
+ INLINE_ELEMENTS = [:LINK, :INTLINK, :BOLD, :ITALIC]
64
+ BLOCK_ELEMENTS = [:PARA, :PRE, :PREINDENT, :UL, :OL, :DL, :LI, :SECTION, :TABLE, :ROW, :CELL, :HEAD]
65
+ PARA_BREAK_ELEMENTS = [:UL, :OL, :DL, :PRE, :PREINDENT, :PASTE_START, :SECTION, :TABLE, :HLINE, :KEYWORD]
66
+
67
+ NAME_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-]/ ? true : false}
68
+ TOKEN_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-.;:?&@~=#%\/]/ ? true : false}
69
+ PUNCTUATION_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[\.,;:\-?]/ ? true : false}
70
+
71
+
72
+ HTML_TAGS = %w{ a abbr acronym address applet area b base basefont bdo big blockquote body br
73
+ button caption center cite code col colgroup dd del dir div dfn dl dt em fieldset font form frame
74
+ frameset h1 h2 h3 h4 h5 h6 head hr html i iframe img input ins isindex kbd label legend li link map
75
+ menu meta noframes noscript object ol optgroup option p param pre q s samp script select small span
76
+ strike strong style sub sup table tbody td textarea tfoot th thead title tr tt u ul var xmp }
77
+ WIKI_TAGS = %w{ nowiki math paste }
78
+ TAGS_WITHOUT_CLOSE_TAG = %w{ br hr img }
79
+
80
+ attr_reader :cursor
81
+ attr_reader :tokens
82
+
83
+
84
+ def initialize
85
+ # Current position in token list
86
+ @position = 0
159
87
 
160
- def in_block?
161
- @pair_stack.select {|token| para_breaker?(token[0])}.size > 0 or
162
- (@sub_tokens and @sub_tokens.select {|token| para_breaker?(token[0])}.size > 0)
163
- end
88
+ # Lexer table of methods that handle only formatting, e.g. bold or italicized
89
+ # text, or spans of XHTML or wiki-escape markup
90
+ @formatting_lexer_table = {}
91
+ @formatting_lexer_table["'"] = method(:match_quote)
92
+ @formatting_lexer_table["<"] = method(:match_left_angle)
93
+ @formatting_lexer_table["&"] = method(:match_ampersand)
94
+ @formatting_lexer_table["{"] = method(:match_left_curly)
95
+
96
+ # Lexer table of methods that handle everything that may occur in-line in
97
+ # addition to formatting, i.e. links and signatures
98
+ @inline_lexer_table = @formatting_lexer_table.dup
99
+ @inline_lexer_table["["] = method(:match_left_square)
100
+ @inline_lexer_table["~"] = method(:match_tilde)
101
+ @inline_lexer_table["h"] = method(:match_h_char)
102
+
103
+ # Default lexer table, which includes all in-line formatting and links, plus
104
+ # methods that handle constructs that begin on a newline
105
+ @default_lexer_table = @inline_lexer_table.dup
106
+ @default_lexer_table[" "] = method(:match_space)
107
+ @default_lexer_table["="] = method(:match_equal)
108
+ @default_lexer_table["*"] = method(:match_star)
109
+ @default_lexer_table["#"] = method(:match_hash)
110
+ @default_lexer_table[":"] = method(:match_colon)
111
+ @default_lexer_table[";"] = method(:match_semicolon)
112
+ @default_lexer_table["-"] = method(:match_dash)
113
+ @default_lexer_table["_"] = method(:match_underscore)
114
+ @default_lexer_table["\n"] = method(:match_newline)
115
+ @default_lexer_table["\r"] = method(:match_newline)
116
+
117
+ # Lexer table used inside spans of markup, wherein spans of newlines are not
118
+ # automatically treated as paragraphs.
119
+ @markup_lexer_table = @default_lexer_table.dup
120
+ @markup_lexer_table["\n"] = nil
121
+ @markup_lexer_table["\r"] = nil
122
+
123
+ # Lexer table used inside of headings
124
+ @heading_lexer_table = @inline_lexer_table.dup
125
+ @heading_lexer_table["="] = method(:match_equal_in_heading)
126
+ @heading_lexer_table["\n"] = method(:match_newline_in_heading)
127
+
128
+ # Lexer table used inside the left half of an external link
129
+ @link_lexer_table = {}
130
+ @link_lexer_table["]"] = method(:match_right_square_in_link)
131
+ @link_lexer_table["\n"] = method(:match_newline_in_link)
132
+ @link_lexer_table["\r"] = method(:match_newline_in_link)
133
+ @link_lexer_table[" "] = method(:match_space_in_link)
134
+
135
+ # Lexer table used inside the right half of an external link, or the right
136
+ # half of an internal link
137
+ @link_opt_lexer_table = @inline_lexer_table.dup
138
+ @link_opt_lexer_table["]"] = method(:match_right_square_in_link)
139
+ @link_opt_lexer_table["\n"] = method(:match_newline_in_link)
140
+ @link_opt_lexer_table["\r"] = method(:match_newline_in_link)
141
+
142
+ # Lexer table used inside the left half of an internal link or internal
143
+ # resource link
144
+ @intlink_lexer_table = {}
145
+ @intlink_lexer_table["]"] = method(:match_right_square_in_intlink)
146
+ @intlink_lexer_table["\r"] = method(:match_newline_in_intlink)
147
+ @intlink_lexer_table["\n"] = method(:match_newline_in_intlink)
148
+ @intlink_lexer_table[":"] = method(:match_colon_in_intlink)
149
+ @intlink_lexer_table["|"] = method(:match_pipe_in_intlink)
150
+ @intlink_lexer_table["C"] = method(:match_c_char_in_intlink)
151
+
152
+ # Lexer table used inside the category name of the left half of an
153
+ # internal link
154
+ @intlink_cat_lexer_table = {}
155
+ @intlink_cat_lexer_table["]"] = method(:match_right_square_in_intlink)
156
+ @intlink_cat_lexer_table["\r"] = method(:match_newline_in_intlink)
157
+ @intlink_cat_lexer_table["\n"] = method(:match_newline_in_intlink)
158
+ @intlink_cat_lexer_table["|"] = method(:match_pipe_in_intlink)
159
+
160
+ # Lexer table used inside the right half of an internal link
161
+ @intlink_opt_lexer_table = @formatting_lexer_table.dup
162
+ @intlink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
163
+ @intlink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
164
+ @intlink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
165
+
166
+ # Lexer table used inside the right half of an internal resource link
167
+ @resourcelink_opt_lexer_table = @inline_lexer_table.dup
168
+ @resourcelink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
169
+ @resourcelink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
170
+ @resourcelink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
171
+ @resourcelink_opt_lexer_table["|"] = method(:match_pipe_in_intlink)
172
+
173
+ # Lexer table used to parse tables
174
+ @table_lexer_table = @inline_lexer_table.dup
175
+ @table_lexer_table["*"] = method(:match_star)
176
+ @table_lexer_table["#"] = method(:match_hash)
177
+ @table_lexer_table["|"] = method(:match_pipe_in_table)
178
+ @table_lexer_table["!"] = method(:match_bang_in_table)
179
+ @table_lexer_table["{"] = method(:match_left_curly)
180
+ @table_lexer_table[" "] = method(:match_space)
181
+
182
+ # Lexer table used to parse ordered and unordered list items (which may nest)
183
+ @items_lexer_table = @inline_lexer_table.dup
184
+ @items_lexer_table["\n"] = method(:match_newline_in_items)
185
+
186
+ # Lexer table used to parse entries in a definition list (which may not nest)
187
+ @entries_lexer_table = @inline_lexer_table.dup
188
+ @entries_lexer_table["\n"] = method(:match_newline_in_entries)
189
+ @entries_lexer_table[":"] = method(:match_colon_in_entries)
190
+
191
+ # Lexer table used inside spans of indented text
192
+ @indent_lexer_table = @inline_lexer_table.dup
193
+ @indent_lexer_table["\n"] = method(:match_newline_in_indent)
194
+
195
+ # Lexer table used inside spans of pre-formatted text
196
+ @pre_lexer_table = {}
197
+ @pre_lexer_table["<"] = method(:match_left_angle_in_pre)
198
+
199
+ # Lexer table used inside spans of <code>
200
+ @code_lexer_table = @inline_lexer_table.dup
201
+ @code_lexer_table[" "] = method(:match_space_in_code)
202
+ @code_lexer_table["<"] = method(:match_left_angle_in_code)
203
+
204
+ # Lexer table used when inside spans of wiki-escaped text
205
+ @nowiki_lexer_table = {}
206
+ @nowiki_lexer_table["<"] = method(:match_left_angle_in_nowiki)
164
207
 
165
- #-- ================== Match methods ================== ++#
208
+ @paste_lexer_table = {}
209
+ @paste_lexer_table["<"] = method(:match_left_angle_in_paste)
210
+ @paste_lexer_table["\n"] = method(:match_newline_in_paste)
211
+ @paste_lexer_table["\r"] = method(:match_newline_in_paste)
166
212
 
167
- #Matches anything that was not matched. Returns :TEXT to indicate
168
- #that matched characters should go into :TEXT token.
169
- def match_other
213
+ # Lexer table used when inside spans of math
214
+ @math_lexer_table = {}
215
+ @math_lexer_table["<"] = method(:match_left_angle_in_math)
216
+
217
+ # Lexer table used when inside a wiki template inclusion
218
+ @template_lexer_table = {}
219
+ @template_lexer_table["{"] = method(:match_left_curly_in_template)
220
+ @template_lexer_table["|"] = method(:match_pipe_in_template)
221
+ @template_lexer_table["}"] = method(:match_right_curly_in_template)
222
+
223
+ @template_param_lexer_table = {}
224
+ @template_param_lexer_table["{"] = method(:match_left_curly_in_template)
225
+ @template_param_lexer_table["}"] = method(:match_right_curly_in_template)
226
+ @template_param_lexer_table["|"] = method(:match_pipe_in_template)
227
+
228
+ # Begin lexing in default state
229
+ @lexer_table = LexerTable.new
230
+ @lexer_table.push(@default_lexer_table)
231
+ end
232
+
233
+
234
+ def tokenize(input)
235
+ @text = input
236
+ # Current position in the input text
237
+ @cursor = 0
238
+ # Tokens to be returned
239
+ @tokens = TokenArray.new(self)
240
+ # Stack of open token spans
241
+ @context = []
242
+ # Already lexed character data, not yet added to a TEXT token
243
+ @pending = TokenString.new(self)
244
+ # List symbols from the most recent line item of a list, e.g. '***'
245
+ @list = ''
246
+
247
+ start_span(:PARA)
248
+
249
+ while (@cursor < @text.length)
250
+ @char = @text[@cursor, 1]
251
+ if @lexer_table[@char]
252
+ @lexer_table[@char].call
253
+ else
254
+ @pending << @char
170
255
  @cursor += 1
171
- return :TEXT
256
+ end
172
257
  end
173
-
174
- #Matches italic or bold symbols:
175
- # "'''" { return :BOLD; }
176
- # "''" { return :ITALIC; }
177
- def match_italic_or_bold
178
- if @text[@cursor, 5] == "'''''"
179
- if @pair_stack.last[0] == :BOLDSTART
180
- matchBold
181
- @cursor += 3
182
- else
183
- matchItalic
184
- @cursor += 2
185
- end
186
- return
187
- end
188
- if @text[@cursor, 3] == "'''"
189
- matchBold
190
- @cursor += 3
191
- return
192
- end
193
- if @text[@cursor, 2] == "''"
194
- matchItalic
195
- @cursor += 2
196
- return
197
- end
198
- match_other
258
+
259
+ if @pending.is_empty_token?
260
+ if @context.size > 0 and @tokens.last[0] == :PARA_START
261
+ @context.pop
262
+ @tokens.pop
263
+ end
264
+ else
265
+ @tokens.append_pending(@pending)
266
+ @pending = TokenString.new(self)
199
267
  end
268
+ while(@context.size > 0) do
269
+ @tokens << [(@context.pop.to_s + '_END').to_sym, '']
270
+ end
271
+ @tokens << [false, false, 0, 0]
272
+ @tokens
273
+
274
+ end
200
275
 
201
- def matchBold
202
- if @pair_stack.last[0] == :BOLDSTART
203
- @next_token[0] = :BOLDEND
204
- @pair_stack.pop
205
- else
206
- @next_token[0] = :BOLDSTART
207
- @pair_stack.push @next_token
208
- end
276
+ #Returns the next token from the stream. Useful for RACC parsers.
277
+ def lex
278
+ token = @tokens[@position]
279
+ @position += 1
280
+ return token
281
+ end
282
+
283
+
284
+ private
285
+
286
+ def match_text
287
+ @pending << @char
288
+ @cursor += 1
289
+ end
290
+
291
+ def match_ampersand
292
+ i = @cursor + 1
293
+ i += 1 while i < @text.size and NAME_CHAR_TABLE[@text[i].ord]
294
+ if @text[i, 1] == ';'
295
+ append_to_tokens([:CHAR_ENT, @text[(@cursor + 1) ... i]])
296
+ @cursor = i + 1
297
+ else
298
+ match_text
299
+ end
300
+ end
301
+
302
+ def match_quote
303
+ if @text[@cursor, 5] == "'''''"
304
+ if @context.last == :BOLD
305
+ match_bold
306
+ @cursor += 3
307
+ else
308
+ match_italic
309
+ @cursor += 2
310
+ end
311
+ elsif @text[@cursor, 3] == "'''"
312
+ match_bold
313
+ @cursor += 3
314
+ elsif @text[@cursor, 2] == "''"
315
+ match_italic
316
+ @cursor += 2
317
+ else
318
+ match_text
209
319
  end
320
+ end
210
321
 
211
- def matchItalic
212
- if @pair_stack.last[0] == :ITALICSTART
213
- @next_token[0] = :ITALICEND
214
- @pair_stack.pop
215
- else
216
- @next_token[0] = :ITALICSTART
217
- @pair_stack.push @next_token
218
- end
322
+ def match_bold
323
+ if @context.last == :BOLD
324
+ end_span(:BOLD, "'''")
325
+ else
326
+ start_span(:BOLD, "'''")
219
327
  end
328
+ end
220
329
 
221
- #Matches sections
222
- def match_section
223
- if at_start_of_line? or (@pair_stack.last[0] == :SECTION_START)
224
- i = 0
225
- i += 1 while @text[@cursor+i, 1] == "="
226
- @cursor += i
330
+ def match_italic
331
+ if @context.last == :ITALIC
332
+ end_span(:ITALIC, "''")
333
+ else
334
+ start_span(:ITALIC, "''")
335
+ end
336
+ end
227
337
 
228
- if @pair_stack.last[0] == :SECTION_START
229
- @next_token[0] = :SECTION_END
230
- @pair_stack.pop
338
+ def match_tilde
339
+ if @text[@cursor, 5] == "~~~~~"
340
+ empty_span(:SIGNATURE_DATE, "~~~~~", 5)
341
+ elsif @text[@cursor, 4] == "~~~~"
342
+ empty_span(:SIGNATURE_FULL, "~~~~", 4)
343
+ elsif @text[@cursor, 3] == "~~~"
344
+ empty_span(:SIGNATURE_NAME, "~~~", 3)
345
+ else
346
+ match_text
347
+ end
348
+ end
349
+
350
+ def match_left_angle
351
+ next_char = @text[@cursor + 1]
352
+ if !next_char
353
+ match_text
354
+ elsif next_char.ord == 47
355
+ # Might be an XHTML end tag
356
+ if @text[@cursor .. -1] =~ %r{</([a-zA-Z][a-zA-Z0-9\-_]*)(\s*)>} and @context.include?(:TAG)
357
+ # Found an XHTML end tag
358
+ tag_name = $1
359
+ end_span(:TAG, $1)
360
+ @lexer_table.pop
361
+ @cursor += $1.length + $2.length + 3
362
+ else
363
+ match_text
364
+ end
365
+ elsif next_char.ord > 64 and next_char.ord < 123
366
+ # Might be an XHTML open or empty tag
367
+ scanner = StringScanner.new(@text[@cursor .. -1])
368
+ if scanner.scan(%r{<([a-zA-Z][a-zA-Z0-9\-_]*)}) and (HTML_TAGS.include?(scanner[1]) or WIKI_TAGS.include?(scanner[1]))
369
+ # Sequence begins with a valid tag name, so check for attributes
370
+ tag_name = scanner[1]
371
+ attrs = {}
372
+ while scanner.scan(%r{\s+([a-zA-Z][a-zA-Z0-9\-_]*)\s*=\s*('([^']+)'|"([^"]+)"|([^>\s]+))}) do
373
+ attrs[scanner[1]] = scanner[3] ? scanner[3] : (scanner[4] ? scanner[4] : scanner[5])
374
+ end
375
+ scanner.scan(%r{\s*})
376
+ if ((c = scanner.get_byte) == '>' or (c == '/' and scanner.get_byte == '>'))
377
+ # Found an XHTML start or empty tag
378
+ if tag_name == 'nowiki'
379
+ @lexer_table.push(@nowiki_lexer_table) unless c == '/'
380
+ elsif tag_name == 'paste'
381
+ unless c == '/'
382
+ maybe_close_para(:PASTE_START, true)
383
+ append_to_tokens([:PASTE_START, ''])
384
+ @cursor += scanner.pos
385
+ @lexer_table.push(@paste_lexer_table)
386
+ #eat newline after <paste> if if exists because otherwise
387
+ #it will be transformed into <br/>
388
+ if @text[@cursor, 1] == "\n"
389
+ @cursor += 1
390
+ elsif @text[@cursor, 2] == "\r\n"
391
+ @cursor += 2
392
+ end
393
+ return
394
+ end
395
+ else
396
+ if tag_name == 'pre'
397
+ table = @pre_lexer_table
398
+ elsif tag_name == 'code'
399
+ table = @code_lexer_table
400
+ elsif tag_name == 'math'
401
+ table = @math_lexer_table
402
+ else
403
+ table = @markup_lexer_table
404
+ end
405
+ start_span(:TAG, tag_name)
406
+ attrs.collect do |(name, value)|
407
+ append_to_tokens([:ATTR_NAME, name])
408
+ append_to_tokens([:ATTR_VALUE, value]) if value
409
+ end
410
+ if c == '/' or TAGS_WITHOUT_CLOSE_TAG.include? tag_name
411
+ end_span(:TAG, tag_name)
231
412
  else
232
- @next_token[0] = :SECTION_START
233
- @pair_stack.push @next_token
413
+ @lexer_table.push(table)
234
414
  end
415
+ end
416
+ @cursor += scanner.pos #FIXME: will break xhtml attribute length calculation
235
417
  else
236
- match_other
418
+ match_text
237
419
  end
420
+ else
421
+ match_text
422
+ end
423
+ else
424
+ match_text
238
425
  end
426
+ end
239
427
 
240
- #Matches start of the hyperlinks
241
- # "[[" { return INTLINKSTART; }
242
- # "[" { return LINKSTART; }
243
- def match_link_start
244
- if @text[@cursor, 2] == "[[" and @text[@cursor+2, @text.length - (@cursor + 2)] =~ %r{\A\s*[^\s\]]}
245
- @next_token[0] = :INTLINKSTART
246
- @pair_stack.push @next_token
247
- @cursor += 2
248
- elsif @text[@cursor, 1] == "[" and link_protocol?(@cursor+1)
249
- @next_token[0] = :LINKSTART
250
- @pair_stack.push @next_token
251
- @cursor += 1
428
+ def match_equal
429
+ if at_start_of_line?
430
+ @heading = extract_char_sequence('=')
431
+ @cursor += @heading.length
432
+ if at_end_of_line? or blank_line?
433
+ @cursor -= @heading.length
434
+ #special case - no header text, just "=" signs
435
+ #try to split header into "=" formatting and text with "=":
436
+ # example:
437
+ # ==== should become: = == =
438
+ # ===== should become: == = ==
439
+ if @heading =~ /(={6})(=+)(={6})/ or
440
+ @heading =~ /(={5})(=+)(={5})/ or
441
+ @heading =~ /(={4})(=+)(={4})/ or
442
+ @heading =~ /(={3})(=+)(={3})/ or
443
+ @heading =~ /(={2})(=+)(={2})/ or
444
+ @heading =~ /(=)(=+)(=)/
445
+ start_span(:SECTION, $1)
446
+ @cursor += $1.length
447
+ @tokens << [:TEXT, $2]
448
+ @cursor += $2.length
449
+ end_span(:SECTION, $3)
450
+ @cursor += $3.length
252
451
  else
253
- match_other
452
+ match_text
254
453
  end
454
+ else
455
+ @cursor -= @heading.length
456
+ start_span(:SECTION, @heading)
457
+ @cursor += @heading.length
458
+ @lexer_table.push(@heading_lexer_table)
459
+ end
460
+ else
461
+ match_text
462
+ end
463
+ end
464
+
465
+ def match_equal_in_heading
466
+ heading = extract_char_sequence('=')
467
+ if @heading.length <= heading.length
468
+ end_span(:SECTION, heading)
469
+ @lexer_table.pop
470
+ @cursor += heading.length
471
+ skip_newline
472
+ else
473
+ @pending << heading
474
+ @cursor += heading.length
255
475
  end
476
+ end
477
+
478
+ def match_newline_in_heading
479
+ end_span(:SECTION)
480
+ @lexer_table.pop
481
+ end
256
482
 
257
- #Matches end of the hyperlinks
258
- # "]]" { return INTLINKEND; }
259
- # "]" { return LINKEND; }
260
- def match_link_end
261
- if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
262
- @next_token[0] = :INTLINKEND
263
- @pair_stack.pop
264
- @cursor += 2
265
- elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
266
- @next_token[0] = :LINKEND
267
- @pair_stack.pop
268
- @cursor += 1
269
- else
270
- match_other
271
- end
483
+ def match_left_square
484
+ if @text[@cursor, 2] == "[["
485
+ if @text[@cursor + 2, 1] != "]"
486
+ start_span(:INTLINK, "[[")
487
+ @cursor += 2
488
+ @lexer_table.push(@intlink_lexer_table)
489
+ else
490
+ match_text
491
+ end
492
+ elsif @text[@cursor + 1 .. -1] =~ %r{\A\s*((http|https|file)://|mailto:)}
493
+ start_span(:LINK, "[")
494
+ @cursor += 1
495
+ skip_whitespace
496
+ @lexer_table.push(@link_lexer_table)
497
+ else
498
+ match_text
272
499
  end
500
+ end
501
+
502
+ def match_right_square_in_link
503
+ end_span(:LINK, "]")
504
+ @cursor += 1
505
+ @lexer_table.pop
506
+ end
507
+
508
+ def match_right_square_in_intlink
509
+ if @text[@cursor, 2] == "]]"
510
+ end_span(:INTLINK, "]]")
511
+ @cursor += 2
512
+ @lexer_table.pop
513
+ else
514
+ match_text
515
+ end
516
+ end
517
+
518
+ def match_space_in_link
519
+ spaces = extract_char_sequence(' ')
520
+ append_to_tokens([:LINKSEP, ' ']) unless @text[@cursor, 1] == ']'
521
+ @cursor += spaces.length
522
+ @lexer_table.pop
523
+ @lexer_table.push(@link_opt_lexer_table)
524
+ end
273
525
 
274
- #Matches link separator inside of internal links
275
- def match_link_sep
276
- if @tokens[-1][0] == :INTLINKSTART or inside_resource_link
277
- @next_token[0] = :INTLINKSEP
278
- @cursor += 1
279
- else
280
- match_other
526
+ def match_pipe_in_intlink
527
+ if @tokens.last[0] == :INTLINK_START
528
+ @lexer_table.pop
529
+ @lexer_table.push(@intlink_opt_lexer_table)
530
+ end
531
+ append_to_tokens([:INTLINKSEP, "|"])
532
+ @cursor += 1
533
+ end
534
+
535
+ def match_colon_in_intlink
536
+ if not @pending.is_empty_token?
537
+ @lexer_table.pop
538
+ @lexer_table.push(@resourcelink_opt_lexer_table)
539
+ end
540
+ append_to_tokens([:RESOURCESEP, ":"])
541
+ @cursor += 1
542
+ end
543
+
544
+ def match_c_char_in_intlink
545
+ if @text[@cursor, 9] == 'Category:'
546
+ append_to_tokens([:CATEGORY, 'Category:'])
547
+ @lexer_table.pop
548
+ @lexer_table.push(@intlink_cat_lexer_table)
549
+ @cursor += 9
550
+ else
551
+ match_text
552
+ end
553
+ end
554
+
555
+ def match_newline_in_link
556
+ end_span(:LINK)
557
+ @lexer_table.pop
558
+ end
559
+
560
+ def match_newline_in_intlink
561
+ end_span(:INTLINK)
562
+ @lexer_table.pop
563
+ end
564
+
565
+ def match_h_char
566
+ link = @text[@cursor, 7] if @text[@cursor, 7] == 'http://'
567
+ link = @text[@cursor, 8] if @text[@cursor, 8] == 'https://'
568
+ if link
569
+ start_span(:LINK)
570
+ i = @cursor + link.length
571
+ while i < @text.size and TOKEN_CHAR_TABLE[@text[i].ord] do
572
+ link << @text[i, 1]
573
+ i += 1
574
+ end
575
+
576
+ #exclude punctuation at the end
577
+ while link.length > 0 and PUNCTUATION_CHAR_TABLE[link[-1].ord] do
578
+ link = link[0..-2]
579
+ i -= 1
281
580
  end
581
+
582
+ @pending = TokenString.new(self)
583
+ @pending << link
584
+ @cursor = i
585
+ end_span(:LINK)
586
+ else
587
+ match_text
282
588
  end
589
+ end
283
590
 
284
- #Matches inlined unformatted html link
285
- # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
286
- def match_inline_link
287
- #if no link start token was detected and the text starts with http://
288
- #then it's the inlined unformatted html link
289
- last_pair_token = @pair_stack.last[0]
290
- if link_protocol?(@cursor) and last_pair_token != :INTLINKSTART and last_pair_token != :LINKSTART
291
- @next_token[0] = :LINKSTART
292
- text = @text[@cursor..-1]
293
- if last_pair_token == :ITALICSTART and text =~ /\A([^\s\n]+)''/
294
- linkText = $1
295
- elsif last_pair_token == :BOLDSTART and text =~ /\A([^\s\n]+)'''/
296
- linkText = $1
297
- elsif text =~ /\A([^\s\n]+)[\s\n]/
298
- linkText = $1
299
- else
300
- linkText = text
301
- end
302
- @sub_tokens = []
303
- @sub_tokens << [:TEXT, linkText]
304
- @sub_tokens << [:LINKEND, ']']
305
- @cursor += linkText.length
306
- @token_start = @cursor
307
- else
308
- match_other
309
- end
591
+ def match_space
592
+ if at_start_of_line? and !blank_line?
593
+ start_span(:PREINDENT)
594
+ @lexer_table.push(@indent_lexer_table)
595
+ match_text
596
+ else
597
+ match_text
598
+ end
599
+ end
600
+
601
+ def match_newline_in_indent
602
+ match_text
603
+ unless @text[@cursor, 1] == " "
604
+ @tokens.append_pending(@pending)
605
+ @pending = TokenString.new(self)
606
+ end_span(:PREINDENT)
607
+ @lexer_table.pop
310
608
  end
609
+ end
311
610
 
312
- #Matches space to find preformatted areas which start with a space after a newline
313
- # "\n\s[^\n]*" { return PRE; }
314
- def match_space
315
- if at_start_of_line? and ! in_table?
316
- match_untill_eol
317
- @next_token[0] = :PRE
318
- strip_ws_from_token_start
319
- elsif @pair_stack.last[0] == :LINKSTART and @current_token[0] == :TEXT and @tokens.last[0] != :LINKSEP
320
- @next_token[0] = :LINKSEP
321
- @cursor += 1
322
- strip_ws_from_token_start
323
- else
324
- match_other
611
+ def match_star
612
+ if at_start_of_line?
613
+ @list = extract_char_sequence('#*')
614
+ open_list(@list)
615
+ @lexer_table.push(@items_lexer_table)
616
+ else
617
+ match_text
618
+ end
619
+ end
620
+
621
+ def match_hash
622
+ if at_start_of_line?
623
+ @list = extract_char_sequence('#*')
624
+ open_list(@list)
625
+ @lexer_table.push(@items_lexer_table)
626
+ else
627
+ match_text
628
+ end
629
+ end
630
+
631
+ def match_underscore
632
+ if @text[@cursor, 7] == '__TOC__'
633
+ empty_span(:KEYWORD, 'TOC', 7)
634
+ elsif @text[@cursor, 9] == '__NOTOC__'
635
+ empty_span(:KEYWORD, 'NOTOC', 9)
636
+ else
637
+ match_text
638
+ end
639
+ end
640
+
641
+ def match_newline_in_items
642
+ if @text[@cursor, 1] == "\n"
643
+ newline = "\n"
644
+ char = @text[@cursor + 1, 1]
645
+ else
646
+ newline = "\r\n"
647
+ char = @text[@cursor + 2, 1]
648
+ end
649
+ @pending << newline
650
+ @cursor += newline.length
651
+ if (char == @list[0, 1])
652
+ list = extract_char_sequence('#*')
653
+ if list == @list
654
+ end_span(:LI)
655
+ start_span(:LI)
656
+ @cursor += list.length
657
+ else
658
+ l = @list.length > list.length ? list.length : @list.length
659
+ i = 0
660
+ i += 1 while (i < l and @list[i] == list[i])
661
+ if i < @list.length
662
+ close_list(@list[i .. -1])
663
+ if @context.last == :LI
664
+ end_span(:LI)
665
+ start_span(:LI)
666
+ end
325
667
  end
668
+ if i < list.length
669
+ start_span(:LI) if @context.last != :LI
670
+ open_list(list[i .. -1])
671
+ end
672
+ @cursor += i
673
+ @list = list
674
+ end
675
+ else
676
+ close_list(@list)
677
+ @lexer_table.pop
326
678
  end
679
+ end
680
+
681
+ def match_dash
682
+ if at_start_of_line? and @text[@cursor, 4] == "----"
683
+ empty_span(:HLINE, "----", 4)
684
+ else
685
+ match_text
686
+ end
687
+ end
688
+
689
+ def match_left_angle_in_nowiki
690
+ if @text[@cursor, 9] == '</nowiki>'
691
+ @cursor += 9
692
+ @lexer_table.pop
693
+ else
694
+ match_text
695
+ end
696
+ end
327
697
 
328
- #Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
329
- #therefore we need to do some special processing with lists. The idea here is to strip
330
- #the leftmost symbol indicating the list from the group of input lines and use separate
331
- #lexer to process extracted fragment.
332
- def match_list
333
- if at_start_of_line?
334
- list_id = @text[@cursor, 1]
335
- sub_text = extract_list_contents(list_id)
336
- extracted = 0
337
-
338
- #hack to tokenize everything inside the list
339
- @sub_tokens = []
340
- sub_lines = ""
341
- @sub_tokens << [:LI_START, ""]
342
- sub_text.each do |t|
343
- extracted += 1
344
- if text_is_list? t
345
- sub_lines += t
346
- else
347
- if not sub_lines.empty?
348
- @sub_tokens += sub_lex(sub_lines)
349
- sub_lines = ""
350
- end
351
- if @sub_tokens.last[0] != :LI_START
352
- @sub_tokens << [:LI_END, ""]
353
- @sub_tokens << [:LI_START, ""]
354
- end
355
- @sub_tokens += sub_lex(t.lstrip)
356
- end
357
- end
358
- if not sub_lines.empty?
359
- @sub_tokens += sub_lex(sub_lines)
360
- @sub_tokens << [:LI_END, ""]
361
- else
362
- @sub_tokens << [:LI_END, ""]
363
- end
698
+ def match_left_angle_in_paste
699
+ if @text[@cursor, 8] == '</paste>'
700
+ @lexer_table.pop
701
+ append_to_tokens([:PASTE_END, ''])
702
+ @cursor += 8
703
+ maybe_open_para(:PASTE_END)
704
+ else
705
+ match_text
706
+ end
707
+ end
364
708
 
365
- #end of hack
366
- @cursor += sub_text.length + extracted
367
- @token_start = @cursor
368
-
369
- case
370
- when list_id == "*"
371
- @next_token[0] = :UL_START
372
- @sub_tokens << [:UL_END, ""]
373
- when list_id == "#"
374
- @next_token[0] = :OL_START
375
- @sub_tokens << [:OL_END, ""]
376
- when list_id == ";", list_id == ":"
377
- @next_token[0] = :DL_START
378
- @sub_tokens << [:DL_END, ""]
379
- end
380
- elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
381
- @next_token[0] = :RESOURCE_SEP
382
- @cursor += 1
383
- else
384
- match_other
385
- end
709
+ def match_newline_in_paste
710
+ append_to_tokens([:TAG_START, 'br'])
711
+ if @text[@cursor, 1] == "\n"
712
+ @cursor += 1
713
+ elsif @text[@cursor, 2] == "\r\n"
714
+ @cursor += 2
386
715
  end
716
+ append_to_tokens([:TAG_END, 'br'])
717
+ end
387
718
 
388
- #Matches the line until \n
389
- def match_untill_eol
390
- val = @text[@cursor, 1]
391
- while (val != "\n") and (!val.nil?)
392
- @cursor += 1
393
- val = @text[@cursor, 1]
394
- end
719
+ def match_left_angle_in_math
720
+ if @text[@cursor, 7] == '</math>'
721
+ end_span(:TAG, 'math')
722
+ @cursor += 7
723
+ @lexer_table.pop
724
+ else
725
+ match_text
726
+ end
727
+ end
728
+
729
+ def match_left_angle_in_pre
730
+ if @text[@cursor, 6] == '</pre>'
731
+ end_span(:TAG, 'pre')
732
+ @cursor += 6
733
+ #eat newline after </pre>
734
+ if @text[@cursor, 1] == "\n"
395
735
  @cursor += 1
736
+ elsif @text[@cursor, 2] == "\r\n"
737
+ @cursor += 2
738
+ end
739
+ @lexer_table.pop
740
+ else
741
+ match_text
396
742
  end
743
+ end
397
744
 
398
- #Matches hline tag that start with "-"
399
- # "\n----" { return HLINE; }
400
- def match_line
401
- if at_start_of_line? and @text[@cursor, 4] == "----"
402
- @next_token[0] = :HLINE
403
- @cursor += 4
404
- else
405
- match_other
406
- end
745
+ def match_space_in_code
746
+ match_text
747
+ end
748
+
749
+ def match_left_angle_in_code
750
+ if @text[@cursor, 7] == '</code>'
751
+ end_span(:TAG, 'code')
752
+ @cursor += 7
753
+ @lexer_table.pop
754
+ else
755
+ match_left_angle
407
756
  end
757
+ end
408
758
 
409
- #Matches signature
410
- # "~~~~~" { return SIGNATURE_DATE; }
411
- # "~~~~" { return SIGNATURE_FULL; }
412
- # "~~~" { return SIGNATURE_NAME; }
413
- def match_signature
414
- if @text[@cursor, 5] == "~~~~~"
415
- @next_token[0] = :SIGNATURE_DATE
416
- @cursor += 5
417
- elsif @text[@cursor, 4] == "~~~~"
418
- @next_token[0] = :SIGNATURE_FULL
419
- @cursor += 4
420
- elsif @text[@cursor, 3] == "~~~"
421
- @next_token[0] = :SIGNATURE_NAME
422
- @cursor += 3
423
- else
424
- match_other
425
- end
759
+ def match_left_curly
760
+ if at_start_of_line? and @text[@cursor + 1, 1] == '|'
761
+ start_span(:TABLE, "{|")
762
+ @cursor += 2
763
+ @lexer_table.push(@table_lexer_table)
764
+ elsif @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
765
+ start_span(:TEMPLATE, "{{")
766
+ @cursor += 2
767
+ @lexer_table.push(@template_lexer_table)
768
+ else
769
+ match_text
426
770
  end
427
-
428
- def match_tag_start
429
- if @text[@cursor, 8] == '<nowiki>'
430
- @cursor += 8
431
- @token_start = @cursor
432
- @current_lexer_table = @tag_lexer_table
433
- @current_lexer_table[@text[@cursor, 1]].call
434
- else
435
- match_other
436
- end
771
+ end
772
+
773
+ def match_left_curly_in_template
774
+ if @text[@cursor + 1, 1] == '{' and @text[@cursor + 2, 2] != "}}"
775
+ start_span(:TEMPLATE, "{{")
776
+ @cursor += 2
777
+ @lexer_table.push(@template_lexer_table)
778
+ else
779
+ match_text
437
780
  end
438
-
439
- def match_tag_end
440
- if @text[@cursor, 9] == '</nowiki>'
441
- @cursor += 9
442
- @token_start = @cursor
443
- @current_lexer_table = @lexer_table
444
- @current_lexer_table[@text[@cursor, 1]].call
445
- else
446
- match_other
447
- end
781
+ end
782
+
783
+ def match_right_curly_in_template
784
+ if @text[@cursor + 1, 1] == '}'
785
+ end_span(:TEMPLATE, "}}")
786
+ @cursor += 2
787
+ @lexer_table.pop
788
+ else
789
+ match_text
448
790
  end
449
-
450
- def match_table
451
- if at_start_of_line? and @text[@cursor + 1, 1] == '|'
452
- tokens = []
453
- if @para
454
- tokens = end_tokens_for_open_pairs
455
- if @tokens.last and @tokens.last[0] == :PARA_START and empty_text_token?
456
- tokens.pop
457
- else
458
- tokens << [:PARA_END, ""]
459
- end
460
- @para = false
461
- end
462
- tokens << [:TABLE_START, '']
463
- @pair_stack.push [:TABLE_START, '']
464
- @next_token = tokens.shift
465
- @sub_tokens = tokens
466
- @cursor += 2
467
- else
468
- match_other
469
- end
791
+ end
792
+
793
+ def match_pipe_in_template
794
+ if @tokens.last[0] == :TEMPLATE_START
795
+ @lexer_table.pop
796
+ @lexer_table.push(@template_param_lexer_table)
470
797
  end
798
+ append_to_tokens([:INTLINKSEP, "|"])
799
+ @cursor += 1
800
+ end
471
801
 
472
- def match_table_head
473
- if at_start_of_line? and in_table?
474
- @cursor += 1
475
- tokens = []
476
- if @pair_stack.last[0] == :CELL_START
477
- tokens << [:CELL_END, '']
478
- @pair_stack.pop
479
- elsif @pair_stack.last[0] == :HEAD_START
480
- tokens << [:HEAD_END, '']
481
- @pair_stack.pop
482
- elsif @pair_stack.last[0] != :ROW_START
483
- tokens << [:ROW_START, '']
484
- @pair_stack.push [:ROW_START, '']
485
- end
486
- tokens << [:HEAD_START, '']
487
- @pair_stack.push [:HEAD_START, '']
488
- @next_token = tokens.shift
489
- @sub_tokens = tokens
490
- else
491
- match_other
492
- end
802
+ def match_bang_in_table
803
+ if at_start_of_line?
804
+ if @context.last == :CELL
805
+ end_span(:CELL)
806
+ elsif @context.last == :HEAD
807
+ end_span(:HEAD)
808
+ elsif @context.last != :ROW
809
+ start_span(:ROW)
810
+ end
811
+ start_span(:HEAD, "!")
812
+ @cursor += 1
813
+ else
814
+ match_text
493
815
  end
816
+ end
494
817
 
495
- def match_link_sep_or_table_cell
496
- if in_table?
497
- tokens = []
498
- if at_start_of_line?
499
- @cursor += 1
500
- close_table_cell(tokens)
501
- if ['-', '}'].include?(@text[@cursor, 1])
502
- close_table_row(tokens)
503
- if @text[@cursor, 1] == '-'
504
- tokens << [:ROW_START, '']
505
- @pair_stack.push [:ROW_START, '']
506
- else
507
- tokens << [:TABLE_END, '']
508
- @pair_stack.pop
509
- end
510
- @cursor += 1
511
- else
512
- if @pair_stack.last[0] != :ROW_START
513
- tokens << [:ROW_START, '']
514
- @pair_stack.push [:ROW_START, '']
515
- end
516
- tokens << [:CELL_START, '']
517
- @pair_stack.push [:CELL_START, '']
518
- end
519
- @next_token = tokens.shift
520
- @sub_tokens = tokens
521
- elsif @text[@cursor + 1, 1] == '|'
522
- @cursor += 2
523
- close_table_cell(tokens)
524
- next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
525
- tokens << next_token
526
- @pair_stack.push next_token
527
- @next_token = tokens.shift
528
- @sub_tokens = tokens
529
- else
530
- match_link_sep
531
- end
532
- else
533
- match_link_sep
818
+ def match_pipe_in_table
819
+ if at_start_of_line?
820
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
821
+ if @text[@cursor+1, 1] == '-'
822
+ end_span(:ROW) if context.include? :ROW
823
+ start_span(:ROW, "|-")
824
+ @cursor += 2
825
+ elsif @text[@cursor+1, 1] == '}'
826
+ end_span(:TABLE, "|}")
827
+ @cursor += 2
828
+ @lexer_table.pop
829
+ skip_newline
830
+ else
831
+ if context.include? :CELL
832
+ end_span(:CELL)
833
+ elsif context.include? :HEAD
834
+ end_span(:HEAD)
534
835
  end
836
+ start_span(:ROW) unless @context.last == :ROW
837
+ start_span(:CELL, "|")
838
+ @cursor += 1
839
+ end
840
+ elsif @text[@cursor + 1, 1] == '|'
841
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
842
+ if context.include?:CELL
843
+ end_span(:CELL)
844
+ start_span(:CELL, "||")
845
+ elsif context.include? :HEAD
846
+ end_span(:HEAD)
847
+ start_span(:HEAD, "||")
848
+ end
849
+ @cursor += 2
850
+ else
851
+ context = @context[@context.rindex(:TABLE) + 1 .. -1]
852
+ if context.include? :CELL
853
+ end_span(:CELL, "attributes")
854
+ start_span(:CELL, "|")
855
+ @char = '' #WTF?
856
+ #CHECK: this usecase and cursor increments
857
+ end
858
+ match_text
535
859
  end
860
+ end
536
861
 
537
- #Matches a new line and breaks the paragraph if two newline characters
538
- #("\n\n") are met.
539
- def match_newline
540
- if @text[@cursor, 2] == "\n\n"
541
- if @para
542
- @sub_tokens = end_tokens_for_open_pairs
543
- @sub_tokens << [:PARA_END, '']
544
- @sub_tokens << [:PARA_START, '']
545
- @next_token[0] = @sub_tokens.slice!(0)[0]
546
- @cursor += 2
547
- return
548
- end
549
- end
550
- match_other
551
- end
552
-
553
- #Matches a new line and breaks the paragraph if two carriage return - newline
554
- #sequences ("\r\n\r\n") are met.
555
- def match_carriagereturn
556
- if @text[@cursor, 4] == "\r\n\r\n"
557
- if @para
558
- @sub_tokens = end_tokens_for_open_pairs
559
- @sub_tokens << [:PARA_END, '']
560
- @sub_tokens << [:PARA_START, '']
561
- @next_token[0] = @sub_tokens.slice!(0)[0]
562
- @cursor += 4
563
- return
564
- end
565
- end
566
- match_other
862
+ def match_newline
863
+ if @text[@cursor, 2] == "\n\n"
864
+ @pending << "\n\n"
865
+ @cursor += 2
866
+ end_span(:PARA)
867
+ start_span(:PARA)
868
+ elsif @text[@cursor, 4] == "\r\n\r\n"
869
+ @pending << "\r\n\r\n"
870
+ @cursor += 4
871
+ end_span(:PARA)
872
+ start_span(:PARA)
873
+ else
874
+ match_text
567
875
  end
876
+ end
877
+
878
+ def match_newline_in_table
879
+ if @text[@cursor, 2] == "\n\n"
880
+ start_span(:PARA)
881
+ append_to_tokens([:TEXT, "\n\n"])
882
+ @cursor += 2
883
+ end_span(:PARA)
884
+ elsif @text[@cursor, 4] == "\r\n\r\n"
885
+ start_span(:PARA)
886
+ append_to_tokens([:TEXT, "\r\n\r\n"])
887
+ @cursor += 4
888
+ end_span(:PARA)
889
+ else
890
+ match_text
891
+ end
892
+ end
893
+
894
+ def match_semicolon
895
+ if at_start_of_line?
896
+ start_span(:DL)
897
+ start_span(:DT, ';')
898
+ @lexer_table.push(@entries_lexer_table)
899
+ @cursor += 1
900
+ else
901
+ match_text
902
+ end
903
+ end
904
+
905
+ def match_colon
906
+ if at_start_of_line?
907
+ start_span(:DL)
908
+ start_span(:DD, ':')
909
+ @lexer_table.push(@entries_lexer_table)
910
+ @cursor += 1
911
+ else
912
+ match_text
913
+ end
914
+ end
915
+
916
+ def match_colon_in_entries
917
+ if @context.include? :DD
918
+ end_span(:DD)
919
+ elsif @context.include? :DT
920
+ end_span(:DT)
921
+ end
922
+ start_span(:DD, ':')
923
+ @cursor += 1
924
+ end
925
+
926
+ def match_newline_in_entries
927
+ match_text
928
+ unless @text[@cursor, 1] == ':'
929
+ if @context.include? :DD
930
+ end_span(:DD)
931
+ elsif @context.include? :DT
932
+ end_span(:DT)
933
+ end
934
+ end_span(:DL)
935
+ @lexer_table.pop
936
+ end
937
+ end
938
+
939
+
940
+ #-- ================== Helper methods ================== ++#
941
+
942
+ # Returns true if the text cursor is on the first character of a line
943
+ def at_start_of_line?
944
+ @cursor == 0 or @text[@cursor - 1, 1] == "\n"
945
+ end
568
946
 
569
- #-- ================== Helper methods ================== ++#
947
+ # Returns true if the text cursor is after the last character of a line
948
+ def at_end_of_line?
949
+ @text[@cursor, 1] == "\n" or @text[@cursor, 1].nil?
950
+ end
570
951
 
571
- # Checks if we are lexing inside a resource link like
572
- # [[Image:example.png|100px|Embedded image]]
573
- def inside_resource_link
574
- if @pair_stack.last[0] == :INTLINKSTART
575
- pos = -1
576
- while((token = @tokens[pos][0]) != :INTLINKSTART)
577
- if token == :RESOURCE_SEP
578
- return true
579
- else
580
- pos -= 1
581
- end
582
- end
952
+ def blank_line?
953
+ i = @cursor
954
+ i += 1 while (@text[i,1] == ' ')
955
+ return (@text[i,1] == '' or (@text[i,1] == "\n") or (@text[i,2] == "\r\n"))
956
+ end
957
+
958
+ # Advances the text cursor to the next non-blank character, without appending
959
+ # any of the blank characters to the pending text buffer
960
+ def skip_whitespace
961
+ @cursor += 1 while @text[@cursor, 1] == ' '
962
+ end
963
+
964
+ # Advances the text cursor beyond the next newline sequence, if any. This is
965
+ # used to strip newlines after certain block-level elements, like section
966
+ # headings and tables, to prevent an empty paragraph when the block is followed
967
+ # by an extra newline sequence.
968
+ def skip_newline
969
+ if @text[@cursor, 2] == "\r\n"
970
+ @cursor += 2
971
+ elsif @text[@cursor, 1] == "\n"
972
+ @cursor += 1
973
+ end
974
+ end
975
+
976
+ # Extracts from the input text the sequence of characters consisting of the
977
+ # character or characters specified, and returns the sequence as a string. The
978
+ # text cursor is advanced to point to the next character after the sequence.
979
+ def extract_char_sequence(char)
980
+ sequence = ''
981
+ i = @cursor
982
+ if char.length == 1
983
+ while @text[i, 1] == char do
984
+ sequence << char
985
+ i += 1
986
+ end
987
+ else
988
+ chars = char.split('')
989
+ while chars.include?(@text[i, 1]) do
990
+ sequence << @text[i, 1]
991
+ i += 1
583
992
  end
584
- false
585
993
  end
994
+ sequence
995
+ end
996
+
997
+ # Opens list and list item spans for each item symbol in the string specified.
998
+ def open_list(symbols)
999
+ symbols.split('').each do |symbol|
1000
+ if symbol == '*'
1001
+ start_span(:UL)
1002
+ else
1003
+ start_span(:OL)
1004
+ end
1005
+ start_span(:LI)
1006
+ @cursor += symbol.length
1007
+ end
1008
+ end
1009
+
1010
+ # Closes list and list item spans for each item symbol in the string specified.
1011
+ def close_list(symbols)
1012
+ symbols.split('').reverse.each do |symbol|
1013
+ end_span(:LI)
1014
+ if symbol == '*'
1015
+ end_span(:UL)
1016
+ else
1017
+ end_span(:OL)
1018
+ end
1019
+ end
1020
+ end
1021
+
1022
+ # Open a token span for the symbol specified. This will append a token start
1023
+ # to the list of output tokens, and push the symbol onto the context stack. If
1024
+ # there is an open paragraph, and the symbol is a block element, then the
1025
+ # open paragraph will be closed (or, if empty, removed) before the token start
1026
+ # is appended.
1027
+ def start_span(symbol, text='')
1028
+ maybe_close_para(symbol, ['pre','table','p'].include?(text))
1029
+ @context << symbol
1030
+ append_to_tokens [(symbol.to_s + '_START').to_sym, text]
1031
+ end
586
1032
 
587
- #Checks if the token is placed at the start of the line.
588
- def at_start_of_line?
589
- if @cursor == 0 or @text[@cursor-1, 1] == "\n"
590
- true
1033
+ # Close a token span for the symbol specified. This will append an end token
1034
+ # to the list of output tokens, and pop the symbol from the context stack. Any
1035
+ # unclosed contexts on top of this symbol's context will also be close (this
1036
+ # generally happens when in-line markup is not terminated before a new block
1037
+ # begins). If the context is empty as a result, a new paragraph will be opened.
1038
+ def end_span(symbol, text='')
1039
+ while(@context.size > 0 and @context.last != symbol) do
1040
+ append_to_tokens [(@context.pop.to_s + '_END').to_sym, '']
1041
+ end
1042
+ @context.pop
1043
+ append_to_tokens [(symbol.to_s + '_END').to_sym, text]
1044
+ maybe_open_para(symbol)
1045
+ end
1046
+
1047
+ def empty_span(symbol, text, cursor_increment)
1048
+ maybe_close_para(symbol)
1049
+ append_to_tokens [symbol, text, @cursor, cursor_increment]
1050
+ @cursor += cursor_increment
1051
+ maybe_open_para(symbol)
1052
+ end
1053
+
1054
+ def maybe_close_para(symbol, force = false)
1055
+ if @context.size > 0 and (PARA_BREAK_ELEMENTS.include?(symbol) or force)
1056
+ i = 1
1057
+ i += 1 while INLINE_ELEMENTS.include?(@context[-i])
1058
+ if @context[-i] == :PARA
1059
+ if @pending.is_empty_token? and @tokens.last[0] == :PARA_START
1060
+ @context.pop
1061
+ @tokens.pop
591
1062
  else
592
- false
1063
+ (1 .. i).each do
1064
+ symbol = @context.pop
1065
+ append_to_tokens [(symbol.to_s + '_END').to_sym, '']
1066
+ end
593
1067
  end
1068
+ end
594
1069
  end
595
-
596
- def in_table?
597
- @pair_stack.include?([:TABLE_START, ''])
598
- end
599
-
600
- #Checks if the text at position contains the start of a link using any of
601
- #HTTP, HTTPS, MAILTO or FILE protocols
602
- def link_protocol?(position)
603
- return @text[position, @text.length - position] =~ %r{\A((http|https|file)://|mailto:)}
1070
+ end
1071
+
1072
+ def maybe_open_para(symbol)
1073
+ if @context.size == 0 and symbol != :PARA
1074
+ @tokens << [:PARA_START, '']
1075
+ @context << :PARA
604
1076
  end
605
-
606
- #Adjusts @token_start to skip leading whitespaces
607
- def strip_ws_from_token_start
608
- @token_start += 1 while @text[@token_start, 1] == " "
1077
+ end
1078
+
1079
+ def append_to_tokens(token)
1080
+ unless @pending.is_empty_token?
1081
+ @tokens.append_pending(@pending)
609
1082
  end
610
-
611
- #Returns true if the TEXT token is empty or contains newline only
612
- def empty_text_token?
613
- @current_token[0] == :TEXT and
614
- (@current_token[1] == '' or @current_token[1] == "\n" or @current_token[1] == "\r\n")
1083
+ @pending = TokenString.new(self)
1084
+ @tokens << token
1085
+ end
1086
+
1087
+
1088
+ class LexerTable
1089
+
1090
+ def initialize
1091
+ @tables = []
615
1092
  end
616
-
617
- #Returns true if the text is a list, i.e. starts with one of #;*: symbols
618
- #that indicate a list
619
- def text_is_list?(text)
620
- return text =~ /^[#;*:].*/
1093
+
1094
+ def push(table)
1095
+ @tables << table
1096
+ @table = table
621
1097
  end
622
-
623
- #Runs sublexer to tokenize sub_text
624
- def sub_lex(sub_text, strip_paragraphs=true)
625
- sub_lexer = MediaWikiLexer.new
626
- sub_tokens = sub_lexer.tokenize(sub_text)
627
- sub_tokens.pop #false token
628
- if strip_paragraphs and sub_tokens.size > 0
629
- #the last PARA_END token
630
- sub_tokens.pop if sub_tokens.last[0] == :PARA_END
631
- #the first PARA_START token
632
- sub_tokens.delete_at(0) if sub_tokens[0][0] == :PARA_START
633
- end
634
- sub_tokens
635
- end
636
-
637
- #Extract list contents of list type set by list_id variable.
638
- #Example list:
639
- # *a
640
- # **a
641
- #Extracted list with id "*" will look like:
642
- # a
643
- # *a
644
- def extract_list_contents(list_id)
645
- i = @cursor+1
646
- list = ""
647
- while i < @text.length
648
- curr = @text[i, 1]
649
- if (curr == "\n") and (@text[i+1, 1] != list_id)
650
- list+=curr
651
- break
652
- end
653
- if (curr == list_id) and (@text[i-1, 1] == "\n")
654
- list += "\n" if i + 1 == @text.length
655
- else
656
- list += curr
657
- end
658
- i += 1
659
- end
660
- list
661
- end
662
-
663
- def start_para
664
- @tokens << [:PARA_START, ""]
665
- @para = true
666
- end
667
-
668
- def end_para
669
- @tokens += end_tokens_for_open_pairs
670
- @tokens << [:PARA_END, ""]
671
- @para = false
672
- end
673
-
674
- def end_tokens_for_open_pairs
675
- tokens = []
676
- restore = []
677
- while(@pair_stack.size > 1) do
678
- last = @pair_stack.pop
679
- case last[0]
680
- when :ITALICSTART
681
- tokens << [:ITALICEND, '']
682
- when :BOLDSTART
683
- tokens << [:BOLDEND, '']
684
- when :INTLINKSTART
685
- tokens << [:INTLINKEND, '']
686
- when :LINKSTART
687
- tokens << [:LINKEND, '']
688
- when :TABLE_START
689
- tokens << [:TABLE_END, '']
690
- when :ROW_START
691
- tokens << [:ROW_END, '']
692
- when :CELL_START
693
- tokens << [:CELL_END, '']
694
- when :HEAD_START
695
- tokens << [:HEAD_END, '']
696
- else
697
- restore << last
698
- end
699
- end
700
- @pair_stack += restore.reverse
701
- tokens
702
- end
703
-
704
- def close_table_cell(tokens)
705
- restore = []
706
- last = @pair_stack.pop
707
- while (last[0] != :CELL_START and last[0] != :HEAD_START and last[0] != :ROW_START and last[0] != :TABLE_START) do
708
- case last[0]
709
- when :ITALICSTART
710
- tokens << [:ITALICEND, '']
711
- when :BOLDSTART
712
- tokens << [:BOLDEND, '']
713
- when :INTLINKSTART
714
- tokens << [:INTLINKEND, '']
715
- when :LINKSTART
716
- tokens << [:LINKEND, '']
717
- end
718
- last = @pair_stack.pop
719
- end
720
- if last[0] == :CELL_START
721
- tokens << [:CELL_END, '']
722
- elsif last[0] == :HEAD_START
723
- tokens << [:HEAD_END, '']
724
- else
725
- @pair_stack.push last
726
- end
1098
+
1099
+ def pop
1100
+ @tables.pop
1101
+ @table = @tables.last
727
1102
  end
728
1103
 
729
- def close_table_row(tokens)
730
- if @pair_stack.last[0] == :ROW_START
731
- @pair_stack.pop
732
- tokens << [:ROW_END, '']
733
- end
1104
+ def[] (char)
1105
+ @table[char]
734
1106
  end
1107
+
1108
+ end
735
1109
 
736
1110
  end
737
1111
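
A note on the new structure: the 0.5 lexer replaces the single character-to-method @lexer_table hash of 0.0.3 with a stack of context-specific dispatch tables (the small LexerTable class at the end of the diff). Entering a construct such as a heading, link, table, or tag pushes a table whose handlers override the defaults; leaving the construct pops back. A stripped-down illustration of that pattern (not the gem's code, just the idea):

    # Each table maps a character to a handler; pushing a table changes how
    # the next characters are dispatched, popping restores the previous rules.
    class TableStack
      def initialize
        @tables = []
      end

      def push(table)
        @tables << table
      end

      def pop
        @tables.pop
      end

      def [](char)
        @tables.last[char]
      end
    end

    default_table = { "'" => lambda { "quote handler" } }
    heading_table = default_table.merge("=" => lambda { "heading handler" })

    stack = TableStack.new
    stack.push(default_table)
    stack.push(heading_table)   # e.g. after "=" at the start of a line
    puts stack["="].call        # => "heading handler"
    stack.pop                   # heading finished, back to the default table
    puts stack["="].nil?        # => true (no special handling for "=" now)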