mediacloth 0.0.3 → 0.5
- data/README.md +36 -0
- data/lib/mediacloth/mediawikiast.rb +58 -1
- data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
- data/lib/mediacloth/mediawikilexer.rb +1030 -656
- data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
- data/lib/mediacloth/mediawikiparams.rb +1 -10
- data/lib/mediacloth/mediawikiparser.rb +939 -409
- data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
- data/lib/mediacloth/mediawikiparser.y +256 -52
- data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
- data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
- data/lib/mediacloth/mediawikiwalker.rb +72 -1
- data/lib/mediacloth.rb +33 -10
- data/test/data/ast1 +68 -0
- data/test/data/ast10 +196 -0
- data/test/data/ast11 +34 -0
- data/test/data/ast12 +39 -0
- data/test/data/ast13 +25 -0
- data/test/data/ast14 +13 -0
- data/test/data/ast15 +25 -0
- data/test/data/ast16 +17 -0
- data/test/data/ast17 +9 -0
- data/test/data/ast18 +21 -0
- data/test/data/ast19 +32 -0
- data/test/data/ast2 +4 -0
- data/test/data/ast20 +10 -0
- data/test/data/ast21 +27 -0
- data/test/data/ast22 +22 -0
- data/test/data/ast23 +5 -0
- data/test/data/ast3 +6 -0
- data/test/data/ast4 +122 -0
- data/test/data/ast5 +122 -0
- data/test/data/ast6 +22 -0
- data/test/data/ast7 +143 -0
- data/test/data/ast8 +3 -0
- data/test/data/ast9 +11 -0
- data/test/data/html1 +33 -5
- data/test/data/html10 +31 -27
- data/test/data/html11 +19 -0
- data/test/data/html12 +32 -0
- data/test/data/html13 +29 -0
- data/test/data/html14 +4 -0
- data/test/data/html15 +29 -0
- data/test/data/html16 +28 -0
- data/test/data/html17 +10 -0
- data/test/data/html18 +8 -0
- data/test/data/html19 +27 -0
- data/test/data/html2 +1 -1
- data/test/data/html20 +7 -0
- data/test/data/html21 +5 -0
- data/test/data/html22 +24 -0
- data/test/data/html23 +7 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +60 -11
- data/test/data/html5 +45 -6
- data/test/data/html6 +5 -5
- data/test/data/html7 +59 -1
- data/test/data/html8 +1 -1
- data/test/data/html9 +10 -2
- data/test/data/input1 +4 -0
- data/test/data/input11 +19 -0
- data/test/data/input12 +32 -0
- data/test/data/input13 +10 -0
- data/test/data/input14 +8 -0
- data/test/data/input15 +10 -0
- data/test/data/input16 +28 -0
- data/test/data/input17 +10 -0
- data/test/data/input18 +16 -0
- data/test/data/input19 +29 -0
- data/test/data/input20 +8 -0
- data/test/data/input21 +18 -0
- data/test/data/input22 +20 -0
- data/test/data/input23 +8 -0
- data/test/data/input4 +13 -1
- data/test/data/input5 +45 -4
- data/test/data/input7 +25 -1
- data/test/data/lex1 +17 -18
- data/test/data/lex10 +57 -87
- data/test/data/lex11 +18 -0
- data/test/data/lex12 +32 -0
- data/test/data/lex13 +3 -0
- data/test/data/lex14 +1 -0
- data/test/data/lex15 +3 -0
- data/test/data/lex16 +27 -0
- data/test/data/lex17 +9 -0
- data/test/data/lex18 +4 -0
- data/test/data/lex19 +27 -0
- data/test/data/lex2 +2 -2
- data/test/data/lex20 +7 -0
- data/test/data/lex21 +4 -0
- data/test/data/lex22 +3 -0
- data/test/data/lex23 +7 -0
- data/test/data/lex3 +1 -1
- data/test/data/lex4 +35 -29
- data/test/data/lex5 +57 -18
- data/test/data/lex6 +7 -7
- data/test/data/lex7 +42 -18
- data/test/data/lex8 +1 -1
- data/test/data/lex9 +6 -6
- data/test/dataproducers/ast.rb +24 -0
- data/test/dataproducers/html.rb +11 -12
- data/test/dataproducers/lex.rb +9 -4
- data/test/debugwalker.rb +25 -11
- data/test/htmlgenerator.rb +170 -13
- data/test/lexer.rb +626 -83
- data/test/linkhandler.rb +39 -0
- data/test/parser.rb +176 -9
- data/test/signedwikigenerator.rb +113 -0
- metadata +158 -79
- data/README +0 -37
- data/lib/mediacloth/mediawikilexer.rb~ +0 -491
- data/lib/mediacloth/mediawikiparser.y~ +0 -210
- data/test/data/result1 +0 -48
- data/test/dataproducers/html.rb~ +0 -24
- data/test/dataproducers/lex.rb~ +0 -15
@@ -1,737 +1,1111 @@ data/lib/mediacloth/mediawikilexer.rb

[Per-line diff omitted: the side-by-side diff rendering lost the content of most removed 0.0.3 lines, leaving only gutter numbers and change markers. The recoverable changes amount to a rewrite of the lexer.]

- 0.0.3: a single character-keyed match table (match_italic_or_bold, match_section, match_link_start, match_link_end, match_link_sep_or_table_cell, match_space, match_list, match_line, match_signature, match_inline_link, match_newline, match_carriagereturn, match_tag_start, match_table, match_table_head), a @pair_stack of unclosed start tokens, explicit start_para/end_para bookkeeping, and helpers such as sub_lex, extract_list_contents, end_tokens_for_open_pairs and close_table_cell. A header comment documented how to drive the lexer from a RACC-generated parser (parser.lexer = MediaWikiLexer.new; parse calls lexer.tokenize and then do_parse, pulling tokens via lexer.lex).
- 0.5: built on strscan, with new TokenString, TokenArray and LexerTable helper classes. Tokens now carry position data ([SYMBOL, value, index, length]) and the stream ends with [false, false, 0, 0]. A stack of context-specific lexer tables (formatting, inline, default, markup, heading, external link, internal link, category, resource link, table, list items, definition entries, indent, pre, code, nowiki, paste, math, template, template parameter) replaces the single table; token emission goes through start_span/end_span/empty_span with paragraph handling in maybe_close_para/maybe_open_para; constants (INLINE_ELEMENTS, BLOCK_ELEMENTS, PARA_BREAK_ELEMENTS, HTML_TAGS, WIKI_TAGS, TAGS_WITHOUT_CLOSE_TAG plus character lookup tables) drive tag, entity and URL matching; and lex returns the token stream one token at a time for the RACC parser.
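
For orientation, a minimal usage sketch of the rewritten lexer follows. MediaWikiLexer#tokenize, #lex and the token shape are taken from the 0.5 code in this diff; the MediaWikiParser wiring mirrors the comment block that this diff removes from the 0.0.3 file header, so treat it as an assumption about the 0.5 parser API rather than documented behaviour.

# Minimal sketch, assuming the gem's top-level require loads the lexer and
# parser classes. The token shape and MediaWikiLexer#tokenize/#lex come from
# the 0.5 source shown in this diff; the parser wiring follows the comment
# removed from the 0.0.3 header and is not verified against 0.5.
require 'mediacloth'

lexer  = MediaWikiLexer.new
tokens = lexer.tokenize("''italic'' and '''bold'''\n")
tokens.each { |t| p t }
# => [:PARA_START, "", ...], [:ITALIC_START, "''", ...], [:TEXT, "italic", ...], ...
#    terminated by [false, false, 0, 0]

# Driving the RACC-generated parser token by token (per the removed comment):
parser = MediaWikiParser.new
parser.lexer = lexer
ast = parser.parse("''italic'' and '''bold'''\n")   # tokenize, then do_parse via lexer.lex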