mediacloth 0.0.3 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +36 -0
- data/lib/mediacloth/mediawikiast.rb +58 -1
- data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
- data/lib/mediacloth/mediawikilexer.rb +1030 -656
- data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
- data/lib/mediacloth/mediawikiparams.rb +1 -10
- data/lib/mediacloth/mediawikiparser.rb +939 -409
- data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
- data/lib/mediacloth/mediawikiparser.y +256 -52
- data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
- data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
- data/lib/mediacloth/mediawikiwalker.rb +72 -1
- data/lib/mediacloth.rb +33 -10
- data/test/data/ast1 +68 -0
- data/test/data/ast10 +196 -0
- data/test/data/ast11 +34 -0
- data/test/data/ast12 +39 -0
- data/test/data/ast13 +25 -0
- data/test/data/ast14 +13 -0
- data/test/data/ast15 +25 -0
- data/test/data/ast16 +17 -0
- data/test/data/ast17 +9 -0
- data/test/data/ast18 +21 -0
- data/test/data/ast19 +32 -0
- data/test/data/ast2 +4 -0
- data/test/data/ast20 +10 -0
- data/test/data/ast21 +27 -0
- data/test/data/ast22 +22 -0
- data/test/data/ast23 +5 -0
- data/test/data/ast3 +6 -0
- data/test/data/ast4 +122 -0
- data/test/data/ast5 +122 -0
- data/test/data/ast6 +22 -0
- data/test/data/ast7 +143 -0
- data/test/data/ast8 +3 -0
- data/test/data/ast9 +11 -0
- data/test/data/html1 +33 -5
- data/test/data/html10 +31 -27
- data/test/data/html11 +19 -0
- data/test/data/html12 +32 -0
- data/test/data/html13 +29 -0
- data/test/data/html14 +4 -0
- data/test/data/html15 +29 -0
- data/test/data/html16 +28 -0
- data/test/data/html17 +10 -0
- data/test/data/html18 +8 -0
- data/test/data/html19 +27 -0
- data/test/data/html2 +1 -1
- data/test/data/html20 +7 -0
- data/test/data/html21 +5 -0
- data/test/data/html22 +24 -0
- data/test/data/html23 +7 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +60 -11
- data/test/data/html5 +45 -6
- data/test/data/html6 +5 -5
- data/test/data/html7 +59 -1
- data/test/data/html8 +1 -1
- data/test/data/html9 +10 -2
- data/test/data/input1 +4 -0
- data/test/data/input11 +19 -0
- data/test/data/input12 +32 -0
- data/test/data/input13 +10 -0
- data/test/data/input14 +8 -0
- data/test/data/input15 +10 -0
- data/test/data/input16 +28 -0
- data/test/data/input17 +10 -0
- data/test/data/input18 +16 -0
- data/test/data/input19 +29 -0
- data/test/data/input20 +8 -0
- data/test/data/input21 +18 -0
- data/test/data/input22 +20 -0
- data/test/data/input23 +8 -0
- data/test/data/input4 +13 -1
- data/test/data/input5 +45 -4
- data/test/data/input7 +25 -1
- data/test/data/lex1 +17 -18
- data/test/data/lex10 +57 -87
- data/test/data/lex11 +18 -0
- data/test/data/lex12 +32 -0
- data/test/data/lex13 +3 -0
- data/test/data/lex14 +1 -0
- data/test/data/lex15 +3 -0
- data/test/data/lex16 +27 -0
- data/test/data/lex17 +9 -0
- data/test/data/lex18 +4 -0
- data/test/data/lex19 +27 -0
- data/test/data/lex2 +2 -2
- data/test/data/lex20 +7 -0
- data/test/data/lex21 +4 -0
- data/test/data/lex22 +3 -0
- data/test/data/lex23 +7 -0
- data/test/data/lex3 +1 -1
- data/test/data/lex4 +35 -29
- data/test/data/lex5 +57 -18
- data/test/data/lex6 +7 -7
- data/test/data/lex7 +42 -18
- data/test/data/lex8 +1 -1
- data/test/data/lex9 +6 -6
- data/test/dataproducers/ast.rb +24 -0
- data/test/dataproducers/html.rb +11 -12
- data/test/dataproducers/lex.rb +9 -4
- data/test/debugwalker.rb +25 -11
- data/test/htmlgenerator.rb +170 -13
- data/test/lexer.rb +626 -83
- data/test/linkhandler.rb +39 -0
- data/test/parser.rb +176 -9
- data/test/signedwikigenerator.rb +113 -0
- metadata +158 -79
- data/README +0 -37
- data/lib/mediacloth/mediawikilexer.rb~ +0 -491
- data/lib/mediacloth/mediawikiparser.y~ +0 -210
- data/test/data/result1 +0 -48
- data/test/dataproducers/html.rb~ +0 -24
- data/test/dataproducers/lex.rb~ +0 -15
|
@@ -1,737 +1,1111 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
#
|
|
9
|
-
#Inside RACC-generated parser:
|
|
10
|
-
# ...
|
|
11
|
-
# ---- inner ----
|
|
12
|
-
# attr_accessor :lexer
|
|
13
|
-
# def parse(input)
|
|
14
|
-
# lexer.tokenize(input)
|
|
15
|
-
# return do_parse
|
|
16
|
-
# end
|
|
17
|
-
# def next_token
|
|
18
|
-
# return @lexer.lex
|
|
19
|
-
# end
|
|
20
|
-
# ...
|
|
21
|
-
# parser = MediaWikiParser.new
|
|
22
|
-
# parser.lexer = MediaWikiLexer.new
|
|
23
|
-
# parser.parse(input)
|
|
24
|
-
class MediaWikiLexer
|
|
25
|
-
|
|
26
|
-
#Initializes the lexer with a match table.
|
|
27
|
-
#
|
|
28
|
-
#The match table tells the lexer which method to invoke
|
|
29
|
-
#on given input char during "tokenize" phase.
|
|
30
|
-
def initialize
|
|
31
|
-
@position = 0
|
|
32
|
-
@pair_stack = [[false, false]] #stack of tokens for which a pair should be found
|
|
33
|
-
@list_stack = []
|
|
34
|
-
# Default lexer table
|
|
35
|
-
@lexer_table = Hash.new(method(:match_other))
|
|
36
|
-
@lexer_table["'"] = method(:match_italic_or_bold)
|
|
37
|
-
@lexer_table["="] = method(:match_section)
|
|
38
|
-
@lexer_table["["] = method(:match_link_start)
|
|
39
|
-
@lexer_table["]"] = method(:match_link_end)
|
|
40
|
-
@lexer_table["|"] = method(:match_link_sep_or_table_cell)
|
|
41
|
-
@lexer_table[" "] = method(:match_space)
|
|
42
|
-
@lexer_table["*"] = method(:match_list)
|
|
43
|
-
@lexer_table["#"] = method(:match_list)
|
|
44
|
-
@lexer_table[";"] = method(:match_list)
|
|
45
|
-
@lexer_table[":"] = method(:match_list)
|
|
46
|
-
@lexer_table["-"] = method(:match_line)
|
|
47
|
-
@lexer_table["~"] = method(:match_signature)
|
|
48
|
-
@lexer_table["h"] = method(:match_inline_link)
|
|
49
|
-
@lexer_table["\n"] = method(:match_newline)
|
|
50
|
-
@lexer_table["\r"] = method(:match_carriagereturn)
|
|
51
|
-
@lexer_table["<"] = method(:match_tag_start)
|
|
52
|
-
@lexer_table["{"] = method(:match_table)
|
|
53
|
-
@lexer_table["!"] = method(:match_table_head)
|
|
54
|
-
# Lexer table used when inside :match_tag_start ... :match_tag_end
|
|
55
|
-
@tag_lexer_table = Hash.new(method(:match_other))
|
|
56
|
-
@tag_lexer_table["<"] = method(:match_tag_end)
|
|
57
|
-
# Begin lexing in default state
|
|
58
|
-
@current_lexer_table = @lexer_table
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
#Transforms input stream (string) into the stream of tokens.
|
|
62
|
-
#Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
|
|
63
|
-
#This array can be given as input token by token to a RACC-based parser with no
|
|
64
|
-
#modification. The last token [false, false] indicates EOF.
|
|
65
|
-
def tokenize(input)
|
|
66
|
-
@tokens = []
|
|
67
|
-
start_para
|
|
68
|
-
@cursor = 0
|
|
69
|
-
@text = input
|
|
70
|
-
@next_token = []
|
|
71
|
-
|
|
72
|
-
#This tokenizer algorithm assumes that everything that is not
|
|
73
|
-
#matched by the lexer is going to be a :TEXT token. Otherwise it's the usual
|
|
74
|
-
#lexer algorithm which calls methods from the match table to define the next tokens.
|
|
75
|
-
while (@cursor < @text.length)
|
|
76
|
-
@current_token = [:TEXT, ''] unless @current_token
|
|
77
|
-
@token_start = @cursor
|
|
78
|
-
@char = @text[@cursor, 1]
|
|
79
|
-
|
|
80
|
-
if @current_lexer_table[@char].call == :TEXT
|
|
81
|
-
@current_token[1] += @text[@token_start, 1]
|
|
82
|
-
else
|
|
83
|
-
#skip empty :TEXT tokens
|
|
84
|
-
unless empty_text_token?
|
|
85
|
-
@tokens << @current_token
|
|
86
|
-
unless para_breaker?(@next_token[0]) or in_block?
|
|
87
|
-
#if no paragraph was previously started
|
|
88
|
-
#then we should start it
|
|
89
|
-
start_para if !@para
|
|
90
|
-
else
|
|
91
|
-
#if we already have a paragraph this is the time to close it
|
|
92
|
-
end_para if @para
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
if para_breaker?(@next_token[0])
|
|
98
|
-
if @tokens.last and @tokens.last[0] == :PARA_START
|
|
99
|
-
#we need to remove para start token because no para end is possible
|
|
100
|
-
@tokens.pop
|
|
101
|
-
@para = false
|
|
102
|
-
elsif @para
|
|
103
|
-
end_para
|
|
104
|
-
end
|
|
105
|
-
end
|
|
1
|
+
require 'strscan'
|
|
2
|
+
|
|
3
|
+
class String
|
|
4
|
+
def is_empty_token?
|
|
5
|
+
self.size == 0 or self == "\n" or self == "\r\n"
|
|
6
|
+
end
|
|
7
|
+
end
|
|
106
8
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
9
|
+
# Class for storing text tokens data - index and text
|
|
10
|
+
class TokenString < String
|
|
11
|
+
attr_reader :idx
|
|
12
|
+
|
|
13
|
+
def initialize(lexer, text = '')
|
|
14
|
+
@lexer = lexer
|
|
15
|
+
@idx = 0
|
|
16
|
+
super(text)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def <<(pending_text)
|
|
20
|
+
# If TokenString.length is 0 and we are pushing some text
|
|
21
|
+
# then at this moment we can retrieve this token's index
|
|
22
|
+
if length == 0
|
|
23
|
+
@idx = @lexer.cursor
|
|
24
|
+
end
|
|
25
|
+
super(pending_text)
|
|
26
|
+
end
|
|
27
|
+
end
|
|
115
28
|
|
|
116
|
-
|
|
117
|
-
|
|
29
|
+
class TokenArray < Array
|
|
30
|
+
def initialize(lexer)
|
|
31
|
+
@lexer = lexer
|
|
32
|
+
end
|
|
118
33
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
34
|
+
def <<(token)
|
|
35
|
+
if @lexer.tokens.last && (@lexer.tokens.last[3].nil? || @lexer.tokens.last[3] == 0)
|
|
36
|
+
@lexer.tokens.last[3] = @lexer.cursor - @lexer.tokens.last[2]
|
|
122
37
|
end
|
|
123
|
-
|
|
124
|
-
|
|
38
|
+
token[2] = @lexer.cursor
|
|
39
|
+
super(token)
|
|
40
|
+
end
|
|
125
41
|
|
|
126
|
-
|
|
127
|
-
if @tokens.last
|
|
128
|
-
@tokens.
|
|
129
|
-
@para = false
|
|
130
|
-
else
|
|
131
|
-
end_para if @para
|
|
42
|
+
def append_pending(text)
|
|
43
|
+
if @lexer.tokens.last && @lexer.tokens.last[3].nil?
|
|
44
|
+
@lexer.tokens.last[3] = text.idx - @lexer.tokens.last[2]
|
|
132
45
|
end
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
@tokens
|
|
46
|
+
token = [:TEXT, text, text.idx, text.length]
|
|
47
|
+
push(token)
|
|
136
48
|
end
|
|
137
49
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
50
|
+
def to_s
|
|
51
|
+
string_copy = ""
|
|
52
|
+
each do |token|
|
|
53
|
+
string_copy << "#{token[0..1]}[#{token[2]}, #{token[3]}]"
|
|
54
|
+
end
|
|
55
|
+
string_copy
|
|
143
56
|
end
|
|
144
57
|
|
|
58
|
+
end
|
|
145
59
|
|
|
146
|
-
private
|
|
147
|
-
#Returns true if the token breaks the paragraph.
|
|
148
|
-
def para_breaker?(token)
|
|
149
|
-
[:SECTION_START, :SECTION_END,
|
|
150
|
-
:TABLE_START, :TABLE_END, :ROW_START, :ROW_END, :HEAD_START, :HEAD_END, :CELL_START, :CELL_END,
|
|
151
|
-
:UL_START, :UL_END, :OL_START, :OL_END,
|
|
152
|
-
:DL_START, :DL_END, :HLINE, :PRE].include?(token)
|
|
153
|
-
end
|
|
154
60
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
61
|
+
class MediaWikiLexer
|
|
62
|
+
|
|
63
|
+
INLINE_ELEMENTS = [:LINK, :INTLINK, :BOLD, :ITALIC]
|
|
64
|
+
BLOCK_ELEMENTS = [:PARA, :PRE, :PREINDENT, :UL, :OL, :DL, :LI, :SECTION, :TABLE, :ROW, :CELL, :HEAD]
|
|
65
|
+
PARA_BREAK_ELEMENTS = [:UL, :OL, :DL, :PRE, :PREINDENT, :PASTE_START, :SECTION, :TABLE, :HLINE, :KEYWORD]
|
|
66
|
+
|
|
67
|
+
NAME_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-]/ ? true : false}
|
|
68
|
+
TOKEN_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[a-zA-Z0-9_\-.;:?&@~=#%\/]/ ? true : false}
|
|
69
|
+
PUNCTUATION_CHAR_TABLE = (0 .. 255).collect{|n| n.chr =~ /[\.,;:\-?]/ ? true : false}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
HTML_TAGS = %w{ a abbr acronym address applet area b base basefont bdo big blockquote body br
|
|
73
|
+
button caption center cite code col colgroup dd del dir div dfn dl dt em fieldset font form frame
|
|
74
|
+
frameset h1 h2 h3 h4 h5 h6 head hr html i iframe img input ins isindex kbd label legend li link map
|
|
75
|
+
menu meta noframes noscript object ol optgroup option p param pre q s samp script select small span
|
|
76
|
+
strike strong style sub sup table tbody td textarea tfoot th thead title tr tt u ul var xmp }
|
|
77
|
+
WIKI_TAGS = %w{ nowiki math paste }
|
|
78
|
+
TAGS_WITHOUT_CLOSE_TAG = %w{ br hr img }
|
|
79
|
+
|
|
80
|
+
attr_reader :cursor
|
|
81
|
+
attr_reader :tokens
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def initialize
|
|
85
|
+
# Current position in token list
|
|
86
|
+
@position = 0
|
|
159
87
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
88
|
+
# Lexer table of methods that handle only formatting, e.g. bold or italicized
|
|
89
|
+
# text; or spans of XHTML, or wiki-escape, markup
|
|
90
|
+
@formatting_lexer_table = {}
|
|
91
|
+
@formatting_lexer_table["'"] = method(:match_quote)
|
|
92
|
+
@formatting_lexer_table["<"] = method(:match_left_angle)
|
|
93
|
+
@formatting_lexer_table["&"] = method(:match_ampersand)
|
|
94
|
+
@formatting_lexer_table["{"] = method(:match_left_curly)
|
|
95
|
+
|
|
96
|
+
# Lexer table of methods that handle everything that may occur in-line in
|
|
97
|
+
# addition to formatting, i.e. links and signatures
|
|
98
|
+
@inline_lexer_table = @formatting_lexer_table.dup
|
|
99
|
+
@inline_lexer_table["["] = method(:match_left_square)
|
|
100
|
+
@inline_lexer_table["~"] = method(:match_tilde)
|
|
101
|
+
@inline_lexer_table["h"] = method(:match_h_char)
|
|
102
|
+
|
|
103
|
+
# Default lexer table, which includes all in-line formatting and links, plus
|
|
104
|
+
# methods that handle constructs that begin on a newline
|
|
105
|
+
@default_lexer_table = @inline_lexer_table.dup
|
|
106
|
+
@default_lexer_table[" "] = method(:match_space)
|
|
107
|
+
@default_lexer_table["="] = method(:match_equal)
|
|
108
|
+
@default_lexer_table["*"] = method(:match_star)
|
|
109
|
+
@default_lexer_table["#"] = method(:match_hash)
|
|
110
|
+
@default_lexer_table[":"] = method(:match_colon)
|
|
111
|
+
@default_lexer_table[";"] = method(:match_semicolon)
|
|
112
|
+
@default_lexer_table["-"] = method(:match_dash)
|
|
113
|
+
@default_lexer_table["_"] = method(:match_underscore)
|
|
114
|
+
@default_lexer_table["\n"] = method(:match_newline)
|
|
115
|
+
@default_lexer_table["\r"] = method(:match_newline)
|
|
116
|
+
|
|
117
|
+
# Lexer table used inside spans of markup, wherein spans of newlines are not
|
|
118
|
+
# automatically treated as paragraphs.
|
|
119
|
+
@markup_lexer_table = @default_lexer_table.dup
|
|
120
|
+
@markup_lexer_table["\n"] = nil
|
|
121
|
+
@markup_lexer_table["\r"] = nil
|
|
122
|
+
|
|
123
|
+
# Lexer table used inside of headings
|
|
124
|
+
@heading_lexer_table = @inline_lexer_table.dup
|
|
125
|
+
@heading_lexer_table["="] = method(:match_equal_in_heading)
|
|
126
|
+
@heading_lexer_table["\n"] = method(:match_newline_in_heading)
|
|
127
|
+
|
|
128
|
+
# Lexer table used inside the left half of an external link
|
|
129
|
+
@link_lexer_table = {}
|
|
130
|
+
@link_lexer_table["]"] = method(:match_right_square_in_link)
|
|
131
|
+
@link_lexer_table["\n"] = method(:match_newline_in_link)
|
|
132
|
+
@link_lexer_table["\r"] = method(:match_newline_in_link)
|
|
133
|
+
@link_lexer_table[" "] = method(:match_space_in_link)
|
|
134
|
+
|
|
135
|
+
# Lexer table used inside the right half of an external link, or the right
|
|
136
|
+
# half of an internal link
|
|
137
|
+
@link_opt_lexer_table = @inline_lexer_table.dup
|
|
138
|
+
@link_opt_lexer_table["]"] = method(:match_right_square_in_link)
|
|
139
|
+
@link_opt_lexer_table["\n"] = method(:match_newline_in_link)
|
|
140
|
+
@link_opt_lexer_table["\r"] = method(:match_newline_in_link)
|
|
141
|
+
|
|
142
|
+
# Lexer table used inside the left half of an internal link or internal
|
|
143
|
+
# resource link
|
|
144
|
+
@intlink_lexer_table = {}
|
|
145
|
+
@intlink_lexer_table["]"] = method(:match_right_square_in_intlink)
|
|
146
|
+
@intlink_lexer_table["\r"] = method(:match_newline_in_intlink)
|
|
147
|
+
@intlink_lexer_table["\n"] = method(:match_newline_in_intlink)
|
|
148
|
+
@intlink_lexer_table[":"] = method(:match_colon_in_intlink)
|
|
149
|
+
@intlink_lexer_table["|"] = method(:match_pipe_in_intlink)
|
|
150
|
+
@intlink_lexer_table["C"] = method(:match_c_char_in_intlink)
|
|
151
|
+
|
|
152
|
+
# Lexer table used inside the category name of the left half of an
|
|
153
|
+
# internal link
|
|
154
|
+
@intlink_cat_lexer_table = {}
|
|
155
|
+
@intlink_cat_lexer_table["]"] = method(:match_right_square_in_intlink)
|
|
156
|
+
@intlink_cat_lexer_table["\r"] = method(:match_newline_in_intlink)
|
|
157
|
+
@intlink_cat_lexer_table["\n"] = method(:match_newline_in_intlink)
|
|
158
|
+
@intlink_cat_lexer_table["|"] = method(:match_pipe_in_intlink)
|
|
159
|
+
|
|
160
|
+
# Lexer table used inside the right half of an internal link
|
|
161
|
+
@intlink_opt_lexer_table = @formatting_lexer_table.dup
|
|
162
|
+
@intlink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
|
|
163
|
+
@intlink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
|
|
164
|
+
@intlink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
|
|
165
|
+
|
|
166
|
+
# Lexer table used inside the right half of an internal resource link
|
|
167
|
+
@resourcelink_opt_lexer_table = @inline_lexer_table.dup
|
|
168
|
+
@resourcelink_opt_lexer_table["]"] = method(:match_right_square_in_intlink)
|
|
169
|
+
@resourcelink_opt_lexer_table["\n"] = method(:match_newline_in_intlink)
|
|
170
|
+
@resourcelink_opt_lexer_table["\r"] = method(:match_newline_in_intlink)
|
|
171
|
+
@resourcelink_opt_lexer_table["|"] = method(:match_pipe_in_intlink)
|
|
172
|
+
|
|
173
|
+
# Lexer table used to parse tables
|
|
174
|
+
@table_lexer_table = @inline_lexer_table.dup
|
|
175
|
+
@table_lexer_table["*"] = method(:match_star)
|
|
176
|
+
@table_lexer_table["#"] = method(:match_hash)
|
|
177
|
+
@table_lexer_table["|"] = method(:match_pipe_in_table)
|
|
178
|
+
@table_lexer_table["!"] = method(:match_bang_in_table)
|
|
179
|
+
@table_lexer_table["{"] = method(:match_left_curly)
|
|
180
|
+
@table_lexer_table[" "] = method(:match_space)
|
|
181
|
+
|
|
182
|
+
# Lexer table used to parse ordered and unordered list items (which may nest)
|
|
183
|
+
@items_lexer_table = @inline_lexer_table.dup
|
|
184
|
+
@items_lexer_table["\n"] = method(:match_newline_in_items)
|
|
185
|
+
|
|
186
|
+
# Lexer table used to parse entries in a definition list (which may not nest)
|
|
187
|
+
@entries_lexer_table = @inline_lexer_table.dup
|
|
188
|
+
@entries_lexer_table["\n"] = method(:match_newline_in_entries)
|
|
189
|
+
@entries_lexer_table[":"] = method(:match_colon_in_entries)
|
|
190
|
+
|
|
191
|
+
# Lexer table used inside spans of indented text
|
|
192
|
+
@indent_lexer_table = @inline_lexer_table.dup
|
|
193
|
+
@indent_lexer_table["\n"] = method(:match_newline_in_indent)
|
|
194
|
+
|
|
195
|
+
# Lexer table used inside spans of pre-formatted text
|
|
196
|
+
@pre_lexer_table = {}
|
|
197
|
+
@pre_lexer_table["<"] = method(:match_left_angle_in_pre)
|
|
198
|
+
|
|
199
|
+
# Lexer table used inside spans of <code>
|
|
200
|
+
@code_lexer_table = @inline_lexer_table.dup
|
|
201
|
+
@code_lexer_table[" "] = method(:match_space_in_code)
|
|
202
|
+
@code_lexer_table["<"] = method(:match_left_angle_in_code)
|
|
203
|
+
|
|
204
|
+
# Lexer table used when inside spans of wiki-escaped text
|
|
205
|
+
@nowiki_lexer_table = {}
|
|
206
|
+
@nowiki_lexer_table["<"] = method(:match_left_angle_in_nowiki)
|
|
164
207
|
|
|
165
|
-
|
|
208
|
+
@paste_lexer_table = {}
|
|
209
|
+
@paste_lexer_table["<"] = method(:match_left_angle_in_paste)
|
|
210
|
+
@paste_lexer_table["\n"] = method(:match_newline_in_paste)
|
|
211
|
+
@paste_lexer_table["\r"] = method(:match_newline_in_paste)
|
|
166
212
|
|
|
167
|
-
#
|
|
168
|
-
|
|
169
|
-
|
|
213
|
+
# Lexer table used when inside spans of math
|
|
214
|
+
@math_lexer_table = {}
|
|
215
|
+
@math_lexer_table["<"] = method(:match_left_angle_in_math)
|
|
216
|
+
|
|
217
|
+
# Lexer table used when inside a wiki template inclusion
|
|
218
|
+
@template_lexer_table = {}
|
|
219
|
+
@template_lexer_table["{"] = method(:match_left_curly_in_template)
|
|
220
|
+
@template_lexer_table["|"] = method(:match_pipe_in_template)
|
|
221
|
+
@template_lexer_table["}"] = method(:match_right_curly_in_template)
|
|
222
|
+
|
|
223
|
+
@template_param_lexer_table = {}
|
|
224
|
+
@template_param_lexer_table["{"] = method(:match_left_curly_in_template)
|
|
225
|
+
@template_param_lexer_table["}"] = method(:match_right_curly_in_template)
|
|
226
|
+
@template_param_lexer_table["|"] = method(:match_pipe_in_template)
|
|
227
|
+
|
|
228
|
+
# Begin lexing in default state
|
|
229
|
+
@lexer_table = LexerTable.new
|
|
230
|
+
@lexer_table.push(@default_lexer_table)
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def tokenize(input)
|
|
235
|
+
@text = input
|
|
236
|
+
# Current position in the input text
|
|
237
|
+
@cursor = 0
|
|
238
|
+
# Tokens to be returned
|
|
239
|
+
@tokens = TokenArray.new(self)
|
|
240
|
+
# Stack of open token spans
|
|
241
|
+
@context = []
|
|
242
|
+
# Already lexed character data, not yet added to a TEXT token
|
|
243
|
+
@pending = TokenString.new(self)
|
|
244
|
+
# List symbols from the most recent line item of a list, e.g. '***'
|
|
245
|
+
@list = ''
|
|
246
|
+
|
|
247
|
+
start_span(:PARA)
|
|
248
|
+
|
|
249
|
+
while (@cursor < @text.length)
|
|
250
|
+
@char = @text[@cursor, 1]
|
|
251
|
+
if @lexer_table[@char]
|
|
252
|
+
@lexer_table[@char].call
|
|
253
|
+
else
|
|
254
|
+
@pending << @char
|
|
170
255
|
@cursor += 1
|
|
171
|
-
|
|
256
|
+
end
|
|
172
257
|
end
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
else
|
|
183
|
-
matchItalic
|
|
184
|
-
@cursor += 2
|
|
185
|
-
end
|
|
186
|
-
return
|
|
187
|
-
end
|
|
188
|
-
if @text[@cursor, 3] == "'''"
|
|
189
|
-
matchBold
|
|
190
|
-
@cursor += 3
|
|
191
|
-
return
|
|
192
|
-
end
|
|
193
|
-
if @text[@cursor, 2] == "''"
|
|
194
|
-
matchItalic
|
|
195
|
-
@cursor += 2
|
|
196
|
-
return
|
|
197
|
-
end
|
|
198
|
-
match_other
|
|
258
|
+
|
|
259
|
+
if @pending.is_empty_token?
|
|
260
|
+
if @context.size > 0 and @tokens.last[0] == :PARA_START
|
|
261
|
+
@context.pop
|
|
262
|
+
@tokens.pop
|
|
263
|
+
end
|
|
264
|
+
else
|
|
265
|
+
@tokens.append_pending(@pending)
|
|
266
|
+
@pending = TokenString.new(self)
|
|
199
267
|
end
|
|
268
|
+
while(@context.size > 0) do
|
|
269
|
+
@tokens << [(@context.pop.to_s + '_END').to_sym, '']
|
|
270
|
+
end
|
|
271
|
+
@tokens << [false, false, 0, 0]
|
|
272
|
+
@tokens
|
|
273
|
+
|
|
274
|
+
end
|
|
200
275
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
276
|
+
#Returns the next token from the stream. Useful for RACC parsers.
|
|
277
|
+
def lex
|
|
278
|
+
token = @tokens[@position]
|
|
279
|
+
@position += 1
|
|
280
|
+
return token
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
private
|
|
285
|
+
|
|
286
|
+
def match_text
|
|
287
|
+
@pending << @char
|
|
288
|
+
@cursor += 1
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
def match_ampersand
|
|
292
|
+
i = @cursor + 1
|
|
293
|
+
i += 1 while i < @text.size and NAME_CHAR_TABLE[@text[i].ord]
|
|
294
|
+
if @text[i, 1] == ';'
|
|
295
|
+
append_to_tokens([:CHAR_ENT, @text[(@cursor + 1) ... i]])
|
|
296
|
+
@cursor = i + 1
|
|
297
|
+
else
|
|
298
|
+
match_text
|
|
299
|
+
end
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
def match_quote
|
|
303
|
+
if @text[@cursor, 5] == "'''''"
|
|
304
|
+
if @context.last == :BOLD
|
|
305
|
+
match_bold
|
|
306
|
+
@cursor += 3
|
|
307
|
+
else
|
|
308
|
+
match_italic
|
|
309
|
+
@cursor += 2
|
|
310
|
+
end
|
|
311
|
+
elsif @text[@cursor, 3] == "'''"
|
|
312
|
+
match_bold
|
|
313
|
+
@cursor += 3
|
|
314
|
+
elsif @text[@cursor, 2] == "''"
|
|
315
|
+
match_italic
|
|
316
|
+
@cursor += 2
|
|
317
|
+
else
|
|
318
|
+
match_text
|
|
209
319
|
end
|
|
320
|
+
end
|
|
210
321
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
@next_token[0] = :ITALICSTART
|
|
217
|
-
@pair_stack.push @next_token
|
|
218
|
-
end
|
|
322
|
+
def match_bold
|
|
323
|
+
if @context.last == :BOLD
|
|
324
|
+
end_span(:BOLD, "'''")
|
|
325
|
+
else
|
|
326
|
+
start_span(:BOLD, "'''")
|
|
219
327
|
end
|
|
328
|
+
end
|
|
220
329
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
330
|
+
def match_italic
|
|
331
|
+
if @context.last == :ITALIC
|
|
332
|
+
end_span(:ITALIC, "''")
|
|
333
|
+
else
|
|
334
|
+
start_span(:ITALIC, "''")
|
|
335
|
+
end
|
|
336
|
+
end
|
|
227
337
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
338
|
+
def match_tilde
|
|
339
|
+
if @text[@cursor, 5] == "~~~~~"
|
|
340
|
+
empty_span(:SIGNATURE_DATE, "~~~~~", 5)
|
|
341
|
+
elsif @text[@cursor, 4] == "~~~~"
|
|
342
|
+
empty_span(:SIGNATURE_FULL, "~~~~", 4)
|
|
343
|
+
elsif @text[@cursor, 3] == "~~~"
|
|
344
|
+
empty_span(:SIGNATURE_NAME, "~~~", 3)
|
|
345
|
+
else
|
|
346
|
+
match_text
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
def match_left_angle
|
|
351
|
+
next_char = @text[@cursor + 1]
|
|
352
|
+
if !next_char
|
|
353
|
+
match_text
|
|
354
|
+
elsif next_char.ord == 47
|
|
355
|
+
# Might be an XHTML end tag
|
|
356
|
+
if @text[@cursor .. -1] =~ %r{</([a-zA-Z][a-zA-Z0-9\-_]*)(\s*)>} and @context.include?(:TAG)
|
|
357
|
+
# Found an XHTML end tag
|
|
358
|
+
tag_name = $1
|
|
359
|
+
end_span(:TAG, $1)
|
|
360
|
+
@lexer_table.pop
|
|
361
|
+
@cursor += $1.length + $2.length + 3
|
|
362
|
+
else
|
|
363
|
+
match_text
|
|
364
|
+
end
|
|
365
|
+
elsif next_char.ord > 64 and next_char.ord < 123
|
|
366
|
+
# Might be an XHTML open or empty tag
|
|
367
|
+
scanner = StringScanner.new(@text[@cursor .. -1])
|
|
368
|
+
if scanner.scan(%r{<([a-zA-Z][a-zA-Z0-9\-_]*)}) and (HTML_TAGS.include?(scanner[1]) or WIKI_TAGS.include?(scanner[1]))
|
|
369
|
+
# Sequence begins with a valid tag name, so check for attributes
|
|
370
|
+
tag_name = scanner[1]
|
|
371
|
+
attrs = {}
|
|
372
|
+
while scanner.scan(%r{\s+([a-zA-Z][a-zA-Z0-9\-_]*)\s*=\s*('([^']+)'|"([^"]+)"|([^>\s]+))}) do
|
|
373
|
+
attrs[scanner[1]] = scanner[3] ? scanner[3] : (scanner[4] ? scanner[4] : scanner[5])
|
|
374
|
+
end
|
|
375
|
+
scanner.scan(%r{\s*})
|
|
376
|
+
if ((c = scanner.get_byte) == '>' or (c == '/' and scanner.get_byte == '>'))
|
|
377
|
+
# Found an XHTML start or empty tag
|
|
378
|
+
if tag_name == 'nowiki'
|
|
379
|
+
@lexer_table.push(@nowiki_lexer_table) unless c == '/'
|
|
380
|
+
elsif tag_name == 'paste'
|
|
381
|
+
unless c == '/'
|
|
382
|
+
maybe_close_para(:PASTE_START, true)
|
|
383
|
+
append_to_tokens([:PASTE_START, ''])
|
|
384
|
+
@cursor += scanner.pos
|
|
385
|
+
@lexer_table.push(@paste_lexer_table)
|
|
386
|
+
#eat newline after <paste> if it exists because otherwise
|
|
387
|
+
#it will be transformed into <br/>
|
|
388
|
+
if @text[@cursor, 1] == "\n"
|
|
389
|
+
@cursor += 1
|
|
390
|
+
elsif @text[@cursor, 2] == "\r\n"
|
|
391
|
+
@cursor += 2
|
|
392
|
+
end
|
|
393
|
+
return
|
|
394
|
+
end
|
|
395
|
+
else
|
|
396
|
+
if tag_name == 'pre'
|
|
397
|
+
table = @pre_lexer_table
|
|
398
|
+
elsif tag_name == 'code'
|
|
399
|
+
table = @code_lexer_table
|
|
400
|
+
elsif tag_name == 'math'
|
|
401
|
+
table = @math_lexer_table
|
|
402
|
+
else
|
|
403
|
+
table = @markup_lexer_table
|
|
404
|
+
end
|
|
405
|
+
start_span(:TAG, tag_name)
|
|
406
|
+
attrs.collect do |(name, value)|
|
|
407
|
+
append_to_tokens([:ATTR_NAME, name])
|
|
408
|
+
append_to_tokens([:ATTR_VALUE, value]) if value
|
|
409
|
+
end
|
|
410
|
+
if c == '/' or TAGS_WITHOUT_CLOSE_TAG.include? tag_name
|
|
411
|
+
end_span(:TAG, tag_name)
|
|
231
412
|
else
|
|
232
|
-
|
|
233
|
-
@pair_stack.push @next_token
|
|
413
|
+
@lexer_table.push(table)
|
|
234
414
|
end
|
|
415
|
+
end
|
|
416
|
+
@cursor += scanner.pos #FIXME: will break xhtml attribute length calculation
|
|
235
417
|
else
|
|
236
|
-
|
|
418
|
+
match_text
|
|
237
419
|
end
|
|
420
|
+
else
|
|
421
|
+
match_text
|
|
422
|
+
end
|
|
423
|
+
else
|
|
424
|
+
match_text
|
|
238
425
|
end
|
|
426
|
+
end
|
|
239
427
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
428
|
+
def match_equal
|
|
429
|
+
if at_start_of_line?
|
|
430
|
+
@heading = extract_char_sequence('=')
|
|
431
|
+
@cursor += @heading.length
|
|
432
|
+
if at_end_of_line? or blank_line?
|
|
433
|
+
@cursor -= @heading.length
|
|
434
|
+
#special case - no header text, just "=" signs
|
|
435
|
+
#try to split header into "=" formatting and text with "=":
|
|
436
|
+
# example:
|
|
437
|
+
# ==== should become: = == =
|
|
438
|
+
# ===== should become: == = ==
|
|
439
|
+
if @heading =~ /(={6})(=+)(={6})/ or
|
|
440
|
+
@heading =~ /(={5})(=+)(={5})/ or
|
|
441
|
+
@heading =~ /(={4})(=+)(={4})/ or
|
|
442
|
+
@heading =~ /(={3})(=+)(={3})/ or
|
|
443
|
+
@heading =~ /(={2})(=+)(={2})/ or
|
|
444
|
+
@heading =~ /(=)(=+)(=)/
|
|
445
|
+
start_span(:SECTION, $1)
|
|
446
|
+
@cursor += $1.length
|
|
447
|
+
@tokens << [:TEXT, $2]
|
|
448
|
+
@cursor += $2.length
|
|
449
|
+
end_span(:SECTION, $3)
|
|
450
|
+
@cursor += $3.length
|
|
252
451
|
else
|
|
253
|
-
|
|
452
|
+
match_text
|
|
254
453
|
end
|
|
454
|
+
else
|
|
455
|
+
@cursor -= @heading.length
|
|
456
|
+
start_span(:SECTION, @heading)
|
|
457
|
+
@cursor += @heading.length
|
|
458
|
+
@lexer_table.push(@heading_lexer_table)
|
|
459
|
+
end
|
|
460
|
+
else
|
|
461
|
+
match_text
|
|
462
|
+
end
|
|
463
|
+
end
|
|
464
|
+
|
|
465
|
+
def match_equal_in_heading
|
|
466
|
+
heading = extract_char_sequence('=')
|
|
467
|
+
if @heading.length <= heading.length
|
|
468
|
+
end_span(:SECTION, heading)
|
|
469
|
+
@lexer_table.pop
|
|
470
|
+
@cursor += heading.length
|
|
471
|
+
skip_newline
|
|
472
|
+
else
|
|
473
|
+
@pending << heading
|
|
474
|
+
@cursor += heading.length
|
|
255
475
|
end
|
|
476
|
+
end
|
|
477
|
+
|
|
478
|
+
def match_newline_in_heading
|
|
479
|
+
end_span(:SECTION)
|
|
480
|
+
@lexer_table.pop
|
|
481
|
+
end
|
|
256
482
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
483
|
+
def match_left_square
|
|
484
|
+
if @text[@cursor, 2] == "[["
|
|
485
|
+
if @text[@cursor + 2, 1] != "]"
|
|
486
|
+
start_span(:INTLINK, "[[")
|
|
487
|
+
@cursor += 2
|
|
488
|
+
@lexer_table.push(@intlink_lexer_table)
|
|
489
|
+
else
|
|
490
|
+
match_text
|
|
491
|
+
end
|
|
492
|
+
elsif @text[@cursor + 1 .. -1] =~ %r{\A\s*((http|https|file)://|mailto:)}
|
|
493
|
+
start_span(:LINK, "[")
|
|
494
|
+
@cursor += 1
|
|
495
|
+
skip_whitespace
|
|
496
|
+
@lexer_table.push(@link_lexer_table)
|
|
497
|
+
else
|
|
498
|
+
match_text
|
|
272
499
|
end
|
|
500
|
+
end
|
|
501
|
+
|
|
502
|
+
def match_right_square_in_link
|
|
503
|
+
end_span(:LINK, "]")
|
|
504
|
+
@cursor += 1
|
|
505
|
+
@lexer_table.pop
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
def match_right_square_in_intlink
|
|
509
|
+
if @text[@cursor, 2] == "]]"
|
|
510
|
+
end_span(:INTLINK, "]]")
|
|
511
|
+
@cursor += 2
|
|
512
|
+
@lexer_table.pop
|
|
513
|
+
else
|
|
514
|
+
match_text
|
|
515
|
+
end
|
|
516
|
+
end
|
|
517
|
+
|
|
518
|
+
def match_space_in_link
|
|
519
|
+
spaces = extract_char_sequence(' ')
|
|
520
|
+
append_to_tokens([:LINKSEP, ' ']) unless @text[@cursor, 1] == ']'
|
|
521
|
+
@cursor += spaces.length
|
|
522
|
+
@lexer_table.pop
|
|
523
|
+
@lexer_table.push(@link_opt_lexer_table)
|
|
524
|
+
end
|
|
273
525
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
526
|
+
# Handles "|" inside an internal link. When the most recent emitted token is
# still :INTLINK_START, the current lexer table is swapped for
# @intlink_opt_lexer_table. In every case an :INTLINKSEP token is emitted and
# the pipe is consumed.
def match_pipe_in_intlink
  right_after_start = @tokens.last[0] == :INTLINK_START
  if right_after_start
    @lexer_table.pop
    @lexer_table.push @intlink_opt_lexer_table
  end
  append_to_tokens [:INTLINKSEP, "|"]
  @cursor += 1
end
|
|
534
|
+
|
|
535
|
+
# Handles ":" inside an internal link. If some text is already pending (the
# colon follows a non-empty prefix), the current lexer table is swapped for
# @resourcelink_opt_lexer_table. A :RESOURCESEP token is always emitted and
# the colon is consumed.
def match_colon_in_intlink
  unless @pending.is_empty_token?
    @lexer_table.pop
    @lexer_table.push @resourcelink_opt_lexer_table
  end
  append_to_tokens [:RESOURCESEP, ":"]
  @cursor += 1
end
|
|
543
|
+
|
|
544
|
+
# Handles the character "C" inside an internal link. When the text at the
# cursor is exactly "Category:", a :CATEGORY token is emitted, the current
# lexer table is swapped for @intlink_cat_lexer_table and the nine-character
# prefix is consumed. Otherwise the character is plain text.
def match_c_char_in_intlink
  return match_text unless @text[@cursor, 9] == 'Category:'

  append_to_tokens [:CATEGORY, 'Category:']
  @lexer_table.pop
  @lexer_table.push @intlink_cat_lexer_table
  @cursor += 9
end
|
|
554
|
+
|
|
555
|
+
# Handles a newline inside an external link: the :LINK span is ended (with an
# empty token text) and the current lexer table is popped. The newline is not
# consumed here.
def match_newline_in_link
  end_span :LINK
  @lexer_table.pop
end
|
|
559
|
+
|
|
560
|
+
# Handles a newline inside an internal link: the :INTLINK span is ended (with
# an empty token text) and the current lexer table is popped. The newline is
# not consumed here.
def match_newline_in_intlink
  end_span :INTLINK
  @lexer_table.pop
end
|
|
564
|
+
|
|
565
|
+
# Handles the character "h" in ordinary text, turning a bare http:// or
# https:// URL into a :LINK span.
#
# The URL is extended over every following character accepted by
# TOKEN_CHAR_TABLE, then trailing characters flagged in
# PUNCTUATION_CHAR_TABLE are dropped again so that sentence punctuation is not
# swallowed. The URL becomes the pending text of the span and the cursor is
# placed right after it. Without a URL scheme at the cursor the character is
# plain text.
def match_h_char
  scheme = ['http://', 'https://'].find { |s| @text[@cursor, s.length] == s }
  return match_text unless scheme

  url = @text[@cursor, scheme.length]
  start_span(:LINK)

  stop = @cursor + url.length
  while stop < @text.size && TOKEN_CHAR_TABLE[@text[stop].ord]
    url << @text[stop, 1]
    stop += 1
  end

  # exclude punctuation at the end
  while url.length > 0 && PUNCTUATION_CHAR_TABLE[url[-1].ord]
    url = url[0...-1]
    stop -= 1
  end

  @pending = TokenString.new(self)
  @pending << url
  @cursor = stop
  end_span(:LINK)
end
|
|
283
590
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
end
|
|
302
|
-
@sub_tokens = []
|
|
303
|
-
@sub_tokens << [:TEXT, linkText]
|
|
304
|
-
@sub_tokens << [:LINKEND, ']']
|
|
305
|
-
@cursor += linkText.length
|
|
306
|
-
@token_start = @cursor
|
|
307
|
-
else
|
|
308
|
-
match_other
|
|
309
|
-
end
|
|
591
|
+
# Handles a space in ordinary text. A space at the start of a line that is not
# blank opens a :PREINDENT span and pushes @indent_lexer_table; in every case
# the space itself is then handled as plain text.
def match_space
  if at_start_of_line? && !blank_line?
    start_span :PREINDENT
    @lexer_table.push @indent_lexer_table
  end
  match_text
end
|
|
600
|
+
|
|
601
|
+
# Handles a newline inside a :PREINDENT span. The newline is first handled as
# plain text; if the character then at the cursor is not a space, the pending
# text is flushed to the token list, the :PREINDENT span is ended and the
# current lexer table is popped.
def match_newline_in_indent
  match_text
  return if @text[@cursor, 1] == " "

  @tokens.append_pending(@pending)
  @pending = TokenString.new(self)
  end_span :PREINDENT
  @lexer_table.pop
end
|
|
311
610
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
611
|
+
# Handles "*" in ordinary text. At the start of a line it begins a list: the
# full run of "#"/"*" markers is remembered in @list, the matching list spans
# are opened and @items_lexer_table is pushed. Elsewhere "*" is plain text.
def match_star
  return match_text unless at_start_of_line?

  @list = extract_char_sequence('#*')
  open_list @list
  @lexer_table.push @items_lexer_table
end
|
|
620
|
+
|
|
621
|
+
# Handles "#" in ordinary text. At the start of a line it begins a list: the
# full run of "#"/"*" markers is remembered in @list, the matching list spans
# are opened and @items_lexer_table is pushed. Elsewhere "#" is plain text.
def match_hash
  return match_text unless at_start_of_line?

  @list = extract_char_sequence('#*')
  open_list @list
  @lexer_table.push @items_lexer_table
end
|
|
630
|
+
|
|
631
|
+
# Handles "_" in ordinary text. The magic words __TOC__ and __NOTOC__ are
# emitted as :KEYWORD tokens (text 'TOC' / 'NOTOC') and consumed in full; any
# other underscore is plain text.
def match_underscore
  keyword = %w[TOC NOTOC].find { |name| @text[@cursor, name.length + 4] == "__#{name}__" }
  return match_text unless keyword

  empty_span(:KEYWORD, keyword, keyword.length + 4)
end
|
|
640
|
+
|
|
641
|
+
# Handles a newline ("\n" or "\r\n") while lexing list items.
#
# The newline is added to the pending text and consumed. What happens next
# depends on the first character of the following line:
#
# * It differs from the first marker of the current list (@list): every open
#   list level is closed and the current lexer table is popped.
# * The following line carries exactly the same marker run: the current :LI is
#   closed and a new one opened.
# * The marker run differs: levels beyond the common prefix of the old and new
#   runs are closed, levels the new run adds are opened, and @list is updated.
#
# In the last two cases the marker run of the new line is consumed.
def match_newline_in_items
  newline = @text[@cursor, 1] == "\n" ? "\n" : "\r\n"
  char = @text[@cursor + newline.length, 1]
  @pending << newline
  @cursor += newline.length

  if char == @list[0, 1]
    list = extract_char_sequence('#*')
    if list == @list
      # same nesting: just start the next item
      end_span(:LI)
      start_span(:LI)
      @cursor += list.length
    else
      # length of the marker prefix shared by the old and the new line
      limit = [@list.length, list.length].min
      common = 0
      common += 1 while common < limit && @list[common] == list[common]

      if common < @list.length
        close_list(@list[common .. -1])
        if @context.last == :LI
          end_span(:LI)
          start_span(:LI)
        end
      end
      if common < list.length
        start_span(:LI) if @context.last != :LI
        # open_list advances the cursor over the markers it opens
        open_list(list[common .. -1])
      end
      @cursor += common
      @list = list
    end
  else
    close_list(@list)
    @lexer_table.pop
  end
end
|
|
680
|
+
|
|
681
|
+
# Handles "-" in ordinary text. Four dashes at the start of a line are emitted
# as an :HLINE token and consumed; any other dash is plain text.
def match_dash
  return match_text unless at_start_of_line? && @text[@cursor, 4] == "----"

  empty_span(:HLINE, "----", 4)
end
|
|
688
|
+
|
|
689
|
+
# Handles "<" while the nowiki lexer table is active. The closing tag
# "</nowiki>" is consumed without emitting a token and the current lexer table
# is popped; any other "<" is plain text.
def match_left_angle_in_nowiki
  return match_text unless @text[@cursor, 9] == '</nowiki>'

  @cursor += 9
  @lexer_table.pop
end
|
|
327
697
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
#hack to tokenize everything inside the list
|
|
339
|
-
@sub_tokens = []
|
|
340
|
-
sub_lines = ""
|
|
341
|
-
@sub_tokens << [:LI_START, ""]
|
|
342
|
-
sub_text.each do |t|
|
|
343
|
-
extracted += 1
|
|
344
|
-
if text_is_list? t
|
|
345
|
-
sub_lines += t
|
|
346
|
-
else
|
|
347
|
-
if not sub_lines.empty?
|
|
348
|
-
@sub_tokens += sub_lex(sub_lines)
|
|
349
|
-
sub_lines = ""
|
|
350
|
-
end
|
|
351
|
-
if @sub_tokens.last[0] != :LI_START
|
|
352
|
-
@sub_tokens << [:LI_END, ""]
|
|
353
|
-
@sub_tokens << [:LI_START, ""]
|
|
354
|
-
end
|
|
355
|
-
@sub_tokens += sub_lex(t.lstrip)
|
|
356
|
-
end
|
|
357
|
-
end
|
|
358
|
-
if not sub_lines.empty?
|
|
359
|
-
@sub_tokens += sub_lex(sub_lines)
|
|
360
|
-
@sub_tokens << [:LI_END, ""]
|
|
361
|
-
else
|
|
362
|
-
@sub_tokens << [:LI_END, ""]
|
|
363
|
-
end
|
|
698
|
+
# Handles "<" while the paste lexer table is active. On the closing tag
# "</paste>" the current lexer table is popped, a :PASTE_END token is emitted,
# the tag is consumed and a paragraph is opened if no context remains. Any
# other "<" is plain text.
def match_left_angle_in_paste
  return match_text unless @text[@cursor, 8] == '</paste>'

  @lexer_table.pop
  append_to_tokens [:PASTE_END, '']
  @cursor += 8
  maybe_open_para :PASTE_END
end
|
|
364
708
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
@next_token[0] = :UL_START
|
|
372
|
-
@sub_tokens << [:UL_END, ""]
|
|
373
|
-
when list_id == "#"
|
|
374
|
-
@next_token[0] = :OL_START
|
|
375
|
-
@sub_tokens << [:OL_END, ""]
|
|
376
|
-
when list_id == ";", list_id == ":"
|
|
377
|
-
@next_token[0] = :DL_START
|
|
378
|
-
@sub_tokens << [:DL_END, ""]
|
|
379
|
-
end
|
|
380
|
-
elsif @text[@cursor, 1] == ':' and @tokens[-1][0] == :INTLINKSTART
|
|
381
|
-
@next_token[0] = :RESOURCE_SEP
|
|
382
|
-
@cursor += 1
|
|
383
|
-
else
|
|
384
|
-
match_other
|
|
385
|
-
end
|
|
709
|
+
# Handles a newline while the paste lexer table is active by emitting a
# 'br' tag (:TAG_START followed by :TAG_END). The newline sequence ("\n" or
# "\r\n") is consumed between the two tokens; if neither sequence is at the
# cursor, the cursor is left where it is.
def match_newline_in_paste
  append_to_tokens [:TAG_START, 'br']
  eol = ["\n", "\r\n"].find { |seq| @text[@cursor, seq.length] == seq }
  @cursor += eol.length if eol
  append_to_tokens [:TAG_END, 'br']
end
|
|
387
718
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
719
|
+
# Handles "<" while the math lexer table is active. The closing tag "</math>"
# ends the :TAG span (token text 'math'), is consumed, and the current lexer
# table is popped; any other "<" is plain text.
def match_left_angle_in_math
  return match_text unless @text[@cursor, 7] == '</math>'

  end_span :TAG, 'math'
  @cursor += 7
  @lexer_table.pop
end
|
|
728
|
+
|
|
729
|
+
# Handles "<" while the pre lexer table is active. The closing tag "</pre>"
# ends the :TAG span (token text 'pre') and is consumed together with one
# directly following newline sequence ("\n" or "\r\n"), after which the
# current lexer table is popped. Any other "<" is plain text.
def match_left_angle_in_pre
  return match_text unless @text[@cursor, 6] == '</pre>'

  end_span :TAG, 'pre'
  @cursor += 6
  # eat newline after </pre>
  skip_newline
  @lexer_table.pop
end
|
|
397
744
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
745
|
+
# Handles a space while the code lexer table is active: it is always plain
# text (no indentation handling takes place here).
def match_space_in_code
  match_text
end
|
|
748
|
+
|
|
749
|
+
# Handles "<" while the code lexer table is active. The closing tag "</code>"
# ends the :TAG span (token text 'code'), is consumed, and the current lexer
# table is popped. Any other "<" is delegated to match_left_angle.
def match_left_angle_in_code
  return match_left_angle unless @text[@cursor, 7] == '</code>'

  end_span :TAG, 'code'
  @cursor += 7
  @lexer_table.pop
end
|
|
408
758
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
elsif @text[@cursor, 3] == "~~~"
|
|
421
|
-
@next_token[0] = :SIGNATURE_NAME
|
|
422
|
-
@cursor += 3
|
|
423
|
-
else
|
|
424
|
-
match_other
|
|
425
|
-
end
|
|
759
|
+
# Handles "{" in ordinary text.
#
# * "{|" at the start of a line opens a :TABLE span and pushes
#   @table_lexer_table.
# * "{{" that is not directly followed by "}}" opens a :TEMPLATE span and
#   pushes @template_lexer_table.
# * Anything else is plain text.
#
# In the first two cases the two-character opener is consumed.
def match_left_curly
  following = @text[@cursor + 1, 1]
  if at_start_of_line? && following == '|'
    start_span(:TABLE, "{|")
    @cursor += 2
    @lexer_table.push(@table_lexer_table)
  elsif following == '{' && @text[@cursor + 2, 2] != "}}"
    start_span(:TEMPLATE, "{{")
    @cursor += 2
    @lexer_table.push(@template_lexer_table)
  else
    match_text
  end
end
|
|
772
|
+
|
|
773
|
+
# Handles "{" inside a template. "{{" that is not directly followed by "}}"
# opens a nested :TEMPLATE span, is consumed, and pushes
# @template_lexer_table again; anything else is plain text.
def match_left_curly_in_template
  nested = @text[@cursor + 1, 1] == '{' && @text[@cursor + 2, 2] != "}}"
  return match_text unless nested

  start_span(:TEMPLATE, "{{")
  @cursor += 2
  @lexer_table.push(@template_lexer_table)
end
|
|
782
|
+
|
|
783
|
+
# Handles "}" inside a template. When the next character is also "}", the
# :TEMPLATE span is ended (token text "}}"), both braces are consumed and the
# current lexer table is popped; a lone "}" is plain text.
def match_right_curly_in_template
  return match_text unless @text[@cursor + 1, 1] == '}'

  end_span :TEMPLATE, "}}"
  @cursor += 2
  @lexer_table.pop
end
|
|
792
|
+
|
|
793
|
+
# Handles "|" inside a template. When the most recent emitted token is still
# :TEMPLATE_START, the current lexer table is swapped for
# @template_param_lexer_table. In every case an :INTLINKSEP token is emitted
# (the same separator token internal links use) and the pipe is consumed.
def match_pipe_in_template
  right_after_start = @tokens.last[0] == :TEMPLATE_START
  if right_after_start
    @lexer_table.pop
    @lexer_table.push @template_param_lexer_table
  end
  append_to_tokens [:INTLINKSEP, "|"]
  @cursor += 1
end
|
|
471
801
|
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
end
|
|
486
|
-
tokens << [:HEAD_START, '']
|
|
487
|
-
@pair_stack.push [:HEAD_START, '']
|
|
488
|
-
@next_token = tokens.shift
|
|
489
|
-
@sub_tokens = tokens
|
|
490
|
-
else
|
|
491
|
-
match_other
|
|
492
|
-
end
|
|
802
|
+
# Handles "!" inside a table. At the start of a line it begins a header cell:
# an open :CELL or :HEAD on top of the context stack is ended first, or, if
# the top of the stack is anything other than :ROW, a :ROW span is opened.
# Then a :HEAD span is started and the "!" consumed. Elsewhere "!" is plain
# text.
def match_bang_in_table
  return match_text unless at_start_of_line?

  case @context.last
  when :CELL then end_span(:CELL)
  when :HEAD then end_span(:HEAD)
  when :ROW  then nil # already directly inside a row
  else start_span(:ROW)
  end
  start_span(:HEAD, "!")
  @cursor += 1
end
|
|
494
817
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
if @pair_stack.last[0] != :ROW_START
|
|
513
|
-
tokens << [:ROW_START, '']
|
|
514
|
-
@pair_stack.push [:ROW_START, '']
|
|
515
|
-
end
|
|
516
|
-
tokens << [:CELL_START, '']
|
|
517
|
-
@pair_stack.push [:CELL_START, '']
|
|
518
|
-
end
|
|
519
|
-
@next_token = tokens.shift
|
|
520
|
-
@sub_tokens = tokens
|
|
521
|
-
elsif @text[@cursor + 1, 1] == '|'
|
|
522
|
-
@cursor += 2
|
|
523
|
-
close_table_cell(tokens)
|
|
524
|
-
next_token = tokens.last[0] == :HEAD_END ? [:HEAD_START, ''] : [:CELL_START, '']
|
|
525
|
-
tokens << next_token
|
|
526
|
-
@pair_stack.push next_token
|
|
527
|
-
@next_token = tokens.shift
|
|
528
|
-
@sub_tokens = tokens
|
|
529
|
-
else
|
|
530
|
-
match_link_sep
|
|
531
|
-
end
|
|
532
|
-
else
|
|
533
|
-
match_link_sep
|
|
818
|
+
# Handles "|" inside a table. All decisions are made against the part of the
# context stack that lies above the innermost :TABLE entry.
#
# At the start of a line:
# * "|-" ends an open row (if any) and starts a new :ROW;
# * "|}" ends the :TABLE, pops the current lexer table and skips one
#   following newline;
# * a plain "|" ends an open :CELL or :HEAD, opens a :ROW unless one is on top
#   of the stack, and starts a new :CELL.
#
# In the middle of a line:
# * "||" ends the open :CELL (or :HEAD) and starts the next one of the same
#   kind; both pipes are consumed either way;
# * a single "|" inside a cell ends the cell with the token text "attributes"
#   and starts a new :CELL; the pipe is then passed on to match_text.
def match_pipe_in_table
  context = @context[@context.rindex(:TABLE) + 1 .. -1]

  if at_start_of_line?
    marker = @text[@cursor + 1, 1]
    if marker == '-'
      end_span(:ROW) if context.include?(:ROW)
      start_span(:ROW, "|-")
      @cursor += 2
    elsif marker == '}'
      end_span(:TABLE, "|}")
      @cursor += 2
      @lexer_table.pop
      skip_newline
    else
      if context.include?(:CELL)
        end_span(:CELL)
      elsif context.include?(:HEAD)
        end_span(:HEAD)
      end
      start_span(:ROW) unless @context.last == :ROW
      start_span(:CELL, "|")
      @cursor += 1
    end
  elsif @text[@cursor + 1, 1] == '|'
    if context.include?(:CELL)
      end_span(:CELL)
      start_span(:CELL, "||")
    elsif context.include?(:HEAD)
      end_span(:HEAD)
      start_span(:HEAD, "||")
    end
    @cursor += 2
  else
    if context.include?(:CELL)
      end_span(:CELL, "attributes")
      start_span(:CELL, "|")
      # NOTE(review): @char is cleared here but not read in this method; the
      # original author flagged both this assignment and the cursor handling
      # of this branch as needing a check.
      @char = ''
    end
    match_text
  end
end
|
|
536
861
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
match_other
|
|
551
|
-
end
|
|
552
|
-
|
|
553
|
-
#Matches a new line and breaks the paragraph if two carriage return - newline
|
|
554
|
-
#sequences ("\r\n\r\n") are met.
|
|
555
|
-
def match_carriagereturn
|
|
556
|
-
if @text[@cursor, 4] == "\r\n\r\n"
|
|
557
|
-
if @para
|
|
558
|
-
@sub_tokens = end_tokens_for_open_pairs
|
|
559
|
-
@sub_tokens << [:PARA_END, '']
|
|
560
|
-
@sub_tokens << [:PARA_START, '']
|
|
561
|
-
@next_token[0] = @sub_tokens.slice!(0)[0]
|
|
562
|
-
@cursor += 4
|
|
563
|
-
return
|
|
564
|
-
end
|
|
565
|
-
end
|
|
566
|
-
match_other
|
|
862
|
+
# Handles a newline in ordinary text. A double newline ("\n\n" or
# "\r\n\r\n") is a paragraph break: the sequence is added to the pending
# text and consumed, then the current :PARA span is ended and a new one
# started. A single newline is plain text.
def match_newline
  para_break = ["\n\n", "\r\n\r\n"].find { |seq| @text[@cursor, seq.length] == seq }
  return match_text unless para_break

  @pending << para_break
  @cursor += para_break.length
  end_span :PARA
  start_span :PARA
end
|
|
877
|
+
|
|
878
|
+
# Handles a newline inside a table. A double newline ("\n\n" or
# "\r\n\r\n") is emitted as a :TEXT token wrapped in its own :PARA span and
# consumed; a single newline is plain text.
def match_newline_in_table
  para_break = ["\n\n", "\r\n\r\n"].find { |seq| @text[@cursor, seq.length] == seq }
  return match_text unless para_break

  start_span :PARA
  append_to_tokens [:TEXT, para_break]
  @cursor += para_break.length
  end_span :PARA
end
|
|
893
|
+
|
|
894
|
+
# Handles ";" in ordinary text. At the start of a line it begins a definition
# list with a term: :DL and :DT spans are opened, @entries_lexer_table is
# pushed and the semicolon is consumed. Elsewhere ";" is plain text.
def match_semicolon
  return match_text unless at_start_of_line?

  start_span :DL
  start_span :DT, ';'
  @lexer_table.push @entries_lexer_table
  @cursor += 1
end
|
|
904
|
+
|
|
905
|
+
# Handles ":" in ordinary text. At the start of a line it begins a definition
# list with a description: :DL and :DD spans are opened, @entries_lexer_table
# is pushed and the colon is consumed. Elsewhere ":" is plain text.
def match_colon
  return match_text unless at_start_of_line?

  start_span :DL
  start_span :DD, ':'
  @lexer_table.push @entries_lexer_table
  @cursor += 1
end
|
|
915
|
+
|
|
916
|
+
# Handles ":" inside a definition list. An open :DD (checked first) or :DT is
# ended, then a new :DD span is started and the colon consumed.
def match_colon_in_entries
  open_entry = [:DD, :DT].find { |kind| @context.include?(kind) }
  end_span(open_entry) if open_entry
  start_span :DD, ':'
  @cursor += 1
end
|
|
925
|
+
|
|
926
|
+
# Handles a newline inside a definition list. The newline is first handled as
# plain text. Unless the character then at the cursor is ":" (another
# description follows), an open :DD (checked first) or :DT is ended, the :DL
# span is ended and the current lexer table is popped.
def match_newline_in_entries
  match_text
  return if @text[@cursor, 1] == ':'

  open_entry = [:DD, :DT].find { |kind| @context.include?(kind) }
  end_span(open_entry) if open_entry
  end_span :DL
  @lexer_table.pop
end
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
#-- ================== Helper methods ================== ++#
|
|
941
|
+
|
|
942
|
+
# Tells whether the cursor sits on the first character of a line, i.e. at the
# very beginning of the text or directly after a "\n".
def at_start_of_line?
  @cursor.zero? || @text[@cursor - 1, 1] == "\n"
end
|
|
568
946
|
|
|
569
|
-
|
|
947
|
+
# Returns true if the text cursor is after the last character of a line, i.e.
# the cursor is on a newline sequence ("\n" or "\r\n") or at/after the end
# of the text.
#
# String#[] with a start equal to the string length yields "" rather than
# nil, so the end-of-text case has to test for an empty slice as well as for
# nil (which is only returned for a start beyond the end).
def at_end_of_line?
  rest = @text[@cursor, 2]
  rest.nil? || rest.empty? || rest[0, 1] == "\n" || rest == "\r\n"
end
|
|
570
951
|
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
952
|
+
# Tells whether the rest of the current line, starting at the cursor, consists
# of spaces only: after skipping spaces the scan must reach the end of the
# text, a "\n" or a "\r\n". The cursor itself is not moved.
def blank_line?
  pos = @cursor
  pos += 1 while @text[pos, 1] == ' '
  stop = @text[pos, 1]
  stop == '' || stop == "\n" || @text[pos, 2] == "\r\n"
end
|
|
957
|
+
|
|
958
|
+
# Moves the cursor past any run of space characters (' ' only) without adding
# them to the pending text buffer.
def skip_whitespace
  @cursor += 1 until @text[@cursor, 1] != ' '
end
|
|
963
|
+
|
|
964
|
+
# Moves the cursor past one newline sequence ("\r\n" or "\n") if one starts
# at the cursor; otherwise the cursor is left alone. Used to strip the newline
# after certain block-level elements so that a following extra newline does
# not produce an empty paragraph.
def skip_newline
  eol = ["\r\n", "\n"].find { |seq| @text[@cursor, seq.length] == seq }
  @cursor += eol.length if eol
end
|
|
975
|
+
|
|
976
|
+
# Reads, starting at the cursor, the longest run of characters drawn from
# +char+ and returns it as a string. +char+ may be a single character or a
# string listing several accepted characters (e.g. '#*').
#
# The text cursor is NOT moved; callers advance it themselves by the length of
# the returned run.
def extract_char_sequence(char)
  accepted = char.split('')
  run = ''
  pos = @cursor
  while accepted.include?(@text[pos, 1])
    run << @text[pos, 1]
    pos += 1
  end
  run
end
|
|
996
|
+
|
|
997
|
+
# Opens one list level per marker in +symbols+: '*' starts a :UL span, any
# other marker an :OL span, each immediately followed by an :LI span. The
# cursor is advanced over every marker processed.
def open_list(symbols)
  symbols.split('').each do |marker|
    start_span(marker == '*' ? :UL : :OL)
    start_span(:LI)
    @cursor += marker.length
  end
end
|
|
1009
|
+
|
|
1010
|
+
# Closes one list level per marker in +symbols+, innermost (last marker)
# first: the :LI span is ended, then the :UL span for '*' or the :OL span for
# any other marker. The cursor is not touched.
def close_list(symbols)
  symbols.reverse.split('').each do |marker|
    end_span(:LI)
    end_span(marker == '*' ? :UL : :OL)
  end
end
|
|
1021
|
+
|
|
1022
|
+
# Opens a token span for +symbol+: pushes the symbol onto the context stack
# and appends a <symbol>_START token carrying +text+ to the output tokens.
#
# Before that, maybe_close_para gets the chance to close (or drop) an open
# paragraph; closing is forced when +text+ is 'pre', 'table' or 'p'.
def start_span(symbol, text='')
  force_break = %w[pre table p].include?(text)
  maybe_close_para(symbol, force_break)
  @context << symbol
  append_to_tokens [:"#{symbol}_START", text]
end
|
|
586
1032
|
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
1033
|
+
# Closes the token span for +symbol+. Every context still open on top of it is
# closed first with an empty <context>_END token (this generally happens when
# in-line markup is not terminated before a new block begins). Then the symbol
# itself is popped from the context stack and a <symbol>_END token carrying
# +text+ is appended. If the context stack is empty afterwards, a new
# paragraph is opened.
#
# NOTE(review): when +symbol+ is not on the context stack at all, the loop
# below closes every open context before the end token is emitted.
def end_span(symbol, text='')
  until @context.empty? || @context.last == symbol
    append_to_tokens [:"#{@context.pop}_END", '']
  end
  @context.pop
  append_to_tokens [:"#{symbol}_END", text]
  maybe_open_para(symbol)
end
|
|
1046
|
+
|
|
1047
|
+
# Emits a self-contained token (no START/END pair) for +symbol+. The token is
# [symbol, text, cursor position, length]; afterwards the cursor is advanced
# by +cursor_increment+. An open paragraph may be closed before the token and
# a new one opened after it, as decided by maybe_close_para / maybe_open_para.
def empty_span(symbol, text, cursor_increment)
  maybe_close_para symbol
  token = [symbol, text, @cursor, cursor_increment]
  append_to_tokens token
  @cursor += cursor_increment
  maybe_open_para symbol
end
|
|
1053
|
+
|
|
1054
|
+
# Closes the currently open paragraph if +symbol+ requires it.
#
# Nothing happens unless some context is open and either +symbol+ is listed in
# PARA_BREAK_ELEMENTS or +force+ is true. The context stack is then scanned
# downwards past any INLINE_ELEMENTS; if the entry found below them is :PARA:
#
# * an empty paragraph (no pending text and :PARA_START as the last token) is
#   removed again by popping both the context and the token;
# * otherwise the in-line contexts and the paragraph are closed, innermost
#   first, each with an empty <context>_END token.
def maybe_close_para(symbol, force = false)
  return unless @context.size > 0 && (PARA_BREAK_ELEMENTS.include?(symbol) || force)

  # 1-based distance from the top of the stack to the first non-inline context
  depth = 1
  depth += 1 while INLINE_ELEMENTS.include?(@context[-depth])
  return unless @context[-depth] == :PARA

  if @pending.is_empty_token? && @tokens.last[0] == :PARA_START
    @context.pop
    @tokens.pop
  else
    (1 .. depth).each do
      closed = @context.pop
      append_to_tokens [:"#{closed}_END", '']
    end
  end
end
|
|
1071
|
+
|
|
1072
|
+
# Opens a new paragraph when no context is open any more, unless +symbol+ is
# :PARA itself. The :PARA_START token is pushed straight onto the token list
# (pending text is not flushed here) and :PARA onto the context stack.
def maybe_open_para(symbol)
  return unless @context.empty? && symbol != :PARA

  @tokens << [:PARA_START, '']
  @context << :PARA
end
|
|
1078
|
+
|
|
1079
|
+
# Appends +token+ to the output token list. Any non-empty pending text is
# flushed to the token list first, and the pending buffer is replaced by a
# fresh TokenString in either case.
def append_to_tokens(token)
  @tokens.append_pending(@pending) unless @pending.is_empty_token?
  @pending = TokenString.new(self)
  @tokens << token
end
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
# A stack of lexer dispatch tables. Lookups with #[] always go to the table
# on top of the stack, so pushing a table switches the lexer into a new mode
# and popping returns to the previous one.
class LexerTable

  # Starts with an empty stack; no table is active until the first push.
  def initialize
    @tables = []
  end

  # Makes +table+ the active table and returns it.
  def push(table)
    @tables.push(table)
    @table = @tables.last
  end

  # Drops the active table; the one beneath it (nil if none is left) becomes
  # active and is returned.
  def pop
    @tables.pop
    @table = @tables.last
  end

  # Looks +char+ up in the active table.
  def [](char)
    @table[char]
  end

end
|
|
735
1109
|
|
|
736
1110
|
end
|
|
737
1111
|
|