mediacloth 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/README +37 -0
  2. data/lib/mediacloth/mediawikiast.rb +50 -0
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +105 -0
  4. data/lib/mediacloth/mediawikihtmlgenerator.rb~ +105 -0
  5. data/lib/mediacloth/mediawikilexer.rb +407 -0
  6. data/lib/mediacloth/mediawikiparams.rb +33 -0
  7. data/lib/mediacloth/mediawikiparser.rb +429 -0
  8. data/lib/mediacloth/mediawikiparser.y +172 -0
  9. data/lib/mediacloth/mediawikiparser.y~ +172 -0
  10. data/lib/mediacloth/mediawikiwalker.rb +62 -0
  11. data/lib/mediacloth/mediawikiwalker.rb~ +62 -0
  12. data/lib/mediacloth.rb +23 -0
  13. data/lib/mediacloth.rb~ +23 -0
  14. data/test/data/html1 +21 -0
  15. data/test/data/html2 +2 -0
  16. data/test/data/html3 +1 -0
  17. data/test/data/html4 +1 -0
  18. data/test/data/html6 +8 -0
  19. data/test/data/html7 +1 -0
  20. data/test/data/input1 +29 -0
  21. data/test/data/input2 +2 -0
  22. data/test/data/input3 +2 -0
  23. data/test/data/input4 +1 -0
  24. data/test/data/input5 +12 -0
  25. data/test/data/input6 +8 -0
  26. data/test/data/input7 +2 -0
  27. data/test/data/lex1 +23 -0
  28. data/test/data/lex2 +2 -0
  29. data/test/data/lex3 +1 -0
  30. data/test/data/lex4 +1 -0
  31. data/test/data/lex5 +12 -0
  32. data/test/data/lex6 +8 -0
  33. data/test/data/lex7 +2 -0
  34. data/test/data/result1 +48 -0
  35. data/test/dataproducers/html.rb +23 -0
  36. data/test/dataproducers/html.rb~ +23 -0
  37. data/test/dataproducers/lex.rb +15 -0
  38. data/test/debugwalker.rb +63 -0
  39. data/test/debugwalker.rb~ +63 -0
  40. data/test/htmlgenerator.rb +25 -0
  41. data/test/htmlgenerator.rb~ +25 -0
  42. data/test/lexer.rb +57 -0
  43. data/test/lexer.rb~ +57 -0
  44. data/test/parser.rb +23 -0
  45. data/test/parser.rb~ +23 -0
  46. data/test/testhelper.rb +27 -0
  47. data/test/testhelper.rb~ +28 -0
  48. metadata +97 -0
data/README ADDED
@@ -0,0 +1,37 @@
1
+ MediaCloth is the first MediaWiki parser and HTML generator written in Ruby.
2
+ It's small, fast, and aims to recognize the complete MediaWiki language.
3
+
4
+ = INSTALLATION
5
+ To install the library run:
6
+ ruby setup.rb
7
+
8
+
9
+ = USAGE
10
+ The quickest way to parse your input and produce html formatted text is:
11
+ require 'mediacloth'
12
+ puts MediaCloth::wiki_to_html("'''Hello'''''World''!")
13
+
14
+ Alternatively, it's possible to create and use each component of MediaCloth manually:
15
+ require 'mediacloth'
16
+
17
+ parser = MediaWikiParser.new
18
+ parser.lexer = MediaWikiLexer.new
19
+ ast = parser.parse("'''Hello'''''World''!")
20
+ walker = MediaWikiHTMLGenerator.new
21
+ walker.parse(ast)
22
+ puts walker.html
23
+ This is useful if you want to use another generator.
24
+
25
+ Both examples should produce
26
+ <b>Hello</b><i>World</i>!
27
+
28
+ = API DOCS
29
+ To generate API documentation run:
30
+ rake rdoc
31
+
32
+ = DEVELOPMENT
33
+ If you want to modify mediacloth sources you will need:
34
+ 1. Download and install RACC parser generator (http://i.loveruby.net/en/projects/racc/)
35
+ 2. Execute "rake parser" to update your parser from .y definition
36
+ To run tests execute
37
+ rake test
@@ -0,0 +1,50 @@
1
+ #AST Node
2
+ class AST
3
+ attr_accessor :contents
4
+ attr_accessor :parent
5
+ attr_accessor :children
6
+
7
+ def initialize
8
+ @children = []
9
+ @parent = nil
10
+ @contents = ""
11
+ end
12
+ end
13
+
14
+ #The root node for all wiki parse trees
15
+ class WikiAST < AST
16
+
17
+ end
18
+
19
+ #The node to represent a simple or formatted text
20
+ #with more AST nodes inside.
21
+ class FormattedAST < AST
22
+ #Currently recognized formatting: :Bold, :Italic, :Link, :InternalLink, :HLine
23
+ attr_accessor :formatting
24
+ end
25
+
26
+ #The node to represent a simple or formatted text
27
+ class TextAST < FormattedAST
28
+ #Currently recognized formatting: :Link, :InternalLink, :HLine
29
+ end
30
+
31
+ #The node to represent a list
32
+ class ListAST < AST
33
+ #Currently recognized types: :Bulleted, :Numbered
34
+ attr_accessor :type
35
+ end
36
+
37
+ #The node to represent a list item
38
+ class ListItemAST < AST
39
+ end
40
+
41
+ #The node to represent a section
42
+ class SectionAST < AST
43
+ #The level of the section (1,2,3...) that would correspond to
44
+ #<h1>, <h2>, <h3>, etc.
45
+ attr_accessor :level
46
+ end
47
+
48
+ #The node to represent a preformatted contents
49
+ class PreformattedAST < AST
50
+ end
@@ -0,0 +1,105 @@
require 'mediacloth/mediawikiwalker'
require 'mediacloth/mediawikiparams'

#HTML generator for a MediaWiki parse tree
#
#Typical use case:
# parser = MediaWikiParser.new
# parser.lexer = MediaWikiLexer.new
# ast = parser.parse(input)
# walker = MediaWikiHTMLGenerator.new
# walker.parse(ast)
# puts walker.html
class MediaWikiHTMLGenerator < MediaWikiWalker
    #Accumulated HTML output; valid after parse() has run.
    attr_reader :html

    def initialize
        @html = ""
    end

protected

    #Walks the root node; output accumulates in @html.
    def parse_wiki_ast(ast)
        super(ast)
    end

    #Renders a text node, wrapping its contents in the tag (if any)
    #dictated by the node's formatting.
    def parse_text(ast)
        tag = formatting_to_tag(ast)
        if tag[0].empty?
            @html += ast.contents
        else
            @html += "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
        end
        super(ast)
    end

    #Renders a formatted node: opening tag, children, closing tag.
    #
    #FIX: the original interpolated the whole [name, attributes] pair
    #("<#{tag}>"), which only produced "<b>" thanks to Ruby 1.8's
    #Array#to_s joining without a separator; under Ruby 1.9+ it emits
    #'<["b", ""]>'. Interpolate the tag name and attributes explicitly,
    #mirroring parse_text above.
    def parse_formatted(ast)
        tag = formatting_to_tag(ast)
        @html += "<#{tag[0]}#{tag[1]}>"
        super(ast)
        @html += "</#{tag[0]}>"
    end

    #Renders a list node as <ul>/<ol> around its items.
    def parse_list(ast)
        tag = list_tag(ast)
        @html += "<#{tag}>"
        super(ast)
        @html += "</#{tag}>"
    end

    #Renders a single list item.
    def parse_list_item(ast)
        @html += "<li>"
        super(ast)
        @html += "</li>"
    end

    #Preformatted contents are emitted by the children themselves.
    def parse_preformatted(ast)
        super(ast)
    end

    #Renders a section heading as <hN>...</hN> using the node's level.
    def parse_section(ast)
        @html += "<h#{ast.level}>"
        @html += ast.contents.strip
        @html += "</h#{ast.level}>"
        super(ast)
    end

private

    #Maps ast.formatting to a [tag_name, tag_attributes] pair.
    #Returns ["", ""] when the node needs no wrapping tag.
    #NOTE: may rewrite ast.contents as a side effect (links, signatures,
    #horizontal lines) before the caller emits it.
    def formatting_to_tag(ast)
        tag = ["", ""]
        case ast.formatting
        when :Bold
            tag = ["b", ""]
        when :Italic
            tag = ["i", ""]
        when :Link, :ExternalLink
            #contents is "url [optional display text...]"
            links = ast.contents.split
            link = links[0]
            link_name = links[1, links.length-1].join(" ")
            link_name = link if link_name.empty?
            ast.contents = link_name
            tag = ["a", " href=\"#{link}\" rel=\"nofollow\""]
        when :HLine
            ast.contents = ""
            tag = ["hr", ""]
        when :SignatureDate
            ast.contents = MediaWikiParams.instance.time.to_s
        when :SignatureName
            ast.contents = MediaWikiParams.instance.author
        when :SignatureFull
            ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
        end
        tag
    end

    #Returns the HTML list tag name for the list ast node
    #(nil for unrecognized list types, matching the original behavior).
    def list_tag(ast)
        if ast.type == :Bulleted
            return "ul"
        elsif ast.type == :Numbered
            return "ol"
        end
    end

end
@@ -0,0 +1,105 @@
#NOTE(review): this file is an editor backup (trailing "~") of
#data/lib/mediacloth/mediawikihtmlgenerator.rb that was packaged into the
#gem, apparently by accident. It differs from the real file only in the
#require paths below (no "mediacloth/" prefix). Consider excluding *~
#files from the gemspec.
require 'mediawikiwalker'
require 'mediawikiparams'

#HTML generator for a MediaWiki parse tree
#
#Typical use case:
# parser = MediaWikiParser.new
# parser.lexer = MediaWikiLexer.new
# ast = parser.parse(input)
# walker = MediaWikiHTMLGenerator.new
# walker.parse(ast)
# puts walker.html
class MediaWikiHTMLGenerator < MediaWikiWalker
    #Accumulated HTML output; valid after parse() has run.
    attr_reader :html

    def initialize
        @html = ""
    end

protected

    #Walks the root node; output accumulates in @html.
    def parse_wiki_ast(ast)
        super(ast)
    end

    #Renders a text node, wrapping its contents in the tag (if any)
    #dictated by the node's formatting.
    def parse_text(ast)
        tag = formatting_to_tag(ast)
        if tag[0].empty?
            @html += ast.contents
        else
            @html += "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
        end
        super(ast)
    end

    #Renders a formatted node: opening tag, children, closing tag.
    #NOTE(review): interpolating the [name, attrs] array relies on Ruby
    #1.8 Array#to_s joining without a separator; under Ruby 1.9+ this
    #emits '<["b", ""]>' — confirm the target Ruby version.
    def parse_formatted(ast)
        tag = formatting_to_tag(ast)
        @html += "<#{tag}>"
        super(ast)
        @html += "</#{tag}>"
    end

    #Renders a list node as <ul>/<ol> around its items.
    def parse_list(ast)
        tag = list_tag(ast)
        @html += "<#{tag}>"
        super(ast)
        @html += "</#{tag}>"
    end

    #Renders a single list item.
    def parse_list_item(ast)
        @html += "<li>"
        super(ast)
        @html += "</li>"
    end

    #Preformatted contents are emitted by the children themselves.
    def parse_preformatted(ast)
        super(ast)
    end

    #Renders a section heading as <hN>...</hN> using the node's level.
    def parse_section(ast)
        @html += "<h#{ast.level}>"
        @html += ast.contents.strip
        @html += "</h#{ast.level}>"
        super(ast)
    end

private

    #returns an array with a tag name and tag attributes
    #NOTE: may rewrite ast.contents as a side effect (links, signatures,
    #horizontal lines) before the caller emits it.
    def formatting_to_tag(ast)
        tag = ["", ""]
        if ast.formatting == :Bold
            tag = ["b", ""]
        elsif ast.formatting == :Italic
            tag = ["i", ""]
        elsif ast.formatting == :Link or ast.formatting == :ExternalLink
            #contents is "url [optional display text...]"
            links = ast.contents.split
            link = links[0]
            link_name = links[1, links.length-1].join(" ")
            link_name = link if link_name.empty?
            ast.contents = link_name
            tag = ["a", " href=\"#{link}\" rel=\"nofollow\""]
        elsif ast.formatting == :HLine
            ast.contents = ""
            tag = ["hr", ""]
        elsif ast.formatting == :SignatureDate
            ast.contents = MediaWikiParams.instance.time.to_s
        elsif ast.formatting == :SignatureName
            ast.contents = MediaWikiParams.instance.author
        elsif ast.formatting == :SignatureFull
            ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
        end
        tag
    end

    #returns a tag name of the list in ast node
    #(nil for unrecognized list types)
    def list_tag(ast)
        if ast.type == :Bulleted
            return "ul"
        elsif ast.type == :Numbered
            return "ol"
        end
    end

end
@@ -0,0 +1,407 @@
#The lexer for MediaWiki language.
#
#Standalone usage:
# file = File.new("somefile", "r")
# input = file.read
# lexer = MediaWikiLexer.new
# lexer.tokenize(input)
#
#Inside RACC-generated parser:
# ...
# ---- inner ----
# attr_accessor :lexer
# def parse(input)
#     lexer.tokenize(input)
#     return do_parse
# end
# def next_token
#     return @lexer.lex
# end
# ...
# parser = MediaWikiParser.new
# parser.lexer = MediaWikiLexer.new
# parser.parse(input)
class MediaWikiLexer

    #Initializes the lexer with a match table.
    #
    #The match table tells the lexer which method to invoke
    #on given input char during "tokenize" phase. Characters not in the
    #table fall through to match_other (accumulated into :TEXT tokens).
    def initialize
        @position = 0
        @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
        @list_stack = []
        @lexer_table = Hash.new(method(:match_other))
        @lexer_table["'"] = method(:match_italic_or_bold)
        @lexer_table["="] = method(:match_section)
        @lexer_table["["] = method(:match_link_start)
        @lexer_table["]"] = method(:match_link_end)
        @lexer_table[" "] = method(:match_space)
        @lexer_table["*"] = method(:match_list)
        @lexer_table["#"] = method(:match_list)
        @lexer_table[";"] = method(:match_list)
        @lexer_table[":"] = method(:match_list)
        @lexer_table["-"] = method(:match_line)
        @lexer_table["~"] = method(:match_signature)
        @lexer_table["h"] = method(:match_inline_link) #"h" as the first char of "http://"
    end

    #Transforms input stream (string) into the stream of tokens.
    #Tokens are collected into an array of type [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
    #This array can be given as input token-by token to RACC based parser with no
    #modification. The last token [false, false] indicates EOF.
    def tokenize(input)
        @tokens = []
        @cursor = 0
        @text = input
        @next_token = []

        #This tokenizer algorithm assumes that everything that is not
        #matched by the lexer is going to be :TEXT token. Otherwise it's usual
        #lexer algo which call methods from the match table to define next tokens.
        while (@cursor < @text.length)
            @current_token = [:TEXT, ''] unless @current_token
            @token_start = @cursor
            @char = @text[@cursor, 1]

            if @lexer_table[@char].call == :TEXT
                #matched char belongs to running :TEXT token; append one char
                @current_token[1] += @text[@token_start, 1]
            else
                #a real token was recognized by the match method; it set
                #@next_token[0] and advanced @cursor past the token text
                #skip empty :TEXT tokens
                @tokens << @current_token unless empty_text_token?
                @next_token[1] = @text[@token_start, @cursor - @token_start]
                @tokens << @next_token
                #hack to enable sub-lexing!
                if @sub_tokens
                    @tokens += @sub_tokens
                    @sub_tokens = nil
                end
                #end of hack!
                @current_token = nil
                @next_token = []
            end
        end
        #add the last TEXT token if it exists
        @tokens << @current_token if @current_token and not empty_text_token?

        #RACC wants us to put this to indicate EOF
        @tokens << [false, false]
        @tokens
    end

    #Returns the next token from the stream. Useful for RACC parsers.
    def lex
        token = @tokens[@position]
        @position += 1
        return token
    end


private
    #-- ================== Match methods ================== ++#

    #Matches anything that was not matched. Returns :TEXT to indicate
    #that matched characters should go into :TEXT token.
    def match_other
        @cursor += 1
        return :TEXT
    end

    #Matches italic or bold symbols:
    # "'''" { return :BOLD; }
    # "''" { return :ITALIC; }
    #The :ITALICSTART guard prevents "'''" inside an open italic span from
    #being read as bold.
    def match_italic_or_bold
        if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
            matchBold
            @cursor += 3
            return
        end
        if @text[@cursor, 2] == "''"
            matchItalic
            @cursor += 2
            return
        end
        match_other
    end

    #Emits :BOLDSTART or :BOLDEND depending on whether a bold span is
    #already open on the pair stack.
    def matchBold
        if @pair_stack.last[0] == :BOLDSTART
            @next_token[0] = :BOLDEND
            @pair_stack.pop
        else
            @next_token[0] = :BOLDSTART
            @pair_stack.push @next_token
        end
    end

    #Emits :ITALICSTART or :ITALICEND depending on whether an italic span
    #is already open on the pair stack.
    def matchItalic
        if @pair_stack.last[0] == :ITALICSTART
            @next_token[0] = :ITALICEND
            @pair_stack.pop
        else
            @next_token[0] = :ITALICSTART
            @pair_stack.push @next_token
        end
    end

    #Matches sections
    # "=+" { return SECTION; }
    #A section marker is only valid at the start of a line or as the
    #closing half of an open :SECTION pair.
    def match_section
        if (@text[@cursor-1, 1] == "\n") or (@pair_stack.last[0] == :SECTION)
            i = 0
            i += 1 while @text[@cursor+i, 1] == "="
            @cursor += i
            @next_token[0] = :SECTION

            if @pair_stack.last[0] == :SECTION
                @pair_stack.pop
            else
                @pair_stack.push @next_token
            end
        else
            match_other
        end
    end

    #Matches start of the hyperlinks
    # "[[" { return INTLINKSTART; }
    # "[" { return LINKSTART; }
    #A single "[" only starts a link when followed by "http://".
    def match_link_start
        if @text[@cursor, 2] == "[["
            @next_token[0] = :INTLINKSTART
            @pair_stack.push @next_token
            @cursor += 2
        elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
            @next_token[0] = :LINKSTART
            @pair_stack.push @next_token
            @cursor += 1
        else
            match_other
        end
    end

    #Matches end of the hyperlinks
    # "]]" { return INTLINKEND; }
    # "]" { return LINKEND; }
    #Only closes a link when the matching start token is on the pair stack.
    def match_link_end
        if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
            @next_token[0] = :INTLINKEND
            @pair_stack.pop
            @cursor += 2
        elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
            @next_token[0] = :LINKEND
            @pair_stack.pop
            @cursor += 1
        else
            match_other
        end
    end

    #Matches inlined unformatted html link
    # "http://[^\s]*" { return [ LINKSTART TEXT LINKEND]; }
    def match_inline_link
        #if no link start token was detected and the text starts with http://
        #then it's the inlined unformatted html link
        if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
                @pair_stack.last[0] != :LINKSTART
            @next_token[0] = :LINKSTART
            linkText = extract_till_whitespace
            @sub_tokens = []
            @sub_tokens << [:TEXT, linkText]
            @sub_tokens << [:LINKEND, ']']
            @cursor += linkText.length
            @token_start = @cursor
        else
            match_other
        end
    end

    #Matches space to find preformatted areas which start with a space after a newline
    # "\n\s[^\n]*" { return PRE; }
    def match_space
        if at_start_of_line?
            match_untill_eol
            @next_token[0] = :PRE
            strip_ws_from_token_start
        else
            match_other
        end
    end

    #Matches any kind of list by using sublexing technique. MediaWiki lists are context-sensitive
    #therefore we need to do some special processing with lists. The idea here is to strip
    #the leftmost symbol indicating the list from the group of input lines and use separate
    #lexer to process extracted fragment.
    def match_list
        if at_start_of_line?
            list_id = @text[@cursor, 1]
            sub_text = extract_list_contents(list_id)
            extracted = 0

            #hack to tokenize everything inside the list
            @sub_tokens = []
            sub_lines = ""
            @sub_tokens << [:LI_START, ""]
            #NOTE(review): String#each iterates lines only under Ruby 1.8;
            #later rubies need each_line — confirm the target Ruby version.
            sub_text.each do |t|
                extracted += 1
                if text_is_list? t
                    #nested list line: batch it for a recursive sub-lex
                    sub_lines += t
                else
                    #plain item line: flush any batched nested list first
                    if not sub_lines.empty?
                        @sub_tokens += sub_lex(sub_lines)
                        sub_lines = ""
                    end
                    if @sub_tokens.last[0] != :LI_START
                        @sub_tokens << [:LI_END, ""]
                        @sub_tokens << [:LI_START, ""]
                    end
                    @sub_tokens += sub_lex(t.lstrip)
                end
            end
            if not sub_lines.empty?
                @sub_tokens += sub_lex(sub_lines)
                @sub_tokens << [:LI_END, ""]
            else
                @sub_tokens << [:LI_END, ""]
            end

            #end of hack
            #advance past the extracted fragment plus one stripped list
            #marker per line
            @cursor += sub_text.length + extracted
            @token_start = @cursor

            case
            when list_id == "*"
                @next_token[0] = :UL_START
                @sub_tokens << [:UL_END, ""]
            when list_id == "#"
                @next_token[0] = :OL_START
                @sub_tokens << [:OL_END, ""]
            when list_id == ";", list_id == ":"
                @next_token[0] = :DL_START
                @sub_tokens << [:DL_END, ""]
            end

        else
            match_other
        end
    end

    #Matches the line until \n
    #(sic: "untill" is the historical method name; renaming would touch callers)
    def match_untill_eol
        val = @text[@cursor, 1]
        while (val != "\n") and (!val.nil?)
            @cursor += 1
            val = @text[@cursor, 1]
        end
        @cursor += 1
    end

    #Matches hline tag that start with "-"
    # "\n----" { return HLINE; }
    def match_line
        if at_start_of_line? and @text[@cursor, 4] == "----"
            @next_token[0] = :HLINE
            @cursor += 4
        else
            match_other
        end
    end

    #Matches signature
    # "~~~~~" { return SIGNATURE_DATE; }
    # "~~~~" { return SIGNATURE_FULL; }
    # "~~~" { return SIGNATURE_NAME; }
    #Longest match first so five tildes are not read as 4+1.
    def match_signature
        if @text[@cursor, 5] == "~~~~~"
            @next_token[0] = :SIGNATURE_DATE
            @cursor += 5
        elsif @text[@cursor, 4] == "~~~~"
            @next_token[0] = :SIGNATURE_FULL
            @cursor += 4
        elsif @text[@cursor, 3] == "~~~"
            @next_token[0] = :SIGNATURE_NAME
            @cursor += 3
        else
            match_other
        end
    end

    #-- ================== Helper methods ================== ++#

    #Checks if the token is placed at the start of the line.
    def at_start_of_line?
        if @cursor == 0 or @text[@cursor-1, 1] == "\n"
            true
        else
            false
        end
    end

    #Checks if the text at position contains the start of the html link
    def html_link?(position)
        return @text[position, 7] == 'http://'
    end

    #Adjusts @token_start to skip leading whitespaces
    def strip_ws_from_token_start
        @token_start += 1 while @text[@token_start, 1] == " "
    end

    #Returns true if the TEXT token is empty or contains newline only
    def empty_text_token?
        @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
    end

    #Returns true if the text is a list, i.e. starts with one of #;*: symbols
    #that indicate a list
    def text_is_list?(text)
        return text =~ /^[#;*:].*/
    end

    #Runs sublexer to tokenize sub_text
    def sub_lex(sub_text)
        sub_lexer = MediaWikiLexer.new
        sub_tokens = sub_lexer.tokenize(sub_text)
        sub_tokens.pop #drop the sub-lexer's [false, false] EOF marker
        sub_tokens
    end

    #Extracts the text from current cursor position till the next whitespace
    def extract_till_whitespace
        i = @cursor
        text = ""
        while i < @text.length
            curr = @text[i, 1]
            if (curr == "\n") or (curr == "\t") or (curr == " ")
                break
            end
            text += curr
            i += 1
        end
        text
    end

    #Extract list contents of list type set by list_id variable.
    #Example list:
    # *a
    # **a
    #Extracted list with id "*" will look like:
    # a
    # *a
    def extract_list_contents(list_id)
        i = @cursor+1
        list = ""
        while i < @text.length
            curr = @text[i, 1]
            if (curr == "\n") and (@text[i+1, 1] != list_id)
                list+=curr
                break
            end
            #drop one leading list marker per line; keep everything else
            list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
            i += 1
        end
        list
    end

end