mediacloth 0.0.1

Files changed (48)
  1. data/README +37 -0
  2. data/lib/mediacloth/mediawikiast.rb +50 -0
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +105 -0
  4. data/lib/mediacloth/mediawikihtmlgenerator.rb~ +105 -0
  5. data/lib/mediacloth/mediawikilexer.rb +407 -0
  6. data/lib/mediacloth/mediawikiparams.rb +33 -0
  7. data/lib/mediacloth/mediawikiparser.rb +429 -0
  8. data/lib/mediacloth/mediawikiparser.y +172 -0
  9. data/lib/mediacloth/mediawikiparser.y~ +172 -0
  10. data/lib/mediacloth/mediawikiwalker.rb +62 -0
  11. data/lib/mediacloth/mediawikiwalker.rb~ +62 -0
  12. data/lib/mediacloth.rb +23 -0
  13. data/lib/mediacloth.rb~ +23 -0
  14. data/test/data/html1 +21 -0
  15. data/test/data/html2 +2 -0
  16. data/test/data/html3 +1 -0
  17. data/test/data/html4 +1 -0
  18. data/test/data/html6 +8 -0
  19. data/test/data/html7 +1 -0
  20. data/test/data/input1 +29 -0
  21. data/test/data/input2 +2 -0
  22. data/test/data/input3 +2 -0
  23. data/test/data/input4 +1 -0
  24. data/test/data/input5 +12 -0
  25. data/test/data/input6 +8 -0
  26. data/test/data/input7 +2 -0
  27. data/test/data/lex1 +23 -0
  28. data/test/data/lex2 +2 -0
  29. data/test/data/lex3 +1 -0
  30. data/test/data/lex4 +1 -0
  31. data/test/data/lex5 +12 -0
  32. data/test/data/lex6 +8 -0
  33. data/test/data/lex7 +2 -0
  34. data/test/data/result1 +48 -0
  35. data/test/dataproducers/html.rb +23 -0
  36. data/test/dataproducers/html.rb~ +23 -0
  37. data/test/dataproducers/lex.rb +15 -0
  38. data/test/debugwalker.rb +63 -0
  39. data/test/debugwalker.rb~ +63 -0
  40. data/test/htmlgenerator.rb +25 -0
  41. data/test/htmlgenerator.rb~ +25 -0
  42. data/test/lexer.rb +57 -0
  43. data/test/lexer.rb~ +57 -0
  44. data/test/parser.rb +23 -0
  45. data/test/parser.rb~ +23 -0
  46. data/test/testhelper.rb +27 -0
  47. data/test/testhelper.rb~ +28 -0
  48. metadata +97 -0
data/README ADDED
@@ -0,0 +1,37 @@
+ MediaCloth is the first MediaWiki parser and HTML generator written in Ruby.
+ It's small and fast, and it aims to recognize the complete MediaWiki language.
+
+ = INSTALLATION
+ To install the library, run:
+   ruby setup.rb
+
+
+ = USAGE
+ The quickest way to parse your input and produce HTML-formatted text is:
+   require 'mediacloth'
+   puts MediaCloth::wiki_to_html("'''Hello'''''World''!")
+
+ Alternatively, it's possible to create and use each component of MediaCloth manually:
+   require 'mediacloth'
+
+   parser = MediaWikiParser.new
+   parser.lexer = MediaWikiLexer.new
+   ast = parser.parse("'''Hello'''''World''!")
+   walker = MediaWikiHTMLGenerator.new
+   walker.parse(ast)
+   puts walker.html
+ This is useful if you want to use another generator.
+
+ Both examples should produce
+   <b>Hello</b><i>World</i>!
+
+ = API DOCS
+ To generate API documentation, run:
+   rake rdoc
+
+ = DEVELOPMENT
+ If you want to modify the mediacloth sources, you will need to:
+ 1. Download and install the RACC parser generator (http://i.loveruby.net/en/projects/racc/)
+ 2. Execute "rake parser" to update your parser from the .y definition
+ To run the tests, execute
+   rake test
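
A sketch of the "another generator" idea from the README: MediaWikiHTMLGenerator (shown below) is just a MediaWikiWalker subclass that overrides the parse_* callbacks, so a custom generator can do the same. The following is a hypothetical example, not part of the package; the class name PlainTextGenerator is illustrative, and it assumes only the walker API visible in this diff (require 'mediacloth' loading the walker classes, parse_text receiving an AST node, and super descending into the children):

  require 'mediacloth'

  #Hypothetical walker that collects plain text and drops all formatting.
  class PlainTextGenerator < MediaWikiWalker
      attr_reader :text

      def initialize
          @text = ""
      end

  protected

      #Collect the node's text, then let super visit the children.
      def parse_text(ast)
          @text += ast.contents
          super(ast)
      end
  end

  walker = PlainTextGenerator.new
  walker.parse(ast)
  puts walker.text   #=> "HelloWorld!" for the README input above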
data/lib/mediacloth/mediawikiast.rb ADDED
@@ -0,0 +1,50 @@
+ #AST node
+ class AST
+     attr_accessor :contents
+     attr_accessor :parent
+     attr_accessor :children
+
+     def initialize
+         @children = []
+         @parent = nil
+         @contents = ""
+     end
+ end
+
+ #The root node for all wiki parse trees
+ class WikiAST < AST
+
+ end
+
+ #The node to represent simple or formatted text
+ #with more AST nodes inside.
+ class FormattedAST < AST
+     #Currently recognized formatting: :Bold, :Italic, :Link, :InternalLink, :HLine
+     attr_accessor :formatting
+ end
+
+ #The node to represent simple or formatted text
+ class TextAST < FormattedAST
+     #Currently recognized formatting: :Link, :InternalLink, :HLine
+ end
+
+ #The node to represent a list
+ class ListAST < AST
+     #Currently recognized types: :Bulleted, :Numbered
+     attr_accessor :type
+ end
+
+ #The node to represent a list item
+ class ListItemAST < AST
+ end
+
+ #The node to represent a section
+ class SectionAST < AST
+     #The level of the section (1,2,3...) that corresponds to
+     #<h1>, <h2>, <h3>, etc.
+     attr_accessor :level
+ end
+
+ #The node to represent preformatted contents
+ class PreformattedAST < AST
+ end
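
A quick illustration of these containers (a hand-built tree; a sketch assuming only the classes above): the parser normally constructs the AST, and note that the classes do not wire the parent/children links automatically.

  require 'mediacloth/mediawikiast'

  root = WikiAST.new
  node = TextAST.new
  node.contents = "Hello"
  node.formatting = nil    #plain text; no :Link/:InternalLink/:HLine
  node.parent = root       #links must be set by hand
  root.children << node
  #root can now be passed to any MediaWikiWalker, e.g. MediaWikiHTMLGenerator.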
data/lib/mediacloth/mediawikihtmlgenerator.rb ADDED
@@ -0,0 +1,105 @@
+ require 'mediacloth/mediawikiwalker'
+ require 'mediacloth/mediawikiparams'
+
+ #HTML generator for a MediaWiki parse tree
+ #
+ #Typical use case:
+ #   parser = MediaWikiParser.new
+ #   parser.lexer = MediaWikiLexer.new
+ #   ast = parser.parse(input)
+ #   walker = MediaWikiHTMLGenerator.new
+ #   walker.parse(ast)
+ #   puts walker.html
+ class MediaWikiHTMLGenerator < MediaWikiWalker
+     attr_reader :html
+
+     def initialize
+         @html = ""
+     end
+
+ protected
+
+     def parse_wiki_ast(ast)
+         super(ast)
+     end
+
+     def parse_text(ast)
+         tag = formatting_to_tag(ast)
+         if tag[0].empty?
+             @html += ast.contents
+         else
+             @html += "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
+         end
+         super(ast)
+     end
+
+     def parse_formatted(ast)
+         tag = formatting_to_tag(ast)
+         @html += "<#{tag}>"
+         super(ast)
+         @html += "</#{tag}>"
+     end
+
+     def parse_list(ast)
+         tag = list_tag(ast)
+         @html += "<#{tag}>"
+         super(ast)
+         @html += "</#{tag}>"
+     end
+
+     def parse_list_item(ast)
+         @html += "<li>"
+         super(ast)
+         @html += "</li>"
+     end
+
+     def parse_preformatted(ast)
+         super(ast)
+     end
+
+     def parse_section(ast)
+         @html += "<h#{ast.level}>"
+         @html += ast.contents.strip
+         @html += "</h#{ast.level}>"
+         super(ast)
+     end
+
+ private
+
+     #Returns an array with the tag name and the tag attributes
+     def formatting_to_tag(ast)
+         tag = ["", ""]
+         if ast.formatting == :Bold
+             tag = ["b", ""]
+         elsif ast.formatting == :Italic
+             tag = ["i", ""]
+         elsif ast.formatting == :Link or ast.formatting == :ExternalLink
+             links = ast.contents.split
+             link = links[0]
+             link_name = links[1, links.length-1].join(" ")
+             link_name = link if link_name.empty?
+             ast.contents = link_name
+             tag = ["a", " href=\"#{link}\" rel=\"nofollow\""]
+         elsif ast.formatting == :HLine
+             ast.contents = ""
+             tag = ["hr", ""]
+         elsif ast.formatting == :SignatureDate
+             ast.contents = MediaWikiParams.instance.time.to_s
+         elsif ast.formatting == :SignatureName
+             ast.contents = MediaWikiParams.instance.author
+         elsif ast.formatting == :SignatureFull
+             ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
+         end
+         tag
+     end
+
+     #Returns the tag name of the list in the ast node
+     def list_tag(ast)
+         if ast.type == :Bulleted
+             return "ul"
+         elsif ast.type == :Numbered
+             return "ol"
+         end
+     end
+
+ end
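
To make the link branch of formatting_to_tag concrete, here is a worked trace (derived by reading the code above, not captured output): for a node with formatting :Link and contents "http://example.com Example site", split yields ["http://example.com", "Example", "site"], so link is "http://example.com" and link_name becomes "Example site". The node's contents are rewritten to the link name, and parse_text then emits:

  <a href="http://example.com" rel="nofollow">Example site</a>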
data/lib/mediacloth/mediawikihtmlgenerator.rb~ ADDED
@@ -0,0 +1,105 @@
+ require 'mediawikiwalker'
+ require 'mediawikiparams'
+
+ #HTML generator for a MediaWiki parse tree
+ #
+ #Typical use case:
+ #   parser = MediaWikiParser.new
+ #   parser.lexer = MediaWikiLexer.new
+ #   ast = parser.parse(input)
+ #   walker = MediaWikiHTMLGenerator.new
+ #   walker.parse(ast)
+ #   puts walker.html
+ class MediaWikiHTMLGenerator < MediaWikiWalker
+     attr_reader :html
+
+     def initialize
+         @html = ""
+     end
+
+ protected
+
+     def parse_wiki_ast(ast)
+         super(ast)
+     end
+
+     def parse_text(ast)
+         tag = formatting_to_tag(ast)
+         if tag[0].empty?
+             @html += ast.contents
+         else
+             @html += "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
+         end
+         super(ast)
+     end
+
+     def parse_formatted(ast)
+         tag = formatting_to_tag(ast)
+         @html += "<#{tag}>"
+         super(ast)
+         @html += "</#{tag}>"
+     end
+
+     def parse_list(ast)
+         tag = list_tag(ast)
+         @html += "<#{tag}>"
+         super(ast)
+         @html += "</#{tag}>"
+     end
+
+     def parse_list_item(ast)
+         @html += "<li>"
+         super(ast)
+         @html += "</li>"
+     end
+
+     def parse_preformatted(ast)
+         super(ast)
+     end
+
+     def parse_section(ast)
+         @html += "<h#{ast.level}>"
+         @html += ast.contents.strip
+         @html += "</h#{ast.level}>"
+         super(ast)
+     end
+
+ private
+
+     #returns an array with a tag name and tag attributes
+     def formatting_to_tag(ast)
+         tag = ["", ""]
+         if ast.formatting == :Bold
+             tag = ["b", ""]
+         elsif ast.formatting == :Italic
+             tag = ["i", ""]
+         elsif ast.formatting == :Link or ast.formatting == :ExternalLink
+             links = ast.contents.split
+             link = links[0]
+             link_name = links[1, links.length-1].join(" ")
+             link_name = link if link_name.empty?
+             ast.contents = link_name
+             tag = ["a", " href=\"#{link}\" rel=\"nofollow\""]
+         elsif ast.formatting == :HLine
+             ast.contents = ""
+             tag = ["hr", ""]
+         elsif ast.formatting == :SignatureDate
+             ast.contents = MediaWikiParams.instance.time.to_s
+         elsif ast.formatting == :SignatureName
+             ast.contents = MediaWikiParams.instance.author
+         elsif ast.formatting == :SignatureFull
+             ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
+         end
+         tag
+     end
+
+     #returns a tag name of the list in ast node
+     def list_tag(ast)
+         if ast.type == :Bulleted
+             return "ul"
+         elsif ast.type == :Numbered
+             return "ol"
+         end
+     end
+
+ end
data/lib/mediacloth/mediawikilexer.rb ADDED
@@ -0,0 +1,407 @@
+ #The lexer for the MediaWiki language.
+ #
+ #Standalone usage:
+ #   file = File.new("somefile", "r")
+ #   input = file.read
+ #   lexer = MediaWikiLexer.new
+ #   lexer.tokenize(input)
+ #
+ #Inside a RACC-generated parser:
+ #   ...
+ #   ---- inner ----
+ #   attr_accessor :lexer
+ #   def parse(input)
+ #       lexer.tokenize(input)
+ #       return do_parse
+ #   end
+ #   def next_token
+ #       return @lexer.lex
+ #   end
+ #   ...
+ #   parser = MediaWikiParser.new
+ #   parser.lexer = MediaWikiLexer.new
+ #   parser.parse(input)
+ class MediaWikiLexer
+
+     #Initializes the lexer with a match table.
+     #
+     #The match table tells the lexer which method to invoke
+     #on a given input char during the "tokenize" phase.
+     def initialize
+         @position = 0
+         @pair_stack = [[false, false]] #stack of tokens for which a pair should be found
+         @list_stack = []
+         @lexer_table = Hash.new(method(:match_other))
+         @lexer_table["'"] = method(:match_italic_or_bold)
+         @lexer_table["="] = method(:match_section)
+         @lexer_table["["] = method(:match_link_start)
+         @lexer_table["]"] = method(:match_link_end)
+         @lexer_table[" "] = method(:match_space)
+         @lexer_table["*"] = method(:match_list)
+         @lexer_table["#"] = method(:match_list)
+         @lexer_table[";"] = method(:match_list)
+         @lexer_table[":"] = method(:match_list)
+         @lexer_table["-"] = method(:match_line)
+         @lexer_table["~"] = method(:match_signature)
+         @lexer_table["h"] = method(:match_inline_link)
+     end
+
+     #Transforms the input stream (string) into a stream of tokens.
+     #Tokens are collected into an array of the form [ [TOKEN_SYMBOL, TOKEN_VALUE], ..., [false, false] ].
+     #This array can be fed token-by-token to a RACC-based parser with no
+     #modification. The last token [false, false] indicates EOF.
+     def tokenize(input)
+         @tokens = []
+         @cursor = 0
+         @text = input
+         @next_token = []
+
+         #This tokenizer algorithm assumes that everything that is not
+         #matched by the lexer becomes a :TEXT token. Otherwise it's the usual
+         #lexer algorithm, which calls methods from the match table to define the next tokens.
+         while (@cursor < @text.length)
+             @current_token = [:TEXT, ''] unless @current_token
+             @token_start = @cursor
+             @char = @text[@cursor, 1]
+
+             if @lexer_table[@char].call == :TEXT
+                 @current_token[1] += @text[@token_start, 1]
+             else
+                 #skip empty :TEXT tokens
+                 @tokens << @current_token unless empty_text_token?
+                 @next_token[1] = @text[@token_start, @cursor - @token_start]
+                 @tokens << @next_token
+                 #hack to enable sub-lexing!
+                 if @sub_tokens
+                     @tokens += @sub_tokens
+                     @sub_tokens = nil
+                 end
+                 #end of hack!
+                 @current_token = nil
+                 @next_token = []
+             end
+         end
+         #add the last TEXT token if it exists
+         @tokens << @current_token if @current_token and not empty_text_token?
+
+         #RACC wants us to put this to indicate EOF
+         @tokens << [false, false]
+         @tokens
+     end
+
+     #Returns the next token from the stream. Useful for RACC parsers.
+     def lex
+         token = @tokens[@position]
+         @position += 1
+         return token
+     end
+
+
+ private
+     #-- ================== Match methods ================== ++#
+
+     #Matches anything that was not matched otherwise. Returns :TEXT to indicate
+     #that the matched characters should go into a :TEXT token.
+     def match_other
+         @cursor += 1
+         return :TEXT
+     end
+
+     #Matches italic or bold symbols:
+     #   "'''" { return :BOLD; }
+     #   "''"  { return :ITALIC; }
+     def match_italic_or_bold
+         if @text[@cursor, 3] == "'''" and @pair_stack.last[0] != :ITALICSTART
+             matchBold
+             @cursor += 3
+             return
+         end
+         if @text[@cursor, 2] == "''"
+             matchItalic
+             @cursor += 2
+             return
+         end
+         match_other
+     end
+
+     def matchBold
+         if @pair_stack.last[0] == :BOLDSTART
+             @next_token[0] = :BOLDEND
+             @pair_stack.pop
+         else
+             @next_token[0] = :BOLDSTART
+             @pair_stack.push @next_token
+         end
+     end
+
+     def matchItalic
+         if @pair_stack.last[0] == :ITALICSTART
+             @next_token[0] = :ITALICEND
+             @pair_stack.pop
+         else
+             @next_token[0] = :ITALICSTART
+             @pair_stack.push @next_token
+         end
+     end
+
+     #Matches sections
+     #   "=+" { return SECTION; }
+     def match_section
+         if (@text[@cursor-1, 1] == "\n") or (@pair_stack.last[0] == :SECTION)
+             i = 0
+             i += 1 while @text[@cursor+i, 1] == "="
+             @cursor += i
+             @next_token[0] = :SECTION
+
+             if @pair_stack.last[0] == :SECTION
+                 @pair_stack.pop
+             else
+                 @pair_stack.push @next_token
+             end
+         else
+             match_other
+         end
+     end
+
+     #Matches the start of hyperlinks
+     #   "[[" { return INTLINKSTART; }
+     #   "["  { return LINKSTART; }
+     def match_link_start
+         if @text[@cursor, 2] == "[["
+             @next_token[0] = :INTLINKSTART
+             @pair_stack.push @next_token
+             @cursor += 2
+         elsif @text[@cursor, 1] == "[" and html_link?(@cursor+1)
+             @next_token[0] = :LINKSTART
+             @pair_stack.push @next_token
+             @cursor += 1
+         else
+             match_other
+         end
+     end
+
+     #Matches the end of hyperlinks
+     #   "]]" { return INTLINKEND; }
+     #   "]"  { return LINKEND; }
+     def match_link_end
+         if @text[@cursor, 2] == "]]" and @pair_stack.last[0] == :INTLINKSTART
+             @next_token[0] = :INTLINKEND
+             @pair_stack.pop
+             @cursor += 2
+         elsif @text[@cursor, 1] == "]" and @pair_stack.last[0] == :LINKSTART
+             @next_token[0] = :LINKEND
+             @pair_stack.pop
+             @cursor += 1
+         else
+             match_other
+         end
+     end
+
+     #Matches an inlined unformatted html link
+     #   "http://[^\s]*" { return [ LINKSTART TEXT LINKEND ]; }
+     def match_inline_link
+         #if no link start token was detected and the text starts with http://
+         #then it's an inlined unformatted html link
+         if html_link?(@cursor) and @pair_stack.last[0] != :INTLINKSTART and
+                 @pair_stack.last[0] != :LINKSTART
+             @next_token[0] = :LINKSTART
+             linkText = extract_till_whitespace
+             @sub_tokens = []
+             @sub_tokens << [:TEXT, linkText]
+             @sub_tokens << [:LINKEND, ']']
+             @cursor += linkText.length
+             @token_start = @cursor
+         else
+             match_other
+         end
+     end
+
+     #Matches a space to find preformatted areas, which start with a space after a newline
+     #   "\n\s[^\n]*" { return PRE; }
+     def match_space
+         if at_start_of_line?
+             match_untill_eol
+             @next_token[0] = :PRE
+             strip_ws_from_token_start
+         else
+             match_other
+         end
+     end
+
+     #Matches any kind of list by using a sublexing technique. MediaWiki lists are context-sensitive,
+     #therefore we need to do some special processing with lists. The idea here is to strip
+     #the leftmost symbol indicating the list from the group of input lines and use a separate
+     #lexer to process the extracted fragment.
+     def match_list
+         if at_start_of_line?
+             list_id = @text[@cursor, 1]
+             sub_text = extract_list_contents(list_id)
+             extracted = 0
+
+             #hack to tokenize everything inside the list
+             @sub_tokens = []
+             sub_lines = ""
+             @sub_tokens << [:LI_START, ""]
+             sub_text.each do |t|
+                 extracted += 1
+                 if text_is_list? t
+                     sub_lines += t
+                 else
+                     if not sub_lines.empty?
+                         @sub_tokens += sub_lex(sub_lines)
+                         sub_lines = ""
+                     end
+                     if @sub_tokens.last[0] != :LI_START
+                         @sub_tokens << [:LI_END, ""]
+                         @sub_tokens << [:LI_START, ""]
+                     end
+                     @sub_tokens += sub_lex(t.lstrip)
+                 end
+             end
+             if not sub_lines.empty?
+                 @sub_tokens += sub_lex(sub_lines)
+                 @sub_tokens << [:LI_END, ""]
+             else
+                 @sub_tokens << [:LI_END, ""]
+             end
+
+             #end of hack
+             @cursor += sub_text.length + extracted
+             @token_start = @cursor
+
+             case
+             when list_id == "*"
+                 @next_token[0] = :UL_START
+                 @sub_tokens << [:UL_END, ""]
+             when list_id == "#"
+                 @next_token[0] = :OL_START
+                 @sub_tokens << [:OL_END, ""]
+             when list_id == ";", list_id == ":"
+                 @next_token[0] = :DL_START
+                 @sub_tokens << [:DL_END, ""]
+             end
+
+         else
+             match_other
+         end
+     end
+
+     #Matches the line until \n
+     def match_untill_eol
+         val = @text[@cursor, 1]
+         while (val != "\n") and (!val.nil?)
+             @cursor += 1
+             val = @text[@cursor, 1]
+         end
+         @cursor += 1
+     end
+
+     #Matches the hline tag, which starts with "-"
+     #   "\n----" { return HLINE; }
+     def match_line
+         if at_start_of_line? and @text[@cursor, 4] == "----"
+             @next_token[0] = :HLINE
+             @cursor += 4
+         else
+             match_other
+         end
+     end
+
+     #Matches signatures
+     #   "~~~~~" { return SIGNATURE_DATE; }
+     #   "~~~~"  { return SIGNATURE_FULL; }
+     #   "~~~"   { return SIGNATURE_NAME; }
+     def match_signature
+         if @text[@cursor, 5] == "~~~~~"
+             @next_token[0] = :SIGNATURE_DATE
+             @cursor += 5
+         elsif @text[@cursor, 4] == "~~~~"
+             @next_token[0] = :SIGNATURE_FULL
+             @cursor += 4
+         elsif @text[@cursor, 3] == "~~~"
+             @next_token[0] = :SIGNATURE_NAME
+             @cursor += 3
+         else
+             match_other
+         end
+     end
+
+     #-- ================== Helper methods ================== ++#
+
+     #Checks if the token is placed at the start of the line.
+     def at_start_of_line?
+         if @cursor == 0 or @text[@cursor-1, 1] == "\n"
+             true
+         else
+             false
+         end
+     end
+
+     #Checks if the text at position contains the start of an html link
+     def html_link?(position)
+         return @text[position, 7] == 'http://'
+     end
+
+     #Adjusts @token_start to skip leading whitespace
+     def strip_ws_from_token_start
+         @token_start += 1 while @text[@token_start, 1] == " "
+     end
+
+     #Returns true if the TEXT token is empty or contains a newline only
+     def empty_text_token?
+         @current_token == [:TEXT, ''] or @current_token == [:TEXT, "\n"]
+     end
+
+     #Returns true if the text is a list, i.e. starts with one of the #;*: symbols
+     #that indicate a list
+     def text_is_list?(text)
+         return text =~ /^[#;*:].*/
+     end
+
+     #Runs a sublexer to tokenize sub_text
+     def sub_lex(sub_text)
+         sub_lexer = MediaWikiLexer.new
+         sub_tokens = sub_lexer.tokenize(sub_text)
+         sub_tokens.pop
+         sub_tokens
+     end
+
+     #Extracts the text from the current cursor position till the next whitespace
+     def extract_till_whitespace
+         i = @cursor
+         text = ""
+         while i < @text.length
+             curr = @text[i, 1]
+             if (curr == "\n") or (curr == "\t") or (curr == " ")
+                 break
+             end
+             text += curr
+             i += 1
+         end
+         text
+     end
+
+     #Extracts list contents of the list type set by the list_id variable.
+     #Example list:
+     #   *a
+     #   **a
+     #The extracted list with id "*" will look like:
+     #   a
+     #   *a
+     def extract_list_contents(list_id)
+         i = @cursor+1
+         list = ""
+         while i < @text.length
+             curr = @text[i, 1]
+             if (curr == "\n") and (@text[i+1, 1] != list_id)
+                 list += curr
+                 break
+             end
+             list += curr unless (curr == list_id) and (@text[i-1, 1] == "\n")
+             i += 1
+         end
+         list
+     end
+
+ end
+
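
A short trace of the token format tokenize produces, derived by following the match methods above (not captured output):

  lexer = MediaWikiLexer.new
  p lexer.tokenize("''Hello''")
  #=> [[:ITALICSTART, "''"], [:TEXT, "Hello"], [:ITALICEND, "''"],
  #    [false, false]]

The leading "''" flows through match_italic_or_bold and matchItalic, which pushes :ITALICSTART onto @pair_stack; the plain characters accumulate into a :TEXT token via match_other; the closing "''" pops the pair and becomes :ITALICEND; and the final [false, false] is the EOF marker that RACC expects.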