mediacloth 0.0.3 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/README.md +36 -0
  2. data/lib/mediacloth/mediawikiast.rb +58 -1
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
  4. data/lib/mediacloth/mediawikilexer.rb +1030 -656
  5. data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
  6. data/lib/mediacloth/mediawikiparams.rb +1 -10
  7. data/lib/mediacloth/mediawikiparser.rb +939 -409
  8. data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
  9. data/lib/mediacloth/mediawikiparser.y +256 -52
  10. data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
  11. data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
  12. data/lib/mediacloth/mediawikiwalker.rb +72 -1
  13. data/lib/mediacloth.rb +33 -10
  14. data/test/data/ast1 +68 -0
  15. data/test/data/ast10 +196 -0
  16. data/test/data/ast11 +34 -0
  17. data/test/data/ast12 +39 -0
  18. data/test/data/ast13 +25 -0
  19. data/test/data/ast14 +13 -0
  20. data/test/data/ast15 +25 -0
  21. data/test/data/ast16 +17 -0
  22. data/test/data/ast17 +9 -0
  23. data/test/data/ast18 +21 -0
  24. data/test/data/ast19 +32 -0
  25. data/test/data/ast2 +4 -0
  26. data/test/data/ast20 +10 -0
  27. data/test/data/ast21 +27 -0
  28. data/test/data/ast22 +22 -0
  29. data/test/data/ast23 +5 -0
  30. data/test/data/ast3 +6 -0
  31. data/test/data/ast4 +122 -0
  32. data/test/data/ast5 +122 -0
  33. data/test/data/ast6 +22 -0
  34. data/test/data/ast7 +143 -0
  35. data/test/data/ast8 +3 -0
  36. data/test/data/ast9 +11 -0
  37. data/test/data/html1 +33 -5
  38. data/test/data/html10 +31 -27
  39. data/test/data/html11 +19 -0
  40. data/test/data/html12 +32 -0
  41. data/test/data/html13 +29 -0
  42. data/test/data/html14 +4 -0
  43. data/test/data/html15 +29 -0
  44. data/test/data/html16 +28 -0
  45. data/test/data/html17 +10 -0
  46. data/test/data/html18 +8 -0
  47. data/test/data/html19 +27 -0
  48. data/test/data/html2 +1 -1
  49. data/test/data/html20 +7 -0
  50. data/test/data/html21 +5 -0
  51. data/test/data/html22 +24 -0
  52. data/test/data/html23 +7 -0
  53. data/test/data/html3 +1 -1
  54. data/test/data/html4 +60 -11
  55. data/test/data/html5 +45 -6
  56. data/test/data/html6 +5 -5
  57. data/test/data/html7 +59 -1
  58. data/test/data/html8 +1 -1
  59. data/test/data/html9 +10 -2
  60. data/test/data/input1 +4 -0
  61. data/test/data/input11 +19 -0
  62. data/test/data/input12 +32 -0
  63. data/test/data/input13 +10 -0
  64. data/test/data/input14 +8 -0
  65. data/test/data/input15 +10 -0
  66. data/test/data/input16 +28 -0
  67. data/test/data/input17 +10 -0
  68. data/test/data/input18 +16 -0
  69. data/test/data/input19 +29 -0
  70. data/test/data/input20 +8 -0
  71. data/test/data/input21 +18 -0
  72. data/test/data/input22 +20 -0
  73. data/test/data/input23 +8 -0
  74. data/test/data/input4 +13 -1
  75. data/test/data/input5 +45 -4
  76. data/test/data/input7 +25 -1
  77. data/test/data/lex1 +17 -18
  78. data/test/data/lex10 +57 -87
  79. data/test/data/lex11 +18 -0
  80. data/test/data/lex12 +32 -0
  81. data/test/data/lex13 +3 -0
  82. data/test/data/lex14 +1 -0
  83. data/test/data/lex15 +3 -0
  84. data/test/data/lex16 +27 -0
  85. data/test/data/lex17 +9 -0
  86. data/test/data/lex18 +4 -0
  87. data/test/data/lex19 +27 -0
  88. data/test/data/lex2 +2 -2
  89. data/test/data/lex20 +7 -0
  90. data/test/data/lex21 +4 -0
  91. data/test/data/lex22 +3 -0
  92. data/test/data/lex23 +7 -0
  93. data/test/data/lex3 +1 -1
  94. data/test/data/lex4 +35 -29
  95. data/test/data/lex5 +57 -18
  96. data/test/data/lex6 +7 -7
  97. data/test/data/lex7 +42 -18
  98. data/test/data/lex8 +1 -1
  99. data/test/data/lex9 +6 -6
  100. data/test/dataproducers/ast.rb +24 -0
  101. data/test/dataproducers/html.rb +11 -12
  102. data/test/dataproducers/lex.rb +9 -4
  103. data/test/debugwalker.rb +25 -11
  104. data/test/htmlgenerator.rb +170 -13
  105. data/test/lexer.rb +626 -83
  106. data/test/linkhandler.rb +39 -0
  107. data/test/parser.rb +176 -9
  108. data/test/signedwikigenerator.rb +113 -0
  109. metadata +158 -79
  110. data/README +0 -37
  111. data/lib/mediacloth/mediawikilexer.rb~ +0 -491
  112. data/lib/mediacloth/mediawikiparser.y~ +0 -210
  113. data/test/data/result1 +0 -48
  114. data/test/dataproducers/html.rb~ +0 -24
  115. data/test/dataproducers/lex.rb~ +0 -15
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ MediaCloth is a MediaWiki syntax parser and HTML generator written in Ruby. It is small and fast, and aims to recognize the complete MediaWiki language.
2
+
3
+ ## Installation
4
+ To install the library run:
5
+
6
+ ruby setup.rb
7
+
8
+
9
+ ## Usage
10
+ The quickest way to parse your input and produce HTML-formatted text is:
11
+
12
+ require 'mediacloth'
13
+ puts MediaCloth::wiki_to_html("'''Hello'''''World''!")
14
+
15
+ You can also provide a hash with custom options if you want to use another generator or link handler:
16
+
17
+ require 'mediacloth'
18
+ puts MediaCloth::wiki_to_html("'''Hello'''''World''!", :link_handler => MyLinkHandler.new)
19
+
20
+ Both examples should produce:
21
+
22
+ <b>Hello</b><i>World</i>!
23
+
24
+ ## API Docs
25
+ To generate API documentation run:
26
+
27
+ rake rdoc
28
+
29
+ ## Development
30
+ To run the tests, execute:
31
+
32
+ rake test
33
+
34
+ To regenerate the test data (HTML and lex files from wiki input), run:
35
+
36
+ rake test:regenerate
@@ -3,11 +3,15 @@ class AST
3
3
  attr_accessor :contents
4
4
  attr_accessor :parent
5
5
  attr_accessor :children
6
+ attr_accessor :index
7
+ attr_accessor :length
6
8
 
7
- def initialize
9
+ def initialize(index = 0,length = 0)
8
10
  @children = []
9
11
  @parent = nil
10
12
  @contents = ""
13
+ @index = index
14
+ @length = length
11
15
  end
12
16
  end
13
17
 
@@ -20,6 +24,10 @@ end
20
24
  class ParagraphAST < AST
21
25
  end
22
26
 
27
+ #The node to represent a paragraph with text pasted into the wiki
28
+ class PasteAST < AST
29
+ end
30
+
23
31
  #The node to represent a simple or formatted text
24
32
  #with more AST nodes inside.
25
33
  class FormattedAST < AST
@@ -36,6 +44,7 @@ end
36
44
  class LinkAST < AST
37
45
  #The link's URL
38
46
  attr_accessor :url
47
+ attr_accessor :link_type
39
48
  end
40
49
 
41
50
  #The node to represent a Mediawiki internal link
@@ -45,6 +54,13 @@ class InternalLinkAST < AST
45
54
  attr_accessor :locator
46
55
  end
47
56
 
57
+ #The node to represent a Mediawiki category link
58
+ class CategoryLinkAST < AST
59
+ #Holds the category locator, which is composed of a category name only
60
+ #(e.g. the name of the category)
61
+ attr_accessor :locator
62
+ end
63
+
48
64
  #The node to represent a MediaWiki resource reference (embedded images, videos,
49
65
  #etc.)
50
66
  class ResourceLinkAST < AST
@@ -72,6 +88,7 @@ end
72
88
  class TableCellAST < AST
73
89
  #the type of cell, :head or :body
74
90
  attr_accessor :type
91
+ attr_accessor :attributes
75
92
  end
76
93
 
77
94
  #The node to represent a list
@@ -84,6 +101,14 @@ end
84
101
  class ListItemAST < AST
85
102
  end
86
103
 
104
+ # The node to represent a leading term in a dictionary list
105
+ class ListTermAST < AST
106
+ end
107
+
108
+ # The node to represent a definition in a dictionary list
109
+ class ListDefinitionAST < AST
110
+ end
111
+
87
112
  #The node to represent a section
88
113
  class SectionAST < AST
89
114
  #The level of the section (1,2,3...) that would correspond to
@@ -93,4 +118,36 @@ end
93
118
 
94
119
  #The node to represent a preformatted contents
95
120
  class PreformattedAST < AST
121
+ attr_accessor :indented
122
+ end
123
+
124
+ #The node to represent an XHTML element and its contents
125
+ class ElementAST < AST
126
+ attr_accessor :name, :attributes
127
+ end
128
+
129
+ # The node to represent special Mediawiki keywords, such as __TOC__. The text
130
+ # attribute contains the entire string in between '__' and '__'.
131
+ class KeywordAST < AST
132
+ attr_accessor :text
133
+ end
134
+
135
+ # The node to represent templates and pre-defined (or user-defined) variables, such as
136
+ # {{Date}}.
137
+ class TemplateAST < AST
138
+ attr_accessor :template_name
139
+ end
140
+
141
+ # The node to represent a template parameter
142
+ class TemplateParameterAST < AST
143
+ attr_accessor :parameter_name #not used atm
144
+ attr_accessor :parameter_value
145
+ end
146
+
147
+ #The node to represent categories to which this page belongs
148
+ class CategoryAST < AST
149
+ #Holds the name of the category
150
+ attr_accessor :locator
151
+ #Holds the string the page is to be sorted as
152
+ attr_accessor :sort_as
96
153
  end
@@ -11,55 +11,37 @@ require 'mediacloth/mediawikiparams'
11
11
  # walker.parse(ast)
12
12
  # puts walker.html
13
13
  class MediaWikiHTMLGenerator < MediaWikiWalker
14
+
14
15
  attr_reader :html
15
16
 
16
- def initialize
17
- @html = ""
18
- end
19
-
20
17
  def parse(ast)
18
+ @html = ""
19
+ @ast = ast
21
20
  @html = super(ast)
22
21
  end
23
22
 
24
- #The default link handler. A custom link handler may extend this class.
25
- class MediaWikiLinkHandler
26
-
27
- #Method invoked to resolve references to wiki pages when they occur in an
28
- #internal link. In all the following internal links, the page name is
29
- #<tt>My Page</tt>:
30
- #* <tt>[[My Page]]</tt>
31
- #* <tt>[[My Page|Click here to view my page]]</tt>
32
- #* <tt>[[My Page|Click ''here'' to view my page]]</tt>
33
- #The return value should be a URL that references the page resource.
34
- def url_for(resource)
35
- "javascript:void(0)"
36
- end
37
-
38
- #Method invoked to resolve references to resources of unknown types. The
39
- #type is indicated by the resource prefix. Examples of inline links to
40
- #unknown references include:
41
- #* <tt>[[Media:video.mpg]]</tt> (prefix <tt>Media</tt>, resource <tt>video.mpg</tt>)
42
- #* <tt>[[Image:pretty.png|100px|A ''pretty'' picture]]</tt> (prefix <tt>Image</tt>,
43
- # resource <tt>pretty.png</tt>, and options <tt>100px</tt> and <tt>A
44
- # <i>pretty</i> picture</tt>.
45
- #The return value should be a well-formed hyperlink, image, object or
46
- #applet tag.
47
- def link_for(prefix, resource, options=[])
48
- "<a href=\"javascript:void(0)\">#{prefix}:#{resource}(#{options.join(', ')})</a>"
49
- end
50
- end
51
-
52
- #Set this generator's URL handler.
53
- def link_handler=(handler)
54
- @link_handler = handler
23
+ # Utility method that returns the string with '<', '>', '&' and '"' escaped as
24
+ # XHTML character entities
25
+ def MediaWikiHTMLGenerator.escape(str)
26
+ r = str.gsub(%r{[<>&"]}) do
27
+ |match|
28
+ case match
29
+ when '<' then '&lt;'
30
+ when '>' then '&gt;'
31
+ when '&' then '&amp;'
32
+ when '"' then '&quot;'
33
+ end
34
+ end
35
+ r
55
36
  end
56
37
 
57
- #Returns's this generator URL handler. If no handler was set, returns the default
58
- #handler.
59
- def link_handler
60
- @link_handler ||= MediaWikiLinkHandler.new
38
+ # Utility method that converts the string specified into a specially formatted text
39
+ # string which can be used as an XHTML link anchor name.
40
+ def MediaWikiHTMLGenerator.anchor_for(str)
41
+ str.strip.squeeze(' ').gsub(' ', '_').gsub('\'', '_')
61
42
  end
62
43
 
44
+
63
45
  protected
64
46
 
65
47
  def parse_wiki_ast(ast)
@@ -67,20 +49,39 @@ protected
67
49
  end
68
50
 
69
51
  def parse_paragraph(ast)
70
- "<p>" + super(ast) + "</p>"
52
+ if (children = ast.children)
53
+ if children.size == 1 and ((text = children.first.contents) == "\n\n" || text == "\r\n\r\n")
54
+ "<p><br />#{text}</p>"
55
+ else
56
+ "<p>#{super(ast)}</p>"
57
+ end
58
+ else
59
+ "<p><br /></p>"
60
+ end
61
+ end
62
+
63
+ def parse_paste(ast)
64
+ return '' unless ast.children
65
+ "<div class=\"paste\" style=\"white-space: pre-wrap;\">#{super(ast)}</div>"
71
66
  end
72
67
 
73
68
  def parse_text(ast)
74
- tag = formatting_to_tag(ast)
75
- if tag[0].empty?
76
- ast.contents
69
+ if ast.formatting
70
+ case(ast.formatting)
71
+ when :None then MediaWikiHTMLGenerator.escape(ast.contents)
72
+ when :CharacterEntity then "&#{ast.contents};"
73
+ when :HLine then "<hr/>"
74
+ when :SignatureDate then @params.time.to_s
75
+ when :SignatureName then link_handler.link_for("User:#{@params.author}", @params.author)
76
+ when :SignatureFull then "#{link_handler.link_for("User:#{@params.author}", @params.author)} #{@params.time.to_s}"
77
+ end
77
78
  else
78
- "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
79
+ escape(ast.contents)
79
80
  end
80
81
  end
81
82
 
82
83
  def parse_formatted(ast)
83
- tag = formatting_to_tag(ast)
84
+ tag = ast.formatting == :Bold ? 'b' : 'i'
84
85
  "<#{tag}>" + super(ast) + "</#{tag}>"
85
86
  end
86
87
 
@@ -95,28 +96,74 @@ protected
95
96
  "<li>" + super(ast) + "</li>"
96
97
  end
97
98
 
99
+ def parse_list_term(ast)
100
+ "<dt>" + super(ast) + "</dt>"
101
+ end
102
+
103
+ def parse_list_definition(ast)
104
+ "<dd>" + super(ast) + "</dd>"
105
+ end
106
+
98
107
  def parse_preformatted(ast)
108
+ if ast.indented
109
+ original_text = super(ast)
110
+ lines = original_text.split("\n").sort
111
+ shortest_space = lines.last.scan(/^\s+/)[0]
112
+ contents = ""
113
+ if shortest_space
114
+ original_text.each_line do |line|
115
+ contents << line.sub(shortest_space, "")
116
+ end
117
+ else
118
+ contents = original_text
119
+ end
120
+ "<pre class=\"indent\">" + contents + "</pre>"
121
+ else
122
+ "<pre>" + super(ast) + "</pre>"
123
+ end
99
124
  end
100
125
 
101
126
  def parse_section(ast)
102
- "<h#{ast.level}>" + super(ast) + "</h#{ast.level}>"
127
+ generator = TextGenerator.new
128
+ anchor = MediaWikiHTMLGenerator.anchor_for(generator.parse(ast).join(' '))
129
+ "<h#{ast.level}><a name='#{anchor}'></a>" + super(ast) + "</h#{ast.level}>\n"
103
130
  end
104
131
 
105
132
  def parse_internal_link(ast)
106
133
  text = parse_wiki_ast(ast)
107
- text = ast.locator if text.length == 0
108
- href = link_handler.url_for(ast.locator)
109
- "<a href=\"#{href}\">#{text}</a>"
134
+ text = MediaWikiHTMLGenerator.escape(ast.locator) if text.length == 0
135
+ link_handler.link_for(ast.locator, text)
110
136
  end
111
-
137
+
112
138
  def parse_resource_link(ast)
113
139
  options = ast.children.map do |node|
114
140
  parse_internal_link_item(node)
115
141
  end
116
- link_handler.link_for(ast.prefix, ast.locator, options)
142
+ link_handler.link_for_resource(ast.prefix, ast.locator, options)
143
+ end
144
+
145
+ def parse_template(ast)
146
+ parameters = ast.children.map do |node|
147
+ if node.parameter_value
148
+ node.parameter_value
149
+ else
150
+ parse_template(node.children.first)
151
+ end
152
+ end
153
+ template_handler.included_template(ast.template_name, parameters)
154
+ end
155
+
156
+ def parse_category_link(ast)
157
+ text = parse_wiki_ast(ast)
158
+ text = MediaWikiHTMLGenerator.escape(ast.locator) if text.length == 0
159
+ link_handler.link_for_category(ast.locator, text)
160
+ end
161
+
162
+ def parse_category(ast)
163
+ text = parse_wiki_ast(ast)
164
+ link_handler.category_add(ast.locator, ast.sort_as)
117
165
  end
118
166
 
119
- #Reimplement this
120
167
  def parse_internal_link_item(ast)
121
168
  text = super(ast)
122
169
  text.strip
@@ -125,13 +172,15 @@ protected
125
172
  def parse_link(ast)
126
173
  text = super(ast)
127
174
  href = ast.url
128
- text = href if text.length == 0
129
- "<a href=\"#{href}\">#{text}</a>"
175
+ text = MediaWikiHTMLGenerator.escape(href) if text.length == 0
176
+ link_handler.absolute_link_for(href, text, ast.link_type)
130
177
  end
131
178
 
132
179
  #Reimplement this
133
180
  def parse_table(ast)
134
181
  options = ast.options ? ' ' + ast.options.strip : ''
182
+ options << ' cellpadding="5"' unless options.include?('cellpadding')
183
+ options << ' border="1"' unless options.include?('border')
135
184
  "<table#{options}>" + super(ast) + "</table>\n"
136
185
  end
137
186
 
@@ -146,28 +195,33 @@ protected
146
195
  if ast.type == :head
147
196
  "<th>" + super(ast) + "</th>"
148
197
  else
149
- "<td>" + super(ast) + "</td>"
198
+ if ast.attributes
199
+ "<td #{ast.attributes.first.contents}>" + super(ast) + "</td>"
200
+ else
201
+ "<td>" + super(ast) + "</td>"
202
+ end
150
203
  end
151
204
  end
152
205
 
153
- #returns an array with a tag name and tag attributes
154
- def formatting_to_tag(ast)
155
- tag = ["", ""]
156
- if ast.formatting == :Bold
157
- tag = ["b", ""]
158
- elsif ast.formatting == :Italic
159
- tag = ["i", ""]
160
- elsif ast.formatting == :HLine
161
- ast.contents = ""
162
- tag = ["hr", ""]
163
- elsif ast.formatting == :SignatureDate
164
- ast.contents = MediaWikiParams.instance.time.to_s
165
- elsif ast.formatting == :SignatureName
166
- ast.contents = MediaWikiParams.instance.author
167
- elsif ast.formatting == :SignatureFull
168
- ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
169
- end
170
- tag
206
+ def parse_element(ast)
207
+ attr = ''
208
+ if ast.attributes
209
+ attr = ' ' + ast.attributes.collect{ |name, value|
210
+ name + '="' + MediaWikiHTMLGenerator.escape(value) + '"' }.join(' ')
211
+ end
212
+ if ast.children.size == 0
213
+ "<#{ast.name}#{attr} />"
214
+ else
215
+ "<#{ast.name}#{attr}>" + super(ast) + "</#{ast.name}>"
216
+ end
217
+ end
218
+
219
+ def parse_keyword(ast)
220
+ if ast.text == 'TOC'
221
+ generator = TocGenerator.new
222
+ generator.parse(@ast)
223
+ generator.html
224
+ end
171
225
  end
172
226
 
173
227
  #returns a tag name of the list in ast node
@@ -176,7 +230,109 @@ protected
176
230
  return "ul"
177
231
  elsif ast.list_type == :Numbered
178
232
  return "ol"
233
+ elsif ast.list_type == :Dictionary
234
+ return "dl"
235
+ end
236
+ end
237
+
238
+ # AST walker that generates a table of contents, containing links to all
239
+ # section headings in the page.
240
+ class TocGenerator < MediaWikiHTMLGenerator
241
+
242
+ class TocNode
243
+ attr_accessor :children
244
+ attr_accessor :parent
245
+ attr_accessor :section
246
+ def initialize
247
+ @children = []
248
+ end
249
+
250
+ def add_child(child)
251
+ @children << child
252
+ child.parent = self
253
+ end
254
+
255
+ def level
256
+ res = 0
257
+ node = self
258
+ while p = node.parent
259
+ res += 1
260
+ node = p
261
+ end
262
+ res
263
+ end
264
+
265
+ def number
266
+ res = ''
267
+ node = self
268
+ while p = node.parent
269
+ res = "#{p.children.index(node)+1}." + res
270
+ node = p
271
+ end
272
+ res
273
+ end
179
274
  end
275
+
276
+ def parse(ast)
277
+ @html = ''
278
+ @text_generator = TextGenerator.new
279
+
280
+ root = TocNode.new
281
+ root_stack = [root]
282
+
283
+ parse_branch = lambda do |ast|
284
+ ast.children.each do |child|
285
+ if child.class == SectionAST
286
+ root_stack.pop while child.level <= ((sec = root_stack.last.section) ? sec.level : 0)
287
+
288
+ node = TocNode.new
289
+ node.section = child
290
+ root_stack.last.add_child(node)
291
+
292
+ root_stack.push node
293
+ end
294
+ parse_branch.call(child)
295
+ end
296
+ end
297
+ parse_branch.call(ast)
298
+
299
+ @html += parse_section(root)
300
+ @html = "<div class=\"wikitoc\">\n<div class=\"wikitoctitle\">Contents</div>#{@html}\n</div>\n" if @html != ''
301
+ end
302
+
303
+ protected
304
+
305
+ def parse_section(toc_node)
306
+ html = ''
307
+ if toc_node.section
308
+ anchor = MediaWikiHTMLGenerator.anchor_for(@text_generator.parse(toc_node.section).join(' '))
309
+ html += "\n<li><a href='##{anchor}'><span class=\"wikitocnumber\">#{toc_node.number}</span><span class=\"wikitoctext\">#{parse_wiki_ast(toc_node.section).strip}</span></a>"
310
+ end
311
+
312
+ unless toc_node.children.empty?
313
+ html += "\n<ul>"
314
+ toc_node.children.each do |child_node|
315
+ html += parse_section(child_node)
316
+ end
317
+ html += "\n</ul>"
318
+ end
319
+
320
+ html += "</li>" if html[0,4] == "<li>"
321
+ html
322
+ end
323
+
324
+ end
325
+
326
+
327
+ # AST walker that outputs just the text portions of a page.
328
+ class TextGenerator < MediaWikiWalker
329
+
330
+ protected
331
+
332
+ def parse_text(ast)
333
+ MediaWikiHTMLGenerator.escape(ast.contents)
334
+ end
335
+
180
336
  end
181
337
 
182
338
  end