mediacloth 0.0.3 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. data/README.md +36 -0
  2. data/lib/mediacloth/mediawikiast.rb +58 -1
  3. data/lib/mediacloth/mediawikihtmlgenerator.rb +229 -73
  4. data/lib/mediacloth/mediawikilexer.rb +1030 -656
  5. data/lib/mediacloth/mediawikilinkhandler.rb +89 -0
  6. data/lib/mediacloth/mediawikiparams.rb +1 -10
  7. data/lib/mediacloth/mediawikiparser.rb +939 -409
  8. data/lib/mediacloth/mediawikiparser.tab.rb +1357 -0
  9. data/lib/mediacloth/mediawikiparser.y +256 -52
  10. data/lib/mediacloth/mediawikisignedwikigenerator.rb +42 -0
  11. data/lib/mediacloth/mediawikitemplatehandler.rb +8 -0
  12. data/lib/mediacloth/mediawikiwalker.rb +72 -1
  13. data/lib/mediacloth.rb +33 -10
  14. data/test/data/ast1 +68 -0
  15. data/test/data/ast10 +196 -0
  16. data/test/data/ast11 +34 -0
  17. data/test/data/ast12 +39 -0
  18. data/test/data/ast13 +25 -0
  19. data/test/data/ast14 +13 -0
  20. data/test/data/ast15 +25 -0
  21. data/test/data/ast16 +17 -0
  22. data/test/data/ast17 +9 -0
  23. data/test/data/ast18 +21 -0
  24. data/test/data/ast19 +32 -0
  25. data/test/data/ast2 +4 -0
  26. data/test/data/ast20 +10 -0
  27. data/test/data/ast21 +27 -0
  28. data/test/data/ast22 +22 -0
  29. data/test/data/ast23 +5 -0
  30. data/test/data/ast3 +6 -0
  31. data/test/data/ast4 +122 -0
  32. data/test/data/ast5 +122 -0
  33. data/test/data/ast6 +22 -0
  34. data/test/data/ast7 +143 -0
  35. data/test/data/ast8 +3 -0
  36. data/test/data/ast9 +11 -0
  37. data/test/data/html1 +33 -5
  38. data/test/data/html10 +31 -27
  39. data/test/data/html11 +19 -0
  40. data/test/data/html12 +32 -0
  41. data/test/data/html13 +29 -0
  42. data/test/data/html14 +4 -0
  43. data/test/data/html15 +29 -0
  44. data/test/data/html16 +28 -0
  45. data/test/data/html17 +10 -0
  46. data/test/data/html18 +8 -0
  47. data/test/data/html19 +27 -0
  48. data/test/data/html2 +1 -1
  49. data/test/data/html20 +7 -0
  50. data/test/data/html21 +5 -0
  51. data/test/data/html22 +24 -0
  52. data/test/data/html23 +7 -0
  53. data/test/data/html3 +1 -1
  54. data/test/data/html4 +60 -11
  55. data/test/data/html5 +45 -6
  56. data/test/data/html6 +5 -5
  57. data/test/data/html7 +59 -1
  58. data/test/data/html8 +1 -1
  59. data/test/data/html9 +10 -2
  60. data/test/data/input1 +4 -0
  61. data/test/data/input11 +19 -0
  62. data/test/data/input12 +32 -0
  63. data/test/data/input13 +10 -0
  64. data/test/data/input14 +8 -0
  65. data/test/data/input15 +10 -0
  66. data/test/data/input16 +28 -0
  67. data/test/data/input17 +10 -0
  68. data/test/data/input18 +16 -0
  69. data/test/data/input19 +29 -0
  70. data/test/data/input20 +8 -0
  71. data/test/data/input21 +18 -0
  72. data/test/data/input22 +20 -0
  73. data/test/data/input23 +8 -0
  74. data/test/data/input4 +13 -1
  75. data/test/data/input5 +45 -4
  76. data/test/data/input7 +25 -1
  77. data/test/data/lex1 +17 -18
  78. data/test/data/lex10 +57 -87
  79. data/test/data/lex11 +18 -0
  80. data/test/data/lex12 +32 -0
  81. data/test/data/lex13 +3 -0
  82. data/test/data/lex14 +1 -0
  83. data/test/data/lex15 +3 -0
  84. data/test/data/lex16 +27 -0
  85. data/test/data/lex17 +9 -0
  86. data/test/data/lex18 +4 -0
  87. data/test/data/lex19 +27 -0
  88. data/test/data/lex2 +2 -2
  89. data/test/data/lex20 +7 -0
  90. data/test/data/lex21 +4 -0
  91. data/test/data/lex22 +3 -0
  92. data/test/data/lex23 +7 -0
  93. data/test/data/lex3 +1 -1
  94. data/test/data/lex4 +35 -29
  95. data/test/data/lex5 +57 -18
  96. data/test/data/lex6 +7 -7
  97. data/test/data/lex7 +42 -18
  98. data/test/data/lex8 +1 -1
  99. data/test/data/lex9 +6 -6
  100. data/test/dataproducers/ast.rb +24 -0
  101. data/test/dataproducers/html.rb +11 -12
  102. data/test/dataproducers/lex.rb +9 -4
  103. data/test/debugwalker.rb +25 -11
  104. data/test/htmlgenerator.rb +170 -13
  105. data/test/lexer.rb +626 -83
  106. data/test/linkhandler.rb +39 -0
  107. data/test/parser.rb +176 -9
  108. data/test/signedwikigenerator.rb +113 -0
  109. metadata +158 -79
  110. data/README +0 -37
  111. data/lib/mediacloth/mediawikilexer.rb~ +0 -491
  112. data/lib/mediacloth/mediawikiparser.y~ +0 -210
  113. data/test/data/result1 +0 -48
  114. data/test/dataproducers/html.rb~ +0 -24
  115. data/test/dataproducers/lex.rb~ +0 -15
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ MediaCloth is a MediaWiki syntax parser and HTML generator written in Ruby. It's small, fast and aims to recognize the complete MediaWiki language.
2
+
3
+ ## Installation
4
+ To install the library run:
5
+
6
+ ruby setup.rb
7
+
8
+
9
+ ## Usage
10
+ The quickest way to parse your input and produce html formatted text is:
11
+
12
+ require 'mediacloth'
13
+ puts MediaCloth::wiki_to_html("'''Hello'''''World''!")
14
+
15
+ You can also provide a hash with custom options if you want to use another generator or link handler:
16
+
17
+ require 'mediacloth'
18
+ puts MediaCloth::wiki_to_html("'''Hello'''''World''!", :link_handler => MyLinkHandler.new)
19
+
20
+ Both examples should produce:
21
+
22
+ <b>Hello</b><i>World</i>!
23
+
24
+ ## API Docs
25
+ To generate API documentation run:
26
+
27
+ rake rdoc
28
+
29
+ ## Development
30
+ To run the tests, execute:
31
+
32
+ rake test
33
+
34
+ To regenerate test data (html and lex files from wiki input), run:
35
+
36
+ rake test:regenerate
@@ -3,11 +3,15 @@ class AST
3
3
  attr_accessor :contents
4
4
  attr_accessor :parent
5
5
  attr_accessor :children
6
+ attr_accessor :index
7
+ attr_accessor :length
6
8
 
7
- def initialize
9
+ def initialize(index = 0,length = 0)
8
10
  @children = []
9
11
  @parent = nil
10
12
  @contents = ""
13
+ @index = index
14
+ @length = length
11
15
  end
12
16
  end
13
17
 
@@ -20,6 +24,10 @@ end
20
24
  class ParagraphAST < AST
21
25
  end
22
26
 
27
+ #The node to represent a paragraph of text pasted into the wiki
28
+ class PasteAST < AST
29
+ end
30
+
23
31
  #The node to represent a simple or formatted text
24
32
  #with more AST nodes inside.
25
33
  class FormattedAST < AST
@@ -36,6 +44,7 @@ end
36
44
  class LinkAST < AST
37
45
  #The link's URL
38
46
  attr_accessor :url
47
+ attr_accessor :link_type
39
48
  end
40
49
 
41
50
  #The node to represent a Mediawiki internal link
@@ -45,6 +54,13 @@ class InternalLinkAST < AST
45
54
  attr_accessor :locator
46
55
  end
47
56
 
57
+ #The node to represent a Mediawiki category link
58
+ class CategoryLinkAST < AST
59
+ #Holds the category locator, which is composed of a category name only
60
+ #(i.e. the name of the category)
61
+ attr_accessor :locator
62
+ end
63
+
48
64
  #The node to represent a MediaWiki resource reference (embedded images, videos,
49
65
  #etc.)
50
66
  class ResourceLinkAST < AST
@@ -72,6 +88,7 @@ end
72
88
  class TableCellAST < AST
73
89
  #the type of cell, :head or :body
74
90
  attr_accessor :type
91
+ attr_accessor :attributes
75
92
  end
76
93
 
77
94
  #The node to represent a list
@@ -84,6 +101,14 @@ end
84
101
  class ListItemAST < AST
85
102
  end
86
103
 
104
+ # The node to represent a leading term in a dictionary list
105
+ class ListTermAST < AST
106
+ end
107
+
108
+ # The node to represent a definition in a dictionary list
109
+ class ListDefinitionAST < AST
110
+ end
111
+
87
112
  #The node to represent a section
88
113
  class SectionAST < AST
89
114
  #The level of the section (1,2,3...) that would correspond to
@@ -93,4 +118,36 @@ end
93
118
 
94
119
  #The node to represent a preformatted contents
95
120
  class PreformattedAST < AST
121
+ attr_accessor :indented
122
+ end
123
+
124
+ #The node to represent an XHTML element and its contents
125
+ class ElementAST < AST
126
+ attr_accessor :name, :attributes
127
+ end
128
+
129
+ # The node to represent special Mediawiki keywords, such as __TOC__. The text
130
+ # attribute contains the entire string between the leading '__' and the trailing '__'.
131
+ class KeywordAST < AST
132
+ attr_accessor :text
133
+ end
134
+
135
+ # The node to represent templates and pre-defined (or user-defined) variables, such as
136
+ # {{Date}}.
137
+ class TemplateAST < AST
138
+ attr_accessor :template_name
139
+ end
140
+
141
+ # The node to represent a template parameter
142
+ class TemplateParameterAST < AST
143
+ attr_accessor :parameter_name #not used atm
144
+ attr_accessor :parameter_value
145
+ end
146
+
147
+ #The node to represent categories to which this page belongs
148
+ class CategoryAST < AST
149
+ #Holds the name of the category
150
+ attr_accessor :locator
151
+ #Holds the string the page is to be sorted as
152
+ attr_accessor :sort_as
96
153
  end
@@ -11,55 +11,37 @@ require 'mediacloth/mediawikiparams'
11
11
  # walker.parse(ast)
12
12
  # puts walker.html
13
13
  class MediaWikiHTMLGenerator < MediaWikiWalker
14
+
14
15
  attr_reader :html
15
16
 
16
- def initialize
17
- @html = ""
18
- end
19
-
20
17
  def parse(ast)
18
+ @html = ""
19
+ @ast = ast
21
20
  @html = super(ast)
22
21
  end
23
22
 
24
- #The default link handler. A custom link handler may extend this class.
25
- class MediaWikiLinkHandler
26
-
27
- #Method invoked to resolve references to wiki pages when they occur in an
28
- #internal link. In all the following internal links, the page name is
29
- #<tt>My Page</tt>:
30
- #* <tt>[[My Page]]</tt>
31
- #* <tt>[[My Page|Click here to view my page]]</tt>
32
- #* <tt>[[My Page|Click ''here'' to view my page]]</tt>
33
- #The return value should be a URL that references the page resource.
34
- def url_for(resource)
35
- "javascript:void(0)"
36
- end
37
-
38
- #Method invoked to resolve references to resources of unknown types. The
39
- #type is indicated by the resource prefix. Examples of inline links to
40
- #unknown references include:
41
- #* <tt>[[Media:video.mpg]]</tt> (prefix <tt>Media</tt>, resource <tt>video.mpg</tt>)
42
- #* <tt>[[Image:pretty.png|100px|A ''pretty'' picture]]</tt> (prefix <tt>Image</tt>,
43
- # resource <tt>pretty.png</tt>, and options <tt>100px</tt> and <tt>A
44
- # <i>pretty</i> picture</tt>.
45
- #The return value should be a well-formed hyperlink, image, object or
46
- #applet tag.
47
- def link_for(prefix, resource, options=[])
48
- "<a href=\"javascript:void(0)\">#{prefix}:#{resource}(#{options.join(', ')})</a>"
49
- end
50
- end
51
-
52
- #Set this generator's URL handler.
53
- def link_handler=(handler)
54
- @link_handler = handler
23
+ # Utility method that returns the string with '<', '>', '&' and '"' escaped as
24
+ # XHTML character entities
25
+ def MediaWikiHTMLGenerator.escape(str)
26
+ r = str.gsub(%r{[<>&"]}) do
27
+ |match|
28
+ case match
29
+ when '<' then '&lt;'
30
+ when '>' then '&gt;'
31
+ when '&' then '&amp;'
32
+ when '"' then '&quot;'
33
+ end
34
+ end
35
+ r
55
36
  end
56
37
 
57
- #Returns's this generator URL handler. If no handler was set, returns the default
58
- #handler.
59
- def link_handler
60
- @link_handler ||= MediaWikiLinkHandler.new
38
+ # Utility method that converts the string specified into a specially formatted text
39
+ # string which can be used as an XHTML link anchor name.
40
+ def MediaWikiHTMLGenerator.anchor_for(str)
41
+ str.strip.squeeze(' ').gsub(' ', '_').gsub('\'', '_')
61
42
  end
62
43
 
44
+
63
45
  protected
64
46
 
65
47
  def parse_wiki_ast(ast)
@@ -67,20 +49,39 @@ protected
67
49
  end
68
50
 
69
51
  def parse_paragraph(ast)
70
- "<p>" + super(ast) + "</p>"
52
+ if (children = ast.children)
53
+ if children.size == 1 and ((text = children.first.contents) == "\n\n" || text == "\r\n\r\n")
54
+ "<p><br />#{text}</p>"
55
+ else
56
+ "<p>#{super(ast)}</p>"
57
+ end
58
+ else
59
+ "<p><br /></p>"
60
+ end
61
+ end
62
+
63
+ def parse_paste(ast)
64
+ return '' unless ast.children
65
+ "<div class=\"paste\" style=\"white-space: pre-wrap;\">#{super(ast)}</div>"
71
66
  end
72
67
 
73
68
  def parse_text(ast)
74
- tag = formatting_to_tag(ast)
75
- if tag[0].empty?
76
- ast.contents
69
+ if ast.formatting
70
+ case(ast.formatting)
71
+ when :None then MediaWikiHTMLGenerator.escape(ast.contents)
72
+ when :CharacterEntity then "&#{ast.contents};"
73
+ when :HLine then "<hr/>"
74
+ when :SignatureDate then @params.time.to_s
75
+ when :SignatureName then link_handler.link_for("User:#{@params.author}", @params.author)
76
+ when :SignatureFull then "#{link_handler.link_for("User:#{@params.author}", @params.author)} #{@params.time.to_s}"
77
+ end
77
78
  else
78
- "<#{tag[0]}#{tag[1]}>#{ast.contents}</#{tag[0]}>"
79
+ escape(ast.contents)
79
80
  end
80
81
  end
81
82
 
82
83
  def parse_formatted(ast)
83
- tag = formatting_to_tag(ast)
84
+ tag = ast.formatting == :Bold ? 'b' : 'i'
84
85
  "<#{tag}>" + super(ast) + "</#{tag}>"
85
86
  end
86
87
 
@@ -95,28 +96,74 @@ protected
95
96
  "<li>" + super(ast) + "</li>"
96
97
  end
97
98
 
99
+ def parse_list_term(ast)
100
+ "<dt>" + super(ast) + "</dt>"
101
+ end
102
+
103
+ def parse_list_definition(ast)
104
+ "<dd>" + super(ast) + "</dd>"
105
+ end
106
+
98
107
  def parse_preformatted(ast)
108
+ if ast.indented
109
+ original_text = super(ast)
110
+ lines = original_text.split("\n").sort
111
+ shortest_space = lines.last.scan(/^\s+/)[0]
112
+ contents = ""
113
+ if shortest_space
114
+ original_text.each_line do |line|
115
+ contents << line.sub(shortest_space, "")
116
+ end
117
+ else
118
+ contents = original_text
119
+ end
120
+ "<pre class=\"indent\">" + contents + "</pre>"
121
+ else
122
+ "<pre>" + super(ast) + "</pre>"
123
+ end
99
124
  end
100
125
 
101
126
  def parse_section(ast)
102
- "<h#{ast.level}>" + super(ast) + "</h#{ast.level}>"
127
+ generator = TextGenerator.new
128
+ anchor = MediaWikiHTMLGenerator.anchor_for(generator.parse(ast).join(' '))
129
+ "<h#{ast.level}><a name='#{anchor}'></a>" + super(ast) + "</h#{ast.level}>\n"
103
130
  end
104
131
 
105
132
  def parse_internal_link(ast)
106
133
  text = parse_wiki_ast(ast)
107
- text = ast.locator if text.length == 0
108
- href = link_handler.url_for(ast.locator)
109
- "<a href=\"#{href}\">#{text}</a>"
134
+ text = MediaWikiHTMLGenerator.escape(ast.locator) if text.length == 0
135
+ link_handler.link_for(ast.locator, text)
110
136
  end
111
-
137
+
112
138
  def parse_resource_link(ast)
113
139
  options = ast.children.map do |node|
114
140
  parse_internal_link_item(node)
115
141
  end
116
- link_handler.link_for(ast.prefix, ast.locator, options)
142
+ link_handler.link_for_resource(ast.prefix, ast.locator, options)
143
+ end
144
+
145
+ def parse_template(ast)
146
+ parameters = ast.children.map do |node|
147
+ if node.parameter_value
148
+ node.parameter_value
149
+ else
150
+ parse_template(node.children.first)
151
+ end
152
+ end
153
+ template_handler.included_template(ast.template_name, parameters)
154
+ end
155
+
156
+ def parse_category_link(ast)
157
+ text = parse_wiki_ast(ast)
158
+ text = MediaWikiHTMLGenerator.escape(ast.locator) if text.length == 0
159
+ link_handler.link_for_category(ast.locator, text)
160
+ end
161
+
162
+ def parse_category(ast)
163
+ text = parse_wiki_ast(ast)
164
+ link_handler.category_add(ast.locator, ast.sort_as)
117
165
  end
118
166
 
119
- #Reimplement this
120
167
  def parse_internal_link_item(ast)
121
168
  text = super(ast)
122
169
  text.strip
@@ -125,13 +172,15 @@ protected
125
172
  def parse_link(ast)
126
173
  text = super(ast)
127
174
  href = ast.url
128
- text = href if text.length == 0
129
- "<a href=\"#{href}\">#{text}</a>"
175
+ text = MediaWikiHTMLGenerator.escape(href) if text.length == 0
176
+ link_handler.absolute_link_for(href, text, ast.link_type)
130
177
  end
131
178
 
132
179
  #Reimplement this
133
180
  def parse_table(ast)
134
181
  options = ast.options ? ' ' + ast.options.strip : ''
182
+ options << ' cellpadding="5"' unless options.include?('cellpadding')
183
+ options << ' border="1"' unless options.include?('border')
135
184
  "<table#{options}>" + super(ast) + "</table>\n"
136
185
  end
137
186
 
@@ -146,28 +195,33 @@ protected
146
195
  if ast.type == :head
147
196
  "<th>" + super(ast) + "</th>"
148
197
  else
149
- "<td>" + super(ast) + "</td>"
198
+ if ast.attributes
199
+ "<td #{ast.attributes.first.contents}>" + super(ast) + "</td>"
200
+ else
201
+ "<td>" + super(ast) + "</td>"
202
+ end
150
203
  end
151
204
  end
152
205
 
153
- #returns an array with a tag name and tag attributes
154
- def formatting_to_tag(ast)
155
- tag = ["", ""]
156
- if ast.formatting == :Bold
157
- tag = ["b", ""]
158
- elsif ast.formatting == :Italic
159
- tag = ["i", ""]
160
- elsif ast.formatting == :HLine
161
- ast.contents = ""
162
- tag = ["hr", ""]
163
- elsif ast.formatting == :SignatureDate
164
- ast.contents = MediaWikiParams.instance.time.to_s
165
- elsif ast.formatting == :SignatureName
166
- ast.contents = MediaWikiParams.instance.author
167
- elsif ast.formatting == :SignatureFull
168
- ast.contents = MediaWikiParams.instance.author + " " + MediaWikiParams.instance.time.to_s
169
- end
170
- tag
206
+ def parse_element(ast)
207
+ attr = ''
208
+ if ast.attributes
209
+ attr = ' ' + ast.attributes.collect{ |name, value|
210
+ name + '="' + MediaWikiHTMLGenerator.escape(value) + '"' }.join(' ')
211
+ end
212
+ if ast.children.size == 0
213
+ "<#{ast.name}#{attr} />"
214
+ else
215
+ "<#{ast.name}#{attr}>" + super(ast) + "</#{ast.name}>"
216
+ end
217
+ end
218
+
219
+ def parse_keyword(ast)
220
+ if ast.text == 'TOC'
221
+ generator = TocGenerator.new
222
+ generator.parse(@ast)
223
+ generator.html
224
+ end
171
225
  end
172
226
 
173
227
  #returns a tag name of the list in ast node
@@ -176,7 +230,109 @@ protected
176
230
  return "ul"
177
231
  elsif ast.list_type == :Numbered
178
232
  return "ol"
233
+ elsif ast.list_type == :Dictionary
234
+ return "dl"
235
+ end
236
+ end
237
+
238
+ # AST walker that generates a table of contents, containing links to all
239
+ # section headings in the page.
240
+ class TocGenerator < MediaWikiHTMLGenerator
241
+
242
+ class TocNode
243
+ attr_accessor :children
244
+ attr_accessor :parent
245
+ attr_accessor :section
246
+ def initialize
247
+ @children = []
248
+ end
249
+
250
+ def add_child(child)
251
+ @children << child
252
+ child.parent = self
253
+ end
254
+
255
+ def level
256
+ res = 0
257
+ node = self
258
+ while p = node.parent
259
+ res += 1
260
+ node = p
261
+ end
262
+ res
263
+ end
264
+
265
+ def number
266
+ res = ''
267
+ node = self
268
+ while p = node.parent
269
+ res = "#{p.children.index(node)+1}." + res
270
+ node = p
271
+ end
272
+ res
273
+ end
179
274
  end
275
+
276
+ def parse(ast)
277
+ @html = ''
278
+ @text_generator = TextGenerator.new
279
+
280
+ root = TocNode.new
281
+ root_stack = [root]
282
+
283
+ parse_branch = lambda do |ast|
284
+ ast.children.each do |child|
285
+ if child.class == SectionAST
286
+ root_stack.pop while child.level <= ((sec = root_stack.last.section) ? sec.level : 0)
287
+
288
+ node = TocNode.new
289
+ node.section = child
290
+ root_stack.last.add_child(node)
291
+
292
+ root_stack.push node
293
+ end
294
+ parse_branch.call(child)
295
+ end
296
+ end
297
+ parse_branch.call(ast)
298
+
299
+ @html += parse_section(root)
300
+ @html = "<div class=\"wikitoc\">\n<div class=\"wikitoctitle\">Contents</div>#{@html}\n</div>\n" if @html != ''
301
+ end
302
+
303
+ protected
304
+
305
+ def parse_section(toc_node)
306
+ html = ''
307
+ if toc_node.section
308
+ anchor = MediaWikiHTMLGenerator.anchor_for(@text_generator.parse(toc_node.section).join(' '))
309
+ html += "\n<li><a href='##{anchor}'><span class=\"wikitocnumber\">#{toc_node.number}</span><span class=\"wikitoctext\">#{parse_wiki_ast(toc_node.section).strip}</span></a>"
310
+ end
311
+
312
+ unless toc_node.children.empty?
313
+ html += "\n<ul>"
314
+ toc_node.children.each do |child_node|
315
+ html += parse_section(child_node)
316
+ end
317
+ html += "\n</ul>"
318
+ end
319
+
320
+ html += "</li>" if html[0,4] == "<li>"
321
+ html
322
+ end
323
+
324
+ end
325
+
326
+
327
+ # AST walker that outputs just the text portions of a page.
328
+ class TextGenerator < MediaWikiWalker
329
+
330
+ protected
331
+
332
+ def parse_text(ast)
333
+ MediaWikiHTMLGenerator.escape(ast.contents)
334
+ end
335
+
180
336
  end
181
337
 
182
338
  end