mediacloth 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/mediacloth/mediawikiast.rb +42 -0
- data/lib/mediacloth/mediawikihtmlgenerator.rb +100 -29
- data/lib/mediacloth/mediawikilexer.rb +292 -37
- data/lib/mediacloth/mediawikilexer.rb~ +491 -0
- data/lib/mediacloth/mediawikiparser.rb +535 -173
- data/lib/mediacloth/mediawikiparser.y +183 -15
- data/lib/mediacloth/mediawikiparser.y~ +210 -0
- data/lib/mediacloth/mediawikiwalker.rb +56 -8
- data/test/data/html1 +1 -1
- data/test/data/html10 +98 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +11 -1
- data/test/data/html5 +5 -1
- data/test/data/html7 +1 -2
- data/test/data/html8 +1 -1
- data/test/data/html9 +6 -0
- data/test/data/input1 +5 -0
- data/test/data/input10 +124 -0
- data/test/data/input4 +50 -1
- data/test/data/input5 +8 -0
- data/test/data/input7 +35 -2
- data/test/data/input9 +14 -0
- data/test/data/lex1 +5 -1
- data/test/data/lex10 +87 -0
- data/test/data/lex4 +47 -1
- data/test/data/lex5 +7 -1
- data/test/data/lex7 +35 -2
- data/test/data/lex9 +14 -0
- data/test/dataproducers/html.rb +2 -2
- data/test/dataproducers/html.rb~ +24 -0
- data/test/dataproducers/lex.rb +3 -3
- data/test/dataproducers/lex.rb~ +15 -0
- data/test/debugwalker.rb +1 -1
- data/test/htmlgenerator.rb +5 -4
- data/test/lexer.rb +40 -3
- data/test/parser.rb +0 -1
- metadata +14 -3
@@ -8,12 +8,15 @@
|
|
8
8
|
# parser.parse(input)
|
9
9
|
class MediaWikiParser
|
10
10
|
|
11
|
-
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND
|
12
|
-
INTLINKSTART INTLINKEND
|
11
|
+
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND LINKSEP
|
12
|
+
INTLINKSTART INTLINKEND INTLINKSEP RESOURCE_SEP
|
13
|
+
SECTION_START SECTION_END TEXT PRE
|
13
14
|
HLINE SIGNATURE_NAME SIGNATURE_DATE SIGNATURE_FULL
|
14
15
|
UL_START UL_END LI_START LI_END OL_START OL_END
|
16
|
+
TABLE_START TABLE_END ROW_START ROW_END HEAD_START HEAD_END CELL_START CELL_END
|
15
17
|
PARA_START PARA_END
|
16
18
|
|
19
|
+
|
17
20
|
rule
|
18
21
|
|
19
22
|
wiki:
|
@@ -47,10 +50,7 @@ contents:
|
|
47
50
|
}
|
48
51
|
| section
|
49
52
|
{
|
50
|
-
|
51
|
-
s.contents = val[0][0]
|
52
|
-
s.level = val[0][1]
|
53
|
-
result = s
|
53
|
+
result = val[0]
|
54
54
|
}
|
55
55
|
| PARA_START para_contents PARA_END
|
56
56
|
{
|
@@ -60,6 +60,29 @@ contents:
|
|
60
60
|
result = p
|
61
61
|
end
|
62
62
|
}
|
63
|
+
| LINKSTART link_contents LINKEND
|
64
|
+
{
|
65
|
+
l = LinkAST.new
|
66
|
+
l.url = val[1][0]
|
67
|
+
l.children += val[1][1..-1] if val[1].length > 1
|
68
|
+
result = l
|
69
|
+
}
|
70
|
+
| INTLINKSTART TEXT RESOURCE_SEP TEXT reslink_repeated_contents INTLINKEND
|
71
|
+
{
|
72
|
+
l = ResourceLinkAST.new
|
73
|
+
l.prefix = val[1]
|
74
|
+
l.locator = val[3]
|
75
|
+
l.children = val[4] unless val[4].nil? or val[4].empty?
|
76
|
+
result = l
|
77
|
+
}
|
78
|
+
| INTLINKSTART TEXT intlink_repeated_contents INTLINKEND
|
79
|
+
{
|
80
|
+
l = InternalLinkAST.new
|
81
|
+
l.locator = val[1]
|
82
|
+
l.children = val[2] unless val[2].nil? or val[2].empty?
|
83
|
+
result = l
|
84
|
+
}
|
85
|
+
| table
|
63
86
|
;
|
64
87
|
|
65
88
|
#TODO: remove empty paragraphs in lexer
|
@@ -71,6 +94,60 @@ para_contents:
|
|
71
94
|
{
|
72
95
|
result = val[0]
|
73
96
|
}
|
97
|
+
;
|
98
|
+
|
99
|
+
link_contents:
|
100
|
+
TEXT
|
101
|
+
{
|
102
|
+
result = val
|
103
|
+
}
|
104
|
+
| TEXT LINKSEP link_repeated_contents
|
105
|
+
{
|
106
|
+
result = [val[0]]
|
107
|
+
result += val[2]
|
108
|
+
}
|
109
|
+
;
|
110
|
+
|
111
|
+
|
112
|
+
link_repeated_contents:
|
113
|
+
repeated_contents
|
114
|
+
{
|
115
|
+
result = val[0]
|
116
|
+
}
|
117
|
+
| repeated_contents LINKSEP link_repeated_contents
|
118
|
+
{
|
119
|
+
result = val[0]
|
120
|
+
result += val[2] if val[2]
|
121
|
+
}
|
122
|
+
;
|
123
|
+
|
124
|
+
|
125
|
+
intlink_repeated_contents:
|
126
|
+
{
|
127
|
+
result = nil
|
128
|
+
}
|
129
|
+
| INTLINKSEP repeated_contents
|
130
|
+
{
|
131
|
+
result = val[1]
|
132
|
+
}
|
133
|
+
;
|
134
|
+
|
135
|
+
reslink_repeated_contents:
|
136
|
+
{
|
137
|
+
result = nil
|
138
|
+
}
|
139
|
+
| INTLINKSEP reslink_repeated_contents
|
140
|
+
{
|
141
|
+
result = val[1]
|
142
|
+
}
|
143
|
+
| INTLINKSEP repeated_contents reslink_repeated_contents
|
144
|
+
{
|
145
|
+
i = InternalLinkItemAST.new
|
146
|
+
i.children = val[1]
|
147
|
+
result = [i]
|
148
|
+
result += val[2] if val[2]
|
149
|
+
}
|
150
|
+
;
|
74
151
|
|
75
152
|
repeated_contents: contents
|
76
153
|
{
|
@@ -98,11 +175,79 @@ text: element
|
|
98
175
|
}
|
99
176
|
;
|
100
177
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
178
|
+
table:
|
179
|
+
TABLE_START table_contents TABLE_END
|
180
|
+
{
|
181
|
+
table = TableAST.new
|
182
|
+
table.children = val[1] unless val[1].nil? or val[1].empty?
|
183
|
+
result = table
|
184
|
+
}
|
185
|
+
| TABLE_START TEXT table_contents TABLE_END
|
186
|
+
{
|
187
|
+
table = TableAST.new
|
188
|
+
table.options = val[1]
|
189
|
+
table.children = val[2] unless val[2].nil? or val[2].empty?
|
190
|
+
result = table
|
191
|
+
}
|
192
|
+
|
193
|
+
table_contents:
|
194
|
+
{
|
195
|
+
result = nil
|
196
|
+
}
|
197
|
+
| ROW_START row_contents ROW_END table_contents
|
198
|
+
{
|
199
|
+
row = TableRowAST.new
|
200
|
+
row.children = val[1] unless val[1].nil? or val[1].empty?
|
201
|
+
result = [row]
|
202
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
203
|
+
}
|
204
|
+
| ROW_START TEXT row_contents ROW_END table_contents
|
205
|
+
{
|
206
|
+
row = TableRowAST.new
|
207
|
+
row.children = val[2] unless val[2].nil? or val[2].empty?
|
208
|
+
row.options = val[1]
|
209
|
+
result = [row]
|
210
|
+
result += val[4] unless val[4].nil? or val[4].empty?
|
211
|
+
}
|
212
|
+
|
213
|
+
row_contents:
|
214
|
+
{
|
215
|
+
result = nil
|
216
|
+
}
|
217
|
+
| HEAD_START HEAD_END row_contents
|
218
|
+
{
|
219
|
+
cell = TableCellAST.new
|
220
|
+
cell.type = :head
|
221
|
+
result = [cell]
|
222
|
+
result += val[2] unless val[2].nil? or val[2].empty?
|
223
|
+
}
|
224
|
+
| HEAD_START repeated_contents HEAD_END row_contents
|
225
|
+
{
|
226
|
+
cell = TableCellAST.new
|
227
|
+
cell.children = val[1] unless val[1].nil? or val[1].empty?
|
228
|
+
cell.type = :head
|
229
|
+
result = [cell]
|
230
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
231
|
+
}
|
232
|
+
| CELL_START CELL_END row_contents
|
233
|
+
{
|
234
|
+
cell = TableCellAST.new
|
235
|
+
cell.type = :body
|
236
|
+
result = [cell]
|
237
|
+
result += val[2] unless val[2].nil? or val[2].empty?
|
238
|
+
}
|
239
|
+
| CELL_START repeated_contents CELL_END row_contents
|
240
|
+
{
|
241
|
+
cell = TableCellAST.new
|
242
|
+
cell.children = val[1] unless val[1].nil? or val[1].empty?
|
243
|
+
cell.type = :body
|
244
|
+
result = [cell]
|
245
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
246
|
+
}
|
247
|
+
|
248
|
+
|
249
|
+
element:
|
250
|
+
TEXT
|
106
251
|
{ return [:None, val[0]] }
|
107
252
|
| HLINE
|
108
253
|
{ return [:HLine, val[0]] }
|
@@ -114,7 +259,20 @@ element: LINKSTART TEXT LINKEND
|
|
114
259
|
{ return [:SignatureFull, val[0]] }
|
115
260
|
;
|
116
261
|
|
117
|
-
formatted_element:
|
262
|
+
formatted_element:
|
263
|
+
BOLDSTART BOLDEND
|
264
|
+
{
|
265
|
+
result = FormattedAST.new
|
266
|
+
result.formatting = :Bold
|
267
|
+
result
|
268
|
+
}
|
269
|
+
| ITALICSTART ITALICEND
|
270
|
+
{
|
271
|
+
result = FormattedAST.new
|
272
|
+
result.formatting = :Italic
|
273
|
+
result
|
274
|
+
}
|
275
|
+
| BOLDSTART repeated_contents BOLDEND
|
118
276
|
{
|
119
277
|
p = FormattedAST.new
|
120
278
|
p.formatting = :Bold
|
@@ -161,7 +319,12 @@ list_contents:
|
|
161
319
|
{ result = [] }
|
162
320
|
;
|
163
321
|
|
164
|
-
list_item:
|
322
|
+
list_item:
|
323
|
+
LI_START LI_END
|
324
|
+
{
|
325
|
+
result = ListItemAST.new
|
326
|
+
}
|
327
|
+
| LI_START repeated_contents LI_END
|
165
328
|
{
|
166
329
|
li = ListItemAST.new
|
167
330
|
li.children += val[1]
|
@@ -173,8 +336,13 @@ preformatted: PRE
|
|
173
336
|
{ result = val[0] }
|
174
337
|
;
|
175
338
|
|
176
|
-
section: SECTION_START
|
177
|
-
{ result = [val[1], val[0].length]
|
339
|
+
section: SECTION_START repeated_contents SECTION_END
|
340
|
+
{ result = [val[1], val[0].length]
|
341
|
+
s = SectionAST.new
|
342
|
+
s.children = val[1]
|
343
|
+
s.level = val[0].length
|
344
|
+
result = s
|
345
|
+
}
|
178
346
|
;
|
179
347
|
|
180
348
|
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
#The parser for the MediaWiki language.
|
2
|
+
#
|
3
|
+
#Usage together with a lexer:
|
4
|
+
# inputFile = File.new("data/input1", "r")
|
5
|
+
# input = inputFile.read
|
6
|
+
# parser = MediaWikiParser.new
|
7
|
+
# parser.lexer = MediaWikiLexer.new
|
8
|
+
# parser.parse(input)
|
9
|
+
class MediaWikiParser
|
10
|
+
|
11
|
+
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND
|
12
|
+
INTLINKSTART INTLINKEND SECTION_START SECTION_END TEXT PRE
|
13
|
+
HLINE SIGNATURE_NAME SIGNATURE_DATE SIGNATURE_FULL
|
14
|
+
UL_START UL_END LI_START LI_END OL_START OL_END
|
15
|
+
PARA_START PARA_END
|
16
|
+
|
17
|
+
rule
|
18
|
+
|
19
|
+
wiki:
|
20
|
+
repeated_contents
|
21
|
+
{
|
22
|
+
@nodes.push WikiAST.new
|
23
|
+
#@nodes.last.children.insert(0, val[0])
|
24
|
+
#puts val[0]
|
25
|
+
@nodes.last.children += val[0]
|
26
|
+
}
|
27
|
+
;
|
28
|
+
|
29
|
+
contents:
|
30
|
+
text
|
31
|
+
{
|
32
|
+
result = val[0]
|
33
|
+
}
|
34
|
+
| bulleted_list
|
35
|
+
{
|
36
|
+
result = val[0]
|
37
|
+
}
|
38
|
+
| numbered_list
|
39
|
+
{
|
40
|
+
result = val[0]
|
41
|
+
}
|
42
|
+
| preformatted
|
43
|
+
{
|
44
|
+
p = PreformattedAST.new
|
45
|
+
p.contents = val[0]
|
46
|
+
result = p
|
47
|
+
}
|
48
|
+
| section
|
49
|
+
{
|
50
|
+
s = SectionAST.new
|
51
|
+
s.contents = val[0][0]
|
52
|
+
s.level = val[0][1]
|
53
|
+
result = s
|
54
|
+
}
|
55
|
+
| PARA_START para_contents PARA_END
|
56
|
+
{
|
57
|
+
if val[1]
|
58
|
+
p = ParagraphAST.new
|
59
|
+
p.children = val[1]
|
60
|
+
result = p
|
61
|
+
end
|
62
|
+
}
|
63
|
+
| error
|
64
|
+
{
|
65
|
+
puts "ERR"
|
66
|
+
yyerrok
|
67
|
+
}
|
68
|
+
;
|
69
|
+
|
70
|
+
#TODO: remove empty paragraphs in lexer
|
71
|
+
para_contents:
|
72
|
+
{
|
73
|
+
result = nil
|
74
|
+
}
|
75
|
+
| repeated_contents
|
76
|
+
{
|
77
|
+
result = val[0]
|
78
|
+
}
|
79
|
+
|
80
|
+
repeated_contents: contents
|
81
|
+
{
|
82
|
+
result = []
|
83
|
+
result << val[0]
|
84
|
+
}
|
85
|
+
| repeated_contents contents
|
86
|
+
{
|
87
|
+
result = []
|
88
|
+
result += val[0]
|
89
|
+
result << val[1]
|
90
|
+
}
|
91
|
+
;
|
92
|
+
|
93
|
+
text: element
|
94
|
+
{
|
95
|
+
p = TextAST.new
|
96
|
+
p.formatting = val[0][0]
|
97
|
+
p.contents = val[0][1]
|
98
|
+
result = p
|
99
|
+
}
|
100
|
+
| formatted_element
|
101
|
+
{
|
102
|
+
result = val[0]
|
103
|
+
}
|
104
|
+
;
|
105
|
+
|
106
|
+
element: LINKSTART TEXT LINKEND
|
107
|
+
{ return [:Link, val[1]] }
|
108
|
+
| INTLINKSTART TEXT INTLINKEND
|
109
|
+
{ return [:InternalLink, val[1]] }
|
110
|
+
| TEXT
|
111
|
+
{ return [:None, val[0]] }
|
112
|
+
| HLINE
|
113
|
+
{ return [:HLine, val[0]] }
|
114
|
+
| SIGNATURE_DATE
|
115
|
+
{ return [:SignatureDate, val[0]] }
|
116
|
+
| SIGNATURE_NAME
|
117
|
+
{ return [:SignatureName, val[0]] }
|
118
|
+
| SIGNATURE_FULL
|
119
|
+
{ return [:SignatureFull, val[0]] }
|
120
|
+
;
|
121
|
+
|
122
|
+
formatted_element: BOLDSTART repeated_contents BOLDEND
|
123
|
+
{
|
124
|
+
p = FormattedAST.new
|
125
|
+
p.formatting = :Bold
|
126
|
+
p.children += val[1]
|
127
|
+
result = p
|
128
|
+
}
|
129
|
+
| ITALICSTART repeated_contents ITALICEND
|
130
|
+
{
|
131
|
+
p = FormattedAST.new
|
132
|
+
p.formatting = :Italic
|
133
|
+
p.children += val[1]
|
134
|
+
result = p
|
135
|
+
}
|
136
|
+
;
|
137
|
+
|
138
|
+
bulleted_list: UL_START list_item list_contents UL_END
|
139
|
+
{
|
140
|
+
list = ListAST.new
|
141
|
+
list.list_type = :Bulleted
|
142
|
+
list.children << val[1]
|
143
|
+
list.children += val[2]
|
144
|
+
result = list
|
145
|
+
}
|
146
|
+
;
|
147
|
+
|
148
|
+
numbered_list: OL_START list_item list_contents OL_END
|
149
|
+
{
|
150
|
+
list = ListAST.new
|
151
|
+
list.list_type = :Numbered
|
152
|
+
list.children << val[1]
|
153
|
+
list.children += val[2]
|
154
|
+
result = list
|
155
|
+
}
|
156
|
+
;
|
157
|
+
|
158
|
+
list_contents:
|
159
|
+
{ result = [] }
|
160
|
+
list_item list_contents
|
161
|
+
{
|
162
|
+
result << val[1]
|
163
|
+
result += val[2]
|
164
|
+
}
|
165
|
+
|
|
166
|
+
{ result = [] }
|
167
|
+
;
|
168
|
+
|
169
|
+
list_item: LI_START repeated_contents LI_END
|
170
|
+
{
|
171
|
+
li = ListItemAST.new
|
172
|
+
li.children += val[1]
|
173
|
+
result = li
|
174
|
+
}
|
175
|
+
;
|
176
|
+
|
177
|
+
preformatted: PRE
|
178
|
+
{ result = val[0] }
|
179
|
+
;
|
180
|
+
|
181
|
+
section: SECTION_START TEXT SECTION_END
|
182
|
+
{ result = [val[1], val[0].length] }
|
183
|
+
;
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
---- header ----
|
188
|
+
require 'mediacloth/mediawikiast'
|
189
|
+
|
190
|
+
---- inner ----
|
191
|
+
|
192
|
+
attr_accessor :lexer
|
193
|
+
|
194
|
+
def initialize
|
195
|
+
@nodes = []
|
196
|
+
super
|
197
|
+
end
|
198
|
+
|
199
|
+
#Tokenizes input string and parses it.
|
200
|
+
def parse(input)
|
201
|
+
@yydebug=true
|
202
|
+
lexer.tokenize(input)
|
203
|
+
do_parse
|
204
|
+
return @nodes.last
|
205
|
+
end
|
206
|
+
|
207
|
+
#Asks the lexer to return the next token.
|
208
|
+
def next_token
|
209
|
+
return @lexer.lex
|
210
|
+
end
|
@@ -21,13 +21,21 @@ protected
|
|
21
21
|
|
22
22
|
#Reimplement this
|
23
23
|
def parse_wiki_ast(ast)
|
24
|
-
ast.children.
|
25
|
-
parse_formatted(c) if c.class == FormattedAST
|
26
|
-
parse_text(c) if c.class == TextAST
|
27
|
-
parse_list(c) if c.class == ListAST
|
28
|
-
parse_preformatted(c) if c.class == PreformattedAST
|
29
|
-
parse_section(c) if c.class == SectionAST
|
30
|
-
parse_paragraph(c) if c.class == ParagraphAST
|
24
|
+
ast.children.map do |c|
|
25
|
+
r = parse_formatted(c) if c.class == FormattedAST
|
26
|
+
r = parse_text(c) if c.class == TextAST
|
27
|
+
r = parse_list(c) if c.class == ListAST
|
28
|
+
r = parse_preformatted(c) if c.class == PreformattedAST
|
29
|
+
r = parse_section(c) if c.class == SectionAST
|
30
|
+
r = parse_paragraph(c) if c.class == ParagraphAST
|
31
|
+
r = parse_link(c) if c.class == LinkAST
|
32
|
+
r = parse_internal_link(c) if c.class == InternalLinkAST
|
33
|
+
r = parse_resource_link(c) if c.class == ResourceLinkAST
|
34
|
+
r = parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
35
|
+
r = parse_table(c) if c.class == TableAST
|
36
|
+
r = parse_table_row(c) if c.class == TableRowAST
|
37
|
+
r = parse_table_cell(c) if c.class == TableCellAST
|
38
|
+
r
|
31
39
|
end
|
32
40
|
end
|
33
41
|
|
@@ -47,7 +55,7 @@ protected
|
|
47
55
|
|
48
56
|
#Reimplement this
|
49
57
|
def parse_list(ast)
|
50
|
-
ast.children.
|
58
|
+
ast.children.map do |c|
|
51
59
|
parse_list_item(c) if c.class == ListItemAST
|
52
60
|
end
|
53
61
|
end
|
@@ -63,6 +71,46 @@ protected
|
|
63
71
|
|
64
72
|
#Reimplement this
|
65
73
|
def parse_section(ast)
|
74
|
+
parse_wiki_ast(ast)
|
75
|
+
end
|
76
|
+
|
77
|
+
#Reimplement this
|
78
|
+
def parse_link(ast)
|
79
|
+
parse_wiki_ast(ast)
|
80
|
+
end
|
81
|
+
|
82
|
+
#Reimplement this
|
83
|
+
def parse_internal_link(ast)
|
84
|
+
ast.children.map do |c|
|
85
|
+
parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
#Reimplement this
|
90
|
+
def parse_resource_link(ast)
|
91
|
+
ast.children.map do |c|
|
92
|
+
parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
#Reimplement this
|
97
|
+
def parse_internal_link_item(ast)
|
98
|
+
parse_wiki_ast(ast)
|
99
|
+
end
|
100
|
+
|
101
|
+
#Reimplement this
|
102
|
+
def parse_table(ast)
|
103
|
+
parse_wiki_ast(ast)
|
104
|
+
end
|
105
|
+
|
106
|
+
#Reimplement this
|
107
|
+
def parse_table_row(ast)
|
108
|
+
parse_wiki_ast(ast)
|
109
|
+
end
|
110
|
+
|
111
|
+
#Reimplement this
|
112
|
+
def parse_table_cell(ast)
|
113
|
+
parse_wiki_ast(ast)
|
66
114
|
end
|
67
115
|
|
68
116
|
end
|
data/test/data/html1
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
<p>This is a simple text with <b>Bold text</b> and <i>Italic text</i> inside.
|
2
2
|
One paragraph can be written in several lines.</p><p>Another paragraph starts after a blank line.</p><p>
|
3
|
-
Another one.</p><p>This is text with Internal Link and <a href="http://www.example.com"
|
3
|
+
Another one.</p><p>This is text with <a href="javascript:void(0)">Internal Link</a> and <a href="http://www.example.com">external link</a>.</p><p>We can have headlines:</p><h1> Headline1 </h1><h2> Headline2 </h2><h3> Headline3 </h3><h4> Headline4 </h4><h5> Headline5 </h5><h6> Headline6 </h6><h7> Headline7 </h7><p>Headlines may have formatting:</p><h1> See <a href="javascript:void(0)">Internal Link</a> for more info </h1><h2> This is an <b>important</b> heading </h2><hr></hr><p>
|
4
4
|
This is a text after the line.</p><ul><li>foo
|
5
5
|
</li><li>foo2
|
6
6
|
</li></ul>
|
data/test/data/html10
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
<p>Some examples of tables.</p><table></table>
|
2
|
+
<table><tr></tr>
|
3
|
+
</table>
|
4
|
+
<table><tr><td> a
|
5
|
+
</td><td> b
|
6
|
+
</td></tr>
|
7
|
+
</table>
|
8
|
+
<table><tr><td> a
|
9
|
+
</td><td> b
|
10
|
+
</td></tr>
|
11
|
+
<tr><td> 1
|
12
|
+
</td><td> 2
|
13
|
+
</td></tr>
|
14
|
+
</table>
|
15
|
+
<table><tr><td> a </td><td> b
|
16
|
+
</td></tr>
|
17
|
+
<tr><td> 1 </td><td> 2
|
18
|
+
</td></tr>
|
19
|
+
</table>
|
20
|
+
<table><tr><th> a
|
21
|
+
</th><th> b
|
22
|
+
</th></tr>
|
23
|
+
<tr><td> 1
|
24
|
+
</td><td> 2
|
25
|
+
</td></tr>
|
26
|
+
</table>
|
27
|
+
<table><tr><th> a </th><th> b
|
28
|
+
</th></tr>
|
29
|
+
<tr><td> 1 </td><td> 2
|
30
|
+
</td></tr>
|
31
|
+
</table>
|
32
|
+
<table><tr><td> a
|
33
|
+
</td></tr>
|
34
|
+
<tr><td> 1
|
35
|
+
</td><td> 2
|
36
|
+
</td></tr>
|
37
|
+
</table>
|
38
|
+
<table><tr><td> a
|
39
|
+
</td><td> b
|
40
|
+
</td></tr>
|
41
|
+
<tr><td> 1
|
42
|
+
</td></tr>
|
43
|
+
</table>
|
44
|
+
<table><tr><td> a
|
45
|
+
</td><td> b
|
46
|
+
</td></tr>
|
47
|
+
<tr><td></td><td> 2
|
48
|
+
</td></tr>
|
49
|
+
</table>
|
50
|
+
<table><tr><td> <a href="http://example.com">Example</a></td><td> <b>bold</b></td></tr>
|
51
|
+
<tr><td> <a href="javascript:void(0)">Example</a></td><td> <a href="javascript:void(0)">image:example.jpg(1, 2, 3)</a></td></tr>
|
52
|
+
</table>
|
53
|
+
<table><tr><td> <a href="http://example.com">Example
|
54
|
+
</a></td><td> <b>bold
|
55
|
+
</b></td></tr>
|
56
|
+
<tr><td> <a href="javascript:void(0)">Example
|
57
|
+
</a></td><td> <b>bold <i>italic
|
58
|
+
</i></b></td></tr>
|
59
|
+
</table>
|
60
|
+
<table><tr><td> a
|
61
|
+
</td><td> b
|
62
|
+
</td></tr>
|
63
|
+
<tr><td> A list in a cell:
|
64
|
+
<ul><li>one
|
65
|
+
</li><li>two
|
66
|
+
</li></ul></td></tr>
|
67
|
+
</table>
|
68
|
+
<table><tr><td> a
|
69
|
+
</td><td> b
|
70
|
+
</td></tr>
|
71
|
+
<tr><td> A table in a cell:
|
72
|
+
<table><tr><td> 1
|
73
|
+
</td><td> 2
|
74
|
+
</td></tr>
|
75
|
+
<tr><td> one
|
76
|
+
</td><td> two
|
77
|
+
</td></tr>
|
78
|
+
</table>
|
79
|
+
</td></tr>
|
80
|
+
</table>
|
81
|
+
<table border="1"><tr><th> a
|
82
|
+
</th><th> b
|
83
|
+
</th></tr>
|
84
|
+
<tr align="left"><td> 1
|
85
|
+
</td><td> 2
|
86
|
+
</td></tr>
|
87
|
+
</table>
|
88
|
+
<p>
|
89
|
+
Text before
|
90
|
+
</p><table><tr><td> a
|
91
|
+
</td><td> b
|
92
|
+
</td></tr>
|
93
|
+
</table>
|
94
|
+
<p>Text before with <i>italic
|
95
|
+
</i></p><table><tr><td> a
|
96
|
+
</td><td> b
|
97
|
+
</td></tr>
|
98
|
+
</table>
|
data/test/data/html3
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<p>--Sat Jan 01 01:01:01
|
1
|
+
<p>--Sat Jan 01 01:01:01 UTC 2000CreatorCreator Sat Jan 01 01:01:01 UTC 2000</p>
|
data/test/data/html4
CHANGED
@@ -1 +1,11 @@
|
|
1
|
-
<p><b><i>
|
1
|
+
<p>Test mixing of bold and italic formatting:</p><p><i>italic<b>bold</b>italic</i></p><p><b>bold<i>italic</i>bold</b></p><p><i><b>boldalic</b></i></p><p><i>italic<b>bold</b>italic</i><b>bold<i>italic</i>bold</b></p><p>
|
2
|
+
Test bold and italic wrapped around inline links:</p><p><i><a href="http://example.com">http://example.com</a></i></p><p><b><a href="http://example.com">http://example.com</a></b></p><p><i><a href="http://example.com'">http://example.com'</a></i></p><p><b><a href="http://example.com'">http://example.com'</a> is good</b></p><p><i><a href="http://example.com'">http://example.com'</a> is good</i></p><p>
|
3
|
+
Test unclosed bold and italic formatting:</p><p><i>Some italic and now </i>bold
|
4
|
+
</p><h2>Heading</h2><p>
|
5
|
+
Text</p><p><i>Some italic and now <b>bold
|
6
|
+
</b></i></p><h2>Heading</h2><p>
|
7
|
+
Text</p><p><i>Some italic and now <b>bold</b></i></p><h2>Heading</h2><p>
|
8
|
+
Text</p><p><i>Some italic and now <b>bold
|
9
|
+
</b></i></p><ul><li>one
|
10
|
+
</li><li>two
|
11
|
+
</li></ul><p><i>Some italic and now <b>bold</b></i></p><p>Text</p>
|
data/test/data/html5
CHANGED
data/test/data/html7
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
<p><a href="http://
|
2
|
-
</p>
|
1
|
+
<p><a href="http://sun.com">http://sun.com</a></p><p><a href="http://sun.com">http://sun.com</a></p><p><a href="mailto:joe@sun.com">mailto:joe@sun.com</a></p><p><a href="http://sun.com">stars</a></p><p><a href="http://sun.com">stars and moon</a></p><p><a href="http://sun.com">stars and <i>moon</i>and <b>trees</b>and birds</a></p><p><a href="javascript:void(0)">sun</a></p><p><a href="javascript:void(0)">All about Sun</a></p><p><a href="javascript:void(0)">image:sun(All about Sun)</a></p><p><a href="javascript:void(0)">nofollow|All about Sun</a></p><p><a href="javascript:void(0)">image:sun(nofollow, All about Sun)</a></p><p><a href="javascript:void(0)">image:sun(All about <a href="javascript:void(0)">Sun</a>)</a></p><p><a href="javascript:void(0)">image:sun(All about <a href="javascript:void(0)">More about</a>)</a></p><p><a href="javascript:void(0)">image:sun(one, two, three)</a></p><p>[]</p><p>[ ]</p><p>[[]]</p><p>[[ ]]</p>
|
data/test/data/html8
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<
|
1
|
+
<h3> foo </h3>
|