mediacloth 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/mediacloth/mediawikiast.rb +42 -0
- data/lib/mediacloth/mediawikihtmlgenerator.rb +100 -29
- data/lib/mediacloth/mediawikilexer.rb +292 -37
- data/lib/mediacloth/mediawikilexer.rb~ +491 -0
- data/lib/mediacloth/mediawikiparser.rb +535 -173
- data/lib/mediacloth/mediawikiparser.y +183 -15
- data/lib/mediacloth/mediawikiparser.y~ +210 -0
- data/lib/mediacloth/mediawikiwalker.rb +56 -8
- data/test/data/html1 +1 -1
- data/test/data/html10 +98 -0
- data/test/data/html3 +1 -1
- data/test/data/html4 +11 -1
- data/test/data/html5 +5 -1
- data/test/data/html7 +1 -2
- data/test/data/html8 +1 -1
- data/test/data/html9 +6 -0
- data/test/data/input1 +5 -0
- data/test/data/input10 +124 -0
- data/test/data/input4 +50 -1
- data/test/data/input5 +8 -0
- data/test/data/input7 +35 -2
- data/test/data/input9 +14 -0
- data/test/data/lex1 +5 -1
- data/test/data/lex10 +87 -0
- data/test/data/lex4 +47 -1
- data/test/data/lex5 +7 -1
- data/test/data/lex7 +35 -2
- data/test/data/lex9 +14 -0
- data/test/dataproducers/html.rb +2 -2
- data/test/dataproducers/html.rb~ +24 -0
- data/test/dataproducers/lex.rb +3 -3
- data/test/dataproducers/lex.rb~ +15 -0
- data/test/debugwalker.rb +1 -1
- data/test/htmlgenerator.rb +5 -4
- data/test/lexer.rb +40 -3
- data/test/parser.rb +0 -1
- metadata +14 -3
@@ -8,12 +8,15 @@
|
|
8
8
|
# parser.parse(input)
|
9
9
|
class MediaWikiParser
|
10
10
|
|
11
|
-
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND
|
12
|
-
INTLINKSTART INTLINKEND
|
11
|
+
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND LINKSEP
|
12
|
+
INTLINKSTART INTLINKEND INTLINKSEP RESOURCE_SEP
|
13
|
+
SECTION_START SECTION_END TEXT PRE
|
13
14
|
HLINE SIGNATURE_NAME SIGNATURE_DATE SIGNATURE_FULL
|
14
15
|
UL_START UL_END LI_START LI_END OL_START OL_END
|
16
|
+
TABLE_START TABLE_END ROW_START ROW_END HEAD_START HEAD_END CELL_START CELL_END
|
15
17
|
PARA_START PARA_END
|
16
18
|
|
19
|
+
|
17
20
|
rule
|
18
21
|
|
19
22
|
wiki:
|
@@ -47,10 +50,7 @@ contents:
|
|
47
50
|
}
|
48
51
|
| section
|
49
52
|
{
|
50
|
-
|
51
|
-
s.contents = val[0][0]
|
52
|
-
s.level = val[0][1]
|
53
|
-
result = s
|
53
|
+
result = val[0]
|
54
54
|
}
|
55
55
|
| PARA_START para_contents PARA_END
|
56
56
|
{
|
@@ -60,6 +60,29 @@ contents:
|
|
60
60
|
result = p
|
61
61
|
end
|
62
62
|
}
|
63
|
+
| LINKSTART link_contents LINKEND
|
64
|
+
{
|
65
|
+
l = LinkAST.new
|
66
|
+
l.url = val[1][0]
|
67
|
+
l.children += val[1][1..-1] if val[1].length > 1
|
68
|
+
result = l
|
69
|
+
}
|
70
|
+
| INTLINKSTART TEXT RESOURCE_SEP TEXT reslink_repeated_contents INTLINKEND
|
71
|
+
{
|
72
|
+
l = ResourceLinkAST.new
|
73
|
+
l.prefix = val[1]
|
74
|
+
l.locator = val[3]
|
75
|
+
l.children = val[4] unless val[4].nil? or val[4].empty?
|
76
|
+
result = l
|
77
|
+
}
|
78
|
+
| INTLINKSTART TEXT intlink_repeated_contents INTLINKEND
|
79
|
+
{
|
80
|
+
l = InternalLinkAST.new
|
81
|
+
l.locator = val[1]
|
82
|
+
l.children = val[2] unless val[2].nil? or val[2].empty?
|
83
|
+
result = l
|
84
|
+
}
|
85
|
+
| table
|
63
86
|
;
|
64
87
|
|
65
88
|
#TODO: remove empty paragraphs in lexer
|
@@ -71,6 +94,60 @@ para_contents:
|
|
71
94
|
{
|
72
95
|
result = val[0]
|
73
96
|
}
|
97
|
+
;
|
98
|
+
|
99
|
+
link_contents:
|
100
|
+
TEXT
|
101
|
+
{
|
102
|
+
result = val
|
103
|
+
}
|
104
|
+
| TEXT LINKSEP link_repeated_contents
|
105
|
+
{
|
106
|
+
result = [val[0]]
|
107
|
+
result += val[2]
|
108
|
+
}
|
109
|
+
;
|
110
|
+
|
111
|
+
|
112
|
+
link_repeated_contents:
|
113
|
+
repeated_contents
|
114
|
+
{
|
115
|
+
result = val[0]
|
116
|
+
}
|
117
|
+
| repeated_contents LINKSEP link_repeated_contents
|
118
|
+
{
|
119
|
+
result = val[0]
|
120
|
+
result += val[2] if val[2]
|
121
|
+
}
|
122
|
+
;
|
123
|
+
|
124
|
+
|
125
|
+
intlink_repeated_contents:
|
126
|
+
{
|
127
|
+
result = nil
|
128
|
+
}
|
129
|
+
| INTLINKSEP repeated_contents
|
130
|
+
{
|
131
|
+
result = val[1]
|
132
|
+
}
|
133
|
+
;
|
134
|
+
|
135
|
+
reslink_repeated_contents:
|
136
|
+
{
|
137
|
+
result = nil
|
138
|
+
}
|
139
|
+
| INTLINKSEP reslink_repeated_contents
|
140
|
+
{
|
141
|
+
result = val[1]
|
142
|
+
}
|
143
|
+
| INTLINKSEP repeated_contents reslink_repeated_contents
|
144
|
+
{
|
145
|
+
i = InternalLinkItemAST.new
|
146
|
+
i.children = val[1]
|
147
|
+
result = [i]
|
148
|
+
result += val[2] if val[2]
|
149
|
+
}
|
150
|
+
;
|
74
151
|
|
75
152
|
repeated_contents: contents
|
76
153
|
{
|
@@ -98,11 +175,79 @@ text: element
|
|
98
175
|
}
|
99
176
|
;
|
100
177
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
178
|
+
table:
|
179
|
+
TABLE_START table_contents TABLE_END
|
180
|
+
{
|
181
|
+
table = TableAST.new
|
182
|
+
table.children = val[1] unless val[1].nil? or val[1].empty?
|
183
|
+
result = table
|
184
|
+
}
|
185
|
+
| TABLE_START TEXT table_contents TABLE_END
|
186
|
+
{
|
187
|
+
table = TableAST.new
|
188
|
+
table.options = val[1]
|
189
|
+
table.children = val[2] unless val[2].nil? or val[2].empty?
|
190
|
+
result = table
|
191
|
+
}
|
192
|
+
|
193
|
+
table_contents:
|
194
|
+
{
|
195
|
+
result = nil
|
196
|
+
}
|
197
|
+
| ROW_START row_contents ROW_END table_contents
|
198
|
+
{
|
199
|
+
row = TableRowAST.new
|
200
|
+
row.children = val[1] unless val[1].nil? or val[1].empty?
|
201
|
+
result = [row]
|
202
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
203
|
+
}
|
204
|
+
| ROW_START TEXT row_contents ROW_END table_contents
|
205
|
+
{
|
206
|
+
row = TableRowAST.new
|
207
|
+
row.children = val[2] unless val[2].nil? or val[2].empty?
|
208
|
+
row.options = val[1]
|
209
|
+
result = [row]
|
210
|
+
result += val[4] unless val[4].nil? or val[4].empty?
|
211
|
+
}
|
212
|
+
|
213
|
+
row_contents:
|
214
|
+
{
|
215
|
+
result = nil
|
216
|
+
}
|
217
|
+
| HEAD_START HEAD_END row_contents
|
218
|
+
{
|
219
|
+
cell = TableCellAST.new
|
220
|
+
cell.type = :head
|
221
|
+
result = [cell]
|
222
|
+
result += val[2] unless val[2].nil? or val[2].empty?
|
223
|
+
}
|
224
|
+
| HEAD_START repeated_contents HEAD_END row_contents
|
225
|
+
{
|
226
|
+
cell = TableCellAST.new
|
227
|
+
cell.children = val[1] unless val[1].nil? or val[1].empty?
|
228
|
+
cell.type = :head
|
229
|
+
result = [cell]
|
230
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
231
|
+
}
|
232
|
+
| CELL_START CELL_END row_contents
|
233
|
+
{
|
234
|
+
cell = TableCellAST.new
|
235
|
+
cell.type = :body
|
236
|
+
result = [cell]
|
237
|
+
result += val[2] unless val[2].nil? or val[2].empty?
|
238
|
+
}
|
239
|
+
| CELL_START repeated_contents CELL_END row_contents
|
240
|
+
{
|
241
|
+
cell = TableCellAST.new
|
242
|
+
cell.children = val[1] unless val[1].nil? or val[1].empty?
|
243
|
+
cell.type = :body
|
244
|
+
result = [cell]
|
245
|
+
result += val[3] unless val[3].nil? or val[3].empty?
|
246
|
+
}
|
247
|
+
|
248
|
+
|
249
|
+
element:
|
250
|
+
TEXT
|
106
251
|
{ return [:None, val[0]] }
|
107
252
|
| HLINE
|
108
253
|
{ return [:HLine, val[0]] }
|
@@ -114,7 +259,20 @@ element: LINKSTART TEXT LINKEND
|
|
114
259
|
{ return [:SignatureFull, val[0]] }
|
115
260
|
;
|
116
261
|
|
117
|
-
formatted_element:
|
262
|
+
formatted_element:
|
263
|
+
BOLDSTART BOLDEND
|
264
|
+
{
|
265
|
+
result = FormattedAST.new
|
266
|
+
result.formatting = :Bold
|
267
|
+
result
|
268
|
+
}
|
269
|
+
| ITALICSTART ITALICEND
|
270
|
+
{
|
271
|
+
result = FormattedAST.new
|
272
|
+
result.formatting = :Italic
|
273
|
+
result
|
274
|
+
}
|
275
|
+
| BOLDSTART repeated_contents BOLDEND
|
118
276
|
{
|
119
277
|
p = FormattedAST.new
|
120
278
|
p.formatting = :Bold
|
@@ -161,7 +319,12 @@ list_contents:
|
|
161
319
|
{ result = [] }
|
162
320
|
;
|
163
321
|
|
164
|
-
list_item:
|
322
|
+
list_item:
|
323
|
+
LI_START LI_END
|
324
|
+
{
|
325
|
+
result = ListItemAST.new
|
326
|
+
}
|
327
|
+
| LI_START repeated_contents LI_END
|
165
328
|
{
|
166
329
|
li = ListItemAST.new
|
167
330
|
li.children += val[1]
|
@@ -173,8 +336,13 @@ preformatted: PRE
|
|
173
336
|
{ result = val[0] }
|
174
337
|
;
|
175
338
|
|
176
|
-
section: SECTION_START
|
177
|
-
{ result = [val[1], val[0].length]
|
339
|
+
section: SECTION_START repeated_contents SECTION_END
|
340
|
+
{ result = [val[1], val[0].length]
|
341
|
+
s = SectionAST.new
|
342
|
+
s.children = val[1]
|
343
|
+
s.level = val[0].length
|
344
|
+
result = s
|
345
|
+
}
|
178
346
|
;
|
179
347
|
|
180
348
|
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
#The parser for the MediaWiki language.
|
2
|
+
#
|
3
|
+
#Usage together with a lexer:
|
4
|
+
# inputFile = File.new("data/input1", "r")
|
5
|
+
# input = inputFile.read
|
6
|
+
# parser = MediaWikiParser.new
|
7
|
+
# parser.lexer = MediaWikiLexer.new
|
8
|
+
# parser.parse(input)
|
9
|
+
class MediaWikiParser
|
10
|
+
|
11
|
+
token BOLDSTART BOLDEND ITALICSTART ITALICEND LINKSTART LINKEND
|
12
|
+
INTLINKSTART INTLINKEND SECTION_START SECTION_END TEXT PRE
|
13
|
+
HLINE SIGNATURE_NAME SIGNATURE_DATE SIGNATURE_FULL
|
14
|
+
UL_START UL_END LI_START LI_END OL_START OL_END
|
15
|
+
PARA_START PARA_END
|
16
|
+
|
17
|
+
rule
|
18
|
+
|
19
|
+
wiki:
|
20
|
+
repeated_contents
|
21
|
+
{
|
22
|
+
@nodes.push WikiAST.new
|
23
|
+
#@nodes.last.children.insert(0, val[0])
|
24
|
+
#puts val[0]
|
25
|
+
@nodes.last.children += val[0]
|
26
|
+
}
|
27
|
+
;
|
28
|
+
|
29
|
+
contents:
|
30
|
+
text
|
31
|
+
{
|
32
|
+
result = val[0]
|
33
|
+
}
|
34
|
+
| bulleted_list
|
35
|
+
{
|
36
|
+
result = val[0]
|
37
|
+
}
|
38
|
+
| numbered_list
|
39
|
+
{
|
40
|
+
result = val[0]
|
41
|
+
}
|
42
|
+
| preformatted
|
43
|
+
{
|
44
|
+
p = PreformattedAST.new
|
45
|
+
p.contents = val[0]
|
46
|
+
result = p
|
47
|
+
}
|
48
|
+
| section
|
49
|
+
{
|
50
|
+
s = SectionAST.new
|
51
|
+
s.contents = val[0][0]
|
52
|
+
s.level = val[0][1]
|
53
|
+
result = s
|
54
|
+
}
|
55
|
+
| PARA_START para_contents PARA_END
|
56
|
+
{
|
57
|
+
if val[1]
|
58
|
+
p = ParagraphAST.new
|
59
|
+
p.children = val[1]
|
60
|
+
result = p
|
61
|
+
end
|
62
|
+
}
|
63
|
+
| error
|
64
|
+
{
|
65
|
+
puts "ERR"
|
66
|
+
yyerrok
|
67
|
+
}
|
68
|
+
;
|
69
|
+
|
70
|
+
#TODO: remove empty paragraphs in lexer
|
71
|
+
para_contents:
|
72
|
+
{
|
73
|
+
result = nil
|
74
|
+
}
|
75
|
+
| repeated_contents
|
76
|
+
{
|
77
|
+
result = val[0]
|
78
|
+
}
|
79
|
+
|
80
|
+
repeated_contents: contents
|
81
|
+
{
|
82
|
+
result = []
|
83
|
+
result << val[0]
|
84
|
+
}
|
85
|
+
| repeated_contents contents
|
86
|
+
{
|
87
|
+
result = []
|
88
|
+
result += val[0]
|
89
|
+
result << val[1]
|
90
|
+
}
|
91
|
+
;
|
92
|
+
|
93
|
+
text: element
|
94
|
+
{
|
95
|
+
p = TextAST.new
|
96
|
+
p.formatting = val[0][0]
|
97
|
+
p.contents = val[0][1]
|
98
|
+
result = p
|
99
|
+
}
|
100
|
+
| formatted_element
|
101
|
+
{
|
102
|
+
result = val[0]
|
103
|
+
}
|
104
|
+
;
|
105
|
+
|
106
|
+
element: LINKSTART TEXT LINKEND
|
107
|
+
{ return [:Link, val[1]] }
|
108
|
+
| INTLINKSTART TEXT INTLINKEND
|
109
|
+
{ return [:InternalLink, val[1]] }
|
110
|
+
| TEXT
|
111
|
+
{ return [:None, val[0]] }
|
112
|
+
| HLINE
|
113
|
+
{ return [:HLine, val[0]] }
|
114
|
+
| SIGNATURE_DATE
|
115
|
+
{ return [:SignatureDate, val[0]] }
|
116
|
+
| SIGNATURE_NAME
|
117
|
+
{ return [:SignatureName, val[0]] }
|
118
|
+
| SIGNATURE_FULL
|
119
|
+
{ return [:SignatureFull, val[0]] }
|
120
|
+
;
|
121
|
+
|
122
|
+
formatted_element: BOLDSTART repeated_contents BOLDEND
|
123
|
+
{
|
124
|
+
p = FormattedAST.new
|
125
|
+
p.formatting = :Bold
|
126
|
+
p.children += val[1]
|
127
|
+
result = p
|
128
|
+
}
|
129
|
+
| ITALICSTART repeated_contents ITALICEND
|
130
|
+
{
|
131
|
+
p = FormattedAST.new
|
132
|
+
p.formatting = :Italic
|
133
|
+
p.children += val[1]
|
134
|
+
result = p
|
135
|
+
}
|
136
|
+
;
|
137
|
+
|
138
|
+
bulleted_list: UL_START list_item list_contents UL_END
|
139
|
+
{
|
140
|
+
list = ListAST.new
|
141
|
+
list.list_type = :Bulleted
|
142
|
+
list.children << val[1]
|
143
|
+
list.children += val[2]
|
144
|
+
result = list
|
145
|
+
}
|
146
|
+
;
|
147
|
+
|
148
|
+
numbered_list: OL_START list_item list_contents OL_END
|
149
|
+
{
|
150
|
+
list = ListAST.new
|
151
|
+
list.list_type = :Numbered
|
152
|
+
list.children << val[1]
|
153
|
+
list.children += val[2]
|
154
|
+
result = list
|
155
|
+
}
|
156
|
+
;
|
157
|
+
|
158
|
+
list_contents:
|
159
|
+
{ result = [] }
|
160
|
+
list_item list_contents
|
161
|
+
{
|
162
|
+
result << val[1]
|
163
|
+
result += val[2]
|
164
|
+
}
|
165
|
+
|
|
166
|
+
{ result = [] }
|
167
|
+
;
|
168
|
+
|
169
|
+
list_item: LI_START repeated_contents LI_END
|
170
|
+
{
|
171
|
+
li = ListItemAST.new
|
172
|
+
li.children += val[1]
|
173
|
+
result = li
|
174
|
+
}
|
175
|
+
;
|
176
|
+
|
177
|
+
preformatted: PRE
|
178
|
+
{ result = val[0] }
|
179
|
+
;
|
180
|
+
|
181
|
+
section: SECTION_START TEXT SECTION_END
|
182
|
+
{ result = [val[1], val[0].length] }
|
183
|
+
;
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
---- header ----
|
188
|
+
require 'mediacloth/mediawikiast'
|
189
|
+
|
190
|
+
---- inner ----
|
191
|
+
|
192
|
+
attr_accessor :lexer
|
193
|
+
|
194
|
+
def initialize
|
195
|
+
@nodes = []
|
196
|
+
super
|
197
|
+
end
|
198
|
+
|
199
|
+
#Tokenizes input string and parses it.
|
200
|
+
def parse(input)
|
201
|
+
@yydebug=true
|
202
|
+
lexer.tokenize(input)
|
203
|
+
do_parse
|
204
|
+
return @nodes.last
|
205
|
+
end
|
206
|
+
|
207
|
+
#Asks the lexer to return the next token.
|
208
|
+
def next_token
|
209
|
+
return @lexer.lex
|
210
|
+
end
|
@@ -21,13 +21,21 @@ protected
|
|
21
21
|
|
22
22
|
#Reimplement this
|
23
23
|
def parse_wiki_ast(ast)
|
24
|
-
ast.children.
|
25
|
-
parse_formatted(c) if c.class == FormattedAST
|
26
|
-
parse_text(c) if c.class == TextAST
|
27
|
-
parse_list(c) if c.class == ListAST
|
28
|
-
parse_preformatted(c) if c.class == PreformattedAST
|
29
|
-
parse_section(c) if c.class == SectionAST
|
30
|
-
parse_paragraph(c) if c.class == ParagraphAST
|
24
|
+
ast.children.map do |c|
|
25
|
+
r = parse_formatted(c) if c.class == FormattedAST
|
26
|
+
r = parse_text(c) if c.class == TextAST
|
27
|
+
r = parse_list(c) if c.class == ListAST
|
28
|
+
r = parse_preformatted(c) if c.class == PreformattedAST
|
29
|
+
r = parse_section(c) if c.class == SectionAST
|
30
|
+
r = parse_paragraph(c) if c.class == ParagraphAST
|
31
|
+
r = parse_link(c) if c.class == LinkAST
|
32
|
+
r = parse_internal_link(c) if c.class == InternalLinkAST
|
33
|
+
r = parse_resource_link(c) if c.class == ResourceLinkAST
|
34
|
+
r = parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
35
|
+
r = parse_table(c) if c.class == TableAST
|
36
|
+
r = parse_table_row(c) if c.class == TableRowAST
|
37
|
+
r = parse_table_cell(c) if c.class == TableCellAST
|
38
|
+
r
|
31
39
|
end
|
32
40
|
end
|
33
41
|
|
@@ -47,7 +55,7 @@ protected
|
|
47
55
|
|
48
56
|
#Reimplement this
|
49
57
|
def parse_list(ast)
|
50
|
-
ast.children.
|
58
|
+
ast.children.map do |c|
|
51
59
|
parse_list_item(c) if c.class == ListItemAST
|
52
60
|
end
|
53
61
|
end
|
@@ -63,6 +71,46 @@ protected
|
|
63
71
|
|
64
72
|
#Reimplement this
|
65
73
|
def parse_section(ast)
|
74
|
+
parse_wiki_ast(ast)
|
75
|
+
end
|
76
|
+
|
77
|
+
#Reimplement this
|
78
|
+
def parse_link(ast)
|
79
|
+
parse_wiki_ast(ast)
|
80
|
+
end
|
81
|
+
|
82
|
+
#Reimplement this
|
83
|
+
def parse_internal_link(ast)
|
84
|
+
ast.children.map do |c|
|
85
|
+
parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
#Reimplement this
|
90
|
+
def parse_resource_link(ast)
|
91
|
+
ast.children.map do |c|
|
92
|
+
parse_internal_link_item(c) if c.class == InternalLinkItemAST
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
#Reimplement this
|
97
|
+
def parse_internal_link_item(ast)
|
98
|
+
parse_wiki_ast(ast)
|
99
|
+
end
|
100
|
+
|
101
|
+
#Reimplement this
|
102
|
+
def parse_table(ast)
|
103
|
+
parse_wiki_ast(ast)
|
104
|
+
end
|
105
|
+
|
106
|
+
#Reimplement this
|
107
|
+
def parse_table_row(ast)
|
108
|
+
parse_wiki_ast(ast)
|
109
|
+
end
|
110
|
+
|
111
|
+
#Reimplement this
|
112
|
+
def parse_table_cell(ast)
|
113
|
+
parse_wiki_ast(ast)
|
66
114
|
end
|
67
115
|
|
68
116
|
end
|
data/test/data/html1
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
<p>This is a simple text with <b>Bold text</b> and <i>Italic text</i> inside.
|
2
2
|
One paragraph can be written in several lines.</p><p>Another paragraph starts after a blank line.</p><p>
|
3
|
-
Another one.</p><p>This is text with Internal Link and <a href="http://www.example.com"
|
3
|
+
Another one.</p><p>This is text with <a href="javascript:void(0)">Internal Link</a> and <a href="http://www.example.com">external link</a>.</p><p>We can have headlines:</p><h1> Headline1 </h1><h2> Headline2 </h2><h3> Headline3 </h3><h4> Headline4 </h4><h5> Headline5 </h5><h6> Headline6 </h6><h7> Headline7 </h7><p>Headlines may have formatting:</p><h1> See <a href="javascript:void(0)">Internal Link</a> for more info </h1><h2> This is an <b>important</b> heading </h2><hr></hr><p>
|
4
4
|
This is a text after the line.</p><ul><li>foo
|
5
5
|
</li><li>foo2
|
6
6
|
</li></ul>
|
data/test/data/html10
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
<p>Some examples of tables.</p><table></table>
|
2
|
+
<table><tr></tr>
|
3
|
+
</table>
|
4
|
+
<table><tr><td> a
|
5
|
+
</td><td> b
|
6
|
+
</td></tr>
|
7
|
+
</table>
|
8
|
+
<table><tr><td> a
|
9
|
+
</td><td> b
|
10
|
+
</td></tr>
|
11
|
+
<tr><td> 1
|
12
|
+
</td><td> 2
|
13
|
+
</td></tr>
|
14
|
+
</table>
|
15
|
+
<table><tr><td> a </td><td> b
|
16
|
+
</td></tr>
|
17
|
+
<tr><td> 1 </td><td> 2
|
18
|
+
</td></tr>
|
19
|
+
</table>
|
20
|
+
<table><tr><th> a
|
21
|
+
</th><th> b
|
22
|
+
</th></tr>
|
23
|
+
<tr><td> 1
|
24
|
+
</td><td> 2
|
25
|
+
</td></tr>
|
26
|
+
</table>
|
27
|
+
<table><tr><th> a </th><th> b
|
28
|
+
</th></tr>
|
29
|
+
<tr><td> 1 </td><td> 2
|
30
|
+
</td></tr>
|
31
|
+
</table>
|
32
|
+
<table><tr><td> a
|
33
|
+
</td></tr>
|
34
|
+
<tr><td> 1
|
35
|
+
</td><td> 2
|
36
|
+
</td></tr>
|
37
|
+
</table>
|
38
|
+
<table><tr><td> a
|
39
|
+
</td><td> b
|
40
|
+
</td></tr>
|
41
|
+
<tr><td> 1
|
42
|
+
</td></tr>
|
43
|
+
</table>
|
44
|
+
<table><tr><td> a
|
45
|
+
</td><td> b
|
46
|
+
</td></tr>
|
47
|
+
<tr><td></td><td> 2
|
48
|
+
</td></tr>
|
49
|
+
</table>
|
50
|
+
<table><tr><td> <a href="http://example.com">Example</a></td><td> <b>bold</b></td></tr>
|
51
|
+
<tr><td> <a href="javascript:void(0)">Example</a></td><td> <a href="javascript:void(0)">image:example.jpg(1, 2, 3)</a></td></tr>
|
52
|
+
</table>
|
53
|
+
<table><tr><td> <a href="http://example.com">Example
|
54
|
+
</a></td><td> <b>bold
|
55
|
+
</b></td></tr>
|
56
|
+
<tr><td> <a href="javascript:void(0)">Example
|
57
|
+
</a></td><td> <b>bold <i>italic
|
58
|
+
</i></b></td></tr>
|
59
|
+
</table>
|
60
|
+
<table><tr><td> a
|
61
|
+
</td><td> b
|
62
|
+
</td></tr>
|
63
|
+
<tr><td> A list in a cell:
|
64
|
+
<ul><li>one
|
65
|
+
</li><li>two
|
66
|
+
</li></ul></td></tr>
|
67
|
+
</table>
|
68
|
+
<table><tr><td> a
|
69
|
+
</td><td> b
|
70
|
+
</td></tr>
|
71
|
+
<tr><td> A table in a cell:
|
72
|
+
<table><tr><td> 1
|
73
|
+
</td><td> 2
|
74
|
+
</td></tr>
|
75
|
+
<tr><td> one
|
76
|
+
</td><td> two
|
77
|
+
</td></tr>
|
78
|
+
</table>
|
79
|
+
</td></tr>
|
80
|
+
</table>
|
81
|
+
<table border="1"><tr><th> a
|
82
|
+
</th><th> b
|
83
|
+
</th></tr>
|
84
|
+
<tr align="left"><td> 1
|
85
|
+
</td><td> 2
|
86
|
+
</td></tr>
|
87
|
+
</table>
|
88
|
+
<p>
|
89
|
+
Text before
|
90
|
+
</p><table><tr><td> a
|
91
|
+
</td><td> b
|
92
|
+
</td></tr>
|
93
|
+
</table>
|
94
|
+
<p>Text before with <i>italic
|
95
|
+
</i></p><table><tr><td> a
|
96
|
+
</td><td> b
|
97
|
+
</td></tr>
|
98
|
+
</table>
|
data/test/data/html3
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<p>--Sat Jan 01 01:01:01
|
1
|
+
<p>--Sat Jan 01 01:01:01 UTC 2000CreatorCreator Sat Jan 01 01:01:01 UTC 2000</p>
|
data/test/data/html4
CHANGED
@@ -1 +1,11 @@
|
|
1
|
-
<p><b><i>
|
1
|
+
<p>Test mixing of bold and italic formatting:</p><p><i>italic<b>bold</b>italic</i></p><p><b>bold<i>italic</i>bold</b></p><p><i><b>boldalic</b></i></p><p><i>italic<b>bold</b>italic</i><b>bold<i>italic</i>bold</b></p><p>
|
2
|
+
Test bold and italic wrapped around inline links:</p><p><i><a href="http://example.com">http://example.com</a></i></p><p><b><a href="http://example.com">http://example.com</a></b></p><p><i><a href="http://example.com'">http://example.com'</a></i></p><p><b><a href="http://example.com'">http://example.com'</a> is good</b></p><p><i><a href="http://example.com'">http://example.com'</a> is good</i></p><p>
|
3
|
+
Test unclosed bold and italic formatting:</p><p><i>Some italic and now </i>bold
|
4
|
+
</p><h2>Heading</h2><p>
|
5
|
+
Text</p><p><i>Some italic and now <b>bold
|
6
|
+
</b></i></p><h2>Heading</h2><p>
|
7
|
+
Text</p><p><i>Some italic and now <b>bold</b></i></p><h2>Heading</h2><p>
|
8
|
+
Text</p><p><i>Some italic and now <b>bold
|
9
|
+
</b></i></p><ul><li>one
|
10
|
+
</li><li>two
|
11
|
+
</li></ul><p><i>Some italic and now <b>bold</b></i></p><p>Text</p>
|
data/test/data/html5
CHANGED
data/test/data/html7
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
<p><a href="http://
|
2
|
-
</p>
|
1
|
+
<p><a href="http://sun.com">http://sun.com</a></p><p><a href="http://sun.com">http://sun.com</a></p><p><a href="mailto:joe@sun.com">mailto:joe@sun.com</a></p><p><a href="http://sun.com">stars</a></p><p><a href="http://sun.com">stars and moon</a></p><p><a href="http://sun.com">stars and <i>moon</i>and <b>trees</b>and birds</a></p><p><a href="javascript:void(0)">sun</a></p><p><a href="javascript:void(0)">All about Sun</a></p><p><a href="javascript:void(0)">image:sun(All about Sun)</a></p><p><a href="javascript:void(0)">nofollow|All about Sun</a></p><p><a href="javascript:void(0)">image:sun(nofollow, All about Sun)</a></p><p><a href="javascript:void(0)">image:sun(All about <a href="javascript:void(0)">Sun</a>)</a></p><p><a href="javascript:void(0)">image:sun(All about <a href="javascript:void(0)">More about</a>)</a></p><p><a href="javascript:void(0)">image:sun(one, two, three)</a></p><p>[]</p><p>[ ]</p><p>[[]]</p><p>[[ ]]</p>
|
data/test/data/html8
CHANGED
@@ -1 +1 @@
|
|
1
|
-
<
|
1
|
+
<h3> foo </h3>
|