@birthday8/doc-mcp 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/index.js +61 -65
- package/install.js +45 -35
- package/package.json +2 -4
- package/python/docx_converter.py +1152 -428
- package/python/html_fixer.py +125 -0
- package/python/html_rules.py +570 -0
- package/python/html_validator.py +174 -0
- package/python/html_validator_strict.py +428 -0
- package/python/sample/example.html +407 -0
- package/python/sample/html_schema.py +283 -0
- package/python/server.py +233 -123
- package/python/test_error_detection.py +84 -0
- package/python/test_strict_validation.py +118 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""HTML格式约束Schema - 基于实际代码支持"""
|
|
3
|
+
|
|
4
|
+
# Declarative description of the HTML subset used for HTML -> Word conversion.
# The dict is pure data (strings, lists, nested dicts and one bool) so it can
# be serialized with ``json.dumps``. All human-readable strings are runtime
# data and are intentionally left in their original language.
HTML_SCHEMA = {
    "version": "2.0",
    "description": "HTML转Word文档的结构化格式约束(基于实际代码支持)",

    # Document-wide defaults declared for the <body> element.
    "globalStyle": {
        "body": {
            "font-family": {
                "type": "string",
                "description": "全局默认字体",
                "default": "微软雅黑",
                "note": "支持任意中英文字体名称,如:微软雅黑、宋体、黑体、Arial、Times New Roman等"
            },
            "font-size": {
                "type": "string",
                "description": "全局默认字号",
                "default": "12pt",
                "pattern": "\\d+pt",
                "note": "单位必须是pt,支持任意正整数"
            },
            "color": {
                "type": "string",
                "description": "文本颜色",
                "default": "#333333",
                "pattern": "#[0-9A-Fa-f]{6}",
                "note": "必须使用6位十六进制格式"
            }
        }
    },

    # Tag names that are permitted in the input HTML.
    "allowedTags": [
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "strong", "em", "u", "s", "sup", "sub", "code",
        "span", "div", "ul", "ol", "li",
        "table", "tr", "td", "th",
        "img", "br", "hr",
        "math", "latex"
    ],

    # Inline CSS properties that are permitted, keyed by property name.
    # "pattern" values are regular-expression source strings; "allowed"
    # values are closed lists of accepted keywords.
    "allowedStyles": {
        "color": {
            "type": "color",
            "pattern": "#[0-9A-Fa-f]{6}",
            "description": "文本颜色,必须使用十六进制格式",
            "note": "例如:#FF0000(红色)、#0000FF(蓝色)、#00FF00(绿色)"
        },
        "background-color": {
            "type": "color",
            "pattern": "#[0-9A-Fa-f]{6}",
            "description": "背景颜色,必须使用十六进制格式",
            "note": "例如:#E3F2FD(浅蓝)、#D4EDDA(浅绿)"
        },
        "font-family": {
            "type": "string",
            "description": "字体名称(支持任意字体)",
            "note": "可以使用系统中已安装的任何字体,如:微软雅黑、宋体、黑体、楷体、Arial、Times New Roman、Consolas等"
        },
        "font-size": {
            "type": "string",
            "pattern": "\\d+pt",
            "description": "字号(单位必须是pt)",
            "note": "支持任意正整数,如:12pt、14pt、16pt、18pt、20pt等"
        },
        "text-align": {
            "type": "string",
            "allowed": ["left", "center", "right", "justify"],
            "description": "文本对齐方式",
            "note": "left(左对齐)、center(居中)、right(右对齐)、justify(两端对齐)"
        },
        "line-height": {
            "type": "string",
            "pattern": "\\d+(\\.\\d+)?",
            "description": "行距(数字或小数)",
            "note": "如:1.5、1.8、2.0等"
        },
        "margin-top": {
            "type": "string",
            "pattern": "\\d+pt",
            "description": "上边距(单位必须是pt)",
            "note": "如:10pt、12pt、15pt等"
        },
        "margin-bottom": {
            "type": "string",
            "pattern": "\\d+pt",
            "description": "下边距(单位必须是pt)",
            "note": "如:10pt、12pt、15pt等"
        }
    },

    # HTML attributes that are permitted, keyed by attribute name.
    # "tags" lists the elements each attribute applies to.
    "allowedAttributes": {
        "src": {
            "tags": ["img"],
            "required": True,
            "description": "资源路径",
            "note": "图片文件路径,可以是相对路径或绝对路径"
        },
        "alt": {
            "tags": ["img"],
            "description": "替代文本",
            "note": "图片的描述文本,提高可访问性"
        },
        "width": {
            "tags": ["img"],
            "type": "number",
            "description": "宽度(像素)",
            "note": "如:384、512、768等"
        },
        "height": {
            "tags": ["img"],
            "type": "number",
            "description": "高度(像素)",
            "note": "如:288、384、512等"
        },
        "align": {
            "tags": ["img"],
            "allowed": ["left", "center", "right"],
            "description": "对齐方式",
            "note": "图片在段落中的对齐方式"
        },
        "data-indent": {
            "tags": ["p"],
            "type": "string",
            "description": "首行缩进(em单位)",
            "note": "如:0.5em、1em、2em等"
        },
        "data-cols": {
            "tags": ["div"],
            "type": "number",
            "description": "栏数",
            "note": "支持任意正整数,常用:1(单栏)、2(双栏)、3(三栏)、4(四栏)等。多栏结束后必须用 data-cols=\"1\" 恢复单栏"
        },
        "colspan": {
            "tags": ["td", "th"],
            "type": "number",
            "description": "跨列数",
            "note": "单元格跨越的列数"
        },
        "rowspan": {
            "tags": ["td", "th"],
            "type": "number",
            "description": "跨行数",
            "note": "单元格跨越的行数"
        }
    },

    # Colour values: the required format, its pattern, and sample values
    # labelled as correct / incorrect.
    "colorFormatRules": {
        "required": "hex",
        "pattern": "#[0-9A-Fa-f]{6}",
        "examples": {
            "correct": ["#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "#333333", "#666666", "#999999"],
            "incorrect": ["red", "blue", "green", "yellow", "rgb(255,0,0)", "rgba(255,0,0,0.5)", "hsl(0,100%,50%)"]
        }
    },

    # Tags listed as self-closing (written as e.g. <br />).
    "selfClosingTags": ["img", "br", "hr"],

    # Parent/child nesting constraints. Each entry names a parent tag and a
    # list of child tags; "forbidden" entries are disallowed combinations and
    # "recommended" entries are endorsed ones.
    "nestingRules": {
        "forbidden": [
            {
                "parent": "p",
                "children": ["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "table", "ul", "ol"],
                "reason": "段落不能包含块级元素(p只能包含行内元素)",
                "example": "<p><div>内容</div></p> ❌",
                "correct": "<div><p>内容</p></div> ✓"
            },
            {
                "parent": "strong",
                "children": ["p", "div", "table", "ul", "ol"],
                "reason": "行内元素不能包含块级元素",
                "example": "<strong><p>段落</p></strong> ❌",
                "correct": "<p><strong>段落</strong></p> ✓"
            },
            {
                "parent": "em",
                "children": ["p", "div", "table", "ul", "ol"],
                "reason": "行内元素不能包含块级元素"
            },
            {
                "parent": "table",
                "children": ["p", "div", "h1", "h2", "h3", "ul", "ol"],
                "reason": "表格只能包含行元素(tr)",
                "example": "<table><p>内容</p></table> ❌",
                "correct": "<table><tr><td>内容</td></tr></table> ✓"
            }
        ],
        "recommended": [
            {
                "parent": "p",
                "children": ["strong", "em", "u", "s", "sup", "sub", "code", "span"],
                "reason": "段落可以包含行内格式化元素",
                "example": "<p><strong>加粗</strong><em>斜体</em></p> ✓"
            },
            {
                "parent": "div",
                "children": ["h1", "h2", "h3", "h4", "h5", "h6", "p", "table", "ul", "ol"],
                "reason": "div可以包含块级元素",
                "example": "<div><h2>标题</h2><p>段落</p></div> ✓"
            }
        ]
    },

    # Catalogue of frequent mistakes: each entry pairs a faulty snippet
    # ("example") with its corrected form ("correct").
    "commonErrors": [
        {
            "error": "颜色格式错误",
            "example": "<span style=\"color: red;\">红色</span>",
            "correct": "<span style=\"color: #FF0000;\">红色</span>",
            "note": "必须使用6位十六进制格式 #RRGGBB"
        },
        {
            "error": "未自闭合标签",
            "example": "<img src=\"image.jpg\">",
            "correct": "<img src=\"image.jpg\" alt=\"描述\" />",
            "note": "空标签必须自闭合,包含斜杠"
        },
        {
            "error": "不支持的样式",
            "example": "<p style=\"float: right;\">浮动</p>",
            "correct": "<p class=\"right\">右对齐</p>",
            "note": "不支持float、text-shadow、display、position等CSS属性"
        },
        {
            "error": "错误的嵌套",
            "example": "<strong><p>段落</p></strong>",
            "correct": "<p><strong>段落</strong></p>",
            "note": "块级元素不能嵌套在行内元素内"
        },
        {
            "error": "忘记恢复单栏",
            "example": "<div class=\"columns\" data-cols=\"2\">双栏</div>",
            "correct": "<div class=\"columns\" data-cols=\"2\">双栏</div><div class=\"columns\" data-cols=\"1\"></div>",
            "note": "多栏布局后必须使用 data-cols=\"1\" 恢复单栏"
        },
        {
            "error": "RGB颜色格式",
            "example": "<span style=\"color: rgb(255,0,0);\">红色</span>",
            "correct": "<span style=\"color: #FF0000;\">红色</span>",
            "note": "不支持rgb()和rgba()格式,必须使用十六进制"
        }
    ],

    # Free-text authoring guidelines.
    "bestPractices": [
        "颜色必须使用十六进制格式 #RRGGBB(如 #FF0000)",
        "空标签必须自闭合(如 <img /> <br /> <hr />)",
        "字体可以使用任意系统已安装的字体(如 微软雅黑、宋体、Arial、Times New Roman)",
        "字号单位必须是pt(如 12pt、14pt、16pt、18pt)",
        "栏数支持任意正整数(1=单栏、2=双栏、3=三栏、4=四栏等)",
        "多栏布局后必须恢复单栏(data-cols=\"1\")",
        "图片必须包含 alt 属性以提高可访问性",
        "优先使用语义化标签(h1-h6)而非仅用 font-size",
        "表格单元格使用 th(表头)和 td(数据)区分",
        "行内元素(strong、em、u、s、sup、sub、code、span)可以嵌套在块级元素(p、div)内",
        "块级元素(p、div、table、ul、ol)不能嵌套在行内元素内",
        "行距可以是数字或小数(如 1.5、1.8、2.0)"
    ],

    # Ordered, numbered checklist of validation steps.
    "validationWorkflow": [
        "1. 检查HTML基本结构(DOCTYPE, html, head, body)",
        "2. 验证所有标签是否在允许列表中",
        "3. 检查所有属性是否被支持",
        "4. 验证所有样式是否符合规范(特别是颜色格式)",
        "5. 检查颜色格式是否为十六进制 #RRGGBB",
        "6. 验证标签嵌套是否正确",
        "7. 确认自闭合标签格式正确",
        "8. 检查多栏布局是否恢复单栏"
    ]
}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def get_schema() -> dict:
    """Return the HTML format-constraint schema.

    Returns:
        The module-level ``HTML_SCHEMA`` dict itself, not a copy, so any
        mutation made by the caller is visible to every other user of it.
    """
    return HTML_SCHEMA
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def get_schema_json() -> str:
    """Return the HTML format-constraint schema serialized as JSON.

    Returns:
        ``HTML_SCHEMA`` rendered as a JSON string with 2-space indentation.
        ``ensure_ascii=False`` keeps the non-ASCII text readable instead of
        emitting ``\\uXXXX`` escapes.
    """
    # Imported locally, as in the original, so the module needs no
    # top-level imports.
    import json

    return json.dumps(HTML_SCHEMA, ensure_ascii=False, indent=2)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# When executed as a script, print the schema as JSON to stdout.
if __name__ == "__main__":
    print(get_schema_json())
|