@birthday8/doc-mcp 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/index.js +61 -56
- package/install.js +45 -35
- package/package.json +1 -1
- package/python/docx_converter.py +15 -83
- package/python/html_rules.py +570 -0
- package/python/html_validator.py +59 -274
- package/python/html_validator_strict.py +428 -0
- package/python/sample/html_schema.py +283 -0
- package/python/server.py +112 -75
- package/python/test_strict_validation.py +118 -0
package/python/html_validator.py
CHANGED
|
@@ -6,101 +6,19 @@ HTML格式验证器
|
|
|
6
6
|
import re
|
|
7
7
|
from bs4 import BeautifulSoup
|
|
8
8
|
from typing import Dict, List, Tuple
|
|
9
|
+
from html_validator_strict import StrictHTMLValidator, strict_validator
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class HTMLValidator:
|
|
12
|
-
"""HTML
|
|
13
|
+
"""HTML格式验证器(兼容旧接口)"""
|
|
13
14
|
|
|
14
15
|
def __init__(self):
|
|
15
|
-
#
|
|
16
|
-
self.
|
|
17
|
-
"h1",
|
|
18
|
-
"h2",
|
|
19
|
-
"h3",
|
|
20
|
-
"h4",
|
|
21
|
-
"h5",
|
|
22
|
-
"h6",
|
|
23
|
-
"p",
|
|
24
|
-
"strong",
|
|
25
|
-
"em",
|
|
26
|
-
"u",
|
|
27
|
-
"s",
|
|
28
|
-
"sup",
|
|
29
|
-
"sub",
|
|
30
|
-
"code",
|
|
31
|
-
"span",
|
|
32
|
-
"div",
|
|
33
|
-
"ul",
|
|
34
|
-
"ol",
|
|
35
|
-
"li",
|
|
36
|
-
"table",
|
|
37
|
-
"tr",
|
|
38
|
-
"td",
|
|
39
|
-
"th",
|
|
40
|
-
"img",
|
|
41
|
-
"br",
|
|
42
|
-
"hr",
|
|
43
|
-
"math",
|
|
44
|
-
"latex",
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
# 支持的class
|
|
48
|
-
self.supported_classes = {
|
|
49
|
-
"center",
|
|
50
|
-
"right",
|
|
51
|
-
"dialogue",
|
|
52
|
-
"quote",
|
|
53
|
-
"highlight",
|
|
54
|
-
"red",
|
|
55
|
-
"blue",
|
|
56
|
-
"green",
|
|
57
|
-
"purple",
|
|
58
|
-
"info",
|
|
59
|
-
"warning",
|
|
60
|
-
"success",
|
|
61
|
-
"chapter",
|
|
62
|
-
"ending",
|
|
63
|
-
"page-break",
|
|
64
|
-
"columns",
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
# 支持的style属性
|
|
68
|
-
self.supported_styles = {
|
|
69
|
-
"color",
|
|
70
|
-
"background-color",
|
|
71
|
-
"font-family",
|
|
72
|
-
"font-size",
|
|
73
|
-
"font-weight",
|
|
74
|
-
"font-style",
|
|
75
|
-
"text-decoration",
|
|
76
|
-
"text-align",
|
|
77
|
-
"line-height",
|
|
78
|
-
"margin-top",
|
|
79
|
-
"margin-bottom",
|
|
80
|
-
"margin-left",
|
|
81
|
-
"margin-right",
|
|
82
|
-
"padding-top",
|
|
83
|
-
"padding-bottom",
|
|
84
|
-
"padding-left",
|
|
85
|
-
"padding-right",
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
# 支持的属性
|
|
89
|
-
self.supported_attrs = {
|
|
90
|
-
"src",
|
|
91
|
-
"alt",
|
|
92
|
-
"width",
|
|
93
|
-
"height",
|
|
94
|
-
"align",
|
|
95
|
-
"data-indent",
|
|
96
|
-
"data-cols",
|
|
97
|
-
"colspan",
|
|
98
|
-
"rowspan",
|
|
99
|
-
}
|
|
16
|
+
# 使用严格验证器
|
|
17
|
+
self.strict_validator = strict_validator
|
|
100
18
|
|
|
101
19
|
def validate(self, html_content: str) -> Dict:
|
|
102
20
|
"""
|
|
103
|
-
验证HTML
|
|
21
|
+
验证HTML内容(兼容旧接口)
|
|
104
22
|
|
|
105
23
|
Returns:
|
|
106
24
|
Dict: {
|
|
@@ -110,158 +28,31 @@ class HTMLValidator:
|
|
|
110
28
|
'stats': Dict
|
|
111
29
|
}
|
|
112
30
|
"""
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
# 检查属性
|
|
125
|
-
self._validate_attributes(soup, result)
|
|
126
|
-
|
|
127
|
-
# 检查样式
|
|
128
|
-
self._validate_styles(soup, result)
|
|
129
|
-
|
|
130
|
-
# 检查嵌套
|
|
131
|
-
self._validate_nesting(soup, result)
|
|
132
|
-
|
|
133
|
-
# 统计信息
|
|
134
|
-
self._collect_stats(soup, result)
|
|
135
|
-
|
|
136
|
-
except Exception as e:
|
|
137
|
-
result["valid"] = False
|
|
138
|
-
result["errors"].append(f"解析错误: {str(e)}")
|
|
139
|
-
|
|
140
|
-
return result
|
|
141
|
-
|
|
142
|
-
def _validate_structure(self, soup: BeautifulSoup, result: Dict):
|
|
143
|
-
"""验证HTML基本结构"""
|
|
144
|
-
html_tag = soup.find("html")
|
|
145
|
-
if not html_tag:
|
|
146
|
-
result["errors"].append("缺少 <html> 标签")
|
|
147
|
-
result["valid"] = False
|
|
148
|
-
|
|
149
|
-
head_tag = soup.find("head")
|
|
150
|
-
if not head_tag:
|
|
151
|
-
result["warnings"].append("缺少 <head> 标签")
|
|
152
|
-
|
|
153
|
-
body_tag = soup.find("body")
|
|
154
|
-
if not body_tag:
|
|
155
|
-
result["errors"].append("缺少 <body> 标签")
|
|
156
|
-
result["valid"] = False
|
|
157
|
-
|
|
158
|
-
# 检查编码
|
|
159
|
-
charset = soup.find("meta", {"charset": True})
|
|
160
|
-
if not charset or charset.get("charset") != "UTF-8":
|
|
161
|
-
result["warnings"].append("建议使用 UTF-8 编码")
|
|
162
|
-
|
|
163
|
-
# 检查语言
|
|
164
|
-
if html_tag and html_tag.get("lang") != "zh-CN":
|
|
165
|
-
result["warnings"].append("建议设置 lang='zh-CN'")
|
|
166
|
-
|
|
167
|
-
def _validate_tags(self, soup: BeautifulSoup, result: Dict):
|
|
168
|
-
"""验证标签"""
|
|
169
|
-
all_tags = set(tag.name for tag in soup.find_all(True))
|
|
170
|
-
unsupported = all_tags - self.supported_tags
|
|
171
|
-
|
|
172
|
-
if unsupported:
|
|
173
|
-
result["errors"].append(f"不支持的标签: {', '.join(unsupported)}")
|
|
174
|
-
result["valid"] = False
|
|
175
|
-
|
|
176
|
-
# 检查自闭合标签
|
|
177
|
-
self_closing = ["img", "br", "hr"]
|
|
178
|
-
for tag_name in self_closing:
|
|
179
|
-
for tag in soup.find_all(tag_name):
|
|
180
|
-
if str(tag).startswith(f"<{tag_name}>") and not str(tag).endswith("/>"):
|
|
181
|
-
result["warnings"].append(f"标签 <{tag_name}> 建议自闭合")
|
|
182
|
-
|
|
183
|
-
def _validate_attributes(self, soup: BeautifulSoup, result: Dict):
|
|
184
|
-
"""验证属性"""
|
|
185
|
-
for tag in soup.find_all(True):
|
|
186
|
-
for attr in tag.attrs:
|
|
187
|
-
if attr in ["class", "style"]:
|
|
188
|
-
continue
|
|
189
|
-
|
|
190
|
-
if attr not in self.supported_attrs:
|
|
191
|
-
result["warnings"].append(
|
|
192
|
-
f"标签 <{tag.name}> 包含不支持的属性: {attr}"
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
def _validate_styles(self, soup: BeautifulSoup, result: Dict):
|
|
196
|
-
"""验证样式"""
|
|
197
|
-
for tag in soup.find_all(style=True):
|
|
198
|
-
style_str = tag.get("style", "")
|
|
199
|
-
styles = self._parse_style(style_str)
|
|
200
|
-
|
|
201
|
-
for style_name in styles:
|
|
202
|
-
if style_name not in self.supported_styles:
|
|
203
|
-
result["warnings"].append(f"不支持的样式: {style_name}")
|
|
204
|
-
|
|
205
|
-
# 验证颜色格式
|
|
206
|
-
if style_name in ["color", "background-color"]:
|
|
207
|
-
color = styles[style_name]
|
|
208
|
-
if color and not color.startswith("#"):
|
|
209
|
-
result["warnings"].append(f"颜色格式建议使用十六进制: {color}")
|
|
210
|
-
elif color and len(color) != 7:
|
|
211
|
-
result["errors"].append(f"颜色格式错误: {color}")
|
|
212
|
-
result["valid"] = False
|
|
213
|
-
|
|
214
|
-
def _validate_nesting(self, soup: BeautifulSoup, result: Dict):
|
|
215
|
-
"""验证嵌套"""
|
|
216
|
-
# 检查段落嵌套
|
|
217
|
-
for p in soup.find_all("p"):
|
|
218
|
-
children = p.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "div"])
|
|
219
|
-
if children:
|
|
220
|
-
result["errors"].append(f"<p> 标签不能包含块级元素")
|
|
221
|
-
result["valid"] = False
|
|
222
|
-
|
|
223
|
-
# 检查表格嵌套
|
|
224
|
-
for table in soup.find_all("table"):
|
|
225
|
-
direct_content = [
|
|
226
|
-
c.name for c in table.children if c.name and c.name not in ["tr"]
|
|
227
|
-
]
|
|
228
|
-
if direct_content:
|
|
229
|
-
result["errors"].append(f"<table> 只能包含 <tr> 标签")
|
|
230
|
-
result["valid"] = False
|
|
231
|
-
|
|
232
|
-
# 检查列表嵌套
|
|
233
|
-
for li in soup.find_all("li"):
|
|
234
|
-
direct_lists = li.find_all(["ul", "ol"], recursive=False)
|
|
235
|
-
if len(direct_lists) > 1:
|
|
236
|
-
result["warnings"].append(f"<li> 建议只包含一个列表")
|
|
237
|
-
|
|
238
|
-
def _collect_stats(self, soup: BeautifulSoup, result: Dict):
|
|
239
|
-
"""收集统计信息"""
|
|
240
|
-
stats = {}
|
|
241
|
-
|
|
242
|
-
# 标签统计
|
|
243
|
-
for tag in self.supported_tags:
|
|
244
|
-
count = len(soup.find_all(tag))
|
|
245
|
-
if count > 0:
|
|
246
|
-
stats[tag] = count
|
|
31
|
+
# 使用严格验证器
|
|
32
|
+
result = self.strict_validator.validate(html_content, strict_mode=False)
|
|
33
|
+
|
|
34
|
+
# 转换为旧格式
|
|
35
|
+
old_format = {
|
|
36
|
+
"valid": result.is_valid,
|
|
37
|
+
"errors": [e["message"] for e in result.errors],
|
|
38
|
+
"warnings": [w["message"] for w in result.warnings],
|
|
39
|
+
"stats": getattr(result, "details", {}),
|
|
40
|
+
}
|
|
247
41
|
|
|
248
|
-
|
|
249
|
-
stats["total_tags"] = len(soup.find_all(True))
|
|
250
|
-
stats["text_length"] = len(soup.get_text())
|
|
42
|
+
return old_format
|
|
251
43
|
|
|
252
|
-
|
|
44
|
+
def validate_strict(self, html_content: str) -> Dict:
|
|
45
|
+
"""
|
|
46
|
+
严格验证HTML内容(返回完整结构化结果)
|
|
253
47
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
key, value = item.split(":", 1)
|
|
260
|
-
styles[key.strip()] = value.strip()
|
|
261
|
-
return styles
|
|
48
|
+
Returns:
|
|
49
|
+
Dict: 完整的结构化验证结果
|
|
50
|
+
"""
|
|
51
|
+
result = self.strict_validator.validate(html_content, strict_mode=False)
|
|
52
|
+
return result.to_json()
|
|
262
53
|
|
|
263
54
|
def get_validation_report(self, result: Dict) -> str:
|
|
264
|
-
"""
|
|
55
|
+
"""生成验证报告(兼容旧接口)"""
|
|
265
56
|
report = []
|
|
266
57
|
|
|
267
58
|
report.append("=== HTML 验证报告 ===\n")
|
|
@@ -271,27 +62,27 @@ class HTMLValidator:
|
|
|
271
62
|
report.append(f"验证状态: {status}\n")
|
|
272
63
|
|
|
273
64
|
# 统计信息
|
|
274
|
-
if result
|
|
65
|
+
if result.get("stats"):
|
|
275
66
|
report.append("📊 统计信息:")
|
|
276
67
|
for key, value in result["stats"].items():
|
|
277
68
|
report.append(f" - {key}: {value}")
|
|
278
69
|
report.append("")
|
|
279
70
|
|
|
280
71
|
# 错误
|
|
281
|
-
if result
|
|
72
|
+
if result.get("errors"):
|
|
282
73
|
report.append("❌ 错误:")
|
|
283
74
|
for error in result["errors"]:
|
|
284
75
|
report.append(f" - {error}")
|
|
285
76
|
report.append("")
|
|
286
77
|
|
|
287
78
|
# 警告
|
|
288
|
-
if result
|
|
79
|
+
if result.get("warnings"):
|
|
289
80
|
report.append("⚠️ 警告:")
|
|
290
81
|
for warning in result["warnings"]:
|
|
291
82
|
report.append(f" - {warning}")
|
|
292
83
|
report.append("")
|
|
293
84
|
|
|
294
|
-
if not result
|
|
85
|
+
if not result.get("errors") and not result.get("warnings"):
|
|
295
86
|
report.append("🎉 完美!没有发现任何问题。")
|
|
296
87
|
|
|
297
88
|
return "\n".join(report)
|
|
@@ -301,42 +92,32 @@ class HTMLTemplateGenerator:
|
|
|
301
92
|
"""HTML模板生成器"""
|
|
302
93
|
|
|
303
94
|
def __init__(self):
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
.
|
|
329
|
-
.
|
|
330
|
-
</style>
|
|
331
|
-
</head>
|
|
332
|
-
<body>
|
|
333
|
-
{content}
|
|
334
|
-
</body>
|
|
335
|
-
</html>"""
|
|
336
|
-
|
|
337
|
-
def generate_template(self, title: str = "文档", content: str = "") -> str:
|
|
338
|
-
"""生成HTML模板"""
|
|
339
|
-
return self.base_template.format(title=title, content=content)
|
|
95
|
+
# 加载schema
|
|
96
|
+
import os
|
|
97
|
+
import sys
|
|
98
|
+
|
|
99
|
+
schema_path = os.path.join(
|
|
100
|
+
os.path.dirname(__file__), "sample", "html_schema.py"
|
|
101
|
+
)
|
|
102
|
+
if os.path.exists(schema_path):
|
|
103
|
+
sys.path.insert(0, os.path.dirname(schema_path))
|
|
104
|
+
from html_schema import get_schema
|
|
105
|
+
|
|
106
|
+
self.schema = get_schema()
|
|
107
|
+
else:
|
|
108
|
+
self.schema = {}
|
|
109
|
+
|
|
110
|
+
# 加载example.html作为约束示例
|
|
111
|
+
example_path = os.path.join(os.path.dirname(__file__), "sample", "example.html")
|
|
112
|
+
if os.path.exists(example_path):
|
|
113
|
+
with open(example_path, "r", encoding="utf-8") as f:
|
|
114
|
+
self.constraint_example = f.read()
|
|
115
|
+
else:
|
|
116
|
+
self.constraint_example = ""
|
|
117
|
+
|
|
118
|
+
def get_constraint_example(self) -> str:
|
|
119
|
+
"""获取HTML约束示例(从example.html加载)"""
|
|
120
|
+
return self.constraint_example
|
|
340
121
|
|
|
341
122
|
def get_element_examples(self) -> Dict:
|
|
342
123
|
"""获取元素示例"""
|
|
@@ -383,6 +164,10 @@ class HTMLTemplateGenerator:
|
|
|
383
164
|
"formula": "<math>E = mc^2</math>",
|
|
384
165
|
}
|
|
385
166
|
|
|
167
|
+
def get_schema(self) -> dict:
|
|
168
|
+
"""获取HTML格式约束schema"""
|
|
169
|
+
return self.schema
|
|
170
|
+
|
|
386
171
|
|
|
387
172
|
# 全局实例
|
|
388
173
|
validator = HTMLValidator()
|