@birthday8/doc-mcp 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -21
- package/index.js +61 -56
- package/install.js +45 -35
- package/package.json +1 -1
- package/python/docx_converter.py +15 -83
- package/python/html_rules.py +652 -0
- package/python/html_validator.py +59 -274
- package/python/html_validator_strict.py +430 -0
- package/python/sample/example.html +0 -14
- package/python/sample/html_schema.py +352 -0
- package/python/server.py +112 -75
- package/python/test_strict_validation.py +118 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
|
|
2
|
+
严格HTML验证器
|
|
3
|
+
基于规则引擎的严格验证,提供结构化JSON输出
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
|
7
|
+
from typing import Dict, List, Set, Optional, Any
|
|
8
|
+
from html_rules import RuleEngine, ValidationResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StrictHTMLValidator:
    """Strict HTML validator."""

    # Class names accepted by the class-name check; anything else is reported.
    SUPPORTED_CLASSES = {
        "center",
        "right",
        "left",
        "info",
        "warning",
        "success",
        "columns",
    }
|
|
24
|
+
|
|
25
|
+
def __init__(self, rule_engine: RuleEngine = None):
    """Remember the rule engine that every check consults.

    Args:
        rule_engine: Engine to use. When a falsy value (such as ``None``)
            is given, a default ``RuleEngine()`` is created instead.
    """
    if not rule_engine:
        rule_engine = RuleEngine()
    self.rule_engine = rule_engine
|
|
27
|
+
|
|
28
|
+
def validate(self, html_content: str, strict_mode: bool = True) -> ValidationResult:
    """Parse an HTML document and run every validation check on it.

    Args:
        html_content: HTML source text to validate.
        strict_mode: Passed unchanged to each individual check.
            NOTE(review): this method never stops early on an error; it
            always runs all checks unless one of them raises.

    Returns:
        ValidationResult: The collected errors and warnings. An exception
        raised while parsing or checking is recorded as a
        ``PARSING_ERROR`` entry rather than propagated, and the remaining
        checks are skipped.
    """
    result = ValidationResult(is_valid=True)

    try:
        soup = BeautifulSoup(html_content, "html.parser")

        # Fixed order: structure, tags, attributes, class names, inline
        # styles, nesting, self-closing tags, <style> tags.
        checks = (
            self._validate_structure,
            self._validate_tags,
            self._validate_attributes,
            self._validate_classes,
            self._validate_styles,
            self._validate_nesting,
            self._validate_self_closing,
            self._validate_style_tags,
        )
        for check in checks:
            check(soup, result, strict_mode)

    except Exception as exc:
        reason = str(exc)
        result.add_error(
            code="PARSING_ERROR",
            message=f"HTML解析错误: {reason}",
            details={"exception": reason},
        )

    return result
|
|
76
|
+
|
|
77
|
+
def _validate_structure(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Check the document skeleton: <html>, <head>, <body>, charset and lang.

    A missing <html> or <body> is recorded as an error; a missing <head>,
    a charset other than UTF-8 and a lang other than zh-CN are recorded as
    warnings. The charset and lang comparisons ignore letter case, so the
    common spellings ``charset="utf-8"`` and ``lang="zh-cn"`` do not
    trigger a warning.

    Args:
        soup: Parsed document.
        result: Receives the errors and warnings.
        strict_mode: When true, the check returns right after recording a
            missing <html> or missing <body> error.
    """
    html_tag = soup.find("html")
    if html_tag is None:
        result.add_error(
            code="MISSING_HTML_TAG",
            message="缺少 <html> 根标签",
            details={"tag": "html"},
        )
        if strict_mode:
            return

    if soup.find("head") is None:
        result.add_warning(code="MISSING_HEAD_TAG", message="缺少 <head> 标签")

    if soup.find("body") is None:
        result.add_error(
            code="MISSING_BODY_TAG",
            message="缺少 <body> 标签",
            details={"tag": "body"},
        )
        if strict_mode:
            return

    # Encoding: look for <meta charset="...">.
    charset = soup.find("meta", {"charset": True})
    declared = charset.get("charset") if charset is not None else None
    # Encoding labels are matched case-insensitively in HTML, so "utf-8"
    # must be treated the same as "UTF-8".
    if declared is None or str(declared).strip().lower() != "utf-8":
        result.add_warning(
            code="CHARSET_WARNING",
            message="建议使用 UTF-8 编码",
            details={"current": declared if charset is not None else "None"},
        )

    # Language: language tags are case-insensitive as well.
    if html_tag is not None:
        lang = html_tag.get("lang")
        if lang is None or str(lang).strip().lower() != "zh-cn":
            result.add_warning(
                code="LANG_WARNING",
                message="建议设置 lang='zh-CN'",
                details={"current": html_tag.get("lang", "None")},
            )
|
|
121
|
+
|
|
122
|
+
def _validate_tags(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report disallowed tags and tags missing a required attribute.

    Args:
        soup: Parsed document.
        result: Receives ``DISALLOWED_TAG`` and ``MISSING_REQUIRED_ATTR``
            errors.
        strict_mode: When true, a disallowed tag is not additionally
            checked for required attributes.
    """
    engine = self.rule_engine

    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        name = node.name

        # Is the tag itself permitted?
        if not engine.is_tag_allowed(name):
            result.add_error(
                code="DISALLOWED_TAG",
                message=f"标签 <{name}> 不在允许列表中",
                details={
                    "tag": name,
                    "position": str(node)[:100],
                    "allowed_tags": list(engine.tag_rules.keys()),
                },
            )
            if strict_mode:
                continue

        # Are all required attributes present?
        rule = engine.get_tag_rule(name)
        if not rule or not rule.required_attrs:
            continue

        for needed in rule.required_attrs:
            if needed in node.attrs:
                continue
            result.add_error(
                code="MISSING_REQUIRED_ATTR",
                message=f"标签 <{name}> 缺少必需属性 '{needed}'",
                details={
                    "tag": name,
                    "missing_attr": needed,
                    "required_attrs": list(rule.required_attrs),
                },
            )
|
|
160
|
+
|
|
161
|
+
def _validate_attributes(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report attributes the rule engine does not allow on their tag.

    Tags for which the engine has no rule are skipped entirely.

    Args:
        soup: Parsed document.
        result: Receives one ``DISALLOWED_ATTR`` error per offending
            attribute.
        strict_mode: Accepted for signature parity with the other checks;
            it does not change what this check reports.
    """
    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        name = node.name

        # Without a rule for this tag there is nothing to compare against.
        if not self.rule_engine.get_tag_rule(name):
            continue

        for attr_name, attr_value in node.attrs.items():
            if self.rule_engine.is_attribute_allowed(attr_name, name):
                continue
            result.add_error(
                code="DISALLOWED_ATTR",
                message=f"标签 <{name}> 的属性 '{attr_name}' 不在允许列表中",
                details={
                    "tag": name,
                    "attribute": attr_name,
                    "value": str(attr_value)[:100],
                },
            )
|
|
189
|
+
|
|
190
|
+
def _validate_classes(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report class names that are not in ``SUPPORTED_CLASSES``.

    One ``UNSUPPORTED_CLASS`` error is recorded per element that carries
    at least one unsupported class name. The ``supported_classes`` list in
    the error details is sorted so the structured output is identical from
    run to run.

    Args:
        soup: Parsed document.
        result: Receives the errors.
        strict_mode: Accepted for signature parity with the other checks;
            it does not change what this check reports.
    """
    # Sorted once up front: iterating a set of strings can yield a
    # different order in every interpreter run.
    supported = sorted(self.SUPPORTED_CLASSES)

    for element in soup.find_all(True):
        if not isinstance(element, Tag):
            continue

        class_attr = element.get("class")
        if not class_attr:
            continue

        # The attribute may arrive as a list of names or as one
        # whitespace-separated string; normalise to a list.
        if isinstance(class_attr, list):
            classes = class_attr
        else:
            classes = class_attr.split()

        unsupported_classes = [
            cls for cls in classes if cls not in self.SUPPORTED_CLASSES
        ]
        if not unsupported_classes:
            continue

        result.add_error(
            code="UNSUPPORTED_CLASS",
            message=f"类名 '{', '.join(unsupported_classes)}' 不在支持列表中,请使用内联 style 属性代替",
            details={
                "unsupported_classes": unsupported_classes,
                # Fresh copy per error so entries never share one list.
                "supported_classes": list(supported),
                "element": element.name,
                "note": "只支持:center, right, left, info, warning, success, columns",
            },
        )
|
|
226
|
+
|
|
227
|
+
def _validate_styles(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Validate the inline ``style`` attribute of every element.

    Each declared property is first checked against the rule engine's
    allow-list and then, for the properties with a fixed value format
    (colors, font-size, line-height, margin-top/bottom), against that
    format.

    Args:
        soup: Parsed document.
        result: Receives the errors.
        strict_mode: When true, a property that is not allowed is not
            additionally checked for its value format.
    """
    for element in soup.find_all(True):
        if not isinstance(element, Tag):
            continue

        style_attr = element.get("style")
        if not style_attr:
            continue

        styles = self._parse_style_string(style_attr)

        for prop_name, prop_value in styles.items():
            if not self.rule_engine.is_style_allowed(prop_name, element.name):
                result.add_error(
                    code="DISALLOWED_STYLE",
                    message=f"样式属性 '{prop_name}' 不在允许列表中",
                    details={
                        "style": prop_name,
                        "value": prop_value,
                        "element": element.name,
                    },
                )
                if strict_mode:
                    continue

            self._check_style_value(prop_name, prop_value, result)

def _check_style_value(
    self, prop_name: str, prop_value: str, result: ValidationResult
):
    """Record an error when a format-constrained property has a bad value."""
    if prop_name in ("color", "background-color"):
        # Color format is delegated to the rule engine.
        if not self.rule_engine.is_valid_color(prop_value):
            result.add_error(
                code="INVALID_COLOR_FORMAT",
                message="颜色格式错误,必须使用十六进制格式 #RRGGBB",
                details={
                    "style": prop_name,
                    "value": prop_value,
                    "correct_format": "#RRGGBB",
                },
            )
    elif prop_name == "font-size":
        if not self._is_pt_length(prop_value):
            result.add_error(
                code="INVALID_FONT_SIZE_FORMAT",
                message="字号格式错误,必须使用 pt 单位",
                details={
                    "style": prop_name,
                    "value": prop_value,
                    "correct_format": "14pt, 16pt, 18pt",
                },
            )
    elif prop_name == "line-height":
        # A bare number is required; "nan" and "inf" parse as floats but
        # are not usable line heights, so they are rejected too.
        if not self._is_finite_number(prop_value):
            result.add_error(
                code="INVALID_LINE_HEIGHT_FORMAT",
                message="行距格式错误,必须是数字或小数",
                details={
                    "style": prop_name,
                    "value": prop_value,
                    "correct_format": "1.5, 1.8, 2.0",
                },
            )
    elif prop_name in ("margin-top", "margin-bottom"):
        if not self._is_pt_length(prop_value):
            result.add_error(
                code="INVALID_MARGIN_FORMAT",
                message="边距格式错误,必须使用 pt 单位",
                details={
                    "style": prop_name,
                    "value": prop_value,
                    "correct_format": "10pt, 12pt, 15pt",
                },
            )

def _is_pt_length(self, value: str) -> bool:
    """Return True for a finite number followed by ``pt`` (e.g. ``12pt``).

    A plain suffix test would also accept ``"pt"`` or ``"abcpt"``, so the
    part before the unit must itself be a number.
    """
    return value.endswith("pt") and self._is_finite_number(value[:-2])

@staticmethod
def _is_finite_number(text: str) -> bool:
    """Return True when *text* parses as a finite float."""
    # Imported locally so this check does not depend on the file's
    # top-level import block.
    import math

    try:
        return math.isfinite(float(text))
    except ValueError:
        return False
|
|
309
|
+
|
|
310
|
+
def _validate_nesting(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report direct parent/child tag pairs the rule engine forbids.

    Only immediate children are compared with their parent; deeper
    descendants are covered when the loop reaches their own parent.

    Args:
        soup: Parsed document.
        result: Receives one ``INVALID_NESTING`` error per forbidden pair.
        strict_mode: Accepted for signature parity with the other checks;
            it does not change what this check reports.
    """
    for outer in soup.find_all(True):
        if not isinstance(outer, Tag):
            continue

        for inner in outer.find_all(True, recursive=False):
            if not isinstance(inner, Tag):
                continue

            if self.rule_engine.is_nesting_allowed(outer.name, inner.name):
                continue

            result.add_error(
                code="INVALID_NESTING",
                message=f"标签 <{inner.name}> 不能嵌套在 <{outer.name}> 中",
                details={
                    "parent": outer.name,
                    "child": inner.name,
                    "context": str(outer)[:100],
                },
            )
|
|
334
|
+
|
|
335
|
+
def _validate_self_closing(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Warn about self-closing tags that nevertheless contain content.

    Whitespace-only text inside such a tag does not count as content.

    Args:
        soup: Parsed document.
        result: Receives ``NON_EMPTY_SELF_CLOSING_TAG`` warnings.
        strict_mode: Accepted for signature parity with the other checks;
            it is not used here.
    """
    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        name = node.name
        if not self.rule_engine.is_self_closing_tag(name):
            continue

        # Content is anything other than a blank text node.
        has_content = any(
            not isinstance(part, NavigableString) or str(part).strip()
            for part in node.contents
        )
        if has_content:
            result.add_warning(
                code="NON_EMPTY_SELF_CLOSING_TAG",
                message=f"标签 <{name}> 应该是自闭合的,但包含内容",
                details={"tag": name},
            )
|
|
357
|
+
|
|
358
|
+
def _validate_style_tags(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Warn once when the document contains any <style> tag.

    Args:
        soup: Parsed document.
        result: Receives a single ``STYLE_TAG_DETECTED`` warning whose
            details carry the number of <style> tags found.
        strict_mode: Accepted for signature parity with the other checks;
            it is not used here.
    """
    found = soup.find_all("style")
    if not found:
        return

    result.add_warning(
        code="STYLE_TAG_DETECTED",
        message="检测到 <style> 标签,其中的 CSS 规则不会被转换。请使用内联 style 属性代替。",
        details={
            "count": len(found),
            "note": '例如:将 <style>h1 { font-size: 18pt; }</style> 改为 <h1 style="font-size: 18pt;">标题</h1>',
        },
    )
|
|
372
|
+
|
|
373
|
+
def _parse_style_string(self, style_str: str) -> Dict[str, str]:
|
|
374
|
+
"""解析style字符串为字典"""
|
|
375
|
+
styles = {}
|
|
376
|
+
if not style_str:
|
|
377
|
+
return styles
|
|
378
|
+
|
|
379
|
+
for item in style_str.split(";"):
|
|
380
|
+
item = item.strip()
|
|
381
|
+
if not item or ":" not in item:
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
key, value = item.split(":", 1)
|
|
385
|
+
styles[key.strip()] = value.strip()
|
|
386
|
+
|
|
387
|
+
return styles
|
|
388
|
+
|
|
389
|
+
def get_validation_report(self, result: ValidationResult) -> str:
    """Render a validation result as a plain-text report.

    The report contains a title, the pass/fail status taken from
    ``result.is_valid``, the number of errors and warnings, and then one
    numbered line per error and per warning showing its ``code`` and
    ``message`` (plus its ``details`` when that entry is truthy). When
    there are neither errors nor warnings a closing "no problems" line is
    added instead.

    Args:
        result: Result to describe; its ``errors`` and ``warnings`` are
            read as sequences of mappings.

    Returns:
        The report lines joined with newlines.
    """
    report = []

    report.append("=== HTML 验证报告 ===\n")

    # Overall pass/fail status
    status = "✅ 通过" if result.is_valid else "❌ 失败"
    report.append(f"验证状态: {status}\n")

    # Error and warning counts
    report.append("📊 统计信息:")
    report.append(f" - 错误数量: {len(result.errors)}")
    report.append(f" - 警告数量: {len(result.warnings)}")
    report.append("")

    # Errors
    if result.errors:
        report.append("❌ 错误:")
        for i, error in enumerate(result.errors, 1):
            report.append(f" {i}. [{error['code']}] {error['message']}")
            if error.get("details"):
                report.append(f" 详情: {error['details']}")
        report.append("")

    # Warnings
    if result.warnings:
        report.append("⚠️ 警告:")
        for i, warning in enumerate(result.warnings, 1):
            report.append(f" {i}. [{warning['code']}] {warning['message']}")
            if warning.get("details"):
                report.append(f" 详情: {warning['details']}")
        report.append("")

    if not result.errors and not result.warnings:
        report.append("🎉 完美!没有发现任何问题。")

    return "\n".join(report)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
# Shared module-level validator instance, created at import time with no
# rule engine argument.
strict_validator = StrictHTMLValidator()
|
|
@@ -2,21 +2,7 @@
|
|
|
2
2
|
<html lang="zh-CN">
|
|
3
3
|
<head>
|
|
4
4
|
<meta charset="UTF-8">
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
5
|
<title>MCP复杂格式测试</title>
|
|
7
|
-
<style>
|
|
8
|
-
body {
|
|
9
|
-
font-family: '微软雅黑';
|
|
10
|
-
font-size: 12pt;
|
|
11
|
-
line-height: 1.8;
|
|
12
|
-
padding: 20px;
|
|
13
|
-
max-width: 800px;
|
|
14
|
-
margin: 0 auto;
|
|
15
|
-
}
|
|
16
|
-
h1 { font-family: '微软雅黑'; font-size: 18pt; text-align: center; color: #333; }
|
|
17
|
-
h2 { font-family: '微软雅黑'; font-size: 16pt; color: #4a3f6b; border-bottom: 2px solid #667eea; padding-bottom: 10px; }
|
|
18
|
-
h3 { font-family: '微软雅黑'; font-size: 14pt; color: #5b4e8c; }
|
|
19
|
-
</style>
|
|
20
6
|
</head>
|
|
21
7
|
<body>
|
|
22
8
|
<h1>MCP复杂格式测试文档</h1>
|