@birthday8/doc-mcp 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/index.js +61 -56
- package/install.js +45 -35
- package/package.json +1 -1
- package/python/docx_converter.py +15 -83
- package/python/html_rules.py +570 -0
- package/python/html_validator.py +59 -274
- package/python/html_validator_strict.py +428 -0
- package/python/sample/html_schema.py +283 -0
- package/python/server.py +112 -75
- package/python/test_strict_validation.py +118 -0
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
"""
|
|
2
|
+
严格HTML验证器
|
|
3
|
+
基于规则引擎的严格验证,提供结构化JSON输出
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
|
7
|
+
from typing import Dict, List, Set, Optional, Any
|
|
8
|
+
from html_rules import RuleEngine, ValidationResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StrictHTMLValidator:
    """Rule-driven strict HTML validator.

    Parses a document with BeautifulSoup and checks its structure, tags,
    attributes, inline styles, nesting and self-closing tags against the
    rules exposed by a ``RuleEngine``. Findings are recorded on a
    ``ValidationResult``.
    """

    def __init__(self, rule_engine: Optional[RuleEngine] = None):
        """Create a validator.

        Args:
            rule_engine: Engine supplying the tag/attribute/style rules. A
                default ``RuleEngine()`` is created when omitted.
        """
        # Compare against None so that an explicitly supplied engine is never
        # discarded merely because it evaluates as falsy.
        self.rule_engine = rule_engine if rule_engine is not None else RuleEngine()

    def validate(self, html_content: str, strict_mode: bool = True) -> ValidationResult:
        """Validate an HTML document.

        Args:
            html_content: HTML markup to check.
            strict_mode: Forwarded to every check stage. When True, the
                structure check stops at the first missing <html>/<body>
                tag, and the per-item checks skip any follow-up inspection
                of a tag, attribute, style or nesting pair that was already
                rejected. When False, the follow-up checks run as well. All
                stages run in both modes.

        Returns:
            The populated ``ValidationResult``. Any exception raised while
            parsing or checking is recorded as a ``PARSE_ERROR`` entry
            instead of propagating to the caller.
        """
        result = ValidationResult(is_valid=True)

        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Basic document structure
            self._validate_structure(soup, result, strict_mode)

            # Tags
            self._validate_tags(soup, result, strict_mode)

            # Attributes
            self._validate_attributes(soup, result, strict_mode)

            # Inline styles
            self._validate_styles(soup, result, strict_mode)

            # Nesting
            self._validate_nesting(soup, result, strict_mode)

            # Self-closing tags
            self._validate_self_closing(soup, result, strict_mode)

            # Statistics (only reached when no stage raised)
            self._collect_stats(soup, result)

        except Exception as e:
            # Boundary handler: callers always get a result object back.
            result.add_error(
                code="PARSE_ERROR",
                message=f"HTML解析错误: {str(e)}",
                details={"error": str(e)},
            )

        return result

    @staticmethod
    def _is_utf8_charset(value: Any) -> bool:
        """Return True when *value* names UTF-8, ignoring case and padding.

        The HTML ``charset`` attribute is matched case-insensitively, so
        ``utf-8`` and ``UTF-8`` are both accepted.
        """
        return isinstance(value, str) and value.strip().lower() == "utf-8"

    @staticmethod
    def _attr_value_text(value: Any) -> str:
        """Return an attribute value as the text it has in the markup.

        BeautifulSoup exposes multi-valued attributes (for example ``rel``)
        as lists; those are joined with single spaces. Anything else is
        converted with ``str``.
        """
        if isinstance(value, (list, tuple)):
            return " ".join(str(part) for part in value)
        return str(value)

    def _validate_structure(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Check for <html>, <head>, <body>, the charset and the language.

        Missing <html> or <body> is an error (and ends this check in strict
        mode); a missing <head>, a non-UTF-8 charset declaration or a
        ``lang`` other than ``zh-CN`` only produce warnings.
        """
        html_tag = soup.find("html")
        if not html_tag:
            result.add_error(
                code="MISSING_HTML_TAG",
                message="缺少 <html> 根标签",
                details={"tag": "html"},
            )
            if strict_mode:
                return

        if not soup.find("head"):
            result.add_warning(code="MISSING_HEAD_TAG", message="缺少 <head> 标签")

        if not soup.find("body"):
            result.add_error(
                code="MISSING_BODY_TAG",
                message="缺少 <body> 标签",
                details={"tag": "body"},
            )
            if strict_mode:
                return

        # Encoding: look for <meta charset="...">
        charset_tag = soup.find("meta", {"charset": True})
        declared = charset_tag.get("charset") if charset_tag else None
        if not self._is_utf8_charset(declared):
            result.add_warning(
                code="CHARSET_WARNING",
                message="建议使用 UTF-8 编码",
                details={"current": declared if charset_tag else "None"},
            )

        # Language
        if html_tag and html_tag.get("lang") != "zh-CN":
            result.add_warning(
                code="LANG_WARNING",
                message="建议设置 lang='zh-CN'",
                details={"current": html_tag.get("lang", "None")},
            )

    def _validate_tags(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Report disallowed tags and tags missing required attributes."""
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name

            # Is the tag allowed at all?
            if not self.rule_engine.is_tag_allowed(tag_name):
                result.add_error(
                    code="DISALLOWED_TAG",
                    message=f"标签 <{tag_name}> 不在允许列表中",
                    details={
                        "tag": tag_name,
                        "position": str(element)[:100],
                        "allowed_tags": list(self.rule_engine.tag_rules.keys()),
                    },
                )
                if strict_mode:
                    # Strict mode: no further checks on a rejected tag.
                    continue

            # Required attributes
            tag_rule = self.rule_engine.get_tag_rule(tag_name)
            if tag_rule and tag_rule.required_attrs:
                for required_attr in tag_rule.required_attrs:
                    if required_attr not in element.attrs:
                        result.add_error(
                            code="MISSING_REQUIRED_ATTR",
                            message=f"标签 <{tag_name}> 缺少必需属性 '{required_attr}'",
                            details={
                                "tag": tag_name,
                                "missing_attr": required_attr,
                                "required_attrs": list(tag_rule.required_attrs),
                            },
                        )

    def _validate_attributes(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Report disallowed attributes and attribute values that fail their rule.

        ``class`` and ``style`` are skipped here. Multi-valued attributes are
        validated as their space-joined text rather than as a list repr.
        """
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name

            for attr_name, attr_value in element.attrs.items():
                # Special attributes are not checked by this stage
                if attr_name in ["class", "style"]:
                    continue

                value_text = self._attr_value_text(attr_value)

                # Is the attribute allowed on this tag?
                if not self.rule_engine.is_attribute_allowed(attr_name, tag_name):
                    result.add_error(
                        code="DISALLOWED_ATTR",
                        message=f"标签 <{tag_name}> 的属性 '{attr_name}' 不在允许列表中",
                        details={
                            "tag": tag_name,
                            "attr": attr_name,
                            "value": value_text[:100],
                            "allowed_attrs_for_tag": self._get_allowed_attrs_for_tag(
                                tag_name
                            ),
                        },
                    )
                    if strict_mode:
                        # Strict mode: skip value validation for a rejected attribute.
                        continue

                # Validate the value (empty values are not validated)
                attr_rule = self.rule_engine.get_attribute_rule(attr_name)
                if attr_rule and attr_rule.validator and attr_value:
                    if not attr_rule.validator(value_text):
                        result.add_error(
                            code="INVALID_ATTR_VALUE",
                            message=f"标签 <{tag_name}> 的属性 '{attr_name}' 值 '{value_text}' 无效",
                            details={
                                "tag": tag_name,
                                "attr": attr_name,
                                "value": value_text,
                            },
                        )

    def _validate_styles(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Report disallowed inline style properties and invalid style values."""
        for element in soup.find_all(style=True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name
            style_str = element.get("style", "")
            styles = self._parse_style(style_str)

            for style_name, style_value in styles.items():
                # Is the style property allowed on this tag?
                if not self.rule_engine.is_style_allowed(style_name, tag_name):
                    result.add_error(
                        code="DISALLOWED_STYLE",
                        message=f"标签 <{tag_name}> 的样式 '{style_name}' 不在允许列表中",
                        details={
                            "tag": tag_name,
                            "style": style_name,
                            "value": style_value,
                            "allowed_styles_for_tag": self._get_allowed_styles_for_tag(
                                tag_name
                            ),
                        },
                    )
                    if strict_mode:
                        # Strict mode: skip value validation for a rejected style.
                        continue

                # Validate the value (empty values are not validated)
                style_rule = self.rule_engine.get_style_rule(style_name)
                if style_rule and style_value:
                    is_valid, error_msg = style_rule.validate_value(style_value)
                    if not is_valid:
                        result.add_error(
                            code="INVALID_STYLE_VALUE",
                            message=f"标签 <{tag_name}> 的样式 '{style_name}' 值 '{style_value}' 无效: {error_msg}",
                            details={
                                "tag": tag_name,
                                "style": style_name,
                                "value": style_value,
                                "error": error_msg,
                            },
                        )

    def _validate_nesting(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Check every tag against its parent's and its own nesting rules."""
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name
            parent = element.parent

            if parent and isinstance(parent, Tag):
                parent_name = parent.name

                tag_rule = self.rule_engine.get_tag_rule(tag_name)
                parent_rule = self.rule_engine.get_tag_rule(parent_name)

                # May this tag appear under that parent?
                if tag_rule and not tag_rule.can_be_child_of(parent_name):
                    result.add_error(
                        code="INVALID_NESTING",
                        message=f"标签 <{tag_name}> 不能作为 <{parent_name}> 的子元素",
                        details={
                            "child_tag": tag_name,
                            "parent_tag": parent_name,
                            "allowed_parents": (
                                list(tag_rule.allowed_parents)
                                if tag_rule.allowed_parents
                                else ["任意"]
                            ),
                        },
                    )
                    if strict_mode:
                        # Strict mode: one nesting error per pair is enough.
                        continue

                # May the parent contain this tag?
                if parent_rule and not parent_rule.can_contain(tag_name):
                    result.add_error(
                        code="INVALID_CONTAINMENT",
                        message=f"标签 <{parent_name}> 不能包含 <{tag_name}>",
                        details={
                            "parent_tag": parent_name,
                            "child_tag": tag_name,
                            "allowed_children": (
                                list(parent_rule.allowed_children)
                                if parent_rule.allowed_children
                                else ["任意"]
                            ),
                        },
                    )

    def _validate_self_closing(
        self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
    ):
        """Warn when a tag whose rule says ``must_be_self_closing`` has content.

        Whitespace-only text children do not count as content.
        ``strict_mode`` is accepted for signature parity and is not used.
        """
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name
            tag_rule = self.rule_engine.get_tag_rule(tag_name)

            if tag_rule and tag_rule.must_be_self_closing:
                # Anything other than blank text counts as content
                if element.contents and not all(
                    isinstance(c, NavigableString) and not c.strip()
                    for c in element.contents
                ):
                    result.add_warning(
                        code="SELF_CLOSING_WARNING",
                        message=f"标签 <{tag_name}> 应该是自闭合标签,但包含内容",
                        details={"tag": tag_name, "content": str(element)[:100]},
                    )

    def _collect_stats(self, soup: BeautifulSoup, result: ValidationResult):
        """Store tag counts, text length and error/warning totals on ``result.details``."""
        stats = {}

        # Per-tag occurrence counts
        tag_counts = {}
        for element in soup.find_all(True):
            if isinstance(element, Tag):
                tag_counts[element.name] = tag_counts.get(element.name, 0) + 1

        stats["tag_counts"] = tag_counts
        stats["total_tags"] = sum(tag_counts.values())
        stats["text_length"] = len(soup.get_text())
        stats["total_errors"] = len(result.errors)
        stats["total_warnings"] = len(result.warnings)

        # Attach to the result
        result.details = stats

    def _parse_style(self, style_str: str) -> Dict[str, str]:
        """Parse an inline ``style`` string into ``{property: value}``.

        Declarations are split on ``;`` and then on the first ``:``; pieces
        without a colon are ignored, and names and values are stripped.
        A later duplicate property overwrites an earlier one.
        """
        styles = {}
        for declaration in style_str.split(";"):
            name, colon, value = declaration.partition(":")
            if colon:
                styles[name.strip()] = value.strip()
        return styles

    def _get_allowed_attrs_for_tag(self, tag_name: str) -> List[str]:
        """Return the attribute names whose rule is allowed and applies to *tag_name*."""
        return [
            attr_name
            for attr_name, attr_rule in self.rule_engine.attr_rules.items()
            if attr_rule.allowed and attr_rule.is_applicable_to(tag_name)
        ]

    def _get_allowed_styles_for_tag(self, tag_name: str) -> List[str]:
        """Return the style names whose rule is allowed and applies to *tag_name*."""
        return [
            style_name
            for style_name, style_rule in self.rule_engine.style_rules.items()
            if style_rule.allowed and style_rule.is_applicable_to(tag_name)
        ]

    def get_validation_report(self, result: ValidationResult) -> str:
        """Render *result* as a human-readable, newline-joined text report.

        The report lists the overall status, the error and warning counts,
        each error and warning with its details, and the collected
        statistics when ``result.details`` is present and non-empty.
        """
        report = []

        report.append("=== HTML 验证报告 ===\n")

        # Overall status
        status = "✅ 通过" if result.is_valid else "❌ 失败"
        report.append(f"验证状态: {status}")
        report.append(f"错误数: {len(result.errors)}")
        report.append(f"警告数: {len(result.warnings)}\n")

        # Errors
        if result.errors:
            report.append("❌ 错误:")
            for i, error in enumerate(result.errors, 1):
                report.append(f"\n{i}. [{error['code']}] {error['message']}")
                if error.get("details"):
                    report.append(f" 详情: {error['details']}")
            report.append("")

        # Warnings
        if result.warnings:
            report.append("⚠️ 警告:")
            for i, warning in enumerate(result.warnings, 1):
                report.append(f"\n{i}. [{warning['code']}] {warning['message']}")
                if warning.get("details"):
                    report.append(f" 详情: {warning['details']}")
            report.append("")

        # Statistics
        if hasattr(result, "details") and result.details:
            report.append("📊 统计信息:")
            for key, value in result.details.items():
                report.append(f" - {key}: {value}")
            report.append("")

        if not result.errors and not result.warnings:
            report.append("🎉 完美!没有发现任何问题。")

        return "\n".join(report)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
class ValidationError(Exception):
    """Exception that carries the ``ValidationResult`` of a failed validation.

    The exception message summarises how many errors and warnings the
    attached result contains.
    """

    def __init__(self, result: ValidationResult):
        """Keep *result* on the instance and build the summary message from it."""
        self.result = result
        error_count = len(result.errors)
        warning_count = len(result.warnings)
        super().__init__(
            f"HTML验证失败: {error_count} 个错误, {warning_count} 个警告"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return whatever ``to_json()`` of the attached result produces."""
        payload = self.result.to_json()
        return payload
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
# Module-level cache for the shared validator; filled on first request.
_strict_validator_instance = None


def get_strict_validator():
    """Return the shared ``StrictHTMLValidator``, creating it on first use."""
    global _strict_validator_instance
    if _strict_validator_instance is not None:
        return _strict_validator_instance
    _strict_validator_instance = StrictHTMLValidator()
    return _strict_validator_instance


# Backward-compatible module-level instance. Note that this line runs at
# import time, so the shared validator is created as soon as the module loads.
strict_validator = get_strict_validator()
|