@birthday8/doc-mcp 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,428 @@
1
+ """
2
+ 严格HTML验证器
3
+ 基于规则引擎的严格验证,提供结构化JSON输出
4
+ """
5
+
6
+ from bs4 import BeautifulSoup, Tag, NavigableString
7
+ from typing import Dict, List, Set, Optional, Any
8
+ from html_rules import RuleEngine, ValidationResult
9
+
10
+
11
+ class StrictHTMLValidator:
12
+ """严格HTML验证器"""
13
+
14
+ def __init__(self, rule_engine: RuleEngine = None):
15
+ self.rule_engine = rule_engine or RuleEngine()
16
+
17
def validate(self, html_content: str, strict_mode: bool = True) -> ValidationResult:
    """Run every validation pass over an HTML document.

    The document is parsed with BeautifulSoup's ``html.parser`` and then
    handed, in order, to the structure, tag, attribute, style, nesting and
    self-closing checks. Statistics are collected last.

    Args:
        html_content: The HTML text to validate.
        strict_mode: Forwarded unchanged to each check. Every pass always
            runs; inside a pass, strict mode only skips the remaining
            checks for the item that just failed (see the individual
            ``_validate_*`` methods).

    Returns:
        ValidationResult: The accumulated errors, warnings and statistics.
        If any exception escapes parsing or a check, it is recorded as a
        single ``PARSE_ERROR`` entry and statistics are not collected.
    """
    result = ValidationResult(is_valid=True)

    try:
        soup = BeautifulSoup(html_content, "html.parser")

        # Order matters only for the order in which findings are reported.
        passes = (
            self._validate_structure,
            self._validate_tags,
            self._validate_attributes,
            self._validate_styles,
            self._validate_nesting,
            self._validate_self_closing,
        )
        for run_pass in passes:
            run_pass(soup, result, strict_mode)

        self._collect_stats(soup, result)
    except Exception as e:
        reason = str(e)
        result.add_error(
            code="PARSE_ERROR",
            message=f"HTML解析错误: {reason}",
            details={"error": reason},
        )

    return result
62
+
63
+ def _validate_structure(
64
+ self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
65
+ ):
66
+ """验证HTML基本结构"""
67
+ html_tag = soup.find("html")
68
+ if not html_tag:
69
+ result.add_error(
70
+ code="MISSING_HTML_TAG",
71
+ message="缺少 <html> 根标签",
72
+ details={"tag": "html"},
73
+ )
74
+ if strict_mode:
75
+ return
76
+
77
+ head_tag = soup.find("head")
78
+ if not head_tag:
79
+ result.add_warning(code="MISSING_HEAD_TAG", message="缺少 <head> 标签")
80
+
81
+ body_tag = soup.find("body")
82
+ if not body_tag:
83
+ result.add_error(
84
+ code="MISSING_BODY_TAG",
85
+ message="缺少 <body> 标签",
86
+ details={"tag": "body"},
87
+ )
88
+ if strict_mode:
89
+ return
90
+
91
+ # 检查编码
92
+ charset = soup.find("meta", {"charset": True})
93
+ if not charset or charset.get("charset") != "UTF-8":
94
+ result.add_warning(
95
+ code="CHARSET_WARNING",
96
+ message="建议使用 UTF-8 编码",
97
+ details={"current": charset.get("charset") if charset else "None"},
98
+ )
99
+
100
+ # 检查语言
101
+ if html_tag and html_tag.get("lang") != "zh-CN":
102
+ result.add_warning(
103
+ code="LANG_WARNING",
104
+ message="建议设置 lang='zh-CN'",
105
+ details={"current": html_tag.get("lang", "None")},
106
+ )
107
+
108
def _validate_tags(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report tags that are not allowed or lack required attributes.

    Args:
        soup: Parsed document.
        result: Collector that receives errors.
        strict_mode: When True, a disallowed tag is reported and its
            required-attribute check is skipped; when False both checks
            run for every tag.
    """
    engine = self.rule_engine

    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        name = node.name

        # Allow-list check.
        if not engine.is_tag_allowed(name):
            result.add_error(
                code="DISALLOWED_TAG",
                message=f"标签 <{name}> 不在允许列表中",
                details={
                    "tag": name,
                    "position": str(node)[:100],
                    "allowed_tags": list(engine.tag_rules.keys()),
                },
            )
            if strict_mode:
                continue

        # Required-attribute check; nothing to do without a rule that
        # declares required attributes.
        rule = engine.get_tag_rule(name)
        if not rule or not rule.required_attrs:
            continue

        for wanted in rule.required_attrs:
            if wanted in node.attrs:
                continue
            result.add_error(
                code="MISSING_REQUIRED_ATTR",
                message=f"标签 <{name}> 缺少必需属性 '{wanted}'",
                details={
                    "tag": name,
                    "missing_attr": wanted,
                    "required_attrs": list(rule.required_attrs),
                },
            )
146
+
147
def _validate_attributes(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report attributes that are not allowed or whose value is invalid.

    ``class`` and ``style`` are skipped here; styles are handled by
    ``_validate_styles``.

    Args:
        soup: Parsed document.
        result: Collector that receives errors.
        strict_mode: When True, a disallowed attribute is reported and its
            value is not validated; when False the value check still runs.
    """
    for element in soup.find_all(True):
        if not isinstance(element, Tag):
            continue

        tag_name = element.name

        for attr_name, attr_value in element.attrs.items():
            # Special attributes are not validated by this pass.
            if attr_name in ("class", "style"):
                continue

            # BeautifulSoup returns multi-valued attributes (e.g. rel,
            # headers) as a list of tokens. Rebuild the space-separated
            # text so validators and reports see the markup value rather
            # than a Python list repr such as "['nofollow', 'noopener']".
            if isinstance(attr_value, (list, tuple)):
                value_text = " ".join(str(token) for token in attr_value)
            else:
                value_text = str(attr_value)

            # Allow-list check.
            if not self.rule_engine.is_attribute_allowed(attr_name, tag_name):
                result.add_error(
                    code="DISALLOWED_ATTR",
                    message=f"标签 <{tag_name}> 的属性 '{attr_name}' 不在允许列表中",
                    details={
                        "tag": tag_name,
                        "attr": attr_name,
                        "value": value_text[:100],
                        "allowed_attrs_for_tag": self._get_allowed_attrs_for_tag(
                            tag_name
                        ),
                    },
                )
                if strict_mode:
                    continue

            # Value check; empty values are not passed to the validator.
            attr_rule = self.rule_engine.get_attribute_rule(attr_name)
            if attr_rule and attr_rule.validator and attr_value:
                if not attr_rule.validator(value_text):
                    result.add_error(
                        code="INVALID_ATTR_VALUE",
                        message=f"标签 <{tag_name}> 的属性 '{attr_name}' 值 '{value_text}' 无效",
                        details={
                            "tag": tag_name,
                            "attr": attr_name,
                            "value": value_text,
                        },
                    )
192
+
193
def _validate_styles(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report inline style declarations that are not allowed or invalid.

    Only elements carrying a ``style`` attribute are inspected.

    Args:
        soup: Parsed document.
        result: Collector that receives errors.
        strict_mode: When True, a disallowed style is reported and its
            value is not validated; when False the value check still runs.
    """
    engine = self.rule_engine

    for node in soup.find_all(style=True):
        if not isinstance(node, Tag):
            continue

        owner = node.name
        declarations = self._parse_style(node.get("style", ""))

        for prop, value in declarations.items():
            # Allow-list check.
            if not engine.is_style_allowed(prop, owner):
                result.add_error(
                    code="DISALLOWED_STYLE",
                    message=f"标签 <{owner}> 的样式 '{prop}' 不在允许列表中",
                    details={
                        "tag": owner,
                        "style": prop,
                        "value": value,
                        "allowed_styles_for_tag": self._get_allowed_styles_for_tag(
                            owner
                        ),
                    },
                )
                if strict_mode:
                    continue

            # Value check; needs both a rule and a non-empty value.
            rule = engine.get_style_rule(prop)
            if not (rule and value):
                continue

            accepted, reason = rule.validate_value(value)
            if accepted:
                continue

            result.add_error(
                code="INVALID_STYLE_VALUE",
                message=f"标签 <{owner}> 的样式 '{prop}' 值 '{value}' 无效: {reason}",
                details={
                    "tag": owner,
                    "style": prop,
                    "value": value,
                    "error": reason,
                },
            )
238
+
239
def _validate_nesting(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Report parent/child combinations the rules do not permit.

    Each element is checked twice: from the child's side
    (``can_be_child_of``) and from the parent's side (``can_contain``).

    Args:
        soup: Parsed document.
        result: Collector that receives errors.
        strict_mode: When True, a failed child-side check skips the
            parent-side check for that element.
    """
    engine = self.rule_engine

    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        child_name = node.name
        parent = node.parent

        # NOTE(review): for top-level elements the parent is presumably the
        # BeautifulSoup document object itself - confirm how the rules
        # treat its name.
        if not (parent and isinstance(parent, Tag)):
            continue

        parent_name = parent.name
        child_rule = engine.get_tag_rule(child_name)
        parent_rule = engine.get_tag_rule(parent_name)

        # Child-side check: may this tag live under that parent?
        if child_rule and not child_rule.can_be_child_of(parent_name):
            if child_rule.allowed_parents:
                parents_hint = list(child_rule.allowed_parents)
            else:
                parents_hint = ["任意"]
            result.add_error(
                code="INVALID_NESTING",
                message=f"标签 <{child_name}> 不能作为 <{parent_name}> 的子元素",
                details={
                    "child_tag": child_name,
                    "parent_tag": parent_name,
                    "allowed_parents": parents_hint,
                },
            )
            if strict_mode:
                continue

        # Parent-side check: may that parent hold this tag?
        if parent_rule and not parent_rule.can_contain(child_name):
            if parent_rule.allowed_children:
                children_hint = list(parent_rule.allowed_children)
            else:
                children_hint = ["任意"]
            result.add_error(
                code="INVALID_CONTAINMENT",
                message=f"标签 <{parent_name}> 不能包含 <{child_name}>",
                details={
                    "parent_tag": parent_name,
                    "child_tag": child_name,
                    "allowed_children": children_hint,
                },
            )
289
+
290
def _validate_self_closing(
    self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
):
    """Warn when a tag that must be self-closing carries content.

    Whitespace-only text children do not count as content.

    Args:
        soup: Parsed document.
        result: Collector that receives warnings.
        strict_mode: Accepted for signature parity with the other checks;
            not used by this check.
    """
    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue

        name = node.name
        rule = self.rule_engine.get_tag_rule(name)
        if not (rule and rule.must_be_self_closing):
            continue

        # Content means any child that is not a text node, or a text node
        # with something other than whitespace in it.
        carries_content = any(
            not isinstance(child, NavigableString) or child.strip()
            for child in node.contents
        )
        if carries_content:
            result.add_warning(
                code="SELF_CLOSING_WARNING",
                message=f"标签 <{name}> 应该是自闭合标签,但包含内容",
                details={"tag": name, "content": str(node)[:100]},
            )
312
+
313
def _collect_stats(self, soup: BeautifulSoup, result: ValidationResult):
    """Attach summary statistics to ``result.details``.

    The statistics are per-tag counts, total tag count, length of the
    document text, and the number of errors and warnings recorded so far.
    Any previous value of ``result.details`` is replaced.

    Args:
        soup: Parsed document.
        result: Validation result to annotate.
    """
    # Tally tags by name, keeping first-seen order.
    per_tag: Dict[str, int] = {}
    for node in soup.find_all(True):
        if not isinstance(node, Tag):
            continue
        per_tag.setdefault(node.name, 0)
        per_tag[node.name] += 1

    result.details = {
        "tag_counts": per_tag,
        "total_tags": sum(per_tag.values()),
        "text_length": len(soup.get_text()),
        "total_errors": len(result.errors),
        "total_warnings": len(result.warnings),
    }
331
+
332
+ def _parse_style(self, style_str: str) -> Dict[str, str]:
333
+ """解析样式字符串"""
334
+ styles = {}
335
+ for item in style_str.split(";"):
336
+ if ":" in item:
337
+ key, value = item.split(":", 1)
338
+ styles[key.strip()] = value.strip()
339
+ return styles
340
+
341
+ def _get_allowed_attrs_for_tag(self, tag_name: str) -> List[str]:
342
+ """获取标签允许的属性列表"""
343
+ allowed = []
344
+ for attr_name, attr_rule in self.rule_engine.attr_rules.items():
345
+ if attr_rule.allowed and attr_rule.is_applicable_to(tag_name):
346
+ allowed.append(attr_name)
347
+ return allowed
348
+
349
+ def _get_allowed_styles_for_tag(self, tag_name: str) -> List[str]:
350
+ """获取标签允许的样式列表"""
351
+ allowed = []
352
+ for style_name, style_rule in self.rule_engine.style_rules.items():
353
+ if style_rule.allowed and style_rule.is_applicable_to(tag_name):
354
+ allowed.append(style_name)
355
+ return allowed
356
+
357
+ def get_validation_report(self, result: ValidationResult) -> str:
358
+ """生成验证报告"""
359
+ report = []
360
+
361
+ report.append("=== HTML 验证报告 ===\n")
362
+
363
+ # 验证结果
364
+ status = "✅ 通过" if result.is_valid else "❌ 失败"
365
+ report.append(f"验证状态: {status}")
366
+ report.append(f"错误数: {len(result.errors)}")
367
+ report.append(f"警告数: {len(result.warnings)}\n")
368
+
369
+ # 错误
370
+ if result.errors:
371
+ report.append("❌ 错误:")
372
+ for i, error in enumerate(result.errors, 1):
373
+ report.append(f"\n{i}. [{error['code']}] {error['message']}")
374
+ if error.get("details"):
375
+ report.append(f" 详情: {error['details']}")
376
+ report.append("")
377
+
378
+ # 警告
379
+ if result.warnings:
380
+ report.append("⚠️ 警告:")
381
+ for i, warning in enumerate(result.warnings, 1):
382
+ report.append(f"\n{i}. [{warning['code']}] {warning['message']}")
383
+ if warning.get("details"):
384
+ report.append(f" 详情: {warning['details']}")
385
+ report.append("")
386
+
387
+ # 统计信息
388
+ if hasattr(result, "details") and result.details:
389
+ report.append("📊 统计信息:")
390
+ for key, value in result.details.items():
391
+ report.append(f" - {key}: {value}")
392
+ report.append("")
393
+
394
+ if not result.errors and not result.warnings:
395
+ report.append("🎉 完美!没有发现任何问题。")
396
+
397
+ return "\n".join(report)
398
+
399
+
400
class ValidationError(Exception):
    """Exception wrapping a failed validation result.

    The originating result is kept on ``self.result``; the exception message
    summarises how many errors and warnings it contains.
    """

    def __init__(self, result: ValidationResult):
        self.result = result
        error_total = len(result.errors)
        warning_total = len(result.warnings)
        super().__init__(
            f"HTML验证失败: {error_total} 个错误, {warning_total} 个警告"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return whatever ``result.to_json()`` produces for the wrapped result."""
        payload = self.result.to_json()
        return payload
413
+
414
+
415
# Cached validator instance; stays None until get_strict_validator() is
# first called.
_strict_validator_instance = None


def get_strict_validator():
    """Return the shared StrictHTMLValidator, creating it on first use."""
    global _strict_validator_instance
    if _strict_validator_instance is not None:
        return _strict_validator_instance
    _strict_validator_instance = StrictHTMLValidator()
    return _strict_validator_instance
425
+
426
+
427
# Backward-compatible module-level instance. This line runs at import time,
# so the shared validator is created as soon as the module is imported.
strict_validator = get_strict_validator()