@birthday8/doc-mcp 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ """
2
+ 严格HTML验证器
3
+ 基于规则引擎的严格验证,提供结构化JSON输出
4
+ """
5
+
6
+ from bs4 import BeautifulSoup, Tag, NavigableString
7
+ from typing import Dict, List, Set, Optional, Any
8
+ from html_rules import RuleEngine, ValidationResult
9
+
10
+
11
class StrictHTMLValidator:
    """Strict, rule-engine-driven HTML validator.

    Parses a document with BeautifulSoup's ``html.parser`` and runs a fixed
    sequence of checks (structure, tags, attributes, class names, inline
    styles, nesting, self-closing tags, ``<style>`` tags). Findings are
    recorded on a ``ValidationResult`` through ``add_error`` / ``add_warning``.
    """

    # Class names accepted in ``class`` attributes; any other class name is
    # reported as UNSUPPORTED_CLASS.
    SUPPORTED_CLASSES = {
        "center",
        "right",
        "left",
        "info",
        "warning",
        "success",
        "columns",
    }

    def __init__(self, rule_engine: Optional["RuleEngine"] = None):
        """Create a validator.

        Args:
            rule_engine: Rule engine to consult. A default ``RuleEngine()`` is
                created when omitted.
        """
        # Compare against None explicitly so a caller-supplied engine is never
        # replaced just because it happens to evaluate as falsy.
        self.rule_engine = RuleEngine() if rule_engine is None else rule_engine

    def validate(
        self, html_content: str, strict_mode: bool = True
    ) -> "ValidationResult":
        """Validate an HTML document.

        Args:
            html_content: HTML source text.
            strict_mode: Forwarded to every check. NOTE: all checks always
                run; strict mode does not abort validation at the first
                error. Its only effects are: the structure check returns
                early after a missing ``<html>`` or ``<body>`` error, the tag
                check skips the required-attribute test for a disallowed tag,
                and the style check skips value-format tests for a disallowed
                property.

        Returns:
            The populated ``ValidationResult``. Any exception raised while
            parsing or checking is reported as a ``PARSING_ERROR`` entry
            instead of propagating.
        """
        result = ValidationResult(is_valid=True)

        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # The order is significant only for the order of reported findings.
            checks = (
                self._validate_structure,
                self._validate_tags,
                self._validate_attributes,
                self._validate_classes,
                self._validate_styles,
                self._validate_nesting,
                self._validate_self_closing,
                self._validate_style_tags,
            )
            for check in checks:
                check(soup, result, strict_mode)

        except Exception as e:
            # Top-level boundary: surface any failure as a structured error.
            result.add_error(
                code="PARSING_ERROR",
                message=f"HTML解析错误: {str(e)}",
                details={"exception": str(e)},
            )

        return result

    def _validate_structure(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Check for ``<html>``, ``<head>``, ``<body>``, charset and language.

        Missing ``<html>`` / ``<body>`` are errors (and end this check early in
        strict mode); a missing ``<head>``, a non-UTF-8 charset declaration and
        a ``lang`` other than ``zh-CN`` are warnings.
        """
        html_tag = soup.find("html")
        if html_tag is None:
            result.add_error(
                code="MISSING_HTML_TAG",
                message="缺少 <html> 根标签",
                details={"tag": "html"},
            )
            if strict_mode:
                return

        if soup.find("head") is None:
            result.add_warning(code="MISSING_HEAD_TAG", message="缺少 <head> 标签")

        if soup.find("body") is None:
            result.add_error(
                code="MISSING_BODY_TAG",
                message="缺少 <body> 标签",
                details={"tag": "body"},
            )
            if strict_mode:
                return

        # Charset: the HTML standard treats the value as ASCII
        # case-insensitive, so "utf-8" and "UTF-8" are both accepted.
        charset = soup.find("meta", {"charset": True})
        declared = charset.get("charset") if charset is not None else None
        if declared is None or str(declared).strip().lower() != "utf-8":
            result.add_warning(
                code="CHARSET_WARNING",
                message="建议使用 UTF-8 编码",
                details={"current": declared if declared is not None else "None"},
            )

        # Language
        if html_tag is not None and html_tag.get("lang") != "zh-CN":
            result.add_warning(
                code="LANG_WARNING",
                message="建议设置 lang='zh-CN'",
                details={"current": html_tag.get("lang", "None")},
            )

    def _validate_tags(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Report tags the rule engine disallows and missing required attributes."""
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name

            # Is the tag itself allowed?
            if not self.rule_engine.is_tag_allowed(tag_name):
                result.add_error(
                    code="DISALLOWED_TAG",
                    message=f"标签 <{tag_name}> 不在允许列表中",
                    details={
                        "tag": tag_name,
                        "position": str(element)[:100],
                        "allowed_tags": list(self.rule_engine.tag_rules.keys()),
                    },
                )
                if strict_mode:
                    # Strict mode: do not also check required attributes on a
                    # tag that is already rejected.
                    continue

            # Required attributes
            tag_rule = self.rule_engine.get_tag_rule(tag_name)
            if tag_rule and tag_rule.required_attrs:
                for required_attr in tag_rule.required_attrs:
                    if required_attr not in element.attrs:
                        result.add_error(
                            code="MISSING_REQUIRED_ATTR",
                            message=f"标签 <{tag_name}> 缺少必需属性 '{required_attr}'",
                            details={
                                "tag": tag_name,
                                "missing_attr": required_attr,
                                "required_attrs": list(tag_rule.required_attrs),
                            },
                        )

    def _validate_attributes(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Report attributes the rule engine disallows for their tag.

        Tags without a rule are skipped here. ``strict_mode`` is accepted for
        signature uniformity with the other checks and has no effect.
        """
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name
            if not self.rule_engine.get_tag_rule(tag_name):
                continue

            for attr_name, attr_value in element.attrs.items():
                if self.rule_engine.is_attribute_allowed(attr_name, tag_name):
                    continue
                result.add_error(
                    code="DISALLOWED_ATTR",
                    message=f"标签 <{tag_name}> 的属性 '{attr_name}' 不在允许列表中",
                    details={
                        "tag": tag_name,
                        "attribute": attr_name,
                        "value": str(attr_value)[:100],
                    },
                )

    def _validate_classes(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Report class names that are not in ``SUPPORTED_CLASSES``.

        One error is added per element, listing all of its unsupported
        classes. ``strict_mode`` is accepted for signature uniformity and has
        no effect.
        """
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            class_attr = element.get("class")
            if not class_attr:
                continue

            # The attribute may arrive as a list or as a whitespace-separated
            # string; handle both.
            classes = class_attr if isinstance(class_attr, list) else class_attr.split()

            unsupported_classes = [
                cls for cls in classes if cls not in self.SUPPORTED_CLASSES
            ]
            if not unsupported_classes:
                continue

            result.add_error(
                code="UNSUPPORTED_CLASS",
                message=f"类名 '{', '.join(unsupported_classes)}' 不在支持列表中,请使用内联 style 属性代替",
                details={
                    "unsupported_classes": unsupported_classes,
                    # Sorted so the structured output is stable across runs
                    # (set iteration order is not).
                    "supported_classes": sorted(self.SUPPORTED_CLASSES),
                    "element": element.name,
                    "note": "只支持:center, right, left, info, warning, success, columns",
                },
            )

    def _validate_styles(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Check inline ``style`` declarations against the rule engine.

        Each property must be allowed for its element; in addition colors must
        satisfy ``rule_engine.is_valid_color``, ``font-size`` and
        ``margin-top`` / ``margin-bottom`` must end in ``pt``, and
        ``line-height`` must parse as a float.
        """
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            style_attr = element.get("style")
            if not style_attr:
                continue

            styles = self._parse_style_string(style_attr)

            for prop_name, prop_value in styles.items():
                if not self.rule_engine.is_style_allowed(prop_name, element.name):
                    result.add_error(
                        code="DISALLOWED_STYLE",
                        message=f"样式属性 '{prop_name}' 不在允许列表中",
                        details={
                            "style": prop_name,
                            "value": prop_value,
                            "element": element.name,
                        },
                    )
                    if strict_mode:
                        # Strict mode: skip value-format checks for a property
                        # that is already rejected.
                        continue

                # Color format
                if prop_name in ("color", "background-color"):
                    if not self.rule_engine.is_valid_color(prop_value):
                        result.add_error(
                            code="INVALID_COLOR_FORMAT",
                            message="颜色格式错误,必须使用十六进制格式 #RRGGBB",
                            details={
                                "style": prop_name,
                                "value": prop_value,
                                "correct_format": "#RRGGBB",
                            },
                        )

                # Font size unit
                if prop_name == "font-size":
                    if not prop_value.endswith("pt"):
                        result.add_error(
                            code="INVALID_FONT_SIZE_FORMAT",
                            message="字号格式错误,必须使用 pt 单位",
                            details={
                                "style": prop_name,
                                "value": prop_value,
                                "correct_format": "14pt, 16pt, 18pt",
                            },
                        )

                # Line height must be a plain number
                if prop_name == "line-height":
                    try:
                        float(prop_value)
                    except ValueError:
                        result.add_error(
                            code="INVALID_LINE_HEIGHT_FORMAT",
                            message="行距格式错误,必须是数字或小数",
                            details={
                                "style": prop_name,
                                "value": prop_value,
                                "correct_format": "1.5, 1.8, 2.0",
                            },
                        )

                # Margin unit
                if prop_name in ("margin-top", "margin-bottom"):
                    if not prop_value.endswith("pt"):
                        result.add_error(
                            code="INVALID_MARGIN_FORMAT",
                            message="边距格式错误,必须使用 pt 单位",
                            details={
                                "style": prop_name,
                                "value": prop_value,
                                "correct_format": "10pt, 12pt, 15pt",
                            },
                        )

    def _validate_nesting(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Report direct parent/child tag pairs the rule engine disallows.

        ``strict_mode`` is accepted for signature uniformity and has no effect.
        """
        for parent in soup.find_all(True):
            if not isinstance(parent, Tag):
                continue

            # Only direct children are checked against their parent.
            for child in parent.find_all(True, recursive=False):
                if not isinstance(child, Tag):
                    continue
                if self.rule_engine.is_nesting_allowed(parent.name, child.name):
                    continue

                result.add_error(
                    code="INVALID_NESTING",
                    message=f"标签 <{child.name}> 不能嵌套在 <{parent.name}> 中",
                    details={
                        "parent": parent.name,
                        "child": child.name,
                        "context": str(parent)[:100],
                    },
                )

    def _validate_self_closing(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Warn when a self-closing tag contains non-whitespace content."""
        for element in soup.find_all(True):
            if not isinstance(element, Tag):
                continue

            tag_name = element.name
            if not self.rule_engine.is_self_closing_tag(tag_name):
                continue

            # Any child that is not a whitespace-only text node counts as
            # content; an element with no children never triggers the warning.
            has_content = any(
                not (isinstance(c, NavigableString) and not str(c).strip())
                for c in element.contents
            )
            if has_content:
                result.add_warning(
                    code="NON_EMPTY_SELF_CLOSING_TAG",
                    message=f"标签 <{tag_name}> 应该是自闭合的,但包含内容",
                    details={"tag": tag_name},
                )

    def _validate_style_tags(
        self, soup: "BeautifulSoup", result: "ValidationResult", strict_mode: bool
    ):
        """Add a single warning if the document contains any ``<style>`` tag."""
        style_tags = soup.find_all("style")
        if style_tags:
            result.add_warning(
                code="STYLE_TAG_DETECTED",
                message="检测到 <style> 标签,其中的 CSS 规则不会被转换。请使用内联 style 属性代替。",
                details={
                    "count": len(style_tags),
                    "note": '例如:将 <style>h1 { font-size: 18pt; }</style> 改为 <h1 style="font-size: 18pt;">标题</h1>',
                },
            )

    def _parse_style_string(self, style_str: str) -> Dict[str, str]:
        """Parse an inline ``style`` attribute value into a dict.

        Declarations are separated by ``;`` and split on the first ``:`` only,
        so values that themselves contain ``:`` stay intact. Empty fragments
        and fragments without ``:`` are ignored; a later duplicate property
        overwrites an earlier one.

        Args:
            style_str: Raw ``style`` attribute text (may be empty).

        Returns:
            Mapping of stripped property name to stripped value.
        """
        styles: Dict[str, str] = {}
        if not style_str:
            return styles

        for item in style_str.split(";"):
            item = item.strip()
            if not item or ":" not in item:
                continue

            key, value = item.split(":", 1)
            styles[key.strip()] = value.strip()

        return styles

    def get_validation_report(self, result: "ValidationResult") -> str:
        """Render a validation result as a human-readable text report.

        Args:
            result: Object exposing ``is_valid``, ``errors`` and ``warnings``;
                each error/warning is read as a mapping with ``code``,
                ``message`` and optional ``details``.

        Returns:
            The report as a single newline-joined string.
        """
        report = []

        report.append("=== HTML 验证报告 ===\n")

        # Overall status
        status = "✅ 通过" if result.is_valid else "❌ 失败"
        report.append(f"验证状态: {status}\n")

        # Counts
        report.append("📊 统计信息:")
        report.append(f" - 错误数量: {len(result.errors)}")
        report.append(f" - 警告数量: {len(result.warnings)}")
        report.append("")

        # Errors
        if result.errors:
            report.append("❌ 错误:")
            for i, error in enumerate(result.errors, 1):
                report.append(f" {i}. [{error['code']}] {error['message']}")
                if error.get("details"):
                    report.append(f" 详情: {error['details']}")
            report.append("")

        # Warnings
        if result.warnings:
            report.append("⚠️ 警告:")
            for i, warning in enumerate(result.warnings, 1):
                report.append(f" {i}. [{warning['code']}] {warning['message']}")
                if warning.get("details"):
                    report.append(f" 详情: {warning['details']}")
            report.append("")

        if not result.errors and not result.warnings:
            report.append("🎉 完美!没有发现任何问题。")

        return "\n".join(report)
427
+
428
+
429
+ # Module-level shared validator instance, constructed with no arguments (so it uses a default RuleEngine).
430
+ strict_validator = StrictHTMLValidator()
@@ -2,21 +2,7 @@
2
2
  <html lang="zh-CN">
3
3
  <head>
4
4
  <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
5
  <title>MCP复杂格式测试</title>
7
- <style>
8
- body {
9
- font-family: '微软雅黑';
10
- font-size: 12pt;
11
- line-height: 1.8;
12
- padding: 20px;
13
- max-width: 800px;
14
- margin: 0 auto;
15
- }
16
- h1 { font-family: '微软雅黑'; font-size: 18pt; text-align: center; color: #333; }
17
- h2 { font-family: '微软雅黑'; font-size: 16pt; color: #4a3f6b; border-bottom: 2px solid #667eea; padding-bottom: 10px; }
18
- h3 { font-family: '微软雅黑'; font-size: 14pt; color: #5b4e8c; }
19
- </style>
20
6
  </head>
21
7
  <body>
22
8
  <h1>MCP复杂格式测试文档</h1>