@birthday8/doc-mcp 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,101 +6,19 @@ HTML格式验证器
6
6
  import re
7
7
  from bs4 import BeautifulSoup
8
8
  from typing import Dict, List, Tuple
9
+ from html_validator_strict import StrictHTMLValidator, strict_validator
9
10
 
10
11
 
11
12
  class HTMLValidator:
12
- """HTML格式验证器"""
13
+ """HTML格式验证器(兼容旧接口)"""
13
14
 
14
15
  def __init__(self):
15
- # 支持的标签
16
- self.supported_tags = {
17
- "h1",
18
- "h2",
19
- "h3",
20
- "h4",
21
- "h5",
22
- "h6",
23
- "p",
24
- "strong",
25
- "em",
26
- "u",
27
- "s",
28
- "sup",
29
- "sub",
30
- "code",
31
- "span",
32
- "div",
33
- "ul",
34
- "ol",
35
- "li",
36
- "table",
37
- "tr",
38
- "td",
39
- "th",
40
- "img",
41
- "br",
42
- "hr",
43
- "math",
44
- "latex",
45
- }
46
-
47
- # 支持的class
48
- self.supported_classes = {
49
- "center",
50
- "right",
51
- "dialogue",
52
- "quote",
53
- "highlight",
54
- "red",
55
- "blue",
56
- "green",
57
- "purple",
58
- "info",
59
- "warning",
60
- "success",
61
- "chapter",
62
- "ending",
63
- "page-break",
64
- "columns",
65
- }
66
-
67
- # 支持的style属性
68
- self.supported_styles = {
69
- "color",
70
- "background-color",
71
- "font-family",
72
- "font-size",
73
- "font-weight",
74
- "font-style",
75
- "text-decoration",
76
- "text-align",
77
- "line-height",
78
- "margin-top",
79
- "margin-bottom",
80
- "margin-left",
81
- "margin-right",
82
- "padding-top",
83
- "padding-bottom",
84
- "padding-left",
85
- "padding-right",
86
- }
87
-
88
- # 支持的属性
89
- self.supported_attrs = {
90
- "src",
91
- "alt",
92
- "width",
93
- "height",
94
- "align",
95
- "data-indent",
96
- "data-cols",
97
- "colspan",
98
- "rowspan",
99
- }
16
+ # 使用严格验证器
17
+ self.strict_validator = strict_validator
100
18
 
101
19
  def validate(self, html_content: str) -> Dict:
102
20
  """
103
- 验证HTML内容
21
+ 验证HTML内容(兼容旧接口)
104
22
 
105
23
  Returns:
106
24
  Dict: {
@@ -110,158 +28,31 @@ class HTMLValidator:
110
28
  'stats': Dict
111
29
  }
112
30
  """
113
- result = {"valid": True, "errors": [], "warnings": [], "stats": {}}
114
-
115
- try:
116
- soup = BeautifulSoup(html_content, "html.parser")
117
-
118
- # 检查基本结构
119
- self._validate_structure(soup, result)
120
-
121
- # 检查标签
122
- self._validate_tags(soup, result)
123
-
124
- # 检查属性
125
- self._validate_attributes(soup, result)
126
-
127
- # 检查样式
128
- self._validate_styles(soup, result)
129
-
130
- # 检查嵌套
131
- self._validate_nesting(soup, result)
132
-
133
- # 统计信息
134
- self._collect_stats(soup, result)
135
-
136
- except Exception as e:
137
- result["valid"] = False
138
- result["errors"].append(f"解析错误: {str(e)}")
139
-
140
- return result
141
-
142
- def _validate_structure(self, soup: BeautifulSoup, result: Dict):
143
- """验证HTML基本结构"""
144
- html_tag = soup.find("html")
145
- if not html_tag:
146
- result["errors"].append("缺少 <html> 标签")
147
- result["valid"] = False
148
-
149
- head_tag = soup.find("head")
150
- if not head_tag:
151
- result["warnings"].append("缺少 <head> 标签")
152
-
153
- body_tag = soup.find("body")
154
- if not body_tag:
155
- result["errors"].append("缺少 <body> 标签")
156
- result["valid"] = False
157
-
158
- # 检查编码
159
- charset = soup.find("meta", {"charset": True})
160
- if not charset or charset.get("charset") != "UTF-8":
161
- result["warnings"].append("建议使用 UTF-8 编码")
162
-
163
- # 检查语言
164
- if html_tag and html_tag.get("lang") != "zh-CN":
165
- result["warnings"].append("建议设置 lang='zh-CN'")
166
-
167
- def _validate_tags(self, soup: BeautifulSoup, result: Dict):
168
- """验证标签"""
169
- all_tags = set(tag.name for tag in soup.find_all(True))
170
- unsupported = all_tags - self.supported_tags
171
-
172
- if unsupported:
173
- result["errors"].append(f"不支持的标签: {', '.join(unsupported)}")
174
- result["valid"] = False
175
-
176
- # 检查自闭合标签
177
- self_closing = ["img", "br", "hr"]
178
- for tag_name in self_closing:
179
- for tag in soup.find_all(tag_name):
180
- if str(tag).startswith(f"<{tag_name}>") and not str(tag).endswith("/>"):
181
- result["warnings"].append(f"标签 <{tag_name}> 建议自闭合")
182
-
183
- def _validate_attributes(self, soup: BeautifulSoup, result: Dict):
184
- """验证属性"""
185
- for tag in soup.find_all(True):
186
- for attr in tag.attrs:
187
- if attr in ["class", "style"]:
188
- continue
189
-
190
- if attr not in self.supported_attrs:
191
- result["warnings"].append(
192
- f"标签 <{tag.name}> 包含不支持的属性: {attr}"
193
- )
194
-
195
- def _validate_styles(self, soup: BeautifulSoup, result: Dict):
196
- """验证样式"""
197
- for tag in soup.find_all(style=True):
198
- style_str = tag.get("style", "")
199
- styles = self._parse_style(style_str)
200
-
201
- for style_name in styles:
202
- if style_name not in self.supported_styles:
203
- result["warnings"].append(f"不支持的样式: {style_name}")
204
-
205
- # 验证颜色格式
206
- if style_name in ["color", "background-color"]:
207
- color = styles[style_name]
208
- if color and not color.startswith("#"):
209
- result["warnings"].append(f"颜色格式建议使用十六进制: {color}")
210
- elif color and len(color) != 7:
211
- result["errors"].append(f"颜色格式错误: {color}")
212
- result["valid"] = False
213
-
214
- def _validate_nesting(self, soup: BeautifulSoup, result: Dict):
215
- """验证嵌套"""
216
- # 检查段落嵌套
217
- for p in soup.find_all("p"):
218
- children = p.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "div"])
219
- if children:
220
- result["errors"].append(f"<p> 标签不能包含块级元素")
221
- result["valid"] = False
222
-
223
- # 检查表格嵌套
224
- for table in soup.find_all("table"):
225
- direct_content = [
226
- c.name for c in table.children if c.name and c.name not in ["tr"]
227
- ]
228
- if direct_content:
229
- result["errors"].append(f"<table> 只能包含 <tr> 标签")
230
- result["valid"] = False
231
-
232
- # 检查列表嵌套
233
- for li in soup.find_all("li"):
234
- direct_lists = li.find_all(["ul", "ol"], recursive=False)
235
- if len(direct_lists) > 1:
236
- result["warnings"].append(f"<li> 建议只包含一个列表")
237
-
238
- def _collect_stats(self, soup: BeautifulSoup, result: Dict):
239
- """收集统计信息"""
240
- stats = {}
241
-
242
- # 标签统计
243
- for tag in self.supported_tags:
244
- count = len(soup.find_all(tag))
245
- if count > 0:
246
- stats[tag] = count
31
+ # 使用严格验证器
32
+ result = self.strict_validator.validate(html_content, strict_mode=False)
33
+
34
+ # 转换为旧格式
35
+ old_format = {
36
+ "valid": result.is_valid,
37
+ "errors": [e["message"] for e in result.errors],
38
+ "warnings": [w["message"] for w in result.warnings],
39
+ "stats": getattr(result, "details", {}),
40
+ }
247
41
 
248
- # 特殊统计
249
- stats["total_tags"] = len(soup.find_all(True))
250
- stats["text_length"] = len(soup.get_text())
42
+ return old_format
251
43
 
252
- result["stats"] = stats
44
+ def validate_strict(self, html_content: str) -> Dict:
45
+ """
46
+ 严格验证HTML内容(返回完整结构化结果)
253
47
 
254
- def _parse_style(self, style_str: str) -> Dict:
255
- """解析样式字符串"""
256
- styles = {}
257
- for item in style_str.split(";"):
258
- if ":" in item:
259
- key, value = item.split(":", 1)
260
- styles[key.strip()] = value.strip()
261
- return styles
48
+ Returns:
49
+ Dict: 完整的结构化验证结果
50
+ """
51
+ result = self.strict_validator.validate(html_content, strict_mode=False)
52
+ return result.to_json()
262
53
 
263
54
  def get_validation_report(self, result: Dict) -> str:
264
- """生成验证报告"""
55
+ """生成验证报告(兼容旧接口)"""
265
56
  report = []
266
57
 
267
58
  report.append("=== HTML 验证报告 ===\n")
@@ -271,27 +62,27 @@ class HTMLValidator:
271
62
  report.append(f"验证状态: {status}\n")
272
63
 
273
64
  # 统计信息
274
- if result["stats"]:
65
+ if result.get("stats"):
275
66
  report.append("📊 统计信息:")
276
67
  for key, value in result["stats"].items():
277
68
  report.append(f" - {key}: {value}")
278
69
  report.append("")
279
70
 
280
71
  # 错误
281
- if result["errors"]:
72
+ if result.get("errors"):
282
73
  report.append("❌ 错误:")
283
74
  for error in result["errors"]:
284
75
  report.append(f" - {error}")
285
76
  report.append("")
286
77
 
287
78
  # 警告
288
- if result["warnings"]:
79
+ if result.get("warnings"):
289
80
  report.append("⚠️ 警告:")
290
81
  for warning in result["warnings"]:
291
82
  report.append(f" - {warning}")
292
83
  report.append("")
293
84
 
294
- if not result["errors"] and not result["warnings"]:
85
+ if not result.get("errors") and not result.get("warnings"):
295
86
  report.append("🎉 完美!没有发现任何问题。")
296
87
 
297
88
  return "\n".join(report)
@@ -301,42 +92,32 @@ class HTMLTemplateGenerator:
301
92
  """HTML模板生成器"""
302
93
 
303
94
  def __init__(self):
304
- self.base_template = """<!DOCTYPE html>
305
- <html lang="zh-CN">
306
- <head>
307
- <meta charset="UTF-8">
308
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
309
- <title>{title}</title>
310
- <style>
311
- body {{
312
- font-family: '微软雅黑';
313
- font-size: 12pt;
314
- line-height: 1.8;
315
- padding: 20px;
316
- max-width: 800px;
317
- margin: 0 auto;
318
- }}
319
- h1 {{ font-family: '微软雅黑'; font-size: 18pt; text-align: center; color: #333; }}
320
- h2 {{ font-family: '微软雅黑'; font-size: 16pt; color: #4a3f6b; border-bottom: 2px solid #667eea; padding-bottom: 10px; }}
321
- h3 {{ font-family: '微软雅黑'; font-size: 14pt; color: #5b4e8c; }}
322
- p {{ margin: 6pt 0; }}
323
- .center {{ text-align: center; }}
324
- .right {{ text-align: right; }}
325
- .highlight {{ background-color: #FFFF00; }}
326
- .red {{ color: #FF0000; }}
327
- .blue {{ color: #0000FF; }}
328
- .green {{ color: #008000; }}
329
- .purple {{ color: #800080; }}
330
- </style>
331
- </head>
332
- <body>
333
- {content}
334
- </body>
335
- </html>"""
336
-
337
- def generate_template(self, title: str = "文档", content: str = "") -> str:
338
- """生成HTML模板"""
339
- return self.base_template.format(title=title, content=content)
95
+ # 加载schema
96
+ import os
97
+ import sys
98
+
99
+ schema_path = os.path.join(
100
+ os.path.dirname(__file__), "sample", "html_schema.py"
101
+ )
102
+ if os.path.exists(schema_path):
103
+ sys.path.insert(0, os.path.dirname(schema_path))
104
+ from html_schema import get_schema
105
+
106
+ self.schema = get_schema()
107
+ else:
108
+ self.schema = {}
109
+
110
+ # 加载example.html作为约束示例
111
+ example_path = os.path.join(os.path.dirname(__file__), "sample", "example.html")
112
+ if os.path.exists(example_path):
113
+ with open(example_path, "r", encoding="utf-8") as f:
114
+ self.constraint_example = f.read()
115
+ else:
116
+ self.constraint_example = ""
117
+
118
+ def get_constraint_example(self) -> str:
119
+ """获取HTML约束示例(从example.html加载)"""
120
+ return self.constraint_example
340
121
 
341
122
  def get_element_examples(self) -> Dict:
342
123
  """获取元素示例"""
@@ -383,6 +164,10 @@ class HTMLTemplateGenerator:
383
164
  "formula": "<math>E = mc^2</math>",
384
165
  }
385
166
 
167
+ def get_schema(self) -> dict:
168
+ """获取HTML格式约束schema"""
169
+ return self.schema
170
+
386
171
 
387
172
  # 全局实例
388
173
  validator = HTMLValidator()