@birthday8/doc-mcp 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ # -*- coding: utf-8 -*-
2
+ """HTML自动修复器 - 修正常见的格式错误"""
3
+
4
+ import re
5
+
6
+
7
class HTMLFixer:
    """Automatic HTML fixer that corrects common formatting mistakes.

    :meth:`fix` runs a fixed sequence of regex passes over the markup and
    records a human-readable description of every change in ``self.fixes``.
    The passes operate on the raw text, not on a parsed tree.
    """

    # Mapping from CSS color names to their hexadecimal equivalents.
    COLOR_MAP = {
        "red": "#FF0000",
        "blue": "#0000FF",
        "green": "#008000",
        "yellow": "#FFFF00",
        "orange": "#FFA500",
        "purple": "#800080",
        "pink": "#FFC0CB",
        "brown": "#A52A2A",
        "black": "#000000",
        "white": "#FFFFFF",
        "gray": "#808080",
        "grey": "#808080",
        "lightblue": "#ADD8E6",
        "lightgreen": "#90EE90",
        "lightyellow": "#FFFFE0",
        "lightgray": "#D3D3D3",
        "darkred": "#8B0000",
        "darkblue": "#00008B",
        "darkgreen": "#006400",
    }

    # Attribute section of a start tag. Quoted values are consumed whole so a
    # "/" or ">" inside them (e.g. src="a/b.png") does not end the match early.
    _TAG_ATTRS = r"""((?:[^>"']|"[^"]*"|'[^']*')*)"""

    def __init__(self):
        """Create a fixer with an empty fix log."""
        self.fixes = []

    def fix(self, html_content):
        """Apply every fix pass to ``html_content`` and return the result.

        The fix log (``self.fixes``) is reset at the start of each call.

        Args:
            html_content: HTML markup as a string.

        Returns:
            The rewritten markup.
        """
        self.fixes = []
        result = html_content

        # 1. Named colors -> hexadecimal.
        result = self._fix_colors(result)

        # 2. Void tags -> self-closing form.
        result = self._fix_self_closing(result)

        # 3. Strip attributes the converter does not support.
        result = self._fix_unsupported_attrs(result)

        # 4. rgb(r, g, b) -> hexadecimal.
        result = self._fix_rgb_colors(result)

        return result

    def _fix_colors(self, html):
        """Replace named colors in ``color``/``background`` declarations with hex."""
        for color_name, hex_value in self.COLOR_MAP.items():
            name = re.escape(color_name)

            # Background declarations are handled first: the plain "color:"
            # pattern below also matches the tail of "background-color:", which
            # would otherwise log the change under the wrong label.
            pattern = rf"background(?:-color)?:\s*{name}\b"
            replacement = f"background-color: {hex_value}"
            html, count = re.subn(pattern, replacement, html, flags=re.IGNORECASE)
            if count > 0:
                self.fixes.append(f"修复背景色: {color_name} → {hex_value} ({count}次)")

            # Remaining "...color: <name>" declarations (color, border-color, ...).
            pattern = rf"color:\s*{name}\b"
            replacement = f"color: {hex_value}"
            html, count = re.subn(pattern, replacement, html, flags=re.IGNORECASE)
            if count > 0:
                self.fixes.append(f"修复颜色: {color_name} → {hex_value} ({count}次)")

        return html

    @staticmethod
    def _is_self_closed(match):
        """Return True when the matched start tag already ends with ``/``."""
        return match.group(2).rstrip().endswith("/")

    @classmethod
    def _close_tag(cls, match):
        """Return the matched start tag rewritten as ``<tag ... />``."""
        if cls._is_self_closed(match):
            return match.group(0)
        return f"<{match.group(1)}{match.group(2)} />"

    def _fix_self_closing(self, html):
        """Rewrite ``<img>``, ``<br>`` and ``<hr>`` start tags as self-closing."""
        for tag in ("img", "br", "hr"):
            # \b keeps the tag name exact; matching is case-insensitive like
            # the other passes in this class.
            pattern = re.compile(rf"<({tag})\b{self._TAG_ATTRS}>", re.IGNORECASE)
            count = sum(
                1 for match in pattern.finditer(html) if not self._is_self_closed(match)
            )
            if count > 0:
                html = pattern.sub(self._close_tag, html)
                self.fixes.append(f"修复自闭合标签: <{tag}> → <{tag} /> ({count}次)")

        return html

    def _fix_unsupported_attrs(self, html):
        """Remove quoted occurrences of attributes the converter does not support."""
        unsupported_attrs = ["onclick", "onload", "onerror", "loading", "crossorigin"]

        for attr in unsupported_attrs:
            # The lookbehind stops "data-onclick" from matching "onclick"; the
            # value must open and close with the SAME quote character so that
            # onclick="alert('x')" is removed as a whole.
            pattern = rf"""\s*(?<![\w-]){attr}\s*=\s*(?:"[^"]*"|'[^']*')"""
            html, count = re.subn(pattern, "", html, flags=re.IGNORECASE)
            if count > 0:
                self.fixes.append(f"移除不支持属性: {attr} ({count}次)")

        return html

    def _fix_rgb_colors(self, html):
        """Convert ``rgb(r, g, b)`` color functions to ``#RRGGBB``."""
        pattern = r"rgb\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"

        def rgb_to_hex(match):
            # Components above 255 are clamped; otherwise the formatted value
            # would have more than two hex digits per channel.
            r, g, b = (min(int(value), 255) for value in match.groups())
            return f"#{r:02X}{g:02X}{b:02X}"

        html, count = re.subn(pattern, rgb_to_hex, html, flags=re.IGNORECASE)
        if count > 0:
            self.fixes.append(f"修复RGB颜色: rgb() → #RRGGBB ({count}次)")

        return html

    def get_fixes_report(self):
        """Return the fix log as a bulleted string, or a "nothing to fix" note."""
        if not self.fixes:
            return "✓ 无需修复"
        return "\n".join(f" • {fix}" for fix in self.fixes)
@@ -0,0 +1,389 @@
1
+ """
2
+ HTML格式验证器
3
+ 用于验证HTML是否符合转换规范
4
+ """
5
+
6
+ import re
7
+ from bs4 import BeautifulSoup
8
+ from typing import Dict, List, Tuple
9
+
10
+
11
class HTMLValidator:
    """Validate HTML against the subset of markup the converter supports.

    The instance holds whitelists of tags, classes, inline style properties
    and attributes; :meth:`validate` parses a document and checks it against
    them, collecting errors, warnings and simple statistics.
    """

    # Strict "#RRGGBB" form expected for color values.
    _HEX_COLOR_RE = re.compile(r"#[0-9A-Fa-f]{6}")

    def __init__(self):
        """Initialise the whitelists used by the validation passes."""
        # Supported content tags.
        self.supported_tags = {
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "p",
            "strong",
            "em",
            "u",
            "s",
            "sup",
            "sub",
            "code",
            "span",
            "div",
            "ul",
            "ol",
            "li",
            "table",
            "tr",
            "td",
            "th",
            "img",
            "br",
            "hr",
            "math",
            "latex",
        }

        # Document-skeleton tags. The structure check requires or recommends
        # html/head/body/meta, and this file's own template also emits title
        # and style, so none of them may be reported as unsupported content.
        self.structural_tags = {"html", "head", "body", "meta", "title", "style"}

        # Supported class names.
        self.supported_classes = {
            "center",
            "right",
            "dialogue",
            "quote",
            "highlight",
            "red",
            "blue",
            "green",
            "purple",
            "info",
            "warning",
            "success",
            "chapter",
            "ending",
            "page-break",
            "columns",
        }

        # Supported inline style properties.
        self.supported_styles = {
            "color",
            "background-color",
            "font-family",
            "font-size",
            "font-weight",
            "font-style",
            "text-decoration",
            "text-align",
            "line-height",
            "margin-top",
            "margin-bottom",
            "margin-left",
            "margin-right",
            "padding-top",
            "padding-bottom",
            "padding-left",
            "padding-right",
        }

        # Supported attributes (besides class and style).
        self.supported_attrs = {
            "src",
            "alt",
            "width",
            "height",
            "align",
            "data-indent",
            "data-cols",
            "colspan",
            "rowspan",
        }

    def validate(self, html_content: str) -> Dict:
        """Validate HTML content.

        Args:
            html_content: The HTML document as a string.

        Returns:
            Dict: {
                'valid': bool,
                'errors': List[str],
                'warnings': List[str],
                'stats': Dict
            }
            Any exception raised while parsing or checking is caught and
            reported as an error entry with ``valid`` set to False.
        """
        result = {"valid": True, "errors": [], "warnings": [], "stats": {}}

        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Basic document structure.
            self._validate_structure(soup, result)

            # Tags.
            self._validate_tags(soup, result)

            # Attributes.
            self._validate_attributes(soup, result)

            # Inline styles.
            self._validate_styles(soup, result)

            # Nesting rules.
            self._validate_nesting(soup, result)

            # Statistics.
            self._collect_stats(soup, result)

        except Exception as e:
            # Top-level boundary: report the failure instead of raising.
            result["valid"] = False
            result["errors"].append(f"解析错误: {str(e)}")

        return result

    def _validate_structure(self, soup: "BeautifulSoup", result: Dict):
        """Check for html/head/body tags, the charset declaration and ``lang``."""
        html_tag = soup.find("html")
        if not html_tag:
            result["errors"].append("缺少 <html> 标签")
            result["valid"] = False

        head_tag = soup.find("head")
        if not head_tag:
            result["warnings"].append("缺少 <head> 标签")

        body_tag = soup.find("body")
        if not body_tag:
            result["errors"].append("缺少 <body> 标签")
            result["valid"] = False

        # Encoding: charset names are case-insensitive, so "utf-8" is accepted.
        charset = soup.find("meta", {"charset": True})
        declared = charset.get("charset") if charset else None
        if not declared or str(declared).strip().upper() != "UTF-8":
            result["warnings"].append("建议使用 UTF-8 编码")

        # Language.
        if html_tag and html_tag.get("lang") != "zh-CN":
            result["warnings"].append("建议设置 lang='zh-CN'")

    def _validate_tags(self, soup: "BeautifulSoup", result: Dict):
        """Report tags outside the supported and structural sets."""
        all_tags = {tag.name for tag in soup.find_all(True)}
        unsupported = all_tags - self.supported_tags - self.structural_tags

        if unsupported:
            # Sorted so the message is stable across runs.
            result["errors"].append(f"不支持的标签: {', '.join(sorted(unsupported))}")
            result["valid"] = False

        # Self-closing check.
        # NOTE(review): this inspects the re-serialized tag, not the original
        # source text; confirm the serializer can actually emit "<tag>" without
        # a trailing slash, otherwise this warning never fires.
        self_closing = ["img", "br", "hr"]
        for tag_name in self_closing:
            for tag in soup.find_all(tag_name):
                rendered = str(tag)
                if rendered.startswith(f"<{tag_name}>") and not rendered.endswith("/>"):
                    result["warnings"].append(f"标签 <{tag_name}> 建议自闭合")

    def _validate_attributes(self, soup: "BeautifulSoup", result: Dict):
        """Warn about attributes that are not in ``supported_attrs``."""
        for tag in soup.find_all(True):
            # Skeleton tags carry attributes the structure check itself asks
            # for (lang, charset, ...); do not warn about those.
            if tag.name in self.structural_tags:
                continue

            for attr in tag.attrs:
                if attr in ("class", "style"):
                    continue

                if attr not in self.supported_attrs:
                    result["warnings"].append(
                        f"标签 <{tag.name}> 包含不支持的属性: {attr}"
                    )

    def _validate_styles(self, soup: "BeautifulSoup", result: Dict):
        """Check inline ``style`` attributes for unsupported properties and colors."""
        for tag in soup.find_all(style=True):
            style_str = tag.get("style", "")
            styles = self._parse_style(style_str)

            for style_name, value in styles.items():
                if style_name not in self.supported_styles:
                    result["warnings"].append(f"不支持的样式: {style_name}")

                # Color format.
                if style_name in ("color", "background-color") and value:
                    if not value.startswith("#"):
                        result["warnings"].append(f"颜色格式建议使用十六进制: {value}")
                    elif not self._HEX_COLOR_RE.fullmatch(value):
                        # Wrong length or non-hex digits after "#".
                        result["errors"].append(f"颜色格式错误: {value}")
                        result["valid"] = False

    def _validate_nesting(self, soup: "BeautifulSoup", result: Dict):
        """Check paragraph, table and list nesting rules."""
        # Paragraphs must not contain block-level elements.
        for p in soup.find_all("p"):
            children = p.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "div"])
            if children:
                result["errors"].append("<p> 标签不能包含块级元素")
                result["valid"] = False

        # Tables may only have <tr> as direct element children.
        for table in soup.find_all("table"):
            direct_content = [
                c.name for c in table.children if c.name and c.name not in ["tr"]
            ]
            if direct_content:
                result["errors"].append("<table> 只能包含 <tr> 标签")
                result["valid"] = False

        # List items should directly contain at most one nested list.
        for li in soup.find_all("li"):
            direct_lists = li.find_all(["ul", "ol"], recursive=False)
            if len(direct_lists) > 1:
                result["warnings"].append("<li> 建议只包含一个列表")

    def _collect_stats(self, soup: "BeautifulSoup", result: Dict):
        """Store per-tag counts, total tag count and text length in ``result``."""
        all_tags = soup.find_all(True)

        # Single pass over the tree instead of one search per supported tag.
        counts = {}
        for tag in all_tags:
            if tag.name in self.supported_tags:
                counts[tag.name] = counts.get(tag.name, 0) + 1

        # Sorted so the report order is stable across runs.
        stats = {name: counts[name] for name in sorted(counts)}

        # Aggregate figures.
        stats["total_tags"] = len(all_tags)
        stats["text_length"] = len(soup.get_text())

        result["stats"] = stats

    def _parse_style(self, style_str: str) -> Dict:
        """Parse a ``style`` attribute string into a property -> value dict.

        Property names are lower-cased because CSS property names are
        case-insensitive; declarations without a ``:`` are ignored.
        """
        styles = {}
        for item in style_str.split(";"):
            if ":" in item:
                key, value = item.split(":", 1)
                styles[key.strip().lower()] = value.strip()
        return styles

    def get_validation_report(self, result: Dict) -> str:
        """Render a result dict from :meth:`validate` as a text report."""
        report = []

        report.append("=== HTML 验证报告 ===\n")

        # Overall status.
        status = "✅ 通过" if result["valid"] else "❌ 失败"
        report.append(f"验证状态: {status}\n")

        # Statistics.
        if result["stats"]:
            report.append("📊 统计信息:")
            for key, value in result["stats"].items():
                report.append(f" - {key}: {value}")
            report.append("")

        # Errors.
        if result["errors"]:
            report.append("❌ 错误:")
            for error in result["errors"]:
                report.append(f" - {error}")
            report.append("")

        # Warnings.
        if result["warnings"]:
            report.append("⚠️ 警告:")
            for warning in result["warnings"]:
                report.append(f" - {warning}")
            report.append("")

        if not result["errors"] and not result["warnings"]:
            report.append("🎉 完美!没有发现任何问题。")

        return "\n".join(report)
298
+
299
+
300
class HTMLTemplateGenerator:
    """Generate an HTML document skeleton and example snippets."""

    def __init__(self):
        """Store the base document template.

        The template is a ``str.format`` string with ``{title}`` and
        ``{content}`` placeholders; literal CSS braces are doubled.
        """
        self.base_template = """<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<style>
body {{
font-family: '微软雅黑';
font-size: 12pt;
line-height: 1.8;
padding: 20px;
max-width: 800px;
margin: 0 auto;
}}
h1 {{ font-family: '微软雅黑'; font-size: 18pt; text-align: center; color: #333; }}
h2 {{ font-family: '微软雅黑'; font-size: 16pt; color: #4a3f6b; border-bottom: 2px solid #667eea; padding-bottom: 10px; }}
h3 {{ font-family: '微软雅黑'; font-size: 14pt; color: #5b4e8c; }}
p {{ margin: 6pt 0; }}
.center {{ text-align: center; }}
.right {{ text-align: right; }}
.highlight {{ background-color: #FFFF00; }}
.red {{ color: #FF0000; }}
.blue {{ color: #0000FF; }}
.green {{ color: #008000; }}
.purple {{ color: #800080; }}
</style>
</head>
<body>
{content}
</body>
</html>"""

    def generate_template(self, title: str = "文档", content: str = "") -> str:
        """Return the base template filled in with ``title`` and ``content``.

        Args:
            title: Plain-text document title. It is HTML-escaped so that
                characters such as ``&`` or ``<`` cannot break the markup.
            content: HTML fragment inserted into ``<body>`` unchanged.

        Returns:
            The complete HTML document as a string.
        """
        # Local import: only this method needs it.
        from html import escape

        return self.base_template.format(
            title=escape(title, quote=False), content=content
        )

    def get_element_examples(self) -> Dict:
        """Return example snippets for the supported elements, grouped by kind."""
        return {
            "headings": {
                "h1": "<h1>一级标题</h1>",
                "h2": "<h2>二级标题</h2>",
                "h3": "<h3>三级标题</h3>",
            },
            "text": {
                "bold": "<strong>加粗文本</strong>",
                "italic": "<em>斜体文本</em>",
                "underline": "<u>下划线文本</u>",
                "strikethrough": "<s>删除线文本</s>",
                "superscript": "X<sup>2</sup>",
                "subscript": "H<sub>2</sub>O",
            },
            "colors": {
                "red": '<span style="color: #FF0000;">红色文本</span>',
                "blue": '<span style="color: #0000FF;">蓝色文本</span>',
                "highlight": '<span class="highlight">高亮文本</span>',
            },
            "lists": {
                "unordered": """<ul>
<li>列表项1</li>
<li>列表项2</li>
</ul>""",
                "ordered": """<ol>
<li>有序项1</li>
<li>有序项2</li>
</ol>""",
            },
            "table": """<table>
<tr>
<th style="background-color: #E3F2FD;">表头1</th>
<th style="background-color: #E3F2FD;">表头2</th>
</tr>
<tr>
<td>单元格1</td>
<td>单元格2</td>
</tr>
</table>""",
            "image": '<img src="image.png" alt="图片描述" />',
            "formula": "<math>E = mc^2</math>",
        }
385
+
386
+
387
# Global instances, created once when the module is imported.
validator = HTMLValidator()
template_generator = HTMLTemplateGenerator()