@birthday8/doc-mcp 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -19
- package/package.json +1 -1
- package/python/html_rules.py +82 -0
- package/python/html_validator_strict.py +176 -174
- package/python/sample/example.html +0 -14
- package/python/sample/html_schema.py +76 -7
- package/python/server.py +1 -1
package/README.md
CHANGED
|
@@ -12,7 +12,6 @@ Doc Creator MCP Server - Generate Word documents from HTML with rich formatting
|
|
|
12
12
|
- ✅ Tables with styles
|
|
13
13
|
- ✅ Info/Warning/Success boxes
|
|
14
14
|
- ✅ Code blocks
|
|
15
|
-
- ✅ Blockquotes
|
|
16
15
|
- ✅ Multi-column layout
|
|
17
16
|
- ✅ Page breaks
|
|
18
17
|
|
|
@@ -66,6 +65,18 @@ Generate a Word document from HTML content.
|
|
|
66
65
|
### 3. get_html_constraints
|
|
67
66
|
Get HTML format constraints example with all supported formats.
|
|
68
67
|
|
|
68
|
+
### 4. get_html_schema
|
|
69
|
+
Get structured HTML format constraints as JSON.
|
|
70
|
+
|
|
71
|
+
### 5. validate_html
|
|
72
|
+
Validate HTML content against format constraints.
|
|
73
|
+
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"html_content": "<h1>Title</h1><p>Content...</p>"
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
69
80
|
## HTML Conventions
|
|
70
81
|
|
|
71
82
|
### Basic Structure
|
|
@@ -75,12 +86,9 @@ Get HTML format constraints example with all supported formats.
|
|
|
75
86
|
<head>
|
|
76
87
|
<meta charset="UTF-8">
|
|
77
88
|
<title>Document Title</title>
|
|
78
|
-
<style>
|
|
79
|
-
/* CSS styles */
|
|
80
|
-
</style>
|
|
81
89
|
</head>
|
|
82
90
|
<body>
|
|
83
|
-
<h1>Title</h1>
|
|
91
|
+
<h1 style="font-family: 黑体; font-size: 18pt; text-align: center;">Title</h1>
|
|
84
92
|
<p>Content with <strong>bold</strong> and <em>italic</em>.</p>
|
|
85
93
|
</body>
|
|
86
94
|
</html>
|
|
@@ -92,8 +100,8 @@ Get HTML format constraints example with all supported formats.
|
|
|
92
100
|
<em>Italic</em>
|
|
93
101
|
<u>Underline</u>
|
|
94
102
|
<s>Strikethrough</s>
|
|
95
|
-
<span
|
|
96
|
-
<span
|
|
103
|
+
<span style="color: #FF0000;">Red text</span>
|
|
104
|
+
<span style="background-color: #FFFF00;">Highlighted</span>
|
|
97
105
|
```
|
|
98
106
|
|
|
99
107
|
### Paragraph Indentation
|
|
@@ -110,17 +118,13 @@ Get HTML format constraints example with all supported formats.
|
|
|
110
118
|
|
|
111
119
|
### Tables
|
|
112
120
|
```html
|
|
113
|
-
<table
|
|
114
|
-
<
|
|
115
|
-
<
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
<tr>
|
|
121
|
-
<td style="border: 1px solid #ddd; padding: 12px;">Cell</td>
|
|
122
|
-
</tr>
|
|
123
|
-
</tbody>
|
|
121
|
+
<table>
|
|
122
|
+
<tr>
|
|
123
|
+
<th style="background-color: #E3F2FD;">Header</th>
|
|
124
|
+
</tr>
|
|
125
|
+
<tr>
|
|
126
|
+
<td>Cell</td>
|
|
127
|
+
</tr>
|
|
124
128
|
</table>
|
|
125
129
|
```
|
|
126
130
|
|
|
@@ -130,8 +134,44 @@ Get HTML format constraints example with all supported formats.
|
|
|
130
134
|
<p>Column 1 content...</p>
|
|
131
135
|
<p>Column 2 content...</p>
|
|
132
136
|
</div>
|
|
137
|
+
<!-- 必须恢复单栏 -->
|
|
138
|
+
<div class="columns" data-cols="1"></div>
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Important Notes
|
|
142
|
+
|
|
143
|
+
### Style Usage
|
|
144
|
+
- **Inline styles only**: Use `style="..."` attributes on elements
|
|
145
|
+
- **No `<style>` tags**: CSS in `<style>` tags is not supported
|
|
146
|
+
- **Supported styles**: color, background-color, font-family, font-size, text-align, line-height, margin-top, margin-bottom
|
|
147
|
+
- **Color format**: Must use hex format `#RRGGBB` (e.g., `#FF0000`), NOT `red`, `rgb(255,0,0)`, etc.
|
|
148
|
+
- **Font size format**: Must use `pt` units (e.g., `14pt`), NOT `px`, `em`, `rem`
|
|
149
|
+
- **Margin format**: Must use `pt` units (e.g., `10pt`), NOT `px`, `em`, `rem`
|
|
150
|
+
- **Line height format**: Must be numeric (e.g., `1.5`, `1.8`, `2.0`)
|
|
151
|
+
|
|
152
|
+
### Class Names
|
|
153
|
+
**Only 7 class names are supported**:
|
|
154
|
+
- `center` - Center aligned paragraph
|
|
155
|
+
- `right` - Right aligned paragraph
|
|
156
|
+
- `left` - Left aligned paragraph
|
|
157
|
+
- `info` - Info message box
|
|
158
|
+
- `warning` - Warning message box
|
|
159
|
+
- `success` - Success message box
|
|
160
|
+
- `columns` - Multi-column layout
|
|
161
|
+
|
|
162
|
+
All other styles must use inline `style` attributes.
|
|
163
|
+
|
|
164
|
+
### Example with inline styles
|
|
165
|
+
```html
|
|
166
|
+
<h1 style="font-family: 黑体; font-size: 18pt; text-align: center;">Title</h1>
|
|
167
|
+
<p style="color: #FF0000;">Red text</p>
|
|
168
|
+
<span style="background-color: #FFFF00;">Highlighted</span>
|
|
169
|
+
|
|
170
|
+
<!-- 不支持的类名示例 -->
|
|
171
|
+
<!-- ❌ <span class="abstract-title">摘要</span> -->
|
|
172
|
+
<!-- ✅ <span style="font-family: 黑体; font-size: 12pt; font-weight: bold;">摘要</span> -->
|
|
133
173
|
```
|
|
134
174
|
|
|
135
175
|
## License
|
|
136
176
|
|
|
137
|
-
MIT
|
|
177
|
+
MIT
|
package/package.json
CHANGED
package/python/html_rules.py
CHANGED
|
@@ -339,6 +339,63 @@ class RuleEngine:
|
|
|
339
339
|
description="元数据内容",
|
|
340
340
|
)
|
|
341
341
|
|
|
342
|
+
# 通用属性
|
|
343
|
+
self.attr_rules["class"] = AttributeRule(
|
|
344
|
+
attr_name="class",
|
|
345
|
+
allowed=True,
|
|
346
|
+
applicable_tags={
|
|
347
|
+
"p",
|
|
348
|
+
"div",
|
|
349
|
+
"span",
|
|
350
|
+
"table",
|
|
351
|
+
"td",
|
|
352
|
+
"th",
|
|
353
|
+
"ul",
|
|
354
|
+
"ol",
|
|
355
|
+
"li",
|
|
356
|
+
"h1",
|
|
357
|
+
"h2",
|
|
358
|
+
"h3",
|
|
359
|
+
"h4",
|
|
360
|
+
"h5",
|
|
361
|
+
"h6",
|
|
362
|
+
},
|
|
363
|
+
description="CSS类名",
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
self.attr_rules["style"] = AttributeRule(
|
|
367
|
+
attr_name="style",
|
|
368
|
+
allowed=True,
|
|
369
|
+
applicable_tags={
|
|
370
|
+
"h1",
|
|
371
|
+
"h2",
|
|
372
|
+
"h3",
|
|
373
|
+
"h4",
|
|
374
|
+
"h5",
|
|
375
|
+
"h6",
|
|
376
|
+
"p",
|
|
377
|
+
"div",
|
|
378
|
+
"span",
|
|
379
|
+
"strong",
|
|
380
|
+
"em",
|
|
381
|
+
"u",
|
|
382
|
+
"s",
|
|
383
|
+
"sup",
|
|
384
|
+
"sub",
|
|
385
|
+
"code",
|
|
386
|
+
"table",
|
|
387
|
+
"tr",
|
|
388
|
+
"td",
|
|
389
|
+
"th",
|
|
390
|
+
"ul",
|
|
391
|
+
"ol",
|
|
392
|
+
"li",
|
|
393
|
+
"blockquote",
|
|
394
|
+
"pre",
|
|
395
|
+
},
|
|
396
|
+
description="内联样式",
|
|
397
|
+
)
|
|
398
|
+
|
|
342
399
|
def _init_style_rules(self):
|
|
343
400
|
"""初始化样式规则"""
|
|
344
401
|
|
|
@@ -497,6 +554,31 @@ class RuleEngine:
|
|
|
497
554
|
return False
|
|
498
555
|
return rule.is_applicable_to(tag_name)
|
|
499
556
|
|
|
557
|
+
def is_nesting_allowed(self, parent_tag: str, child_tag: str) -> bool:
|
|
558
|
+
"""检查嵌套是否允许"""
|
|
559
|
+
parent_rule = self.get_tag_rule(parent_tag)
|
|
560
|
+
child_rule = self.get_tag_rule(child_tag)
|
|
561
|
+
|
|
562
|
+
if not parent_rule or not child_rule:
|
|
563
|
+
return False
|
|
564
|
+
|
|
565
|
+
# 检查父标签是否允许包含子标签
|
|
566
|
+
if not parent_rule.can_contain(child_tag):
|
|
567
|
+
return False
|
|
568
|
+
|
|
569
|
+
# 检查子标签是否可以是父标签的子元素
|
|
570
|
+
if not child_rule.can_be_child_of(parent_tag):
|
|
571
|
+
return False
|
|
572
|
+
|
|
573
|
+
return True
|
|
574
|
+
|
|
575
|
+
def is_self_closing_tag(self, tag_name: str) -> bool:
|
|
576
|
+
"""检查标签是否是自闭合标签"""
|
|
577
|
+
rule = self.get_tag_rule(tag_name)
|
|
578
|
+
if not rule:
|
|
579
|
+
return False
|
|
580
|
+
return rule.must_be_self_closing
|
|
581
|
+
|
|
500
582
|
def get_disallowed_tags(self) -> List[Dict[str, str]]:
|
|
501
583
|
"""获取所有不允许的标签(用于文档说明)"""
|
|
502
584
|
return [
|
|
@@ -11,6 +11,17 @@ from html_rules import RuleEngine, ValidationResult
|
|
|
11
11
|
class StrictHTMLValidator:
|
|
12
12
|
"""严格HTML验证器"""
|
|
13
13
|
|
|
14
|
+
# 支持的类名列表
|
|
15
|
+
SUPPORTED_CLASSES = {
|
|
16
|
+
"center",
|
|
17
|
+
"right",
|
|
18
|
+
"left",
|
|
19
|
+
"info",
|
|
20
|
+
"warning",
|
|
21
|
+
"success",
|
|
22
|
+
"columns",
|
|
23
|
+
}
|
|
24
|
+
|
|
14
25
|
def __init__(self, rule_engine: RuleEngine = None):
|
|
15
26
|
self.rule_engine = rule_engine or RuleEngine()
|
|
16
27
|
|
|
@@ -39,6 +50,9 @@ class StrictHTMLValidator:
|
|
|
39
50
|
# 检查属性
|
|
40
51
|
self._validate_attributes(soup, result, strict_mode)
|
|
41
52
|
|
|
53
|
+
# 检查类名
|
|
54
|
+
self._validate_classes(soup, result, strict_mode)
|
|
55
|
+
|
|
42
56
|
# 检查样式
|
|
43
57
|
self._validate_styles(soup, result, strict_mode)
|
|
44
58
|
|
|
@@ -48,14 +62,14 @@ class StrictHTMLValidator:
|
|
|
48
62
|
# 检查自闭合标签
|
|
49
63
|
self._validate_self_closing(soup, result, strict_mode)
|
|
50
64
|
|
|
51
|
-
#
|
|
52
|
-
self.
|
|
65
|
+
# 检查style标签
|
|
66
|
+
self._validate_style_tags(soup, result, strict_mode)
|
|
53
67
|
|
|
54
68
|
except Exception as e:
|
|
55
69
|
result.add_error(
|
|
56
|
-
code="
|
|
70
|
+
code="PARSING_ERROR",
|
|
57
71
|
message=f"HTML解析错误: {str(e)}",
|
|
58
|
-
details={"
|
|
72
|
+
details={"exception": str(e)},
|
|
59
73
|
)
|
|
60
74
|
|
|
61
75
|
return result
|
|
@@ -154,85 +168,142 @@ class StrictHTMLValidator:
|
|
|
154
168
|
|
|
155
169
|
tag_name = element.name
|
|
156
170
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
171
|
+
# 检查属性是否允许
|
|
172
|
+
tag_rule = self.rule_engine.get_tag_rule(tag_name)
|
|
173
|
+
if not tag_rule:
|
|
174
|
+
continue
|
|
161
175
|
|
|
162
|
-
|
|
176
|
+
for attr_name, attr_value in element.attrs.items():
|
|
163
177
|
if not self.rule_engine.is_attribute_allowed(attr_name, tag_name):
|
|
164
178
|
result.add_error(
|
|
165
179
|
code="DISALLOWED_ATTR",
|
|
166
180
|
message=f"标签 <{tag_name}> 的属性 '{attr_name}' 不在允许列表中",
|
|
167
181
|
details={
|
|
168
182
|
"tag": tag_name,
|
|
169
|
-
"
|
|
183
|
+
"attribute": attr_name,
|
|
170
184
|
"value": str(attr_value)[:100],
|
|
171
|
-
"allowed_attrs_for_tag": self._get_allowed_attrs_for_tag(
|
|
172
|
-
tag_name
|
|
173
|
-
),
|
|
174
185
|
},
|
|
175
186
|
)
|
|
176
187
|
if strict_mode:
|
|
177
188
|
continue
|
|
178
189
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
190
|
+
def _validate_classes(
|
|
191
|
+
self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
|
|
192
|
+
):
|
|
193
|
+
"""验证类名是否支持"""
|
|
194
|
+
for element in soup.find_all(True):
|
|
195
|
+
if not isinstance(element, Tag):
|
|
196
|
+
continue
|
|
197
|
+
|
|
198
|
+
class_attr = element.get("class")
|
|
199
|
+
if not class_attr:
|
|
200
|
+
continue
|
|
201
|
+
|
|
202
|
+
# 处理类名列表
|
|
203
|
+
if isinstance(class_attr, list):
|
|
204
|
+
classes = class_attr
|
|
205
|
+
else:
|
|
206
|
+
classes = class_attr.split()
|
|
207
|
+
|
|
208
|
+
unsupported_classes = []
|
|
209
|
+
for cls in classes:
|
|
210
|
+
if cls not in self.SUPPORTED_CLASSES:
|
|
211
|
+
unsupported_classes.append(cls)
|
|
212
|
+
|
|
213
|
+
if unsupported_classes:
|
|
214
|
+
result.add_error(
|
|
215
|
+
code="UNSUPPORTED_CLASS",
|
|
216
|
+
message=f"类名 '{', '.join(unsupported_classes)}' 不在支持列表中,请使用内联 style 属性代替",
|
|
217
|
+
details={
|
|
218
|
+
"unsupported_classes": unsupported_classes,
|
|
219
|
+
"supported_classes": list(self.SUPPORTED_CLASSES),
|
|
220
|
+
"element": element.name,
|
|
221
|
+
"note": "只支持:center, right, left, info, warning, success, columns",
|
|
222
|
+
},
|
|
223
|
+
)
|
|
224
|
+
if strict_mode:
|
|
225
|
+
continue
|
|
192
226
|
|
|
193
227
|
def _validate_styles(
|
|
194
228
|
self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
|
|
195
229
|
):
|
|
196
230
|
"""验证样式"""
|
|
197
|
-
for element in soup.find_all(
|
|
231
|
+
for element in soup.find_all(True):
|
|
198
232
|
if not isinstance(element, Tag):
|
|
199
233
|
continue
|
|
200
234
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
235
|
+
style_attr = element.get("style")
|
|
236
|
+
if not style_attr:
|
|
237
|
+
continue
|
|
204
238
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
239
|
+
# 解析样式
|
|
240
|
+
styles = self._parse_style_string(style_attr)
|
|
241
|
+
|
|
242
|
+
for prop_name, prop_value in styles.items():
|
|
243
|
+
if not self.rule_engine.is_style_allowed(prop_name, element.name):
|
|
208
244
|
result.add_error(
|
|
209
245
|
code="DISALLOWED_STYLE",
|
|
210
|
-
message=f"
|
|
246
|
+
message=f"样式属性 '{prop_name}' 不在允许列表中",
|
|
211
247
|
details={
|
|
212
|
-
"
|
|
213
|
-
"
|
|
214
|
-
"
|
|
215
|
-
"allowed_styles_for_tag": self._get_allowed_styles_for_tag(
|
|
216
|
-
tag_name
|
|
217
|
-
),
|
|
248
|
+
"style": prop_name,
|
|
249
|
+
"value": prop_value,
|
|
250
|
+
"element": element.name,
|
|
218
251
|
},
|
|
219
252
|
)
|
|
220
253
|
if strict_mode:
|
|
221
254
|
continue
|
|
222
255
|
|
|
223
|
-
#
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
is_valid, error_msg = style_rule.validate_value(style_value)
|
|
227
|
-
if not is_valid:
|
|
256
|
+
# 验证颜色格式
|
|
257
|
+
if prop_name in ["color", "background-color"]:
|
|
258
|
+
if not self.rule_engine.is_valid_color(prop_value):
|
|
228
259
|
result.add_error(
|
|
229
|
-
code="
|
|
230
|
-
message=
|
|
260
|
+
code="INVALID_COLOR_FORMAT",
|
|
261
|
+
message="颜色格式错误,必须使用十六进制格式 #RRGGBB",
|
|
231
262
|
details={
|
|
232
|
-
"
|
|
233
|
-
"
|
|
234
|
-
"
|
|
235
|
-
|
|
263
|
+
"style": prop_name,
|
|
264
|
+
"value": prop_value,
|
|
265
|
+
"correct_format": "#RRGGBB",
|
|
266
|
+
},
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
# 验证字号格式
|
|
270
|
+
if prop_name == "font-size":
|
|
271
|
+
if not prop_value.endswith("pt"):
|
|
272
|
+
result.add_error(
|
|
273
|
+
code="INVALID_FONT_SIZE_FORMAT",
|
|
274
|
+
message="字号格式错误,必须使用 pt 单位",
|
|
275
|
+
details={
|
|
276
|
+
"style": prop_name,
|
|
277
|
+
"value": prop_value,
|
|
278
|
+
"correct_format": "14pt, 16pt, 18pt",
|
|
279
|
+
},
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
# 验证行距格式
|
|
283
|
+
if prop_name == "line-height":
|
|
284
|
+
try:
|
|
285
|
+
float(prop_value)
|
|
286
|
+
except ValueError:
|
|
287
|
+
result.add_error(
|
|
288
|
+
code="INVALID_LINE_HEIGHT_FORMAT",
|
|
289
|
+
message="行距格式错误,必须是数字或小数",
|
|
290
|
+
details={
|
|
291
|
+
"style": prop_name,
|
|
292
|
+
"value": prop_value,
|
|
293
|
+
"correct_format": "1.5, 1.8, 2.0",
|
|
294
|
+
},
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
# 验证边距格式
|
|
298
|
+
if prop_name in ["margin-top", "margin-bottom"]:
|
|
299
|
+
if not prop_value.endswith("pt"):
|
|
300
|
+
result.add_error(
|
|
301
|
+
code="INVALID_MARGIN_FORMAT",
|
|
302
|
+
message="边距格式错误,必须使用 pt 单位",
|
|
303
|
+
details={
|
|
304
|
+
"style": prop_name,
|
|
305
|
+
"value": prop_value,
|
|
306
|
+
"correct_format": "10pt, 12pt, 15pt",
|
|
236
307
|
},
|
|
237
308
|
)
|
|
238
309
|
|
|
@@ -240,53 +311,27 @@ class StrictHTMLValidator:
|
|
|
240
311
|
self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
|
|
241
312
|
):
|
|
242
313
|
"""验证嵌套规则"""
|
|
243
|
-
for
|
|
244
|
-
if not isinstance(
|
|
314
|
+
for parent in soup.find_all(True):
|
|
315
|
+
if not isinstance(parent, Tag):
|
|
245
316
|
continue
|
|
246
317
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
if parent and isinstance(parent, Tag):
|
|
251
|
-
parent_name = parent.name
|
|
252
|
-
|
|
253
|
-
# 检查是否可以是父元素的子元素
|
|
254
|
-
tag_rule = self.rule_engine.get_tag_rule(tag_name)
|
|
255
|
-
parent_rule = self.rule_engine.get_tag_rule(parent_name)
|
|
318
|
+
for child in parent.find_all(True, recursive=False):
|
|
319
|
+
if not isinstance(child, Tag):
|
|
320
|
+
continue
|
|
256
321
|
|
|
257
|
-
if
|
|
322
|
+
if not self.rule_engine.is_nesting_allowed(parent.name, child.name):
|
|
258
323
|
result.add_error(
|
|
259
324
|
code="INVALID_NESTING",
|
|
260
|
-
message=f"标签 <{
|
|
325
|
+
message=f"标签 <{child.name}> 不能嵌套在 <{parent.name}> 中",
|
|
261
326
|
details={
|
|
262
|
-
"
|
|
263
|
-
"
|
|
264
|
-
"
|
|
265
|
-
list(tag_rule.allowed_parents)
|
|
266
|
-
if tag_rule.allowed_parents
|
|
267
|
-
else ["任意"]
|
|
268
|
-
),
|
|
327
|
+
"parent": parent.name,
|
|
328
|
+
"child": child.name,
|
|
329
|
+
"context": str(parent)[:100],
|
|
269
330
|
},
|
|
270
331
|
)
|
|
271
332
|
if strict_mode:
|
|
272
333
|
continue
|
|
273
334
|
|
|
274
|
-
# 检查父元素是否可以包含此元素
|
|
275
|
-
if parent_rule and not parent_rule.can_contain(tag_name):
|
|
276
|
-
result.add_error(
|
|
277
|
-
code="INVALID_CONTAINMENT",
|
|
278
|
-
message=f"标签 <{parent_name}> 不能包含 <{tag_name}>",
|
|
279
|
-
details={
|
|
280
|
-
"parent_tag": parent_name,
|
|
281
|
-
"child_tag": tag_name,
|
|
282
|
-
"allowed_children": (
|
|
283
|
-
list(parent_rule.allowed_children)
|
|
284
|
-
if parent_rule.allowed_children
|
|
285
|
-
else ["任意"]
|
|
286
|
-
),
|
|
287
|
-
},
|
|
288
|
-
)
|
|
289
|
-
|
|
290
335
|
def _validate_self_closing(
|
|
291
336
|
self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
|
|
292
337
|
):
|
|
@@ -296,64 +341,51 @@ class StrictHTMLValidator:
|
|
|
296
341
|
continue
|
|
297
342
|
|
|
298
343
|
tag_name = element.name
|
|
299
|
-
tag_rule = self.rule_engine.get_tag_rule(tag_name)
|
|
300
344
|
|
|
301
|
-
|
|
345
|
+
# 检查空标签是否自闭合
|
|
346
|
+
if self.rule_engine.is_self_closing_tag(tag_name):
|
|
302
347
|
# 检查是否有内容
|
|
303
348
|
if element.contents and not all(
|
|
304
|
-
isinstance(c, NavigableString) and not c.strip()
|
|
349
|
+
isinstance(c, NavigableString) and not str(c).strip()
|
|
305
350
|
for c in element.contents
|
|
306
351
|
):
|
|
307
352
|
result.add_warning(
|
|
308
|
-
code="
|
|
309
|
-
message=f"标签 <{tag_name}>
|
|
310
|
-
details={"tag": tag_name
|
|
353
|
+
code="NON_EMPTY_SELF_CLOSING_TAG",
|
|
354
|
+
message=f"标签 <{tag_name}> 应该是自闭合的,但包含内容",
|
|
355
|
+
details={"tag": tag_name},
|
|
311
356
|
)
|
|
312
357
|
|
|
313
|
-
def
|
|
314
|
-
|
|
315
|
-
|
|
358
|
+
def _validate_style_tags(
|
|
359
|
+
self, soup: BeautifulSoup, result: ValidationResult, strict_mode: bool
|
|
360
|
+
):
|
|
361
|
+
"""验证style标签(检测并警告)"""
|
|
362
|
+
style_tags = soup.find_all("style")
|
|
363
|
+
if style_tags:
|
|
364
|
+
result.add_warning(
|
|
365
|
+
code="STYLE_TAG_DETECTED",
|
|
366
|
+
message=f"检测到 <style> 标签,其中的 CSS 规则不会被转换。请使用内联 style 属性代替。",
|
|
367
|
+
details={
|
|
368
|
+
"count": len(style_tags),
|
|
369
|
+
"note": '例如:将 <style>h1 { font-size: 18pt; }</style> 改为 <h1 style="font-size: 18pt;">标题</h1>',
|
|
370
|
+
},
|
|
371
|
+
)
|
|
316
372
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
373
|
+
def _parse_style_string(self, style_str: str) -> Dict[str, str]:
|
|
374
|
+
"""解析style字符串为字典"""
|
|
375
|
+
styles = {}
|
|
376
|
+
if not style_str:
|
|
377
|
+
return styles
|
|
322
378
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
stats["total_warnings"] = len(result.warnings)
|
|
379
|
+
for item in style_str.split(";"):
|
|
380
|
+
item = item.strip()
|
|
381
|
+
if not item or ":" not in item:
|
|
382
|
+
continue
|
|
328
383
|
|
|
329
|
-
|
|
330
|
-
|
|
384
|
+
key, value = item.split(":", 1)
|
|
385
|
+
styles[key.strip()] = value.strip()
|
|
331
386
|
|
|
332
|
-
def _parse_style(self, style_str: str) -> Dict[str, str]:
|
|
333
|
-
"""解析样式字符串"""
|
|
334
|
-
styles = {}
|
|
335
|
-
for item in style_str.split(";"):
|
|
336
|
-
if ":" in item:
|
|
337
|
-
key, value = item.split(":", 1)
|
|
338
|
-
styles[key.strip()] = value.strip()
|
|
339
387
|
return styles
|
|
340
388
|
|
|
341
|
-
def _get_allowed_attrs_for_tag(self, tag_name: str) -> List[str]:
|
|
342
|
-
"""获取标签允许的属性列表"""
|
|
343
|
-
allowed = []
|
|
344
|
-
for attr_name, attr_rule in self.rule_engine.attr_rules.items():
|
|
345
|
-
if attr_rule.allowed and attr_rule.is_applicable_to(tag_name):
|
|
346
|
-
allowed.append(attr_name)
|
|
347
|
-
return allowed
|
|
348
|
-
|
|
349
|
-
def _get_allowed_styles_for_tag(self, tag_name: str) -> List[str]:
|
|
350
|
-
"""获取标签允许的样式列表"""
|
|
351
|
-
allowed = []
|
|
352
|
-
for style_name, style_rule in self.rule_engine.style_rules.items():
|
|
353
|
-
if style_rule.allowed and style_rule.is_applicable_to(tag_name):
|
|
354
|
-
allowed.append(style_name)
|
|
355
|
-
return allowed
|
|
356
|
-
|
|
357
389
|
def get_validation_report(self, result: ValidationResult) -> str:
|
|
358
390
|
"""生成验证报告"""
|
|
359
391
|
report = []
|
|
@@ -362,33 +394,30 @@ class StrictHTMLValidator:
|
|
|
362
394
|
|
|
363
395
|
# 验证结果
|
|
364
396
|
status = "✅ 通过" if result.is_valid else "❌ 失败"
|
|
365
|
-
report.append(f"验证状态: {status}")
|
|
366
|
-
|
|
367
|
-
|
|
397
|
+
report.append(f"验证状态: {status}\n")
|
|
398
|
+
|
|
399
|
+
# 统计信息
|
|
400
|
+
report.append("📊 统计信息:")
|
|
401
|
+
report.append(f" - 错误数量: {len(result.errors)}")
|
|
402
|
+
report.append(f" - 警告数量: {len(result.warnings)}")
|
|
403
|
+
report.append("")
|
|
368
404
|
|
|
369
405
|
# 错误
|
|
370
406
|
if result.errors:
|
|
371
407
|
report.append("❌ 错误:")
|
|
372
408
|
for i, error in enumerate(result.errors, 1):
|
|
373
|
-
report.append(f"
|
|
409
|
+
report.append(f" {i}. [{error['code']}] {error['message']}")
|
|
374
410
|
if error.get("details"):
|
|
375
|
-
report.append(f"
|
|
411
|
+
report.append(f" 详情: {error['details']}")
|
|
376
412
|
report.append("")
|
|
377
413
|
|
|
378
414
|
# 警告
|
|
379
415
|
if result.warnings:
|
|
380
416
|
report.append("⚠️ 警告:")
|
|
381
417
|
for i, warning in enumerate(result.warnings, 1):
|
|
382
|
-
report.append(f"
|
|
418
|
+
report.append(f" {i}. [{warning['code']}] {warning['message']}")
|
|
383
419
|
if warning.get("details"):
|
|
384
|
-
report.append(f"
|
|
385
|
-
report.append("")
|
|
386
|
-
|
|
387
|
-
# 统计信息
|
|
388
|
-
if hasattr(result, "details") and result.details:
|
|
389
|
-
report.append("📊 统计信息:")
|
|
390
|
-
for key, value in result.details.items():
|
|
391
|
-
report.append(f" - {key}: {value}")
|
|
420
|
+
report.append(f" 详情: {warning['details']}")
|
|
392
421
|
report.append("")
|
|
393
422
|
|
|
394
423
|
if not result.errors and not result.warnings:
|
|
@@ -397,32 +426,5 @@ class StrictHTMLValidator:
|
|
|
397
426
|
return "\n".join(report)
|
|
398
427
|
|
|
399
428
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
def __init__(self, result: ValidationResult):
|
|
404
|
-
self.result = result
|
|
405
|
-
message = (
|
|
406
|
-
f"HTML验证失败: {len(result.errors)} 个错误, {len(result.warnings)} 个警告"
|
|
407
|
-
)
|
|
408
|
-
super().__init__(message)
|
|
409
|
-
|
|
410
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
411
|
-
"""转换为字典"""
|
|
412
|
-
return self.result.to_json()
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
# 延迟创建全局实例
|
|
416
|
-
_strict_validator_instance = None
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
def get_strict_validator():
|
|
420
|
-
"""获取严格验证器单例"""
|
|
421
|
-
global _strict_validator_instance
|
|
422
|
-
if _strict_validator_instance is None:
|
|
423
|
-
_strict_validator_instance = StrictHTMLValidator()
|
|
424
|
-
return _strict_validator_instance
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
# 向后兼容的全局实例
|
|
428
|
-
strict_validator = get_strict_validator()
|
|
429
|
+
# 全局实例
|
|
430
|
+
strict_validator = StrictHTMLValidator()
|
|
@@ -2,21 +2,7 @@
|
|
|
2
2
|
<html lang="zh-CN">
|
|
3
3
|
<head>
|
|
4
4
|
<meta charset="UTF-8">
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
5
|
<title>MCP复杂格式测试</title>
|
|
7
|
-
<style>
|
|
8
|
-
body {
|
|
9
|
-
font-family: '微软雅黑';
|
|
10
|
-
font-size: 12pt;
|
|
11
|
-
line-height: 1.8;
|
|
12
|
-
padding: 20px;
|
|
13
|
-
max-width: 800px;
|
|
14
|
-
margin: 0 auto;
|
|
15
|
-
}
|
|
16
|
-
h1 { font-family: '微软雅黑'; font-size: 18pt; text-align: center; color: #333; }
|
|
17
|
-
h2 { font-family: '微软雅黑'; font-size: 16pt; color: #4a3f6b; border-bottom: 2px solid #667eea; padding-bottom: 10px; }
|
|
18
|
-
h3 { font-family: '微软雅黑'; font-size: 14pt; color: #5b4e8c; }
|
|
19
|
-
</style>
|
|
20
6
|
</head>
|
|
21
7
|
<body>
|
|
22
8
|
<h1>MCP复杂格式测试文档</h1>
|
|
@@ -145,6 +145,44 @@ HTML_SCHEMA = {
|
|
|
145
145
|
}
|
|
146
146
|
},
|
|
147
147
|
|
|
148
|
+
"supportedClasses": [
|
|
149
|
+
{
|
|
150
|
+
"name": "center",
|
|
151
|
+
"description": "居中对齐段落",
|
|
152
|
+
"note": "用于<p class=\"center\">"
|
|
153
|
+
},
|
|
154
|
+
{
|
|
155
|
+
"name": "right",
|
|
156
|
+
"description": "右对齐段落",
|
|
157
|
+
"note": "用于<p class=\"right\">"
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
"name": "left",
|
|
161
|
+
"description": "左对齐段落",
|
|
162
|
+
"note": "用于<p class=\"left\">"
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"name": "info",
|
|
166
|
+
"description": "信息提示框",
|
|
167
|
+
"note": "用于<div class=\"info\">"
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
"name": "warning",
|
|
171
|
+
"description": "警告提示框",
|
|
172
|
+
"note": "用于<div class=\"warning\">"
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
"name": "success",
|
|
176
|
+
"description": "成功提示框",
|
|
177
|
+
"note": "用于<div class=\"success\">"
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"name": "columns",
|
|
181
|
+
"description": "多栏布局",
|
|
182
|
+
"note": "用于<div class=\"columns\" data-cols=\"2\">"
|
|
183
|
+
}
|
|
184
|
+
],
|
|
185
|
+
|
|
148
186
|
"colorFormatRules": {
|
|
149
187
|
"required": "hex",
|
|
150
188
|
"pattern": "#[0-9A-Fa-f]{6}",
|
|
@@ -237,6 +275,30 @@ HTML_SCHEMA = {
|
|
|
237
275
|
"example": "<span style=\"color: rgb(255,0,0);\">红色</span>",
|
|
238
276
|
"correct": "<span style=\"color: #FF0000;\">红色</span>",
|
|
239
277
|
"note": "不支持rgb()和rgba()格式,必须使用十六进制"
|
|
278
|
+
},
|
|
279
|
+
{
|
|
280
|
+
"error": "使用style标签",
|
|
281
|
+
"example": "<style>h1 { font-size: 18pt; }</style>",
|
|
282
|
+
"correct": "<h1 style=\"font-size: 18pt;\">标题</h1>",
|
|
283
|
+
"note": "不支持<style>标签中的CSS规则,只支持内联style属性"
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"error": "不支持的类名",
|
|
287
|
+
"example": "<span class=\"abstract-title\">摘要</span>",
|
|
288
|
+
"correct": "<span style=\"font-family: 黑体; font-size: 12pt; font-weight: bold;\">摘要</span>",
|
|
289
|
+
"note": "只支持:center, right, left, info, warning, success, columns"
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"error": "字号格式错误",
|
|
293
|
+
"example": "<span style=\"font-size: 14px;\">14号字</span>",
|
|
294
|
+
"correct": "<span style=\"font-size: 14pt;\">14号字</span>",
|
|
295
|
+
"note": "字号单位必须是pt(磅),不支持px、em、rem等单位"
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"error": "边距格式错误",
|
|
299
|
+
"example": "<p style=\"margin-bottom: 10px;\">段落</p>",
|
|
300
|
+
"correct": "<p style=\"margin-bottom: 10pt;\">段落</p>",
|
|
301
|
+
"note": "边距单位必须是pt(磅),不支持px、em、rem等单位"
|
|
240
302
|
}
|
|
241
303
|
],
|
|
242
304
|
|
|
@@ -252,18 +314,25 @@ HTML_SCHEMA = {
|
|
|
252
314
|
"表格单元格使用 th(表头)和 td(数据)区分",
|
|
253
315
|
"行内元素(strong、em、u、s、sup、sub、code、span)可以嵌套在块级元素(p、div)内",
|
|
254
316
|
"块级元素(p、div、table、ul、ol)不能嵌套在行内元素内",
|
|
255
|
-
"行距可以是数字或小数(如 1.5、1.8、2.0)"
|
|
317
|
+
"行距可以是数字或小数(如 1.5、1.8、2.0)",
|
|
318
|
+
"不要使用<style>标签中的CSS,只使用内联style属性(如 style=\"font-size: 14pt;\")",
|
|
319
|
+
"只支持特定类名:center, right, left, info, warning, success, columns,其他样式请用内联style"
|
|
256
320
|
],
|
|
257
321
|
|
|
258
322
|
"validationWorkflow": [
|
|
259
323
|
"1. 检查HTML基本结构(DOCTYPE, html, head, body)",
|
|
260
324
|
"2. 验证所有标签是否在允许列表中",
|
|
261
325
|
"3. 检查所有属性是否被支持",
|
|
262
|
-
"4.
|
|
263
|
-
"5.
|
|
264
|
-
"6.
|
|
265
|
-
"7.
|
|
266
|
-
"8.
|
|
326
|
+
"4. 验证类名是否在支持列表中(只支持:center, right, left, info, warning, success, columns)",
|
|
327
|
+
"5. 验证所有样式是否符合规范(特别是颜色格式)",
|
|
328
|
+
"6. 检查颜色格式是否为十六进制 #RRGGBB",
|
|
329
|
+
"7. 检查字号格式是否为pt单位",
|
|
330
|
+
"8. 检查行距格式是否为数字或小数",
|
|
331
|
+
"9. 检查边距格式是否为pt单位",
|
|
332
|
+
"10. 验证标签嵌套是否正确",
|
|
333
|
+
"11. 确认自闭合标签格式正确",
|
|
334
|
+
"12. 检查多栏布局是否恢复单栏",
|
|
335
|
+
"13. 检查是否使用了<style>标签(警告)"
|
|
267
336
|
]
|
|
268
337
|
}
|
|
269
338
|
|
|
@@ -280,4 +349,4 @@ def get_schema_json():
|
|
|
280
349
|
|
|
281
350
|
|
|
282
351
|
if __name__ == "__main__":
|
|
283
|
-
print(get_schema_json())
|
|
352
|
+
print(get_schema_json())
|
package/python/server.py
CHANGED
|
@@ -16,7 +16,7 @@ from mcp.types import TextContent, Tool
|
|
|
16
16
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
17
17
|
from docx_converter import convert_html_to_docx as docx_convert
|
|
18
18
|
from html_validator import validator, template_generator
|
|
19
|
-
from html_validator_strict import StrictHTMLValidator,
|
|
19
|
+
from html_validator_strict import StrictHTMLValidator, strict_validator
|
|
20
20
|
from html_fixer import HTMLFixer
|
|
21
21
|
|
|
22
22
|
# Create MCP server
|