hos-m2f 0.5.4__tar.gz → 0.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/PKG-INFO +1 -1
- hos_m2f-0.5.5/README.md +426 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/md_to_docx.py +51 -6
- hos_m2f-0.5.5/hos_m2f/converters/md_to_epub.py +68 -0
- hos_m2f-0.5.5/hos_m2f/converters/md_to_latex.py +63 -0
- hos_m2f-0.5.5/hos_m2f/converters/pdf_to_md.py +120 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/PKG-INFO +1 -1
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/SOURCES.txt +3 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/setup.py +1 -1
- hos_m2f-0.5.5/tests/test_latex.py +182 -0
- hos_m2f-0.5.4/README.md +0 -130
- hos_m2f-0.5.4/hos_m2f/converters/md_to_epub.py +0 -96
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/__init__.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/cli/__init__.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/cli/cli.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/__init__.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/base_converter.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/docx_to_md.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/epub_to_md.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/html_to_md.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/json_to_md.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/md_to_html.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/md_to_json.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/md_to_xml.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f/converters/xml_to_md.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/dependency_links.txt +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/entry_points.txt +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/requires.txt +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/hos_m2f.egg-info/top_level.txt +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/setup.cfg +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/tests/__init__.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/tests/test_converters.py +0 -0
- {hos_m2f-0.5.4 → hos_m2f-0.5.5}/tests/test_modes.py +0 -0
hos_m2f-0.5.5/README.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
# HOS-M2F v1.0
|
|
2
|
+
|
|
3
|
+
## 多场景结构化内容编译引擎
|
|
4
|
+
|
|
5
|
+
HOS-M2F = 面向 AI 写作与专业文档生产的 **内容编译器引擎**
|
|
6
|
+
|
|
7
|
+
Markdown 是源码,不同 Mode 是"目标行业标准"
|
|
8
|
+
|
|
9
|
+
## 快速开始
|
|
10
|
+
|
|
11
|
+
### 安装
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# 克隆仓库
|
|
15
|
+
git clone <repository-url>
|
|
16
|
+
cd HOS-M2F
|
|
17
|
+
|
|
18
|
+
# 安装依赖
|
|
19
|
+
pip install -e .
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### 基本使用
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# 构建电子书
|
|
26
|
+
HOS_M2F build input.md --mode book --format epub
|
|
27
|
+
|
|
28
|
+
# 构建技术文档
|
|
29
|
+
HOS_M2F build report.md --mode paper --format pdf
|
|
30
|
+
|
|
31
|
+
# 校验专利文档
|
|
32
|
+
HOS_M2F check patent.md --mode patent
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## 核心功能
|
|
36
|
+
|
|
37
|
+
### 内置官方模式
|
|
38
|
+
|
|
39
|
+
| Mode | 领域 | 核心产出 |
|
|
40
|
+
| --------- | ----- | -------------- |
|
|
41
|
+
| `paper` | 技术文档 | 报告 / 论文排版 |
|
|
42
|
+
| `patent` | 专利申请 | 合规专利文件 |
|
|
43
|
+
| `book` 📚 | 电子书出版 | EPUB / KDP |
|
|
44
|
+
| `sop` 🛠 | 运维与实施 | 巡检 / 报错 / 实施报告 |
|
|
45
|
+
|
|
46
|
+
### 输出支持
|
|
47
|
+
|
|
48
|
+
| 格式 | 描述 |
|
|
49
|
+
| ----------- | ----------------- |
|
|
50
|
+
| EPUB | 电子书格式,支持KDP出版 |
|
|
51
|
+
| DOCX | Microsoft Word文档格式 |
|
|
52
|
+
| PDF | 可打印文档格式 |
|
|
53
|
+
| JSON | 结构化数据格式 |
|
|
54
|
+
| HTML | 网页格式 |
|
|
55
|
+
| XML | 标记语言格式 |
|
|
56
|
+
| LaTeX | 学术排版格式 |
|
|
57
|
+
|
|
58
|
+
### 输入支持
|
|
59
|
+
|
|
60
|
+
| 格式 | 描述 |
|
|
61
|
+
| ----------- | ----------------- |
|
|
62
|
+
| Markdown | 主要输入格式 |
|
|
63
|
+
| PDF | 支持PDF转Markdown |
|
|
64
|
+
| DOCX | 支持Word转Markdown |
|
|
65
|
+
| EPUB | 支持电子书转Markdown |
|
|
66
|
+
| HTML | 支持网页转Markdown |
|
|
67
|
+
| XML | 支持XML转Markdown |
|
|
68
|
+
|
|
69
|
+
## 详细使用指南
|
|
70
|
+
|
|
71
|
+
### CLI 命令
|
|
72
|
+
|
|
73
|
+
#### 构建命令
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# 基本构建
|
|
77
|
+
HOS_M2F build input.md --mode book --format epub
|
|
78
|
+
|
|
79
|
+
# 指定输出路径
|
|
80
|
+
HOS_M2F build input.md --mode paper --format pdf --output output.pdf
|
|
81
|
+
|
|
82
|
+
# 使用KDP模式构建电子书
|
|
83
|
+
HOS_M2F build book.md --mode book --format epub --options "{\"platform\": \"kdp\"}"
|
|
84
|
+
|
|
85
|
+
# 批量构建
|
|
86
|
+
HOS_M2F build ./docs/ --mode sop --format pdf --batch
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
#### 校验命令
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# 校验书籍
|
|
93
|
+
HOS_M2F check book.md --mode book --platform kdp
|
|
94
|
+
|
|
95
|
+
# 校验专利
|
|
96
|
+
HOS_M2F check patent.md --mode patent
|
|
97
|
+
|
|
98
|
+
# 校验SOP文档
|
|
99
|
+
HOS_M2F check sop.md --mode sop
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
#### 转换命令
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# Markdown转DOCX
|
|
106
|
+
HOS_M2F convert input.md output.docx --from markdown --to docx
|
|
107
|
+
|
|
108
|
+
# PDF转Markdown
|
|
109
|
+
HOS_M2F convert input.pdf output.md --from pdf --to markdown
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Python SDK
|
|
113
|
+
|
|
114
|
+
#### 基本使用
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from hos_m2f import Engine
|
|
118
|
+
|
|
119
|
+
# 初始化引擎
|
|
120
|
+
engine = Engine()
|
|
121
|
+
|
|
122
|
+
# 构建电子书
|
|
123
|
+
with open("input.md", "r", encoding="utf-8") as f:
|
|
124
|
+
markdown_content = f.read()
|
|
125
|
+
|
|
126
|
+
result = engine.build(
|
|
127
|
+
content=markdown_content,
|
|
128
|
+
mode="book",
|
|
129
|
+
output_format="epub",
|
|
130
|
+
options={"platform": "kdp", "title": "我的电子书", "author": "作者名"}
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# 保存输出
|
|
134
|
+
with open("output.epub", "wb") as f:
|
|
135
|
+
f.write(result.binary)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
#### 格式转换
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from hos_m2f import Engine
|
|
142
|
+
|
|
143
|
+
# 初始化引擎
|
|
144
|
+
engine = Engine()
|
|
145
|
+
|
|
146
|
+
# 读取PDF文件
|
|
147
|
+
with open("input.pdf", "rb") as f:
|
|
148
|
+
pdf_content = f.read()
|
|
149
|
+
|
|
150
|
+
# 转换为Markdown
|
|
151
|
+
md_content = engine.convert_content(
|
|
152
|
+
input_format="pdf",
|
|
153
|
+
output_format="markdown",
|
|
154
|
+
input_content=pdf_content
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# 保存结果
|
|
158
|
+
with open("output.md", "wb") as f:
|
|
159
|
+
f.write(md_content)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
#### 校验文档
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from hos_m2f import Engine
|
|
166
|
+
|
|
167
|
+
# 初始化引擎
|
|
168
|
+
engine = Engine()
|
|
169
|
+
|
|
170
|
+
# 读取文档
|
|
171
|
+
with open("patent.md", "r", encoding="utf-8") as f:
|
|
172
|
+
patent_content = f.read()
|
|
173
|
+
|
|
174
|
+
# 校验文档
|
|
175
|
+
validation_result = engine.check(
|
|
176
|
+
content=patent_content,
|
|
177
|
+
mode="patent"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# 打印校验结果
|
|
181
|
+
print(f"Valid: {validation_result['valid']}")
|
|
182
|
+
if not validation_result['valid']:
|
|
183
|
+
print("Errors:")
|
|
184
|
+
for error in validation_result['errors']:
|
|
185
|
+
print(f"- {error}")
|
|
186
|
+
print("Warnings:")
|
|
187
|
+
for warning in validation_result['warnings']:
|
|
188
|
+
print(f"- {warning}")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## 高级功能
|
|
192
|
+
|
|
193
|
+
### LaTeX 支持
|
|
194
|
+
|
|
195
|
+
HOS-M2F 支持 LaTeX 格式输出,特别适合学术论文和技术报告:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
# 构建LaTeX文档
|
|
199
|
+
HOS_M2F build paper.md --mode paper --format latex
|
|
200
|
+
|
|
201
|
+
# 构建并编译为PDF
|
|
202
|
+
HOS_M2F build paper.md --mode paper --format latex --options "{\"compile\": true}"
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### 自定义模式
|
|
206
|
+
|
|
207
|
+
您可以通过 YAML 文件创建自定义模式:
|
|
208
|
+
|
|
209
|
+
```yaml
|
|
210
|
+
# user_modes/meeting_notes.yaml
|
|
211
|
+
name: meeting_notes
|
|
212
|
+
base: paper
|
|
213
|
+
description: 会议记录模式
|
|
214
|
+
|
|
215
|
+
rules:
|
|
216
|
+
- type: required_section
|
|
217
|
+
title: 会议主题
|
|
218
|
+
level: 2
|
|
219
|
+
- type: required_section
|
|
220
|
+
title: 参会人员
|
|
221
|
+
level: 2
|
|
222
|
+
- type: required_section
|
|
223
|
+
title: 会议内容
|
|
224
|
+
level: 2
|
|
225
|
+
- type: required_section
|
|
226
|
+
title: 行动项
|
|
227
|
+
level: 2
|
|
228
|
+
- type: required_section
|
|
229
|
+
title: 下次会议
|
|
230
|
+
level: 2
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
使用自定义模式:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
HOS_M2F build meeting.md --mode meeting_notes --format docx
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### KDP 出版支持
|
|
240
|
+
|
|
241
|
+
HOS-M2F 提供了专门的 KDP 模式,确保您的电子书符合 Amazon KDP 的出版要求:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
# 构建KDP兼容的电子书
|
|
245
|
+
HOS_M2F build book.md --mode book --format epub --options "{\"platform\": \"kdp\"}"
|
|
246
|
+
|
|
247
|
+
# 校验KDP兼容性
|
|
248
|
+
HOS_M2F check book.md --mode book --platform kdp
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## 项目结构
|
|
252
|
+
|
|
253
|
+
```
|
|
254
|
+
hos_m2f/
|
|
255
|
+
├── engine/ # ⭐ 对外统一引擎接口
|
|
256
|
+
│ └── engine.py
|
|
257
|
+
│
|
|
258
|
+
├── modes/ # 内置模式
|
|
259
|
+
│ ├── book_mode.py
|
|
260
|
+
│ ├── patent_mode.py
|
|
261
|
+
│ ├── sop_mode.py
|
|
262
|
+
│ └── paper_mode.py
|
|
263
|
+
│
|
|
264
|
+
├── user_modes/ # 用户自定义模式
|
|
265
|
+
│ └── *.yaml
|
|
266
|
+
│
|
|
267
|
+
├── structure/ # 结构解析层
|
|
268
|
+
│ ├── semantic_parser.py
|
|
269
|
+
│ ├── book_parser.py
|
|
270
|
+
│ ├── patent_parser.py
|
|
271
|
+
│ └── sop_parser.py
|
|
272
|
+
│
|
|
273
|
+
├── renderers/ # 渲染适配层
|
|
274
|
+
│ ├── epub_renderer.py
|
|
275
|
+
│ ├── pdf_renderer.py
|
|
276
|
+
│ ├── docx_renderer.py
|
|
277
|
+
│ ├── json_renderer.py
|
|
278
|
+
│ ├── html_renderer.py
|
|
279
|
+
│ ├── xml_renderer.py
|
|
280
|
+
│ └── latex_renderer.py
|
|
281
|
+
│
|
|
282
|
+
├── converters/ # 格式转换层
|
|
283
|
+
│ ├── md_to_docx.py
|
|
284
|
+
│ ├── md_to_epub.py
|
|
285
|
+
│ ├── md_to_html.py
|
|
286
|
+
│ ├── md_to_json.py
|
|
287
|
+
│ ├── md_to_latex.py
|
|
288
|
+
│ ├── md_to_xml.py
|
|
289
|
+
│ ├── docx_to_md.py
|
|
290
|
+
│ ├── epub_to_md.py
|
|
291
|
+
│ ├── html_to_md.py
|
|
292
|
+
│ ├── json_to_md.py
|
|
293
|
+
│ ├── pdf_to_md.py
|
|
294
|
+
│ └── xml_to_md.py
|
|
295
|
+
│
|
|
296
|
+
└── ide/ # IDE 集成接口
|
|
297
|
+
├── api.py
|
|
298
|
+
└── preview_server.py
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## 扩展能力
|
|
302
|
+
|
|
303
|
+
### 语义校验引擎
|
|
304
|
+
- 专利是否缺少摘要
|
|
305
|
+
- 书籍是否缺版权页
|
|
306
|
+
- SOP 是否缺结论
|
|
307
|
+
- 论文是否缺参考文献
|
|
308
|
+
|
|
309
|
+
### 风格主题系统
|
|
310
|
+
企业可统一视觉风格:
|
|
311
|
+
|
|
312
|
+
```yaml
|
|
313
|
+
theme: blue_enterprise
|
|
314
|
+
font: Source Han Sans
|
|
315
|
+
cover_style: minimal
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### 增量构建能力
|
|
319
|
+
只更新修改过的章节,提高大书籍或长报告构建速度
|
|
320
|
+
|
|
321
|
+
### 结构化数据导出
|
|
322
|
+
SOP 报告可导出 JSON 供企业系统分析
|
|
323
|
+
|
|
324
|
+
### Mermaid 图表支持
|
|
325
|
+
自动渲染 Mermaid 图表为图片:
|
|
326
|
+
|
|
327
|
+
````markdown
|
|
328
|
+
```mermaid
|
|
329
|
+
graph TD
|
|
330
|
+
A[开始] --> B[处理]
|
|
331
|
+
B --> C[结束]
|
|
332
|
+
```
|
|
333
|
+
````
|
|
334
|
+
|
|
335
|
+
### 表格增强
|
|
336
|
+
支持复杂表格的渲染,包括对齐和样式:
|
|
337
|
+
|
|
338
|
+
```markdown
|
|
339
|
+
| 左对齐 | 居中 | 右对齐 |
|
|
340
|
+
| :--- | :---: | ---: |
|
|
341
|
+
| 内容 | 内容 | 内容 |
|
|
342
|
+
| 内容 | 内容 | 内容 |
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
## 故障排除
|
|
346
|
+
|
|
347
|
+
### 常见问题
|
|
348
|
+
|
|
349
|
+
#### 1. LaTeX 编译失败
|
|
350
|
+
|
|
351
|
+
**问题**:LaTeX 编译失败,提示缺少依赖。
|
|
352
|
+
|
|
353
|
+
**解决方案**:安装 LaTeX 发行版,如 TeX Live 或 MiKTeX。
|
|
354
|
+
|
|
355
|
+
#### 2. PDF 渲染失败
|
|
356
|
+
|
|
357
|
+
**问题**:PDF 渲染失败,提示缺少 WeasyPrint 依赖。
|
|
358
|
+
|
|
359
|
+
**解决方案**:安装 WeasyPrint:
|
|
360
|
+
```bash
|
|
361
|
+
pip install weasyprint
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
#### 3. EPUB 生成错误
|
|
365
|
+
|
|
366
|
+
**问题**:EPUB 生成失败,提示缺少 ebooklib 依赖。
|
|
367
|
+
|
|
368
|
+
**解决方案**:安装 ebooklib:
|
|
369
|
+
```bash
|
|
370
|
+
pip install ebooklib
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
#### 4. Mermaid 图表不显示
|
|
374
|
+
|
|
375
|
+
**问题**:Mermaid 图表在输出中不显示。
|
|
376
|
+
|
|
377
|
+
**解决方案**:确保网络连接正常且可以访问 mermaid.ink,Mermaid 图表需要通过该在线服务渲染。
|
|
378
|
+
|
|
379
|
+
### 错误代码
|
|
380
|
+
|
|
381
|
+
| 错误代码 | 描述 | 解决方案 |
|
|
382
|
+
| ---- | ----------------- | ------------------ |
|
|
383
|
+
| 1001 | 模式不存在 | 检查模式名称是否正确 |
|
|
384
|
+
| 1002 | 输出格式不支持 | 检查格式是否在支持列表中 |
|
|
385
|
+
| 1003 | 输入文件不存在 | 检查文件路径是否正确 |
|
|
386
|
+
| 1004 | 依赖项缺失 | 安装缺失的依赖项 |
|
|
387
|
+
| 1005 | 文档结构无效 | 检查文档结构是否符合模式要求 |
|
|
388
|
+
| 1006 | KDP 兼容性错误 | 修复 KDP 兼容性问题 |
|
|
389
|
+
| 1007 | LaTeX 编译失败 | 检查 LaTeX 语法和依赖项 |
|
|
390
|
+
| 1008 | PDF 渲染失败 | 检查 WeasyPrint 安装和配置 |
|
|
391
|
+
|
|
392
|
+
## 性能优化
|
|
393
|
+
|
|
394
|
+
### 大文档处理
|
|
395
|
+
|
|
396
|
+
对于大型文档(超过 100 页),建议使用以下优化:
|
|
397
|
+
|
|
398
|
+
1. **启用增量构建**:只更新修改过的章节
|
|
399
|
+
2. **使用分段处理**:将大文档拆分为多个小文档
|
|
400
|
+
3. **选择合适的输出格式**:对于预览,使用 HTML 格式
|
|
401
|
+
4. **限制图片分辨率**:减少图片大小以提高处理速度
|
|
402
|
+
|
|
403
|
+
### 批量处理
|
|
404
|
+
|
|
405
|
+
对于批量处理多个文档,建议:
|
|
406
|
+
|
|
407
|
+
1. **使用 `--batch` 参数**:启用批量处理模式
|
|
408
|
+
2. **设置合理的并发数**:避免系统资源耗尽
|
|
409
|
+
3. **使用日志记录**:跟踪处理进度和错误
|
|
410
|
+
|
|
411
|
+
## 许可证
|
|
412
|
+
MIT License
|
|
413
|
+
|
|
414
|
+
## 贡献
|
|
415
|
+
|
|
416
|
+
欢迎贡献代码、报告问题或提出建议!
|
|
417
|
+
|
|
418
|
+
## 联系方式
|
|
419
|
+
|
|
420
|
+
- 项目主页:<repository-url>
|
|
421
|
+
- 问题反馈:<repository-url>/issues
|
|
422
|
+
- 邮件:<contact-email>
|
|
423
|
+
|
|
424
|
+
---
|
|
425
|
+
|
|
426
|
+
**HOS-M2F** - 让专业文档生产更简单!
|
|
@@ -95,10 +95,28 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
95
95
|
# 填充表头
|
|
96
96
|
header_row = table.rows[0]
|
|
97
97
|
for i, cell_text in enumerate(header_cells):
|
|
98
|
-
header_row.cells[i]
|
|
98
|
+
cell = header_row.cells[i]
|
|
99
|
+
cell.text = cell_text
|
|
100
|
+
# 设置表头样式
|
|
101
|
+
for paragraph in cell.paragraphs:
|
|
102
|
+
for run in paragraph.runs:
|
|
103
|
+
run.bold = True
|
|
104
|
+
run.font.size = Pt(11)
|
|
105
|
+
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
99
106
|
|
|
100
|
-
#
|
|
107
|
+
# 跳过分隔线行,并解析对齐方式
|
|
108
|
+
alignments = []
|
|
101
109
|
if len(rows) > 1 and '---' in rows[1]:
|
|
110
|
+
# 解析对齐方式
|
|
111
|
+
alignment_row = rows[1]
|
|
112
|
+
alignment_cells = [cell.strip() for cell in alignment_row.split('|') if cell.strip()]
|
|
113
|
+
for cell in alignment_cells:
|
|
114
|
+
if cell.startswith(':') and cell.endswith(':'):
|
|
115
|
+
alignments.append(WD_ALIGN_PARAGRAPH.CENTER)
|
|
116
|
+
elif cell.endswith(':'):
|
|
117
|
+
alignments.append(WD_ALIGN_PARAGRAPH.RIGHT)
|
|
118
|
+
else:
|
|
119
|
+
alignments.append(WD_ALIGN_PARAGRAPH.LEFT)
|
|
102
120
|
data_rows = rows[2:]
|
|
103
121
|
else:
|
|
104
122
|
data_rows = rows[1:]
|
|
@@ -110,7 +128,12 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
110
128
|
new_row = table.add_row()
|
|
111
129
|
for i, cell_text in enumerate(cells):
|
|
112
130
|
if i < len(new_row.cells):
|
|
113
|
-
new_row.cells[i]
|
|
131
|
+
cell = new_row.cells[i]
|
|
132
|
+
cell.text = cell_text
|
|
133
|
+
# 设置对齐方式
|
|
134
|
+
if i < len(alignments):
|
|
135
|
+
for paragraph in cell.paragraphs:
|
|
136
|
+
paragraph.alignment = alignments[i]
|
|
114
137
|
except Exception as e:
|
|
115
138
|
# 如果解析失败,回退到简单处理
|
|
116
139
|
self.doc.add_paragraph('Table: ' + text[:100] + '...')
|
|
@@ -217,9 +240,31 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
217
240
|
|
|
218
241
|
def _render_mermaid(self, mermaid_code):
|
|
219
242
|
"""渲染Mermaid图表为图片"""
|
|
220
|
-
#
|
|
221
|
-
|
|
222
|
-
|
|
243
|
+
# 使用mermaid.ink API渲染Mermaid图表
|
|
244
|
+
try:
|
|
245
|
+
import requests
|
|
246
|
+
from io import BytesIO
|
|
247
|
+
import urllib.parse
|
|
248
|
+
|
|
249
|
+
# 编码Mermaid代码
|
|
250
|
+
encoded_code = urllib.parse.quote(mermaid_code)
|
|
251
|
+
|
|
252
|
+
# 构建API URL
|
|
253
|
+
url = f"https://mermaid.ink/img/{encoded_code}"
|
|
254
|
+
|
|
255
|
+
# 发送请求
|
|
256
|
+
response = requests.get(url, timeout=10)
|
|
257
|
+
|
|
258
|
+
if response.status_code == 200:
|
|
259
|
+
# 返回图片数据流
|
|
260
|
+
return BytesIO(response.content)
|
|
261
|
+
else:
|
|
262
|
+
# 如果API调用失败,返回None
|
|
263
|
+
return None
|
|
264
|
+
except Exception as e:
|
|
265
|
+
# 如果处理失败,返回None
|
|
266
|
+
print(f"Error rendering Mermaid chart: {e}")
|
|
267
|
+
return None
|
|
223
268
|
|
|
224
269
|
# 渲染Markdown
|
|
225
270
|
renderer = DOCXRenderer(doc)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Markdown到EPUB格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
from hos_m2f.renderers.epub_renderer import EPUBRenderer
|
|
6
|
+
from hos_m2f.structure.book_parser import BookParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MDToEPUBConverter(BaseConverter):
    """Converter that turns Markdown text into EPUB output."""

    # Option keys copied verbatim into parsed_content['book_metadata'].
    _METADATA_KEYS = (
        'title',
        'author',
        'language',
        'publisher',
        'publish_date',
        'description',
    )

    def __init__(self):
        """Create the EPUB renderer and the book parser used by convert()."""
        self.renderer = EPUBRenderer()
        self.book_parser = BookParser()

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to EPUB.

        Args:
            input_content: Markdown source text.
            options: Conversion options; an empty dict is used when None.

        Returns:
            The value produced by the EPUB renderer (annotated as bytes).
        """
        opts = {} if options is None else options

        # Parse, merge option-supplied metadata, then render.
        parsed = self.book_parser.parse(input_content, opts)
        parsed = self._enhance_parsed_content(parsed, opts)
        return self.renderer.render(parsed, opts)

    def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
        """Merge metadata and cover information from options into the parse result.

        The dict is modified in place and also returned.
        """
        for key in self._METADATA_KEYS:
            if key in options:
                parsed_content.setdefault('book_metadata', {})[key] = options[key]

        # A cover option is stored as an image descriptor.
        if 'cover' in options:
            parsed_content['cover'] = {'src': options['cover'], 'type': 'image'}

        return parsed_content

    def get_supported_formats(self) -> tuple:
        """Return the (input, output) format names handled by this converter."""
        return ('markdown', 'epub')
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Markdown到LaTeX格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
from hos_m2f.renderers.latex_renderer import LaTeXRenderer
|
|
6
|
+
from hos_m2f.structure.semantic_parser import SemanticParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MDToLaTeXConverter(BaseConverter):
    """Converter that turns Markdown text into LaTeX output."""

    # Option keys copied verbatim into parsed_content['metadata'].
    _METADATA_KEYS = ('title', 'author', 'date', 'abstract', 'keywords')

    def __init__(self):
        """Create the LaTeX renderer and the semantic parser used by convert()."""
        self.renderer = LaTeXRenderer()
        self.parser = SemanticParser()

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to LaTeX.

        Args:
            input_content: Markdown source text.
            options: Conversion options; an empty dict is used when None.

        Returns:
            The value produced by the LaTeX renderer (annotated as bytes).
        """
        opts = {} if options is None else options

        # The parser only receives the text; options are applied afterwards.
        parsed = self.parser.parse(input_content)
        parsed = self._enhance_parsed_content(parsed, opts)
        return self.renderer.render(parsed, opts)

    def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
        """Merge metadata and the document class from options into the parse result.

        The dict is modified in place and also returned.
        """
        for key in self._METADATA_KEYS:
            if key in options:
                parsed_content.setdefault('metadata', {})[key] = options[key]

        # The document class lives at the top level, not under 'metadata'.
        if 'document_class' in options:
            parsed_content['document_class'] = options['document_class']

        return parsed_content

    def get_supported_formats(self) -> tuple:
        """Return the (input, output) format names handled by this converter."""
        return ('markdown', 'latex')
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""PDF到Markdown格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
|
|
6
|
+
# PyPDF2 is imported lazily by _check_pypdf2(). These module-level names hold
# the outcome: the availability flag and, once imported, the PdfReader class.
pypdf2_available = False
PdfReader = None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _check_pypdf2():
    """Try to import PyPDF2 and record the outcome in the module globals.

    On success PdfReader is bound and pypdf2_available becomes True; on
    ImportError a warning is printed and both globals are left unchanged.
    Nothing happens once the import has already succeeded.
    """
    global pypdf2_available, PdfReader
    if pypdf2_available:
        return

    try:
        from PyPDF2 import PdfReader as reader_cls
    except ImportError as e:
        print(f"Warning: PyPDF2 not available: {e}")
        print("PDF to Markdown conversion is disabled.")
    else:
        PdfReader = reader_cls
        pypdf2_available = True
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PDFToMDConverter(BaseConverter):
    """Convert PDF bytes to Markdown using PyPDF2 text extraction and simple heuristics."""

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert a PDF document to UTF-8 encoded Markdown.

        Args:
            input_content: Raw bytes of the PDF file.
            options: Conversion options; an empty dict is used when None.

        Returns:
            The generated Markdown encoded as UTF-8 bytes.

        Raises:
            ImportError: If PyPDF2 could not be imported.
        """
        _check_pypdf2()
        if not pypdf2_available:
            raise ImportError("PyPDF2 is not available. PDF to Markdown conversion is disabled.")

        if options is None:
            options = {}

        return self._parse_pdf(input_content, options).encode('utf-8')

    def _parse_pdf(self, pdf_content: bytes, options: Dict[str, Any]) -> str:
        """Extract the text of every page and convert it to Markdown.

        Pages whose extracted text is empty are skipped; the remaining page
        texts are joined with a blank line between them.
        """
        import io

        pdf_reader = PdfReader(io.BytesIO(pdf_content))

        text_content = []
        for page in pdf_reader.pages:
            text = page.extract_text()
            if text:
                text_content.append(text)

        full_text = '\n\n'.join(text_content)
        return self._text_to_markdown(full_text, options)

    def _text_to_markdown(self, text: str, options: Dict[str, Any]) -> str:
        """Apply heading, list and bold heuristics to plain text.

        Heuristics, in order:
          * a line starting with dotted numbering ("1.", "1.2", "1.2.3.")
            becomes a heading whose depth is the number of components
            (deeper than 6 is left as ordinary text);
          * a short all-uppercase line becomes a level-2 heading;
          * "- item" / "* item" lines are normalised to "* item";
          * words of three or more capital letters are wrapped in "**".
        """
        import re

        # The lookahead keeps the "digits followed by a dot" trigger, while the
        # match itself spans the whole numbering so its depth can be counted.
        numbering_re = re.compile(r'^(?=\d+\.)\d+(?:\.\d+)*')

        markdown_lines = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                markdown_lines.append('')
                continue

            numbering = numbering_re.match(line)
            if numbering:
                level = len(numbering.group(0).split('.'))
                if level <= 6:
                    # Double-quoted f-string: reusing the same quote inside the
                    # braces is a SyntaxError before Python 3.12.
                    markdown_lines.append(f"{'#' * level} {line}")
                    continue

            if line.isupper() and len(line) < 50:
                markdown_lines.append(f'## {line}')
                continue

            markdown_lines.append(line)

        markdown_content = '\n'.join(markdown_lines)

        # Normalise list markers. [ \t] is used instead of \s because, under
        # MULTILINE, \s also matches newlines and would swallow the blank
        # lines in front of a list item.
        markdown_content = re.sub(
            r'^[ \t]*[-*][ \t](.*)$', r'* \1', markdown_content, flags=re.MULTILINE
        )

        # Treat runs of capitals as emphasised words.
        markdown_content = re.sub(r'\b([A-Z]{3,})\b', r'**\1**', markdown_content)

        return markdown_content

    def get_supported_formats(self) -> tuple:
        """Return the (input, output) format names handled by this converter."""
        # NOTE(review): the Markdown-producing side is named 'md' here, while the
        # md_to_* converters name it 'markdown' - confirm which one the registry expects.
        return ('pdf', 'md')
|
|
@@ -19,8 +19,11 @@ hos_m2f/converters/md_to_docx.py
|
|
|
19
19
|
hos_m2f/converters/md_to_epub.py
|
|
20
20
|
hos_m2f/converters/md_to_html.py
|
|
21
21
|
hos_m2f/converters/md_to_json.py
|
|
22
|
+
hos_m2f/converters/md_to_latex.py
|
|
22
23
|
hos_m2f/converters/md_to_xml.py
|
|
24
|
+
hos_m2f/converters/pdf_to_md.py
|
|
23
25
|
hos_m2f/converters/xml_to_md.py
|
|
24
26
|
tests/__init__.py
|
|
25
27
|
tests/test_converters.py
|
|
28
|
+
tests/test_latex.py
|
|
26
29
|
tests/test_modes.py
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="hos-m2f",
|
|
5
|
-
version="0.5.
|
|
5
|
+
version="0.5.5",
|
|
6
6
|
description="HOS-M2F: Markdown to Industry Standard Format Compiler Engine",
|
|
7
7
|
long_description="""HOS-M2F is a powerful compiler engine that converts Markdown files to various industry standard formats.
|
|
8
8
|
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""测试LaTeX渲染器和转换器"""
|
|
2
|
+
|
|
3
|
+
import unittest
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from hos_m2f.renderers.latex_renderer import LaTeXRenderer
|
|
7
|
+
from hos_m2f.converters.md_to_latex import MDToLaTeXConverter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestLaTeX(unittest.TestCase):
|
|
11
|
+
"""测试LaTeX渲染器和转换器"""
|
|
12
|
+
|
|
13
|
+
def setUp(self):
|
|
14
|
+
"""设置测试环境"""
|
|
15
|
+
# 创建测试用的Markdown内容
|
|
16
|
+
self.test_content = """
|
|
17
|
+
# 测试文档
|
|
18
|
+
|
|
19
|
+
## 摘要
|
|
20
|
+
|
|
21
|
+
这是一个测试文档,用于测试LaTeX渲染功能。
|
|
22
|
+
|
|
23
|
+
## 引言
|
|
24
|
+
|
|
25
|
+
这是引言章节的内容。
|
|
26
|
+
|
|
27
|
+
### 背景
|
|
28
|
+
|
|
29
|
+
这是背景部分的内容。
|
|
30
|
+
|
|
31
|
+
## 方法
|
|
32
|
+
|
|
33
|
+
这是方法章节的内容。
|
|
34
|
+
|
|
35
|
+
### 实验设计
|
|
36
|
+
|
|
37
|
+
这是实验设计部分的内容。
|
|
38
|
+
|
|
39
|
+
## 结果
|
|
40
|
+
|
|
41
|
+
这是结果章节的内容。
|
|
42
|
+
|
|
43
|
+
### 数据表格
|
|
44
|
+
|
|
45
|
+
| 列1 | 列2 | 列3 |
|
|
46
|
+
| --- | --- | --- |
|
|
47
|
+
| 行1 | 行1 | 行1 |
|
|
48
|
+
| 行2 | 行2 | 行2 |
|
|
49
|
+
|
|
50
|
+
### 代码示例
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
print("Hello, world!")
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## 讨论
|
|
57
|
+
|
|
58
|
+
这是讨论章节的内容。
|
|
59
|
+
|
|
60
|
+
## 结论
|
|
61
|
+
|
|
62
|
+
这是结论章节的内容。
|
|
63
|
+
|
|
64
|
+
## 参考文献
|
|
65
|
+
|
|
66
|
+
[1] 参考文献1
|
|
67
|
+
[2] 参考文献2
|
|
68
|
+
""".strip()
|
|
69
|
+
|
|
70
|
+
# 创建测试用的结构化内容
|
|
71
|
+
self.structured_content = {
|
|
72
|
+
"metadata": {
|
|
73
|
+
"title": "测试文档",
|
|
74
|
+
"author": "测试作者",
|
|
75
|
+
"date": "2023-01-01",
|
|
76
|
+
"abstract": "这是一个测试文档,用于测试LaTeX渲染功能。",
|
|
77
|
+
"keywords": ["测试", "LaTeX", "渲染"]
|
|
78
|
+
},
|
|
79
|
+
"structure": [
|
|
80
|
+
{"level": 1, "title": "测试文档", "line_number": 1},
|
|
81
|
+
{"level": 2, "title": "摘要", "line_number": 3},
|
|
82
|
+
{"level": 2, "title": "引言", "line_number": 7},
|
|
83
|
+
{"level": 3, "title": "背景", "line_number": 9},
|
|
84
|
+
{"level": 2, "title": "方法", "line_number": 13},
|
|
85
|
+
{"level": 3, "title": "实验设计", "line_number": 15},
|
|
86
|
+
{"level": 2, "title": "结果", "line_number": 19},
|
|
87
|
+
{"level": 3, "title": "数据表格", "line_number": 21},
|
|
88
|
+
{"level": 3, "title": "代码示例", "line_number": 29},
|
|
89
|
+
{"level": 2, "title": "讨论", "line_number": 35},
|
|
90
|
+
{"level": 2, "title": "结论", "line_number": 39},
|
|
91
|
+
{"level": 2, "title": "参考文献", "line_number": 43}
|
|
92
|
+
],
|
|
93
|
+
"chapters": [
|
|
94
|
+
{"title": "测试文档", "content": "", "level": 1, "start_line": 1, "end_line": 1},
|
|
95
|
+
{"title": "摘要", "content": "这是一个测试文档,用于测试LaTeX渲染功能。", "level": 2, "start_line": 3, "end_line": 5},
|
|
96
|
+
{"title": "引言", "content": "这是引言章节的内容。", "level": 2, "start_line": 7, "end_line": 8},
|
|
97
|
+
{"title": "背景", "content": "这是背景部分的内容。", "level": 3, "start_line": 9, "end_line": 11},
|
|
98
|
+
{"title": "方法", "content": "这是方法章节的内容。", "level": 2, "start_line": 13, "end_line": 14},
|
|
99
|
+
{"title": "实验设计", "content": "这是实验设计部分的内容。", "level": 3, "start_line": 15, "end_line": 17},
|
|
100
|
+
{"title": "结果", "content": "这是结果章节的内容。", "level": 2, "start_line": 19, "end_line": 20},
|
|
101
|
+
{"title": "数据表格", "content": "| 列1 | 列2 | 列3 |\n| --- | --- | --- |\n| 行1 | 行1 | 行1 |\n| 行2 | 行2 | 行2 |", "level": 3, "start_line": 21, "end_line": 28},
|
|
102
|
+
{"title": "代码示例", "content": "```python\nprint(\"Hello, world!\")\n```", "level": 3, "start_line": 29, "end_line": 34},
|
|
103
|
+
{"title": "讨论", "content": "这是讨论章节的内容。", "level": 2, "start_line": 35, "end_line": 37},
|
|
104
|
+
{"title": "结论", "content": "这是结论章节的内容。", "level": 2, "start_line": 39, "end_line": 41},
|
|
105
|
+
{"title": "参考文献", "content": "[1] 参考文献1\n[2] 参考文献2", "level": 2, "start_line": 43, "end_line": 46}
|
|
106
|
+
],
|
|
107
|
+
"references": [
|
|
108
|
+
{"text": "参考文献1"},
|
|
109
|
+
{"text": "参考文献2"}
|
|
110
|
+
]
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
def test_latex_renderer(self):
|
|
114
|
+
"""测试LaTeX渲染器"""
|
|
115
|
+
renderer = LaTeXRenderer()
|
|
116
|
+
|
|
117
|
+
# 测试渲染功能
|
|
118
|
+
latex_content = renderer.render(self.structured_content)
|
|
119
|
+
self.assertIsInstance(latex_content, bytes)
|
|
120
|
+
self.assertGreater(len(latex_content), 0)
|
|
121
|
+
|
|
122
|
+
# 保存为临时文件,以便手动检查
|
|
123
|
+
with tempfile.NamedTemporaryFile(suffix=".tex", delete=False) as tmp:
|
|
124
|
+
tmp.write(latex_content)
|
|
125
|
+
tmp_path = tmp.name
|
|
126
|
+
|
|
127
|
+
try:
|
|
128
|
+
# 验证文件存在且大小大于0
|
|
129
|
+
self.assertTrue(os.path.exists(tmp_path))
|
|
130
|
+
self.assertGreater(os.path.getsize(tmp_path), 0)
|
|
131
|
+
finally:
|
|
132
|
+
# 清理临时文件
|
|
133
|
+
if os.path.exists(tmp_path):
|
|
134
|
+
os.unlink(tmp_path)
|
|
135
|
+
|
|
136
|
+
def test_md_to_latex_converter(self):
|
|
137
|
+
"""测试Markdown到LaTeX转换器"""
|
|
138
|
+
converter = MDToLaTeXConverter()
|
|
139
|
+
|
|
140
|
+
# 测试转换功能
|
|
141
|
+
result = converter.convert(self.test_content)
|
|
142
|
+
self.assertIsInstance(result, bytes)
|
|
143
|
+
self.assertGreater(len(result), 0)
|
|
144
|
+
|
|
145
|
+
# 保存为临时文件,以便手动检查
|
|
146
|
+
with tempfile.NamedTemporaryFile(suffix=".tex", delete=False) as tmp:
|
|
147
|
+
tmp.write(result)
|
|
148
|
+
tmp_path = tmp.name
|
|
149
|
+
|
|
150
|
+
try:
|
|
151
|
+
# 验证文件存在且大小大于0
|
|
152
|
+
self.assertTrue(os.path.exists(tmp_path))
|
|
153
|
+
self.assertGreater(os.path.getsize(tmp_path), 0)
|
|
154
|
+
finally:
|
|
155
|
+
# 清理临时文件
|
|
156
|
+
if os.path.exists(tmp_path):
|
|
157
|
+
os.unlink(tmp_path)
|
|
158
|
+
|
|
159
|
+
def test_latex_with_options(self):
    """Test rendering and conversion when explicit LaTeX options are passed."""
    latex_renderer = LaTeXRenderer()
    md_converter = MDToLaTeXConverter()

    # Options shared by the renderer call and the converter call.
    render_options = dict(
        document_class="article",
        document_options="a4paper, 12pt",
        table_of_contents=True,
    )

    # Rendering with options must yield non-empty bytes.
    rendered = latex_renderer.render(self.structured_content, render_options)
    self.assertIsInstance(rendered, bytes)
    self.assertGreater(len(rendered), 0)

    # Conversion with options must yield non-empty bytes.
    converted = md_converter.convert(self.test_content, render_options)
    self.assertIsInstance(converted, bytes)
    self.assertGreater(len(converted), 0)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
hos_m2f-0.5.4/README.md
DELETED
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
# HOS-M2F v1.0
|
|
2
|
-
|
|
3
|
-
## 多场景结构化内容编译引擎
|
|
4
|
-
|
|
5
|
-
HOS-M2F = 面向 AI 写作与专业文档生产的 **内容编译器引擎**
|
|
6
|
-
|
|
7
|
-
Markdown 是源码,不同 Mode 是"目标行业标准"
|
|
8
|
-
|
|
9
|
-
## 核心功能
|
|
10
|
-
|
|
11
|
-
### 内置官方模式
|
|
12
|
-
|
|
13
|
-
| Mode | 领域 | 核心产出 |
|
|
14
|
-
| --------- | ----- | -------------- |
|
|
15
|
-
| `paper` | 技术文档 | 报告 / 论文排版 |
|
|
16
|
-
| `patent` | 专利申请 | 合规专利文件 |
|
|
17
|
-
| `book` 📚 | 电子书出版 | EPUB / KDP |
|
|
18
|
-
| `sop` 🛠 | 运维与实施 | 巡检 / 报错 / 实施报告 |
|
|
19
|
-
|
|
20
|
-
### 输出支持
|
|
21
|
-
|
|
22
|
-
| 平台 | 格式 |
|
|
23
|
-
| ---------- | ----------- |
|
|
24
|
-
| Amazon KDP | EPUB / DOCX |
|
|
25
|
-
| Kindle | MOBI / AZW3 |
|
|
26
|
-
| 通用阅读器 | EPUB 3 |
|
|
27
|
-
| 印刷 | Print PDF |
|
|
28
|
-
| 企业系统 | JSON / HTML |
|
|
29
|
-
|
|
30
|
-
## 安装
|
|
31
|
-
|
|
32
|
-
```bash
|
|
33
|
-
pip install -e .
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## 使用方法
|
|
37
|
-
|
|
38
|
-
### CLI 命令
|
|
39
|
-
|
|
40
|
-
```bash
|
|
41
|
-
# 构建
|
|
42
|
-
HOS_M2F build input.md --mode book
|
|
43
|
-
HOS_M2F build report.md --mode sop --format pdf
|
|
44
|
-
|
|
45
|
-
# 校验
|
|
46
|
-
HOS_M2F check book.md --mode book --platform kdp
|
|
47
|
-
HOS_M2F check report.md --mode sop
|
|
48
|
-
|
|
49
|
-
# 批量
|
|
50
|
-
HOS_M2F build ./reports/ --mode sop --batch
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
### Python SDK
|
|
54
|
-
|
|
55
|
-
```python
|
|
56
|
-
from hos_m2f import Engine
|
|
57
|
-
|
|
58
|
-
engine = Engine()
|
|
59
|
-
|
|
60
|
-
result = engine.build(
|
|
61
|
-
content=markdown_text,
|
|
62
|
-
mode="book",
|
|
63
|
-
output_format="epub",
|
|
64
|
-
options={"platform": "kdp"}
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
with open("output.epub", "wb") as f:
|
|
68
|
-
f.write(result.binary)
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
## 项目结构
|
|
72
|
-
|
|
73
|
-
```
|
|
74
|
-
hos_m2f/
|
|
75
|
-
├── engine/ # ⭐ 对外统一引擎接口
|
|
76
|
-
│ └── engine.py
|
|
77
|
-
│
|
|
78
|
-
├── modes/ # 内置模式
|
|
79
|
-
│ ├── book_mode.py
|
|
80
|
-
│ ├── patent_mode.py
|
|
81
|
-
│ ├── sop_mode.py
|
|
82
|
-
│ └── paper_mode.py
|
|
83
|
-
│
|
|
84
|
-
├── user_modes/ # 用户自定义模式
|
|
85
|
-
│ └── *.yaml
|
|
86
|
-
│
|
|
87
|
-
├── structure/ # 结构解析层
|
|
88
|
-
│ ├── semantic_parser.py
|
|
89
|
-
│ ├── book_parser.py
|
|
90
|
-
│ ├── patent_parser.py
|
|
91
|
-
│ └── sop_parser.py
|
|
92
|
-
│
|
|
93
|
-
├── renderers/ # 渲染适配层
|
|
94
|
-
│ ├── epub_renderer.py
|
|
95
|
-
│ ├── pdf_renderer.py
|
|
96
|
-
│ ├── docx_renderer.py
|
|
97
|
-
│ └── json_renderer.py
|
|
98
|
-
│
|
|
99
|
-
└── ide/ # IDE 集成接口
|
|
100
|
-
├── api.py
|
|
101
|
-
└── preview_server.py
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## 扩展能力
|
|
105
|
-
|
|
106
|
-
### 语义校验引擎
|
|
107
|
-
- 专利是否缺少摘要
|
|
108
|
-
- 书籍是否缺版权页
|
|
109
|
-
- SOP 是否缺结论
|
|
110
|
-
|
|
111
|
-
### 风格主题系统
|
|
112
|
-
企业可统一视觉风格:
|
|
113
|
-
|
|
114
|
-
```yaml
|
|
115
|
-
theme: blue_enterprise
|
|
116
|
-
font: Source Han Sans
|
|
117
|
-
cover_style: minimal
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
### 增量构建能力
|
|
121
|
-
只更新修改过的章节,提高大书籍或长报告构建速度
|
|
122
|
-
|
|
123
|
-
### 结构化数据导出
|
|
124
|
-
SOP 报告可导出 JSON 供企业系统分析
|
|
125
|
-
|
|
126
|
-
### AI 协同接口(未来扩展)
|
|
127
|
-
让大模型直接输出符合某 Mode 的结构,而不是自由 Markdown
|
|
128
|
-
|
|
129
|
-
## 许可证
|
|
130
|
-
MIT License
|
|
hos_m2f-0.5.4/hos_m2f/converters/md_to_epub.py
DELETED
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
"""Markdown到EPUB格式转换器"""
|
|
2
|
-
|
|
3
|
-
from typing import Any, Optional, Dict
|
|
4
|
-
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
-
import ebooklib
|
|
6
|
-
from ebooklib import epub
|
|
7
|
-
import mistune
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class MDToEPUBConverter(BaseConverter):
    """Converter that turns Markdown text into a single-chapter EPUB book."""

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown into EPUB binary data.

        Args:
            input_content: Markdown source text.
            options: Optional conversion settings. Recognized keys:
                ``title`` (book title, default ``'Untitled'``),
                ``language`` (default ``'zh'``), ``author`` (default
                ``'Unknown'``), ``identifier`` (default ``'id12345'``)
                and ``cover`` (image content stored as
                ``images/cover.jpg``).

        Returns:
            bytes: The binary content of the generated EPUB file.
        """
        # Function-scope imports keep this method self-contained.
        import html
        import io

        if options is None:
            options = {}

        book_title = options.get('title', 'Untitled')
        language = options.get('language', 'zh')

        # Create the book and set its metadata.
        book = epub.EpubBook()
        book.set_identifier(options.get('identifier', 'id12345'))
        book.set_title(book_title)
        book.set_language(language)
        book.add_author(options.get('author', 'Unknown'))

        # Add the cover. set_cover() expects the raw image content and
        # registers the cover item itself, so the image must not be wrapped
        # in (or separately added as) an EpubItem.
        if 'cover' in options:
            book.set_cover('images/cover.jpg', options['cover'])

        # Render the Markdown source to HTML.
        markdown = mistune.create_markdown()
        html_content = markdown(input_content)

        # Escape the title before embedding it in markup so characters
        # such as "&" or "<" cannot break the chapter document.
        safe_title = html.escape(str(book_title))

        # Build the single chapter; its language follows the book language.
        chapter = epub.EpubHtml(
            title=options.get('title', 'Chapter 1'),
            file_name='chapter1.xhtml',
            lang=language
        )
        chapter.content = f'''
<!DOCTYPE html>
<html>
<head>
<title>{safe_title}</title>
<meta charset="utf-8" />
</head>
<body>
<h1>{safe_title}</h1>
{html_content}
</body>
</html>
'''

        book.add_item(chapter)

        # Table of contents.
        book.toc = [chapter]

        # Navigation files.
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # Reading order.
        book.spine = ['nav', chapter]

        # Serialize the book into memory and return the raw bytes.
        output = io.BytesIO()
        epub.write_epub(output, book, {})
        return output.getvalue()

    def get_supported_formats(self) -> tuple:
        """Return the (source, target) format pair handled by this converter."""
        return ('markdown', 'epub')
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|