docstudio 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docstudio/__init__.py +35 -0
- docstudio/assistant.py +73 -0
- docstudio/cli.py +139 -0
- docstudio/core.py +190 -0
- docstudio/export.py +205 -0
- docstudio/ingest.py +205 -0
- docstudio/latex.py +124 -0
- docstudio/llm.py +106 -0
- docstudio/templates.py +61 -0
- docstudio/tools.py +107 -0
- docstudio-0.2.0.dist-info/METADATA +223 -0
- docstudio-0.2.0.dist-info/RECORD +15 -0
- docstudio-0.2.0.dist-info/WHEEL +4 -0
- docstudio-0.2.0.dist-info/entry_points.txt +2 -0
- docstudio-0.2.0.dist-info/licenses/LICENSE +21 -0
docstudio/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""DocumentStudio — bidirectional document conversion library.
|
|
2
|
+
|
|
3
|
+
Reverse (X -> Markdown): PDF, Word, PPT, Excel, EPUB, HTML, CSV/TSV, JSON,
|
|
4
|
+
ZIP, images (OCR / VLM). Optionally backed by
|
|
5
|
+
Microsoft `markitdown` when installed.
|
|
6
|
+
Forward (Markdown -> X): HTML, PDF, Word(.docx), LaTeX, EPUB, Excel(.xlsx),
|
|
7
|
+
plain text. High-fidelity export is the part
|
|
8
|
+
markitdown does NOT do.
|
|
9
|
+
AI assistant : polish, translate, summarise, expand, continue,
|
|
10
|
+
grammar, formalise, titles, outline, fix-LaTeX,
|
|
11
|
+
free-form instructions.
|
|
12
|
+
Toolbox : table of contents, merge PDFs, extract images.
|
|
13
|
+
Templates : academic, techdoc, minutes, readme, weekly, blog.
|
|
14
|
+
|
|
15
|
+
Quick start
|
|
16
|
+
-----------
|
|
17
|
+
from docstudio import DocumentStudio
|
|
18
|
+
ds = DocumentStudio()
|
|
19
|
+
|
|
20
|
+
md = ds.to_markdown("report.pdf") # anything -> Markdown
|
|
21
|
+
ds.convert("paper.md", to="pdf", out="paper.pdf") # Markdown -> anything
|
|
22
|
+
|
|
23
|
+
ds.generate_toc(md) # toolbox
|
|
24
|
+
ds.merge_pdfs(["a.pdf", "b.pdf"], "all.pdf")
|
|
25
|
+
body = ds.template("academic") # template library
|
|
26
|
+
|
|
27
|
+
from docstudio.llm import LLM # AI assistant
|
|
28
|
+
ds = DocumentStudio(llm=LLM(base_url="...", api_key="...", model="..."))
|
|
29
|
+
ds.assist(md, action="polish")
|
|
30
|
+
ds.assist(md, instruction="把所有表格改成要点列表")
|
|
31
|
+
"""
|
|
32
|
+
# Re-export the core entry points so callers can write
# `from docstudio import DocumentStudio`.
from .core import DocumentStudio, ConversionError, registry

# Names exported by `from docstudio import *`.
__all__ = ["DocumentStudio", "ConversionError", "registry"]
# Version string of this package release.
__version__ = "0.2.0"
|
docstudio/assistant.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""AI document assistant — LLM-powered operations on Markdown / plain text.
|
|
2
|
+
|
|
3
|
+
Mirrors the *AI Assistant* of the Document Studio web app: a fixed set of
|
|
4
|
+
one-shot actions (polish, translate, summarise, expand, ...) plus a free-form
|
|
5
|
+
``instruction``. Every action is a single chat completion against whatever
|
|
6
|
+
OpenAI-compatible endpoint the :class:`~docstudio.llm.LLM` is configured for.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
# action key -> system prompt (kept in sync with the web app's AI_ACTS).
# Each value is sent verbatim as the system message of a single chat
# completion (see `assist`), so the prompt text below is runtime data.
ACTIONS = {
    "polish": "你是中文技术写作助手。润色下面的 Markdown:提升流畅度、准确性与可读性,"
              "规范标点与格式,严格保留原意与全部信息,不新增不删减。只输出 Markdown,不要解释。",
    "to_en": "Translate the following Markdown into fluent, natural English. Preserve all "
             "Markdown structure (headings, lists, tables, math $...$, code). Output only the "
             "translated Markdown.",
    "to_zh": "把下面的 Markdown 翻译成自然流畅的简体中文,保留所有 Markdown 结构"
             "(标题、列表、表格、公式 $...$、代码)。只输出翻译后的 Markdown。",
    "summary": "为下面的内容写一段简洁、准确的摘要(约150-250字)。只输出摘要的 Markdown 段落,不要解释。",
    "expand": "扩展并丰富下面的内容:补充细节、例子与解释,保持原结构与风格,不偏离主题、不编造事实。"
              "只输出 Markdown。",
    "condense": "精炼下面的内容:去除冗余,保留要点与关键信息,保持 Markdown 结构。只输出 Markdown。",
    "continue": "延续下面文档的主题、风格与结构,自然地继续往下写 1-3 段。只输出新增的 Markdown 内容,"
                "不要重复已有内容。",
    "grammar": "修正下面文档中的语法、拼写与标点错误,不改变原意与写作风格。只输出修正后的 Markdown。",
    "formal": "把下面的内容改写为更正式、专业的书面语,保留全部信息与 Markdown 结构。只输出 Markdown。",
    "titles": "为下面的文档给出 5 个高质量的标题建议,用 Markdown 无序列表呈现。只输出列表。",
    "outline": "根据下面的内容或主题,生成一个结构化的 Markdown 多级标题大纲。只输出大纲。",
    "fix_latex": "修正下面 LaTeX / Markdown 数学公式中的语法错误(缺失的 $、未配对的花括号、"
                 "拼错的命令等),不改变实际内容与结构。只输出修正后的内容。",
}

# System prompt used when the caller supplies only a free-form instruction
# (no predefined action key).
_CUSTOM_SYS = ("你是专业文档编辑助手。严格按用户指令处理下面的 Markdown 文档/文本。"
               "只输出处理后的 Markdown 结果,不要解释、不要整篇代码围栏。")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def list_actions():
    """List every predefined action key, in the order ``ACTIONS`` declares them."""
    return [key for key in ACTIONS]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _strip_fence(s: str) -> str:
    """Remove a code fence that wraps the *whole* LLM reply, if there is one.

    Models often wrap their answer in ```` ```markdown ... ``` ```` even when
    told not to. Only such a wrapper is removed:

    * an opener tagged ``markdown`` / ``md`` is always dropped, together with
      a closing fence at the very end when present (a truncated reply may
      lack it);
    * a bare ```` ``` ```` opener is dropped only when a closing fence ends the
      reply, i.e. the reply really is wrapped;
    * anything else is returned untouched (apart from outer whitespace), so a
      reply that legitimately starts with a ```` ```python ```` block or ends
      with a code block keeps balanced fences.

    Returns the unwrapped text with surrounding whitespace stripped.
    """
    s = s.strip()
    # The opener must be alone on its line; "```python" or "```md5" do not match.
    opener = re.match(r"```(markdown|md)?[ \t]*(?:\r?\n|\Z)", s, flags=re.I)
    if opener is None:
        return s
    body = s[opener.end():]
    closer = re.search(r"\r?\n?```\s*\Z", body)
    if closer is not None:
        return body[:closer.start()].strip()
    if opener.group(1):
        # Explicit markdown wrapper whose closing fence is missing.
        return body.strip()
    # Bare opener without a matching closer: it belongs to the content.
    return s
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def assist(llm, text: str, action: Optional[str] = None,
           instruction: Optional[str] = None) -> str:
    """Apply one AI assistant operation to ``text`` and return the model's reply.

    Parameters
    ----------
    llm:
        Object exposing ``chat(system, user)``; must not be ``None``.
    text:
        Markdown / plain text to process. ``None`` is treated as empty.
    action:
        Key of :data:`ACTIONS` selecting a predefined system prompt.
    instruction:
        Free-form instruction. Used alone it runs under the generic editing
        prompt; combined with ``action`` it is prepended to the user message.

    Raises
    ------
    RuntimeError
        If ``llm`` is ``None``.
    ValueError
        If ``action`` is unknown, or neither ``action`` nor ``instruction``
        is given.
    """
    if llm is None:
        raise RuntimeError("assist() needs an LLM — create DocumentStudio(llm=LLM(...)).")
    document = text or ""

    # Free-form mode: no predefined action, the instruction drives everything.
    if not action:
        if not instruction:
            raise ValueError("provide either action= or instruction=")
        message = "指令:%s\n\n文档:\n%s" % (instruction, document)
        return _strip_fence(llm.chat(_CUSTOM_SYS, message))

    if action not in ACTIONS:
        raise ValueError("unknown action %r; choose from: %s"
                         % (action, ", ".join(ACTIONS)))

    preface = ("指令:%s\n\n" % instruction) if instruction else ""
    if document.strip():
        payload = "文档:\n" + document
    else:
        # Tell the model explicitly that there is no document body.
        payload = "(文档为空)"
    return _strip_fence(llm.chat(ACTIONS[action], preface + payload))
|
docstudio/cli.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Command-line interface.
|
|
2
|
+
|
|
3
|
+
docstudio report.pdf # -> report.md (X -> Markdown)
|
|
4
|
+
docstudio report.pdf -o out.md
|
|
5
|
+
cat report.pdf | docstudio # stdin -> stdout (Markdown)
|
|
6
|
+
docstudio paper.md --to pdf -o paper.pdf # Markdown -> anything
|
|
7
|
+
docstudio scan.pdf --to docx # PDF -> md -> docx
|
|
8
|
+
docstudio --list-formats
|
|
9
|
+
|
|
10
|
+
docstudio paper.md --toc -o paper.md # insert a table of contents
|
|
11
|
+
docstudio notes.md --assist polish -o clean.md # AI assistant
|
|
12
|
+
docstudio notes.md --instruction "翻译成英文" -o en.md
|
|
13
|
+
docstudio --merge a.pdf b.pdf -o all.pdf # merge PDFs
|
|
14
|
+
docstudio report.pdf --extract-images ./imgs
|
|
15
|
+
docstudio --template academic # print a built-in template
|
|
16
|
+
docstudio --list-templates
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import sys
|
|
22
|
+
import tempfile
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from .core import DocumentStudio, ConversionError
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _sniff_suffix(data: bytes) -> str:
    """Guess a file suffix for bytes piped in on stdin.

    Format detection elsewhere is purely extension based, so stdin content
    needs a meaningful suffix before it can be converted. Well-known magic
    numbers are checked first; anything that decodes as UTF-8 is treated as
    HTML (when it starts with an HTML prologue) or Markdown. Undecodable
    binary data falls back to ``".bin"``.
    """
    import io
    import zipfile

    if data.startswith(b"%PDF"):
        return ".pdf"
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return ".png"
    if data.startswith(b"\xff\xd8\xff"):
        return ".jpg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return ".gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return ".webp"
    if data.startswith(b"PK\x03\x04"):
        # Office documents and EPUB are ZIP containers; tell them apart by
        # their characteristic member names.
        try:
            with zipfile.ZipFile(io.BytesIO(data)) as archive:
                names = set(archive.namelist())
        except zipfile.BadZipFile:
            return ".zip"
        if "word/document.xml" in names:
            return ".docx"
        if "xl/workbook.xml" in names:
            return ".xlsx"
        if "ppt/presentation.xml" in names:
            return ".pptx"
        if "META-INF/container.xml" in names:
            return ".epub"
        return ".zip"
    try:
        head = data.decode("utf-8").lstrip()[:256].lower()
    except UnicodeDecodeError:
        return ".bin"
    if head.startswith(("<!doctype html", "<html")):
        return ".html"
    return ".md"


def main(argv=None) -> int:
    """Run the ``docstudio`` command-line interface.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        Process exit status: 0 on success, 1 when a conversion, lookup or
        file-system error was reported on stderr. Argument errors exit via
        ``argparse`` as usual.

    Notes
    -----
    When no input file is given the data is read from stdin and spooled to a
    temporary file whose suffix is sniffed from the content. All temporary
    files created here are removed before returning.
    """
    from .core import TARGET_EXT

    p = argparse.ArgumentParser(prog="docstudio",
                                description="Bidirectional document conversion (Markdown-centric).")
    p.add_argument("input", nargs="?", help="input file (omit to read from stdin)")
    p.add_argument("-o", "--output", help="output file (default: stdout for md, alongside input otherwise)")
    p.add_argument("--to", help="target format: html, pdf, docx, latex, epub, xlsx, text (default: markdown)")
    p.add_argument("--no-markitdown", action="store_true", help="never delegate reverse conversion to markitdown")
    p.add_argument("--vlm-model", help="vision model id for image / scanned-PDF recognition")
    p.add_argument("--model", help="text model id for the AI assistant (required for --assist)")
    p.add_argument("--base-url", default="",
                   help="OpenAI-compatible endpoint, e.g. https://api.openai.com (required for AI ops)")
    p.add_argument("--api-key", default="")
    p.add_argument("--list-formats", action="store_true")
    # toolbox / assistant / templates
    p.add_argument("--assist", metavar="ACTION",
                   help="AI action: polish, to_en, to_zh, summary, expand, condense, "
                        "continue, grammar, formal, titles, outline, fix_latex")
    p.add_argument("--instruction", help="free-form AI instruction (with or without --assist)")
    p.add_argument("--toc", action="store_true", help="insert a Markdown table of contents")
    p.add_argument("--merge", nargs="+", metavar="PDF", help="merge the given PDFs into -o output")
    p.add_argument("--extract-images", metavar="DIR", help="extract embedded images into DIR")
    p.add_argument("--template", metavar="NAME", help="print a built-in template and exit")
    p.add_argument("--list-templates", action="store_true")
    args = p.parse_args(argv)

    # Only build an LLM client when some option actually needs one.
    llm = None
    if args.vlm_model or args.model or args.assist or args.instruction:
        if (args.assist or args.instruction) and not args.model:
            p.error("AI operations need --model (e.g. --model gpt-4o-mini)")
        if not args.base_url:
            p.error("AI operations need --base-url (your OpenAI-compatible endpoint)")
        from .llm import LLM
        llm = LLM(base_url=args.base_url, api_key=args.api_key,
                  model=args.model or "", vlm_model=args.vlm_model)
    ds = DocumentStudio(llm=llm, use_markitdown=not args.no_markitdown)

    if args.list_formats:
        print("inputs :", ", ".join(ds.supported_inputs))
        print("outputs:", ", ".join(ds.supported_outputs))
        return 0

    if args.list_templates:
        for slug, (title, desc) in ds.templates().items():
            print("%-10s %s — %s" % (slug, title, desc))
        return 0

    def default_ext(fmt: str) -> str:
        # Same extension mapping DocumentStudio.convert uses (latex -> .tex, ...).
        return TARGET_EXT.get(fmt, "." + fmt)

    scratch = []  # temporary files to delete on the way out
    try:
        if args.template:
            sys.stdout.write(ds.template(args.template) + "\n")
            return 0

        if args.merge:
            out = args.output or "merged.pdf"
            ds.merge_pdfs(args.merge, out)
            print(out, file=sys.stderr)
            return 0

        # resolve input (file or stdin)
        if args.input:
            src = args.input
        else:
            data = sys.stdin.buffer.read()
            with tempfile.NamedTemporaryFile(suffix=_sniff_suffix(data), delete=False) as tmp:
                tmp.write(data)
            src = tmp.name
            scratch.append(src)

        if args.extract_images:
            for f in ds.extract_images(src, args.extract_images):
                print(f)
            return 0

        # text operations (toc / AI assistant) -> Markdown, optionally then convert
        if args.toc or args.assist or args.instruction:
            ext = Path(src).suffix.lower().lstrip(".")
            md = (Path(src).read_text(encoding="utf-8")
                  if ext in ("md", "markdown", "txt") else ds.to_markdown(src))
            if args.toc:
                md = ds.generate_toc(md)
            if args.assist or args.instruction:
                md = ds.assist(md, action=args.assist, instruction=args.instruction)
            if args.to and args.to != "markdown":
                with tempfile.NamedTemporaryFile(suffix=".md", delete=False,
                                                 mode="w", encoding="utf-8") as tf:
                    tf.write(md)
                scratch.append(tf.name)
                out = args.output or "out" + default_ext(args.to)
                ds.convert(tf.name, to=args.to, out=out)
                print(out, file=sys.stderr)
            elif args.output:
                Path(args.output).write_text(md, encoding="utf-8")
                print(args.output, file=sys.stderr)
            else:
                sys.stdout.write(md + "\n")
            return 0

        if args.to and args.to != "markdown":
            out = args.output or str(Path(src).with_suffix(default_ext(args.to)))
            ds.convert(src, to=args.to, out=out)
            print(out, file=sys.stderr)
        else:
            md = ds.to_markdown(src)
            if args.output:
                Path(args.output).write_text(md, encoding="utf-8")
                print(args.output, file=sys.stderr)
            else:
                sys.stdout.write(md + "\n")
    except (ConversionError, ValueError, RuntimeError, KeyError, OSError) as e:
        print("error:", e, file=sys.stderr)
        return 1
    finally:
        for leftover in scratch:
            try:
                Path(leftover).unlink()
            except OSError:
                pass  # best-effort cleanup; never mask the real outcome
    return 0
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Allow `python -m docstudio.cli`: run the CLI and use main()'s return value
# as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
docstudio/core.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Core orchestration: format detection, the converter registry, and the
|
|
2
|
+
public :class:`DocumentStudio` facade."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, Union, Callable, Dict
|
|
8
|
+
|
|
9
|
+
PathLike = Union[str, os.PathLike]


class ConversionError(RuntimeError):
    """Error signalling that a document conversion could not be carried out.

    Subclasses :class:`RuntimeError`, so handlers written for that base class
    also catch it.
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# --------------------------------------------------------------------------- #
# extension -> logical source format
#
# Keys are lower-case file suffixes including the dot; values are the format
# names used to look up ingesters. Several suffixes share one format name
# (e.g. ".doc" and ".docx" both map to "docx").
# NOTE(review): whether the built-in ingesters can actually read the legacy
# binary variants (.doc/.ppt/.xls) is not visible here — confirm in ingest.py.
# --------------------------------------------------------------------------- #
EXT_FORMAT: Dict[str, str] = {
    ".md": "markdown", ".markdown": "markdown", ".txt": "text",
    ".pdf": "pdf", ".docx": "docx", ".doc": "docx",
    ".pptx": "pptx", ".ppt": "pptx",
    ".xlsx": "xlsx", ".xlsm": "xlsx", ".xls": "xlsx",
    ".epub": "epub", ".zip": "zip",
    ".csv": "csv", ".tsv": "tsv", ".json": "json",
    ".html": "html", ".htm": "html", ".tex": "latex",
    ".png": "image", ".jpg": "image", ".jpeg": "image",
    ".gif": "image", ".webp": "image", ".bmp": "image",
}

# logical target format -> default file extension
# Used by DocumentStudio.convert to derive an output path when none is given.
TARGET_EXT: Dict[str, str] = {
    "markdown": ".md", "html": ".html", "pdf": ".pdf", "docx": ".docx",
    "latex": ".tex", "epub": ".epub", "xlsx": ".xlsx", "text": ".txt",
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def detect_format(path: PathLike) -> str:
    """Map a path's file extension (case-insensitive) to its logical format name.

    Raises :class:`ConversionError` when the extension is not listed in
    ``EXT_FORMAT`` (this includes paths without any extension).
    """
    suffix = Path(path).suffix.lower()
    fmt = EXT_FORMAT.get(suffix)
    if fmt is None:
        raise ConversionError(f"Unsupported input extension: {suffix!r}")
    return fmt
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class _Registry:
    """Lookup tables of converter callables, keyed by format name.

    ``ingesters`` holds the X -> Markdown functions and ``exporters`` the
    Markdown -> X functions. Both are filled through the decorator factories
    below, so further formats can be plugged in without touching this class.
    """

    def __init__(self) -> None:
        self.ingesters: Dict[str, Callable] = {}
        self.exporters: Dict[str, Callable] = {}

    def ingester(self, *formats: str):
        """Return a decorator registering a function as ingester for ``formats``."""
        def register(fn):
            # One callable may serve several format names.
            self.ingesters.update(dict.fromkeys(formats, fn))
            return fn
        return register

    def exporter(self, *formats: str):
        """Return a decorator registering a function as exporter for ``formats``."""
        def register(fn):
            self.exporters.update(dict.fromkeys(formats, fn))
            return fn
        return register


registry = _Registry()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class DocumentStudio:
    """Facade over the converter registry.

    Parameters
    ----------
    llm:
        Optional :class:`docstudio.llm.LLM` used for AI cleanup and, when a
        vision model is configured, for image / scanned-PDF recognition.
    use_markitdown:
        When True (default) and the ``markitdown`` package is installed, it is
        tried first for the reverse direction; our own converters are the
        fallback. Set False to always use the built-in converters.
    """

    def __init__(self, llm=None, use_markitdown: bool = True) -> None:
        self.llm = llm
        self.use_markitdown = use_markitdown
        # import for side effects: registers the built-in converters
        from . import ingest, export  # noqa: F401

    # -- reverse: anything -> Markdown ------------------------------------- #
    def to_markdown(self, source: "PathLike", fmt: Optional[str] = None,
                    **opts) -> str:
        """Convert *source* to Markdown text.

        ``fmt`` overrides extension-based detection. Markdown sources are read
        as UTF-8 and returned unchanged. Otherwise markitdown is tried first
        (unless disabled, or the format is csv/tsv/json), then the registered
        ingester, which receives ``ds=self`` and ``**opts``.

        Raises :class:`ConversionError` for an unsupported extension or when
        no ingester is registered for the format.
        """
        fmt = fmt or detect_format(source)
        if fmt == "markdown":
            return Path(source).read_text(encoding="utf-8")

        if self.use_markitdown and fmt not in ("csv", "tsv", "json"):
            md = self._try_markitdown(source)
            if md is not None:
                return md

        fn = registry.ingesters.get(fmt)
        if fn is None:
            raise ConversionError(f"No ingester for format {fmt!r}")
        return fn(source, ds=self, **opts)

    # -- forward: Markdown -> anything ------------------------------------- #
    def convert(self, source: "PathLike", to: str,
                out: Optional["PathLike"] = None, **opts):
        """Convert *source* (any supported input) to target format *to*.

        Inputs that are not Markdown are first turned into Markdown, so e.g.
        ``convert("scan.pdf", to="docx")`` works (PDF -> md -> docx).

        ``out`` defaults to *source* with the target's default extension. An
        ``fmt`` keyword overrides source-format detection; remaining keyword
        arguments are forwarded to the exporter. Returns whatever the exporter
        returns, or ``out`` when it returns ``None``.

        Raises :class:`ConversionError` when no exporter is registered for
        *to* or the source format is unsupported.
        """
        # Validate the target first: ingesting can be expensive (OCR, LLM
        # calls), so an unknown target should fail before that work is done.
        exporter = registry.exporters.get(to)
        if exporter is None:
            raise ConversionError(f"No exporter for target {to!r}")

        src_fmt = opts.pop("fmt", None) or detect_format(source)
        markdown_text = (Path(source).read_text(encoding="utf-8")
                         if src_fmt == "markdown"
                         else self.to_markdown(source, fmt=src_fmt))

        if out is None:
            out = str(Path(source).with_suffix(TARGET_EXT.get(to, "." + to)))
        result = exporter(markdown_text, out=out, ds=self, **opts)
        return result if result is not None else out

    # -- helpers ----------------------------------------------------------- #
    @staticmethod
    def _try_markitdown(source: "PathLike") -> Optional[str]:
        """Return markitdown's Markdown for *source*, or ``None`` on any failure.

        Deliberately best-effort: a missing package or a failed conversion
        both yield ``None`` so the caller can fall back to the built-ins.
        """
        try:
            from markitdown import MarkItDown
        except Exception:
            return None
        try:
            return MarkItDown().convert(str(source)).text_content
        except Exception:
            return None

    @property
    def supported_inputs(self):
        """Sorted input format names: registered ingesters plus ``"markdown"``."""
        return sorted(set(registry.ingesters) | {"markdown"})

    @property
    def supported_outputs(self):
        """Sorted target format names that have a registered exporter."""
        return sorted(registry.exporters)

    # -- AI assistant ----------------------------------------------------- #
    def assist(self, text, action=None, instruction=None):
        """Run an AI assistant operation (polish / translate / summarise / ...).

        See :data:`docstudio.assistant.ACTIONS` for the action keys, or pass a
        free-form ``instruction``. Requires ``DocumentStudio(llm=LLM(...))``.
        """
        from .assistant import assist
        return assist(self.llm, text, action=action, instruction=instruction)

    @staticmethod
    def assist_actions():
        """List the available AI assistant actions."""
        from .assistant import list_actions
        return list_actions()

    # -- toolbox ---------------------------------------------------------- #
    def generate_toc(self, md, title="目录"):
        """Insert a Markdown table of contents built from ##/### headings."""
        from .tools import generate_toc
        return generate_toc(md, title=title)

    def merge_pdfs(self, paths, out):
        """Merge several PDFs into one (requires pypdf)."""
        from .tools import merge_pdfs
        return merge_pdfs(paths, out)

    def extract_images(self, source, out_dir):
        """Extract embedded images from PDF/DOCX/PPTX/EPUB into ``out_dir``."""
        from .tools import extract_images
        return extract_images(source, out_dir)

    # -- template library ------------------------------------------------- #
    def template(self, name):
        """Return the Markdown body of a built-in template."""
        from .templates import get
        return get(name)

    def templates(self):
        """Return ``{slug: (title, description)}`` for built-in templates."""
        from .templates import info
        return info()
|
docstudio/export.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Forward exporters: Markdown -> X.
|
|
2
|
+
|
|
3
|
+
High-fidelity targets (pdf/docx/epub) use a backend strategy: prefer the engine
|
|
4
|
+
that gives the best output and is installed, else raise a clear message telling
|
|
5
|
+
the user which extra to install — the same philosophy markitdown uses for its
|
|
6
|
+
optional dependencies."""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from .core import registry, ConversionError
|
|
16
|
+
from .latex import md_to_latex
|
|
17
|
+
|
|
18
|
+
# Stylesheet inlined into the <style> element of every full HTML document
# built by _full_html.
_CSS = """
body{font-family:-apple-system,'PingFang SC','Noto Sans SC',sans-serif;
font-size:15px;line-height:1.75;color:#1a1d23;max-width:760px;margin:40px auto;padding:0 20px}
h1,h2,h3{line-height:1.3}
pre{background:#1f2430;color:#e6e6e6;padding:14px;border-radius:8px;overflow:auto}
code{font-family:Consolas,monospace}:not(pre)>code{background:#f0f1f4;padding:.12em .4em;border-radius:5px}
blockquote{border-left:3px solid #e15139;background:#fbeae6;padding:.6em 1em;border-radius:0 7px 7px 0;color:#555}
table{border-collapse:collapse;width:100%}th,td{border:1px solid #ddd;padding:8px 12px}th{background:#f5f5f5}
img{max-width:100%}a{color:#2c72d0}
"""

# <head> snippet that loads KaTeX 0.16.11 from the jsDelivr CDN and runs
# auto-render on the page body. Appended by _full_html only when math=True;
# the generated page needs network access for it to take effect.
_KATEX = ('<link rel="stylesheet" '
          'href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">'
          '<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>'
          '<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js" '
          'onload="renderMathInElement(document.body)"></script>')
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _md_to_html_fragment(md: str) -> str:
    """Render Markdown to HTML with the ``markdown`` package.

    The result is whatever ``markdown.markdown`` produces; callers add the
    surrounding document markup themselves. The import is local so the package
    is only required when HTML is actually produced.
    """
    import markdown

    enabled = ["tables", "fenced_code", "toc", "sane_lists", "footnotes"]
    return markdown.markdown(md, extensions=enabled)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _full_html(md: str, title: str = "", math: bool = True) -> str:
    """Build a complete, self-styled HTML document from Markdown.

    Parameters
    ----------
    md:
        Markdown source, rendered into the ``<body>``.
    title:
        Text for the ``<title>`` element. It is HTML-escaped, so characters
        such as ``<`` or ``&`` cannot break or inject markup.
    math:
        When true, the KaTeX loader snippet is added to the ``<head>``.

    Returns
    -------
    str
        The full document, starting with ``<!DOCTYPE html>``.
    """
    from html import escape

    body = _md_to_html_fragment(md)
    head = f"<meta charset='utf-8'><title>{escape(str(title))}</title><style>{_CSS}</style>"
    if math:
        head += _KATEX
    return f"<!DOCTYPE html><html><head>{head}</head><body>{body}</body></html>"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@registry.exporter("text")
def to_text(md, out=None, ds=None, **_):
    """Export Markdown as plain text by stripping common markup.

    Images are removed entirely, the characters ``# > * _ ` ~`` are deleted
    wherever they occur, and links are reduced to their label. The text is
    written to ``out`` (returning ``out``) or returned when ``out`` is None.
    """
    markup = re.compile(r"[#>*_`~]|!\[[^\]]*\]\([^)]*\)")
    link = re.compile(r"\[([^\]]+)\]\([^)]*\)")
    without_markup = markup.sub("", md)
    labels_only = link.sub(r"\1", without_markup)
    return _write(out, labels_only.strip(), text=True)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@registry.exporter("html")
def to_html(md, out=None, ds=None, title="", **_):
    """Export Markdown as a standalone HTML page (math rendering enabled).

    Writes to ``out`` and returns it, or returns the HTML string when
    ``out`` is None.
    """
    page = _full_html(md, title)
    return _write(out, page, text=True)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@registry.exporter("latex")
def to_latex(md, out=None, ds=None, title="", backend="auto", **_):
    """Export Markdown as LaTeX.

    With ``backend`` "auto" or "pandoc" and a ``pandoc`` executable on PATH,
    pandoc produces a standalone document (``-s``). In every other case the
    built-in ``md_to_latex`` converter is used.
    """
    pandoc_backends = ("auto", "pandoc")
    if backend in pandoc_backends and shutil.which("pandoc"):
        return _pandoc(md, out, "latex", extra=["-s"])
    tex_source = md_to_latex(md, title=title)
    return _write(out, tex_source, text=True)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@registry.exporter("xlsx")
def to_xlsx(md, out=None, ds=None, **_):
    """Export Markdown to an Excel workbook saved at ``out``.

    Each Markdown table becomes one sheet, named after the heading that
    precedes it (illegal characters removed, truncated, de-duplicated) or
    ``Sheet<k>``. Without tables, every non-blank line becomes a row of
    ``Sheet1``, split on tabs, runs of 2+ whitespace characters, or commas.

    Raises :class:`ConversionError` when ``openpyxl`` is not installed.
    """
    try:
        import openpyxl
    except ImportError as e:
        raise ConversionError("xlsx export needs `pip install docstudio[office]` (openpyxl)") from e

    book = openpyxl.Workbook()
    book.remove(book.active)  # drop the default sheet; sheets are created explicitly
    tables = _extract_tables(md)

    if tables:
        used = set()
        for index, (heading, rows) in enumerate(tables, 1):
            fallback = f"Sheet{index}"
            title = re.sub(r'[\\/?*\[\]:]', "", heading or fallback)[:28] or fallback
            stem, serial = title, 1
            # Append _2, _3, ... until the title is unique within this workbook.
            while title in used:
                serial += 1
                title = stem[:26] + "_" + str(serial)
            used.add(title)
            sheet = book.create_sheet(title)
            for row in rows:
                sheet.append(row)
    else:
        sheet = book.create_sheet("Sheet1")
        for line in md.splitlines():
            if not line.strip():
                continue
            cleaned = re.sub(r"[#>*_`]", "", line)
            sheet.append([cell.strip() for cell in re.split(r"\t|\s{2,}|,|,", cleaned)])

    book.save(out)
    return out
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@registry.exporter("pdf")
def to_pdf(md, out=None, ds=None, title="", backend="auto", **_):
    """Export Markdown as PDF using the first backend that works.

    ``backend="auto"`` tries pandoc (only if the executable is on PATH), then
    weasyprint, then playwright. Any other value restricts the attempt to
    that single backend. A failing backend is recorded and the next one is
    tried.

    Raises :class:`ConversionError` listing install hints (and the recorded
    failures) when no backend produced a file.
    """
    if backend == "auto":
        candidates = ["pandoc", "weasyprint", "playwright"]
    else:
        candidates = [backend]

    failures = []
    for name in candidates:
        try:
            if name == "pandoc":
                if shutil.which("pandoc"):
                    return _pandoc(md, out, "pdf", extra=["--pdf-engine=xelatex"])
            elif name == "weasyprint":
                from weasyprint import HTML  # needs cairo/pango system libs
                # KaTeX is left out: it is loaded via <script> tags.
                HTML(string=_full_html(md, title, math=False)).write_pdf(out)
                return out
            elif name == "playwright":
                return _pdf_playwright(md, out, title)
        except Exception as exc:  # noqa: BLE001
            failures.append(f"{name}: {exc}")

    tried = " | tried: " + "; ".join(failures) if failures else ""
    raise ConversionError(
        "No working PDF backend. Install one of:\n"
        " • pandoc + a TeX engine (best for math) -> apt install pandoc texlive-xetex\n"
        " • weasyprint -> pip install docstudio[pdf-weasy]\n"
        " • playwright (headless Chrome, full KaTeX)-> pip install docstudio[pdf-chrome] && playwright install chromium\n"
        + tried)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@registry.exporter("docx")
def to_docx(md, out=None, ds=None, **_):
    """Export Markdown as a Word document saved at ``out``.

    pandoc is used when available. Otherwise python-docx provides a basic
    fallback: ATX headings become Word headings (level capped at 4) and every
    other non-blank line becomes a paragraph with ``*`` and backticks removed.

    Raises :class:`ConversionError` when neither backend is available.
    """
    if shutil.which("pandoc"):
        return _pandoc(md, out, "docx")
    try:
        import docx  # python-docx, basic fallback (no rich tables/math)
    except ImportError as e:
        raise ConversionError(
            "docx export needs pandoc, or `pip install docstudio[office]` for a basic fallback") from e

    heading_re = re.compile(r"^(#{1,6})\s+(.+)")
    document = docx.Document()
    for line in md.splitlines():
        found = heading_re.match(line)
        if found:
            hashes, text = found.groups()
            document.add_heading(text, level=min(len(hashes), 4))
            continue
        if line.strip():
            document.add_paragraph(re.sub(r"[*`]", "", line))
    document.save(out)
    return out
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@registry.exporter("epub")
def to_epub(md, out=None, ds=None, title="Document", **_):
    """Export Markdown as an EPUB saved at ``out``.

    pandoc is used when available (the title is passed as metadata).
    Otherwise ebooklib builds a single-chapter book from the rendered HTML.

    Raises :class:`ConversionError` when neither backend is available.
    """
    if shutil.which("pandoc"):
        return _pandoc(md, out, "epub", extra=["--metadata", f"title={title}"])
    try:
        from ebooklib import epub
    except ImportError as e:
        raise ConversionError("epub export needs pandoc, or `pip install docstudio[office]` (ebooklib)") from e

    book = epub.EpubBook()
    book.set_title(title)

    chapter = epub.EpubHtml(title=title, file_name="ch1.xhtml")
    chapter.content = "<html><body>" + _md_to_html_fragment(md) + "</body></html>"
    book.add_item(chapter)
    book.spine = ["nav", chapter]

    # Navigation documents (NCX and nav).
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub(out, book)
    return out
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# --------------------------------------------------------------------------- #
|
|
161
|
+
# backends / helpers
|
|
162
|
+
# --------------------------------------------------------------------------- #
|
|
163
|
+
def _pandoc(md, out, to, extra=None):
    """Convert Markdown with the ``pandoc`` executable.

    Parameters
    ----------
    md:
        Markdown source; sent to pandoc on stdin as UTF-8.
    out:
        Output path, or ``None`` to get the converted data back instead of
        a file (mirrors ``_write``).
    to:
        pandoc output format (``-t``).
    extra:
        Additional command-line arguments.

    Returns
    -------
    ``out`` when a path was given. With ``out=None``: ``bytes`` for the
    binary targets pdf/docx/epub, otherwise the UTF-8 decoded text.

    Raises
    ------
    subprocess.CalledProcessError
        If pandoc exits with a non-zero status.
    """
    cmd = ["pandoc", "-f", "markdown", "-t", to]
    payload = md.encode("utf-8")
    if out is None:
        # "-o -" sends the result to stdout even for binary writers; passing
        # str(None) would create a file literally named "None".
        done = subprocess.run(cmd + ["-o", "-"] + (extra or []), input=payload,
                              stdout=subprocess.PIPE, check=True)
        if to in ("pdf", "docx", "epub"):
            return done.stdout
        return done.stdout.decode("utf-8")
    subprocess.run(cmd + ["-o", str(out)] + (extra or []), input=payload, check=True)
    return out
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _pdf_playwright(md, out, title):
    """Render Markdown to an A4 PDF at ``out`` with headless Chromium.

    The page is the full HTML document with KaTeX enabled; rendering waits
    for the network to go idle so the CDN assets can load. The browser is
    closed even when rendering fails. Returns ``out``.
    """
    from playwright.sync_api import sync_playwright

    html = _full_html(md, title, math=True)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.set_content(html, wait_until="networkidle")
            page.pdf(path=str(out), format="A4",
                     margin={"top": "18mm", "bottom": "18mm", "left": "18mm", "right": "18mm"})
        finally:
            # Close explicitly so a failed render does not leave Chromium running.
            browser.close()
    return out
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _extract_tables(md):
    """Collect the pipe tables found in Markdown text.

    A table starts at a line of the form ``| ... |`` that is immediately
    followed by a delimiter row (pipes, colons, dashes, whitespace, with at
    least one dash) and runs for as long as lines contain a ``|``.

    Returns a list of ``(name, rows)`` tuples. ``name`` is the text of the
    most recent heading not yet claimed by an earlier table (``""`` if
    none). ``rows`` is a list of cell lists with the delimiter row dropped.
    Cells are split on unescaped pipes only; an escaped ``\\|`` becomes a
    literal ``|`` inside the cell, ``<br>`` becomes a newline, and outer
    whitespace is trimmed.
    """
    heading_re = re.compile(r"^#{1,6}\s+(.+)")
    row_re = re.compile(r"^\s*\|.*\|\s*$")
    rule_re = re.compile(r"^\s*\|?[\s:|-]+\|?\s*$")
    # Outer pipes of a row; the trailing one must not be an escaped "\|".
    edge_re = re.compile(r"^\s*\||(?<!\\)\|\s*$")
    # Column separators are pipes that are NOT preceded by a backslash.
    cell_sep_re = re.compile(r"(?<!\\)\|")

    def split_row(row):
        inner = edge_re.sub("", row)
        return [cell.replace(r"\|", "|").replace("<br>", "\n").strip()
                for cell in cell_sep_re.split(inner)]

    lines, tables, i, last = md.split("\n"), [], 0, ""
    while i < len(lines):
        h = heading_re.match(lines[i])
        if h:
            last = h.group(1).strip()
        if (row_re.match(lines[i]) and i + 1 < len(lines)
                and rule_re.match(lines[i + 1]) and "-" in lines[i + 1]):
            j = i
            while j < len(lines) and "|" in lines[j]:
                j += 1
            # Index 1 of the block is the delimiter row; skip it.
            rows = [split_row(r) for k, r in enumerate(lines[i:j]) if k != 1]
            tables.append((last, rows))
            last = ""  # a heading names only the first table after it
            i = j
            continue
        i += 1
    return tables
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _write(out, data, text=False):
    """Write ``data`` to ``out``, or hand it back when no path is given.

    With ``out=None`` the data is returned untouched. Otherwise it is written
    as UTF-8 text (``text=True``) or raw bytes, and ``out`` is returned.
    """
    if out is None:
        return data
    target = Path(out)
    if text:
        target.write_text(data, encoding="utf-8")
    else:
        target.write_bytes(data)
    return out
|