docstudio 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docstudio/ingest.py ADDED
@@ -0,0 +1,205 @@
1
+ """Reverse converters: X -> Markdown.
2
+
3
+ Pure-python paths (csv/tsv/json/html) have no heavy deps. Office / PDF / EPUB
4
+ paths import their library lazily and raise a friendly message if missing, so a
5
+ minimal install still works for the common cases (mirrors markitdown's optional
6
+ extras)."""
7
+ from __future__ import annotations
8
+
9
+ import csv as _csv
10
+ import io
11
+ import json
12
+ import zipfile
13
+ from pathlib import Path
14
+
15
+ from .core import registry, ConversionError, detect_format
16
+
17
+ # --------------------------------------------------------------------------- #
18
+ # small helpers shared by several converters
19
+ # --------------------------------------------------------------------------- #
20
def _rows_to_md(rows):
    """Render ``rows`` as a GitHub-Flavored Markdown table.

    The first row that has any content becomes the header. Rows whose cells
    are all empty (``None``, ``""`` or whitespace) are dropped, and short rows
    are padded with empty cells to the width of the widest row.

    Returns the table text (each line terminated by a newline), or ``""``
    when no row has content.
    """
    def cell(value):
        text = "" if value is None else str(value)
        # Normalise CR/CRLF so no stray "\r" ends up inside a table cell, and
        # strip *before* converting newlines so trailing line breaks do not
        # turn into a dangling "<br>".
        text = text.replace("\r\n", "\n").replace("\r", "\n").strip()
        return text.replace("|", r"\|").replace("\n", "<br>")

    table = [list(r) for r in rows]
    # Use the rendered cell for the emptiness test: str(None) is "None",
    # which would otherwise keep rows that render as entirely blank.
    table = [r for r in table if any(cell(c) for c in r)]
    if not table:
        return ""
    width = max(len(r) for r in table)

    def render(r):
        cells = [cell(c) for c in r] + [""] * (width - len(r))
        return "| " + " | ".join(cells) + " |\n"

    header, *data = table
    separator = "|" + "|".join(" --- " for _ in range(width)) + "|\n"
    return render(header) + separator + "".join(render(r) for r in data)
38
+
39
+
40
def _read_text(source):
    """Return the text of ``source`` decoded as UTF-8, dropping a leading BOM."""
    with open(Path(source), encoding="utf-8-sig") as handle:
        return handle.read()
42
+
43
+
44
@registry.ingester("csv")
def csv_to_md(source, ds=None, **_):
    """Convert the comma-separated file ``source`` into a Markdown table."""
    buffer = io.StringIO(_read_text(source))
    records = [record for record in _csv.reader(buffer)]
    return _rows_to_md(records)
47
+
48
+
49
@registry.ingester("tsv")
def tsv_to_md(source, ds=None, **_):
    """Convert the tab-separated file ``source`` into a Markdown table."""
    buffer = io.StringIO(_read_text(source))
    records = [record for record in _csv.reader(buffer, delimiter="\t")]
    return _rows_to_md(records)
52
+
53
+
54
@registry.ingester("json")
def json_to_md(source, ds=None, **_):
    """Convert a JSON file to Markdown.

    * A non-empty array of objects becomes a table whose columns are the union
      of all keys, in first-seen order. Nested objects/arrays are shown as
      compact JSON; missing keys become empty cells.
    * Any other valid JSON is pretty-printed inside a ```json fence.
    * Text that does not parse as JSON is returned verbatim inside a ```json
      fence.
    """
    raw = _read_text(source)
    try:
        data = json.loads(raw)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically deep nesting. Anything else is a real bug and
        # should propagate instead of being silently fenced.
        return "```json\n" + raw.strip() + "\n```"

    def render(value):
        # Keep JSON spelling for booleans instead of Python's True/False.
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return value

    is_table = (isinstance(data, list) and data
                and all(isinstance(x, dict) for x in data))
    if is_table:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        cols = list(dict.fromkeys(k for o in data for k in o))
        rows = [cols] + [[render(o.get(c, "")) for c in cols] for o in data]
        return _rows_to_md(rows)
    return "```json\n" + json.dumps(data, ensure_ascii=False, indent=2) + "\n```"
69
+
70
+
71
@registry.ingester("html")
def html_to_md(source, ds=None, html=None, **_):
    """Convert HTML to Markdown with ``markdownify``.

    When ``html`` is given it is converted directly and ``source`` is not
    read; otherwise the markup is loaded from the file at ``source``.
    Headings use ATX style and bullets use ``-``.
    """
    from markdownify import markdownify as _md
    if html is None:
        html = _read_text(source)
    markdown = _md(html, heading_style="ATX", bullets="-")
    return markdown.strip()
76
+
77
+
78
@registry.ingester("docx")
def docx_to_md(source, ds=None, **_):
    """Convert a DOCX file to Markdown by way of mammoth's HTML output.

    Raises ``ConversionError`` when mammoth is not installed.
    """
    try:
        import mammoth
    except ImportError as e:  # pragma: no cover
        raise ConversionError("docx support needs `pip install docstudio[office]` (mammoth)") from e
    with open(source, "rb") as stream:
        result = mammoth.convert_to_html(stream)
    return html_to_md(None, html=result.value)
87
+
88
+
89
@registry.ingester("pptx")
def pptx_to_md(source, ds=None, **_):
    """Convert a PPTX deck to Markdown.

    Each slide becomes a ``## Slide N`` heading followed by one bullet per
    non-empty paragraph found in the slide's text frames.

    Raises ``ConversionError`` when python-pptx is not installed.
    """
    try:
        from pptx import Presentation
    except ImportError as e:  # pragma: no cover
        raise ConversionError("pptx support needs `pip install docstudio[office]` (python-pptx)") from e
    deck = Presentation(str(source))
    lines = []
    for number, slide in enumerate(deck.slides, start=1):
        lines.append(f"## Slide {number}\n")
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                text = "".join(run.text for run in para.runs).strip()
                if text:
                    lines.append(f"- {text}")
        # blank line separating this slide from the next heading
        lines.append("")
    return "\n".join(lines).strip()
107
+
108
+
109
@registry.ingester("xlsx")
def xlsx_to_md(source, ds=None, **_):
    """Convert an XLSX workbook to Markdown, one ``## <sheet title>`` per sheet.

    Cells are read with ``data_only=True`` (stored values rather than
    formulas); empty cells become empty strings. Sheets that yield no rows
    are skipped.

    Raises ``ConversionError`` when openpyxl is not installed.
    """
    try:
        import openpyxl
    except ImportError as e:  # pragma: no cover
        raise ConversionError("xlsx support needs `pip install docstudio[office]` (openpyxl)") from e
    wb = openpyxl.load_workbook(str(source), read_only=True, data_only=True)
    parts = []
    try:
        for ws in wb.worksheets:
            rows = [[("" if c is None else c) for c in row]
                    for row in ws.iter_rows(values_only=True)]
            if rows:
                parts.append(f"## {ws.title}\n\n" + _rows_to_md(rows))
    finally:
        # Read-only workbooks keep the underlying file open until closed
        # explicitly; release it even if a sheet fails to read.
        wb.close()
    return "\n\n".join(parts).strip()
123
+
124
+
125
@registry.ingester("epub")
def epub_to_md(source, ds=None, **_):
    """Convert an EPUB to Markdown by converting each document item's HTML.

    Items that convert to nothing but whitespace are dropped; the rest are
    joined with blank lines.

    Raises ``ConversionError`` when ebooklib is not installed.
    """
    try:
        from ebooklib import epub, ITEM_DOCUMENT
    except ImportError as e:  # pragma: no cover
        raise ConversionError("epub support needs `pip install docstudio[office]` (ebooklib)") from e
    book = epub.read_epub(str(source))
    chapters = []
    for item in book.get_items_of_type(ITEM_DOCUMENT):
        markup = item.get_content().decode("utf-8", "ignore")
        chapters.append(html_to_md(None, html=markup))
    kept = [chapter for chapter in chapters if chapter.strip()]
    return "\n\n".join(kept).strip()
136
+
137
+
138
@registry.ingester("pdf")
def pdf_to_md(source, ds=None, vlm_if_scanned=True, **_):
    """Extract the text of a PDF with pdfminer.

    If fewer than 40 characters come back (a crude "scanned document" test)
    and ``vlm_if_scanned`` is set and ``ds`` carries a truthy ``llm``, the
    pages are sent through the vision model instead.

    Raises ``ConversionError`` when pdfminer.six is not installed.
    """
    try:
        from pdfminer.high_level import extract_text
    except ImportError as e:  # pragma: no cover
        raise ConversionError("pdf support needs `pip install docstudio[pdf]` (pdfminer.six)") from e
    text = (extract_text(str(source)) or "").strip()
    # Guard clauses, evaluated in the same order as the original conjunction.
    if len(text) >= 40 or not vlm_if_scanned or ds is None:
        return _collapse(text)
    if not getattr(ds, "llm", None):
        return _collapse(text)
    return _pdf_vlm(source, ds)
149
+
150
+
151
def _pdf_vlm(source, ds):
    """Transcribe a PDF page by page with the vision model.

    Every page is rasterised to PNG at 180 dpi and passed to
    ``ds.llm.vlm_extract(..., mode="text")``; the page results are joined
    with blank lines.

    Raises ``ConversionError`` when PyMuPDF is not installed.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as e:  # pragma: no cover
        raise ConversionError("scanned-PDF VLM path needs `pip install docstudio[ocr]` (PyMuPDF)") from e
    doc = fitz.open(str(source))
    try:
        out = []
        for page in doc:
            pix = page.get_pixmap(dpi=180)
            out.append(ds.llm.vlm_extract(pix.tobytes("png"), mode="text"))
    finally:
        # Release the file handle even when a page render or VLM call fails.
        doc.close()
    return "\n\n".join(out).strip()
162
+
163
+
164
@registry.ingester("image")
def image_to_md(source, ds=None, mode="text", **_):
    """Turn an image into Markdown text.

    Uses the vision model when ``ds.llm`` exists and reports ``has_vision``;
    otherwise falls back to local OCR through pytesseract with the
    ``chi_sim+eng`` language packs.

    Raises ``ConversionError`` when neither a vision model nor the OCR
    dependencies (pytesseract + Pillow) are available.
    """
    if ds is not None and getattr(ds, "llm", None) and ds.llm.has_vision:
        return ds.llm.vlm_extract(Path(source).read_bytes(), mode=mode)
    try:
        import pytesseract
        from PIL import Image
    except ImportError as e:  # pragma: no cover
        raise ConversionError(
            "image OCR needs a vision model (configure llm) or "
            "`pip install docstudio[ocr]` (pytesseract+Pillow)") from e
    # Open the image in a context manager so its file handle is closed
    # promptly instead of whenever the garbage collector gets to it.
    with Image.open(source) as img:
        text = pytesseract.image_to_string(img, lang="chi_sim+eng")
    return text.strip()
176
+
177
+
178
@registry.ingester("zip")
def zip_to_md(source, ds=None, **_):
    """Convert every recognisable member of a ZIP archive to Markdown.

    Members are visited in sorted name order. Directory entries, ``__MACOSX``
    resources and names that ``detect_format`` rejects are skipped. Each
    remaining member is extracted into a private temporary directory and
    handed to ``ds.to_markdown``; a member that fails to convert contributes
    a ``_(failed: ...)_`` note instead of aborting the whole archive.

    Returns a ``# Archive (N files)`` document, or ``"(no convertible
    files)"`` when nothing produced output (including when ``ds`` is not
    supplied).
    """
    import tempfile

    out = []
    # A private temp dir replaces the previous hard-coded "/tmp/<basename>":
    # that path is not portable, and it could overwrite and then delete an
    # unrelated file that happened to share a member's basename.
    with zipfile.ZipFile(source) as zf, \
            tempfile.TemporaryDirectory(prefix="docstudio-zip-") as workdir:
        for name in sorted(zf.namelist()):
            if name.endswith("/") or "__MACOSX" in name:
                continue
            try:
                fmt = detect_format(name)
            except ConversionError:
                continue
            if not ds:
                # Nothing can convert the member; don't bother extracting it.
                continue
            # Only the basename is used, so a member path can never escape
            # the working directory.
            tmp = Path(workdir) / Path(name).name
            try:
                tmp.write_bytes(zf.read(name))
                part = ds.to_markdown(tmp, fmt=fmt)
            except Exception as e:  # noqa: BLE001 - best effort per member
                part = f"_(failed: {e})_"
            finally:
                try:
                    tmp.unlink(missing_ok=True)
                except OSError:
                    # e.g. the "name" resolved to a directory; the temp dir
                    # cleanup handles whatever is left.
                    pass
            if part.strip():
                out.append(f"## {name}\n\n{part.strip()}\n\n---")
    n = len(out)
    if not n:
        return "(no convertible files)"
    return (f"# Archive ({n} files)\n\n" + "\n\n".join(out)).strip()
201
+
202
+
203
def _collapse(text):
    """Squeeze runs of three or more newlines to one blank line and trim."""
    import re
    squeezed = re.compile(r"\n{3,}").sub("\n\n", text)
    return squeezed.strip()
docstudio/latex.py ADDED
@@ -0,0 +1,124 @@
1
+ """Markdown -> LaTeX (self-contained, no pandoc required).
2
+
3
+ A faithful port of the browser tool's tested `mdToLaTeX`: math (`$...$`,
4
+ `$$...$$`) and code are protected before escaping; Chinese documents use the
5
+ ``ctexart`` class (compile with xelatex), otherwise ``article``."""
6
+ from __future__ import annotations
7
+
8
+ import re
9
+
10
# LaTeX special characters and their escaped spellings.
_SPECIAL = {
    "&": r"\&",
    "%": r"\%",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "$": r"\$",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}

# Single-pass translation table. A NUL character maps to the backslash macro
# too, mirroring the placeholder-based implementation this replaces, which
# used "\x00" as a temporary stand-in for "\".
_ESCAPE_TABLE = str.maketrans({
    **_SPECIAL,
    "\\": r"\textbackslash{}",
    "\x00": r"\textbackslash{}",
})


def _escape(text: str) -> str:
    """Escape LaTeX special characters in ``text``.

    Backslashes become ``\\textbackslash{}``; the characters in ``_SPECIAL``
    are replaced by their escaped forms. Each input character is translated
    exactly once, so the braces and backslashes introduced by a replacement
    are never escaped again.
    """
    return text.translate(_ESCAPE_TABLE)
20
+
21
+
22
def _inline(text: str) -> str:
    """Convert inline Markdown in ``text`` to LaTeX.

    Backtick code spans are set aside first so their contents are not
    treated as markup, the remaining text is escaped, and then ``**bold**``,
    ``*italic*`` and ``[label](url)`` are rewritten to ``\\textbf``,
    ``\\textit`` and ``\\href``. Finally the code spans come back as
    ``\\texttt{...}`` with their contents escaped.
    """
    spans = []

    def protect(match):
        spans.append(match.group(1))
        return f"\x01{len(spans)-1}\x01"

    text = re.sub(r"`([^`]+)`", protect, text)
    text = _escape(text)

    # Order matters: bold must be consumed before the single-star italic rule.
    rewrites = (
        (r"\*\*(.+?)\*\*", r"\\textbf{\1}"),
        (r"(?<!\*)\*(?!\*)(.+?)\*(?!\*)", r"\\textit{\1}"),
        (r"\[(.+?)\]\((.+?)\)", r"\\href{\2}{\1}"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    for index, code in enumerate(spans):
        text = text.replace(f"\x01{index}\x01", r"\texttt{" + _escape(code) + "}")
    return text
37
+
38
+
39
def md_to_latex(md: str, title: str = "", chinese: bool | None = None) -> str:
    """Convert Markdown text to a complete LaTeX document.

    Args:
        md: Markdown source.
        title: Optional document title; when given, ``\\title`` and
            ``\\maketitle`` are emitted.
        chinese: Force (``True``) or suppress (``False``) the ``ctexart``
            class. ``None`` auto-detects CJK ideographs in ``md``.

    Returns:
        The LaTeX source: preamble, ``document`` environment and body.

    Math (``$...$``, ``$$...$$``) and fenced code are stashed behind
    placeholders before any escaping and restored at the end; fenced code
    is emitted in a ``verbatim`` environment. Headings, pipe tables, flat
    lists and blockquotes are recognised line by line; everything else is
    passed through ``_inline``.
    """
    if chinese is None:
        chinese = bool(re.search(r"[\u4e00-\u9fff]", md))

    # protect math + fenced code with placeholders
    store = []

    def _stash(m):
        store.append(m.group(0))
        return f"\x02{len(store)-1}\x02"

    md = re.sub(r"\$\$[\s\S]+?\$\$|\$(?:\\.|[^$\n])+?\$|```[\s\S]*?```",
                _stash, md)

    # Indexed by heading depth (1-6); slot 0 is never used.
    heading_cmds = ("section", "section", "subsection", "subsubsection",
                    "paragraph", "subparagraph", "subparagraph")
    bullet = re.compile(r"^\s*[-*]\s+")
    numbered = re.compile(r"^\s*\d+\.\s+")

    body, lines, i = [], md.split("\n"), 0
    while i < len(lines):
        line = lines[i]
        m = re.match(r"^(#{1,6})\s+(.+)", line)
        if m:
            cmd = heading_cmds[len(m.group(1))]
            body.append(f"\\{cmd}{{{_inline(m.group(2))}}}")
            i += 1
            continue
        # table: a pipe row followed by a separator row containing dashes
        if re.match(r"^\s*\|.*\|\s*$", line) and i + 1 < len(lines) \
                and re.match(r"^\s*\|?[\s:|-]+\|?\s*$", lines[i + 1]) and "-" in lines[i + 1]:
            block = []
            while i < len(lines) and "|" in lines[i]:
                block.append(lines[i])
                i += 1
            body.append(_table(block))
            continue
        # unordered / ordered list (the first item decides the environment)
        if bullet.match(line) or numbered.match(line):
            env = "enumerate" if numbered.match(line) else "itemize"
            items = []
            while i < len(lines) and (bullet.match(lines[i]) or numbered.match(lines[i])):
                items.append(re.sub(r"^\s*(?:[-*]|\d+\.)\s+", "", lines[i]))
                i += 1
            body.append(f"\\begin{{{env}}}")
            body += [f" \\item {_inline(it)}" for it in items]
            body.append(f"\\end{{{env}}}")
            continue
        # blockquote: consecutive ">" lines are merged into one paragraph
        if line.startswith(">"):
            quote = []
            while i < len(lines) and lines[i].startswith(">"):
                quote.append(lines[i][1:].strip())
                i += 1
            body.append("\\begin{quote}\n" + _inline(" ".join(quote)) + "\n\\end{quote}")
            continue
        if line.strip() == "":
            body.append("")
            i += 1
            continue
        body.append(_inline(line))
        i += 1

    text = "\n".join(body)

    # restore math / code
    def _restore(m):
        s = store[int(m.group(1))]
        if not s.startswith("```"):
            return s  # math goes back untouched
        # The stash regex guarantees a ``` fence at both ends. Slice them off
        # rather than strip("`"), which would also eat backticks belonging
        # to the code itself.
        inner = s[3:-3]
        info, newline, code = inner.partition("\n")
        if not newline:
            # Single-line fence: there is no info-string line to drop.
            code = inner
        # Drop the newline that precedes the closing fence so verbatim does
        # not end with a spurious blank line.
        if code.endswith("\n"):
            code = code[:-1]
        return "\\begin{verbatim}\n" + code + "\n\\end{verbatim}"

    text = re.sub(r"\x02(\d+)\x02", _restore, text)

    cls = "ctexart" if chinese else "article"
    pkgs = ["\\usepackage{amsmath,amssymb}", "\\usepackage{hyperref}",
            "\\usepackage{graphicx}", "\\usepackage{booktabs}"]
    head = f"\\documentclass[12pt]{{{cls}}}\n" + "\n".join(pkgs)
    if title:
        head += f"\n\\title{{{_inline(title)}}}\n\\date{{}}"
    doc = ("\\begin{document}\n" + ("\\maketitle\n" if title else "")
           + text + "\n\\end{document}\n")
    return head + "\n" + doc
109
+
110
+
111
def _table(block):
    """Render the lines of a Markdown pipe table as a booktabs ``tabular``.

    ``block`` is the list of table lines; the second line (the ``---``
    separator row) is discarded. All columns are left-aligned, short rows are
    padded with empty cells, and a ``\\midrule`` follows the header row.
    """
    rows = [r for k, r in enumerate(block) if k != 1]
    cells = []
    for row in rows:
        # Trim the outer pipes and split on *unescaped* pipes only. Splitting
        # on every "|" first would cut a cell such as "a\|b" in two before
        # the escape could be undone.
        trimmed = re.sub(r"^\s*\||(?<!\\)\|\s*$", "", row)
        cells.append([c.replace(r"\|", "|").strip()
                      for c in re.split(r"(?<!\\)\|", trimmed)])
    ncol = max(len(r) for r in cells)
    spec = "l" * ncol
    out = ["\\begin{tabular}{" + spec + "}", "\\toprule"]
    for k, r in enumerate(cells):
        r = r + [""] * (ncol - len(r))
        out.append(" & ".join(_inline(c) for c in r) + " \\\\")
        if k == 0:
            out.append("\\midrule")
    out += ["\\bottomrule", "\\end{tabular}"]
    return "\n".join(out)
docstudio/llm.py ADDED
@@ -0,0 +1,106 @@
1
+ """OpenAI-compatible LLM + Vision (VLM) client.
2
+
3
+ Mirrors the browser tool: one base URL / key for a text model, and an optional
4
+ (possibly different) endpoint for a vision model used to recognise images and
5
+ scanned PDFs. Only depends on ``requests``."""
6
+ from __future__ import annotations
7
+
8
+ import base64
9
+ from typing import List, Optional, Union
10
+
11
+
12
class LLM:
    """Minimal client for OpenAI-compatible chat and vision endpoints.

    One base URL / key pair serves the text model. The vision model may use
    its own endpoint and key; when those are omitted they fall back to the
    text model's. ``requests`` is imported lazily by the methods that
    perform HTTP calls.
    """

    def __init__(self, base_url="", api_key="",
                 model="", *, vlm_model="", vlm_base_url="", vlm_api_key="",
                 temperature=0.2):
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self.vlm_model = vlm_model
        # Vision settings default to the text endpoint's.
        self.vlm_base_url = (vlm_base_url or base_url).rstrip("/")
        self.vlm_api_key = vlm_api_key or api_key
        self.temperature = temperature

    # -- discovery -------------------------------------------------------- #
    @staticmethod
    def fetch_models(base_url: str, api_key: str) -> List[str]:
        """Return the sorted, de-duplicated model ids served at ``base_url``.

        Raises whatever ``requests`` raises for transport errors or a
        non-2xx response (``raise_for_status``).
        """
        import requests
        base = base_url.rstrip("/")
        ep = base + ("/models" if base.endswith("/v1") else "/v1/models")
        r = requests.get(ep, headers={"Authorization": f"Bearer {api_key}"}, timeout=30)
        r.raise_for_status()
        return LLM._model_ids(r.json())

    @staticmethod
    def _model_ids(payload) -> List[str]:
        """Extract model ids from a decoded ``/models`` response.

        Accepts both the ``{"data": [...]}`` envelope and a bare list; the
        entries may be plain strings or objects carrying ``id`` or ``name``.
        """
        # A bare list has no .get(); only unwrap the envelope for objects.
        data = payload.get("data", payload) if isinstance(payload, dict) else payload
        ids = []
        for m in (data or []):
            if isinstance(m, str):
                ids.append(m)
            elif isinstance(m, dict):
                ids.append(m.get("id") or m.get("name"))
        return sorted({i for i in ids if i})

    # -- text ------------------------------------------------------------- #
    def chat(self, system: str, user: str) -> str:
        """Send one system + user exchange and return the reply content.

        Raises ``ValueError`` when ``base_url`` or ``model`` is unset, and
        whatever ``requests`` raises for transport or HTTP errors.
        """
        import requests
        if not self.base_url:
            raise ValueError("LLM.base_url is not set — pass your OpenAI-compatible "
                             "endpoint, e.g. base_url='https://api.openai.com'")
        if not self.model:
            raise ValueError("LLM.model is not set — pass the model id for your provider, "
                             "e.g. model='gpt-4o-mini'")
        ep = self._ep(self.base_url)
        r = requests.post(ep, headers=self._h(self.api_key), timeout=120, json={
            "model": self.model, "temperature": self.temperature,
            "messages": [{"role": "system", "content": system},
                         {"role": "user", "content": user}]})
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

    def cleanup_markdown(self, raw: str) -> str:
        """AI 'smart cleanup' — turn rough extracted text into clean Markdown."""
        return self.chat(
            "You convert rough text into clean, well-structured GitHub-Flavored "
            "Markdown. Preserve headings, lists, tables and math ($...$). "
            "Output only Markdown.", raw)

    # -- vision ----------------------------------------------------------- #
    @property
    def has_vision(self) -> bool:
        """True when a vision model id and an API key are both configured."""
        return bool(self.vlm_model and (self.vlm_api_key or self.api_key))

    def vlm_extract(self, image: Union[bytes, str], mode: str = "text") -> str:
        """Ask the vision model to transcribe ``image`` as Markdown.

        Args:
            image: Raw image bytes, a ``data:`` URL, or a path to an image.
            mode: ``"table"`` requests Markdown tables only; any other value
                requests a full-text transcription.

        Raises ``ValueError`` when no vision endpoint or model is configured,
        and whatever ``requests`` raises for transport or HTTP errors.
        """
        import re
        import requests
        if not self.vlm_base_url:
            raise ValueError("no vision endpoint — set vlm_base_url (or base_url) and vlm_model")
        if not self.vlm_model:
            raise ValueError("LLM.vlm_model is not set — pass the vision model id for your provider")
        data_url = self._data_url(image)
        prompt = (
            "Recognise the table(s) in the image and output ONLY standard "
            "Markdown table(s) with a header and |---| separator row, no prose."
            if mode == "table" else
            "Transcribe all text in the image as Markdown, preserving headings, "
            "lists, tables and math ($...$). Output only the content.")
        ep = self._ep(self.vlm_base_url)
        r = requests.post(ep, headers=self._h(self.vlm_api_key), timeout=180, json={
            "model": self.vlm_model, "temperature": 0.1, "max_tokens": 4096,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_url}}]}]})
        r.raise_for_status()
        out = r.json()["choices"][0]["message"]["content"]
        if isinstance(out, list):
            # Some servers return the content as a list of typed parts.
            out = "".join(p.get("text", "") for p in out if isinstance(p, dict))
        # A null content would otherwise crash re.sub below.
        out = out or ""
        # Drop a wrapping ```markdown fence if the model added one.
        return re.sub(r"^```(?:markdown|md)?\s*\n?|\n?```\s*$", "", out).strip()

    # -- helpers ---------------------------------------------------------- #
    @staticmethod
    def _ep(base):
        """Return the chat-completions URL for ``base`` (with or without /v1)."""
        return base + ("/chat/completions" if base.endswith("/v1") else "/v1/chat/completions")

    @staticmethod
    def _h(key):
        """Return the JSON + bearer-token request headers for ``key``."""
        return {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    @staticmethod
    def _sniff_mime(raw):
        """Guess the image MIME type from magic bytes, defaulting to PNG."""
        if raw.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if raw.startswith((b"GIF87a", b"GIF89a")):
            return "image/gif"
        if raw[:4] == b"RIFF" and raw[8:12] == b"WEBP":
            return "image/webp"
        return "image/png"

    @staticmethod
    def _data_url(image):
        """Return ``image`` as a base64 ``data:`` URL.

        ``data:`` URLs pass through unchanged; bytes are encoded directly;
        anything else is treated as a file path and read from disk.
        """
        if isinstance(image, str) and image.startswith("data:"):
            return image
        if isinstance(image, (bytes, bytearray)):
            raw = bytes(image)
        else:
            # Context manager so the file handle is not leaked.
            with open(image, "rb") as fh:
                raw = fh.read()
        # Label the payload with its real type rather than always "png".
        return f"data:{LLM._sniff_mime(raw)};base64," + base64.b64encode(raw).decode()
docstudio/templates.py ADDED
@@ -0,0 +1,61 @@
1
+ """Built-in Markdown templates — the template library from the web app.
2
+
3
+ Each template mirrors the Document Studio web app. Use :func:`names` to list
4
+ them and :func:`get` to fetch the Markdown body.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ # slug -> {"title": zh title, "desc": zh description, "md": markdown body}
9
+ TEMPLATES = {
10
+ "academic": {
11
+ "title": "学术论文",
12
+ "desc": "标题、摘要、引言、方法、实验、结论、参考文献结构",
13
+ "md": "# 论文标题\n\n## 摘要\n\n本文提出……\n\n**关键词**:关键词1;关键词2\n\n## 1. 引言\n\n研究背景与动机。公式示例 $f(x)=\\sum_{i=1}^{n} w_i x_i$。\n\n## 2. 方法\n\n### 2.1 模型\n\n$$\\mathcal{L}=\\mathcal{L}_{cls}+\\lambda\\mathcal{L}_{reg}$$\n\n## 3. 实验\n\n| 方法 | 准确率 |\n|------|--------|\n| Baseline | 78.3% |\n| Ours | **85.1%** |\n\n## 4. 结论\n\n## 参考文献\n\n[1] Author. Title. Venue, Year.",
14
+ },
15
+ "techdoc": {
16
+ "title": "技术文档",
17
+ "desc": "概述、安装、用法、API、示例的标准技术文档",
18
+ "md": "# 项目名称\n\n> 一句话简介。\n\n## 概述\n\n## 安装\n\n```bash\nnpm install package-name\n```\n\n## 快速开始\n\n```js\nimport { run } from 'package-name';\nrun();\n```\n\n## API\n\n### `run(options)`\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| options | object | 配置项 |\n\n## 许可证\n\nMIT",
19
+ },
20
+ "minutes": {
21
+ "title": "会议纪要",
22
+ "desc": "时间、参会人、议题、决议、待办事项",
23
+ "md": "# 会议纪要\n\n- **时间**:2026-06-20 14:00\n- **参会人**:\n- **主持**:\n\n## 议题\n\n1. \n2. \n\n## 讨论与决议\n\n## 待办事项\n\n- [ ] 事项一 — 负责人 — 截止日期\n- [ ] 事项二",
24
+ },
25
+ "readme": {
26
+ "title": "README",
27
+ "desc": "开源项目 README 模板,含徽章、特性、贡献",
28
+ "md": "# 项目名\n\n简短描述。\n\n## ✨ 特性\n\n- 特性一\n- 特性二\n\n## 📦 安装\n\n```bash\n# ...\n```\n\n## 🚀 使用\n\n## 🤝 贡献\n\n欢迎 PR。\n\n## 📄 License\n\nMIT © 2026",
29
+ },
30
+ "weekly": {
31
+ "title": "周报",
32
+ "desc": "本周完成、下周计划、问题与风险",
33
+ "md": "# 本周工作周报\n\n## ✅ 本周完成\n\n1. \n2. \n\n## 📋 下周计划\n\n1. \n\n## ⚠️ 问题与风险\n\n## 💡 备注",
34
+ },
35
+ "blog": {
36
+ "title": "博客文章",
37
+ "desc": "带封面、引言、小标题的文章结构",
38
+ "md": "# 文章标题\n\n*发布于 2026-06-20*\n\n> 引言:用一段话抓住读者。\n\n## 背景\n\n## 正文小标题\n\n正文内容……\n\n## 结语\n\n感谢阅读。",
39
+ },
40
+ }
41
+
42
+
43
def names():
    """List the slugs of every built-in template (academic, techdoc, ...)."""
    return [slug for slug in TEMPLATES]
46
+
47
+
48
def info():
    """Map every template slug to its ``(title, description)`` pair."""
    summary = {}
    for slug, entry in TEMPLATES.items():
        summary[slug] = (entry["title"], entry["desc"])
    return summary
51
+
52
+
53
def get(name: str) -> str:
    """Look up a template's Markdown body by slug, falling back to its zh title.

    Raises ``KeyError`` listing the valid slugs when nothing matches.
    """
    entry = TEMPLATES.get(name)
    if entry is None:
        # Not a slug: try the human-readable title instead.
        entry = next((v for v in TEMPLATES.values() if v["title"] == name), None)
    if entry is None:
        raise KeyError("unknown template %r; choose from: %s"
                       % (name, ", ".join(TEMPLATES)))
    return entry["md"]
docstudio/tools.py ADDED
@@ -0,0 +1,107 @@
1
+ """Headless document utilities — the toolbox from the Document Studio web app.
2
+
3
+ These are pure / lightweight helpers that don't need a browser:
4
+
5
+ * :func:`generate_toc` — build & insert a Markdown table of contents
6
+ * :func:`merge_pdfs` — concatenate several PDFs into one (pypdf)
7
+ * :func:`extract_images` — pull embedded images out of PDF / DOCX / PPTX / XLSX / EPUB
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ import zipfile
13
+ from pathlib import Path
14
+ from typing import List
15
+
16
+ _IMG_EXT = {"png", "jpg", "jpeg", "gif", "bmp", "svg", "webp", "emf", "wmf", "tiff"}
17
+
18
+
19
def generate_toc(md: str, title: str = "目录") -> str:
    """Return ``md`` with a Markdown table of contents inserted.

    The TOC lists every ``##`` and ``###`` heading (``###`` entries nested
    one level). It is placed right after the first ``#`` title line, or
    prepended when the document has no such title. Documents without any
    ``##``/``###`` heading are returned unchanged.

    Args:
        md: Markdown source.
        title: Heading text used for the TOC section itself.
    """
    heads = re.findall(r"^(#{2,3})\s+(.+)$", md, flags=re.M)
    if not heads:
        return md
    entries = []
    for hashes, text in heads:
        # Two spaces: the minimum indent for a nested "- " list item.
        indent = "  " if len(hashes) == 3 else ""
        entries.append(f"{indent}- {text.strip()}\n")
    toc = f"## {title}\n\n" + "".join(entries)
    m = re.search(r"^#\s+.+$", md, flags=re.M)
    if m is None:
        return toc + "\n" + md
    # Splice at the match position. Replacing the title *text* would hit the
    # first substring occurrence, e.g. inside an earlier "## Intro" heading
    # when the title line is "# Intro".
    return md[:m.end()] + "\n\n" + toc + md[m.end():]
37
+
38
+
39
def merge_pdfs(paths: List[str], out: str) -> str:
    """Concatenate the PDFs in ``paths``, in order, into one file at ``out``.

    Returns the output path as a string. Raises ``ValueError`` when ``paths``
    is empty and ``RuntimeError`` when ``pypdf`` is not installed
    (``pip install "docstudio[pdf]"``).
    """
    if not paths:
        raise ValueError("merge_pdfs: no input PDFs given")
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError as e:  # pragma: no cover
        raise RuntimeError('merge_pdfs needs pypdf: pip install "docstudio[pdf]"') from e
    merged = PdfWriter()
    for pdf_path in paths:
        document = PdfReader(str(pdf_path))
        for sheet in document.pages:
            merged.add_page(sheet)
    destination = str(out)
    with open(destination, "wb") as sink:
        merged.write(sink)
    return destination
59
+
60
+
61
def extract_images(source: str, out_dir: str) -> List[str]:
    """Extract embedded images from ``source`` into ``out_dir``.

    Supports PDF (via PyMuPDF) and the ZIP-based containers DOCX, PPTX, XLSX
    and EPUB, where members whose path mentions ``media`` or ``image`` and
    whose extension is in ``_IMG_EXT`` are copied out.

    Returns the list of written file paths. Raises ``ValueError`` for an
    unsupported extension (before anything is created on disk) and
    ``RuntimeError`` when a PDF is given but PyMuPDF is not installed.
    """
    src = Path(source)
    out = Path(out_dir)
    ext = src.suffix.lower().lstrip(".")
    containers = ("docx", "pptx", "xlsx", "epub")
    # Validate first so an unsupported input does not leave an empty
    # output directory behind.
    if ext != "pdf" and ext not in containers:
        raise ValueError("unsupported source for image extraction: .%s" % ext)
    out.mkdir(parents=True, exist_ok=True)
    written: List[str] = []

    if ext == "pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:  # pragma: no cover
            raise RuntimeError('PDF image extraction needs PyMuPDF: '
                               'pip install "docstudio[ocr]"') from e
        doc = fitz.open(str(src))
        try:
            seen = set()  # an image reused on several pages is saved once
            for pno in range(len(doc)):
                for img in doc.get_page_images(pno):
                    xref = img[0]
                    if xref in seen:
                        continue
                    seen.add(xref)
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha >= 4:  # CMYK / other -> RGB
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    fn = out / ("img_%03d.png" % xref)
                    pix.save(str(fn))
                    written.append(str(fn))
        finally:
            # Close the document even if an image fails to decode or save.
            doc.close()
        return written

    taken = set()  # lower-cased file names already written by this call
    with zipfile.ZipFile(str(src)) as z:
        for name in z.namelist():
            low = name.lower()
            tail = low.rsplit(".", 1)[-1]
            if not (("media" in low or "image" in low) and tail in _IMG_EXT):
                continue
            # Members are flattened to their basename, so two members from
            # different folders can collide; number the later ones instead
            # of silently overwriting the earlier image.
            base = Path(name).name
            stem, suffix = Path(base).stem, Path(base).suffix
            candidate, n = base, 1
            while candidate.lower() in taken:
                candidate = f"{stem}_{n}{suffix}"
                n += 1
            taken.add(candidate.lower())
            target = out / candidate
            target.write_bytes(z.read(name))
            written.append(str(target))
    return written