docstudio 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docstudio/__init__.py +35 -0
- docstudio/assistant.py +73 -0
- docstudio/cli.py +139 -0
- docstudio/core.py +190 -0
- docstudio/export.py +205 -0
- docstudio/ingest.py +205 -0
- docstudio/latex.py +124 -0
- docstudio/llm.py +106 -0
- docstudio/templates.py +61 -0
- docstudio/tools.py +107 -0
- docstudio-0.2.0.dist-info/METADATA +223 -0
- docstudio-0.2.0.dist-info/RECORD +15 -0
- docstudio-0.2.0.dist-info/WHEEL +4 -0
- docstudio-0.2.0.dist-info/entry_points.txt +2 -0
- docstudio-0.2.0.dist-info/licenses/LICENSE +21 -0
docstudio/ingest.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Reverse converters: X -> Markdown.
|
|
2
|
+
|
|
3
|
+
Pure-python paths (csv/tsv/json/html) have no heavy deps. Office / PDF / EPUB
|
|
4
|
+
paths import their library lazily and raise a friendly message if missing, so a
|
|
5
|
+
minimal install still works for the common cases (mirrors markitdown's optional
|
|
6
|
+
extras)."""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import csv as _csv
|
|
10
|
+
import io
|
|
11
|
+
import json
|
|
12
|
+
import zipfile
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from .core import registry, ConversionError, detect_format
|
|
16
|
+
|
|
17
|
+
# --------------------------------------------------------------------------- #
|
|
18
|
+
# small helpers shared by several converters
|
|
19
|
+
# --------------------------------------------------------------------------- #
|
|
20
|
+
def _rows_to_md(rows):
    """Render rows of cells as a GitHub-Flavored Markdown table.

    The first non-blank row becomes the header; shorter rows are padded with
    empty cells so every row has the same number of columns.

    Args:
        rows: iterable of row sequences; cells may be any object (``None``
            renders as an empty cell, everything else via ``str``).

    Returns:
        The table text, one line per row, each ending in a newline, or ``""``
        when no row has any content.
    """
    def cell(x):
        # Trim first so a trailing newline does not become a dangling <br>,
        # and fold CR / CRLF so no raw carriage return leaks into the table.
        text = str("" if x is None else x).strip()
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return text.replace("|", r"\|").replace("\n", "<br>")

    rendered = [[cell(c) for c in r] for r in rows]
    # A row is blank when every rendered cell is empty; this also covers rows
    # made only of None, which str() would otherwise turn into "None".
    rendered = [r for r in rendered if any(r)]
    if not rendered:
        return ""

    cols = max(len(r) for r in rendered)
    padded = [r + [""] * (cols - len(r)) for r in rendered]

    head = padded[0]
    lines = ["| " + " | ".join(head) + " |",
             "|" + "|".join(" --- " for _ in head) + "|"]
    lines.extend("| " + " | ".join(r) + " |" for r in padded[1:])
    return "\n".join(lines) + "\n"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _read_text(source):
    """Return the contents of ``source`` decoded as UTF-8, without a leading BOM."""
    # "utf-8-sig" decodes plain UTF-8 and silently drops a byte-order mark.
    with open(source, encoding="utf-8-sig") as handle:
        return handle.read()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@registry.ingester("csv")
def csv_to_md(source, ds=None, **_):
    """Convert a comma-separated file into a Markdown table.

    ``ds`` and any extra keyword arguments are accepted but not used.
    """
    buffer = io.StringIO(_read_text(source))
    records = list(_csv.reader(buffer))
    return _rows_to_md(records)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@registry.ingester("tsv")
def tsv_to_md(source, ds=None, **_):
    """Convert a tab-separated file into a Markdown table.

    ``ds`` and any extra keyword arguments are accepted but not used.
    """
    buffer = io.StringIO(_read_text(source))
    records = list(_csv.reader(buffer, delimiter="\t"))
    return _rows_to_md(records)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@registry.ingester("json")
def json_to_md(source, ds=None, **_):
    """Convert a JSON file to Markdown.

    A non-empty list of objects becomes a table whose columns are the union
    of the objects' keys in first-seen order. Anything else is pretty-printed
    inside a fenced ``json`` block. Text that does not parse as JSON is
    returned verbatim (trimmed) inside the same kind of fence.

    ``ds`` and any extra keyword arguments are accepted but not used.
    """
    raw = _read_text(source)
    try:
        data = json.loads(raw)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers input
        # nested too deeply to parse. Either way, show the text unchanged.
        return "```json\n" + raw.strip() + "\n```"

    if isinstance(data, list) and data and all(isinstance(x, dict) for x in data):
        cols = list(dict.fromkeys(k for o in data for k in o))

        def render(value):
            # Nested containers and booleans are shown in JSON notation so a
            # cell reads "true" rather than Python's "True".
            if isinstance(value, (dict, list, bool)):
                return json.dumps(value, ensure_ascii=False)
            return value

        rows = [cols] + [[render(o.get(c, "")) for c in cols] for o in data]
        return _rows_to_md(rows)

    return "```json\n" + json.dumps(data, ensure_ascii=False, indent=2) + "\n```"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@registry.ingester("html")
def html_to_md(source, ds=None, html=None, **_):
    """Convert HTML to Markdown using ``markdownify``.

    When ``html`` is given it is converted directly and ``source`` is not
    read; otherwise the markup is loaded from the ``source`` file. Headings
    use ATX (``#``) style and bullets use ``-``.
    """
    from markdownify import markdownify as to_markdown

    if html is None:
        html = _read_text(source)
    markdown = to_markdown(html, heading_style="ATX", bullets="-")
    return markdown.strip()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@registry.ingester("docx")
def docx_to_md(source, ds=None, **_):
    """Convert a .docx file to Markdown by way of HTML (mammoth, then html_to_md).

    Raises:
        ConversionError: if ``mammoth`` is not installed.
    """
    try:
        import mammoth
    except ImportError as exc:  # pragma: no cover
        raise ConversionError(
            "docx support needs `pip install docstudio[office]` (mammoth)") from exc

    with open(source, "rb") as stream:
        result = mammoth.convert_to_html(stream)
    return html_to_md(None, html=result.value)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@registry.ingester("pptx")
def pptx_to_md(source, ds=None, **_):
    """Convert a .pptx deck to Markdown: one ``## Slide N`` section per slide.

    Every non-empty paragraph of every shape that has a text frame becomes a
    ``-`` bullet under its slide heading.

    Raises:
        ConversionError: if ``python-pptx`` is not installed.
    """
    try:
        from pptx import Presentation
    except ImportError as exc:  # pragma: no cover
        raise ConversionError(
            "pptx support needs `pip install docstudio[office]` (python-pptx)") from exc

    deck = Presentation(str(source))
    lines = []
    for number, slide in enumerate(deck.slides, start=1):
        lines.append(f"## Slide {number}\n")
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text = "".join(run.text for run in paragraph.runs).strip()
                if text:
                    lines.append(f"- {text}")
        # blank line separating this slide from the next heading
        lines.append("")
    return "\n".join(lines).strip()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@registry.ingester("xlsx")
def xlsx_to_md(source, ds=None, **_):
    """Convert an .xlsx workbook to Markdown: one ``## <sheet title>`` table per sheet.

    Cell values are read with ``data_only=True`` (stored values rather than
    formulas). Sheets that contain no non-blank row are omitted.

    Raises:
        ConversionError: if ``openpyxl`` is not installed.
    """
    try:
        import openpyxl
    except ImportError as e:  # pragma: no cover
        raise ConversionError("xlsx support needs `pip install docstudio[office]` (openpyxl)") from e

    wb = openpyxl.load_workbook(str(source), read_only=True, data_only=True)
    parts = []
    try:
        for ws in wb.worksheets:
            rows = [["" if c is None else c for c in row]
                    for row in ws.iter_rows(values_only=True)]
            table = _rows_to_md(rows)
            # _rows_to_md returns "" when every row is blank; skip the sheet
            # instead of emitting a heading with no table under it.
            if table:
                parts.append(f"## {ws.title}\n\n" + table)
    finally:
        # A read-only workbook keeps the file open until closed explicitly.
        wb.close()
    return "\n\n".join(parts).strip()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@registry.ingester("epub")
def epub_to_md(source, ds=None, **_):
    """Convert an EPUB to Markdown by converting each document item's HTML.

    Item content is decoded as UTF-8 with undecodable bytes dropped; items
    that convert to nothing are left out, and the rest are joined with blank
    lines.

    Raises:
        ConversionError: if ``ebooklib`` is not installed.
    """
    try:
        from ebooklib import epub, ITEM_DOCUMENT
    except ImportError as exc:  # pragma: no cover
        raise ConversionError(
            "epub support needs `pip install docstudio[office]` (ebooklib)") from exc

    book = epub.read_epub(str(source))
    chapters = (
        html_to_md(None, html=item.get_content().decode("utf-8", "ignore"))
        for item in book.get_items_of_type(ITEM_DOCUMENT)
    )
    return "\n\n".join(ch for ch in chapters if ch.strip()).strip()
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@registry.ingester("pdf")
def pdf_to_md(source, ds=None, vlm_if_scanned=True, **_):
    """Extract the text of a PDF with pdfminer, falling back to a vision model.

    When fewer than 40 characters are extracted the file is treated as a
    scan. If ``vlm_if_scanned`` is true and ``ds.llm`` is set and reports
    vision support, the pages are transcribed through ``_pdf_vlm`` instead.
    Otherwise the extracted text is returned with runs of blank lines
    collapsed.

    Raises:
        ConversionError: if ``pdfminer.six`` is not installed.
    """
    try:
        from pdfminer.high_level import extract_text
    except ImportError as e:  # pragma: no cover
        raise ConversionError("pdf support needs `pip install docstudio[pdf]` (pdfminer.six)") from e

    text = (extract_text(str(source)) or "").strip()
    llm = getattr(ds, "llm", None) if ds is not None else None
    # Only take the vision path when the client can actually do vision, the
    # same check image_to_md makes. Without it, a short but genuine text PDF
    # combined with a text-only LLM raised instead of returning its text.
    # Clients that do not expose ``has_vision`` keep the previous behaviour.
    can_see = bool(llm) and getattr(llm, "has_vision", True)
    if len(text) < 40 and vlm_if_scanned and can_see:
        return _pdf_vlm(source, ds)
    return _collapse(text)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _pdf_vlm(source, ds):
    """Transcribe a PDF page by page through ``ds.llm.vlm_extract``.

    Each page is rasterised at 180 dpi to PNG bytes with PyMuPDF and sent to
    the vision model in ``"text"`` mode; page results are joined with blank
    lines.

    Raises:
        ConversionError: if PyMuPDF is not installed.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as e:  # pragma: no cover
        raise ConversionError("scanned-PDF VLM path needs `pip install docstudio[ocr]` (PyMuPDF)") from e

    doc = fitz.open(str(source))
    try:
        pages = [
            ds.llm.vlm_extract(page.get_pixmap(dpi=180).tobytes("png"), mode="text")
            for page in doc
        ]
    finally:
        # Release the file handle even if a model call fails part-way through.
        doc.close()
    return "\n\n".join(pages).strip()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@registry.ingester("image")
def image_to_md(source, ds=None, mode="text", lang="chi_sim+eng", **_):
    """Turn an image into Markdown text.

    Uses the configured vision model when ``ds.llm`` reports ``has_vision``;
    otherwise falls back to local OCR with pytesseract.

    Args:
        source: path of the image file.
        ds: optional object whose ``llm`` attribute is the LLM client.
        mode: passed to ``vlm_extract`` (the vision path only).
        lang: Tesseract language spec for the OCR fallback. The default keeps
            the previous hard-coded value; pass e.g. ``"eng"`` when the
            Chinese language data is not installed.

    Raises:
        ConversionError: if no vision model is available and
            pytesseract / Pillow are not installed.
    """
    if ds is not None and getattr(ds, "llm", None) and ds.llm.has_vision:
        return ds.llm.vlm_extract(Path(source).read_bytes(), mode=mode)
    try:
        import pytesseract
        from PIL import Image
    except ImportError as e:  # pragma: no cover
        raise ConversionError(
            "image OCR needs a vision model (configure llm) or "
            "`pip install docstudio[ocr]` (pytesseract+Pillow)") from e
    # Close the image file once OCR has run.
    with Image.open(source) as img:
        return pytesseract.image_to_string(img, lang=lang).strip()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@registry.ingester("zip")
def zip_to_md(source, ds=None, **_):
    """Convert every recognised file inside a zip archive to Markdown.

    Members are visited in sorted name order. Directories, ``__MACOSX``
    entries and members whose format ``detect_format`` rejects are skipped.
    Each remaining member is extracted to a private temporary directory and
    converted with ``ds.to_markdown``; a conversion failure is reported inline
    as ``_(failed: ...)_`` rather than aborting the archive.

    Returns:
        ``# Archive (N files)`` followed by one ``## <member name>`` section
        per converted member, or ``"(no convertible files)"`` when nothing was
        converted (which is always the case when ``ds`` is not given).
    """
    import tempfile

    sections = []
    # A private temporary directory replaces the fixed "/tmp/<basename>" path,
    # which clobbered and then deleted any existing file of that name and does
    # not exist on every platform.
    with zipfile.ZipFile(source) as zf, tempfile.TemporaryDirectory() as workdir:
        for name in sorted(zf.namelist()):
            if name.endswith("/") or "__MACOSX" in name:
                continue
            try:
                fmt = detect_format(name)
            except ConversionError:
                continue
            if not ds:
                # No converter to hand the file to, so do not extract it.
                continue
            tmp = Path(workdir) / Path(name).name
            tmp.write_bytes(zf.read(name))
            try:
                part = ds.to_markdown(tmp, fmt=fmt)
            except Exception as e:  # noqa: BLE001
                # Best effort: one bad member must not fail the whole archive.
                part = f"_(failed: {e})_"
            finally:
                tmp.unlink(missing_ok=True)
            if part.strip():
                sections.append(f"## {name}\n\n{part.strip()}\n\n---")

    if not sections:
        return "(no convertible files)"
    return (f"# Archive ({len(sections)} files)\n\n" + "\n\n".join(sections)).strip()
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _collapse(text):
    """Squeeze runs of three or more newlines down to two and trim the ends."""
    import re

    blank_runs = re.compile(r"\n{3,}")
    squeezed = blank_runs.sub("\n\n", text)
    return squeezed.strip()
|
docstudio/latex.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Markdown -> LaTeX (self-contained, no pandoc required).
|
|
2
|
+
|
|
3
|
+
A faithful port of the browser tool's tested `mdToLaTeX`: math (`$...$`,
|
|
4
|
+
`$$...$$`) and code are protected before escaping; Chinese documents use the
|
|
5
|
+
``ctexart`` class (compile with xelatex), otherwise ``article``."""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
# LaTeX replacements for characters that are special in running text.
_SPECIAL = {"&": r"\&", "%": r"\%", "#": r"\#", "_": r"\_",
            "{": r"\{", "}": r"\}", "$": r"\$",
            "~": r"\textasciitilde{}", "^": r"\textasciicircum{}"}

# Single-pass translation table: every input character is mapped exactly once,
# so braces introduced by a replacement are never escaped a second time.
# NUL maps to \textbackslash{} as well because the previous multi-pass
# implementation used it as its backslash placeholder.
_ESCAPE_TABLE = str.maketrans({**_SPECIAL,
                               "\\": r"\textbackslash{}",
                               "\x00": r"\textbackslash{}"})


def _escape(text: str) -> str:
    """Escape LaTeX special characters (including backslash) in ``text``."""
    return text.translate(_ESCAPE_TABLE)


def _inline(text: str) -> str:
    """Convert inline Markdown (code, bold, italic, links) to LaTeX.

    Inline code spans are lifted out first so their content is escaped once
    and never interpreted as emphasis or links; the remaining text is escaped
    and then ``**bold**``, ``*italic*`` and ``[text](url)`` are rewritten.
    """
    codes = []

    def _stash(m):
        codes.append(m.group(1))
        return f"\x01{len(codes) - 1}\x01"

    # protect inline code first
    text = re.sub(r"`([^`]+)`", _stash, text)

    text = _escape(text)
    text = re.sub(r"\*\*(.+?)\*\*", r"\\textbf{\1}", text)
    text = re.sub(r"(?<!\*)\*(?!\*)(.+?)\*(?!\*)", r"\\textit{\1}", text)
    text = re.sub(r"\[(.+?)\]\((.+?)\)", r"\\href{\2}{\1}", text)

    def _unstash(m):
        idx = int(m.group(1))
        if idx >= len(codes):
            # not one of our placeholders; leave the text untouched
            return m.group(0)
        return r"\texttt{" + _escape(codes[idx]) + "}"

    # Restore in one left-to-right pass. Replacing placeholder by placeholder
    # with str.replace could match a span made of the end of one placeholder,
    # a literal digit and the start of the next (e.g. "`a` `b`0`c`").
    return re.sub(r"\x01(\d+)\x01", _unstash, text)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def md_to_latex(md: str, title: str = "", chinese: bool | None = None) -> str:
    """Convert a Markdown string into a complete LaTeX document.

    Supported block syntax: ``#`` headings, pipe tables, flat bullet and
    numbered lists, ``>`` block quotes and plain paragraphs. Math
    (``$...$``, ``$$...$$``) is passed through untouched, fenced code becomes
    a ``verbatim`` environment and inline code becomes ``\\texttt``.

    Args:
        md: Markdown source.
        title: optional document title; when given, ``\\title`` and
            ``\\maketitle`` are emitted.
        chinese: force (``True``) or suppress (``False``) the ``ctexart``
            class; ``None`` picks it when ``md`` contains CJK ideographs.

    Returns:
        The LaTeX source, preamble included.
    """
    if chinese is None:
        chinese = bool(re.search(r"[\u4e00-\u9fff]", md))

    # Protect math and code with placeholders before any escaping happens.
    # Inline code is stashed here too: otherwise a "$" inside a code span is
    # taken for math and later restored unescaped inside \texttt{...}.
    store = []

    def _stash(m):
        store.append(m.group(0))
        return f"\x02{len(store) - 1}\x02"

    md = re.sub(
        r"```[\s\S]*?```"                # fenced code
        r"|\$\$[\s\S]+?\$\$"             # display math
        r"|\$(?:\\.|[^$\n])+?\$"         # inline math
        r"|(?<!`)`[^`\n]+`(?!`)",        # single-backtick inline code
        _stash, md)

    body, lines, i = [], md.split("\n"), 0
    while i < len(lines):
        line = lines[i]

        # heading
        m = re.match(r"^(#{1,6})\s+(.+)", line)
        if m:
            depth = len(m.group(1))
            cmd = ["section", "section", "subsection", "subsubsection",
                   "paragraph", "subparagraph", "subparagraph"][depth]
            body.append(f"\\{cmd}{{{_inline(m.group(2))}}}")
            i += 1
            continue

        # table: a |...| row followed by a separator row containing dashes
        if (re.match(r"^\s*\|.*\|\s*$", line) and i + 1 < len(lines)
                and re.match(r"^\s*\|?[\s:|-]+\|?\s*$", lines[i + 1])
                and "-" in lines[i + 1]):
            block = []
            while i < len(lines) and "|" in lines[i]:
                block.append(lines[i])
                i += 1
            body.append(_table(block))
            continue

        # unordered / ordered list (the first item decides the environment)
        if re.match(r"^\s*[-*]\s+", line) or re.match(r"^\s*\d+\.\s+", line):
            ordered = bool(re.match(r"^\s*\d+\.", line))
            env = "enumerate" if ordered else "itemize"
            items = []
            while i < len(lines) and (re.match(r"^\s*[-*]\s+", lines[i])
                                      or re.match(r"^\s*\d+\.\s+", lines[i])):
                items.append(re.sub(r"^\s*(?:[-*]|\d+\.)\s+", "", lines[i]))
                i += 1
            body.append(f"\\begin{{{env}}}")
            body += [f" \\item {_inline(it)}" for it in items]
            body.append(f"\\end{{{env}}}")
            continue

        # blockquote: consecutive ">" lines are joined into one paragraph
        if line.startswith(">"):
            quote = []
            while i < len(lines) and lines[i].startswith(">"):
                quote.append(lines[i][1:].strip())
                i += 1
            body.append("\\begin{quote}\n" + _inline(" ".join(quote)) + "\n\\end{quote}")
            continue

        if line.strip() == "":
            body.append("")
            i += 1
            continue

        body.append(_inline(line))
        i += 1

    text = "\n".join(body)

    # restore math / code
    def _restore(m):
        idx = int(m.group(1))
        if idx >= len(store):
            # stray control characters in the input, not one of our placeholders
            return m.group(0)
        s = store[idx]
        if s.startswith("```"):
            # Drop exactly the two fences; str.strip("`") would also eat
            # backticks that belong to the code itself.
            inner = s[3:-3]
            # The rest of the opening fence line is the info string
            # ("python", "c++", ...), not code.
            inner = re.sub(r"^[^\n]*\n", "", inner, count=1)
            if inner.endswith("\n"):
                # avoid an empty line just before \end{verbatim}
                inner = inner[:-1]
            return "\\begin{verbatim}\n" + inner + "\n\\end{verbatim}"
        if s.startswith("`"):
            return r"\texttt{" + _escape(s[1:-1]) + "}"
        return s

    text = re.sub(r"\x02(\d+)\x02", _restore, text)

    cls = "ctexart" if chinese else "article"
    pkgs = ["\\usepackage{amsmath,amssymb}", "\\usepackage{hyperref}",
            "\\usepackage{graphicx}", "\\usepackage{booktabs}"]
    head = (f"\\documentclass[12pt]{{{cls}}}\n" + "\n".join(pkgs) +
            (f"\n\\title{{{_inline(title)}}}\n\\date{{}}" if title else ""))
    doc = "\\begin{document}\n" + ("\\maketitle\n" if title else "") + text + "\n\\end{document}\n"
    return head + "\n" + doc
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _table(block):
    """Render the lines of a Markdown pipe table as a booktabs ``tabular``.

    Args:
        block: the table's lines; index 0 is the header row and index 1 the
            ``|---|`` separator row, which is dropped.

    Returns:
        A left-aligned ``tabular`` with ``\\toprule`` / ``\\midrule`` /
        ``\\bottomrule``; short rows are padded with empty cells.
    """
    rows = [r for k, r in enumerate(block) if k != 1]
    cells = []
    for r in rows:
        # Remove the outer pipes, then split on unescaped pipes only. Splitting
        # on every "|" would cut a cell containing an escaped pipe ("\|") in
        # two before it could be unescaped.
        r = re.sub(r"^\s*\||(?<!\\)\|\s*$", "", r)
        cells.append([c.replace(r"\|", "|").strip()
                      for c in re.split(r"(?<!\\)\|", r)])

    ncol = max(len(r) for r in cells)
    spec = "l" * ncol
    out = ["\\begin{tabular}{" + spec + "}", "\\toprule"]
    for k, r in enumerate(cells):
        r = r + [""] * (ncol - len(r))
        out.append(" & ".join(_inline(c) for c in r) + " \\\\")
        if k == 0:
            # rule between the header row and the body
            out.append("\\midrule")
    out += ["\\bottomrule", "\\end{tabular}"]
    return "\n".join(out)
|
docstudio/llm.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""OpenAI-compatible LLM + Vision (VLM) client.
|
|
2
|
+
|
|
3
|
+
Mirrors the browser tool: one base URL / key for a text model, and an optional
|
|
4
|
+
(possibly different) endpoint for a vision model used to recognise images and
|
|
5
|
+
scanned PDFs. Only depends on ``requests``."""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import base64
|
|
9
|
+
from typing import List, Optional, Union
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LLM:
    """Client for OpenAI-compatible chat-completions and vision endpoints.

    The text model is reached at ``base_url`` with ``api_key``. The vision
    model (``vlm_model``) uses ``vlm_base_url`` / ``vlm_api_key``, each of
    which falls back to its text counterpart when not given. ``requests`` is
    imported lazily inside the methods that perform HTTP calls.
    """

    def __init__(self, base_url="", api_key="",
                 model="", *, vlm_model="", vlm_base_url="", vlm_api_key="",
                 temperature=0.2):
        """Store the connection settings; trailing slashes on URLs are removed."""
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self.vlm_model = vlm_model
        self.vlm_base_url = (vlm_base_url or base_url).rstrip("/")
        self.vlm_api_key = vlm_api_key or api_key
        self.temperature = temperature

    # -- discovery -------------------------------------------------------- #
    @staticmethod
    def fetch_models(base_url: str, api_key: str) -> List[str]:
        """Return the sorted, de-duplicated model ids served at ``base_url``.

        Accepts both the ``{"data": [...]}`` envelope and a bare list;
        entries may be plain strings or objects with an ``id`` or ``name``.

        Raises:
            requests.HTTPError: on a non-success response.
        """
        import requests
        base = base_url.rstrip("/")
        ep = base + ("/models" if base.endswith("/v1") else "/v1/models")
        r = requests.get(ep, headers={"Authorization": f"Bearer {api_key}"}, timeout=30)
        r.raise_for_status()
        # Parse once. A bare-list payload has no .get(), which used to crash.
        payload = r.json()
        data = payload.get("data", payload) if isinstance(payload, dict) else payload
        ids = []
        for m in (data or []):
            if isinstance(m, str):
                ids.append(m)
            elif isinstance(m, dict):
                ids.append(m.get("id") or m.get("name"))
        return sorted({i for i in ids if i})

    # -- text ------------------------------------------------------------- #
    def chat(self, system: str, user: str) -> str:
        """Send one system + user exchange to the text model and return its reply.

        Raises:
            ValueError: if ``base_url`` or ``model`` is not configured.
            requests.HTTPError: on a non-success response.
        """
        import requests
        if not self.base_url:
            raise ValueError("LLM.base_url is not set — pass your OpenAI-compatible "
                             "endpoint, e.g. base_url='https://api.openai.com'")
        if not self.model:
            raise ValueError("LLM.model is not set — pass the model id for your provider, "
                             "e.g. model='gpt-4o-mini'")
        ep = self._ep(self.base_url)
        r = requests.post(ep, headers=self._h(self.api_key), timeout=120, json={
            "model": self.model, "temperature": self.temperature,
            "messages": [{"role": "system", "content": system},
                         {"role": "user", "content": user}]})
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

    def cleanup_markdown(self, raw: str) -> str:
        """AI 'smart cleanup' — turn rough extracted text into clean Markdown."""
        return self.chat(
            "You convert rough text into clean, well-structured GitHub-Flavored "
            "Markdown. Preserve headings, lists, tables and math ($...$). "
            "Output only Markdown.", raw)

    # -- vision ----------------------------------------------------------- #
    @property
    def has_vision(self) -> bool:
        """True when a vision model id and an API key are both configured."""
        return bool(self.vlm_model and (self.vlm_api_key or self.api_key))

    def vlm_extract(self, image: Union[bytes, str], mode: str = "text") -> str:
        """Ask the vision model to transcribe ``image`` as Markdown.

        Args:
            image: raw image bytes, a file path, or a ready ``data:`` URL.
            mode: ``"table"`` requests Markdown tables only; any other value
                requests a full text transcription.

        Returns:
            The model output with a surrounding Markdown code fence removed.

        Raises:
            ValueError: if no vision endpoint or vision model is configured.
            requests.HTTPError: on a non-success response.
        """
        import re
        import requests
        if not self.vlm_base_url:
            raise ValueError("no vision endpoint — set vlm_base_url (or base_url) and vlm_model")
        if not self.vlm_model:
            raise ValueError("LLM.vlm_model is not set — pass the vision model id for your provider")
        data_url = self._data_url(image)
        prompt = (
            "Recognise the table(s) in the image and output ONLY standard "
            "Markdown table(s) with a header and |---| separator row, no prose."
            if mode == "table" else
            "Transcribe all text in the image as Markdown, preserving headings, "
            "lists, tables and math ($...$). Output only the content.")
        ep = self._ep(self.vlm_base_url)
        r = requests.post(ep, headers=self._h(self.vlm_api_key), timeout=180, json={
            "model": self.vlm_model, "temperature": 0.1, "max_tokens": 4096,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_url}}]}]})
        r.raise_for_status()
        out = r.json()["choices"][0]["message"]["content"]
        if isinstance(out, list):
            # content returned as a list of parts: concatenate their text
            out = "".join(p.get("text", "") for p in out)
        if out is None:
            # a null content would otherwise make re.sub raise TypeError
            out = ""
        return re.sub(r"^```(?:markdown|md)?\s*\n?|\n?```\s*$", "", out).strip()

    # -- helpers ---------------------------------------------------------- #
    @staticmethod
    def _ep(base):
        """Return the chat-completions URL for ``base`` (with or without ``/v1``)."""
        return base + ("/chat/completions" if base.endswith("/v1") else "/v1/chat/completions")

    @staticmethod
    def _h(key):
        """Return the JSON request headers carrying ``key`` as a bearer token."""
        return {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    @staticmethod
    def _mime(raw):
        """Guess the image MIME type from its leading bytes; PNG when unrecognised."""
        if raw.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if raw.startswith((b"GIF87a", b"GIF89a")):
            return "image/gif"
        if raw[:4] == b"RIFF" and raw[8:12] == b"WEBP":
            return "image/webp"
        return "image/png"

    @staticmethod
    def _data_url(image):
        """Return ``image`` (bytes, file path or ``data:`` URL) as a base64 data URL."""
        if isinstance(image, str) and image.startswith("data:"):
            return image
        if isinstance(image, (bytes, bytearray)):
            raw = bytes(image)
        else:
            # read through a context manager so the file handle is closed
            with open(image, "rb") as fh:
                raw = fh.read()
        # Label the payload with its real type instead of always claiming PNG.
        return "data:" + LLM._mime(raw) + ";base64," + base64.b64encode(raw).decode()
|
docstudio/templates.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Built-in Markdown templates — the template library from the web app.
|
|
2
|
+
|
|
3
|
+
Each template mirrors the Document Studio web app. Use :func:`names` to list
|
|
4
|
+
them and :func:`get` to fetch the Markdown body.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
# Built-in template registry, keyed by slug. Each entry is a dict with:
#   "title" - display title (Chinese)
#   "desc"  - one-line description (Chinese)
#   "md"    - the Markdown body of the template
# Read by names(), info() and get() in this module.
TEMPLATES = {
    "academic": {
        "title": "学术论文",
        "desc": "标题、摘要、引言、方法、实验、结论、参考文献结构",
        "md": "# 论文标题\n\n## 摘要\n\n本文提出……\n\n**关键词**:关键词1;关键词2\n\n## 1. 引言\n\n研究背景与动机。公式示例 $f(x)=\\sum_{i=1}^{n} w_i x_i$。\n\n## 2. 方法\n\n### 2.1 模型\n\n$$\\mathcal{L}=\\mathcal{L}_{cls}+\\lambda\\mathcal{L}_{reg}$$\n\n## 3. 实验\n\n| 方法 | 准确率 |\n|------|--------|\n| Baseline | 78.3% |\n| Ours | **85.1%** |\n\n## 4. 结论\n\n## 参考文献\n\n[1] Author. Title. Venue, Year.",
    },
    "techdoc": {
        "title": "技术文档",
        "desc": "概述、安装、用法、API、示例的标准技术文档",
        "md": "# 项目名称\n\n> 一句话简介。\n\n## 概述\n\n## 安装\n\n```bash\nnpm install package-name\n```\n\n## 快速开始\n\n```js\nimport { run } from 'package-name';\nrun();\n```\n\n## API\n\n### `run(options)`\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| options | object | 配置项 |\n\n## 许可证\n\nMIT",
    },
    "minutes": {
        "title": "会议纪要",
        "desc": "时间、参会人、议题、决议、待办事项",
        "md": "# 会议纪要\n\n- **时间**:2026-06-20 14:00\n- **参会人**:\n- **主持**:\n\n## 议题\n\n1. \n2. \n\n## 讨论与决议\n\n## 待办事项\n\n- [ ] 事项一 — 负责人 — 截止日期\n- [ ] 事项二",
    },
    "readme": {
        "title": "README",
        "desc": "开源项目 README 模板,含徽章、特性、贡献",
        "md": "# 项目名\n\n简短描述。\n\n## ✨ 特性\n\n- 特性一\n- 特性二\n\n## 📦 安装\n\n```bash\n# ...\n```\n\n## 🚀 使用\n\n## 🤝 贡献\n\n欢迎 PR。\n\n## 📄 License\n\nMIT © 2026",
    },
    "weekly": {
        "title": "周报",
        "desc": "本周完成、下周计划、问题与风险",
        "md": "# 本周工作周报\n\n## ✅ 本周完成\n\n1. \n2. \n\n## 📋 下周计划\n\n1. \n\n## ⚠️ 问题与风险\n\n## 💡 备注",
    },
    "blog": {
        "title": "博客文章",
        "desc": "带封面、引言、小标题的文章结构",
        "md": "# 文章标题\n\n*发布于 2026-06-20*\n\n> 引言:用一段话抓住读者。\n\n## 背景\n\n## 正文小标题\n\n正文内容……\n\n## 结语\n\n感谢阅读。",
    },
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def names():
    """List the slugs of all built-in templates, in definition order."""
    return list(TEMPLATES.keys())
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def info():
    """Map every template slug to its ``(title, description)`` pair."""
    summary = {}
    for slug, entry in TEMPLATES.items():
        summary[slug] = (entry["title"], entry["desc"])
    return summary
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get(name: str) -> str:
    """Look up a template by slug, or failing that by title, and return its Markdown.

    Raises:
        KeyError: if ``name`` matches neither a slug nor a title; the message
            lists the available slugs.
    """
    entry = TEMPLATES.get(name)
    if entry is None:
        # not a slug: fall back to the first template whose title matches
        entry = next((v for v in TEMPLATES.values() if v["title"] == name), None)
    if entry is None:
        raise KeyError("unknown template %r; choose from: %s"
                       % (name, ", ".join(TEMPLATES)))
    return entry["md"]
|
docstudio/tools.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Headless document utilities — the toolbox from the Document Studio web app.
|
|
2
|
+
|
|
3
|
+
These are pure / lightweight helpers that don't need a browser:
|
|
4
|
+
|
|
5
|
+
* :func:`generate_toc` — build & insert a Markdown table of contents
|
|
6
|
+
* :func:`merge_pdfs` — concatenate several PDFs into one (pypdf)
|
|
7
|
+
* :func:`extract_images` — pull embedded images out of PDF / DOCX / PPTX / EPUB
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import zipfile
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List
|
|
15
|
+
|
|
16
|
+
# Lower-case file extensions (without the dot) that extract_images accepts as
# images when scanning the members of DOCX / PPTX / XLSX / EPUB archives.
_IMG_EXT = {"png", "jpg", "jpeg", "gif", "bmp", "svg", "webp", "emf", "wmf", "tiff"}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_toc(md: str, title: str = "目录") -> str:
    """Return ``md`` with a Markdown table of contents inserted.

    The TOC lists the ``##`` and ``###`` headings (``###`` entries nested one
    level) under a ``## <title>`` heading. It is placed right after the first
    ``#`` title line, or prepended when the document has none. Lines inside
    fenced code blocks are never treated as headings. ``md`` is returned
    unchanged when it has no ``##`` / ``###`` heading.

    Args:
        md: Markdown source.
        title: heading text for the generated TOC section.
    """
    entries = []        # (level, text) for every ## / ### heading
    title_end = None    # offset just past the first "# title" line
    fence = None        # marker of the code fence currently open, if any
    pos = 0
    for line in md.split("\n"):
        marker = line.lstrip()[:3]
        if fence is None and marker in ("```", "~~~"):
            fence = marker
        elif fence is not None:
            # inside a fence: only the matching marker closes it
            if marker == fence:
                fence = None
        else:
            # [ \t]+ rather than \s+ so a bare "##" cannot swallow the next line
            h = re.match(r"(#{1,3})[ \t]+(\S.*)$", line)
            if h:
                level = len(h.group(1))
                if level == 1:
                    if title_end is None:
                        title_end = pos + len(line)
                else:
                    entries.append((level, h.group(2).strip()))
        pos += len(line) + 1

    if not entries:
        return md

    # Two spaces are the minimum indent that nests an item under a "- " bullet.
    toc = "## %s\n\n" % title + "".join(
        "%s- %s\n" % ("  " if level == 3 else "", text) for level, text in entries)

    if title_end is not None:
        # Splice by position: replacing the title *text* could hit an earlier
        # substring, e.g. "# Title" inside a preceding "## Title" line.
        return md[:title_end] + "\n\n" + toc + md[title_end:]
    return toc + "\n" + md
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def merge_pdfs(paths: List[str], out: str) -> str:
    """Concatenate the PDFs in ``paths``, in order, into one file at ``out``.

    Requires ``pypdf`` (``pip install "docstudio[pdf]"``).

    Returns:
        ``out`` as a string.

    Raises:
        ValueError: if ``paths`` is empty.
        RuntimeError: if ``pypdf`` is not installed.
    """
    if not paths:
        raise ValueError("merge_pdfs: no input PDFs given")
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError as exc:  # pragma: no cover
        raise RuntimeError('merge_pdfs needs pypdf: pip install "docstudio[pdf]"') from exc

    merged = PdfWriter()
    all_pages = (page for src in paths for page in PdfReader(str(src)).pages)
    for page in all_pages:
        merged.add_page(page)

    destination = str(out)
    with open(destination, "wb") as sink:
        merged.write(sink)
    return destination
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def extract_images(source: str, out_dir: str) -> List[str]:
    """Extract embedded images from ``source`` into ``out_dir``.

    PDFs are handled with PyMuPDF: each distinct image xref is saved as
    ``img_<xref>.png``. DOCX / PPTX / XLSX / EPUB files are read as zip
    archives: every member whose path contains ``media`` or ``image`` and
    whose extension is in ``_IMG_EXT`` is written under its base name, with a
    numeric suffix added when two members share a base name.

    ``out_dir`` is created if needed.

    Returns:
        The paths of the files written, in extraction order.

    Raises:
        RuntimeError: for a PDF when PyMuPDF is not installed.
        ValueError: if the extension of ``source`` is not supported.
    """
    src = Path(source)
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    ext = src.suffix.lower().lstrip(".")
    written: List[str] = []

    if ext == "pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:  # pragma: no cover
            raise RuntimeError('PDF image extraction needs PyMuPDF: '
                               'pip install "docstudio[ocr]"') from e
        doc = fitz.open(str(src))
        try:
            seen = set()
            for pno in range(len(doc)):
                for img in doc.get_page_images(pno):
                    xref = img[0]
                    if xref in seen:
                        # the same image object can be used on several pages
                        continue
                    seen.add(xref)
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha >= 4:  # CMYK / other -> RGB
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    fn = out / ("img_%03d.png" % xref)
                    pix.save(str(fn))
                    written.append(str(fn))
        finally:
            # close the document even when saving an image fails
            doc.close()

    elif ext in ("docx", "pptx", "xlsx", "epub"):
        taken = set()
        with zipfile.ZipFile(str(src)) as z:
            for name in z.namelist():
                low = name.lower()
                tail = low.rsplit(".", 1)[-1]
                if not (("media" in low or "image" in low) and tail in _IMG_EXT):
                    continue
                base = Path(name).name
                target = out / base
                # Members in different folders can share a base name; number
                # the later ones instead of overwriting the earlier file.
                n = 1
                while str(target) in taken:
                    target = out / ("%s_%d%s" % (Path(base).stem, n, Path(base).suffix))
                    n += 1
                taken.add(str(target))
                target.write_bytes(z.read(name))
                written.append(str(target))
    else:
        raise ValueError("unsupported source for image extraction: .%s" % ext)

    return written
|