docstudio 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docstudio-0.2.0/.gitignore +35 -0
- docstudio-0.2.0/LICENSE +21 -0
- docstudio-0.2.0/PKG-INFO +223 -0
- docstudio-0.2.0/README.md +162 -0
- docstudio-0.2.0/docstudio/__init__.py +35 -0
- docstudio-0.2.0/docstudio/assistant.py +73 -0
- docstudio-0.2.0/docstudio/cli.py +139 -0
- docstudio-0.2.0/docstudio/core.py +190 -0
- docstudio-0.2.0/docstudio/export.py +205 -0
- docstudio-0.2.0/docstudio/ingest.py +205 -0
- docstudio-0.2.0/docstudio/latex.py +124 -0
- docstudio-0.2.0/docstudio/llm.py +106 -0
- docstudio-0.2.0/docstudio/templates.py +61 -0
- docstudio-0.2.0/docstudio/tools.py +107 -0
- docstudio-0.2.0/pyproject.toml +59 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Build artifacts
|
|
2
|
+
dist/
|
|
3
|
+
build/
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
.eggs/
|
|
7
|
+
|
|
8
|
+
# Byte-compiled / cache
|
|
9
|
+
__pycache__/
|
|
10
|
+
*.py[cod]
|
|
11
|
+
*.so
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
ENV/
|
|
18
|
+
|
|
19
|
+
# Tooling / IDE
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
.mypy_cache/
|
|
23
|
+
.pytest_cache/
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
|
|
28
|
+
# OS
|
|
29
|
+
.DS_Store
|
|
30
|
+
Thumbs.db
|
|
31
|
+
|
|
32
|
+
# Local secrets — never commit tokens
|
|
33
|
+
.pypirc
|
|
34
|
+
*.token
|
|
35
|
+
.env
|
docstudio-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 biaoli
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
docstudio-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docstudio
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Bidirectional, Markdown-centric document conversion: reverse (X->Markdown) like markitdown, plus high-fidelity forward export (Markdown->PDF/Word/LaTeX/EPUB/Excel) and optional VLM image recognition.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Sudo-Biao/docstudio
|
|
6
|
+
Project-URL: Repository, https://github.com/Sudo-Biao/docstudio
|
|
7
|
+
Project-URL: Issues, https://github.com/Sudo-Biao/docstudio/issues
|
|
8
|
+
Author-email: biaoli <biaoli@swufe.edu.cn>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: conversion,document,docx,epub,latex,markdown,markitdown,ocr,pdf,vlm
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Office/Business
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: markdown>=3.5
|
|
25
|
+
Requires-Dist: markdownify>=0.11
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: ebooklib; extra == 'all'
|
|
28
|
+
Requires-Dist: mammoth; extra == 'all'
|
|
29
|
+
Requires-Dist: markitdown[all]; extra == 'all'
|
|
30
|
+
Requires-Dist: openpyxl; extra == 'all'
|
|
31
|
+
Requires-Dist: pdfminer-six; extra == 'all'
|
|
32
|
+
Requires-Dist: pillow; extra == 'all'
|
|
33
|
+
Requires-Dist: pymupdf; extra == 'all'
|
|
34
|
+
Requires-Dist: pypdf; extra == 'all'
|
|
35
|
+
Requires-Dist: pytesseract; extra == 'all'
|
|
36
|
+
Requires-Dist: python-docx; extra == 'all'
|
|
37
|
+
Requires-Dist: python-pptx; extra == 'all'
|
|
38
|
+
Requires-Dist: requests; extra == 'all'
|
|
39
|
+
Provides-Extra: llm
|
|
40
|
+
Requires-Dist: requests; extra == 'llm'
|
|
41
|
+
Provides-Extra: markitdown
|
|
42
|
+
Requires-Dist: markitdown[all]; extra == 'markitdown'
|
|
43
|
+
Provides-Extra: ocr
|
|
44
|
+
Requires-Dist: pillow; extra == 'ocr'
|
|
45
|
+
Requires-Dist: pymupdf; extra == 'ocr'
|
|
46
|
+
Requires-Dist: pytesseract; extra == 'ocr'
|
|
47
|
+
Provides-Extra: office
|
|
48
|
+
Requires-Dist: ebooklib; extra == 'office'
|
|
49
|
+
Requires-Dist: mammoth; extra == 'office'
|
|
50
|
+
Requires-Dist: openpyxl; extra == 'office'
|
|
51
|
+
Requires-Dist: python-docx; extra == 'office'
|
|
52
|
+
Requires-Dist: python-pptx; extra == 'office'
|
|
53
|
+
Provides-Extra: pdf
|
|
54
|
+
Requires-Dist: pdfminer-six; extra == 'pdf'
|
|
55
|
+
Requires-Dist: pypdf; extra == 'pdf'
|
|
56
|
+
Provides-Extra: pdf-chrome
|
|
57
|
+
Requires-Dist: playwright; extra == 'pdf-chrome'
|
|
58
|
+
Provides-Extra: pdf-weasy
|
|
59
|
+
Requires-Dist: weasyprint; extra == 'pdf-weasy'
|
|
60
|
+
Description-Content-Type: text/markdown
|
|
61
|
+
|
|
62
|
+
# DocumentStudio (Python)
|
|
63
|
+
|
|
64
|
+
A **bidirectional, Markdown-centric** document converter — a Python library and
|
|
65
|
+
CLI in the spirit of Microsoft's [`markitdown`](https://github.com/microsoft/markitdown),
|
|
66
|
+
but going **both ways**:
|
|
67
|
+
|
|
68
|
+
| Direction | Formats | Notes |
|
|
69
|
+
|-----------|---------|-------|
|
|
70
|
+
| **Reverse** `X → Markdown` | PDF, Word, PPT, Excel, EPUB, HTML, CSV/TSV, JSON, ZIP, images | like markitdown; can *delegate to* markitdown when installed |
|
|
71
|
+
| **Forward** `Markdown → X` | HTML, **PDF**, **Word (.docx)**, **LaTeX**, EPUB, Excel (.xlsx), text | high-fidelity export — the part markitdown does **not** do |
|
|
72
|
+
| **AI / VLM** | image & scanned-PDF recognition, "smart cleanup" | any OpenAI-compatible endpoint; vision model optional |
|
|
73
|
+
| **AI assistant** | polish, translate, summarise, expand, continue, grammar, formalise, titles, outline, fix-LaTeX, free-form | one-shot ops on a document |
|
|
74
|
+
| **Toolbox** | table of contents, merge PDFs, extract images | headless, no browser |
|
|
75
|
+
| **Templates** | academic, techdoc, minutes, readme, weekly, blog | ready-to-edit Markdown |
|
|
76
|
+
|
|
77
|
+
The design mirrors markitdown's: a small core, a converter **registry** that's open
|
|
78
|
+
for extension, and **optional dependency extras** so a minimal install still works.
|
|
79
|
+
|
|
80
|
+
## Install
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install docstudio # core: csv/tsv/json/html + md→html/latex/text
|
|
84
|
+
pip install "docstudio[office]" # docx, pptx, xlsx, epub
|
|
85
|
+
pip install "docstudio[pdf]" # PDF text extraction (pdfminer.six)
|
|
86
|
+
pip install "docstudio[ocr]" # scanned-PDF / image OCR (PyMuPDF, pytesseract)
|
|
87
|
+
pip install "docstudio[llm]" # AI cleanup + VLM (requests)
|
|
88
|
+
pip install "docstudio[markitdown]" # reuse Microsoft markitdown for the reverse path
|
|
89
|
+
pip install "docstudio[all]"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
For **Markdown → PDF/DOCX/EPUB** with the best fidelity, install
|
|
93
|
+
[`pandoc`](https://pandoc.org) plus a TeX engine (`xelatex`):
|
|
94
|
+
```bash
|
|
95
|
+
sudo apt install pandoc texlive-xetex texlive-latex-recommended fonts-noto-cjk
|
|
96
|
+
```
|
|
97
|
+
PDF also has two pure-Python backends as fallbacks: `weasyprint`
|
|
98
|
+
(`docstudio[pdf-weasy]`) and headless-Chrome via `playwright`
|
|
99
|
+
(`docstudio[pdf-chrome]`, full KaTeX math).
|
|
100
|
+
|
|
101
|
+
## Library
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from docstudio import DocumentStudio
|
|
105
|
+
ds = DocumentStudio() # use_markitdown=True by default
|
|
106
|
+
|
|
107
|
+
# anything → Markdown
|
|
108
|
+
md = ds.to_markdown("report.pdf")
|
|
109
|
+
md = ds.to_markdown("slides.pptx")
|
|
110
|
+
|
|
111
|
+
# Markdown → anything (non-md inputs are auto-converted first)
|
|
112
|
+
ds.convert("paper.md", to="pdf", out="paper.pdf")
|
|
113
|
+
ds.convert("paper.md", to="docx", out="paper.docx")
|
|
114
|
+
ds.convert("scan.pdf", to="docx", out="scan.docx") # PDF → md → docx
|
|
115
|
+
ds.convert("table.png", to="xlsx", out="table.xlsx") # image → md → xlsx
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### AI + Vision (VLM)
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from docstudio import DocumentStudio
|
|
122
|
+
from docstudio.llm import LLM
|
|
123
|
+
|
|
124
|
+
llm = LLM(base_url="https://api.openai.com", api_key="sk-...",
|
|
125
|
+
model="gpt-4o-mini", vlm_model="gpt-4o")
|
|
126
|
+
|
|
127
|
+
print(LLM.fetch_models("https://api.openai.com", "sk-...")) # pick from the list
|
|
128
|
+
|
|
129
|
+
ds = DocumentStudio(llm=llm)
|
|
130
|
+
md = ds.to_markdown("photographed_table.jpg") # recognised by the vision model
|
|
131
|
+
md = ds.to_markdown("scanned_book.pdf") # page-by-page VLM when no text layer
|
|
132
|
+
md = llm.cleanup_markdown(rough_text) # turn messy OCR into clean Markdown
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## AI assistant (operate on a document)
|
|
136
|
+
|
|
137
|
+
One-shot AI operations on Markdown/text — the *AI Assistant* from the web app.
|
|
138
|
+
Needs an `llm` (any OpenAI-compatible endpoint).
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from docstudio import DocumentStudio
|
|
142
|
+
from docstudio.llm import LLM
|
|
143
|
+
|
|
144
|
+
# any OpenAI-compatible endpoint — OpenAI, DeepSeek, vLLM, Ollama, a gateway…
|
|
145
|
+
# you choose base_url + model; nothing is hard-coded to a provider
|
|
146
|
+
ds = DocumentStudio(llm=LLM(base_url="https://api.openai.com",
|
|
147
|
+
api_key="sk-...", model="gpt-4o-mini"))
|
|
148
|
+
|
|
149
|
+
ds.assist(md, action="polish") # 润色
|
|
150
|
+
ds.assist(md, action="to_en") # 翻译成英文(to_zh 反之)
|
|
151
|
+
ds.assist(md, action="summary") # 摘要
|
|
152
|
+
ds.assist(md, action="outline") # 生成大纲
|
|
153
|
+
ds.assist(md, instruction="把所有表格改成要点列表") # 自由指令
|
|
154
|
+
|
|
155
|
+
DocumentStudio.assist_actions()
|
|
156
|
+
# polish, to_en, to_zh, summary, expand, condense, continue,
|
|
157
|
+
# grammar, formal, titles, outline, fix_latex
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Toolbox
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
ds.generate_toc(md) # insert a Markdown table of contents
|
|
164
|
+
ds.merge_pdfs(["a.pdf", "b.pdf"], "all.pdf") # concatenate PDFs (needs pypdf)
|
|
165
|
+
ds.extract_images("report.pdf", "./imgs") # pull embedded images out (PDF/DOCX/PPTX/EPUB)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Templates
|
|
169
|
+
|
|
170
|
+
Six ready-to-edit Markdown templates: `academic`, `techdoc`, `minutes`,
|
|
171
|
+
`readme`, `weekly`, `blog`.
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
ds.templates() # {slug: (title, description)}
|
|
175
|
+
body = ds.template("academic")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## CLI
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
docstudio report.pdf # → report.md (prints to stdout)
|
|
182
|
+
docstudio report.pdf -o out.md
|
|
183
|
+
cat report.pdf | docstudio # stdin → stdout
|
|
184
|
+
docstudio paper.md --to pdf -o paper.pdf # Markdown → anything
|
|
185
|
+
docstudio scan.pdf --to docx # PDF → md → docx
|
|
186
|
+
docstudio photo.jpg --vlm-model gpt-4o --base-url https://api.openai.com --api-key sk-...
|
|
187
|
+
docstudio --list-formats
|
|
188
|
+
|
|
189
|
+
docstudio paper.md --toc -o paper.md # insert a table of contents
|
|
190
|
+
docstudio notes.md --assist polish --base-url https://api.openai.com --model gpt-4o-mini --api-key sk-... -o clean.md
|
|
191
|
+
docstudio notes.md --instruction "翻译成英文" --base-url https://api.openai.com --model gpt-4o-mini --api-key sk-... -o en.md
|
|
192
|
+
docstudio --merge a.pdf b.pdf -o all.pdf # merge PDFs
|
|
193
|
+
docstudio report.pdf --extract-images ./imgs # pull out images
|
|
194
|
+
docstudio --template academic # print a template
|
|
195
|
+
docstudio --list-templates
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Extending
|
|
199
|
+
|
|
200
|
+
Register your own converter — exactly how the built-ins are defined:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
from docstudio.core import registry
|
|
204
|
+
|
|
205
|
+
@registry.ingester("rtf")
|
|
206
|
+
def rtf_to_md(source, ds=None, **opts):
|
|
207
|
+
...
|
|
208
|
+
return markdown_text
|
|
209
|
+
|
|
210
|
+
@registry.exporter("rst")
|
|
211
|
+
def md_to_rst(md, out=None, ds=None, **opts):
|
|
212
|
+
...
|
|
213
|
+
return out
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Relationship to markitdown
|
|
217
|
+
|
|
218
|
+
`markitdown` is excellent at `X → Markdown` for LLM pipelines. DocumentStudio
|
|
219
|
+
**reuses** it for that direction when present (`use_markitdown=True`), and adds the
|
|
220
|
+
missing half: turning Markdown back into polished, human-facing **PDF / Word /
|
|
221
|
+
LaTeX / EPUB / Excel**, plus a vision-model path for images and scanned PDFs.
|
|
222
|
+
|
|
223
|
+
MIT licensed.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# DocumentStudio (Python)
|
|
2
|
+
|
|
3
|
+
A **bidirectional, Markdown-centric** document converter — a Python library and
|
|
4
|
+
CLI in the spirit of Microsoft's [`markitdown`](https://github.com/microsoft/markitdown),
|
|
5
|
+
but going **both ways**:
|
|
6
|
+
|
|
7
|
+
| Direction | Formats | Notes |
|
|
8
|
+
|-----------|---------|-------|
|
|
9
|
+
| **Reverse** `X → Markdown` | PDF, Word, PPT, Excel, EPUB, HTML, CSV/TSV, JSON, ZIP, images | like markitdown; can *delegate to* markitdown when installed |
|
|
10
|
+
| **Forward** `Markdown → X` | HTML, **PDF**, **Word (.docx)**, **LaTeX**, EPUB, Excel (.xlsx), text | high-fidelity export — the part markitdown does **not** do |
|
|
11
|
+
| **AI / VLM** | image & scanned-PDF recognition, "smart cleanup" | any OpenAI-compatible endpoint; vision model optional |
|
|
12
|
+
| **AI assistant** | polish, translate, summarise, expand, continue, grammar, formalise, titles, outline, fix-LaTeX, free-form | one-shot ops on a document |
|
|
13
|
+
| **Toolbox** | table of contents, merge PDFs, extract images | headless, no browser |
|
|
14
|
+
| **Templates** | academic, techdoc, minutes, readme, weekly, blog | ready-to-edit Markdown |
|
|
15
|
+
|
|
16
|
+
The design mirrors markitdown's: a small core, a converter **registry** that's open
|
|
17
|
+
for extension, and **optional dependency extras** so a minimal install still works.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install docstudio # core: csv/tsv/json/html + md→html/latex/text
|
|
23
|
+
pip install "docstudio[office]" # docx, pptx, xlsx, epub
|
|
24
|
+
pip install "docstudio[pdf]" # PDF text extraction + merging (pdfminer.six, pypdf)
|
|
25
|
+
pip install "docstudio[ocr]" # scanned-PDF / image OCR (PyMuPDF, pytesseract)
|
|
26
|
+
pip install "docstudio[llm]" # AI cleanup + VLM (requests)
|
|
27
|
+
pip install "docstudio[markitdown]" # reuse Microsoft markitdown for the reverse path
|
|
28
|
+
pip install "docstudio[all]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
For **Markdown → PDF/DOCX/EPUB** with the best fidelity, install
|
|
32
|
+
[`pandoc`](https://pandoc.org) plus a TeX engine (`xelatex`):
|
|
33
|
+
```bash
|
|
34
|
+
sudo apt install pandoc texlive-xetex texlive-latex-recommended fonts-noto-cjk
|
|
35
|
+
```
|
|
36
|
+
PDF also has two pip-installable backends as fallbacks: `weasyprint`
|
|
37
|
+
(`docstudio[pdf-weasy]`) and headless-Chrome via `playwright`
|
|
38
|
+
(`docstudio[pdf-chrome]`, full KaTeX math).
|
|
39
|
+
|
|
40
|
+
## Library
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from docstudio import DocumentStudio
|
|
44
|
+
ds = DocumentStudio() # use_markitdown=True by default
|
|
45
|
+
|
|
46
|
+
# anything → Markdown
|
|
47
|
+
md = ds.to_markdown("report.pdf")
|
|
48
|
+
md = ds.to_markdown("slides.pptx")
|
|
49
|
+
|
|
50
|
+
# Markdown → anything (non-md inputs are auto-converted first)
|
|
51
|
+
ds.convert("paper.md", to="pdf", out="paper.pdf")
|
|
52
|
+
ds.convert("paper.md", to="docx", out="paper.docx")
|
|
53
|
+
ds.convert("scan.pdf", to="docx", out="scan.docx") # PDF → md → docx
|
|
54
|
+
ds.convert("table.png", to="xlsx", out="table.xlsx") # image → md → xlsx
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### AI + Vision (VLM)
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from docstudio import DocumentStudio
|
|
61
|
+
from docstudio.llm import LLM
|
|
62
|
+
|
|
63
|
+
llm = LLM(base_url="https://api.openai.com", api_key="sk-...",
|
|
64
|
+
model="gpt-4o-mini", vlm_model="gpt-4o")
|
|
65
|
+
|
|
66
|
+
print(LLM.fetch_models("https://api.openai.com", "sk-...")) # pick from the list
|
|
67
|
+
|
|
68
|
+
ds = DocumentStudio(llm=llm)
|
|
69
|
+
md = ds.to_markdown("photographed_table.jpg") # recognised by the vision model
|
|
70
|
+
md = ds.to_markdown("scanned_book.pdf") # page-by-page VLM when no text layer
|
|
71
|
+
md = llm.cleanup_markdown(rough_text) # turn messy OCR into clean Markdown
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## AI assistant (operate on a document)
|
|
75
|
+
|
|
76
|
+
One-shot AI operations on Markdown/text — the *AI Assistant* from the web app.
|
|
77
|
+
Needs an `llm` (any OpenAI-compatible endpoint).
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from docstudio import DocumentStudio
|
|
81
|
+
from docstudio.llm import LLM
|
|
82
|
+
|
|
83
|
+
# any OpenAI-compatible endpoint — OpenAI, DeepSeek, vLLM, Ollama, a gateway…
|
|
84
|
+
# you choose base_url + model; nothing is hard-coded to a provider
|
|
85
|
+
ds = DocumentStudio(llm=LLM(base_url="https://api.openai.com",
|
|
86
|
+
api_key="sk-...", model="gpt-4o-mini"))
|
|
87
|
+
|
|
88
|
+
ds.assist(md, action="polish") # polish the prose
|
|
89
|
+
ds.assist(md, action="to_en") # translate into English (use to_zh for the reverse)
|
|
90
|
+
ds.assist(md, action="summary") # summarise
|
|
91
|
+
ds.assist(md, action="outline") # generate an outline
|
|
92
|
+
ds.assist(md, instruction="把所有表格改成要点列表") # free-form instruction ("turn every table into a bullet list")
|
|
93
|
+
|
|
94
|
+
DocumentStudio.assist_actions()
|
|
95
|
+
# polish, to_en, to_zh, summary, expand, condense, continue,
|
|
96
|
+
# grammar, formal, titles, outline, fix_latex
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Toolbox
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
ds.generate_toc(md) # insert a Markdown table of contents
|
|
103
|
+
ds.merge_pdfs(["a.pdf", "b.pdf"], "all.pdf") # concatenate PDFs (needs pypdf)
|
|
104
|
+
ds.extract_images("report.pdf", "./imgs") # pull embedded images out (PDF/DOCX/PPTX/EPUB)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Templates
|
|
108
|
+
|
|
109
|
+
Six ready-to-edit Markdown templates: `academic`, `techdoc`, `minutes`,
|
|
110
|
+
`readme`, `weekly`, `blog`.
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
ds.templates() # {slug: (title, description)}
|
|
114
|
+
body = ds.template("academic")
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## CLI
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
docstudio report.pdf # → Markdown, printed to stdout (use -o to write a file)
|
|
121
|
+
docstudio report.pdf -o out.md
|
|
122
|
+
cat report.pdf | docstudio # stdin → stdout
|
|
123
|
+
docstudio paper.md --to pdf -o paper.pdf # Markdown → anything
|
|
124
|
+
docstudio scan.pdf --to docx # PDF → md → docx
|
|
125
|
+
docstudio photo.jpg --vlm-model gpt-4o --base-url https://api.openai.com --api-key sk-...
|
|
126
|
+
docstudio --list-formats
|
|
127
|
+
|
|
128
|
+
docstudio paper.md --toc -o paper.md # insert a table of contents
|
|
129
|
+
docstudio notes.md --assist polish --base-url https://api.openai.com --model gpt-4o-mini --api-key sk-... -o clean.md
|
|
130
|
+
docstudio notes.md --instruction "翻译成英文" --base-url https://api.openai.com --model gpt-4o-mini --api-key sk-... -o en.md
|
|
131
|
+
docstudio --merge a.pdf b.pdf -o all.pdf # merge PDFs
|
|
132
|
+
docstudio report.pdf --extract-images ./imgs # pull out images
|
|
133
|
+
docstudio --template academic # print a template
|
|
134
|
+
docstudio --list-templates
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Extending
|
|
138
|
+
|
|
139
|
+
Register your own converter — exactly how the built-ins are defined:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from docstudio.core import registry
|
|
143
|
+
|
|
144
|
+
@registry.ingester("rtf")
|
|
145
|
+
def rtf_to_md(source, ds=None, **opts):
|
|
146
|
+
...
|
|
147
|
+
return markdown_text
|
|
148
|
+
|
|
149
|
+
@registry.exporter("rst")
|
|
150
|
+
def md_to_rst(md, out=None, ds=None, **opts):
|
|
151
|
+
...
|
|
152
|
+
return out
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Relationship to markitdown
|
|
156
|
+
|
|
157
|
+
`markitdown` is excellent at `X → Markdown` for LLM pipelines. DocumentStudio
|
|
158
|
+
**reuses** it for that direction when present (`use_markitdown=True`), and adds the
|
|
159
|
+
missing half: turning Markdown back into polished, human-facing **PDF / Word /
|
|
160
|
+
LaTeX / EPUB / Excel**, plus a vision-model path for images and scanned PDFs.
|
|
161
|
+
|
|
162
|
+
MIT licensed.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""DocumentStudio — bidirectional document conversion library.
|
|
2
|
+
|
|
3
|
+
Reverse (X -> Markdown): PDF, Word, PPT, Excel, EPUB, HTML, CSV/TSV, JSON,
|
|
4
|
+
ZIP, images (OCR / VLM). Optionally backed by
|
|
5
|
+
Microsoft `markitdown` when installed.
|
|
6
|
+
Forward (Markdown -> X): HTML, PDF, Word(.docx), LaTeX, EPUB, Excel(.xlsx),
|
|
7
|
+
plain text. High-fidelity export is the part
|
|
8
|
+
markitdown does NOT do.
|
|
9
|
+
AI assistant : polish, translate, summarise, expand, continue,
|
|
10
|
+
grammar, formalise, titles, outline, fix-LaTeX,
|
|
11
|
+
free-form instructions.
|
|
12
|
+
Toolbox : table of contents, merge PDFs, extract images.
|
|
13
|
+
Templates : academic, techdoc, minutes, readme, weekly, blog.
|
|
14
|
+
|
|
15
|
+
Quick start
|
|
16
|
+
-----------
|
|
17
|
+
from docstudio import DocumentStudio
|
|
18
|
+
ds = DocumentStudio()
|
|
19
|
+
|
|
20
|
+
md = ds.to_markdown("report.pdf") # anything -> Markdown
|
|
21
|
+
ds.convert("paper.md", to="pdf", out="paper.pdf") # Markdown -> anything
|
|
22
|
+
|
|
23
|
+
ds.generate_toc(md) # toolbox
|
|
24
|
+
ds.merge_pdfs(["a.pdf", "b.pdf"], "all.pdf")
|
|
25
|
+
body = ds.template("academic") # template library
|
|
26
|
+
|
|
27
|
+
from docstudio.llm import LLM # AI assistant
|
|
28
|
+
ds = DocumentStudio(llm=LLM(base_url="...", api_key="...", model="..."))
|
|
29
|
+
ds.assist(md, action="polish")
|
|
30
|
+
ds.assist(md, instruction="把所有表格改成要点列表")
|
|
31
|
+
"""
|
|
32
|
+
from .core import DocumentStudio, ConversionError, registry
|
|
33
|
+
|
|
34
|
+
__all__ = ["DocumentStudio", "ConversionError", "registry"]
|
|
35
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""AI document assistant — LLM-powered operations on Markdown / plain text.
|
|
2
|
+
|
|
3
|
+
Mirrors the *AI Assistant* of the Document Studio web app: a fixed set of
|
|
4
|
+
one-shot actions (polish, translate, summarise, expand, ...) plus a free-form
|
|
5
|
+
``instruction``. Every action is a single chat completion against whatever
|
|
6
|
+
OpenAI-compatible endpoint the :class:`~docstudio.llm.LLM` is configured for.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
# Maps each assistant action key to the system prompt sent for it
# (kept in sync with the web app's AI_ACTS).
ACTIONS = {
    "polish": (
        "你是中文技术写作助手。"
        "润色下面的 Markdown:提升流畅度、准确性与可读性,规范标点与格式,"
        "严格保留原意与全部信息,不新增不删减。"
        "只输出 Markdown,不要解释。"
    ),
    "to_en": (
        "Translate the following Markdown into fluent, natural English. "
        "Preserve all Markdown structure "
        "(headings, lists, tables, math $...$, code). "
        "Output only the translated Markdown."
    ),
    "to_zh": (
        "把下面的 Markdown 翻译成自然流畅的简体中文,"
        "保留所有 Markdown 结构(标题、列表、表格、公式 $...$、代码)。"
        "只输出翻译后的 Markdown。"
    ),
    "summary": (
        "为下面的内容写一段简洁、准确的摘要(约150-250字)。"
        "只输出摘要的 Markdown 段落,不要解释。"
    ),
    "expand": (
        "扩展并丰富下面的内容:补充细节、例子与解释,"
        "保持原结构与风格,不偏离主题、不编造事实。"
        "只输出 Markdown。"
    ),
    "condense": (
        "精炼下面的内容:去除冗余,保留要点与关键信息,保持 Markdown 结构。"
        "只输出 Markdown。"
    ),
    "continue": (
        "延续下面文档的主题、风格与结构,自然地继续往下写 1-3 段。"
        "只输出新增的 Markdown 内容,不要重复已有内容。"
    ),
    "grammar": (
        "修正下面文档中的语法、拼写与标点错误,不改变原意与写作风格。"
        "只输出修正后的 Markdown。"
    ),
    "formal": (
        "把下面的内容改写为更正式、专业的书面语,保留全部信息与 Markdown 结构。"
        "只输出 Markdown。"
    ),
    "titles": (
        "为下面的文档给出 5 个高质量的标题建议,用 Markdown 无序列表呈现。"
        "只输出列表。"
    ),
    "outline": (
        "根据下面的内容或主题,生成一个结构化的 Markdown 多级标题大纲。"
        "只输出大纲。"
    ),
    "fix_latex": (
        "修正下面 LaTeX / Markdown 数学公式中的语法错误"
        "(缺失的 $、未配对的花括号、拼错的命令等),"
        "不改变实际内容与结构。"
        "只输出修正后的内容。"
    ),
}
|
|
35
|
+
|
|
36
|
+
# System prompt used when the caller gives only a free-form instruction
# and no predefined action.
_CUSTOM_SYS = (
    "你是专业文档编辑助手。"
    "严格按用户指令处理下面的 Markdown 文档/文本。"
    "只输出处理后的 Markdown 结果,"
    "不要解释、不要整篇代码围栏。"
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def list_actions():
    """List the keys of the predefined assistant actions, in table order."""
    return [key for key in ACTIONS]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _strip_fence(s: str) -> str:
    """Unwrap ``s`` when it is enclosed in one outer Markdown code fence.

    An opening fence counts as a wrapper only when it is the first non-blank
    content and is either bare or tagged ``markdown`` / ``md``
    (case-insensitive) with nothing else on its line.  The trailing fence is
    removed only after such an opener was found, so a reply that merely ends
    with a code block, or that starts with a block in another language
    (for example ``python``), keeps its fences intact.

    Returns the text with surrounding whitespace stripped.
    """
    # The tag must be followed by end-of-line; otherwise "```python" would
    # lose its backticks and leave the word "python" behind in the text.
    opener = re.match(r"\s*```(?:markdown|md)?[ \t]*(?:\r?\n|$)", s, flags=re.I)
    if opener is None:
        return s.strip()
    body = s[opener.end():]
    # Only now is a trailing fence known to belong to the wrapper.
    body = re.sub(r"\n?```\s*$", "", body)
    return body.strip()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def assist(llm, text: str, action: Optional[str] = None,
           instruction: Optional[str] = None) -> str:
    """Apply one assistant operation to ``text`` and return the model's reply.

    Either ``action`` (a key of :data:`ACTIONS`), a free-form
    ``instruction``, or both must be given.  When both are present the
    action's system prompt is used and the instruction is prepended to the
    user message.  The reply is passed through ``_strip_fence`` before it is
    returned.

    Raises:
        RuntimeError: if ``llm`` is ``None``.
        ValueError: if ``action`` is not a known key, or if neither
            ``action`` nor ``instruction`` is provided.
    """
    if llm is None:
        raise RuntimeError("assist() needs an LLM — create DocumentStudio(llm=LLM(...)).")

    document = text or ""

    if not action:
        # Free-form mode: the instruction is mandatory.
        if not instruction:
            raise ValueError("provide either action= or instruction=")
        system_prompt = _CUSTOM_SYS
        user_prompt = "指令:%s\n\n文档:\n%s" % (instruction, document)
    else:
        if action not in ACTIONS:
            known = ", ".join(ACTIONS)
            raise ValueError("unknown action %r; choose from: %s" % (action, known))
        system_prompt = ACTIONS[action]
        # An optional instruction goes ahead of the document.
        if instruction:
            preamble = "指令:%s\n\n" % instruction
        else:
            preamble = ""
        # A blank document is replaced by an explicit placeholder.
        if document.strip():
            payload = "文档:\n" + document
        else:
            payload = "(文档为空)"
        user_prompt = preamble + payload

    reply = llm.chat(system_prompt, user_prompt)
    return _strip_fence(reply)
|