semos-agentura-document 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semos_agentura_document-0.5.0/.env.example +47 -0
- semos_agentura_document-0.5.0/.gitignore +6 -0
- semos_agentura_document-0.5.0/.python-version +1 -0
- semos_agentura_document-0.5.0/CONTRIBUTING.md +19 -0
- semos_agentura_document-0.5.0/Dockerfile +33 -0
- semos_agentura_document-0.5.0/PKG-INFO +295 -0
- semos_agentura_document-0.5.0/README.md +269 -0
- semos_agentura_document-0.5.0/docker/Dockerfile.libreoffice +10 -0
- semos_agentura_document-0.5.0/pyproject.toml +68 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/__init__.py +54 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/_constants.py +15 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/_image_client.py +231 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/_llm_client.py +348 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/_render.py +174 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/_utils.py +93 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/cli.py +456 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/__init__.py +27 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_backends.py +81 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_data/pandoc_reference.pptx +0 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_diagram_optimize.py +475 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_diagram_source.py +474 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_documents.py +101 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_drawio.py +342 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_editable_slides.py +821 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_generate_diagram.py +169 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_generate_image.py +222 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_image_segment.py +58 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_markdown_prep.py +57 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_mermaid.py +114 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_reference_doc.py +441 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_slide_merge.py +462 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_slides.py +156 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/composition/compose.py +227 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/config.py +69 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/__init__.py +3 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_docx_digest.py +259 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_images.py +241 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_ocr_models.py +90 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_office.py +69 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_pdf.py +86 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_providers.py +202 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_schema.py +28 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_styles.py +184 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_visual_digest.py +90 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/digest.py +241 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/exceptions.py +41 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/__init__.py +15 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_docx_forms.py +780 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_pdf_forms.py +289 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_schema.py +158 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_visual_inspect.py +928 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/forms/fill.py +148 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/models.py +88 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/service.py +349 -0
- semos_agentura_document-0.5.0/src/semos/agentura/document/tools.py +774 -0
- semos_agentura_document-0.5.0/tests/conftest.py +478 -0
- semos_agentura_document-0.5.0/tests/test_compose_refdoc.py +140 -0
- semos_agentura_document-0.5.0/tests/test_config.py +59 -0
- semos_agentura_document-0.5.0/tests/test_constants.py +36 -0
- semos_agentura_document-0.5.0/tests/test_diagram_images.py +529 -0
- semos_agentura_document-0.5.0/tests/test_diagram_optimize.py +94 -0
- semos_agentura_document-0.5.0/tests/test_diagram_source.py +61 -0
- semos_agentura_document-0.5.0/tests/test_docx_digest.py +294 -0
- semos_agentura_document-0.5.0/tests/test_docx_forms_internal.py +71 -0
- semos_agentura_document-0.5.0/tests/test_drawio.py +91 -0
- semos_agentura_document-0.5.0/tests/test_editable_slides.py +461 -0
- semos_agentura_document-0.5.0/tests/test_form_schema.py +127 -0
- semos_agentura_document-0.5.0/tests/test_forms.py +165 -0
- semos_agentura_document-0.5.0/tests/test_generate_image.py +261 -0
- semos_agentura_document-0.5.0/tests/test_image_client.py +191 -0
- semos_agentura_document-0.5.0/tests/test_image_segment.py +86 -0
- semos_agentura_document-0.5.0/tests/test_images.py +143 -0
- semos_agentura_document-0.5.0/tests/test_integration.py +293 -0
- semos_agentura_document-0.5.0/tests/test_markdown_prep.py +69 -0
- semos_agentura_document-0.5.0/tests/test_mcp.py +393 -0
- semos_agentura_document-0.5.0/tests/test_mermaid.py +37 -0
- semos_agentura_document-0.5.0/tests/test_models.py +86 -0
- semos_agentura_document-0.5.0/tests/test_ocr_models.py +169 -0
- semos_agentura_document-0.5.0/tests/test_pdf.py +97 -0
- semos_agentura_document-0.5.0/tests/test_pdf_forms_internal.py +50 -0
- semos_agentura_document-0.5.0/tests/test_render.py +57 -0
- semos_agentura_document-0.5.0/tests/test_slides.py +25 -0
- semos_agentura_document-0.5.0/tests/test_styles.py +306 -0
- semos_agentura_document-0.5.0/tests/test_utils.py +94 -0
- semos_agentura_document-0.5.0/tools/package-lock.json +5670 -0
- semos_agentura_document-0.5.0/tools/package.json +15 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# --- OCR Provider: Mistral API (default) ---
|
|
2
|
+
MISTRAL_API_KEY=
|
|
3
|
+
|
|
4
|
+
# --- Document AI (Azure AI Foundry, auto-detected if both set) ---
|
|
5
|
+
DOCUMENT_AI_ENDPOINT=
|
|
6
|
+
DOCUMENT_AI_API_KEY=
|
|
7
|
+
DOCUMENT_AI_MODEL=mistral-document-ai-2512
|
|
8
|
+
|
|
9
|
+
# --- Diagram code generation LLM ---
|
|
10
|
+
# DIAGRAM_CODEGEN_ENDPOINT=
|
|
11
|
+
# DIAGRAM_CODEGEN_API_KEY=
|
|
12
|
+
# DIAGRAM_CODEGEN_MODEL=
|
|
13
|
+
|
|
14
|
+
# --- Diagram review LLM (falls back to document_ai if unset) ---
|
|
15
|
+
# DIAGRAM_REVIEW_ENDPOINT=
|
|
16
|
+
# DIAGRAM_REVIEW_API_KEY=
|
|
17
|
+
# DIAGRAM_REVIEW_MODEL=
|
|
18
|
+
|
|
19
|
+
# --- OCR settings ---
|
|
20
|
+
# TABLE_FORMAT=markdown # markdown or html
|
|
21
|
+
# MAX_PDF_PAGES=10
|
|
22
|
+
|
|
23
|
+
# --- External tool paths (optional, auto-detected on PATH) ---
|
|
24
|
+
# LIBRE_OFFICE_PATH=
|
|
25
|
+
# MARP_PATH=
|
|
26
|
+
# PANDOC_PATH=
|
|
27
|
+
# MMDC_PATH=
|
|
28
|
+
# DRAWIO_PATH=
|
|
29
|
+
|
|
30
|
+
# --- draw.io desktop app (for rendering diagrams with embedded images) ---
|
|
31
|
+
# The npm drawio CLI cannot render inline base64 images. Set this to the
|
|
32
|
+
# draw.io desktop executable for correct image rendering.
|
|
33
|
+
# Windows installed: C:\Program Files\draw.io\draw.io.exe
|
|
34
|
+
# Windows portable: C:\PortableApps\DrawioPortable\App\Drawio\draw.io.exe
|
|
35
|
+
# Linux: /usr/bin/drawio
|
|
36
|
+
# macOS: /Applications/draw.io.app/Contents/MacOS/draw.io
|
|
37
|
+
# DRAWIO_DESKTOP_PATH=
|
|
38
|
+
|
|
39
|
+
# --- Slide template application ---
|
|
40
|
+
# TEMPLATE_BACKEND=auto # auto, com, uno, docker
|
|
41
|
+
# COM (PowerPoint) is the only backend with full corporate branding.
|
|
42
|
+
# UNO/Docker transfer placeholder structure only (no logos/bars).
|
|
43
|
+
|
|
44
|
+
# --- A2A LLM router (optional, enables NL delegation) ---
|
|
45
|
+
# ROUTER_LLM_MODEL=
|
|
46
|
+
# ROUTER_LLM_API_KEY=
|
|
47
|
+
# ROUTER_LLM_API_BASE=
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Contributing to semos-agentura-document
|
|
2
|
+
|
|
3
|
+
Part of the [Semos Agentura](../../README.md) monorepo. Read the
|
|
4
|
+
[root CONTRIBUTING guide](../../CONTRIBUTING.md) for setup, the dev loop, and conventions.
|
|
5
|
+
|
|
6
|
+
- **Import:** `from semos.agentura.document import ...`
|
|
7
|
+
- **What it is:** the document agent: OCR digest, compose to PDF/PPTX/DOCX/HTML, diagram
|
|
8
|
+
generation (Mermaid, draw.io), and PDF/DOCX form inspection and filling. Depends on
|
|
9
|
+
`semos-agentura-core`.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
make install # from the repo root, once
|
|
13
|
+
cd packages/semos-agentura-document
|
|
14
|
+
uv run pytest tests/ # this package's tests
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Tests needing external renderers (pandoc, mermaid-cli, marp, draw.io) skip when the tool is
|
|
18
|
+
absent, so they pass without a full toolchain. Cross-package end-to-end tests live in the
|
|
19
|
+
repo-root `tests/integration/`.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Document Agent - OCR, compose, diagrams, forms
|
|
2
|
+
FROM semos-agentura-base:latest
|
|
3
|
+
|
|
4
|
+
# System tools for document processing
|
|
5
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
6
|
+
pandoc \
|
|
7
|
+
texlive-xetex \
|
|
8
|
+
texlive-fonts-recommended \
|
|
9
|
+
libreoffice-calc \
|
|
10
|
+
libreoffice-writer \
|
|
11
|
+
libreoffice-impress \
|
|
12
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
13
|
+
|
|
14
|
+
# Node.js tools (marp, mermaid-cli, draw.io-export)
|
|
15
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
16
|
+
nodejs \
|
|
17
|
+
npm \
|
|
18
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
19
|
+
|
|
20
|
+
COPY packages/semos-agentura-document/tools/package.json /app/packages/semos-agentura-document/tools/
|
|
21
|
+
RUN cd /app/packages/semos-agentura-document/tools && npm install --omit=dev \
|
|
22
|
+
&& npx playwright install --with-deps chromium
|
|
23
|
+
|
|
24
|
+
WORKDIR /app/packages/semos-agentura-document
|
|
25
|
+
|
|
26
|
+
# Chromium needs --no-sandbox when running as root in Docker
|
|
27
|
+
RUN echo '{"args":["--no-sandbox","--disable-setuid-sandbox"]}' > /app/puppeteer.json
|
|
28
|
+
ENV PUPPETEER_CONFIG=/app/puppeteer.json
|
|
29
|
+
|
|
30
|
+
EXPOSE 8002
|
|
31
|
+
|
|
32
|
+
CMD ["uv", "run", "uvicorn", "semos.agentura.document.service:app", \
|
|
33
|
+
"--host", "0.0.0.0", "--port", "8002"]
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semos-agentura-document
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Document digestion (OCR to Markdown) and composition (Markdown to documents) agent
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: httpx>=0.28
|
|
7
|
+
Requires-Dist: lxml>=5.0
|
|
8
|
+
Requires-Dist: mistralai>=1.12
|
|
9
|
+
Requires-Dist: pillow>=10.0
|
|
10
|
+
Requires-Dist: pydantic-settings>=2.13
|
|
11
|
+
Requires-Dist: pydantic>=2.0
|
|
12
|
+
Requires-Dist: pypdf[crypto]>=5.0
|
|
13
|
+
Requires-Dist: pypdfium2>=4.0
|
|
14
|
+
Requires-Dist: python-dotenv>=1.2
|
|
15
|
+
Requires-Dist: python-pptx>=1.0
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Dist: semos-agentura-core
|
|
18
|
+
Provides-Extra: com
|
|
19
|
+
Requires-Dist: pywin32>=311; extra == 'com'
|
|
20
|
+
Provides-Extra: image
|
|
21
|
+
Requires-Dist: rembg>=2.0; extra == 'image'
|
|
22
|
+
Provides-Extra: slides
|
|
23
|
+
Requires-Dist: python-pptx>=1.0; extra == 'slides'
|
|
24
|
+
Requires-Dist: pyyaml>=6.0; extra == 'slides'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# document-agent
|
|
28
|
+
|
|
29
|
+
Document digestion (OCR to Markdown), composition (Markdown to documents), diagram generation, raster image generation, and form filling. Full round-trip support for footnotes, tracked changes, comments, and styles.
|
|
30
|
+
|
|
31
|
+
## Digestion
|
|
32
|
+
|
|
33
|
+
Convert PDF, images, and Office documents to Markdown.
|
|
34
|
+
|
|
35
|
+
- **PDF/images**: OCR via Mistral Document AI (direct or Azure AI Foundry)
|
|
36
|
+
- **DOCX/ODT**: pandoc-based extraction preserving footnotes, tracked changes, comments, and document styles
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Basic digest (auto-selects pandoc for DOCX, OCR for PDF/images)
|
|
40
|
+
document-agent digest document.docx
|
|
41
|
+
|
|
42
|
+
# Show all tracked changes and comments
|
|
43
|
+
document-agent digest document.docx --track-changes all
|
|
44
|
+
|
|
45
|
+
# Force OCR pipeline for a scanned DOCX
|
|
46
|
+
document-agent digest scanned.docx --mode ocr
|
|
47
|
+
|
|
48
|
+
# Inline mode (base64-embedded images, prints to stdout)
|
|
49
|
+
document-agent digest document.pdf --inline
|
|
50
|
+
|
|
51
|
+
# With structured annotation extraction
|
|
52
|
+
document-agent digest document.pdf --schema schema.py --prompt "Extract all line items"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Round-trip metadata
|
|
56
|
+
|
|
57
|
+
DOCX digestion preserves four types of metadata that round-trip through compose:
|
|
58
|
+
|
|
59
|
+
- **Footnotes**: `[^1]` with `[^1]: definition text` at end
|
|
60
|
+
- **Tracked changes**: `[text]{.insertion author="Name" date="2026-01-01"}` / `{.deletion ...}`
|
|
61
|
+
- **Comments**: `[comment text]{.comment-start id="1" author="Name"}...{.comment-end id="1"}`
|
|
62
|
+
- **Document styles**: YAML front matter block (fonts, sizes, colors, spacing, margins)
|
|
63
|
+
|
|
64
|
+
Supported input formats: PDF, PNG, JPG, JPEG, WEBP, TIFF, BMP, DOCX, PPTX, XLSX, ODT.
|
|
65
|
+
|
|
66
|
+
## Composition
|
|
67
|
+
|
|
68
|
+
Convert Markdown to various output formats with style control.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Basic compose
|
|
72
|
+
document-agent compose input.md output.docx --format docx
|
|
73
|
+
|
|
74
|
+
# With a reference document for style inheritance
|
|
75
|
+
document-agent compose input.md output.docx --format docx \
|
|
76
|
+
--reference-doc template.docx
|
|
77
|
+
|
|
78
|
+
# With headers/footers from a template (combined with YAML styles)
|
|
79
|
+
document-agent compose input.md output.docx --format docx \
|
|
80
|
+
--header-footer-doc template.docx
|
|
81
|
+
|
|
82
|
+
# Slides - polished (Marp, limited editability)
|
|
83
|
+
document-agent compose slides.md out.pptx --format pptx --slides
|
|
84
|
+
|
|
85
|
+
# Slides - draft editable (pandoc, fully editable, rough layout)
|
|
86
|
+
document-agent compose slides.md out.pptx --format pptx --slides --draft
|
|
87
|
+
|
|
88
|
+
# Slides - draft + corporate template (requires PowerPoint)
|
|
89
|
+
document-agent compose slides.md out.pptx --format pptx --slides --draft \
|
|
90
|
+
--template corporate.pptx
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### YAML front matter styles
|
|
94
|
+
|
|
95
|
+
Define document formatting directly in the Markdown source. Styles are auto-extracted during digest and auto-applied during compose:
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
---
|
|
99
|
+
title: "Document Title"
|
|
100
|
+
subtitle: "Optional *formatted* subtitle"
|
|
101
|
+
styles:
|
|
102
|
+
page:
|
|
103
|
+
size: "A4" # or "Letter"
|
|
104
|
+
margin-top: "1.5cm"
|
|
105
|
+
margin-bottom: "1.5cm"
|
|
106
|
+
margin-left: "1.5cm"
|
|
107
|
+
margin-right: "1.5cm"
|
|
108
|
+
body:
|
|
109
|
+
font: "Calibri"
|
|
110
|
+
size: 11
|
|
111
|
+
spacing-before: "0.0cm"
|
|
112
|
+
spacing-after: "0.1cm"
|
|
113
|
+
line-spacing: 1.1
|
|
114
|
+
heading1:
|
|
115
|
+
font: "Calibri"
|
|
116
|
+
size: 13
|
|
117
|
+
bold: true
|
|
118
|
+
color: "000080" # navy
|
|
119
|
+
spacing-before: "0.3cm"
|
|
120
|
+
spacing-after: "0.1cm"
|
|
121
|
+
heading2:
|
|
122
|
+
font: "Calibri"
|
|
123
|
+
size: 11
|
|
124
|
+
bold: true
|
|
125
|
+
color: "000080"
|
|
126
|
+
heading3:
|
|
127
|
+
font: "Calibri"
|
|
128
|
+
size: 11
|
|
129
|
+
bold: true
|
|
130
|
+
color: "333333"
|
|
131
|
+
table:
|
|
132
|
+
size: 9 # also used for footnotes and captions
|
|
133
|
+
border-color: "999999"
|
|
134
|
+
border-size: 4 # eighths of a point
|
|
135
|
+
fixed: false # true to keep equal-width columns
|
|
136
|
+
---
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Style priority: YAML front matter > `--reference-doc` > pandoc defaults.
|
|
140
|
+
|
|
141
|
+
Mermaid and draw.io diagrams in fenced code blocks are automatically rendered as images.
|
|
142
|
+
|
|
143
|
+
## Diagrams
|
|
144
|
+
|
|
145
|
+
Generate and modify diagrams (Mermaid or draw.io) using LLM-powered optimization.
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Generate from description
|
|
149
|
+
document-agent diagram "flowchart of CI/CD pipeline" -o diagram.png
|
|
150
|
+
|
|
151
|
+
# Modify existing draw.io diagram
|
|
152
|
+
document-agent diagram "Change WP3 label to Digital Infrastructure" \
|
|
153
|
+
--source diagram.drawio.png -o updated.drawio.png \
|
|
154
|
+
--code-output updated.drawio
|
|
155
|
+
|
|
156
|
+
# Mermaid diagram
|
|
157
|
+
document-agent diagram "sequence diagram for auth flow" --type mermaid -o auth.png
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
draw.io diagrams with embedded raster images are handled automatically:
|
|
161
|
+
images are stripped for LLM editing (reducing context from ~400 KB to ~8 KB)
|
|
162
|
+
and restored in the output. For correct PNG rendering of embedded images,
|
|
163
|
+
set `DRAWIO_DESKTOP_PATH` in `.env` (the npm CLI can't render inline images).
|
|
164
|
+
|
|
165
|
+
## Image Generation
|
|
166
|
+
|
|
167
|
+
Generate, edit, and extract elements from raster images using text-to-image models.
|
|
168
|
+
|
|
169
|
+
### Modes
|
|
170
|
+
|
|
171
|
+
- **generate**: Text-to-image from a prompt, with optional style prefix
|
|
172
|
+
- **edit**: Modify an existing image with a text prompt (inpainting with optional mask for OpenAI, img2img for Flux)
|
|
173
|
+
- **cut**: Extract a specific element from an image (VLM-guided bounding box + background removal, falls back to image model isolation)
|
|
174
|
+
|
|
175
|
+
### Supported providers
|
|
176
|
+
|
|
177
|
+
| Provider | Endpoint format | Generate | Edit | Models |
|
|
178
|
+
|----------|----------------|----------|------|--------|
|
|
179
|
+
| Azure OpenAI | `https://{resource}.cognitiveservices.azure.com/openai/deployments/{model}` | Yes | Yes (mask) | gpt-image-2 |
|
|
180
|
+
| OpenAI | `https://api.openai.com` | Yes | Yes (mask) | gpt-image-2, dall-e-3 |
|
|
181
|
+
| Azure AI Foundry | `https://{resource}.services.ai.azure.com/providers/{provider}` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
|
|
182
|
+
| Direct provider | `https://api.bfl.ai` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
|
|
183
|
+
|
|
184
|
+
Configure via environment variables:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
IMAGE_GEN_ENDPOINT=https://example.cognitiveservices.azure.com/openai/deployments/gpt-image-2
|
|
188
|
+
IMAGE_GEN_API_KEY=your-key
|
|
189
|
+
IMAGE_GEN_MODEL=gpt-image-2
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Crossover: embedding images in diagrams
|
|
193
|
+
|
|
194
|
+
Generated images (icons, symbols) can be embedded into draw.io diagrams via the `embeds` parameter on `generate_diagram`. Refer to each embed by its filename in the diagram description so the LLM knows where to place them.
|
|
195
|
+
|
|
196
|
+
The embed mechanism reuses the existing draw.io image strip/restore pipeline: images are represented as `__IMG_N__` placeholders during LLM codegen and replaced with base64 data URIs before rendering.
|
|
197
|
+
|
|
198
|
+
## Forms
|
|
199
|
+
|
|
200
|
+
Inspect and fill form fields in PDF and DOCX files.
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
# Inspect form fields
|
|
204
|
+
document-agent inspect form.pdf
|
|
205
|
+
document-agent inspect form.docx --json
|
|
206
|
+
|
|
207
|
+
# Fill form fields
|
|
208
|
+
document-agent fill form.pdf filled.pdf --data '{"name": "John", "date": "2026-01-01"}'
|
|
209
|
+
document-agent fill form.docx filled.docx --data fields.json
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Slide Merge
|
|
213
|
+
|
|
214
|
+
Cherry-pick and merge slides from multiple PPTX sources.
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
# Merge specific slides by index
|
|
218
|
+
document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7,12 -o merged.pptx
|
|
219
|
+
|
|
220
|
+
# Use a different base template for theme/master
|
|
221
|
+
document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7 \
|
|
222
|
+
--base template.pptx -o merged.pptx
|
|
223
|
+
|
|
224
|
+
# Force python-pptx backend (portable, no PowerPoint needed)
|
|
225
|
+
document-agent merge-slides deck.pptx:0-10 -o subset.pptx --backend pptx
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Backends: COM (PowerPoint, preserves animations/transitions/media) is preferred.
|
|
229
|
+
Falls back to python-pptx (portable, pure Python) when PowerPoint is unavailable.
|
|
230
|
+
Install optional dependencies: `pip install "semos-agentura-document[slides]"`
|
|
231
|
+
|
|
232
|
+
## Setup
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
uv sync
|
|
236
|
+
cp .env.example .env
|
|
237
|
+
# Edit .env with your API keys and tool paths
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Node.js tools (Marp, Mermaid, draw.io CLI)
|
|
241
|
+
|
|
242
|
+
Marp CLI, Mermaid CLI, and draw.io export are installed locally via npm:
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
cd tools && npm install
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
The agent automatically discovers them in `tools/node_modules/.bin/`.
|
|
249
|
+
|
|
250
|
+
### Other external tools
|
|
251
|
+
|
|
252
|
+
- [Pandoc](https://pandoc.org/) - DOCX digest and document composition - install system-wide
|
|
253
|
+
- [LibreOffice](https://www.libreoffice.org/) - Office format OCR fallback - install system-wide
|
|
254
|
+
- [draw.io desktop](https://github.com/jgraph/drawio-desktop) - needed for rendering diagrams with embedded images. Set `DRAWIO_DESKTOP_PATH` in `.env`.
|
|
255
|
+
|
|
256
|
+
## Python API
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from semos.agentura.document import (
|
|
260
|
+
digest, compose, compose_editable_slides,
|
|
261
|
+
merge_slides, parse_source_args,
|
|
262
|
+
OutputFormat, OutputMode,
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# Digest a DOCX with tracked changes
|
|
266
|
+
result = digest("document.docx", track_changes="all")
|
|
267
|
+
print(result.markdown)
|
|
268
|
+
|
|
269
|
+
# Compose with YAML styles
|
|
270
|
+
result = compose("styled.md", "output.docx", OutputFormat.DOCX)
|
|
271
|
+
|
|
272
|
+
# Compose with reference doc
|
|
273
|
+
result = compose("input.md", "output.docx", OutputFormat.DOCX,
|
|
274
|
+
reference_doc="template.docx")
|
|
275
|
+
|
|
276
|
+
# Slides - polished (Marp)
|
|
277
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
278
|
+
is_slides=True)
|
|
279
|
+
|
|
280
|
+
# Slides - draft editable (pandoc)
|
|
281
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
282
|
+
is_slides=True, draft=True)
|
|
283
|
+
|
|
284
|
+
# Slides - draft + corporate template
|
|
285
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
286
|
+
is_slides=True, draft=True,
|
|
287
|
+
template="corporate.pptx")
|
|
288
|
+
|
|
289
|
+
# Merge slides from CLI-style args
|
|
290
|
+
config = parse_source_args(
|
|
291
|
+
["deck1.pptx:0-5", "deck2.pptx:3,7"],
|
|
292
|
+
output="merged.pptx",
|
|
293
|
+
)
|
|
294
|
+
result = merge_slides(config, "merged.pptx")
|
|
295
|
+
```
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# document-agent
|
|
2
|
+
|
|
3
|
+
Document digestion (OCR to Markdown), composition (Markdown to documents), diagram generation, raster image generation, and form filling. Full round-trip support for footnotes, tracked changes, comments, and styles.
|
|
4
|
+
|
|
5
|
+
## Digestion
|
|
6
|
+
|
|
7
|
+
Convert PDF, images, and Office documents to Markdown.
|
|
8
|
+
|
|
9
|
+
- **PDF/images**: OCR via Mistral Document AI (direct or Azure AI Foundry)
|
|
10
|
+
- **DOCX/ODT**: pandoc-based extraction preserving footnotes, tracked changes, comments, and document styles
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# Basic digest (auto-selects pandoc for DOCX, OCR for PDF/images)
|
|
14
|
+
document-agent digest document.docx
|
|
15
|
+
|
|
16
|
+
# Show all tracked changes and comments
|
|
17
|
+
document-agent digest document.docx --track-changes all
|
|
18
|
+
|
|
19
|
+
# Force OCR pipeline for a scanned DOCX
|
|
20
|
+
document-agent digest scanned.docx --mode ocr
|
|
21
|
+
|
|
22
|
+
# Inline mode (base64-embedded images, prints to stdout)
|
|
23
|
+
document-agent digest document.pdf --inline
|
|
24
|
+
|
|
25
|
+
# With structured annotation extraction
|
|
26
|
+
document-agent digest document.pdf --schema schema.py --prompt "Extract all line items"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Round-trip metadata
|
|
30
|
+
|
|
31
|
+
DOCX digestion preserves four types of metadata that round-trip through compose:
|
|
32
|
+
|
|
33
|
+
- **Footnotes**: `[^1]` with `[^1]: definition text` at end
|
|
34
|
+
- **Tracked changes**: `[text]{.insertion author="Name" date="2026-01-01"}` / `{.deletion ...}`
|
|
35
|
+
- **Comments**: `[comment text]{.comment-start id="1" author="Name"}...{.comment-end id="1"}`
|
|
36
|
+
- **Document styles**: YAML front matter block (fonts, sizes, colors, spacing, margins)
|
|
37
|
+
|
|
38
|
+
Supported input formats: PDF, PNG, JPG, JPEG, WEBP, TIFF, BMP, DOCX, PPTX, XLSX, ODT.
|
|
39
|
+
|
|
40
|
+
## Composition
|
|
41
|
+
|
|
42
|
+
Convert Markdown to various output formats with style control.
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Basic compose
|
|
46
|
+
document-agent compose input.md output.docx --format docx
|
|
47
|
+
|
|
48
|
+
# With a reference document for style inheritance
|
|
49
|
+
document-agent compose input.md output.docx --format docx \
|
|
50
|
+
--reference-doc template.docx
|
|
51
|
+
|
|
52
|
+
# With headers/footers from a template (combined with YAML styles)
|
|
53
|
+
document-agent compose input.md output.docx --format docx \
|
|
54
|
+
--header-footer-doc template.docx
|
|
55
|
+
|
|
56
|
+
# Slides - polished (Marp, limited editability)
|
|
57
|
+
document-agent compose slides.md out.pptx --format pptx --slides
|
|
58
|
+
|
|
59
|
+
# Slides - draft editable (pandoc, fully editable, rough layout)
|
|
60
|
+
document-agent compose slides.md out.pptx --format pptx --slides --draft
|
|
61
|
+
|
|
62
|
+
# Slides - draft + corporate template (requires PowerPoint)
|
|
63
|
+
document-agent compose slides.md out.pptx --format pptx --slides --draft \
|
|
64
|
+
--template corporate.pptx
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### YAML front matter styles
|
|
68
|
+
|
|
69
|
+
Define document formatting directly in the Markdown source. Styles are auto-extracted during digest and auto-applied during compose:
|
|
70
|
+
|
|
71
|
+
```yaml
|
|
72
|
+
---
|
|
73
|
+
title: "Document Title"
|
|
74
|
+
subtitle: "Optional *formatted* subtitle"
|
|
75
|
+
styles:
|
|
76
|
+
page:
|
|
77
|
+
size: "A4" # or "Letter"
|
|
78
|
+
margin-top: "1.5cm"
|
|
79
|
+
margin-bottom: "1.5cm"
|
|
80
|
+
margin-left: "1.5cm"
|
|
81
|
+
margin-right: "1.5cm"
|
|
82
|
+
body:
|
|
83
|
+
font: "Calibri"
|
|
84
|
+
size: 11
|
|
85
|
+
spacing-before: "0.0cm"
|
|
86
|
+
spacing-after: "0.1cm"
|
|
87
|
+
line-spacing: 1.1
|
|
88
|
+
heading1:
|
|
89
|
+
font: "Calibri"
|
|
90
|
+
size: 13
|
|
91
|
+
bold: true
|
|
92
|
+
color: "000080" # navy
|
|
93
|
+
spacing-before: "0.3cm"
|
|
94
|
+
spacing-after: "0.1cm"
|
|
95
|
+
heading2:
|
|
96
|
+
font: "Calibri"
|
|
97
|
+
size: 11
|
|
98
|
+
bold: true
|
|
99
|
+
color: "000080"
|
|
100
|
+
heading3:
|
|
101
|
+
font: "Calibri"
|
|
102
|
+
size: 11
|
|
103
|
+
bold: true
|
|
104
|
+
color: "333333"
|
|
105
|
+
table:
|
|
106
|
+
size: 9 # also used for footnotes and captions
|
|
107
|
+
border-color: "999999"
|
|
108
|
+
border-size: 4 # eighths of a point
|
|
109
|
+
fixed: false # true to keep equal-width columns
|
|
110
|
+
---
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Style priority: YAML front matter > `--reference-doc` > pandoc defaults.
|
|
114
|
+
|
|
115
|
+
Mermaid and draw.io diagrams in fenced code blocks are automatically rendered as images.
|
|
116
|
+
|
|
117
|
+
## Diagrams
|
|
118
|
+
|
|
119
|
+
Generate and modify diagrams (Mermaid or draw.io) using LLM-powered optimization.
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Generate from description
|
|
123
|
+
document-agent diagram "flowchart of CI/CD pipeline" -o diagram.png
|
|
124
|
+
|
|
125
|
+
# Modify existing draw.io diagram
|
|
126
|
+
document-agent diagram "Change WP3 label to Digital Infrastructure" \
|
|
127
|
+
--source diagram.drawio.png -o updated.drawio.png \
|
|
128
|
+
--code-output updated.drawio
|
|
129
|
+
|
|
130
|
+
# Mermaid diagram
|
|
131
|
+
document-agent diagram "sequence diagram for auth flow" --type mermaid -o auth.png
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
draw.io diagrams with embedded raster images are handled automatically:
|
|
135
|
+
images are stripped for LLM editing (reducing context from ~400 KB to ~8 KB)
|
|
136
|
+
and restored in the output. For correct PNG rendering of embedded images,
|
|
137
|
+
set `DRAWIO_DESKTOP_PATH` in `.env` (the npm CLI can't render inline images).
|
|
138
|
+
|
|
139
|
+
## Image Generation
|
|
140
|
+
|
|
141
|
+
Generate, edit, and extract elements from raster images using text-to-image models.
|
|
142
|
+
|
|
143
|
+
### Modes
|
|
144
|
+
|
|
145
|
+
- **generate**: Text-to-image from a prompt, with optional style prefix
|
|
146
|
+
- **edit**: Modify an existing image with a text prompt (inpainting with optional mask for OpenAI, img2img for Flux)
|
|
147
|
+
- **cut**: Extract a specific element from an image (VLM-guided bounding box + background removal, falls back to image model isolation)
|
|
148
|
+
|
|
149
|
+
### Supported providers
|
|
150
|
+
|
|
151
|
+
| Provider | Endpoint format | Generate | Edit | Models |
|
|
152
|
+
|----------|----------------|----------|------|--------|
|
|
153
|
+
| Azure OpenAI | `https://{resource}.cognitiveservices.azure.com/openai/deployments/{model}` | Yes | Yes (mask) | gpt-image-2 |
|
|
154
|
+
| OpenAI | `https://api.openai.com` | Yes | Yes (mask) | gpt-image-2, dall-e-3 |
|
|
155
|
+
| Azure AI Foundry | `https://{resource}.services.ai.azure.com/providers/{provider}` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
|
|
156
|
+
| Direct provider | `https://api.bfl.ai` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
|
|
157
|
+
|
|
158
|
+
Configure via environment variables:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
IMAGE_GEN_ENDPOINT=https://example.cognitiveservices.azure.com/openai/deployments/gpt-image-2
|
|
162
|
+
IMAGE_GEN_API_KEY=your-key
|
|
163
|
+
IMAGE_GEN_MODEL=gpt-image-2
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Crossover: embedding images in diagrams
|
|
167
|
+
|
|
168
|
+
Generated images (icons, symbols) can be embedded into draw.io diagrams via the `embeds` parameter on `generate_diagram`. Refer to each embed by its filename in the diagram description so the LLM knows where to place them.
|
|
169
|
+
|
|
170
|
+
The embed mechanism reuses the existing draw.io image strip/restore pipeline: images are represented as `__IMG_N__` placeholders during LLM codegen and replaced with base64 data URIs before rendering.
|
|
171
|
+
|
|
172
|
+
## Forms
|
|
173
|
+
|
|
174
|
+
Inspect and fill form fields in PDF and DOCX files.
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Inspect form fields
|
|
178
|
+
document-agent inspect form.pdf
|
|
179
|
+
document-agent inspect form.docx --json
|
|
180
|
+
|
|
181
|
+
# Fill form fields
|
|
182
|
+
document-agent fill form.pdf filled.pdf --data '{"name": "John", "date": "2026-01-01"}'
|
|
183
|
+
document-agent fill form.docx filled.docx --data fields.json
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Slide Merge
|
|
187
|
+
|
|
188
|
+
Cherry-pick and merge slides from multiple PPTX sources.
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
# Merge specific slides by index
|
|
192
|
+
document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7,12 -o merged.pptx
|
|
193
|
+
|
|
194
|
+
# Use a different base template for theme/master
|
|
195
|
+
document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7 \
|
|
196
|
+
--base template.pptx -o merged.pptx
|
|
197
|
+
|
|
198
|
+
# Force python-pptx backend (portable, no PowerPoint needed)
|
|
199
|
+
document-agent merge-slides deck.pptx:0-10 -o subset.pptx --backend pptx
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Backends: COM (PowerPoint, preserves animations/transitions/media) is preferred.
|
|
203
|
+
Falls back to python-pptx (portable, pure Python) when PowerPoint is unavailable.
|
|
204
|
+
Install optional dependencies: `pip install "semos-agentura-document[slides]"`
|
|
205
|
+
|
|
206
|
+
## Setup
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
uv sync
|
|
210
|
+
cp .env.example .env
|
|
211
|
+
# Edit .env with your API keys and tool paths
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Node.js tools (Marp, Mermaid, draw.io CLI)
|
|
215
|
+
|
|
216
|
+
Marp CLI, Mermaid CLI, and draw.io export are installed locally via npm:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
cd tools && npm install
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
The agent automatically discovers them in `tools/node_modules/.bin/`.
|
|
223
|
+
|
|
224
|
+
### Other external tools
|
|
225
|
+
|
|
226
|
+
- [Pandoc](https://pandoc.org/) - DOCX digest and document composition - install system-wide
|
|
227
|
+
- [LibreOffice](https://www.libreoffice.org/) - Office format OCR fallback - install system-wide
|
|
228
|
+
- [draw.io desktop](https://github.com/jgraph/drawio-desktop) - needed for rendering diagrams with embedded images. Set `DRAWIO_DESKTOP_PATH` in `.env`.
|
|
229
|
+
|
|
230
|
+
## Python API
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from semos.agentura.document import (
|
|
234
|
+
digest, compose, compose_editable_slides,
|
|
235
|
+
merge_slides, parse_source_args,
|
|
236
|
+
OutputFormat, OutputMode,
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# Digest a DOCX with tracked changes
|
|
240
|
+
result = digest("document.docx", track_changes="all")
|
|
241
|
+
print(result.markdown)
|
|
242
|
+
|
|
243
|
+
# Compose with YAML styles
|
|
244
|
+
result = compose("styled.md", "output.docx", OutputFormat.DOCX)
|
|
245
|
+
|
|
246
|
+
# Compose with reference doc
|
|
247
|
+
result = compose("input.md", "output.docx", OutputFormat.DOCX,
|
|
248
|
+
reference_doc="template.docx")
|
|
249
|
+
|
|
250
|
+
# Slides - polished (Marp)
|
|
251
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
252
|
+
is_slides=True)
|
|
253
|
+
|
|
254
|
+
# Slides - draft editable (pandoc)
|
|
255
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
256
|
+
is_slides=True, draft=True)
|
|
257
|
+
|
|
258
|
+
# Slides - draft + corporate template
|
|
259
|
+
result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
|
|
260
|
+
is_slides=True, draft=True,
|
|
261
|
+
template="corporate.pptx")
|
|
262
|
+
|
|
263
|
+
# Merge slides from CLI-style args
|
|
264
|
+
config = parse_source_args(
|
|
265
|
+
["deck1.pptx:0-5", "deck2.pptx:3,7"],
|
|
266
|
+
output="merged.pptx",
|
|
267
|
+
)
|
|
268
|
+
result = merge_slides(config, "merged.pptx")
|
|
269
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
FROM ubuntu:24.04
|
|
2
|
+
ENV DEBIAN_FRONTEND=noninteractive
|
|
3
|
+
RUN apt-get update && \
|
|
4
|
+
apt-get install -y \
|
|
5
|
+
libreoffice-impress libreoffice-script-provider-python \
|
|
6
|
+
python3-uno python3 && \
|
|
7
|
+
apt-get clean && rm -rf /var/lib/apt/lists/*
|
|
8
|
+
ENV SAL_USE_VCLPLUGIN=svp
|
|
9
|
+
ENV HOME=/tmp
|
|
10
|
+
WORKDIR /work
|