semos-agentura-document 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. semos_agentura_document-0.5.0/.env.example +47 -0
  2. semos_agentura_document-0.5.0/.gitignore +6 -0
  3. semos_agentura_document-0.5.0/.python-version +1 -0
  4. semos_agentura_document-0.5.0/CONTRIBUTING.md +19 -0
  5. semos_agentura_document-0.5.0/Dockerfile +33 -0
  6. semos_agentura_document-0.5.0/PKG-INFO +295 -0
  7. semos_agentura_document-0.5.0/README.md +269 -0
  8. semos_agentura_document-0.5.0/docker/Dockerfile.libreoffice +10 -0
  9. semos_agentura_document-0.5.0/pyproject.toml +68 -0
  10. semos_agentura_document-0.5.0/src/semos/agentura/document/__init__.py +54 -0
  11. semos_agentura_document-0.5.0/src/semos/agentura/document/_constants.py +15 -0
  12. semos_agentura_document-0.5.0/src/semos/agentura/document/_image_client.py +231 -0
  13. semos_agentura_document-0.5.0/src/semos/agentura/document/_llm_client.py +348 -0
  14. semos_agentura_document-0.5.0/src/semos/agentura/document/_render.py +174 -0
  15. semos_agentura_document-0.5.0/src/semos/agentura/document/_utils.py +93 -0
  16. semos_agentura_document-0.5.0/src/semos/agentura/document/cli.py +456 -0
  17. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/__init__.py +27 -0
  18. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_backends.py +81 -0
  19. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_data/pandoc_reference.pptx +0 -0
  20. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_diagram_optimize.py +475 -0
  21. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_diagram_source.py +474 -0
  22. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_documents.py +101 -0
  23. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_drawio.py +342 -0
  24. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_editable_slides.py +821 -0
  25. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_generate_diagram.py +169 -0
  26. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_generate_image.py +222 -0
  27. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_image_segment.py +58 -0
  28. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_markdown_prep.py +57 -0
  29. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_mermaid.py +114 -0
  30. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_reference_doc.py +441 -0
  31. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_slide_merge.py +462 -0
  32. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/_slides.py +156 -0
  33. semos_agentura_document-0.5.0/src/semos/agentura/document/composition/compose.py +227 -0
  34. semos_agentura_document-0.5.0/src/semos/agentura/document/config.py +69 -0
  35. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/__init__.py +3 -0
  36. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_docx_digest.py +259 -0
  37. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_images.py +241 -0
  38. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_ocr_models.py +90 -0
  39. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_office.py +69 -0
  40. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_pdf.py +86 -0
  41. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_providers.py +202 -0
  42. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_schema.py +28 -0
  43. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_styles.py +184 -0
  44. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/_visual_digest.py +90 -0
  45. semos_agentura_document-0.5.0/src/semos/agentura/document/digestion/digest.py +241 -0
  46. semos_agentura_document-0.5.0/src/semos/agentura/document/exceptions.py +41 -0
  47. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/__init__.py +15 -0
  48. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_docx_forms.py +780 -0
  49. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_pdf_forms.py +289 -0
  50. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_schema.py +158 -0
  51. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/_visual_inspect.py +928 -0
  52. semos_agentura_document-0.5.0/src/semos/agentura/document/forms/fill.py +148 -0
  53. semos_agentura_document-0.5.0/src/semos/agentura/document/models.py +88 -0
  54. semos_agentura_document-0.5.0/src/semos/agentura/document/service.py +349 -0
  55. semos_agentura_document-0.5.0/src/semos/agentura/document/tools.py +774 -0
  56. semos_agentura_document-0.5.0/tests/conftest.py +478 -0
  57. semos_agentura_document-0.5.0/tests/test_compose_refdoc.py +140 -0
  58. semos_agentura_document-0.5.0/tests/test_config.py +59 -0
  59. semos_agentura_document-0.5.0/tests/test_constants.py +36 -0
  60. semos_agentura_document-0.5.0/tests/test_diagram_images.py +529 -0
  61. semos_agentura_document-0.5.0/tests/test_diagram_optimize.py +94 -0
  62. semos_agentura_document-0.5.0/tests/test_diagram_source.py +61 -0
  63. semos_agentura_document-0.5.0/tests/test_docx_digest.py +294 -0
  64. semos_agentura_document-0.5.0/tests/test_docx_forms_internal.py +71 -0
  65. semos_agentura_document-0.5.0/tests/test_drawio.py +91 -0
  66. semos_agentura_document-0.5.0/tests/test_editable_slides.py +461 -0
  67. semos_agentura_document-0.5.0/tests/test_form_schema.py +127 -0
  68. semos_agentura_document-0.5.0/tests/test_forms.py +165 -0
  69. semos_agentura_document-0.5.0/tests/test_generate_image.py +261 -0
  70. semos_agentura_document-0.5.0/tests/test_image_client.py +191 -0
  71. semos_agentura_document-0.5.0/tests/test_image_segment.py +86 -0
  72. semos_agentura_document-0.5.0/tests/test_images.py +143 -0
  73. semos_agentura_document-0.5.0/tests/test_integration.py +293 -0
  74. semos_agentura_document-0.5.0/tests/test_markdown_prep.py +69 -0
  75. semos_agentura_document-0.5.0/tests/test_mcp.py +393 -0
  76. semos_agentura_document-0.5.0/tests/test_mermaid.py +37 -0
  77. semos_agentura_document-0.5.0/tests/test_models.py +86 -0
  78. semos_agentura_document-0.5.0/tests/test_ocr_models.py +169 -0
  79. semos_agentura_document-0.5.0/tests/test_pdf.py +97 -0
  80. semos_agentura_document-0.5.0/tests/test_pdf_forms_internal.py +50 -0
  81. semos_agentura_document-0.5.0/tests/test_render.py +57 -0
  82. semos_agentura_document-0.5.0/tests/test_slides.py +25 -0
  83. semos_agentura_document-0.5.0/tests/test_styles.py +306 -0
  84. semos_agentura_document-0.5.0/tests/test_utils.py +94 -0
  85. semos_agentura_document-0.5.0/tools/package-lock.json +5670 -0
  86. semos_agentura_document-0.5.0/tools/package.json +15 -0
@@ -0,0 +1,47 @@
1
+ # --- OCR Provider: Mistral API (default) ---
2
+ MISTRAL_API_KEY=
3
+
4
+ # --- Document AI (Azure AI Foundry, auto-detected if both set) ---
5
+ DOCUMENT_AI_ENDPOINT=
6
+ DOCUMENT_AI_API_KEY=
7
+ DOCUMENT_AI_MODEL=mistral-document-ai-2512
8
+
9
+ # --- Diagram code generation LLM ---
10
+ # DIAGRAM_CODEGEN_ENDPOINT=
11
+ # DIAGRAM_CODEGEN_API_KEY=
12
+ # DIAGRAM_CODEGEN_MODEL=
13
+
14
+ # --- Diagram review LLM (falls back to document_ai if unset) ---
15
+ # DIAGRAM_REVIEW_ENDPOINT=
16
+ # DIAGRAM_REVIEW_API_KEY=
17
+ # DIAGRAM_REVIEW_MODEL=
18
+
19
+ # --- OCR settings ---
20
+ # TABLE_FORMAT=markdown # markdown or html
21
+ # MAX_PDF_PAGES=10
22
+
23
+ # --- External tool paths (optional, auto-detected on PATH) ---
24
+ # LIBRE_OFFICE_PATH=
25
+ # MARP_PATH=
26
+ # PANDOC_PATH=
27
+ # MMDC_PATH=
28
+ # DRAWIO_PATH=
29
+
30
+ # --- draw.io desktop app (for rendering diagrams with embedded images) ---
31
+ # The npm drawio CLI cannot render inline base64 images. Set this to the
32
+ # draw.io desktop executable for correct image rendering.
33
+ # Windows installed: C:\Program Files\draw.io\draw.io.exe
34
+ # Windows portable: C:\PortableApps\DrawioPortable\App\Drawio\draw.io.exe
35
+ # Linux: /usr/bin/drawio
36
+ # macOS: /Applications/draw.io.app/Contents/MacOS/draw.io
37
+ # DRAWIO_DESKTOP_PATH=
38
+
39
+ # --- Slide template application ---
40
+ # TEMPLATE_BACKEND=auto # auto, com, uno, docker
41
+ # COM (PowerPoint) is the only backend with full corporate branding.
42
+ # UNO/Docker transfer placeholder structure only (no logos/bars).
43
+
44
+ # --- A2A LLM router (optional, enables NL delegation) ---
45
+ # ROUTER_LLM_MODEL=
46
+ # ROUTER_LLM_API_KEY=
47
+ # ROUTER_LLM_API_BASE=
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.py[oc]
3
+ .venv/
4
+ .env
5
+ tools/node_modules/
6
+ test_forms/
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,19 @@
1
+ # Contributing to semos-agentura-document
2
+
3
+ Part of the [Semos Agentura](../../README.md) monorepo. Read the
4
+ [root CONTRIBUTING guide](../../CONTRIBUTING.md) for setup, the dev loop, and conventions.
5
+
6
+ - **Import:** `from semos.agentura.document import ...`
7
+ - **What it is:** the document agent: OCR digest, compose to PDF/PPTX/DOCX/HTML, diagram
8
+ generation (Mermaid, draw.io), and PDF/DOCX form inspection and filling. Depends on
9
+ `semos-agentura-core`.
10
+
11
+ ```bash
12
+ make install # from the repo root, once
13
+ cd packages/semos-agentura-document
14
+ uv run pytest tests/ # this package's tests
15
+ ```
16
+
17
+ Tests needing external renderers (pandoc, mermaid-cli, marp, draw.io) skip when the tool is
18
+ absent, so they pass without a full toolchain. Cross-package end-to-end tests live in the
19
+ repo-root `tests/integration/`.
@@ -0,0 +1,33 @@
1
+ # Document Agent - OCR, compose, diagrams, forms
2
+ FROM semos-agentura-base:latest
3
+
4
+ # System tools for document processing
5
+ RUN apt-get update && apt-get install -y --no-install-recommends \
6
+ pandoc \
7
+ texlive-xetex \
8
+ texlive-fonts-recommended \
9
+ libreoffice-calc \
10
+ libreoffice-writer \
11
+ libreoffice-impress \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Node.js tools (marp, mermaid-cli, draw.io-export)
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ nodejs \
17
+ npm \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ COPY packages/semos-agentura-document/tools/package.json /app/packages/semos-agentura-document/tools/
21
+ RUN cd /app/packages/semos-agentura-document/tools && npm install --omit=dev \
22
+ && npx playwright install --with-deps chromium
23
+
24
+ WORKDIR /app/packages/semos-agentura-document
25
+
26
+ # Chromium needs --no-sandbox when running as root in Docker
27
+ RUN echo '{"args":["--no-sandbox","--disable-setuid-sandbox"]}' > /app/puppeteer.json
28
+ ENV PUPPETEER_CONFIG=/app/puppeteer.json
29
+
30
+ EXPOSE 8002
31
+
32
+ CMD ["uv", "run", "uvicorn", "semos.agentura.document.service:app", \
33
+ "--host", "0.0.0.0", "--port", "8002"]
@@ -0,0 +1,295 @@
1
+ Metadata-Version: 2.4
2
+ Name: semos-agentura-document
3
+ Version: 0.5.0
4
+ Summary: Document digestion (OCR to Markdown) and composition (Markdown to documents) agent
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: httpx>=0.28
7
+ Requires-Dist: lxml>=5.0
8
+ Requires-Dist: mistralai>=1.12
9
+ Requires-Dist: pillow>=10.0
10
+ Requires-Dist: pydantic-settings>=2.13
11
+ Requires-Dist: pydantic>=2.0
12
+ Requires-Dist: pypdf[crypto]>=5.0
13
+ Requires-Dist: pypdfium2>=4.0
14
+ Requires-Dist: python-dotenv>=1.2
15
+ Requires-Dist: python-pptx>=1.0
16
+ Requires-Dist: pyyaml>=6.0
17
+ Requires-Dist: semos-agentura-core
18
+ Provides-Extra: com
19
+ Requires-Dist: pywin32>=311; extra == 'com'
20
+ Provides-Extra: image
21
+ Requires-Dist: rembg>=2.0; extra == 'image'
22
+ Provides-Extra: slides
23
+ Requires-Dist: python-pptx>=1.0; extra == 'slides'
24
+ Requires-Dist: pyyaml>=6.0; extra == 'slides'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # document-agent
28
+
29
+ Document digestion (OCR to Markdown), composition (Markdown to documents), diagram generation, raster image generation, and form filling. Full round-trip support for footnotes, tracked changes, comments, and styles.
30
+
31
+ ## Digestion
32
+
33
+ Convert PDF, images, and Office documents to Markdown.
34
+
35
+ - **PDF/images**: OCR via Mistral Document AI (direct or Azure AI Foundry)
36
+ - **DOCX/ODT**: pandoc-based extraction preserving footnotes, tracked changes, comments, and document styles
37
+
38
+ ```bash
39
+ # Basic digest (auto-selects pandoc for DOCX, OCR for PDF/images)
40
+ document-agent digest document.docx
41
+
42
+ # Show all tracked changes and comments
43
+ document-agent digest document.docx --track-changes all
44
+
45
+ # Force OCR pipeline for a scanned DOCX
46
+ document-agent digest scanned.docx --mode ocr
47
+
48
+ # Inline mode (base64-embedded images, prints to stdout)
49
+ document-agent digest document.pdf --inline
50
+
51
+ # With structured annotation extraction
52
+ document-agent digest document.pdf --schema schema.py --prompt "Extract all line items"
53
+ ```
54
+
55
+ ### Round-trip metadata
56
+
57
+ DOCX digestion preserves four types of metadata that round-trip through compose:
58
+
59
+ - **Footnotes**: `[^1]` with `[^1]: definition text` at end
60
+ - **Tracked changes**: `[text]{.insertion author="Name" date="2026-01-01"}` / `{.deletion ...}`
61
+ - **Comments**: `[comment text]{.comment-start id="1" author="Name"}...{.comment-end id="1"}`
62
+ - **Document styles**: YAML front matter block (fonts, sizes, colors, spacing, margins)
63
+
64
+ Supported input formats: PDF, PNG, JPG, JPEG, WEBP, TIFF, BMP, DOCX, PPTX, XLSX, ODT.
65
+
66
+ ## Composition
67
+
68
+ Convert Markdown to various output formats with style control.
69
+
70
+ ```bash
71
+ # Basic compose
72
+ document-agent compose input.md output.docx --format docx
73
+
74
+ # With a reference document for style inheritance
75
+ document-agent compose input.md output.docx --format docx \
76
+ --reference-doc template.docx
77
+
78
+ # With headers/footers from a template (combined with YAML styles)
79
+ document-agent compose input.md output.docx --format docx \
80
+ --header-footer-doc template.docx
81
+
82
+ # Slides - polished (Marp, limited editability)
83
+ document-agent compose slides.md out.pptx --format pptx --slides
84
+
85
+ # Slides - draft editable (pandoc, fully editable, rough layout)
86
+ document-agent compose slides.md out.pptx --format pptx --slides --draft
87
+
88
+ # Slides - draft + corporate template (requires PowerPoint)
89
+ document-agent compose slides.md out.pptx --format pptx --slides --draft \
90
+ --template corporate.pptx
91
+ ```
92
+
93
+ ### YAML front matter styles
94
+
95
+ Define document formatting directly in the Markdown source. Styles are auto-extracted during digest and auto-applied during compose:
96
+
97
+ ```yaml
98
+ ---
99
+ title: "Document Title"
100
+ subtitle: "Optional *formatted* subtitle"
101
+ styles:
102
+ page:
103
+ size: "A4" # or "Letter"
104
+ margin-top: "1.5cm"
105
+ margin-bottom: "1.5cm"
106
+ margin-left: "1.5cm"
107
+ margin-right: "1.5cm"
108
+ body:
109
+ font: "Calibri"
110
+ size: 11
111
+ spacing-before: "0.0cm"
112
+ spacing-after: "0.1cm"
113
+ line-spacing: 1.1
114
+ heading1:
115
+ font: "Calibri"
116
+ size: 13
117
+ bold: true
118
+ color: "000080" # navy
119
+ spacing-before: "0.3cm"
120
+ spacing-after: "0.1cm"
121
+ heading2:
122
+ font: "Calibri"
123
+ size: 11
124
+ bold: true
125
+ color: "000080"
126
+ heading3:
127
+ font: "Calibri"
128
+ size: 11
129
+ bold: true
130
+ color: "333333"
131
+ table:
132
+ size: 9 # also used for footnotes and captions
133
+ border-color: "999999"
134
+ border-size: 4 # eighths of a point
135
+ fixed: false # true to keep equal-width columns
136
+ ---
137
+ ```
138
+
139
+ Style priority: YAML front matter > `--reference-doc` > pandoc defaults.
140
+
141
+ Mermaid and draw.io diagrams in fenced code blocks are automatically rendered as images.
142
+
143
+ ## Diagrams
144
+
145
+ Generate and modify diagrams (Mermaid or draw.io) using LLM-powered optimization.
146
+
147
+ ```bash
148
+ # Generate from description
149
+ document-agent diagram "flowchart of CI/CD pipeline" -o diagram.png
150
+
151
+ # Modify existing draw.io diagram
152
+ document-agent diagram "Change WP3 label to Digital Infrastructure" \
153
+ --source diagram.drawio.png -o updated.drawio.png \
154
+ --code-output updated.drawio
155
+
156
+ # Mermaid diagram
157
+ document-agent diagram "sequence diagram for auth flow" --type mermaid -o auth.png
158
+ ```
159
+
160
+ draw.io diagrams with embedded raster images are handled automatically:
161
+ images are stripped for LLM editing (reducing context from ~400 KB to ~8 KB)
162
+ and restored in the output. For correct PNG rendering of embedded images,
163
+ set `DRAWIO_DESKTOP_PATH` in `.env` (the npm CLI can't render inline images).
164
+
165
+ ## Image Generation
166
+
167
+ Generate, edit, and extract elements from raster images using text-to-image models.
168
+
169
+ ### Modes
170
+
171
+ - **generate**: Text-to-image from a prompt, with optional style prefix
172
+ - **edit**: Modify an existing image with a text prompt (inpainting with optional mask for OpenAI, img2img for Flux)
173
+ - **cut**: Extract a specific element from an image (VLM-guided bounding box + background removal, falls back to image model isolation)
174
+
175
+ ### Supported providers
176
+
177
+ | Provider | Endpoint format | Generate | Edit | Models |
178
+ |----------|----------------|----------|------|--------|
179
+ | Azure OpenAI | `https://{resource}.cognitiveservices.azure.com/openai/deployments/{model}` | Yes | Yes (mask) | gpt-image-2 |
180
+ | OpenAI | `https://api.openai.com` | Yes | Yes (mask) | gpt-image-2, dall-e-3 |
181
+ | Azure AI Foundry | `https://{resource}.services.ai.azure.com/providers/{provider}` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
182
+ | Direct provider | `https://api.bfl.ai` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
183
+
184
+ Configure via environment variables:
185
+
186
+ ```bash
187
+ IMAGE_GEN_ENDPOINT=https://example.cognitiveservices.azure.com/openai/deployments/gpt-image-2
188
+ IMAGE_GEN_API_KEY=your-key
189
+ IMAGE_GEN_MODEL=gpt-image-2
190
+ ```
191
+
192
+ ### Crossover: embedding images in diagrams
193
+
194
+ Generated images (icons, symbols) can be embedded into draw.io diagrams via the `embeds` parameter on `generate_diagram`. Refer to each embed by its filename in the diagram description so the LLM knows where to place them.
195
+
196
+ The embed mechanism reuses the existing draw.io image strip/restore pipeline: images are represented as `__IMG_N__` placeholders during LLM codegen and replaced with base64 data URIs before rendering.
197
+
198
+ ## Forms
199
+
200
+ Inspect and fill form fields in PDF and DOCX files.
201
+
202
+ ```bash
203
+ # Inspect form fields
204
+ document-agent inspect form.pdf
205
+ document-agent inspect form.docx --json
206
+
207
+ # Fill form fields
208
+ document-agent fill form.pdf filled.pdf --data '{"name": "John", "date": "2026-01-01"}'
209
+ document-agent fill form.docx filled.docx --data fields.json
210
+ ```
211
+
212
+ ## Slide Merge
213
+
214
+ Cherry-pick and merge slides from multiple PPTX sources.
215
+
216
+ ```bash
217
+ # Merge specific slides by index
218
+ document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7,12 -o merged.pptx
219
+
220
+ # Use a different base template for theme/master
221
+ document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7 \
222
+ --base template.pptx -o merged.pptx
223
+
224
+ # Force python-pptx backend (portable, no PowerPoint needed)
225
+ document-agent merge-slides deck.pptx:0-10 -o subset.pptx --backend pptx
226
+ ```
227
+
228
+ Backends: COM (PowerPoint, preserves animations/transitions/media) is preferred.
229
+ Falls back to python-pptx (portable, pure Python) when PowerPoint is unavailable.
230
+ Install optional dependencies: `pip install semos-agentura-document[slides]`
231
+
232
+ ## Setup
233
+
234
+ ```bash
235
+ uv sync
236
+ cp .env.example .env
237
+ # Edit .env with your API keys and tool paths
238
+ ```
239
+
240
+ ### Node.js tools (Marp, Mermaid, draw.io CLI)
241
+
242
+ Marp CLI, Mermaid CLI, and draw.io export are installed locally via npm:
243
+
244
+ ```bash
245
+ cd tools && npm install
246
+ ```
247
+
248
+ The agent automatically discovers them in `tools/node_modules/.bin/`.
249
+
250
+ ### Other external tools
251
+
252
+ - [Pandoc](https://pandoc.org/) - DOCX digest and document composition - install system-wide
253
+ - [LibreOffice](https://www.libreoffice.org/) - Office format OCR fallback - install system-wide
254
+ - [draw.io desktop](https://github.com/jgraph/drawio-desktop) - needed for rendering diagrams with embedded images. Set `DRAWIO_DESKTOP_PATH` in `.env`.
255
+
256
+ ## Python API
257
+
258
+ ```python
259
+ from semos.agentura.document import (
260
+ digest, compose, compose_editable_slides,
261
+ merge_slides, parse_source_args,
262
+ OutputFormat, OutputMode,
263
+ )
264
+
265
+ # Digest a DOCX with tracked changes
266
+ result = digest("document.docx", track_changes="all")
267
+ print(result.markdown)
268
+
269
+ # Compose with YAML styles
270
+ result = compose("styled.md", "output.docx", OutputFormat.DOCX)
271
+
272
+ # Compose with reference doc
273
+ result = compose("input.md", "output.docx", OutputFormat.DOCX,
274
+ reference_doc="template.docx")
275
+
276
+ # Slides - polished (Marp)
277
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
278
+ is_slides=True)
279
+
280
+ # Slides - draft editable (pandoc)
281
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
282
+ is_slides=True, draft=True)
283
+
284
+ # Slides - draft + corporate template
285
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
286
+ is_slides=True, draft=True,
287
+ template="corporate.pptx")
288
+
289
+ # Merge slides from CLI-style args
290
+ config = parse_source_args(
291
+ ["deck1.pptx:0-5", "deck2.pptx:3,7"],
292
+ output="merged.pptx",
293
+ )
294
+ result = merge_slides(config, "merged.pptx")
295
+ ```
@@ -0,0 +1,269 @@
1
+ # document-agent
2
+
3
+ Document digestion (OCR to Markdown), composition (Markdown to documents), diagram generation, raster image generation, and form filling. Full round-trip support for footnotes, tracked changes, comments, and styles.
4
+
5
+ ## Digestion
6
+
7
+ Convert PDF, images, and Office documents to Markdown.
8
+
9
+ - **PDF/images**: OCR via Mistral Document AI (direct or Azure AI Foundry)
10
+ - **DOCX/ODT**: pandoc-based extraction preserving footnotes, tracked changes, comments, and document styles
11
+
12
+ ```bash
13
+ # Basic digest (auto-selects pandoc for DOCX, OCR for PDF/images)
14
+ document-agent digest document.docx
15
+
16
+ # Show all tracked changes and comments
17
+ document-agent digest document.docx --track-changes all
18
+
19
+ # Force OCR pipeline for a scanned DOCX
20
+ document-agent digest scanned.docx --mode ocr
21
+
22
+ # Inline mode (base64-embedded images, prints to stdout)
23
+ document-agent digest document.pdf --inline
24
+
25
+ # With structured annotation extraction
26
+ document-agent digest document.pdf --schema schema.py --prompt "Extract all line items"
27
+ ```
28
+
29
+ ### Round-trip metadata
30
+
31
+ DOCX digestion preserves four types of metadata that round-trip through compose:
32
+
33
+ - **Footnotes**: `[^1]` with `[^1]: definition text` at end
34
+ - **Tracked changes**: `[text]{.insertion author="Name" date="2026-01-01"}` / `{.deletion ...}`
35
+ - **Comments**: `[comment text]{.comment-start id="1" author="Name"}...{.comment-end id="1"}`
36
+ - **Document styles**: YAML front matter block (fonts, sizes, colors, spacing, margins)
37
+
38
+ Supported input formats: PDF, PNG, JPG, JPEG, WEBP, TIFF, BMP, DOCX, PPTX, XLSX, ODT.
39
+
40
+ ## Composition
41
+
42
+ Convert Markdown to various output formats with style control.
43
+
44
+ ```bash
45
+ # Basic compose
46
+ document-agent compose input.md output.docx --format docx
47
+
48
+ # With a reference document for style inheritance
49
+ document-agent compose input.md output.docx --format docx \
50
+ --reference-doc template.docx
51
+
52
+ # With headers/footers from a template (combined with YAML styles)
53
+ document-agent compose input.md output.docx --format docx \
54
+ --header-footer-doc template.docx
55
+
56
+ # Slides - polished (Marp, limited editability)
57
+ document-agent compose slides.md out.pptx --format pptx --slides
58
+
59
+ # Slides - draft editable (pandoc, fully editable, rough layout)
60
+ document-agent compose slides.md out.pptx --format pptx --slides --draft
61
+
62
+ # Slides - draft + corporate template (requires PowerPoint)
63
+ document-agent compose slides.md out.pptx --format pptx --slides --draft \
64
+ --template corporate.pptx
65
+ ```
66
+
67
+ ### YAML front matter styles
68
+
69
+ Define document formatting directly in the Markdown source. Styles are auto-extracted during digest and auto-applied during compose:
70
+
71
+ ```yaml
72
+ ---
73
+ title: "Document Title"
74
+ subtitle: "Optional *formatted* subtitle"
75
+ styles:
76
+ page:
77
+ size: "A4" # or "Letter"
78
+ margin-top: "1.5cm"
79
+ margin-bottom: "1.5cm"
80
+ margin-left: "1.5cm"
81
+ margin-right: "1.5cm"
82
+ body:
83
+ font: "Calibri"
84
+ size: 11
85
+ spacing-before: "0.0cm"
86
+ spacing-after: "0.1cm"
87
+ line-spacing: 1.1
88
+ heading1:
89
+ font: "Calibri"
90
+ size: 13
91
+ bold: true
92
+ color: "000080" # navy
93
+ spacing-before: "0.3cm"
94
+ spacing-after: "0.1cm"
95
+ heading2:
96
+ font: "Calibri"
97
+ size: 11
98
+ bold: true
99
+ color: "000080"
100
+ heading3:
101
+ font: "Calibri"
102
+ size: 11
103
+ bold: true
104
+ color: "333333"
105
+ table:
106
+ size: 9 # also used for footnotes and captions
107
+ border-color: "999999"
108
+ border-size: 4 # eighths of a point
109
+ fixed: false # true to keep equal-width columns
110
+ ---
111
+ ```
112
+
113
+ Style priority: YAML front matter > `--reference-doc` > pandoc defaults.
114
+
115
+ Mermaid and draw.io diagrams in fenced code blocks are automatically rendered as images.
116
+
117
+ ## Diagrams
118
+
119
+ Generate and modify diagrams (Mermaid or draw.io) using LLM-powered optimization.
120
+
121
+ ```bash
122
+ # Generate from description
123
+ document-agent diagram "flowchart of CI/CD pipeline" -o diagram.png
124
+
125
+ # Modify existing draw.io diagram
126
+ document-agent diagram "Change WP3 label to Digital Infrastructure" \
127
+ --source diagram.drawio.png -o updated.drawio.png \
128
+ --code-output updated.drawio
129
+
130
+ # Mermaid diagram
131
+ document-agent diagram "sequence diagram for auth flow" --type mermaid -o auth.png
132
+ ```
133
+
134
+ draw.io diagrams with embedded raster images are handled automatically:
135
+ images are stripped for LLM editing (reducing context from ~400 KB to ~8 KB)
136
+ and restored in the output. For correct PNG rendering of embedded images,
137
+ set `DRAWIO_DESKTOP_PATH` in `.env` (the npm CLI can't render inline images).
138
+
139
+ ## Image Generation
140
+
141
+ Generate, edit, and extract elements from raster images using text-to-image models.
142
+
143
+ ### Modes
144
+
145
+ - **generate**: Text-to-image from a prompt, with optional style prefix
146
+ - **edit**: Modify an existing image with a text prompt (inpainting with optional mask for OpenAI, img2img for Flux)
147
+ - **cut**: Extract a specific element from an image (VLM-guided bounding box + background removal, falls back to image model isolation)
148
+
149
+ ### Supported providers
150
+
151
+ | Provider | Endpoint format | Generate | Edit | Models |
152
+ |----------|----------------|----------|------|--------|
153
+ | Azure OpenAI | `https://{resource}.cognitiveservices.azure.com/openai/deployments/{model}` | Yes | Yes (mask) | gpt-image-2 |
154
+ | OpenAI | `https://api.openai.com` | Yes | Yes (mask) | gpt-image-2, dall-e-3 |
155
+ | Azure AI Foundry | `https://{resource}.services.ai.azure.com/providers/{provider}` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
156
+ | Direct provider | `https://api.bfl.ai` | Yes | Yes (img2img) | flux-2-pro, flux-2-dev |
157
+
158
+ Configure via environment variables:
159
+
160
+ ```bash
161
+ IMAGE_GEN_ENDPOINT=https://example.cognitiveservices.azure.com/openai/deployments/gpt-image-2
162
+ IMAGE_GEN_API_KEY=your-key
163
+ IMAGE_GEN_MODEL=gpt-image-2
164
+ ```
165
+
166
+ ### Crossover: embedding images in diagrams
167
+
168
+ Generated images (icons, symbols) can be embedded into draw.io diagrams via the `embeds` parameter on `generate_diagram`. Refer to each embed by its filename in the diagram description so the LLM knows where to place them.
169
+
170
+ The embed mechanism reuses the existing draw.io image strip/restore pipeline: images are represented as `__IMG_N__` placeholders during LLM codegen and replaced with base64 data URIs before rendering.
171
+
172
+ ## Forms
173
+
174
+ Inspect and fill form fields in PDF and DOCX files.
175
+
176
+ ```bash
177
+ # Inspect form fields
178
+ document-agent inspect form.pdf
179
+ document-agent inspect form.docx --json
180
+
181
+ # Fill form fields
182
+ document-agent fill form.pdf filled.pdf --data '{"name": "John", "date": "2026-01-01"}'
183
+ document-agent fill form.docx filled.docx --data fields.json
184
+ ```
185
+
186
+ ## Slide Merge
187
+
188
+ Cherry-pick and merge slides from multiple PPTX sources.
189
+
190
+ ```bash
191
+ # Merge specific slides by index
192
+ document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7,12 -o merged.pptx
193
+
194
+ # Use a different base template for theme/master
195
+ document-agent merge-slides deck1.pptx:0-5 deck2.pptx:3,7 \
196
+ --base template.pptx -o merged.pptx
197
+
198
+ # Force python-pptx backend (portable, no PowerPoint needed)
199
+ document-agent merge-slides deck.pptx:0-10 -o subset.pptx --backend pptx
200
+ ```
201
+
202
+ Backends: COM (PowerPoint, preserves animations/transitions/media) is preferred.
203
+ Falls back to python-pptx (portable, pure Python) when PowerPoint is unavailable.
204
+ Install optional dependencies: `pip install semos-agentura-document[slides]`
205
+
206
+ ## Setup
207
+
208
+ ```bash
209
+ uv sync
210
+ cp .env.example .env
211
+ # Edit .env with your API keys and tool paths
212
+ ```
213
+
214
+ ### Node.js tools (Marp, Mermaid, draw.io CLI)
215
+
216
+ Marp CLI, Mermaid CLI, and draw.io export are installed locally via npm:
217
+
218
+ ```bash
219
+ cd tools && npm install
220
+ ```
221
+
222
+ The agent automatically discovers them in `tools/node_modules/.bin/`.
223
+
224
+ ### Other external tools
225
+
226
+ - [Pandoc](https://pandoc.org/) - DOCX digest and document composition - install system-wide
227
+ - [LibreOffice](https://www.libreoffice.org/) - Office format OCR fallback - install system-wide
228
+ - [draw.io desktop](https://github.com/jgraph/drawio-desktop) - needed for rendering diagrams with embedded images. Set `DRAWIO_DESKTOP_PATH` in `.env`.
229
+
230
+ ## Python API
231
+
232
+ ```python
233
+ from semos.agentura.document import (
234
+ digest, compose, compose_editable_slides,
235
+ merge_slides, parse_source_args,
236
+ OutputFormat, OutputMode,
237
+ )
238
+
239
+ # Digest a DOCX with tracked changes
240
+ result = digest("document.docx", track_changes="all")
241
+ print(result.markdown)
242
+
243
+ # Compose with YAML styles
244
+ result = compose("styled.md", "output.docx", OutputFormat.DOCX)
245
+
246
+ # Compose with reference doc
247
+ result = compose("input.md", "output.docx", OutputFormat.DOCX,
248
+ reference_doc="template.docx")
249
+
250
+ # Slides - polished (Marp)
251
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
252
+ is_slides=True)
253
+
254
+ # Slides - draft editable (pandoc)
255
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
256
+ is_slides=True, draft=True)
257
+
258
+ # Slides - draft + corporate template
259
+ result = compose("slides.md", "out.pptx", OutputFormat.PPTX,
260
+ is_slides=True, draft=True,
261
+ template="corporate.pptx")
262
+
263
+ # Merge slides from CLI-style args
264
+ config = parse_source_args(
265
+ ["deck1.pptx:0-5", "deck2.pptx:3,7"],
266
+ output="merged.pptx",
267
+ )
268
+ result = merge_slides(config, "merged.pptx")
269
+ ```
@@ -0,0 +1,10 @@
1
+ FROM ubuntu:24.04
2
+ ENV DEBIAN_FRONTEND=noninteractive
3
+ RUN apt-get update && \
4
+ apt-get install -y \
5
+ libreoffice-impress libreoffice-script-provider-python \
6
+ python3-uno python3 && \
7
+ apt-get clean && rm -rf /var/lib/apt/lists/*
8
+ ENV SAL_USE_VCLPLUGIN=svp
9
+ ENV HOME=/tmp
10
+ WORKDIR /work