docintel-platform 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docintel_platform-1.2.0/src/docintel_platform.egg-info → docintel_platform-1.3.0}/PKG-INFO +9 -5
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/README.md +5 -4
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/pyproject.toml +4 -1
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/__init__.py +1 -1
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/extract.py +33 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/models.py +1 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/registry.py +8 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/sniff.py +3 -1
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/client.py +42 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/config.py +1 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/models.py +1 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/queue.py +19 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/store.py +12 -2
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/tasks.py +39 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/openapi/openapi.yaml +60 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/documents.py +95 -0
- docintel_platform-1.3.0/src/docintel/storage/s3_ingest.py +61 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ui.py +3 -3
- {docintel_platform-1.2.0 → docintel_platform-1.3.0/src/docintel_platform.egg-info}/PKG-INFO +9 -5
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/SOURCES.txt +3 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/requires.txt +3 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_document_formats.py +28 -1
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_health.py +1 -1
- docintel_platform-1.3.0/tests/test_job_ttl.py +40 -0
- docintel_platform-1.3.0/tests/test_s3_ingest.py +101 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/LICENSE +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/MANIFEST.in +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/setup.cfg +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/app.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/api_keys.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/limiter.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/middleware.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/oidc.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/sensitive.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/search.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pipeline/process.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/classify.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/compare.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/cli.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/helpers.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/webhooks.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/openapi/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/logging.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/metrics.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/middleware.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/prometheus.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/async_enqueue.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/batch.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/document_upload.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/jobs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/openapi_docs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/ops.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/pdf.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/text.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/annotator.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/ocr.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/search.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/sensitive.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_llm.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_render.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_schema.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/textrank.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/local.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/s3.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/wsgi.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_annotate_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_auth.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_batch.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_client.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_detect_sensitive_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_async_routes.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_classify.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_compare.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_compare_files.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_detect_pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_process.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_process_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_summarize.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_jobs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_oidc.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_openapi.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ops.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_routes.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_sensitive.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_service.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pii_mask.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_storage.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_structure_pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_summary_routes.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_summary_service.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ui.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ui_process.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_vertical_presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_webhooks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docintel-platform
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
|
|
5
5
|
Author: Babandeep Singh
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,9 +40,11 @@ Requires-Dist: fakeredis>=2.26.2; extra == "dev"
|
|
|
40
40
|
Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
|
|
41
41
|
Requires-Dist: python-docx>=1.1.2; extra == "dev"
|
|
42
42
|
Requires-Dist: openpyxl>=3.1.5; extra == "dev"
|
|
43
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "dev"
|
|
43
44
|
Provides-Extra: documents
|
|
44
45
|
Requires-Dist: python-docx>=1.1.2; extra == "documents"
|
|
45
46
|
Requires-Dist: openpyxl>=3.1.5; extra == "documents"
|
|
47
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "documents"
|
|
46
48
|
Provides-Extra: ocr
|
|
47
49
|
Requires-Dist: easyocr>=1.7.2; extra == "ocr"
|
|
48
50
|
Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
|
|
@@ -80,6 +82,7 @@ Requires-Dist: cryptography>=43.0.0; extra == "all"
|
|
|
80
82
|
Requires-Dist: gradio>=4.44.0; extra == "all"
|
|
81
83
|
Requires-Dist: python-docx>=1.1.2; extra == "all"
|
|
82
84
|
Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
85
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "all"
|
|
83
86
|
|
|
84
87
|
# Document Intelligence Platform
|
|
85
88
|
|
|
@@ -90,7 +93,7 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
|
90
93
|
|
|
91
94
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
92
95
|
|
|
93
|
-
**Version:** 1.2.0
|
|
96
|
+
**Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
94
97
|
|
|
95
98
|
---
|
|
96
99
|
|
|
@@ -119,7 +122,7 @@ Gradio includes a **Document process** tab (unified pipeline). It needs the API
|
|
|
119
122
|
```bash
|
|
120
123
|
pip install docintel-platform
|
|
121
124
|
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
122
|
-
pip install "docintel-platform[documents]" # Word and Excel
|
|
125
|
+
pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
|
|
123
126
|
```
|
|
124
127
|
|
|
125
128
|
**Python client:**
|
|
@@ -141,13 +144,13 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
141
144
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
142
145
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
143
146
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
144
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
147
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
|
|
145
148
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
146
149
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
147
150
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
148
151
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
149
152
|
|
|
150
|
-
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
153
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
|
|
151
154
|
|
|
152
155
|
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
153
156
|
|
|
@@ -199,6 +202,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
|
|
|
199
202
|
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
200
203
|
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
201
204
|
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
205
|
+
| [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
|
|
202
206
|
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
203
207
|
|
|
204
208
|
---
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
9
9
|
|
|
10
|
-
**Version:** 1.2.0
|
|
10
|
+
**Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
@@ -36,7 +36,7 @@ Gradio includes a **Document process** tab (unified pipeline). It needs the API
|
|
|
36
36
|
```bash
|
|
37
37
|
pip install docintel-platform
|
|
38
38
|
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
39
|
-
pip install "docintel-platform[documents]" # Word and Excel
|
|
39
|
+
pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
**Python client:**
|
|
@@ -58,13 +58,13 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
58
58
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
59
59
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
60
60
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
61
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
61
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
|
|
62
62
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
63
63
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
64
64
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
65
65
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
66
66
|
|
|
67
|
-
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
67
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
|
|
68
68
|
|
|
69
69
|
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
70
70
|
|
|
@@ -116,6 +116,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
|
|
|
116
116
|
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
117
117
|
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
118
118
|
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
119
|
+
| [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
|
|
119
120
|
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
120
121
|
|
|
121
122
|
---
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docintel-platform"
|
|
7
|
-
version = "1.2.0"
|
|
7
|
+
version = "1.3.0"
|
|
8
8
|
description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -50,10 +50,12 @@ dev = [
|
|
|
50
50
|
"prometheus-client>=0.21.0",
|
|
51
51
|
"python-docx>=1.1.2",
|
|
52
52
|
"openpyxl>=3.1.5",
|
|
53
|
+
"python-pptx>=1.0.2",
|
|
53
54
|
]
|
|
54
55
|
documents = [
|
|
55
56
|
"python-docx>=1.1.2",
|
|
56
57
|
"openpyxl>=3.1.5",
|
|
58
|
+
"python-pptx>=1.0.2",
|
|
57
59
|
]
|
|
58
60
|
ocr = [
|
|
59
61
|
"easyocr>=1.7.2",
|
|
@@ -98,6 +100,7 @@ all = [
|
|
|
98
100
|
"gradio>=4.44.0",
|
|
99
101
|
"python-docx>=1.1.2",
|
|
100
102
|
"openpyxl>=3.1.5",
|
|
103
|
+
"python-pptx>=1.0.2",
|
|
101
104
|
]
|
|
102
105
|
|
|
103
106
|
[project.scripts]
|
|
@@ -39,6 +39,8 @@ def extract_document_text(
|
|
|
39
39
|
return _extract_docx(file_path, resolved)
|
|
40
40
|
if resolved.kind is DocumentKind.XLSX:
|
|
41
41
|
return _extract_xlsx(file_path, resolved)
|
|
42
|
+
if resolved.kind is DocumentKind.PPTX:
|
|
43
|
+
return _extract_pptx(file_path, resolved)
|
|
42
44
|
if resolved.kind is DocumentKind.CSV:
|
|
43
45
|
return _extract_csv(file_path, resolved)
|
|
44
46
|
if resolved.kind is DocumentKind.JSON:
|
|
@@ -129,6 +131,37 @@ def _extract_xlsx(path: Path, identification: IdentificationResult) -> Extractio
|
|
|
129
131
|
)
|
|
130
132
|
|
|
131
133
|
|
|
134
|
+
def _extract_pptx(path: Path, identification: IdentificationResult) -> ExtractionResult:
|
|
135
|
+
try:
|
|
136
|
+
from pptx import Presentation
|
|
137
|
+
except ImportError as exc:
|
|
138
|
+
raise RuntimeError(
|
|
139
|
+
"PowerPoint support requires optional dependencies. Install: pip install -e '.[documents]'"
|
|
140
|
+
) from exc
|
|
141
|
+
|
|
142
|
+
presentation = Presentation(path)
|
|
143
|
+
segments: list[dict] = []
|
|
144
|
+
parts: list[str] = []
|
|
145
|
+
for slide_index, slide in enumerate(presentation.slides, start=1):
|
|
146
|
+
slide_parts: list[str] = []
|
|
147
|
+
for shape in slide.shapes:
|
|
148
|
+
text = getattr(shape, "text", "").strip()
|
|
149
|
+
if text:
|
|
150
|
+
slide_parts.append(text)
|
|
151
|
+
slide_text = "\n".join(slide_parts)
|
|
152
|
+
segments.append({"slide": slide_index, "text": slide_text})
|
|
153
|
+
if slide_text:
|
|
154
|
+
parts.append(f"# Slide {slide_index}\n{slide_text}")
|
|
155
|
+
|
|
156
|
+
return ExtractionResult(
|
|
157
|
+
kind=identification.kind,
|
|
158
|
+
mime_type=identification.mime_type,
|
|
159
|
+
text="\n\n".join(parts),
|
|
160
|
+
segments=segments,
|
|
161
|
+
metadata={"slide_count": len(presentation.slides)},
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
132
165
|
def _extract_csv(path: Path, identification: IdentificationResult) -> ExtractionResult:
|
|
133
166
|
raw = path.read_text(encoding="utf-8", errors="replace")
|
|
134
167
|
sample = raw[:2048]
|
|
@@ -31,6 +31,14 @@ _PROFILES: tuple[DocumentProfile, ...] = (
|
|
|
31
31
|
supports_pdf_pipeline=False,
|
|
32
32
|
supports_text_extraction=True,
|
|
33
33
|
),
|
|
34
|
+
DocumentProfile(
|
|
35
|
+
kind=DocumentKind.PPTX,
|
|
36
|
+
mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
37
|
+
extensions=(".pptx",),
|
|
38
|
+
label="PowerPoint presentation",
|
|
39
|
+
supports_pdf_pipeline=False,
|
|
40
|
+
supports_text_extraction=True,
|
|
41
|
+
),
|
|
34
42
|
DocumentProfile(
|
|
35
43
|
kind=DocumentKind.CSV,
|
|
36
44
|
mime_type="text/csv",
|
|
@@ -31,6 +31,8 @@ def _sniff_zip_kind(path: Path) -> DocumentKind | None:
|
|
|
31
31
|
|
|
32
32
|
if any(name.startswith("word/") for name in names):
|
|
33
33
|
return DocumentKind.DOCX
|
|
34
|
+
if any(name.startswith("ppt/") for name in names):
|
|
35
|
+
return DocumentKind.PPTX
|
|
34
36
|
if any(name.startswith("xl/") for name in names):
|
|
35
37
|
return DocumentKind.XLSX
|
|
36
38
|
return None
|
|
@@ -80,7 +82,7 @@ def _looks_like_csv(sample: str) -> bool:
|
|
|
80
82
|
|
|
81
83
|
|
|
82
84
|
def _requires_content_confirmation(kind: DocumentKind) -> bool:
|
|
83
|
-
return kind in {DocumentKind.PDF, DocumentKind.DOCX, DocumentKind.XLSX}
|
|
85
|
+
return kind in {DocumentKind.PDF, DocumentKind.DOCX, DocumentKind.XLSX, DocumentKind.PPTX}
|
|
84
86
|
|
|
85
87
|
|
|
86
88
|
def _build_result(
|
|
@@ -416,3 +416,45 @@ class DocintelClient:
|
|
|
416
416
|
data=data,
|
|
417
417
|
poll=poll,
|
|
418
418
|
)
|
|
419
|
+
|
|
420
|
+
def ingest_document_from_s3(
|
|
421
|
+
self,
|
|
422
|
+
*,
|
|
423
|
+
s3_uri: str | None = None,
|
|
424
|
+
bucket: str | None = None,
|
|
425
|
+
key: str | None = None,
|
|
426
|
+
sentences: int = 3,
|
|
427
|
+
include_summarize: bool = True,
|
|
428
|
+
include_pii: bool = True,
|
|
429
|
+
include_text: bool = False,
|
|
430
|
+
entities: str | None = None,
|
|
431
|
+
vertical: str | None = None,
|
|
432
|
+
min_score: float = 0.35,
|
|
433
|
+
callback_url: str | None = None,
|
|
434
|
+
poll: bool = True,
|
|
435
|
+
) -> dict[str, Any]:
|
|
436
|
+
body: dict[str, Any] = {
|
|
437
|
+
"operation": "process",
|
|
438
|
+
"sentences": sentences,
|
|
439
|
+
"include_summarize": include_summarize,
|
|
440
|
+
"include_pii": include_pii,
|
|
441
|
+
"include_text": include_text,
|
|
442
|
+
"min_score": min_score,
|
|
443
|
+
}
|
|
444
|
+
if s3_uri:
|
|
445
|
+
body["s3_uri"] = s3_uri
|
|
446
|
+
if bucket:
|
|
447
|
+
body["bucket"] = bucket
|
|
448
|
+
if key:
|
|
449
|
+
body["key"] = key
|
|
450
|
+
if entities:
|
|
451
|
+
body["entities"] = entities
|
|
452
|
+
if vertical:
|
|
453
|
+
body["vertical"] = vertical
|
|
454
|
+
if callback_url:
|
|
455
|
+
body["callback_url"] = callback_url
|
|
456
|
+
return self._post_async_json(
|
|
457
|
+
"/v1/documents/ingest",
|
|
458
|
+
json_body=body,
|
|
459
|
+
poll=poll,
|
|
460
|
+
)
|
|
@@ -11,6 +11,7 @@ class Config:
|
|
|
11
11
|
LOG_LEVEL = os.getenv("DOCINTEL_LOG_LEVEL", "INFO")
|
|
12
12
|
REDIS_URL = os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0")
|
|
13
13
|
JOBS_ENABLED = os.getenv("DOCINTEL_JOBS_ENABLED", "true").lower() == "true"
|
|
14
|
+
JOB_TTL_SECONDS = int(os.getenv("DOCINTEL_JOB_TTL_SECONDS", str(60 * 60 * 24 * 7)))
|
|
14
15
|
QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")
|
|
15
16
|
API_KEYS = os.getenv("DOCINTEL_API_KEYS", "")
|
|
16
17
|
AUTH_REQUIRED = os.getenv("DOCINTEL_AUTH_REQUIRED", "false").lower() == "true"
|
|
@@ -270,6 +270,25 @@ def enqueue_extract_text_job(
|
|
|
270
270
|
)
|
|
271
271
|
|
|
272
272
|
|
|
273
|
+
def enqueue_s3_document_process_job(
|
|
274
|
+
job_id: str,
|
|
275
|
+
bucket: str,
|
|
276
|
+
key: str,
|
|
277
|
+
options: dict,
|
|
278
|
+
) -> None:
|
|
279
|
+
queue = get_queue()
|
|
280
|
+
queue.enqueue(
|
|
281
|
+
"docintel.jobs.tasks.run_s3_document_process_job",
|
|
282
|
+
job_id=job_id,
|
|
283
|
+
bucket=bucket,
|
|
284
|
+
key=key,
|
|
285
|
+
options=options,
|
|
286
|
+
job_timeout=900,
|
|
287
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
288
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
|
|
273
292
|
def enqueue_compare_job(
|
|
274
293
|
job_id: str,
|
|
275
294
|
*,
|
|
@@ -12,6 +12,15 @@ JOB_KEY_PREFIX = "docintel:job:"
|
|
|
12
12
|
DEFAULT_JOB_TTL_SECONDS = 60 * 60 * 24 * 7
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def job_ttl_seconds() -> int:
|
|
16
|
+
raw = os.getenv("DOCINTEL_JOB_TTL_SECONDS", str(DEFAULT_JOB_TTL_SECONDS)).strip()
|
|
17
|
+
try:
|
|
18
|
+
ttl = int(raw)
|
|
19
|
+
except ValueError:
|
|
20
|
+
return DEFAULT_JOB_TTL_SECONDS
|
|
21
|
+
return max(ttl, 60)
|
|
22
|
+
|
|
23
|
+
|
|
15
24
|
def redis_url() -> str:
|
|
16
25
|
return os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0").strip()
|
|
17
26
|
|
|
@@ -37,9 +46,10 @@ def _job_key(job_id: str) -> str:
|
|
|
37
46
|
return f"{JOB_KEY_PREFIX}{job_id}"
|
|
38
47
|
|
|
39
48
|
|
|
40
|
-
def save_job(record: JobRecord, ttl_seconds: int = DEFAULT_JOB_TTL_SECONDS) -> None:
|
|
49
|
+
def save_job(record: JobRecord, ttl_seconds: int | None = None) -> None:
|
|
41
50
|
client = _redis_client()
|
|
42
|
-
|
|
51
|
+
resolved_ttl = ttl_seconds if ttl_seconds is not None else job_ttl_seconds()
|
|
52
|
+
client.set(_job_key(record.job_id), json.dumps(record.to_dict()), ex=resolved_ttl)
|
|
43
53
|
|
|
44
54
|
|
|
45
55
|
def get_job(job_id: str) -> JobRecord | None:
|
|
@@ -645,6 +645,45 @@ def run_compare_job(
|
|
|
645
645
|
)
|
|
646
646
|
|
|
647
647
|
|
|
648
|
+
def run_s3_document_process_job(
|
|
649
|
+
*,
|
|
650
|
+
job_id: str,
|
|
651
|
+
bucket: str,
|
|
652
|
+
key: str,
|
|
653
|
+
options: dict,
|
|
654
|
+
) -> dict:
|
|
655
|
+
from docintel.storage.s3_ingest import download_s3_object_to_job_dir
|
|
656
|
+
|
|
657
|
+
record = get_job(job_id)
|
|
658
|
+
callback_url = record.callback_url if record else None
|
|
659
|
+
update_job(
|
|
660
|
+
job_id,
|
|
661
|
+
job_status=JobStatus.RUNNING.value,
|
|
662
|
+
progress=5,
|
|
663
|
+
progress_message="Downloading from S3",
|
|
664
|
+
)
|
|
665
|
+
try:
|
|
666
|
+
input_path, filename = download_s3_object_to_job_dir(job_id, bucket, key)
|
|
667
|
+
except Exception as exc:
|
|
668
|
+
failed = update_job(
|
|
669
|
+
job_id,
|
|
670
|
+
job_status=JobStatus.FAILED.value,
|
|
671
|
+
progress=100,
|
|
672
|
+
progress_message="Job failed",
|
|
673
|
+
error=str(exc),
|
|
674
|
+
)
|
|
675
|
+
_notify_webhook(callback_url, failed)
|
|
676
|
+
raise
|
|
677
|
+
|
|
678
|
+
return run_document_process_job(
|
|
679
|
+
job_id=job_id,
|
|
680
|
+
input_path=str(input_path),
|
|
681
|
+
filename=filename,
|
|
682
|
+
content_type=None,
|
|
683
|
+
options=options,
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
|
|
648
687
|
def create_queued_job(
|
|
649
688
|
job_id: str,
|
|
650
689
|
*,
|
|
@@ -541,6 +541,66 @@ paths:
|
|
|
541
541
|
"503":
|
|
542
542
|
description: Presidio stack unavailable
|
|
543
543
|
|
|
544
|
+
/v1/documents/ingest:
|
|
545
|
+
post:
|
|
546
|
+
tags: [documents]
|
|
547
|
+
summary: Queue process pipeline for an S3 object
|
|
548
|
+
description: |
|
|
549
|
+
Downloads the object in the worker, then runs the same pipeline as
|
|
550
|
+
POST /v1/documents/process. Always returns 202 when Redis is available.
|
|
551
|
+
requestBody:
|
|
552
|
+
required: true
|
|
553
|
+
content:
|
|
554
|
+
application/json:
|
|
555
|
+
schema:
|
|
556
|
+
type: object
|
|
557
|
+
required: [operation]
|
|
558
|
+
properties:
|
|
559
|
+
s3_uri:
|
|
560
|
+
type: string
|
|
561
|
+
example: s3://my-bucket/inbox/policy.docx
|
|
562
|
+
bucket:
|
|
563
|
+
type: string
|
|
564
|
+
key:
|
|
565
|
+
type: string
|
|
566
|
+
operation:
|
|
567
|
+
type: string
|
|
568
|
+
enum: [process]
|
|
569
|
+
default: process
|
|
570
|
+
sentences:
|
|
571
|
+
type: integer
|
|
572
|
+
minimum: 1
|
|
573
|
+
maximum: 20
|
|
574
|
+
include_summarize:
|
|
575
|
+
type: boolean
|
|
576
|
+
default: true
|
|
577
|
+
include_pii:
|
|
578
|
+
type: boolean
|
|
579
|
+
default: true
|
|
580
|
+
include_text:
|
|
581
|
+
type: boolean
|
|
582
|
+
default: false
|
|
583
|
+
vertical:
|
|
584
|
+
type: string
|
|
585
|
+
entities:
|
|
586
|
+
type: string
|
|
587
|
+
min_score:
|
|
588
|
+
type: number
|
|
589
|
+
callback_url:
|
|
590
|
+
type: string
|
|
591
|
+
format: uri
|
|
592
|
+
responses:
|
|
593
|
+
"202":
|
|
594
|
+
description: S3 ingest job queued
|
|
595
|
+
content:
|
|
596
|
+
application/json:
|
|
597
|
+
schema:
|
|
598
|
+
$ref: "#/components/schemas/AsyncAccepted"
|
|
599
|
+
"400":
|
|
600
|
+
description: Invalid S3 location or options
|
|
601
|
+
"503":
|
|
602
|
+
description: Async jobs unavailable
|
|
603
|
+
|
|
544
604
|
/v1/documents/process:
|
|
545
605
|
post:
|
|
546
606
|
tags: [documents]
|
|
@@ -182,6 +182,60 @@ def _parse_process_options() -> tuple[ProcessOptions | None, dict | None, int |
|
|
|
182
182
|
)
|
|
183
183
|
|
|
184
184
|
|
|
185
|
+
def _parse_process_options_from_dict(
    payload: dict,
) -> tuple[ProcessOptions | None, dict | None, int | None]:
    """Build ``ProcessOptions`` from an already-decoded JSON request body.

    Args:
        payload: The JSON object sent by the client. Recognised keys are
            ``sentences``, ``vertical``, ``entities``, ``min_score``,
            ``include_summarize``, ``include_pii`` and ``include_text``.

    Returns:
        A ``(options, error_body, status)`` triple. On success ``options`` is
        set and the other two items are ``None``. On a validation failure
        ``options`` is ``None`` and ``error_body``/``status`` describe the
        400 response the caller should send.
    """
    import math  # local import: only this validator needs it

    raw_sentences = payload.get("sentences", DEFAULT_SENTENCE_COUNT)
    try:
        sentences = int(raw_sentences)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")): Python's JSON decoder accepts
        # the non-standard ``Infinity`` literal, which must be a 400, not a 500.
        return None, {"error": "Field 'sentences' must be an integer."}, 400
    if sentences < 1 or sentences > MAX_SENTENCE_COUNT:
        return None, {
            "error": f"Field 'sentences' must be between 1 and {MAX_SENTENCE_COUNT}."
        }, 400

    # Non-string values for ``vertical``/``entities`` are treated as absent.
    vertical = payload.get("vertical", "")
    vertical = vertical.strip() if isinstance(vertical, str) else ""
    entities_raw = payload.get("entities")
    try:
        entities = _resolve_entities(
            entities_raw if isinstance(entities_raw, str) else None,
            vertical or None,
        )
    except ValueError as exc:
        return None, {"error": str(exc)}, 400

    raw_min_score = payload.get("min_score", 0.35)
    try:
        min_score = float(raw_min_score)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() of an integer too large for a double.
        return None, {"error": "Field 'min_score' must be a number."}, 400
    if not math.isfinite(min_score):
        # NaN/Infinity would pass float() but are meaningless as a threshold.
        return None, {"error": "Field 'min_score' must be a number."}, 400

    def _bool_value(name: str, default: bool) -> bool:
        """Read a boolean flag; accept real booleans or truthy strings."""
        if name not in payload:
            return default
        value = payload[name]
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "yes", "on"}
        # Any other JSON type (number, list, null, ...) falls back to the default.
        return default

    return (
        ProcessOptions(
            sentences=sentences,
            include_summarize=_bool_value("include_summarize", True),
            include_pii=_bool_value("include_pii", True),
            include_text=_bool_value("include_text", False),
            entities=entities,
            min_score=min_score,
        ),
        None,
        None,
    )
|
|
237
|
+
|
|
238
|
+
|
|
185
239
|
@documents_bp.get("/types")
|
|
186
240
|
@limiter.limit("120 per hour")
|
|
187
241
|
def supported_document_types():
|
|
@@ -189,6 +243,47 @@ def supported_document_types():
|
|
|
189
243
|
return jsonify({"status": "ok", "types": list_supported_types()})
|
|
190
244
|
|
|
191
245
|
|
|
246
|
+
@documents_bp.post("/ingest")
@limiter.limit("20 per hour")
def ingest_document():
    """Validate a JSON ingest request for an S3 object and hand it to the background job queue.

    Responds with 400 when the body is not a JSON object, the operation is
    not ``process``, the S3 location cannot be resolved, or the processing
    options are invalid. Otherwise returns whatever
    ``enqueue_background_job`` produces.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    requested_operation = str(body.get("operation", "process")).strip().lower()
    if requested_operation != "process":
        return jsonify({"error": "Only operation 'process' is supported."}), 400

    from docintel.storage.s3_ingest import resolve_s3_location

    try:
        bucket, key = resolve_s3_location(body)
    except ValueError as location_error:
        return jsonify({"error": str(location_error)}), 400

    options, option_error, option_status = _parse_process_options_from_dict(body)
    if option_error is not None:
        return jsonify(option_error), option_status

    # A missing, non-string, or blank callback URL means "no callback".
    callback_url = None
    raw_callback = body.get("callback_url", "")
    if isinstance(raw_callback, str):
        callback_url = raw_callback.strip() or None

    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_s3_document_process_job

    job_id = uuid.uuid4().hex[:12]
    serialized_options = options.to_dict() if options else {}
    return enqueue_background_job(
        job_type=JobType.DOCUMENT_S3_PROCESS,
        callback_url=callback_url,
        enqueue_fn=enqueue_s3_document_process_job,
        job_id=job_id,
        bucket=bucket,
        key=key,
        options=serialized_options,
    )
|
|
285
|
+
|
|
286
|
+
|
|
192
287
|
@documents_bp.post("/identify")
|
|
193
288
|
@limiter.limit("120 per hour")
|
|
194
289
|
def identify_upload():
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Download objects from S3 for async document ingest."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from urllib.parse import unquote
|
|
9
|
+
|
|
10
|
+
from werkzeug.utils import secure_filename
|
|
11
|
+
|
|
12
|
+
_S3_URI_PATTERN = re.compile(r"^s3://([^/]+)/(.+)$")


def parse_s3_uri(uri: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``.

    Surrounding whitespace is ignored and the key is percent-decoded.

    Raises:
        ValueError: If the URI does not have the ``s3://bucket/key`` shape,
            or if the bucket or key is empty after trimming.
    """
    parsed = _S3_URI_PATTERN.match(uri.strip())
    if parsed is None:
        raise ValueError("s3_uri must look like s3://bucket/path/to/object")

    raw_bucket, raw_key = parsed.groups()
    bucket = raw_bucket.strip()
    key = unquote(raw_key.strip())
    if not (bucket and key):
        raise ValueError("s3_uri must include a bucket name and object key")
    return bucket, key
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def resolve_s3_location(payload: dict) -> tuple[str, str]:
    """Work out ``(bucket, key)`` from the fields of a JSON request body.

    A non-blank string ``s3_uri`` takes precedence and is parsed with
    ``parse_s3_uri``. Otherwise both ``bucket`` and ``key`` must be
    non-blank strings, and they are returned trimmed.

    Raises:
        ValueError: If neither form of location is present (or if
            ``parse_s3_uri`` rejects the URI).
    """

    def _trimmed(field: str) -> str:
        # Non-string values count as missing.
        candidate = payload.get(field)
        return candidate.strip() if isinstance(candidate, str) else ""

    if _trimmed("s3_uri"):
        return parse_s3_uri(payload["s3_uri"])

    bucket_name = _trimmed("bucket")
    object_key = _trimmed("key")
    if not (bucket_name and object_key):
        raise ValueError("Provide s3_uri or both bucket and key.")
    return bucket_name, object_key
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def s3_client():
    """Create a boto3 S3 client configured from environment variables.

    ``DOCINTEL_S3_REGION`` selects the region (default ``us-east-1``).
    ``DOCINTEL_S3_ENDPOINT_URL`` overrides the endpoint when set to a
    non-empty value.
    """
    import boto3

    region = os.getenv("DOCINTEL_S3_REGION", "us-east-1")
    endpoint = os.getenv("DOCINTEL_S3_ENDPOINT_URL", "")
    if not endpoint:
        # An unset or empty variable means "use the default endpoint".
        endpoint = None
    return boto3.client("s3", region_name=region, endpoint_url=endpoint)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def download_s3_object_to_job_dir(job_id: str, bucket: str, key: str) -> tuple[Path, str]:
    """Fetch an S3 object into the work directory of the given job.

    The local filename is the sanitised last path component of ``key``,
    or ``document.bin`` when sanitising leaves nothing.

    Returns:
        ``(destination_path, filename)`` for the downloaded file.
    """
    from docintel.storage import get_storage

    safe_name = secure_filename(Path(key).name)
    if not safe_name:
        safe_name = "document.bin"

    job_directory = get_storage().job_dir(job_id)
    job_directory.mkdir(parents=True, exist_ok=True)

    target = job_directory / safe_name
    s3_client().download_file(bucket, key, str(target))
    return target, safe_name
|
|
@@ -516,12 +516,12 @@ def build_ui():
|
|
|
516
516
|
outputs=summary_output,
|
|
517
517
|
)
|
|
518
518
|
|
|
519
|
-
office_types = [".pdf", ".docx", ".xlsx", ".csv", ".txt", ".md", ".json"]
|
|
519
|
+
office_types = [".pdf", ".docx", ".xlsx", ".pptx", ".csv", ".txt", ".md", ".json"]
|
|
520
520
|
with gr.Tab("Document process"):
|
|
521
521
|
gr.Markdown(
|
|
522
522
|
"Run extract, classify, summarize, and PII detection in one async job. "
|
|
523
523
|
"Requires Redis and a worker (`make run-worker` or docker-compose worker). "
|
|
524
|
-
"
|
|
524
|
+
"Office formats need `pip install -e '.[documents]'` on the API server (Word, Excel, PowerPoint)."
|
|
525
525
|
)
|
|
526
526
|
from docintel.capabilities.compliance.presets import list_vertical_presets
|
|
527
527
|
|
|
@@ -563,7 +563,7 @@ def build_ui():
|
|
|
563
563
|
with gr.Tab("Document tools"):
|
|
564
564
|
gr.Markdown(
|
|
565
565
|
"Identify, extract, classify, summarize, scan for PII, and compare office documents. "
|
|
566
|
-
"Requires `pip install -e '.[documents]'` for Word and
|
|
566
|
+
"Requires `pip install -e '.[documents]'` for Word, Excel, and PowerPoint."
|
|
567
567
|
)
|
|
568
568
|
with gr.Row():
|
|
569
569
|
doc_file = gr.File(label="Document upload", file_types=office_types)
|