docintel-platform 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. {docintel_platform-1.2.0/src/docintel_platform.egg-info → docintel_platform-1.3.0}/PKG-INFO +9 -5
  2. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/README.md +5 -4
  3. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/pyproject.toml +4 -1
  4. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/__init__.py +1 -1
  5. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/extract.py +33 -0
  6. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/models.py +1 -0
  7. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/registry.py +8 -0
  8. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/sniff.py +3 -1
  9. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/client.py +42 -0
  10. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/config.py +1 -0
  11. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/models.py +1 -0
  12. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/queue.py +19 -0
  13. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/store.py +12 -2
  14. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/tasks.py +39 -0
  15. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/openapi/openapi.yaml +60 -0
  16. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/documents.py +95 -0
  17. docintel_platform-1.3.0/src/docintel/storage/s3_ingest.py +61 -0
  18. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ui.py +3 -3
  19. {docintel_platform-1.2.0 → docintel_platform-1.3.0/src/docintel_platform.egg-info}/PKG-INFO +9 -5
  20. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/SOURCES.txt +3 -0
  21. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/requires.txt +3 -0
  22. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_document_formats.py +28 -1
  23. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_health.py +1 -1
  24. docintel_platform-1.3.0/tests/test_job_ttl.py +40 -0
  25. docintel_platform-1.3.0/tests/test_s3_ingest.py +101 -0
  26. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/LICENSE +0 -0
  27. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/MANIFEST.in +0 -0
  28. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/setup.cfg +0 -0
  29. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/app.py +0 -0
  30. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/__init__.py +0 -0
  31. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/api_keys.py +0 -0
  32. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/limiter.py +0 -0
  33. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/middleware.py +0 -0
  34. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/auth/oidc.py +0 -0
  35. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/__init__.py +0 -0
  36. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/__init__.py +0 -0
  37. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/pii.py +0 -0
  38. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/presets.py +0 -0
  39. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/compliance/sensitive.py +0 -0
  40. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
  41. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
  42. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
  43. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure.py +0 -0
  44. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
  45. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
  46. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
  47. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
  48. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
  49. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/models.py +0 -0
  50. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pdf/search.py +0 -0
  51. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
  52. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/pipeline/process.py +0 -0
  53. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
  54. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/classify.py +0 -0
  55. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/compare.py +0 -0
  56. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/models.py +0 -0
  57. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
  58. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/cli.py +0 -0
  59. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/__init__.py +0 -0
  60. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/helpers.py +0 -0
  61. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/jobs/webhooks.py +0 -0
  62. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/openapi/__init__.py +0 -0
  63. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/__init__.py +0 -0
  64. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/logging.py +0 -0
  65. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/metrics.py +0 -0
  66. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/middleware.py +0 -0
  67. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/ops/prometheus.py +0 -0
  68. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/__init__.py +0 -0
  69. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/async_enqueue.py +0 -0
  70. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/batch.py +0 -0
  71. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/document_upload.py +0 -0
  72. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/jobs.py +0 -0
  73. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/openapi_docs.py +0 -0
  74. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/ops.py +0 -0
  75. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/pdf.py +0 -0
  76. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/routes/text.py +0 -0
  77. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/__init__.py +0 -0
  78. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/__init__.py +0 -0
  79. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/annotator.py +0 -0
  80. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/models.py +0 -0
  81. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/ocr.py +0 -0
  82. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/pii.py +0 -0
  83. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/presets.py +0 -0
  84. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/search.py +0 -0
  85. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/sensitive.py +0 -0
  86. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure.py +0 -0
  87. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_llm.py +0 -0
  88. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_render.py +0 -0
  89. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/pdf/structure_schema.py +0 -0
  90. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/__init__.py +0 -0
  91. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/models.py +0 -0
  92. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/services/summary/textrank.py +0 -0
  93. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/__init__.py +0 -0
  94. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/local.py +0 -0
  95. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/storage/s3.py +0 -0
  96. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel/wsgi.py +0 -0
  97. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
  98. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
  99. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
  100. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_annotate_async.py +0 -0
  101. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_auth.py +0 -0
  102. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_batch.py +0 -0
  103. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_client.py +0 -0
  104. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_detect_sensitive_async.py +0 -0
  105. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_async_routes.py +0 -0
  106. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_classify.py +0 -0
  107. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_compare.py +0 -0
  108. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_compare_files.py +0 -0
  109. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_detect_pii.py +0 -0
  110. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_process.py +0 -0
  111. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_process_async.py +0 -0
  112. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_documents_summarize.py +0 -0
  113. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_jobs.py +0 -0
  114. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_oidc.py +0 -0
  115. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_openapi.py +0 -0
  116. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ops.py +0 -0
  117. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_routes.py +0 -0
  118. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_sensitive.py +0 -0
  119. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_service.py +0 -0
  120. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pdf_structure.py +0 -0
  121. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_pii_mask.py +0 -0
  122. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_storage.py +0 -0
  123. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_structure_pii.py +0 -0
  124. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_summary_routes.py +0 -0
  125. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_summary_service.py +0 -0
  126. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ui.py +0 -0
  127. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_ui_process.py +0 -0
  128. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_vertical_presets.py +0 -0
  129. {docintel_platform-1.2.0 → docintel_platform-1.3.0}/tests/test_webhooks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docintel-platform
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
5
5
  Author: Babandeep Singh
6
6
  License-Expression: MIT
@@ -40,9 +40,11 @@ Requires-Dist: fakeredis>=2.26.2; extra == "dev"
40
40
  Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
41
41
  Requires-Dist: python-docx>=1.1.2; extra == "dev"
42
42
  Requires-Dist: openpyxl>=3.1.5; extra == "dev"
43
+ Requires-Dist: python-pptx>=1.0.2; extra == "dev"
43
44
  Provides-Extra: documents
44
45
  Requires-Dist: python-docx>=1.1.2; extra == "documents"
45
46
  Requires-Dist: openpyxl>=3.1.5; extra == "documents"
47
+ Requires-Dist: python-pptx>=1.0.2; extra == "documents"
46
48
  Provides-Extra: ocr
47
49
  Requires-Dist: easyocr>=1.7.2; extra == "ocr"
48
50
  Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
@@ -80,6 +82,7 @@ Requires-Dist: cryptography>=43.0.0; extra == "all"
80
82
  Requires-Dist: gradio>=4.44.0; extra == "all"
81
83
  Requires-Dist: python-docx>=1.1.2; extra == "all"
82
84
  Requires-Dist: openpyxl>=3.1.5; extra == "all"
85
+ Requires-Dist: python-pptx>=1.0.2; extra == "all"
83
86
 
84
87
  # Document Intelligence Platform
85
88
 
@@ -90,7 +93,7 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
90
93
 
91
94
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
92
95
 
93
- **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
96
+ **Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
94
97
 
95
98
  ---
96
99
 
@@ -119,7 +122,7 @@ Gradio includes a **Document process** tab (unified pipeline). It needs the API
119
122
  ```bash
120
123
  pip install docintel-platform
121
124
  pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
122
- pip install "docintel-platform[documents]" # Word and Excel only
125
+ pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
123
126
  ```
124
127
 
125
128
  **Python client:**
@@ -141,13 +144,13 @@ report = client.process_document("policy.docx", include_pii=True)
141
144
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
142
145
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
143
146
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
144
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
147
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
145
148
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
146
149
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
147
150
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
148
151
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
149
152
 
150
- **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
153
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
151
154
 
152
155
  **PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
153
156
 
@@ -199,6 +202,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
199
202
  | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
200
203
  | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
201
204
  | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
205
+ | [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
202
206
  | [docs/adr/](docs/adr/) | Architecture decision records |
203
207
 
204
208
  ---
@@ -7,7 +7,7 @@
7
7
 
8
8
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
9
9
 
10
- **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
10
+ **Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
11
11
 
12
12
  ---
13
13
 
@@ -36,7 +36,7 @@ Gradio includes a **Document process** tab (unified pipeline). It needs the API
36
36
  ```bash
37
37
  pip install docintel-platform
38
38
  pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
39
- pip install "docintel-platform[documents]" # Word and Excel only
39
+ pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
40
40
  ```
41
41
 
42
42
  **Python client:**
@@ -58,13 +58,13 @@ report = client.process_document("policy.docx", include_pii=True)
58
58
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
59
59
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
60
60
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
61
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
61
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
62
62
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
63
63
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
64
64
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
65
65
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
66
66
 
67
- **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
67
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
68
68
 
69
69
  **PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
70
70
 
@@ -116,6 +116,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
116
116
  | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
117
117
  | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
118
118
  | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
119
+ | [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
119
120
  | [docs/adr/](docs/adr/) | Architecture decision records |
120
121
 
121
122
  ---
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docintel-platform"
7
- version = "1.2.0"
7
+ version = "1.3.0"
8
8
  description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -50,10 +50,12 @@ dev = [
50
50
  "prometheus-client>=0.21.0",
51
51
  "python-docx>=1.1.2",
52
52
  "openpyxl>=3.1.5",
53
+ "python-pptx>=1.0.2",
53
54
  ]
54
55
  documents = [
55
56
  "python-docx>=1.1.2",
56
57
  "openpyxl>=3.1.5",
58
+ "python-pptx>=1.0.2",
57
59
  ]
58
60
  ocr = [
59
61
  "easyocr>=1.7.2",
@@ -98,6 +100,7 @@ all = [
98
100
  "gradio>=4.44.0",
99
101
  "python-docx>=1.1.2",
100
102
  "openpyxl>=3.1.5",
103
+ "python-pptx>=1.0.2",
101
104
  ]
102
105
 
103
106
  [project.scripts]
@@ -2,5 +2,5 @@
2
2
 
3
3
  from docintel.client import DocintelClient, DocintelError
4
4
 
5
- __version__ = "1.2.0"
5
+ __version__ = "1.3.0"
6
6
  __all__ = ["DocintelClient", "DocintelError", "__version__"]
@@ -39,6 +39,8 @@ def extract_document_text(
39
39
  return _extract_docx(file_path, resolved)
40
40
  if resolved.kind is DocumentKind.XLSX:
41
41
  return _extract_xlsx(file_path, resolved)
42
+ if resolved.kind is DocumentKind.PPTX:
43
+ return _extract_pptx(file_path, resolved)
42
44
  if resolved.kind is DocumentKind.CSV:
43
45
  return _extract_csv(file_path, resolved)
44
46
  if resolved.kind is DocumentKind.JSON:
@@ -129,6 +131,37 @@ def _extract_xlsx(path: Path, identification: IdentificationResult) -> Extractio
129
131
  )
130
132
 
131
133
 
134
+ def _extract_pptx(path: Path, identification: IdentificationResult) -> ExtractionResult:
135
+ try:
136
+ from pptx import Presentation
137
+ except ImportError as exc:
138
+ raise RuntimeError(
139
+ "PowerPoint support requires optional dependencies. Install: pip install -e '.[documents]'"
140
+ ) from exc
141
+
142
+ presentation = Presentation(path)
143
+ segments: list[dict] = []
144
+ parts: list[str] = []
145
+ for slide_index, slide in enumerate(presentation.slides, start=1):
146
+ slide_parts: list[str] = []
147
+ for shape in slide.shapes:
148
+ text = getattr(shape, "text", "").strip()
149
+ if text:
150
+ slide_parts.append(text)
151
+ slide_text = "\n".join(slide_parts)
152
+ segments.append({"slide": slide_index, "text": slide_text})
153
+ if slide_text:
154
+ parts.append(f"# Slide {slide_index}\n{slide_text}")
155
+
156
+ return ExtractionResult(
157
+ kind=identification.kind,
158
+ mime_type=identification.mime_type,
159
+ text="\n\n".join(parts),
160
+ segments=segments,
161
+ metadata={"slide_count": len(presentation.slides)},
162
+ )
163
+
164
+
132
165
  def _extract_csv(path: Path, identification: IdentificationResult) -> ExtractionResult:
133
166
  raw = path.read_text(encoding="utf-8", errors="replace")
134
167
  sample = raw[:2048]
@@ -11,6 +11,7 @@ class DocumentKind(str, Enum):
11
11
  PDF = "pdf"
12
12
  DOCX = "docx"
13
13
  XLSX = "xlsx"
14
+ PPTX = "pptx"
14
15
  CSV = "csv"
15
16
  PLAIN_TEXT = "plain_text"
16
17
  JSON = "json"
@@ -31,6 +31,14 @@ _PROFILES: tuple[DocumentProfile, ...] = (
31
31
  supports_pdf_pipeline=False,
32
32
  supports_text_extraction=True,
33
33
  ),
34
+ DocumentProfile(
35
+ kind=DocumentKind.PPTX,
36
+ mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
37
+ extensions=(".pptx",),
38
+ label="PowerPoint presentation",
39
+ supports_pdf_pipeline=False,
40
+ supports_text_extraction=True,
41
+ ),
34
42
  DocumentProfile(
35
43
  kind=DocumentKind.CSV,
36
44
  mime_type="text/csv",
@@ -31,6 +31,8 @@ def _sniff_zip_kind(path: Path) -> DocumentKind | None:
31
31
 
32
32
  if any(name.startswith("word/") for name in names):
33
33
  return DocumentKind.DOCX
34
+ if any(name.startswith("ppt/") for name in names):
35
+ return DocumentKind.PPTX
34
36
  if any(name.startswith("xl/") for name in names):
35
37
  return DocumentKind.XLSX
36
38
  return None
@@ -80,7 +82,7 @@ def _looks_like_csv(sample: str) -> bool:
80
82
 
81
83
 
82
84
  def _requires_content_confirmation(kind: DocumentKind) -> bool:
83
- return kind in {DocumentKind.PDF, DocumentKind.DOCX, DocumentKind.XLSX}
85
+ return kind in {DocumentKind.PDF, DocumentKind.DOCX, DocumentKind.XLSX, DocumentKind.PPTX}
84
86
 
85
87
 
86
88
  def _build_result(
@@ -416,3 +416,45 @@ class DocintelClient:
416
416
  data=data,
417
417
  poll=poll,
418
418
  )
419
+
420
+ def ingest_document_from_s3(
421
+ self,
422
+ *,
423
+ s3_uri: str | None = None,
424
+ bucket: str | None = None,
425
+ key: str | None = None,
426
+ sentences: int = 3,
427
+ include_summarize: bool = True,
428
+ include_pii: bool = True,
429
+ include_text: bool = False,
430
+ entities: str | None = None,
431
+ vertical: str | None = None,
432
+ min_score: float = 0.35,
433
+ callback_url: str | None = None,
434
+ poll: bool = True,
435
+ ) -> dict[str, Any]:
436
+ body: dict[str, Any] = {
437
+ "operation": "process",
438
+ "sentences": sentences,
439
+ "include_summarize": include_summarize,
440
+ "include_pii": include_pii,
441
+ "include_text": include_text,
442
+ "min_score": min_score,
443
+ }
444
+ if s3_uri:
445
+ body["s3_uri"] = s3_uri
446
+ if bucket:
447
+ body["bucket"] = bucket
448
+ if key:
449
+ body["key"] = key
450
+ if entities:
451
+ body["entities"] = entities
452
+ if vertical:
453
+ body["vertical"] = vertical
454
+ if callback_url:
455
+ body["callback_url"] = callback_url
456
+ return self._post_async_json(
457
+ "/v1/documents/ingest",
458
+ json_body=body,
459
+ poll=poll,
460
+ )
@@ -11,6 +11,7 @@ class Config:
11
11
  LOG_LEVEL = os.getenv("DOCINTEL_LOG_LEVEL", "INFO")
12
12
  REDIS_URL = os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0")
13
13
  JOBS_ENABLED = os.getenv("DOCINTEL_JOBS_ENABLED", "true").lower() == "true"
14
+ JOB_TTL_SECONDS = int(os.getenv("DOCINTEL_JOB_TTL_SECONDS", str(60 * 60 * 24 * 7)))
14
15
  QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")
15
16
  API_KEYS = os.getenv("DOCINTEL_API_KEYS", "")
16
17
  AUTH_REQUIRED = os.getenv("DOCINTEL_AUTH_REQUIRED", "false").lower() == "true"
@@ -35,6 +35,7 @@ class JobType(str, Enum):
35
35
  DOCUMENT_DETECT_PII = "document_detect_pii"
36
36
  DOCUMENT_EXTRACT_TEXT = "document_extract_text"
37
37
  DOCUMENT_COMPARE = "document_compare"
38
+ DOCUMENT_S3_PROCESS = "document_s3_process"
38
39
  BATCH = "batch"
39
40
 
40
41
 
@@ -270,6 +270,25 @@ def enqueue_extract_text_job(
270
270
  )
271
271
 
272
272
 
273
+ def enqueue_s3_document_process_job(
274
+ job_id: str,
275
+ bucket: str,
276
+ key: str,
277
+ options: dict,
278
+ ) -> None:
279
+ queue = get_queue()
280
+ queue.enqueue(
281
+ "docintel.jobs.tasks.run_s3_document_process_job",
282
+ job_id=job_id,
283
+ bucket=bucket,
284
+ key=key,
285
+ options=options,
286
+ job_timeout=900,
287
+ result_ttl=DEFAULT_RESULT_TTL,
288
+ failure_ttl=DEFAULT_FAILURE_TTL,
289
+ )
290
+
291
+
273
292
  def enqueue_compare_job(
274
293
  job_id: str,
275
294
  *,
@@ -12,6 +12,15 @@ JOB_KEY_PREFIX = "docintel:job:"
12
12
  DEFAULT_JOB_TTL_SECONDS = 60 * 60 * 24 * 7
13
13
 
14
14
 
15
+ def job_ttl_seconds() -> int:
16
+ raw = os.getenv("DOCINTEL_JOB_TTL_SECONDS", str(DEFAULT_JOB_TTL_SECONDS)).strip()
17
+ try:
18
+ ttl = int(raw)
19
+ except ValueError:
20
+ return DEFAULT_JOB_TTL_SECONDS
21
+ return max(ttl, 60)
22
+
23
+
15
24
  def redis_url() -> str:
16
25
  return os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0").strip()
17
26
 
@@ -37,9 +46,10 @@ def _job_key(job_id: str) -> str:
37
46
  return f"{JOB_KEY_PREFIX}{job_id}"
38
47
 
39
48
 
40
- def save_job(record: JobRecord, ttl_seconds: int = DEFAULT_JOB_TTL_SECONDS) -> None:
49
+ def save_job(record: JobRecord, ttl_seconds: int | None = None) -> None:
41
50
  client = _redis_client()
42
- client.set(_job_key(record.job_id), json.dumps(record.to_dict()), ex=ttl_seconds)
51
+ resolved_ttl = ttl_seconds if ttl_seconds is not None else job_ttl_seconds()
52
+ client.set(_job_key(record.job_id), json.dumps(record.to_dict()), ex=resolved_ttl)
43
53
 
44
54
 
45
55
  def get_job(job_id: str) -> JobRecord | None:
@@ -645,6 +645,45 @@ def run_compare_job(
645
645
  )
646
646
 
647
647
 
648
+ def run_s3_document_process_job(
649
+ *,
650
+ job_id: str,
651
+ bucket: str,
652
+ key: str,
653
+ options: dict,
654
+ ) -> dict:
655
+ from docintel.storage.s3_ingest import download_s3_object_to_job_dir
656
+
657
+ record = get_job(job_id)
658
+ callback_url = record.callback_url if record else None
659
+ update_job(
660
+ job_id,
661
+ job_status=JobStatus.RUNNING.value,
662
+ progress=5,
663
+ progress_message="Downloading from S3",
664
+ )
665
+ try:
666
+ input_path, filename = download_s3_object_to_job_dir(job_id, bucket, key)
667
+ except Exception as exc:
668
+ failed = update_job(
669
+ job_id,
670
+ job_status=JobStatus.FAILED.value,
671
+ progress=100,
672
+ progress_message="Job failed",
673
+ error=str(exc),
674
+ )
675
+ _notify_webhook(callback_url, failed)
676
+ raise
677
+
678
+ return run_document_process_job(
679
+ job_id=job_id,
680
+ input_path=str(input_path),
681
+ filename=filename,
682
+ content_type=None,
683
+ options=options,
684
+ )
685
+
686
+
648
687
  def create_queued_job(
649
688
  job_id: str,
650
689
  *,
@@ -541,6 +541,66 @@ paths:
541
541
  "503":
542
542
  description: Presidio stack unavailable
543
543
 
544
+ /v1/documents/ingest:
545
+ post:
546
+ tags: [documents]
547
+ summary: Queue process pipeline for an S3 object
548
+ description: |
549
+ Downloads the object in the worker, then runs the same pipeline as
550
+ POST /v1/documents/process. Always returns 202 when Redis is available.
551
+ requestBody:
552
+ required: true
553
+ content:
554
+ application/json:
555
+ schema:
556
+ type: object
557
+ required: [operation]
558
+ properties:
559
+ s3_uri:
560
+ type: string
561
+ example: s3://my-bucket/inbox/policy.docx
562
+ bucket:
563
+ type: string
564
+ key:
565
+ type: string
566
+ operation:
567
+ type: string
568
+ enum: [process]
569
+ default: process
570
+ sentences:
571
+ type: integer
572
+ minimum: 1
573
+ maximum: 20
574
+ include_summarize:
575
+ type: boolean
576
+ default: true
577
+ include_pii:
578
+ type: boolean
579
+ default: true
580
+ include_text:
581
+ type: boolean
582
+ default: false
583
+ vertical:
584
+ type: string
585
+ entities:
586
+ type: string
587
+ min_score:
588
+ type: number
589
+ callback_url:
590
+ type: string
591
+ format: uri
592
+ responses:
593
+ "202":
594
+ description: S3 ingest job queued
595
+ content:
596
+ application/json:
597
+ schema:
598
+ $ref: "#/components/schemas/AsyncAccepted"
599
+ "400":
600
+ description: Invalid S3 location or options
601
+ "503":
602
+ description: Async jobs unavailable
603
+
544
604
  /v1/documents/process:
545
605
  post:
546
606
  tags: [documents]
@@ -182,6 +182,60 @@ def _parse_process_options() -> tuple[ProcessOptions | None, dict | None, int |
182
182
  )
183
183
 
184
184
 
185
+ def _parse_process_options_from_dict(
186
+ payload: dict,
187
+ ) -> tuple[ProcessOptions | None, dict | None, int | None]:
188
+ raw_sentences = payload.get("sentences", DEFAULT_SENTENCE_COUNT)
189
+ try:
190
+ sentences = int(raw_sentences)
191
+ except (TypeError, ValueError):
192
+ return None, {"error": "Field 'sentences' must be an integer."}, 400
193
+ if sentences < 1 or sentences > MAX_SENTENCE_COUNT:
194
+ return None, {
195
+ "error": f"Field 'sentences' must be between 1 and {MAX_SENTENCE_COUNT}."
196
+ }, 400
197
+
198
+ vertical = payload.get("vertical", "")
199
+ vertical = vertical.strip() if isinstance(vertical, str) else ""
200
+ entities_raw = payload.get("entities")
201
+ try:
202
+ entities = _resolve_entities(
203
+ entities_raw if isinstance(entities_raw, str) else None,
204
+ vertical or None,
205
+ )
206
+ except ValueError as exc:
207
+ return None, {"error": str(exc)}, 400
208
+
209
+ raw_min_score = payload.get("min_score", 0.35)
210
+ try:
211
+ min_score = float(raw_min_score)
212
+ except (TypeError, ValueError):
213
+ return None, {"error": "Field 'min_score' must be a number."}, 400
214
+
215
+ def _bool_value(name: str, default: bool) -> bool:
216
+ if name not in payload:
217
+ return default
218
+ value = payload[name]
219
+ if isinstance(value, bool):
220
+ return value
221
+ if isinstance(value, str):
222
+ return value.strip().lower() in {"1", "true", "yes", "on"}
223
+ return default
224
+
225
+ return (
226
+ ProcessOptions(
227
+ sentences=sentences,
228
+ include_summarize=_bool_value("include_summarize", True),
229
+ include_pii=_bool_value("include_pii", True),
230
+ include_text=_bool_value("include_text", False),
231
+ entities=entities,
232
+ min_score=min_score,
233
+ ),
234
+ None,
235
+ None,
236
+ )
237
+
238
+
185
239
  @documents_bp.get("/types")
186
240
  @limiter.limit("120 per hour")
187
241
  def supported_document_types():
@@ -189,6 +243,47 @@ def supported_document_types():
189
243
  return jsonify({"status": "ok", "types": list_supported_types()})
190
244
 
191
245
 
246
@documents_bp.post("/ingest")
@limiter.limit("20 per hour")
def ingest_document():
    """Accept a JSON request to process an S3-hosted object as a background job.

    The body must be a JSON object. It identifies the object through the
    fields understood by ``resolve_s3_location`` and may carry processing
    options plus an optional ``callback_url``. Any validation failure is
    answered with a JSON ``{"error": ...}`` body; otherwise the function
    returns whatever ``enqueue_background_job`` returns.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    # Only one operation exists today; it is also the default when omitted.
    requested_operation = str(body.get("operation", "process")).strip().lower()
    if requested_operation != "process":
        return jsonify({"error": "Only operation 'process' is supported."}), 400

    from docintel.storage.s3_ingest import resolve_s3_location

    try:
        bucket, key = resolve_s3_location(body)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    options, option_error, option_status = _parse_process_options_from_dict(body)
    if option_error is not None:
        return jsonify(option_error), option_status

    # A missing, blank or non-string callback_url means "no callback".
    callback_url = None
    raw_callback = body.get("callback_url", "")
    if isinstance(raw_callback, str):
        callback_url = raw_callback.strip() or None

    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_s3_document_process_job

    # Job ids are the first 12 hex characters of a random UUID4.
    job_id = uuid.uuid4().hex[:12]
    options_payload = options.to_dict() if options else {}
    return enqueue_background_job(
        job_type=JobType.DOCUMENT_S3_PROCESS,
        callback_url=callback_url,
        enqueue_fn=enqueue_s3_document_process_job,
        job_id=job_id,
        bucket=bucket,
        key=key,
        options=options_payload,
    )
285
+
286
+
192
287
  @documents_bp.post("/identify")
193
288
  @limiter.limit("120 per hour")
194
289
  def identify_upload():
@@ -0,0 +1,61 @@
1
+ """Download objects from S3 for async document ingest."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+ from urllib.parse import unquote
9
+
10
+ from werkzeug.utils import secure_filename
11
+
12
_S3_URI_PATTERN = re.compile(r"^s3://([^/]+)/(.+)$")


def parse_s3_uri(uri: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into its ``(bucket, key)`` parts.

    Surrounding whitespace is ignored and percent-escapes in the key are
    decoded.

    Raises:
        ValueError: If the text is not an ``s3://`` URI with both a bucket
            and an object key.
    """
    parsed = _S3_URI_PATTERN.match(uri.strip())
    if parsed is None:
        raise ValueError("s3_uri must look like s3://bucket/path/to/object")

    raw_bucket, raw_key = parsed.groups()
    bucket = raw_bucket.strip()
    key = unquote(raw_key.strip())
    # The pattern allows whitespace-only segments; reject them once trimmed.
    if not (bucket and key):
        raise ValueError("s3_uri must include a bucket name and object key")
    return bucket, key
26
+
27
+
28
def resolve_s3_location(payload: dict) -> tuple[str, str]:
    """Work out which S3 object a JSON request body refers to.

    A non-blank string ``s3_uri`` takes precedence and is handed to
    ``parse_s3_uri``. Otherwise both ``bucket`` and ``key`` must be non-blank
    strings; they are returned with surrounding whitespace removed.

    Raises:
        ValueError: If neither form is supplied (or ``parse_s3_uri`` rejects
            the URI).
    """
    uri = payload.get("s3_uri")
    if isinstance(uri, str) and uri.strip():
        return parse_s3_uri(uri)

    location = []
    for field in ("bucket", "key"):
        candidate = payload.get(field)
        trimmed = candidate.strip() if isinstance(candidate, str) else ""
        if not trimmed:
            raise ValueError("Provide s3_uri or both bucket and key.")
        location.append(trimmed)
    return location[0], location[1]
40
+
41
+
42
def s3_client():
    """Create a boto3 S3 client configured from environment variables.

    ``DOCINTEL_S3_REGION`` selects the region (default ``us-east-1``) and
    ``DOCINTEL_S3_ENDPOINT_URL`` optionally overrides the endpoint; an unset
    or empty value is passed as ``None``.
    """
    # Imported lazily so boto3 is only needed when this function is called.
    import boto3

    region = os.getenv("DOCINTEL_S3_REGION", "us-east-1")
    endpoint = os.getenv("DOCINTEL_S3_ENDPOINT_URL", "") or None
    return boto3.client("s3", region_name=region, endpoint_url=endpoint)
50
+
51
+
52
def download_s3_object_to_job_dir(job_id: str, bucket: str, key: str) -> tuple[Path, str]:
    """Fetch ``bucket``/``key`` from S3 into the working directory of ``job_id``.

    The local file name is the sanitised last path component of the key,
    falling back to ``document.bin`` when sanitising leaves nothing.

    Returns:
        A ``(local_path, filename)`` pair for the downloaded file.
    """
    from docintel.storage import get_storage

    basename = Path(key).name
    filename = secure_filename(basename) or "document.bin"

    job_directory = get_storage().job_dir(job_id)
    job_directory.mkdir(parents=True, exist_ok=True)

    destination = job_directory / filename
    client = s3_client()
    client.download_file(bucket, key, str(destination))
    return destination, filename
@@ -516,12 +516,12 @@ def build_ui():
516
516
  outputs=summary_output,
517
517
  )
518
518
 
519
- office_types = [".pdf", ".docx", ".xlsx", ".csv", ".txt", ".md", ".json"]
519
+ office_types = [".pdf", ".docx", ".xlsx", ".pptx", ".csv", ".txt", ".md", ".json"]
520
520
  with gr.Tab("Document process"):
521
521
  gr.Markdown(
522
522
  "Run extract, classify, summarize, and PII detection in one async job. "
523
523
  "Requires Redis and a worker (`make run-worker` or docker-compose worker). "
524
- "Word and Excel need `pip install -e '.[documents]'` on the API server."
524
+ "Office formats need `pip install -e '.[documents]'` on the API server (Word, Excel, PowerPoint)."
525
525
  )
526
526
  from docintel.capabilities.compliance.presets import list_vertical_presets
527
527
 
@@ -563,7 +563,7 @@ def build_ui():
563
563
  with gr.Tab("Document tools"):
564
564
  gr.Markdown(
565
565
  "Identify, extract, classify, summarize, scan for PII, and compare office documents. "
566
- "Requires `pip install -e '.[documents]'` for Word and Excel."
566
+ "Requires `pip install -e '.[documents]'` for Word, Excel, and PowerPoint."
567
567
  )
568
568
  with gr.Row():
569
569
  doc_file = gr.File(label="Document upload", file_types=office_types)