docintel-platform 1.2.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docintel_platform-1.2.0/src/docintel_platform.egg-info → docintel_platform-1.4.0}/PKG-INFO +37 -15
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/README.md +27 -7
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/pyproject.toml +13 -8
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/__init__.py +1 -1
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py +12 -0
- docintel_platform-1.4.0/src/docintel/capabilities/compliance/integrity.py +483 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/sensitive.py +1 -1
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/extract.py +33 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/models.py +1 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/registry.py +8 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/sniff.py +3 -1
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/process.py +2 -2
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/client.py +75 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/config.py +1 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/models.py +3 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/queue.py +104 -78
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/store.py +12 -2
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/tasks.py +94 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/openapi/openapi.yaml +167 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/documents.py +169 -0
- docintel_platform-1.4.0/src/docintel/services/integrity.py +17 -0
- docintel_platform-1.4.0/src/docintel/storage/s3_ingest.py +61 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ui.py +116 -3
- {docintel_platform-1.2.0 → docintel_platform-1.4.0/src/docintel_platform.egg-info}/PKG-INFO +37 -15
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/SOURCES.txt +8 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/requires.txt +9 -6
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_document_formats.py +28 -1
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_async_routes.py +1 -0
- docintel_platform-1.4.0/tests/test_documents_integrity.py +55 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_health.py +1 -1
- docintel_platform-1.4.0/tests/test_integrity_analysis.py +61 -0
- docintel_platform-1.4.0/tests/test_job_ttl.py +40 -0
- docintel_platform-1.4.0/tests/test_s3_ingest.py +101 -0
- docintel_platform-1.4.0/tests/test_ui_integrity.py +79 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/LICENSE +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/MANIFEST.in +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/setup.cfg +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/app.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/api_keys.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/limiter.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/middleware.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/oidc.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/search.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/classify.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/compare.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/cli.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/helpers.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/webhooks.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/openapi/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/logging.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/metrics.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/middleware.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/prometheus.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/async_enqueue.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/batch.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/document_upload.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/jobs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/openapi_docs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/ops.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/pdf.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/text.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/annotator.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/ocr.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/search.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/sensitive.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_llm.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_render.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_schema.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/models.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/textrank.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/__init__.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/local.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/s3.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/wsgi.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_annotate_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_auth.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_batch.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_client.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_detect_sensitive_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_classify.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_compare.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_compare_files.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_detect_pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_process.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_process_async.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_summarize.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_jobs.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_oidc.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_openapi.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ops.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_routes.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_sensitive.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_service.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_structure.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pii_mask.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_storage.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_structure_pii.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_summary_routes.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_summary_service.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ui.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ui_process.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_vertical_presets.py +0 -0
- {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_webhooks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docintel-platform
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
|
|
5
5
|
Author: Babandeep Singh
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,17 +40,19 @@ Requires-Dist: fakeredis>=2.26.2; extra == "dev"
|
|
|
40
40
|
Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
|
|
41
41
|
Requires-Dist: python-docx>=1.1.2; extra == "dev"
|
|
42
42
|
Requires-Dist: openpyxl>=3.1.5; extra == "dev"
|
|
43
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "dev"
|
|
43
44
|
Provides-Extra: documents
|
|
44
45
|
Requires-Dist: python-docx>=1.1.2; extra == "documents"
|
|
45
46
|
Requires-Dist: openpyxl>=3.1.5; extra == "documents"
|
|
47
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "documents"
|
|
48
|
+
Provides-Extra: pii
|
|
49
|
+
Requires-Dist: presidio-analyzer>=2.2.354; extra == "pii"
|
|
50
|
+
Requires-Dist: spacy>=3.7.0; extra == "pii"
|
|
46
51
|
Provides-Extra: ocr
|
|
47
52
|
Requires-Dist: easyocr>=1.7.2; extra == "ocr"
|
|
48
|
-
Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
|
|
49
|
-
Requires-Dist: spacy>=3.7.0; extra == "ocr"
|
|
50
53
|
Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
|
|
51
|
-
Requires-Dist: torch>=2.4.1; extra == "ocr"
|
|
52
54
|
Provides-Extra: ui
|
|
53
|
-
Requires-Dist: gradio
|
|
55
|
+
Requires-Dist: gradio<6,>=5.7.1; extra == "ui"
|
|
54
56
|
Requires-Dist: requests>=2.32.3; extra == "ui"
|
|
55
57
|
Provides-Extra: llm
|
|
56
58
|
Requires-Dist: openai>=1.54.0; extra == "llm"
|
|
@@ -65,10 +67,9 @@ Requires-Dist: PyJWT>=2.9.0; extra == "auth"
|
|
|
65
67
|
Requires-Dist: cryptography>=43.0.0; extra == "auth"
|
|
66
68
|
Provides-Extra: all
|
|
67
69
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
|
70
|
+
Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
|
|
68
71
|
Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
|
|
69
72
|
Requires-Dist: spacy>=3.7.0; extra == "all"
|
|
70
|
-
Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
|
|
71
|
-
Requires-Dist: torch>=2.4.1; extra == "all"
|
|
72
73
|
Requires-Dist: openai>=1.54.0; extra == "all"
|
|
73
74
|
Requires-Dist: redis>=5.0.8; extra == "all"
|
|
74
75
|
Requires-Dist: rq>=1.16.2; extra == "all"
|
|
@@ -77,9 +78,10 @@ Requires-Dist: prometheus-client>=0.21.0; extra == "all"
|
|
|
77
78
|
Requires-Dist: flask-limiter>=3.8.0; extra == "all"
|
|
78
79
|
Requires-Dist: PyJWT>=2.9.0; extra == "all"
|
|
79
80
|
Requires-Dist: cryptography>=43.0.0; extra == "all"
|
|
80
|
-
Requires-Dist: gradio
|
|
81
|
+
Requires-Dist: gradio<6,>=5.7.1; extra == "all"
|
|
81
82
|
Requires-Dist: python-docx>=1.1.2; extra == "all"
|
|
82
83
|
Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
84
|
+
Requires-Dist: python-pptx>=1.0.2; extra == "all"
|
|
83
85
|
|
|
84
86
|
# Document Intelligence Platform
|
|
85
87
|
|
|
@@ -90,21 +92,31 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
|
90
92
|
|
|
91
93
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
92
94
|
|
|
93
|
-
**Version:** 1.2.0
|
|
95
|
+
**Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
94
96
|
|
|
95
97
|
---
|
|
96
98
|
|
|
97
99
|
## Quick start
|
|
98
100
|
|
|
99
|
-
**Docker (
|
|
101
|
+
**Docker (slim core, optional UI and OCR):**
|
|
100
102
|
|
|
101
103
|
```bash
|
|
102
104
|
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
103
105
|
cd document-intelligence-platform
|
|
104
106
|
cp .env.example .env # optional: ports, LLM key, auth
|
|
105
|
-
make docker-up
|
|
107
|
+
make docker-up # redis + API + worker (~2 min build, no PyTorch)
|
|
108
|
+
make docker-up-ui # add Gradio when API is healthy
|
|
106
109
|
```
|
|
107
110
|
|
|
111
|
+
| Command | What starts |
|
|
112
|
+
|---------|-------------|
|
|
113
|
+
| `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
|
|
114
|
+
| `make docker-up-ui` | Gradio UI (`--profile ui`) |
|
|
115
|
+
| `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
|
|
116
|
+
| `make docker-up-full` | OCR stack + UI |
|
|
117
|
+
|
|
118
|
+
Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
|
|
119
|
+
|
|
108
120
|
| Service | URL |
|
|
109
121
|
|---------|-----|
|
|
110
122
|
| API | http://127.0.0.1:5000 |
|
|
@@ -112,14 +124,14 @@ make docker-up
|
|
|
112
124
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
113
125
|
| Health | http://127.0.0.1:5000/health |
|
|
114
126
|
|
|
115
|
-
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
127
|
+
Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
116
128
|
|
|
117
129
|
**pip install:**
|
|
118
130
|
|
|
119
131
|
```bash
|
|
120
132
|
pip install docintel-platform
|
|
121
133
|
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
122
|
-
pip install "docintel-platform[documents]" # Word and
|
|
134
|
+
pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
|
|
123
135
|
```
|
|
124
136
|
|
|
125
137
|
**Python client:**
|
|
@@ -141,13 +153,13 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
141
153
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
142
154
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
143
155
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
144
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
156
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
|
|
145
157
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
146
158
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
147
159
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
148
160
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
149
161
|
|
|
150
|
-
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
162
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
|
|
151
163
|
|
|
152
164
|
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
153
165
|
|
|
@@ -169,6 +181,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
|
169
181
|
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
170
182
|
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
171
183
|
-F "file=@policy.docx"
|
|
184
|
+
|
|
185
|
+
# Document integrity analysis (placeholders, broken refs, drift, number mismatch)
|
|
186
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
|
|
187
|
+
-H "Content-Type: application/json" \
|
|
188
|
+
-d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
|
|
172
189
|
```
|
|
173
190
|
|
|
174
191
|
---
|
|
@@ -177,9 +194,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
|
177
194
|
|
|
178
195
|
```bash
|
|
179
196
|
make setup # venv + dev deps
|
|
197
|
+
make setup-hooks # block Cursor agent co-author trailers on commit
|
|
180
198
|
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
181
199
|
make setup-llm # OpenAI client (structure endpoint)
|
|
182
200
|
make setup-ui # Gradio
|
|
201
|
+
make run-redis # Redis for async jobs (Docker, port 6379)
|
|
183
202
|
make run # API on :5000
|
|
184
203
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
185
204
|
make run-ui # Gradio on :7860
|
|
@@ -187,6 +206,8 @@ make test
|
|
|
187
206
|
make eval # offline quality report (summary, classify, process, PII)
|
|
188
207
|
```
|
|
189
208
|
|
|
209
|
+
Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
|
|
210
|
+
|
|
190
211
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
191
212
|
|
|
192
213
|
---
|
|
@@ -199,6 +220,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
|
|
|
199
220
|
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
200
221
|
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
201
222
|
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
223
|
+
| [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
|
|
202
224
|
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
203
225
|
|
|
204
226
|
---
|
|
@@ -7,21 +7,31 @@
|
|
|
7
7
|
|
|
8
8
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
9
9
|
|
|
10
|
-
**Version:** 1.2.0
|
|
10
|
+
**Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
14
|
## Quick start
|
|
15
15
|
|
|
16
|
-
**Docker (
|
|
16
|
+
**Docker (slim core, optional UI and OCR):**
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
20
20
|
cd document-intelligence-platform
|
|
21
21
|
cp .env.example .env # optional: ports, LLM key, auth
|
|
22
|
-
make docker-up
|
|
22
|
+
make docker-up # redis + API + worker (~2 min build, no PyTorch)
|
|
23
|
+
make docker-up-ui # add Gradio when API is healthy
|
|
23
24
|
```
|
|
24
25
|
|
|
26
|
+
| Command | What starts |
|
|
27
|
+
|---------|-------------|
|
|
28
|
+
| `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
|
|
29
|
+
| `make docker-up-ui` | Gradio UI (`--profile ui`) |
|
|
30
|
+
| `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
|
|
31
|
+
| `make docker-up-full` | OCR stack + UI |
|
|
32
|
+
|
|
33
|
+
Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
|
|
34
|
+
|
|
25
35
|
| Service | URL |
|
|
26
36
|
|---------|-----|
|
|
27
37
|
| API | http://127.0.0.1:5000 |
|
|
@@ -29,14 +39,14 @@ make docker-up
|
|
|
29
39
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
30
40
|
| Health | http://127.0.0.1:5000/health |
|
|
31
41
|
|
|
32
|
-
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
42
|
+
Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
33
43
|
|
|
34
44
|
**pip install:**
|
|
35
45
|
|
|
36
46
|
```bash
|
|
37
47
|
pip install docintel-platform
|
|
38
48
|
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
39
|
-
pip install "docintel-platform[documents]" # Word and
|
|
49
|
+
pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
|
|
40
50
|
```
|
|
41
51
|
|
|
42
52
|
**Python client:**
|
|
@@ -58,13 +68,13 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
58
68
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
59
69
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
60
70
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
61
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
71
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
|
|
62
72
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
63
73
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
64
74
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
65
75
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
66
76
|
|
|
67
|
-
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
77
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
|
|
68
78
|
|
|
69
79
|
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
70
80
|
|
|
@@ -86,6 +96,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
|
86
96
|
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
87
97
|
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
88
98
|
-F "file=@policy.docx"
|
|
99
|
+
|
|
100
|
+
# Document integrity analysis (placeholders, broken refs, drift, number mismatch)
|
|
101
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
|
|
102
|
+
-H "Content-Type: application/json" \
|
|
103
|
+
-d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
|
|
89
104
|
```
|
|
90
105
|
|
|
91
106
|
---
|
|
@@ -94,9 +109,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
|
94
109
|
|
|
95
110
|
```bash
|
|
96
111
|
make setup # venv + dev deps
|
|
112
|
+
make setup-hooks # block Cursor agent co-author trailers on commit
|
|
97
113
|
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
98
114
|
make setup-llm # OpenAI client (structure endpoint)
|
|
99
115
|
make setup-ui # Gradio
|
|
116
|
+
make run-redis # Redis for async jobs (Docker, port 6379)
|
|
100
117
|
make run # API on :5000
|
|
101
118
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
102
119
|
make run-ui # Gradio on :7860
|
|
@@ -104,6 +121,8 @@ make test
|
|
|
104
121
|
make eval # offline quality report (summary, classify, process, PII)
|
|
105
122
|
```
|
|
106
123
|
|
|
124
|
+
Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
|
|
125
|
+
|
|
107
126
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
108
127
|
|
|
109
128
|
---
|
|
@@ -116,6 +135,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
|
|
|
116
135
|
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
117
136
|
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
118
137
|
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
138
|
+
| [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
|
|
119
139
|
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
120
140
|
|
|
121
141
|
---
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docintel-platform"
|
|
7
|
-
version = "1.2.0"
|
|
7
|
+
version = "1.4.0"
|
|
8
8
|
description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -50,20 +50,25 @@ dev = [
|
|
|
50
50
|
"prometheus-client>=0.21.0",
|
|
51
51
|
"python-docx>=1.1.2",
|
|
52
52
|
"openpyxl>=3.1.5",
|
|
53
|
+
"python-pptx>=1.0.2",
|
|
53
54
|
]
|
|
54
55
|
documents = [
|
|
55
56
|
"python-docx>=1.1.2",
|
|
56
57
|
"openpyxl>=3.1.5",
|
|
58
|
+
"python-pptx>=1.0.2",
|
|
57
59
|
]
|
|
58
|
-
|
|
59
|
-
"easyocr>=1.7.2",
|
|
60
|
+
pii = [
|
|
60
61
|
"presidio-analyzer>=2.2.354",
|
|
61
62
|
"spacy>=3.7.0",
|
|
63
|
+
]
|
|
64
|
+
# Scanned PDF OCR (EasyOCR). Install CPU torch before this extra in Docker:
|
|
65
|
+
# pip install torch --index-url https://download.pytorch.org/whl/cpu
|
|
66
|
+
ocr = [
|
|
67
|
+
"easyocr>=1.7.2",
|
|
62
68
|
"opencv-python-headless>=4.10.0",
|
|
63
|
-
"torch>=2.4.1",
|
|
64
69
|
]
|
|
65
70
|
ui = [
|
|
66
|
-
"gradio>=
|
|
71
|
+
"gradio>=5.7.1,<6",
|
|
67
72
|
"requests>=2.32.3",
|
|
68
73
|
]
|
|
69
74
|
llm = [
|
|
@@ -83,10 +88,9 @@ auth = [
|
|
|
83
88
|
]
|
|
84
89
|
all = [
|
|
85
90
|
"easyocr>=1.7.2",
|
|
91
|
+
"opencv-python-headless>=4.10.0",
|
|
86
92
|
"presidio-analyzer>=2.2.354",
|
|
87
93
|
"spacy>=3.7.0",
|
|
88
|
-
"opencv-python-headless>=4.10.0",
|
|
89
|
-
"torch>=2.4.1",
|
|
90
94
|
"openai>=1.54.0",
|
|
91
95
|
"redis>=5.0.8",
|
|
92
96
|
"rq>=1.16.2",
|
|
@@ -95,9 +99,10 @@ all = [
|
|
|
95
99
|
"flask-limiter>=3.8.0",
|
|
96
100
|
"PyJWT>=2.9.0",
|
|
97
101
|
"cryptography>=43.0.0",
|
|
98
|
-
"gradio>=
|
|
102
|
+
"gradio>=5.7.1,<6",
|
|
99
103
|
"python-docx>=1.1.2",
|
|
100
104
|
"openpyxl>=3.1.5",
|
|
105
|
+
"python-pptx>=1.0.2",
|
|
101
106
|
]
|
|
102
107
|
|
|
103
108
|
[project.scripts]
|
{docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py
RENAMED
|
@@ -1,13 +1,25 @@
|
|
|
1
1
|
"""Compliance capabilities (PII detection, sensitive PDF scanning)."""
|
|
2
2
|
|
|
3
|
+
from docintel.capabilities.compliance.integrity import (
|
|
4
|
+
IntegrityEvidence,
|
|
5
|
+
IntegrityFinding,
|
|
6
|
+
IntegrityResult,
|
|
7
|
+
V1_CHECKS,
|
|
8
|
+
analyze_document_integrity,
|
|
9
|
+
)
|
|
3
10
|
from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
|
|
4
11
|
from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
|
|
5
12
|
|
|
6
13
|
__all__ = [
|
|
7
14
|
"DEFAULT_PII_ENTITIES",
|
|
15
|
+
"IntegrityEvidence",
|
|
16
|
+
"IntegrityFinding",
|
|
17
|
+
"IntegrityResult",
|
|
8
18
|
"MIN_NATIVE_TEXT_CHARS",
|
|
9
19
|
"OCR_RENDER_SCALE",
|
|
10
20
|
"PIIHit",
|
|
21
|
+
"V1_CHECKS",
|
|
22
|
+
"analyze_document_integrity",
|
|
11
23
|
"detect_pii_in_text",
|
|
12
24
|
"list_supported_entities",
|
|
13
25
|
"mask_pii_in_text",
|