docintel-platform 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docintel_platform-1.3.0/src/docintel_platform.egg-info → docintel_platform-1.4.0}/PKG-INFO +31 -13
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/README.md +24 -5
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/pyproject.toml +10 -8
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/__init__.py +1 -1
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py +12 -0
- docintel_platform-1.4.0/src/docintel/capabilities/compliance/integrity.py +483 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/sensitive.py +1 -1
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/process.py +2 -2
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/client.py +33 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/models.py +2 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/queue.py +91 -84
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/tasks.py +55 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/openapi/openapi.yaml +107 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/documents.py +74 -0
- docintel_platform-1.4.0/src/docintel/services/integrity.py +17 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ui.py +113 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0/src/docintel_platform.egg-info}/PKG-INFO +31 -13
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/SOURCES.txt +5 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/requires.txt +6 -6
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_async_routes.py +1 -0
- docintel_platform-1.4.0/tests/test_documents_integrity.py +55 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_health.py +1 -1
- docintel_platform-1.4.0/tests/test_integrity_analysis.py +61 -0
- docintel_platform-1.4.0/tests/test_ui_integrity.py +79 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/LICENSE +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/MANIFEST.in +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/setup.cfg +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/app.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/api_keys.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/limiter.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/middleware.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/oidc.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/pii.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/presets.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/extract.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/models.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/registry.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/sniff.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/models.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/search.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/classify.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/compare.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/models.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/cli.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/config.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/helpers.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/store.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/webhooks.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/openapi/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/logging.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/metrics.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/middleware.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/prometheus.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/async_enqueue.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/batch.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/document_upload.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/jobs.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/openapi_docs.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/ops.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/pdf.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/text.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/annotator.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/models.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/ocr.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/pii.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/presets.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/search.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/sensitive.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_llm.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_render.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_schema.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/models.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/textrank.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/__init__.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/local.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/s3.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/s3_ingest.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/wsgi.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_annotate_async.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_auth.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_batch.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_client.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_detect_sensitive_async.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_document_formats.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_classify.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_compare.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_compare_files.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_detect_pii.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_process.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_process_async.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_summarize.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_job_ttl.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_jobs.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_oidc.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_openapi.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ops.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_routes.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_sensitive.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_service.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_structure.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pii_mask.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_s3_ingest.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_storage.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_structure_pii.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_summary_routes.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_summary_service.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ui.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ui_process.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_vertical_presets.py +0 -0
- {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_webhooks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docintel-platform
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
|
|
5
5
|
Author: Babandeep Singh
|
|
6
6
|
License-Expression: MIT
|
|
@@ -45,14 +45,14 @@ Provides-Extra: documents
|
|
|
45
45
|
Requires-Dist: python-docx>=1.1.2; extra == "documents"
|
|
46
46
|
Requires-Dist: openpyxl>=3.1.5; extra == "documents"
|
|
47
47
|
Requires-Dist: python-pptx>=1.0.2; extra == "documents"
|
|
48
|
+
Provides-Extra: pii
|
|
49
|
+
Requires-Dist: presidio-analyzer>=2.2.354; extra == "pii"
|
|
50
|
+
Requires-Dist: spacy>=3.7.0; extra == "pii"
|
|
48
51
|
Provides-Extra: ocr
|
|
49
52
|
Requires-Dist: easyocr>=1.7.2; extra == "ocr"
|
|
50
|
-
Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
|
|
51
|
-
Requires-Dist: spacy>=3.7.0; extra == "ocr"
|
|
52
53
|
Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
|
|
53
|
-
Requires-Dist: torch>=2.4.1; extra == "ocr"
|
|
54
54
|
Provides-Extra: ui
|
|
55
|
-
Requires-Dist: gradio
|
|
55
|
+
Requires-Dist: gradio<6,>=5.7.1; extra == "ui"
|
|
56
56
|
Requires-Dist: requests>=2.32.3; extra == "ui"
|
|
57
57
|
Provides-Extra: llm
|
|
58
58
|
Requires-Dist: openai>=1.54.0; extra == "llm"
|
|
@@ -67,10 +67,9 @@ Requires-Dist: PyJWT>=2.9.0; extra == "auth"
|
|
|
67
67
|
Requires-Dist: cryptography>=43.0.0; extra == "auth"
|
|
68
68
|
Provides-Extra: all
|
|
69
69
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
|
70
|
+
Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
|
|
70
71
|
Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
|
|
71
72
|
Requires-Dist: spacy>=3.7.0; extra == "all"
|
|
72
|
-
Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
|
|
73
|
-
Requires-Dist: torch>=2.4.1; extra == "all"
|
|
74
73
|
Requires-Dist: openai>=1.54.0; extra == "all"
|
|
75
74
|
Requires-Dist: redis>=5.0.8; extra == "all"
|
|
76
75
|
Requires-Dist: rq>=1.16.2; extra == "all"
|
|
@@ -79,7 +78,7 @@ Requires-Dist: prometheus-client>=0.21.0; extra == "all"
|
|
|
79
78
|
Requires-Dist: flask-limiter>=3.8.0; extra == "all"
|
|
80
79
|
Requires-Dist: PyJWT>=2.9.0; extra == "all"
|
|
81
80
|
Requires-Dist: cryptography>=43.0.0; extra == "all"
|
|
82
|
-
Requires-Dist: gradio
|
|
81
|
+
Requires-Dist: gradio<6,>=5.7.1; extra == "all"
|
|
83
82
|
Requires-Dist: python-docx>=1.1.2; extra == "all"
|
|
84
83
|
Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
85
84
|
Requires-Dist: python-pptx>=1.0.2; extra == "all"
|
|
@@ -93,21 +92,31 @@ Requires-Dist: python-pptx>=1.0.2; extra == "all"
|
|
|
93
92
|
|
|
94
93
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
95
94
|
|
|
96
|
-
**Version:** 1.3.0
|
|
95
|
+
**Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
97
96
|
|
|
98
97
|
---
|
|
99
98
|
|
|
100
99
|
## Quick start
|
|
101
100
|
|
|
102
|
-
**Docker (
|
|
101
|
+
**Docker (slim core, optional UI and OCR):**
|
|
103
102
|
|
|
104
103
|
```bash
|
|
105
104
|
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
106
105
|
cd document-intelligence-platform
|
|
107
106
|
cp .env.example .env # optional: ports, LLM key, auth
|
|
108
|
-
make docker-up
|
|
107
|
+
make docker-up # redis + API + worker (~2 min build, no PyTorch)
|
|
108
|
+
make docker-up-ui # add Gradio when API is healthy
|
|
109
109
|
```
|
|
110
110
|
|
|
111
|
+
| Command | What starts |
|
|
112
|
+
|---------|-------------|
|
|
113
|
+
| `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
|
|
114
|
+
| `make docker-up-ui` | Gradio UI (`--profile ui`) |
|
|
115
|
+
| `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
|
|
116
|
+
| `make docker-up-full` | OCR stack + UI |
|
|
117
|
+
|
|
118
|
+
Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
|
|
119
|
+
|
|
111
120
|
| Service | URL |
|
|
112
121
|
|---------|-----|
|
|
113
122
|
| API | http://127.0.0.1:5000 |
|
|
@@ -115,7 +124,7 @@ make docker-up
|
|
|
115
124
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
116
125
|
| Health | http://127.0.0.1:5000/health |
|
|
117
126
|
|
|
118
|
-
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
127
|
+
Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
119
128
|
|
|
120
129
|
**pip install:**
|
|
121
130
|
|
|
@@ -144,7 +153,7 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
144
153
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
145
154
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
146
155
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
147
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
|
|
156
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
|
|
148
157
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
149
158
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
150
159
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
@@ -172,6 +181,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
|
172
181
|
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
173
182
|
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
174
183
|
-F "file=@policy.docx"
|
|
184
|
+
|
|
185
|
+
# Document integrity analysis (placeholders, broken refs, drift, number mismatch)
|
|
186
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
|
|
187
|
+
-H "Content-Type: application/json" \
|
|
188
|
+
-d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
|
|
175
189
|
```
|
|
176
190
|
|
|
177
191
|
---
|
|
@@ -180,9 +194,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
|
180
194
|
|
|
181
195
|
```bash
|
|
182
196
|
make setup # venv + dev deps
|
|
197
|
+
make setup-hooks # block Cursor agent co-author trailers on commit
|
|
183
198
|
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
184
199
|
make setup-llm # OpenAI client (structure endpoint)
|
|
185
200
|
make setup-ui # Gradio
|
|
201
|
+
make run-redis # Redis for async jobs (Docker, port 6379)
|
|
186
202
|
make run # API on :5000
|
|
187
203
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
188
204
|
make run-ui # Gradio on :7860
|
|
@@ -190,6 +206,8 @@ make test
|
|
|
190
206
|
make eval # offline quality report (summary, classify, process, PII)
|
|
191
207
|
```
|
|
192
208
|
|
|
209
|
+
Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
|
|
210
|
+
|
|
193
211
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
194
212
|
|
|
195
213
|
---
|
|
@@ -7,21 +7,31 @@
|
|
|
7
7
|
|
|
8
8
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
9
9
|
|
|
10
|
-
**Version:** 1.3.0
|
|
10
|
+
**Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
14
|
## Quick start
|
|
15
15
|
|
|
16
|
-
**Docker (
|
|
16
|
+
**Docker (slim core, optional UI and OCR):**
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
20
20
|
cd document-intelligence-platform
|
|
21
21
|
cp .env.example .env # optional: ports, LLM key, auth
|
|
22
|
-
make docker-up
|
|
22
|
+
make docker-up # redis + API + worker (~2 min build, no PyTorch)
|
|
23
|
+
make docker-up-ui # add Gradio when API is healthy
|
|
23
24
|
```
|
|
24
25
|
|
|
26
|
+
| Command | What starts |
|
|
27
|
+
|---------|-------------|
|
|
28
|
+
| `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
|
|
29
|
+
| `make docker-up-ui` | Gradio UI (`--profile ui`) |
|
|
30
|
+
| `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
|
|
31
|
+
| `make docker-up-full` | OCR stack + UI |
|
|
32
|
+
|
|
33
|
+
Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
|
|
34
|
+
|
|
25
35
|
| Service | URL |
|
|
26
36
|
|---------|-----|
|
|
27
37
|
| API | http://127.0.0.1:5000 |
|
|
@@ -29,7 +39,7 @@ make docker-up
|
|
|
29
39
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
30
40
|
| Health | http://127.0.0.1:5000/health |
|
|
31
41
|
|
|
32
|
-
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
42
|
+
Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
33
43
|
|
|
34
44
|
**pip install:**
|
|
35
45
|
|
|
@@ -58,7 +68,7 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
58
68
|
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
59
69
|
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
60
70
|
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
61
|
-
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
|
|
71
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
|
|
62
72
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
63
73
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
64
74
|
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
@@ -86,6 +96,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
|
86
96
|
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
87
97
|
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
88
98
|
-F "file=@policy.docx"
|
|
99
|
+
|
|
100
|
+
# Document integrity analysis (placeholders, broken refs, drift, number mismatch)
|
|
101
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
|
|
102
|
+
-H "Content-Type: application/json" \
|
|
103
|
+
-d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
|
|
89
104
|
```
|
|
90
105
|
|
|
91
106
|
---
|
|
@@ -94,9 +109,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
|
94
109
|
|
|
95
110
|
```bash
|
|
96
111
|
make setup # venv + dev deps
|
|
112
|
+
make setup-hooks # block Cursor agent co-author trailers on commit
|
|
97
113
|
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
98
114
|
make setup-llm # OpenAI client (structure endpoint)
|
|
99
115
|
make setup-ui # Gradio
|
|
116
|
+
make run-redis # Redis for async jobs (Docker, port 6379)
|
|
100
117
|
make run # API on :5000
|
|
101
118
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
102
119
|
make run-ui # Gradio on :7860
|
|
@@ -104,6 +121,8 @@ make test
|
|
|
104
121
|
make eval # offline quality report (summary, classify, process, PII)
|
|
105
122
|
```
|
|
106
123
|
|
|
124
|
+
Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
|
|
125
|
+
|
|
107
126
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
108
127
|
|
|
109
128
|
---
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docintel-platform"
|
|
7
|
-
version = "1.3.0"
|
|
7
|
+
version = "1.4.0"
|
|
8
8
|
description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -57,15 +57,18 @@ documents = [
|
|
|
57
57
|
"openpyxl>=3.1.5",
|
|
58
58
|
"python-pptx>=1.0.2",
|
|
59
59
|
]
|
|
60
|
-
|
|
61
|
-
"easyocr>=1.7.2",
|
|
60
|
+
pii = [
|
|
62
61
|
"presidio-analyzer>=2.2.354",
|
|
63
62
|
"spacy>=3.7.0",
|
|
63
|
+
]
|
|
64
|
+
# Scanned PDF OCR (EasyOCR). Install CPU torch before this extra in Docker:
|
|
65
|
+
# pip install torch --index-url https://download.pytorch.org/whl/cpu
|
|
66
|
+
ocr = [
|
|
67
|
+
"easyocr>=1.7.2",
|
|
64
68
|
"opencv-python-headless>=4.10.0",
|
|
65
|
-
"torch>=2.4.1",
|
|
66
69
|
]
|
|
67
70
|
ui = [
|
|
68
|
-
"gradio>=
|
|
71
|
+
"gradio>=5.7.1,<6",
|
|
69
72
|
"requests>=2.32.3",
|
|
70
73
|
]
|
|
71
74
|
llm = [
|
|
@@ -85,10 +88,9 @@ auth = [
|
|
|
85
88
|
]
|
|
86
89
|
all = [
|
|
87
90
|
"easyocr>=1.7.2",
|
|
91
|
+
"opencv-python-headless>=4.10.0",
|
|
88
92
|
"presidio-analyzer>=2.2.354",
|
|
89
93
|
"spacy>=3.7.0",
|
|
90
|
-
"opencv-python-headless>=4.10.0",
|
|
91
|
-
"torch>=2.4.1",
|
|
92
94
|
"openai>=1.54.0",
|
|
93
95
|
"redis>=5.0.8",
|
|
94
96
|
"rq>=1.16.2",
|
|
@@ -97,7 +99,7 @@ all = [
|
|
|
97
99
|
"flask-limiter>=3.8.0",
|
|
98
100
|
"PyJWT>=2.9.0",
|
|
99
101
|
"cryptography>=43.0.0",
|
|
100
|
-
"gradio>=
|
|
102
|
+
"gradio>=5.7.1,<6",
|
|
101
103
|
"python-docx>=1.1.2",
|
|
102
104
|
"openpyxl>=3.1.5",
|
|
103
105
|
"python-pptx>=1.0.2",
|
{docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py
RENAMED
|
@@ -1,13 +1,25 @@
|
|
|
1
1
|
"""Compliance capabilities (PII detection, sensitive PDF scanning)."""
|
|
2
2
|
|
|
3
|
+
from docintel.capabilities.compliance.integrity import (
|
|
4
|
+
IntegrityEvidence,
|
|
5
|
+
IntegrityFinding,
|
|
6
|
+
IntegrityResult,
|
|
7
|
+
V1_CHECKS,
|
|
8
|
+
analyze_document_integrity,
|
|
9
|
+
)
|
|
3
10
|
from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
|
|
4
11
|
from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
|
|
5
12
|
|
|
6
13
|
__all__ = [
|
|
7
14
|
"DEFAULT_PII_ENTITIES",
|
|
15
|
+
"IntegrityEvidence",
|
|
16
|
+
"IntegrityFinding",
|
|
17
|
+
"IntegrityResult",
|
|
8
18
|
"MIN_NATIVE_TEXT_CHARS",
|
|
9
19
|
"OCR_RENDER_SCALE",
|
|
10
20
|
"PIIHit",
|
|
21
|
+
"V1_CHECKS",
|
|
22
|
+
"analyze_document_integrity",
|
|
11
23
|
"detect_pii_in_text",
|
|
12
24
|
"list_supported_entities",
|
|
13
25
|
"mask_pii_in_text",
|