docintel-platform 1.3.0__tar.gz → 1.4.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (134)
  1. {docintel_platform-1.3.0/src/docintel_platform.egg-info → docintel_platform-1.4.0}/PKG-INFO +31 -13
  2. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/README.md +24 -5
  3. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/pyproject.toml +10 -8
  4. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/__init__.py +1 -1
  5. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py +12 -0
  6. docintel_platform-1.4.0/src/docintel/capabilities/compliance/integrity.py +483 -0
  7. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/sensitive.py +1 -1
  8. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/process.py +2 -2
  9. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/client.py +33 -0
  10. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/models.py +2 -0
  11. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/queue.py +91 -84
  12. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/tasks.py +55 -0
  13. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/openapi/openapi.yaml +107 -0
  14. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/documents.py +74 -0
  15. docintel_platform-1.4.0/src/docintel/services/integrity.py +17 -0
  16. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ui.py +113 -0
  17. {docintel_platform-1.3.0 → docintel_platform-1.4.0/src/docintel_platform.egg-info}/PKG-INFO +31 -13
  18. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/SOURCES.txt +5 -0
  19. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/requires.txt +6 -6
  20. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_async_routes.py +1 -0
  21. docintel_platform-1.4.0/tests/test_documents_integrity.py +55 -0
  22. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_health.py +1 -1
  23. docintel_platform-1.4.0/tests/test_integrity_analysis.py +61 -0
  24. docintel_platform-1.4.0/tests/test_ui_integrity.py +79 -0
  25. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/LICENSE +0 -0
  26. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/MANIFEST.in +0 -0
  27. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/setup.cfg +0 -0
  28. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/app.py +0 -0
  29. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/__init__.py +0 -0
  30. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/api_keys.py +0 -0
  31. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/limiter.py +0 -0
  32. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/middleware.py +0 -0
  33. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/auth/oidc.py +0 -0
  34. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/__init__.py +0 -0
  35. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/pii.py +0 -0
  36. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/presets.py +0 -0
  37. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
  38. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
  39. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/extract.py +0 -0
  40. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/models.py +0 -0
  41. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/registry.py +0 -0
  42. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/sniff.py +0 -0
  43. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
  44. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure.py +0 -0
  45. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
  46. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
  47. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
  48. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
  49. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
  50. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/models.py +0 -0
  51. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/search.py +0 -0
  52. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
  53. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
  54. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/classify.py +0 -0
  55. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/compare.py +0 -0
  56. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/models.py +0 -0
  57. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
  58. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/cli.py +0 -0
  59. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/config.py +0 -0
  60. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/__init__.py +0 -0
  61. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/helpers.py +0 -0
  62. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/store.py +0 -0
  63. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/jobs/webhooks.py +0 -0
  64. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/openapi/__init__.py +0 -0
  65. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/__init__.py +0 -0
  66. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/logging.py +0 -0
  67. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/metrics.py +0 -0
  68. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/middleware.py +0 -0
  69. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/ops/prometheus.py +0 -0
  70. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/__init__.py +0 -0
  71. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/async_enqueue.py +0 -0
  72. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/batch.py +0 -0
  73. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/document_upload.py +0 -0
  74. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/jobs.py +0 -0
  75. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/openapi_docs.py +0 -0
  76. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/ops.py +0 -0
  77. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/pdf.py +0 -0
  78. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/routes/text.py +0 -0
  79. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/__init__.py +0 -0
  80. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/__init__.py +0 -0
  81. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/annotator.py +0 -0
  82. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/models.py +0 -0
  83. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/ocr.py +0 -0
  84. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/pii.py +0 -0
  85. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/presets.py +0 -0
  86. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/search.py +0 -0
  87. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/sensitive.py +0 -0
  88. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure.py +0 -0
  89. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_llm.py +0 -0
  90. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_render.py +0 -0
  91. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_schema.py +0 -0
  92. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/__init__.py +0 -0
  93. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/models.py +0 -0
  94. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/services/summary/textrank.py +0 -0
  95. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/__init__.py +0 -0
  96. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/local.py +0 -0
  97. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/s3.py +0 -0
  98. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/storage/s3_ingest.py +0 -0
  99. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel/wsgi.py +0 -0
  100. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
  101. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
  102. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
  103. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_annotate_async.py +0 -0
  104. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_auth.py +0 -0
  105. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_batch.py +0 -0
  106. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_client.py +0 -0
  107. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_detect_sensitive_async.py +0 -0
  108. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_document_formats.py +0 -0
  109. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_classify.py +0 -0
  110. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_compare.py +0 -0
  111. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_compare_files.py +0 -0
  112. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_detect_pii.py +0 -0
  113. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_process.py +0 -0
  114. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_process_async.py +0 -0
  115. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_documents_summarize.py +0 -0
  116. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_job_ttl.py +0 -0
  117. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_jobs.py +0 -0
  118. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_oidc.py +0 -0
  119. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_openapi.py +0 -0
  120. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ops.py +0 -0
  121. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_routes.py +0 -0
  122. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_sensitive.py +0 -0
  123. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_service.py +0 -0
  124. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pdf_structure.py +0 -0
  125. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_pii_mask.py +0 -0
  126. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_s3_ingest.py +0 -0
  127. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_storage.py +0 -0
  128. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_structure_pii.py +0 -0
  129. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_summary_routes.py +0 -0
  130. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_summary_service.py +0 -0
  131. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ui.py +0 -0
  132. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_ui_process.py +0 -0
  133. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_vertical_presets.py +0 -0
  134. {docintel_platform-1.3.0 → docintel_platform-1.4.0}/tests/test_webhooks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docintel-platform
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
5
5
  Author: Babandeep Singh
6
6
  License-Expression: MIT
@@ -45,14 +45,14 @@ Provides-Extra: documents
45
45
  Requires-Dist: python-docx>=1.1.2; extra == "documents"
46
46
  Requires-Dist: openpyxl>=3.1.5; extra == "documents"
47
47
  Requires-Dist: python-pptx>=1.0.2; extra == "documents"
48
+ Provides-Extra: pii
49
+ Requires-Dist: presidio-analyzer>=2.2.354; extra == "pii"
50
+ Requires-Dist: spacy>=3.7.0; extra == "pii"
48
51
  Provides-Extra: ocr
49
52
  Requires-Dist: easyocr>=1.7.2; extra == "ocr"
50
- Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
51
- Requires-Dist: spacy>=3.7.0; extra == "ocr"
52
53
  Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
53
- Requires-Dist: torch>=2.4.1; extra == "ocr"
54
54
  Provides-Extra: ui
55
- Requires-Dist: gradio>=4.44.0; extra == "ui"
55
+ Requires-Dist: gradio<6,>=5.7.1; extra == "ui"
56
56
  Requires-Dist: requests>=2.32.3; extra == "ui"
57
57
  Provides-Extra: llm
58
58
  Requires-Dist: openai>=1.54.0; extra == "llm"
@@ -67,10 +67,9 @@ Requires-Dist: PyJWT>=2.9.0; extra == "auth"
67
67
  Requires-Dist: cryptography>=43.0.0; extra == "auth"
68
68
  Provides-Extra: all
69
69
  Requires-Dist: easyocr>=1.7.2; extra == "all"
70
+ Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
70
71
  Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
71
72
  Requires-Dist: spacy>=3.7.0; extra == "all"
72
- Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
73
- Requires-Dist: torch>=2.4.1; extra == "all"
74
73
  Requires-Dist: openai>=1.54.0; extra == "all"
75
74
  Requires-Dist: redis>=5.0.8; extra == "all"
76
75
  Requires-Dist: rq>=1.16.2; extra == "all"
@@ -79,7 +78,7 @@ Requires-Dist: prometheus-client>=0.21.0; extra == "all"
79
78
  Requires-Dist: flask-limiter>=3.8.0; extra == "all"
80
79
  Requires-Dist: PyJWT>=2.9.0; extra == "all"
81
80
  Requires-Dist: cryptography>=43.0.0; extra == "all"
82
- Requires-Dist: gradio>=4.44.0; extra == "all"
81
+ Requires-Dist: gradio<6,>=5.7.1; extra == "all"
83
82
  Requires-Dist: python-docx>=1.1.2; extra == "all"
84
83
  Requires-Dist: openpyxl>=3.1.5; extra == "all"
85
84
  Requires-Dist: python-pptx>=1.0.2; extra == "all"
@@ -93,21 +92,31 @@ Requires-Dist: python-pptx>=1.0.2; extra == "all"
93
92
 
94
93
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
95
94
 
96
- **Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
95
+ **Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
97
96
 
98
97
  ---
99
98
 
100
99
  ## Quick start
101
100
 
102
- **Docker (API + Gradio UI + worker):**
101
+ **Docker (slim core, optional UI and OCR):**
103
102
 
104
103
  ```bash
105
104
  git clone https://github.com/baban9/document-intelligence-platform.git
106
105
  cd document-intelligence-platform
107
106
  cp .env.example .env # optional: ports, LLM key, auth
108
- make docker-up
107
+ make docker-up # redis + API + worker (~2 min build, no PyTorch)
108
+ make docker-up-ui # add Gradio when API is healthy
109
109
  ```
110
110
 
111
+ | Command | What starts |
112
+ |---------|-------------|
113
+ | `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
114
+ | `make docker-up-ui` | Gradio UI (`--profile ui`) |
115
+ | `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
116
+ | `make docker-up-full` | OCR stack + UI |
117
+
118
+ Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
119
+
111
120
  | Service | URL |
112
121
  |---------|-----|
113
122
  | API | http://127.0.0.1:5000 |
@@ -115,7 +124,7 @@ make docker-up
115
124
  | Gradio UI | http://127.0.0.1:7860 |
116
125
  | Health | http://127.0.0.1:5000/health |
117
126
 
118
- Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
127
+ Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
119
128
 
120
129
  **pip install:**
121
130
 
@@ -144,7 +153,7 @@ report = client.process_document("policy.docx", include_pii=True)
144
153
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
145
154
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
146
155
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
147
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
156
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
148
157
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
149
158
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
150
159
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
@@ -172,6 +181,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
172
181
  # Async: add ?async=true, then poll /v1/jobs/<job_id>
173
182
  curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
174
183
  -F "file=@policy.docx"
184
+
185
+ # Document integrity analysis (placeholders, broken refs, drift, number mismatch)
186
+ curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
187
+ -H "Content-Type: application/json" \
188
+ -d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
175
189
  ```
176
190
 
177
191
  ---
@@ -180,9 +194,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
180
194
 
181
195
  ```bash
182
196
  make setup # venv + dev deps
197
+ make setup-hooks # block Cursor agent co-author trailers on commit
183
198
  make setup-ocr # EasyOCR, Presidio, spaCy model
184
199
  make setup-llm # OpenAI client (structure endpoint)
185
200
  make setup-ui # Gradio
201
+ make run-redis # Redis for async jobs (Docker, port 6379)
186
202
  make run # API on :5000
187
203
  make run-worker # RQ worker (separate terminal, needs Redis)
188
204
  make run-ui # Gradio on :7860
@@ -190,6 +206,8 @@ make test
190
206
  make eval # offline quality report (summary, classify, process, PII)
191
207
  ```
192
208
 
209
+ Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
210
+
193
211
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
194
212
 
195
213
  ---
@@ -7,21 +7,31 @@
7
7
 
8
8
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
9
9
 
10
- **Version:** 1.3.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
10
+ **Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
11
11
 
12
12
  ---
13
13
 
14
14
  ## Quick start
15
15
 
16
- **Docker (API + Gradio UI + worker):**
16
+ **Docker (slim core, optional UI and OCR):**
17
17
 
18
18
  ```bash
19
19
  git clone https://github.com/baban9/document-intelligence-platform.git
20
20
  cd document-intelligence-platform
21
21
  cp .env.example .env # optional: ports, LLM key, auth
22
- make docker-up
22
+ make docker-up # redis + API + worker (~2 min build, no PyTorch)
23
+ make docker-up-ui # add Gradio when API is healthy
23
24
  ```
24
25
 
26
+ | Command | What starts |
27
+ |---------|-------------|
28
+ | `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
29
+ | `make docker-up-ui` | Gradio UI (`--profile ui`) |
30
+ | `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
31
+ | `make docker-up-full` | OCR stack + UI |
32
+
33
+ Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
34
+
25
35
  | Service | URL |
26
36
  |---------|-----|
27
37
  | API | http://127.0.0.1:5000 |
@@ -29,7 +39,7 @@ make docker-up
29
39
  | Gradio UI | http://127.0.0.1:7860 |
30
40
  | Health | http://127.0.0.1:5000/health |
31
41
 
32
- Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
42
+ Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
33
43
 
34
44
  **pip install:**
35
45
 
@@ -58,7 +68,7 @@ report = client.process_document("policy.docx", include_pii=True)
58
68
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
59
69
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
60
70
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
61
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3) |
71
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
62
72
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
63
73
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
64
74
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
@@ -86,6 +96,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
86
96
  # Async: add ?async=true, then poll /v1/jobs/<job_id>
87
97
  curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
88
98
  -F "file=@policy.docx"
99
+
100
+ # Document integrity analysis (placeholders, broken refs, drift, number mismatch)
101
+ curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
102
+ -H "Content-Type: application/json" \
103
+ -d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
89
104
  ```
90
105
 
91
106
  ---
@@ -94,9 +109,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
94
109
 
95
110
  ```bash
96
111
  make setup # venv + dev deps
112
+ make setup-hooks # block Cursor agent co-author trailers on commit
97
113
  make setup-ocr # EasyOCR, Presidio, spaCy model
98
114
  make setup-llm # OpenAI client (structure endpoint)
99
115
  make setup-ui # Gradio
116
+ make run-redis # Redis for async jobs (Docker, port 6379)
100
117
  make run # API on :5000
101
118
  make run-worker # RQ worker (separate terminal, needs Redis)
102
119
  make run-ui # Gradio on :7860
@@ -104,6 +121,8 @@ make test
104
121
  make eval # offline quality report (summary, classify, process, PII)
105
122
  ```
106
123
 
124
+ Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
125
+
107
126
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
108
127
 
109
128
  ---
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docintel-platform"
7
- version = "1.3.0"
7
+ version = "1.4.0"
8
8
  description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -57,15 +57,18 @@ documents = [
57
57
  "openpyxl>=3.1.5",
58
58
  "python-pptx>=1.0.2",
59
59
  ]
60
- ocr = [
61
- "easyocr>=1.7.2",
60
+ pii = [
62
61
  "presidio-analyzer>=2.2.354",
63
62
  "spacy>=3.7.0",
63
+ ]
64
+ # Scanned PDF OCR (EasyOCR). Install CPU torch before this extra in Docker:
65
+ # pip install torch --index-url https://download.pytorch.org/whl/cpu
66
+ ocr = [
67
+ "easyocr>=1.7.2",
64
68
  "opencv-python-headless>=4.10.0",
65
- "torch>=2.4.1",
66
69
  ]
67
70
  ui = [
68
- "gradio>=4.44.0",
71
+ "gradio>=5.7.1,<6",
69
72
  "requests>=2.32.3",
70
73
  ]
71
74
  llm = [
@@ -85,10 +88,9 @@ auth = [
85
88
  ]
86
89
  all = [
87
90
  "easyocr>=1.7.2",
91
+ "opencv-python-headless>=4.10.0",
88
92
  "presidio-analyzer>=2.2.354",
89
93
  "spacy>=3.7.0",
90
- "opencv-python-headless>=4.10.0",
91
- "torch>=2.4.1",
92
94
  "openai>=1.54.0",
93
95
  "redis>=5.0.8",
94
96
  "rq>=1.16.2",
@@ -97,7 +99,7 @@ all = [
97
99
  "flask-limiter>=3.8.0",
98
100
  "PyJWT>=2.9.0",
99
101
  "cryptography>=43.0.0",
100
- "gradio>=4.44.0",
102
+ "gradio>=5.7.1,<6",
101
103
  "python-docx>=1.1.2",
102
104
  "openpyxl>=3.1.5",
103
105
  "python-pptx>=1.0.2",
@@ -2,5 +2,5 @@
2
2
 
3
3
  from docintel.client import DocintelClient, DocintelError
4
4
 
5
- __version__ = "1.3.0"
5
+ __version__ = "1.4.0"
6
6
  __all__ = ["DocintelClient", "DocintelError", "__version__"]
@@ -1,13 +1,25 @@
1
1
  """Compliance capabilities (PII detection, sensitive PDF scanning)."""
2
2
 
3
+ from docintel.capabilities.compliance.integrity import (
4
+ IntegrityEvidence,
5
+ IntegrityFinding,
6
+ IntegrityResult,
7
+ V1_CHECKS,
8
+ analyze_document_integrity,
9
+ )
3
10
  from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
4
11
  from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
5
12
 
6
13
  __all__ = [
7
14
  "DEFAULT_PII_ENTITIES",
15
+ "IntegrityEvidence",
16
+ "IntegrityFinding",
17
+ "IntegrityResult",
8
18
  "MIN_NATIVE_TEXT_CHARS",
9
19
  "OCR_RENDER_SCALE",
10
20
  "PIIHit",
21
+ "V1_CHECKS",
22
+ "analyze_document_integrity",
11
23
  "detect_pii_in_text",
12
24
  "list_supported_entities",
13
25
  "mask_pii_in_text",