docintel-platform 1.2.0__tar.gz → 1.4.0__tar.gz

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (134)
  1. {docintel_platform-1.2.0/src/docintel_platform.egg-info → docintel_platform-1.4.0}/PKG-INFO +37 -15
  2. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/README.md +27 -7
  3. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/pyproject.toml +13 -8
  4. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/__init__.py +1 -1
  5. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/__init__.py +12 -0
  6. docintel_platform-1.4.0/src/docintel/capabilities/compliance/integrity.py +483 -0
  7. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/sensitive.py +1 -1
  8. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/extract.py +33 -0
  9. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/models.py +1 -0
  10. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/registry.py +8 -0
  11. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/sniff.py +3 -1
  12. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/process.py +2 -2
  13. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/client.py +75 -0
  14. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/config.py +1 -0
  15. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/models.py +3 -0
  16. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/queue.py +104 -78
  17. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/store.py +12 -2
  18. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/tasks.py +94 -0
  19. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/openapi/openapi.yaml +167 -0
  20. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/documents.py +169 -0
  21. docintel_platform-1.4.0/src/docintel/services/integrity.py +17 -0
  22. docintel_platform-1.4.0/src/docintel/storage/s3_ingest.py +61 -0
  23. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ui.py +116 -3
  24. {docintel_platform-1.2.0 → docintel_platform-1.4.0/src/docintel_platform.egg-info}/PKG-INFO +37 -15
  25. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/SOURCES.txt +8 -0
  26. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/requires.txt +9 -6
  27. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_document_formats.py +28 -1
  28. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_async_routes.py +1 -0
  29. docintel_platform-1.4.0/tests/test_documents_integrity.py +55 -0
  30. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_health.py +1 -1
  31. docintel_platform-1.4.0/tests/test_integrity_analysis.py +61 -0
  32. docintel_platform-1.4.0/tests/test_job_ttl.py +40 -0
  33. docintel_platform-1.4.0/tests/test_s3_ingest.py +101 -0
  34. docintel_platform-1.4.0/tests/test_ui_integrity.py +79 -0
  35. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/LICENSE +0 -0
  36. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/MANIFEST.in +0 -0
  37. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/setup.cfg +0 -0
  38. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/app.py +0 -0
  39. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/__init__.py +0 -0
  40. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/api_keys.py +0 -0
  41. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/limiter.py +0 -0
  42. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/middleware.py +0 -0
  43. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/auth/oidc.py +0 -0
  44. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/__init__.py +0 -0
  45. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/pii.py +0 -0
  46. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/compliance/presets.py +0 -0
  47. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
  48. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
  49. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
  50. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure.py +0 -0
  51. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
  52. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
  53. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
  54. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
  55. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
  56. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/models.py +0 -0
  57. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pdf/search.py +0 -0
  58. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
  59. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
  60. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/classify.py +0 -0
  61. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/compare.py +0 -0
  62. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/models.py +0 -0
  63. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
  64. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/cli.py +0 -0
  65. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/__init__.py +0 -0
  66. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/helpers.py +0 -0
  67. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/jobs/webhooks.py +0 -0
  68. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/openapi/__init__.py +0 -0
  69. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/__init__.py +0 -0
  70. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/logging.py +0 -0
  71. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/metrics.py +0 -0
  72. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/middleware.py +0 -0
  73. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/ops/prometheus.py +0 -0
  74. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/__init__.py +0 -0
  75. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/async_enqueue.py +0 -0
  76. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/batch.py +0 -0
  77. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/document_upload.py +0 -0
  78. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/jobs.py +0 -0
  79. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/openapi_docs.py +0 -0
  80. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/ops.py +0 -0
  81. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/pdf.py +0 -0
  82. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/routes/text.py +0 -0
  83. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/__init__.py +0 -0
  84. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/__init__.py +0 -0
  85. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/annotator.py +0 -0
  86. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/models.py +0 -0
  87. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/ocr.py +0 -0
  88. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/pii.py +0 -0
  89. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/presets.py +0 -0
  90. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/search.py +0 -0
  91. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/sensitive.py +0 -0
  92. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure.py +0 -0
  93. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_llm.py +0 -0
  94. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_render.py +0 -0
  95. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/pdf/structure_schema.py +0 -0
  96. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/__init__.py +0 -0
  97. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/models.py +0 -0
  98. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/services/summary/textrank.py +0 -0
  99. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/__init__.py +0 -0
  100. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/local.py +0 -0
  101. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/storage/s3.py +0 -0
  102. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel/wsgi.py +0 -0
  103. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
  104. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
  105. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
  106. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_annotate_async.py +0 -0
  107. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_auth.py +0 -0
  108. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_batch.py +0 -0
  109. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_client.py +0 -0
  110. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_detect_sensitive_async.py +0 -0
  111. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_classify.py +0 -0
  112. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_compare.py +0 -0
  113. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_compare_files.py +0 -0
  114. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_detect_pii.py +0 -0
  115. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_process.py +0 -0
  116. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_process_async.py +0 -0
  117. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_documents_summarize.py +0 -0
  118. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_jobs.py +0 -0
  119. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_oidc.py +0 -0
  120. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_openapi.py +0 -0
  121. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ops.py +0 -0
  122. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_routes.py +0 -0
  123. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_sensitive.py +0 -0
  124. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_service.py +0 -0
  125. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pdf_structure.py +0 -0
  126. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_pii_mask.py +0 -0
  127. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_storage.py +0 -0
  128. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_structure_pii.py +0 -0
  129. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_summary_routes.py +0 -0
  130. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_summary_service.py +0 -0
  131. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ui.py +0 -0
  132. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_ui_process.py +0 -0
  133. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_vertical_presets.py +0 -0
  134. {docintel_platform-1.2.0 → docintel_platform-1.4.0}/tests/test_webhooks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docintel-platform
3
- Version: 1.2.0
3
+ Version: 1.4.0
4
4
  Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
5
5
  Author: Babandeep Singh
6
6
  License-Expression: MIT
@@ -40,17 +40,19 @@ Requires-Dist: fakeredis>=2.26.2; extra == "dev"
40
40
  Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
41
41
  Requires-Dist: python-docx>=1.1.2; extra == "dev"
42
42
  Requires-Dist: openpyxl>=3.1.5; extra == "dev"
43
+ Requires-Dist: python-pptx>=1.0.2; extra == "dev"
43
44
  Provides-Extra: documents
44
45
  Requires-Dist: python-docx>=1.1.2; extra == "documents"
45
46
  Requires-Dist: openpyxl>=3.1.5; extra == "documents"
47
+ Requires-Dist: python-pptx>=1.0.2; extra == "documents"
48
+ Provides-Extra: pii
49
+ Requires-Dist: presidio-analyzer>=2.2.354; extra == "pii"
50
+ Requires-Dist: spacy>=3.7.0; extra == "pii"
46
51
  Provides-Extra: ocr
47
52
  Requires-Dist: easyocr>=1.7.2; extra == "ocr"
48
- Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
49
- Requires-Dist: spacy>=3.7.0; extra == "ocr"
50
53
  Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
51
- Requires-Dist: torch>=2.4.1; extra == "ocr"
52
54
  Provides-Extra: ui
53
- Requires-Dist: gradio>=4.44.0; extra == "ui"
55
+ Requires-Dist: gradio<6,>=5.7.1; extra == "ui"
54
56
  Requires-Dist: requests>=2.32.3; extra == "ui"
55
57
  Provides-Extra: llm
56
58
  Requires-Dist: openai>=1.54.0; extra == "llm"
@@ -65,10 +67,9 @@ Requires-Dist: PyJWT>=2.9.0; extra == "auth"
65
67
  Requires-Dist: cryptography>=43.0.0; extra == "auth"
66
68
  Provides-Extra: all
67
69
  Requires-Dist: easyocr>=1.7.2; extra == "all"
70
+ Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
68
71
  Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
69
72
  Requires-Dist: spacy>=3.7.0; extra == "all"
70
- Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
71
- Requires-Dist: torch>=2.4.1; extra == "all"
72
73
  Requires-Dist: openai>=1.54.0; extra == "all"
73
74
  Requires-Dist: redis>=5.0.8; extra == "all"
74
75
  Requires-Dist: rq>=1.16.2; extra == "all"
@@ -77,9 +78,10 @@ Requires-Dist: prometheus-client>=0.21.0; extra == "all"
77
78
  Requires-Dist: flask-limiter>=3.8.0; extra == "all"
78
79
  Requires-Dist: PyJWT>=2.9.0; extra == "all"
79
80
  Requires-Dist: cryptography>=43.0.0; extra == "all"
80
- Requires-Dist: gradio>=4.44.0; extra == "all"
81
+ Requires-Dist: gradio<6,>=5.7.1; extra == "all"
81
82
  Requires-Dist: python-docx>=1.1.2; extra == "all"
82
83
  Requires-Dist: openpyxl>=3.1.5; extra == "all"
84
+ Requires-Dist: python-pptx>=1.0.2; extra == "all"
83
85
 
84
86
  # Document Intelligence Platform
85
87
 
@@ -90,21 +92,31 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
90
92
 
91
93
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
92
94
 
93
- **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
95
+ **Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
94
96
 
95
97
  ---
96
98
 
97
99
  ## Quick start
98
100
 
99
- **Docker (API + Gradio UI + worker):**
101
+ **Docker (slim core, optional UI and OCR):**
100
102
 
101
103
  ```bash
102
104
  git clone https://github.com/baban9/document-intelligence-platform.git
103
105
  cd document-intelligence-platform
104
106
  cp .env.example .env # optional: ports, LLM key, auth
105
- make docker-up
107
+ make docker-up # redis + API + worker (~2 min build, no PyTorch)
108
+ make docker-up-ui # add Gradio when API is healthy
106
109
  ```
107
110
 
111
+ | Command | What starts |
112
+ |---------|-------------|
113
+ | `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
114
+ | `make docker-up-ui` | Gradio UI (`--profile ui`) |
115
+ | `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
116
+ | `make docker-up-full` | OCR stack + UI |
117
+
118
+ Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
119
+
108
120
  | Service | URL |
109
121
  |---------|-----|
110
122
  | API | http://127.0.0.1:5000 |
@@ -112,14 +124,14 @@ make docker-up
112
124
  | Gradio UI | http://127.0.0.1:7860 |
113
125
  | Health | http://127.0.0.1:5000/health |
114
126
 
115
- Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
127
+ Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
116
128
 
117
129
  **pip install:**
118
130
 
119
131
  ```bash
120
132
  pip install docintel-platform
121
133
  pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
122
- pip install "docintel-platform[documents]" # Word and Excel only
134
+ pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
123
135
  ```
124
136
 
125
137
  **Python client:**
@@ -141,13 +153,13 @@ report = client.process_document("policy.docx", include_pii=True)
141
153
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
142
154
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
143
155
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
144
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
156
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
145
157
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
146
158
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
147
159
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
148
160
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
149
161
 
150
- **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
162
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
151
163
 
152
164
  **PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
153
165
 
@@ -169,6 +181,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
169
181
  # Async: add ?async=true, then poll /v1/jobs/<job_id>
170
182
  curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
171
183
  -F "file=@policy.docx"
184
+
185
+ # Document integrity analysis (placeholders, broken refs, drift, number mismatch)
186
+ curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
187
+ -H "Content-Type: application/json" \
188
+ -d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
172
189
  ```
173
190
 
174
191
  ---
@@ -177,9 +194,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
177
194
 
178
195
  ```bash
179
196
  make setup # venv + dev deps
197
+ make setup-hooks # block Cursor agent co-author trailers on commit
180
198
  make setup-ocr # EasyOCR, Presidio, spaCy model
181
199
  make setup-llm # OpenAI client (structure endpoint)
182
200
  make setup-ui # Gradio
201
+ make run-redis # Redis for async jobs (Docker, port 6379)
183
202
  make run # API on :5000
184
203
  make run-worker # RQ worker (separate terminal, needs Redis)
185
204
  make run-ui # Gradio on :7860
@@ -187,6 +206,8 @@ make test
187
206
  make eval # offline quality report (summary, classify, process, PII)
188
207
  ```
189
208
 
209
+ Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
210
+
190
211
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
191
212
 
192
213
  ---
@@ -199,6 +220,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
199
220
  | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
200
221
  | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
201
222
  | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
223
+ | [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
202
224
  | [docs/adr/](docs/adr/) | Architecture decision records |
203
225
 
204
226
  ---
@@ -7,21 +7,31 @@
7
7
 
8
8
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
9
9
 
10
- **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
10
+ **Version:** 1.4.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
11
11
 
12
12
  ---
13
13
 
14
14
  ## Quick start
15
15
 
16
- **Docker (API + Gradio UI + worker):**
16
+ **Docker (slim core, optional UI and OCR):**
17
17
 
18
18
  ```bash
19
19
  git clone https://github.com/baban9/document-intelligence-platform.git
20
20
  cd document-intelligence-platform
21
21
  cp .env.example .env # optional: ports, LLM key, auth
22
- make docker-up
22
+ make docker-up # redis + API + worker (~2 min build, no PyTorch)
23
+ make docker-up-ui # add Gradio when API is healthy
23
24
  ```
24
25
 
26
+ | Command | What starts |
27
+ |---------|-------------|
28
+ | `make docker-up` | Redis, slim API, slim worker (documents, PII text, digital PDF) |
29
+ | `make docker-up-ui` | Gradio UI (`--profile ui`) |
30
+ | `make docker-up-ocr` | Rebuild with CPU-only OCR for scanned PDFs (no NVIDIA) |
31
+ | `make docker-up-full` | OCR stack + UI |
32
+
33
+ Slim image skips PyTorch and EasyOCR (~400MB+). Scanned PDF OCR is opt-in via `make docker-up-ocr`.
34
+
25
35
  | Service | URL |
26
36
  |---------|-----|
27
37
  | API | http://127.0.0.1:5000 |
@@ -29,14 +39,14 @@ make docker-up
29
39
  | Gradio UI | http://127.0.0.1:7860 |
30
40
  | Health | http://127.0.0.1:5000/health |
31
41
 
32
- Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
42
+ Gradio includes a **Document process** tab (unified pipeline) and a **Document integrity** tab (gap and consistency checks). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
33
43
 
34
44
  **pip install:**
35
45
 
36
46
  ```bash
37
47
  pip install docintel-platform
38
48
  pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
39
- pip install "docintel-platform[documents]" # Word and Excel only
49
+ pip install "docintel-platform[documents]" # Word, Excel, and PowerPoint
40
50
  ```
41
51
 
42
52
  **Python client:**
@@ -58,13 +68,13 @@ report = client.process_document("policy.docx", include_pii=True)
58
68
  | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
59
69
  | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
60
70
  | PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
61
- | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
71
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process**, **ingest** (S3), **analyze-integrity** |
62
72
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
63
73
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
64
74
  | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
65
75
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
66
76
 
67
- **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
77
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, PPTX, CSV, JSON, TXT, MD.
68
78
 
69
79
  **PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
70
80
 
@@ -86,6 +96,11 @@ curl -X POST http://127.0.0.1:5000/v1/documents/process \
86
96
  # Async: add ?async=true, then poll /v1/jobs/<job_id>
87
97
  curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
88
98
  -F "file=@policy.docx"
99
+
100
+ # Document integrity analysis (placeholders, broken refs, drift, number mismatch)
101
+ curl -X POST http://127.0.0.1:5000/v1/documents/analyze-integrity \
102
+ -H "Content-Type: application/json" \
103
+ -d '{"text": "See Section 9.2. Total budget: $1M. Total budget: $900K. TBD"}'
89
104
  ```
90
105
 
91
106
  ---
@@ -94,9 +109,11 @@ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
94
109
 
95
110
  ```bash
96
111
  make setup # venv + dev deps
112
+ make setup-hooks # block Cursor agent co-author trailers on commit
97
113
  make setup-ocr # EasyOCR, Presidio, spaCy model
98
114
  make setup-llm # OpenAI client (structure endpoint)
99
115
  make setup-ui # Gradio
116
+ make run-redis # Redis for async jobs (Docker, port 6379)
100
117
  make run # API on :5000
101
118
  make run-worker # RQ worker (separate terminal, needs Redis)
102
119
  make run-ui # Gradio on :7860
@@ -104,6 +121,8 @@ make test
104
121
  make eval # offline quality report (summary, classify, process, PII)
105
122
  ```
106
123
 
124
+ Async routes and the Gradio integrity tab need Redis. Start it once with `make run-redis` before `make run-worker`.
125
+
107
126
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
108
127
 
109
128
  ---
@@ -116,6 +135,7 @@ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and
116
135
  | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
117
136
  | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
118
137
  | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
138
+ | [docs/WEBHOOKS.md](docs/WEBHOOKS.md) | Async callbacks and S3 ingest |
119
139
  | [docs/adr/](docs/adr/) | Architecture decision records |
120
140
 
121
141
  ---
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docintel-platform"
7
- version = "1.2.0"
7
+ version = "1.4.0"
8
8
  description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -50,20 +50,25 @@ dev = [
50
50
  "prometheus-client>=0.21.0",
51
51
  "python-docx>=1.1.2",
52
52
  "openpyxl>=3.1.5",
53
+ "python-pptx>=1.0.2",
53
54
  ]
54
55
  documents = [
55
56
  "python-docx>=1.1.2",
56
57
  "openpyxl>=3.1.5",
58
+ "python-pptx>=1.0.2",
57
59
  ]
58
- ocr = [
59
- "easyocr>=1.7.2",
60
+ pii = [
60
61
  "presidio-analyzer>=2.2.354",
61
62
  "spacy>=3.7.0",
63
+ ]
64
+ # Scanned PDF OCR (EasyOCR). Install CPU torch before this extra in Docker:
65
+ # pip install torch --index-url https://download.pytorch.org/whl/cpu
66
+ ocr = [
67
+ "easyocr>=1.7.2",
62
68
  "opencv-python-headless>=4.10.0",
63
- "torch>=2.4.1",
64
69
  ]
65
70
  ui = [
66
- "gradio>=4.44.0",
71
+ "gradio>=5.7.1,<6",
67
72
  "requests>=2.32.3",
68
73
  ]
69
74
  llm = [
@@ -83,10 +88,9 @@ auth = [
83
88
  ]
84
89
  all = [
85
90
  "easyocr>=1.7.2",
91
+ "opencv-python-headless>=4.10.0",
86
92
  "presidio-analyzer>=2.2.354",
87
93
  "spacy>=3.7.0",
88
- "opencv-python-headless>=4.10.0",
89
- "torch>=2.4.1",
90
94
  "openai>=1.54.0",
91
95
  "redis>=5.0.8",
92
96
  "rq>=1.16.2",
@@ -95,9 +99,10 @@ all = [
95
99
  "flask-limiter>=3.8.0",
96
100
  "PyJWT>=2.9.0",
97
101
  "cryptography>=43.0.0",
98
- "gradio>=4.44.0",
102
+ "gradio>=5.7.1,<6",
99
103
  "python-docx>=1.1.2",
100
104
  "openpyxl>=3.1.5",
105
+ "python-pptx>=1.0.2",
101
106
  ]
102
107
 
103
108
  [project.scripts]
@@ -2,5 +2,5 @@
2
2
 
3
3
  from docintel.client import DocintelClient, DocintelError
4
4
 
5
- __version__ = "1.2.0"
5
+ __version__ = "1.4.0"
6
6
  __all__ = ["DocintelClient", "DocintelError", "__version__"]
@@ -1,13 +1,25 @@
1
1
  """Compliance capabilities (PII detection, sensitive PDF scanning)."""
2
2
 
3
+ from docintel.capabilities.compliance.integrity import (
4
+ IntegrityEvidence,
5
+ IntegrityFinding,
6
+ IntegrityResult,
7
+ V1_CHECKS,
8
+ analyze_document_integrity,
9
+ )
3
10
  from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
4
11
  from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
5
12
 
6
13
  __all__ = [
7
14
  "DEFAULT_PII_ENTITIES",
15
+ "IntegrityEvidence",
16
+ "IntegrityFinding",
17
+ "IntegrityResult",
8
18
  "MIN_NATIVE_TEXT_CHARS",
9
19
  "OCR_RENDER_SCALE",
10
20
  "PIIHit",
21
+ "V1_CHECKS",
22
+ "analyze_document_integrity",
11
23
  "detect_pii_in_text",
12
24
  "list_supported_entities",
13
25
  "mask_pii_in_text",