docintel-platform 1.0.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. docintel_platform-1.2.0/PKG-INFO +222 -0
  2. docintel_platform-1.2.0/README.md +139 -0
  3. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/pyproject.toml +19 -4
  4. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/__init__.py +1 -1
  5. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/app.py +4 -2
  6. docintel_platform-1.2.0/src/docintel/capabilities/__init__.py +1 -0
  7. docintel_platform-1.2.0/src/docintel/capabilities/compliance/__init__.py +14 -0
  8. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/compliance}/pii.py +4 -2
  9. docintel_platform-1.2.0/src/docintel/capabilities/compliance/presets.py +80 -0
  10. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/compliance}/sensitive.py +5 -3
  11. docintel_platform-1.2.0/src/docintel/capabilities/extraction/__init__.py +29 -0
  12. docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/__init__.py +27 -0
  13. docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/extract.py +171 -0
  14. docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/models.py +77 -0
  15. docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/registry.py +100 -0
  16. docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/sniff.py +171 -0
  17. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/ocr.py +1 -1
  18. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure.py +6 -5
  19. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_llm.py +1 -1
  20. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_render.py +1 -1
  21. docintel_platform-1.2.0/src/docintel/capabilities/pdf/__init__.py +23 -0
  22. {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/annotator.py +2 -2
  23. docintel_platform-1.2.0/src/docintel/capabilities/pipeline/__init__.py +10 -0
  24. docintel_platform-1.2.0/src/docintel/capabilities/pipeline/process.py +210 -0
  25. docintel_platform-1.2.0/src/docintel/capabilities/understanding/__init__.py +6 -0
  26. docintel_platform-1.2.0/src/docintel/capabilities/understanding/classify.py +101 -0
  27. docintel_platform-1.2.0/src/docintel/capabilities/understanding/compare.py +49 -0
  28. {docintel_platform-1.0.2/src/docintel/services/summary → docintel_platform-1.2.0/src/docintel/capabilities/understanding}/textrank.py +1 -1
  29. docintel_platform-1.2.0/src/docintel/client.py +418 -0
  30. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/config.py +6 -0
  31. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/models.py +11 -0
  32. docintel_platform-1.2.0/src/docintel/jobs/queue.py +308 -0
  33. docintel_platform-1.2.0/src/docintel/jobs/tasks.py +663 -0
  34. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/webhooks.py +28 -1
  35. docintel_platform-1.2.0/src/docintel/openapi/openapi.yaml +708 -0
  36. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/middleware.py +4 -0
  37. docintel_platform-1.2.0/src/docintel/ops/prometheus.py +62 -0
  38. docintel_platform-1.2.0/src/docintel/routes/async_enqueue.py +29 -0
  39. docintel_platform-1.2.0/src/docintel/routes/batch.py +169 -0
  40. docintel_platform-1.2.0/src/docintel/routes/document_upload.py +81 -0
  41. docintel_platform-1.2.0/src/docintel/routes/documents.py +576 -0
  42. docintel_platform-1.2.0/src/docintel/routes/ops.py +32 -0
  43. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/pdf.py +137 -52
  44. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/text.py +23 -0
  45. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/services/pdf/__init__.py +3 -1
  46. docintel_platform-1.2.0/src/docintel/services/pdf/annotator.py +19 -0
  47. docintel_platform-1.2.0/src/docintel/services/pdf/models.py +17 -0
  48. docintel_platform-1.2.0/src/docintel/services/pdf/ocr.py +23 -0
  49. docintel_platform-1.2.0/src/docintel/services/pdf/pii.py +5 -0
  50. docintel_platform-1.2.0/src/docintel/services/pdf/presets.py +19 -0
  51. docintel_platform-1.2.0/src/docintel/services/pdf/search.py +5 -0
  52. docintel_platform-1.2.0/src/docintel/services/pdf/sensitive.py +6 -0
  53. docintel_platform-1.2.0/src/docintel/services/pdf/structure.py +6 -0
  54. docintel_platform-1.2.0/src/docintel/services/pdf/structure_llm.py +5 -0
  55. docintel_platform-1.2.0/src/docintel/services/pdf/structure_render.py +5 -0
  56. docintel_platform-1.2.0/src/docintel/services/pdf/structure_schema.py +10 -0
  57. docintel_platform-1.2.0/src/docintel/services/summary/__init__.py +5 -0
  58. docintel_platform-1.2.0/src/docintel/services/summary/models.py +5 -0
  59. docintel_platform-1.2.0/src/docintel/services/summary/textrank.py +9 -0
  60. docintel_platform-1.2.0/src/docintel/storage/__init__.py +33 -0
  61. docintel_platform-1.2.0/src/docintel/storage/local.py +30 -0
  62. docintel_platform-1.2.0/src/docintel/storage/s3.py +76 -0
  63. docintel_platform-1.2.0/src/docintel/ui.py +607 -0
  64. docintel_platform-1.2.0/src/docintel_platform.egg-info/PKG-INFO +222 -0
  65. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/SOURCES.txt +49 -6
  66. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/requires.txt +15 -0
  67. docintel_platform-1.2.0/tests/test_annotate_async.py +76 -0
  68. docintel_platform-1.2.0/tests/test_batch.py +96 -0
  69. docintel_platform-1.2.0/tests/test_document_formats.py +180 -0
  70. docintel_platform-1.2.0/tests/test_documents_async_routes.py +115 -0
  71. docintel_platform-1.2.0/tests/test_documents_classify.py +30 -0
  72. docintel_platform-1.2.0/tests/test_documents_compare.py +30 -0
  73. docintel_platform-1.2.0/tests/test_documents_compare_files.py +40 -0
  74. docintel_platform-1.2.0/tests/test_documents_detect_pii.py +39 -0
  75. docintel_platform-1.2.0/tests/test_documents_process.py +78 -0
  76. docintel_platform-1.2.0/tests/test_documents_process_async.py +81 -0
  77. docintel_platform-1.2.0/tests/test_documents_summarize.py +43 -0
  78. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_health.py +1 -1
  79. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_ops.py +11 -1
  80. docintel_platform-1.2.0/tests/test_storage.py +17 -0
  81. docintel_platform-1.2.0/tests/test_ui_process.py +28 -0
  82. docintel_platform-1.2.0/tests/test_vertical_presets.py +32 -0
  83. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_webhooks.py +18 -0
  84. docintel_platform-1.0.2/PKG-INFO +0 -607
  85. docintel_platform-1.0.2/README.md +0 -537
  86. docintel_platform-1.0.2/src/docintel/client.py +0 -193
  87. docintel_platform-1.0.2/src/docintel/jobs/queue.py +0 -75
  88. docintel_platform-1.0.2/src/docintel/jobs/tasks.py +0 -173
  89. docintel_platform-1.0.2/src/docintel/openapi/openapi.yaml +0 -380
  90. docintel_platform-1.0.2/src/docintel/routes/match.py +0 -43
  91. docintel_platform-1.0.2/src/docintel/routes/ops.py +0 -22
  92. docintel_platform-1.0.2/src/docintel/services/matching/__init__.py +0 -6
  93. docintel_platform-1.0.2/src/docintel/services/matching/models.py +0 -19
  94. docintel_platform-1.0.2/src/docintel/services/matching/scorer.py +0 -64
  95. docintel_platform-1.0.2/src/docintel/services/pdf/presets.py +0 -26
  96. docintel_platform-1.0.2/src/docintel/services/summary/__init__.py +0 -6
  97. docintel_platform-1.0.2/src/docintel/ui.py +0 -347
  98. docintel_platform-1.0.2/src/docintel_platform.egg-info/PKG-INFO +0 -607
  99. docintel_platform-1.0.2/tests/test_matching_routes.py +0 -64
  100. docintel_platform-1.0.2/tests/test_matching_service.py +0 -59
  101. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/LICENSE +0 -0
  102. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/MANIFEST.in +0 -0
  103. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/setup.cfg +0 -0
  104. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/__init__.py +0 -0
  105. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/api_keys.py +0 -0
  106. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/limiter.py +0 -0
  107. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/middleware.py +0 -0
  108. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/oidc.py +0 -0
  109. {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_schema.py +0 -0
  110. {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/models.py +0 -0
  111. {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/search.py +0 -0
  112. {docintel_platform-1.0.2/src/docintel/services/summary → docintel_platform-1.2.0/src/docintel/capabilities/understanding}/models.py +0 -0
  113. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/cli.py +0 -0
  114. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/__init__.py +0 -0
  115. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/helpers.py +0 -0
  116. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/store.py +0 -0
  117. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/openapi/__init__.py +0 -0
  118. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/__init__.py +0 -0
  119. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/logging.py +0 -0
  120. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/metrics.py +0 -0
  121. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/__init__.py +0 -0
  122. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/jobs.py +0 -0
  123. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/openapi_docs.py +0 -0
  124. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/services/__init__.py +0 -0
  125. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/wsgi.py +0 -0
  126. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
  127. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
  128. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
  129. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_auth.py +0 -0
  130. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_client.py +0 -0
  131. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_detect_sensitive_async.py +0 -0
  132. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_jobs.py +0 -0
  133. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_oidc.py +0 -0
  134. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_openapi.py +0 -0
  135. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_routes.py +0 -0
  136. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_sensitive.py +0 -0
  137. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_service.py +0 -0
  138. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_structure.py +0 -0
  139. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pii_mask.py +0 -0
  140. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_structure_pii.py +0 -0
  141. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_summary_routes.py +0 -0
  142. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_summary_service.py +0 -0
  143. {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_ui.py +0 -0
@@ -0,0 +1,222 @@
1
+ Metadata-Version: 2.4
2
+ Name: docintel-platform
3
+ Version: 1.2.0
4
+ Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
5
+ Author: Babandeep Singh
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/baban9/document-intelligence-platform
8
+ Project-URL: Repository, https://github.com/baban9/document-intelligence-platform
9
+ Project-URL: Documentation, https://github.com/baban9/document-intelligence-platform#readme
10
+ Project-URL: Issues, https://github.com/baban9/document-intelligence-platform/issues
11
+ Keywords: nlp,pdf,flask,document-ai,ocr,pii,presidio,openapi,document-intelligence,compliance
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Framework :: Flask
20
+ Classifier: Topic :: Text Processing
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: flask>=3.0.3
26
+ Requires-Dist: werkzeug>=3.0.3
27
+ Requires-Dist: pymupdf>=1.24.10
28
+ Requires-Dist: scikit-learn>=1.5.2
29
+ Requires-Dist: networkx>=3.2.1
30
+ Requires-Dist: numpy>=1.26.4
31
+ Requires-Dist: gunicorn>=23.0.0
32
+ Requires-Dist: pyyaml>=6.0.2
33
+ Requires-Dist: requests>=2.32.3
34
+ Requires-Dist: prometheus-client>=0.21.0
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=8.3.3; extra == "dev"
37
+ Requires-Dist: build>=1.2.2; extra == "dev"
38
+ Requires-Dist: twine>=5.1.1; extra == "dev"
39
+ Requires-Dist: fakeredis>=2.26.2; extra == "dev"
40
+ Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
41
+ Requires-Dist: python-docx>=1.1.2; extra == "dev"
42
+ Requires-Dist: openpyxl>=3.1.5; extra == "dev"
43
+ Provides-Extra: documents
44
+ Requires-Dist: python-docx>=1.1.2; extra == "documents"
45
+ Requires-Dist: openpyxl>=3.1.5; extra == "documents"
46
+ Provides-Extra: ocr
47
+ Requires-Dist: easyocr>=1.7.2; extra == "ocr"
48
+ Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
49
+ Requires-Dist: spacy>=3.7.0; extra == "ocr"
50
+ Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
51
+ Requires-Dist: torch>=2.4.1; extra == "ocr"
52
+ Provides-Extra: ui
53
+ Requires-Dist: gradio>=4.44.0; extra == "ui"
54
+ Requires-Dist: requests>=2.32.3; extra == "ui"
55
+ Provides-Extra: llm
56
+ Requires-Dist: openai>=1.54.0; extra == "llm"
57
+ Provides-Extra: jobs
58
+ Requires-Dist: redis>=5.0.8; extra == "jobs"
59
+ Requires-Dist: rq>=1.16.2; extra == "jobs"
60
+ Provides-Extra: storage
61
+ Requires-Dist: boto3>=1.35.0; extra == "storage"
62
+ Provides-Extra: auth
63
+ Requires-Dist: flask-limiter>=3.8.0; extra == "auth"
64
+ Requires-Dist: PyJWT>=2.9.0; extra == "auth"
65
+ Requires-Dist: cryptography>=43.0.0; extra == "auth"
66
+ Provides-Extra: all
67
+ Requires-Dist: easyocr>=1.7.2; extra == "all"
68
+ Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
69
+ Requires-Dist: spacy>=3.7.0; extra == "all"
70
+ Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
71
+ Requires-Dist: torch>=2.4.1; extra == "all"
72
+ Requires-Dist: openai>=1.54.0; extra == "all"
73
+ Requires-Dist: redis>=5.0.8; extra == "all"
74
+ Requires-Dist: rq>=1.16.2; extra == "all"
75
+ Requires-Dist: boto3>=1.35.0; extra == "all"
76
+ Requires-Dist: prometheus-client>=0.21.0; extra == "all"
77
+ Requires-Dist: flask-limiter>=3.8.0; extra == "all"
78
+ Requires-Dist: PyJWT>=2.9.0; extra == "all"
79
+ Requires-Dist: cryptography>=43.0.0; extra == "all"
80
+ Requires-Dist: gradio>=4.44.0; extra == "all"
81
+ Requires-Dist: python-docx>=1.1.2; extra == "all"
82
+ Requires-Dist: openpyxl>=3.1.5; extra == "all"
83
+
84
+ # Document Intelligence Platform
85
+
86
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
87
+ [![Flask](https://img.shields.io/badge/flask-3.0+-green.svg)](https://flask.palletsprojects.com/)
88
+ [![Docker](https://img.shields.io/badge/docker--compose-ready-blue.svg)](docker-compose.yml)
89
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
90
+
91
+ Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
92
+
93
+ **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
94
+
95
+ ---
96
+
97
+ ## Quick start
98
+
99
+ **Docker (API + Gradio UI + worker):**
100
+
101
+ ```bash
102
+ git clone https://github.com/baban9/document-intelligence-platform.git
103
+ cd document-intelligence-platform
104
+ cp .env.example .env # optional: ports, LLM key, auth
105
+ make docker-up
106
+ ```
107
+
108
+ | Service | URL |
109
+ |---------|-----|
110
+ | API | http://127.0.0.1:5000 |
111
+ | Interactive API docs | http://127.0.0.1:5000/docs |
112
+ | Gradio UI | http://127.0.0.1:7860 |
113
+ | Health | http://127.0.0.1:5000/health |
114
+
115
+ Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
116
+
117
+ **pip install:**
118
+
119
+ ```bash
120
+ pip install docintel-platform
121
+ pip install "docintel-platform[all]" # OCR, LLM, jobs, storage, auth, UI, office formats
122
+ pip install "docintel-platform[documents]" # Word and Excel only
123
+ ```
124
+
125
+ **Python client:**
126
+
127
+ ```python
128
+ from docintel import DocintelClient
129
+
130
+ client = DocintelClient("http://127.0.0.1:5000", api_key="your-key")
131
+ summary = client.summarize(report_text, sentences=3)
132
+ report = client.process_document("policy.docx", include_pii=True)
133
+ ```
134
+
135
+ ---
136
+
137
+ ## Capabilities
138
+
139
+ | Area | Endpoints | Notes |
140
+ |------|-----------|-------|
141
+ | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
142
+ | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
143
+ | PDF structure | `POST /v1/pdf/structure` | OCR + LLM-curated PDF (needs LLM key) |
144
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
145
+ | Text | `POST /v1/text/summarize` | TextRank extractive summary |
146
+ | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
147
+ | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
148
+ | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
149
+
150
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
151
+
152
+ **PDF-only routes** (annotate, detect-sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office and other non-PDF files.
153
+
154
+ Full request and response schemas: **http://127.0.0.1:5000/docs** (OpenAPI).
155
+
156
+ ---
157
+
158
+ ## Example requests
159
+
160
+ ```bash
161
+ # Sensitive PDF (digital or scanned)
162
+ curl -X POST http://127.0.0.1:5000/v1/pdf/detect-sensitive \
163
+ -F "file=@contract.pdf" -F "action=Highlight" -o marked.pdf
164
+
165
+ # Unified document pipeline (extract + classify + summarize + PII)
166
+ curl -X POST http://127.0.0.1:5000/v1/documents/process \
167
+ -F "file=@policy.docx" -F "sentences=3"
168
+
169
+ # Async: add ?async=true, then poll /v1/jobs/<job_id>
170
+ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
171
+ -F "file=@policy.docx"
172
+ ```
173
+
174
+ ---
175
+
176
+ ## Local development
177
+
178
+ ```bash
179
+ make setup # venv + dev deps
180
+ make setup-ocr # EasyOCR, Presidio, spaCy model
181
+ make setup-llm # OpenAI client (structure endpoint)
182
+ make setup-ui # Gradio
183
+ make run # API on :5000
184
+ make run-worker # RQ worker (separate terminal, needs Redis)
185
+ make run-ui # Gradio on :7860
186
+ make test
187
+ make eval # offline quality report (summary, classify, process, PII)
188
+ ```
189
+
190
+ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
191
+
192
+ ---
193
+
194
+ ## Documentation
195
+
196
+ | Doc | Contents |
197
+ |-----|----------|
198
+ | [/docs](http://127.0.0.1:5000/docs) | Live OpenAPI / Swagger (authoritative API reference) |
199
+ | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
200
+ | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
201
+ | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
202
+ | [docs/adr/](docs/adr/) | Architecture decision records |
203
+
204
+ ---
205
+
206
+ ## Project layout
207
+
208
+ ```
209
+ src/docintel/
210
+ routes/ HTTP API
211
+ capabilities/ Compliance, extraction, understanding
212
+ jobs/ Async queue (Redis + RQ)
213
+ auth/ API keys, OIDC, rate limits
214
+ storage/ Local or S3 artifacts
215
+ ops/ Logging and metrics
216
+ ```
217
+
218
+ ---
219
+
220
+ ## License
221
+
222
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,139 @@
1
+ # Document Intelligence Platform
2
+
3
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
4
+ [![Flask](https://img.shields.io/badge/flask-3.0+-green.svg)](https://flask.palletsprojects.com/)
5
+ [![Docker](https://img.shields.io/badge/docker--compose-ready-blue.svg)](docker-compose.yml)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
9
+
10
+ **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
11
+
12
+ ---
13
+
14
+ ## Quick start
15
+
16
+ **Docker (API + Gradio UI + worker):**
17
+
18
+ ```bash
19
+ git clone https://github.com/baban9/document-intelligence-platform.git
20
+ cd document-intelligence-platform
21
+ cp .env.example .env # optional: ports, LLM key, auth
22
+ make docker-up
23
+ ```
24
+
25
+ | Service | URL |
26
+ |---------|-----|
27
+ | API | http://127.0.0.1:5000 |
28
+ | Interactive API docs | http://127.0.0.1:5000/docs |
29
+ | Gradio UI | http://127.0.0.1:7860 |
30
+ | Health | http://127.0.0.1:5000/health |
31
+
32
+ Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
33
+
34
+ **pip install:**
35
+
36
+ ```bash
37
+ pip install docintel-platform
38
+ pip install "docintel-platform[all]" # OCR, LLM, jobs, storage, auth, UI, office formats
39
+ pip install "docintel-platform[documents]" # Word and Excel only
40
+ ```
41
+
42
+ **Python client:**
43
+
44
+ ```python
45
+ from docintel import DocintelClient
46
+
47
+ client = DocintelClient("http://127.0.0.1:5000", api_key="your-key")
48
+ summary = client.summarize(report_text, sentences=3)
49
+ report = client.process_document("policy.docx", include_pii=True)
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Capabilities
55
+
56
+ | Area | Endpoints | Notes |
57
+ |------|-----------|-------|
58
+ | PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
59
+ | PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
60
+ | PDF structure | `POST /v1/pdf/structure` | OCR + LLM-curated PDF (needs LLM key) |
61
+ | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
62
+ | Text | `POST /v1/text/summarize` | TextRank extractive summary |
63
+ | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
64
+ | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
65
+ | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
66
+
67
+ **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
68
+
69
+ **PDF-only routes** (annotate, detect-sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office and other non-PDF files.
70
+
71
+ Full request and response schemas: **http://127.0.0.1:5000/docs** (OpenAPI).
72
+
73
+ ---
74
+
75
+ ## Example requests
76
+
77
+ ```bash
78
+ # Sensitive PDF (digital or scanned)
79
+ curl -X POST http://127.0.0.1:5000/v1/pdf/detect-sensitive \
80
+ -F "file=@contract.pdf" -F "action=Highlight" -o marked.pdf
81
+
82
+ # Unified document pipeline (extract + classify + summarize + PII)
83
+ curl -X POST http://127.0.0.1:5000/v1/documents/process \
84
+ -F "file=@policy.docx" -F "sentences=3"
85
+
86
+ # Async: add ?async=true, then poll /v1/jobs/<job_id>
87
+ curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
88
+ -F "file=@policy.docx"
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Local development
94
+
95
+ ```bash
96
+ make setup # venv + dev deps
97
+ make setup-ocr # EasyOCR, Presidio, spaCy model
98
+ make setup-llm # OpenAI client (structure endpoint)
99
+ make setup-ui # Gradio
100
+ make run # API on :5000
101
+ make run-worker # RQ worker (separate terminal, needs Redis)
102
+ make run-ui # Gradio on :7860
103
+ make test
104
+ make eval # offline quality report (summary, classify, process, PII)
105
+ ```
106
+
107
+ Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
108
+
109
+ ---
110
+
111
+ ## Documentation
112
+
113
+ | Doc | Contents |
114
+ |-----|----------|
115
+ | [/docs](http://127.0.0.1:5000/docs) | Live OpenAPI / Swagger (authoritative API reference) |
116
+ | [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
117
+ | [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
118
+ | [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
119
+ | [docs/adr/](docs/adr/) | Architecture decision records |
120
+
121
+ ---
122
+
123
+ ## Project layout
124
+
125
+ ```
126
+ src/docintel/
127
+ routes/ HTTP API
128
+ capabilities/ Compliance, extraction, understanding
129
+ jobs/ Async queue (Redis + RQ)
130
+ auth/ API keys, OIDC, rate limits
131
+ storage/ Local or S3 artifacts
132
+ ops/ Logging and metrics
133
+ ```
134
+
135
+ ---
136
+
137
+ ## License
138
+
139
+ MIT. See [LICENSE](LICENSE).
@@ -4,16 +4,16 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docintel-platform"
7
- version = "1.0.2"
8
- description = "Document intelligence API and Python client for PDF OCR, PII detection, LLM structuring, matching, and summarization."
7
+ version = "1.2.0"
8
+ description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
9
9
  readme = "README.md"
10
10
  license = "MIT"
11
11
  license-files = []
12
12
  requires-python = ">=3.9"
13
13
  authors = [{name = "Babandeep Singh"}]
14
14
  keywords = [
15
- "nlp", "pdf", "flask", "document-ai", "resume-matching",
16
- "ocr", "pii", "presidio", "openapi", "document-intelligence",
15
+ "nlp", "pdf", "flask", "document-ai",
16
+ "ocr", "pii", "presidio", "openapi", "document-intelligence", "compliance",
17
17
  ]
18
18
  classifiers = [
19
19
  "Development Status :: 4 - Beta",
@@ -38,6 +38,7 @@ dependencies = [
38
38
  "gunicorn>=23.0.0",
39
39
  "pyyaml>=6.0.2",
40
40
  "requests>=2.32.3",
41
+ "prometheus-client>=0.21.0",
41
42
  ]
42
43
 
43
44
  [project.optional-dependencies]
@@ -46,6 +47,13 @@ dev = [
46
47
  "build>=1.2.2",
47
48
  "twine>=5.1.1",
48
49
  "fakeredis>=2.26.2",
50
+ "prometheus-client>=0.21.0",
51
+ "python-docx>=1.1.2",
52
+ "openpyxl>=3.1.5",
53
+ ]
54
+ documents = [
55
+ "python-docx>=1.1.2",
56
+ "openpyxl>=3.1.5",
49
57
  ]
50
58
  ocr = [
51
59
  "easyocr>=1.7.2",
@@ -65,6 +73,9 @@ jobs = [
65
73
  "redis>=5.0.8",
66
74
  "rq>=1.16.2",
67
75
  ]
76
+ storage = [
77
+ "boto3>=1.35.0",
78
+ ]
68
79
  auth = [
69
80
  "flask-limiter>=3.8.0",
70
81
  "PyJWT>=2.9.0",
@@ -79,10 +90,14 @@ all = [
79
90
  "openai>=1.54.0",
80
91
  "redis>=5.0.8",
81
92
  "rq>=1.16.2",
93
+ "boto3>=1.35.0",
94
+ "prometheus-client>=0.21.0",
82
95
  "flask-limiter>=3.8.0",
83
96
  "PyJWT>=2.9.0",
84
97
  "cryptography>=43.0.0",
85
98
  "gradio>=4.44.0",
99
+ "python-docx>=1.1.2",
100
+ "openpyxl>=3.1.5",
86
101
  ]
87
102
 
88
103
  [project.scripts]
@@ -2,5 +2,5 @@
2
2
 
3
3
  from docintel.client import DocintelClient, DocintelError
4
4
 
5
- __version__ = "1.0.0"
5
+ __version__ = "1.2.0"
6
6
  __all__ = ["DocintelClient", "DocintelError", "__version__"]
@@ -8,9 +8,10 @@ from docintel.auth.limiter import init_limiter
8
8
  from docintel.auth.middleware import register_auth
9
9
  from docintel.ops.logging import configure_logging
10
10
  from docintel.ops.middleware import register_request_hooks
11
+ from docintel.routes.documents import documents_bp
12
+ from docintel.routes.batch import batch_bp
11
13
  from docintel.routes.jobs import jobs_bp
12
14
  from docintel.routes.openapi_docs import docs_bp
13
- from docintel.routes.match import match_bp
14
15
  from docintel.routes.ops import ops_bp
15
16
  from docintel.routes.pdf import pdf_bp
16
17
  from docintel.routes.text import text_bp
@@ -36,9 +37,10 @@ def create_app(config: type[Config] = Config) -> Flask:
36
37
  )
37
38
 
38
39
  app.register_blueprint(docs_bp)
40
+ app.register_blueprint(documents_bp)
39
41
  app.register_blueprint(pdf_bp)
40
42
  app.register_blueprint(jobs_bp)
41
- app.register_blueprint(match_bp)
43
+ app.register_blueprint(batch_bp)
42
44
  app.register_blueprint(text_bp)
43
45
  app.register_blueprint(ops_bp)
44
46
 
@@ -0,0 +1 @@
1
+ """Enterprise capability modules."""
@@ -0,0 +1,14 @@
1
+ """Compliance capabilities (PII detection, sensitive PDF scanning)."""
2
+
3
+ from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
4
+ from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
5
+
6
+ __all__ = [
7
+ "DEFAULT_PII_ENTITIES",
8
+ "MIN_NATIVE_TEXT_CHARS",
9
+ "OCR_RENDER_SCALE",
10
+ "PIIHit",
11
+ "detect_pii_in_text",
12
+ "list_supported_entities",
13
+ "mask_pii_in_text",
14
+ ]
@@ -6,7 +6,7 @@ from dataclasses import dataclass
6
6
  from functools import lru_cache
7
7
  from typing import Sequence
8
8
 
9
- from docintel.services.pdf.presets import DEFAULT_PII_ENTITIES
9
+ from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES
10
10
 
11
11
 
12
12
  @dataclass(frozen=True)
@@ -89,7 +89,9 @@ def mask_pii_in_text(
89
89
 
90
90
  Returns masked text and the number of entities redacted.
91
91
  """
92
- hits = detect_pii_in_text(
92
+ from docintel.services.pdf import pii as pii_compat
93
+
94
+ hits = pii_compat.detect_pii_in_text(
93
95
  text,
94
96
  entities=entities,
95
97
  language=language,
@@ -0,0 +1,80 @@
1
+ """Default Presidio entity presets (extend via API or custom recognizers)."""
2
+
3
+ # Core Presidio entities suitable for legal, finance, and compliance workflows.
4
+ DEFAULT_PII_ENTITIES: tuple[str, ...] = (
5
+ "EMAIL_ADDRESS",
6
+ "PHONE_NUMBER",
7
+ "US_SSN",
8
+ "CREDIT_CARD",
9
+ "US_BANK_NUMBER",
10
+ "US_DRIVER_LICENSE",
11
+ "US_ITIN",
12
+ "US_PASSPORT",
13
+ "PERSON",
14
+ "LOCATION",
15
+ "DATE_TIME",
16
+ "IP_ADDRESS",
17
+ "IBAN_CODE",
18
+ "MEDICAL_LICENSE",
19
+ "URL",
20
+ )
21
+
22
+ VERTICAL_ENTITY_PRESETS: dict[str, tuple[str, ...]] = {
23
+ "general": DEFAULT_PII_ENTITIES,
24
+ "healthcare": (
25
+ "PERSON",
26
+ "PHONE_NUMBER",
27
+ "EMAIL_ADDRESS",
28
+ "DATE_TIME",
29
+ "LOCATION",
30
+ "US_SSN",
31
+ "MEDICAL_LICENSE",
32
+ "US_DRIVER_LICENSE",
33
+ "URL",
34
+ ),
35
+ "financial": (
36
+ "PERSON",
37
+ "EMAIL_ADDRESS",
38
+ "PHONE_NUMBER",
39
+ "US_SSN",
40
+ "CREDIT_CARD",
41
+ "US_BANK_NUMBER",
42
+ "IBAN_CODE",
43
+ "US_ITIN",
44
+ "DATE_TIME",
45
+ "LOCATION",
46
+ "URL",
47
+ ),
48
+ "legal": (
49
+ "PERSON",
50
+ "EMAIL_ADDRESS",
51
+ "PHONE_NUMBER",
52
+ "LOCATION",
53
+ "DATE_TIME",
54
+ "US_PASSPORT",
55
+ "US_DRIVER_LICENSE",
56
+ "US_SSN",
57
+ "URL",
58
+ ),
59
+ }
60
+
61
+ # Minimum extracted characters before a page is treated as scanned (OCR fallback).
62
+ MIN_NATIVE_TEXT_CHARS = 20
63
+
64
+ # EasyOCR render scale (higher improves accuracy, increases memory).
65
+ OCR_RENDER_SCALE = 2.0
66
+
67
+
68
+ def list_vertical_presets() -> dict[str, list[str]]:
69
+ """Return named entity packs for vertical workflows."""
70
+ return {name: list(entities) for name, entities in VERTICAL_ENTITY_PRESETS.items()}
71
+
72
+
73
+ def entities_for_vertical(name: str) -> tuple[str, ...]:
74
+ """Resolve a vertical preset name to a Presidio entity list."""
75
+ key = name.strip().lower()
76
+ try:
77
+ return VERTICAL_ENTITY_PRESETS[key]
78
+ except KeyError as exc:
79
+ valid = ", ".join(sorted(VERTICAL_ENTITY_PRESETS))
80
+ raise ValueError(f"Unknown vertical preset '{name}'. Choose from: {valid}") from exc
@@ -18,7 +18,7 @@ from docintel.services.pdf.ocr import (
18
18
  page_has_native_text,
19
19
  rects_for_char_range,
20
20
  )
21
- from docintel.services.pdf.pii import PIIHit, detect_pii_in_text
21
+ from docintel.capabilities.compliance.pii import PIIHit
22
22
  from docintel.services.pdf.search import search_for_text
23
23
 
24
24
 
@@ -124,7 +124,9 @@ def detect_sensitive_pdf(
124
124
 
125
125
  Uses native PDF text when available. Falls back to EasyOCR for scanned pages.
126
126
  """
127
- _ensure_ocr_stack()
127
+ from docintel.services.pdf import sensitive as sensitive_compat
128
+
129
+ sensitive_compat._ensure_ocr_stack()
128
130
  selected_action = action if isinstance(action, Action) else Action.from_value(action)
129
131
  if selected_action == Action.REMOVE:
130
132
  raise ValueError("Action 'Remove' is not supported for sensitive detection.")
@@ -159,7 +161,7 @@ def detect_sensitive_pdf(
159
161
  indexed = []
160
162
  page_text = page.get_text("text")
161
163
 
162
- hits = detect_pii_in_text(page_text, entities=entities, min_score=min_score)
164
+ hits = sensitive_compat.detect_pii_in_text(page_text, entities=entities, min_score=min_score)
163
165
  if pattern:
164
166
  hits.extend(_regex_hits(page_text, pattern))
165
167
 
@@ -0,0 +1,29 @@
1
+ """Document extraction capabilities (OCR, LLM structuring)."""
2
+
3
+ from docintel.capabilities.extraction.ocr import (
4
+ OCRSpan,
5
+ build_indexed_text,
6
+ embed_invisible_text_layer,
7
+ extract_page_ocr,
8
+ merge_rects,
9
+ page_has_native_text,
10
+ rects_for_char_range,
11
+ )
12
+ from docintel.capabilities.extraction.structure_schema import (
13
+ SectionBlock,
14
+ StructuredDocument,
15
+ StructuredPage,
16
+ )
17
+
18
+ __all__ = [
19
+ "OCRSpan",
20
+ "SectionBlock",
21
+ "StructuredDocument",
22
+ "StructuredPage",
23
+ "build_indexed_text",
24
+ "embed_invisible_text_layer",
25
+ "extract_page_ocr",
26
+ "merge_rects",
27
+ "page_has_native_text",
28
+ "rects_for_char_range",
29
+ ]
@@ -0,0 +1,27 @@
1
+ """Multi-format document identification and text extraction."""
2
+
3
+ from docintel.capabilities.extraction.formats.extract import extract_document_text
4
+ from docintel.capabilities.extraction.formats.models import (
5
+ DocumentKind,
6
+ DocumentProfile,
7
+ ExtractionResult,
8
+ IdentificationResult,
9
+ )
10
+ from docintel.capabilities.extraction.formats.registry import (
11
+ get_profile,
12
+ list_supported_types,
13
+ profiles_for_kind,
14
+ )
15
+ from docintel.capabilities.extraction.formats.sniff import identify_document
16
+
17
+ __all__ = [
18
+ "DocumentKind",
19
+ "DocumentProfile",
20
+ "ExtractionResult",
21
+ "IdentificationResult",
22
+ "extract_document_text",
23
+ "get_profile",
24
+ "identify_document",
25
+ "list_supported_types",
26
+ "profiles_for_kind",
27
+ ]