docintel-platform 1.0.2__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docintel_platform-1.2.0/PKG-INFO +222 -0
- docintel_platform-1.2.0/README.md +139 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/pyproject.toml +19 -4
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/__init__.py +1 -1
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/app.py +4 -2
- docintel_platform-1.2.0/src/docintel/capabilities/__init__.py +1 -0
- docintel_platform-1.2.0/src/docintel/capabilities/compliance/__init__.py +14 -0
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/compliance}/pii.py +4 -2
- docintel_platform-1.2.0/src/docintel/capabilities/compliance/presets.py +80 -0
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/compliance}/sensitive.py +5 -3
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/__init__.py +29 -0
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/__init__.py +27 -0
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/extract.py +171 -0
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/models.py +77 -0
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/registry.py +100 -0
- docintel_platform-1.2.0/src/docintel/capabilities/extraction/formats/sniff.py +171 -0
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/ocr.py +1 -1
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure.py +6 -5
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_llm.py +1 -1
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_render.py +1 -1
- docintel_platform-1.2.0/src/docintel/capabilities/pdf/__init__.py +23 -0
- {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/annotator.py +2 -2
- docintel_platform-1.2.0/src/docintel/capabilities/pipeline/__init__.py +10 -0
- docintel_platform-1.2.0/src/docintel/capabilities/pipeline/process.py +210 -0
- docintel_platform-1.2.0/src/docintel/capabilities/understanding/__init__.py +6 -0
- docintel_platform-1.2.0/src/docintel/capabilities/understanding/classify.py +101 -0
- docintel_platform-1.2.0/src/docintel/capabilities/understanding/compare.py +49 -0
- {docintel_platform-1.0.2/src/docintel/services/summary → docintel_platform-1.2.0/src/docintel/capabilities/understanding}/textrank.py +1 -1
- docintel_platform-1.2.0/src/docintel/client.py +418 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/config.py +6 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/models.py +11 -0
- docintel_platform-1.2.0/src/docintel/jobs/queue.py +308 -0
- docintel_platform-1.2.0/src/docintel/jobs/tasks.py +663 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/webhooks.py +28 -1
- docintel_platform-1.2.0/src/docintel/openapi/openapi.yaml +708 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/middleware.py +4 -0
- docintel_platform-1.2.0/src/docintel/ops/prometheus.py +62 -0
- docintel_platform-1.2.0/src/docintel/routes/async_enqueue.py +29 -0
- docintel_platform-1.2.0/src/docintel/routes/batch.py +169 -0
- docintel_platform-1.2.0/src/docintel/routes/document_upload.py +81 -0
- docintel_platform-1.2.0/src/docintel/routes/documents.py +576 -0
- docintel_platform-1.2.0/src/docintel/routes/ops.py +32 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/pdf.py +137 -52
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/text.py +23 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/services/pdf/__init__.py +3 -1
- docintel_platform-1.2.0/src/docintel/services/pdf/annotator.py +19 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/models.py +17 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/ocr.py +23 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/pii.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/presets.py +19 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/search.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/sensitive.py +6 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/structure.py +6 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/structure_llm.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/structure_render.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/pdf/structure_schema.py +10 -0
- docintel_platform-1.2.0/src/docintel/services/summary/__init__.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/summary/models.py +5 -0
- docintel_platform-1.2.0/src/docintel/services/summary/textrank.py +9 -0
- docintel_platform-1.2.0/src/docintel/storage/__init__.py +33 -0
- docintel_platform-1.2.0/src/docintel/storage/local.py +30 -0
- docintel_platform-1.2.0/src/docintel/storage/s3.py +76 -0
- docintel_platform-1.2.0/src/docintel/ui.py +607 -0
- docintel_platform-1.2.0/src/docintel_platform.egg-info/PKG-INFO +222 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/SOURCES.txt +49 -6
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/requires.txt +15 -0
- docintel_platform-1.2.0/tests/test_annotate_async.py +76 -0
- docintel_platform-1.2.0/tests/test_batch.py +96 -0
- docintel_platform-1.2.0/tests/test_document_formats.py +180 -0
- docintel_platform-1.2.0/tests/test_documents_async_routes.py +115 -0
- docintel_platform-1.2.0/tests/test_documents_classify.py +30 -0
- docintel_platform-1.2.0/tests/test_documents_compare.py +30 -0
- docintel_platform-1.2.0/tests/test_documents_compare_files.py +40 -0
- docintel_platform-1.2.0/tests/test_documents_detect_pii.py +39 -0
- docintel_platform-1.2.0/tests/test_documents_process.py +78 -0
- docintel_platform-1.2.0/tests/test_documents_process_async.py +81 -0
- docintel_platform-1.2.0/tests/test_documents_summarize.py +43 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_health.py +1 -1
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_ops.py +11 -1
- docintel_platform-1.2.0/tests/test_storage.py +17 -0
- docintel_platform-1.2.0/tests/test_ui_process.py +28 -0
- docintel_platform-1.2.0/tests/test_vertical_presets.py +32 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_webhooks.py +18 -0
- docintel_platform-1.0.2/PKG-INFO +0 -607
- docintel_platform-1.0.2/README.md +0 -537
- docintel_platform-1.0.2/src/docintel/client.py +0 -193
- docintel_platform-1.0.2/src/docintel/jobs/queue.py +0 -75
- docintel_platform-1.0.2/src/docintel/jobs/tasks.py +0 -173
- docintel_platform-1.0.2/src/docintel/openapi/openapi.yaml +0 -380
- docintel_platform-1.0.2/src/docintel/routes/match.py +0 -43
- docintel_platform-1.0.2/src/docintel/routes/ops.py +0 -22
- docintel_platform-1.0.2/src/docintel/services/matching/__init__.py +0 -6
- docintel_platform-1.0.2/src/docintel/services/matching/models.py +0 -19
- docintel_platform-1.0.2/src/docintel/services/matching/scorer.py +0 -64
- docintel_platform-1.0.2/src/docintel/services/pdf/presets.py +0 -26
- docintel_platform-1.0.2/src/docintel/services/summary/__init__.py +0 -6
- docintel_platform-1.0.2/src/docintel/ui.py +0 -347
- docintel_platform-1.0.2/src/docintel_platform.egg-info/PKG-INFO +0 -607
- docintel_platform-1.0.2/tests/test_matching_routes.py +0 -64
- docintel_platform-1.0.2/tests/test_matching_service.py +0 -59
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/LICENSE +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/MANIFEST.in +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/setup.cfg +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/api_keys.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/limiter.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/middleware.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/auth/oidc.py +0 -0
- {docintel_platform-1.0.2/src/docintel/services/pdf → docintel_platform-1.2.0/src/docintel/capabilities/extraction}/structure_schema.py +0 -0
- {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/models.py +0 -0
- {docintel_platform-1.0.2/src/docintel/services → docintel_platform-1.2.0/src/docintel/capabilities}/pdf/search.py +0 -0
- {docintel_platform-1.0.2/src/docintel/services/summary → docintel_platform-1.2.0/src/docintel/capabilities/understanding}/models.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/cli.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/helpers.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/jobs/store.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/openapi/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/logging.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/ops/metrics.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/jobs.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/routes/openapi_docs.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/services/__init__.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel/wsgi.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_auth.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_client.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_detect_sensitive_async.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_jobs.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_oidc.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_openapi.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_routes.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_sensitive.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_service.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pdf_structure.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_pii_mask.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_structure_pii.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_summary_routes.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_summary_service.py +0 -0
- {docintel_platform-1.0.2 → docintel_platform-1.2.0}/tests/test_ui.py +0 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docintel-platform
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
|
|
5
|
+
Author: Babandeep Singh
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/baban9/document-intelligence-platform
|
|
8
|
+
Project-URL: Repository, https://github.com/baban9/document-intelligence-platform
|
|
9
|
+
Project-URL: Documentation, https://github.com/baban9/document-intelligence-platform#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/baban9/document-intelligence-platform/issues
|
|
11
|
+
Keywords: nlp,pdf,flask,document-ai,ocr,pii,presidio,openapi,document-intelligence,compliance
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Framework :: Flask
|
|
20
|
+
Classifier: Topic :: Text Processing
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
Requires-Dist: flask>=3.0.3
|
|
26
|
+
Requires-Dist: werkzeug>=3.0.3
|
|
27
|
+
Requires-Dist: pymupdf>=1.24.10
|
|
28
|
+
Requires-Dist: scikit-learn>=1.5.2
|
|
29
|
+
Requires-Dist: networkx>=3.2.1
|
|
30
|
+
Requires-Dist: numpy>=1.26.4
|
|
31
|
+
Requires-Dist: gunicorn>=23.0.0
|
|
32
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
33
|
+
Requires-Dist: requests>=2.32.3
|
|
34
|
+
Requires-Dist: prometheus-client>=0.21.0
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest>=8.3.3; extra == "dev"
|
|
37
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
38
|
+
Requires-Dist: twine>=5.1.1; extra == "dev"
|
|
39
|
+
Requires-Dist: fakeredis>=2.26.2; extra == "dev"
|
|
40
|
+
Requires-Dist: prometheus-client>=0.21.0; extra == "dev"
|
|
41
|
+
Requires-Dist: python-docx>=1.1.2; extra == "dev"
|
|
42
|
+
Requires-Dist: openpyxl>=3.1.5; extra == "dev"
|
|
43
|
+
Provides-Extra: documents
|
|
44
|
+
Requires-Dist: python-docx>=1.1.2; extra == "documents"
|
|
45
|
+
Requires-Dist: openpyxl>=3.1.5; extra == "documents"
|
|
46
|
+
Provides-Extra: ocr
|
|
47
|
+
Requires-Dist: easyocr>=1.7.2; extra == "ocr"
|
|
48
|
+
Requires-Dist: presidio-analyzer>=2.2.354; extra == "ocr"
|
|
49
|
+
Requires-Dist: spacy>=3.7.0; extra == "ocr"
|
|
50
|
+
Requires-Dist: opencv-python-headless>=4.10.0; extra == "ocr"
|
|
51
|
+
Requires-Dist: torch>=2.4.1; extra == "ocr"
|
|
52
|
+
Provides-Extra: ui
|
|
53
|
+
Requires-Dist: gradio>=4.44.0; extra == "ui"
|
|
54
|
+
Requires-Dist: requests>=2.32.3; extra == "ui"
|
|
55
|
+
Provides-Extra: llm
|
|
56
|
+
Requires-Dist: openai>=1.54.0; extra == "llm"
|
|
57
|
+
Provides-Extra: jobs
|
|
58
|
+
Requires-Dist: redis>=5.0.8; extra == "jobs"
|
|
59
|
+
Requires-Dist: rq>=1.16.2; extra == "jobs"
|
|
60
|
+
Provides-Extra: storage
|
|
61
|
+
Requires-Dist: boto3>=1.35.0; extra == "storage"
|
|
62
|
+
Provides-Extra: auth
|
|
63
|
+
Requires-Dist: flask-limiter>=3.8.0; extra == "auth"
|
|
64
|
+
Requires-Dist: PyJWT>=2.9.0; extra == "auth"
|
|
65
|
+
Requires-Dist: cryptography>=43.0.0; extra == "auth"
|
|
66
|
+
Provides-Extra: all
|
|
67
|
+
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
|
68
|
+
Requires-Dist: presidio-analyzer>=2.2.354; extra == "all"
|
|
69
|
+
Requires-Dist: spacy>=3.7.0; extra == "all"
|
|
70
|
+
Requires-Dist: opencv-python-headless>=4.10.0; extra == "all"
|
|
71
|
+
Requires-Dist: torch>=2.4.1; extra == "all"
|
|
72
|
+
Requires-Dist: openai>=1.54.0; extra == "all"
|
|
73
|
+
Requires-Dist: redis>=5.0.8; extra == "all"
|
|
74
|
+
Requires-Dist: rq>=1.16.2; extra == "all"
|
|
75
|
+
Requires-Dist: boto3>=1.35.0; extra == "all"
|
|
76
|
+
Requires-Dist: prometheus-client>=0.21.0; extra == "all"
|
|
77
|
+
Requires-Dist: flask-limiter>=3.8.0; extra == "all"
|
|
78
|
+
Requires-Dist: PyJWT>=2.9.0; extra == "all"
|
|
79
|
+
Requires-Dist: cryptography>=43.0.0; extra == "all"
|
|
80
|
+
Requires-Dist: gradio>=4.44.0; extra == "all"
|
|
81
|
+
Requires-Dist: python-docx>=1.1.2; extra == "all"
|
|
82
|
+
Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
83
|
+
|
|
84
|
+
# Document Intelligence Platform
|
|
85
|
+
|
|
86
|
+
[Python](https://www.python.org/downloads/)
|
|
87
|
+
[Flask](https://flask.palletsprojects.com/)
|
|
88
|
+
[Docker Compose](docker-compose.yml)
|
|
89
|
+
[License](LICENSE)
|
|
90
|
+
|
|
91
|
+
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
92
|
+
|
|
93
|
+
**Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Quick start
|
|
98
|
+
|
|
99
|
+
**Docker (API + Gradio UI + worker):**
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
103
|
+
cd document-intelligence-platform
|
|
104
|
+
cp .env.example .env # optional: ports, LLM key, auth
|
|
105
|
+
make docker-up
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
| Service | URL |
|
|
109
|
+
|---------|-----|
|
|
110
|
+
| API | http://127.0.0.1:5000 |
|
|
111
|
+
| Interactive API docs | http://127.0.0.1:5000/docs |
|
|
112
|
+
| Gradio UI | http://127.0.0.1:7860 |
|
|
113
|
+
| Health | http://127.0.0.1:5000/health |
|
|
114
|
+
|
|
115
|
+
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
116
|
+
|
|
117
|
+
**pip install:**
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pip install docintel-platform
|
|
121
|
+
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
122
|
+
pip install "docintel-platform[documents]" # Word and Excel only
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Python client:**
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from docintel import DocintelClient
|
|
129
|
+
|
|
130
|
+
client = DocintelClient("http://127.0.0.1:5000", api_key="your-key")
|
|
131
|
+
summary = client.summarize(report_text, sentences=3)
|
|
132
|
+
report = client.process_document("policy.docx", include_pii=True)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Capabilities
|
|
138
|
+
|
|
139
|
+
| Area | Endpoints | Notes |
|
|
140
|
+
|------|-----------|-------|
|
|
141
|
+
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
142
|
+
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
143
|
+
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
144
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
145
|
+
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
146
|
+
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
147
|
+
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
148
|
+
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
149
|
+
|
|
150
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
151
|
+
|
|
152
|
+
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
153
|
+
|
|
154
|
+
Full request and response schemas: **http://127.0.0.1:5000/docs** (OpenAPI).
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Example requests
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
# Sensitive PDF (digital or scanned)
|
|
162
|
+
curl -X POST http://127.0.0.1:5000/v1/pdf/detect-sensitive \
|
|
163
|
+
-F "file=@contract.pdf" -F "action=Highlight" -o marked.pdf
|
|
164
|
+
|
|
165
|
+
# Unified document pipeline (extract + classify + summarize + PII)
|
|
166
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
167
|
+
-F "file=@policy.docx" -F "sentences=3"
|
|
168
|
+
|
|
169
|
+
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
170
|
+
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
171
|
+
-F "file=@policy.docx"
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Local development
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
make setup # venv + dev deps
|
|
180
|
+
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
181
|
+
make setup-llm # OpenAI client (structure endpoint)
|
|
182
|
+
make setup-ui # Gradio
|
|
183
|
+
make run # API on :5000
|
|
184
|
+
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
185
|
+
make run-ui # Gradio on :7860
|
|
186
|
+
make test
|
|
187
|
+
make eval # offline quality report (summary, classify, process, PII)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Documentation
|
|
195
|
+
|
|
196
|
+
| Doc | Contents |
|
|
197
|
+
|-----|----------|
|
|
198
|
+
| [/docs](http://127.0.0.1:5000/docs) | Live OpenAPI / Swagger (authoritative API reference) |
|
|
199
|
+
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
200
|
+
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
201
|
+
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
202
|
+
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Project layout
|
|
207
|
+
|
|
208
|
+
```
|
|
209
|
+
src/docintel/
|
|
210
|
+
routes/ HTTP API
|
|
211
|
+
capabilities/ Compliance, extraction, understanding
|
|
212
|
+
jobs/ Async queue (Redis + RQ)
|
|
213
|
+
auth/ API keys, OIDC, rate limits
|
|
214
|
+
storage/ Local or S3 artifacts
|
|
215
|
+
ops/ Logging and metrics
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## License
|
|
221
|
+
|
|
222
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Document Intelligence Platform
|
|
2
|
+
|
|
3
|
+
[Python](https://www.python.org/downloads/)
|
|
4
|
+
[Flask](https://flask.palletsprojects.com/)
|
|
5
|
+
[Docker Compose](docker-compose.yml)
|
|
6
|
+
[License](LICENSE)
|
|
7
|
+
|
|
8
|
+
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
9
|
+
|
|
10
|
+
**Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Quick start
|
|
15
|
+
|
|
16
|
+
**Docker (API + Gradio UI + worker):**
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git clone https://github.com/baban9/document-intelligence-platform.git
|
|
20
|
+
cd document-intelligence-platform
|
|
21
|
+
cp .env.example .env # optional: ports, LLM key, auth
|
|
22
|
+
make docker-up
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
| Service | URL |
|
|
26
|
+
|---------|-----|
|
|
27
|
+
| API | http://127.0.0.1:5000 |
|
|
28
|
+
| Interactive API docs | http://127.0.0.1:5000/docs |
|
|
29
|
+
| Gradio UI | http://127.0.0.1:7860 |
|
|
30
|
+
| Health | http://127.0.0.1:5000/health |
|
|
31
|
+
|
|
32
|
+
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
33
|
+
|
|
34
|
+
**pip install:**
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install docintel-platform
|
|
38
|
+
pip install "docintel-platform[all]" # OCR, LLM, jobs, auth, UI, office formats
|
|
39
|
+
pip install "docintel-platform[documents]" # Word and Excel only
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**Python client:**
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from docintel import DocintelClient
|
|
46
|
+
|
|
47
|
+
client = DocintelClient("http://127.0.0.1:5000", api_key="your-key")
|
|
48
|
+
summary = client.summarize(report_text, sentences=3)
|
|
49
|
+
report = client.process_document("policy.docx", include_pii=True)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Capabilities
|
|
55
|
+
|
|
56
|
+
| Area | Endpoints | Notes |
|
|
57
|
+
|------|-----------|-------|
|
|
58
|
+
| PDF annotate | `POST /v1/pdf/annotate` | Regex highlight, redact, markup |
|
|
59
|
+
| PDF PII scan | `POST /v1/pdf/detect-sensitive` | Presidio + OCR for scanned PDFs |
|
|
60
|
+
| PDF structure | `POST /v1/pdf/structure` | OCR + LLM curated PDF (needs LLM key) |
|
|
61
|
+
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
62
|
+
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
63
|
+
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
64
|
+
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
65
|
+
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
66
|
+
|
|
67
|
+
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
68
|
+
|
|
69
|
+
**PDF-only routes** (annotate, sensitive, structure) return HTTP 415 for other types. Use `/v1/documents/extract-text` or `/v1/documents/process` for office files.
|
|
70
|
+
|
|
71
|
+
Full request and response schemas: **http://127.0.0.1:5000/docs** (OpenAPI).
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Example requests
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Sensitive PDF (digital or scanned)
|
|
79
|
+
curl -X POST http://127.0.0.1:5000/v1/pdf/detect-sensitive \
|
|
80
|
+
-F "file=@contract.pdf" -F "action=Highlight" -o marked.pdf
|
|
81
|
+
|
|
82
|
+
# Unified document pipeline (extract + classify + summarize + PII)
|
|
83
|
+
curl -X POST http://127.0.0.1:5000/v1/documents/process \
|
|
84
|
+
-F "file=@policy.docx" -F "sentences=3"
|
|
85
|
+
|
|
86
|
+
# Async: add ?async=true, then poll /v1/jobs/<job_id>
|
|
87
|
+
curl -X POST "http://127.0.0.1:5000/v1/documents/process?async=true" \
|
|
88
|
+
-F "file=@policy.docx"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Local development
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
make setup # venv + dev deps
|
|
97
|
+
make setup-ocr # EasyOCR, Presidio, spaCy model
|
|
98
|
+
make setup-llm # OpenAI client (structure endpoint)
|
|
99
|
+
make setup-ui # Gradio
|
|
100
|
+
make run # API on :5000
|
|
101
|
+
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
102
|
+
make run-ui # Gradio on :7860
|
|
103
|
+
make test
|
|
104
|
+
make eval # offline quality report (summary, classify, process, PII)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Documentation
|
|
112
|
+
|
|
113
|
+
| Doc | Contents |
|
|
114
|
+
|-----|----------|
|
|
115
|
+
| [/docs](http://127.0.0.1:5000/docs) | Live OpenAPI / Swagger (authoritative API reference) |
|
|
116
|
+
| [docs/PLATFORM.md](docs/PLATFORM.md) | Jobs, auth, storage, ops layout |
|
|
117
|
+
| [docs/PRODUCTION.md](docs/PRODUCTION.md) | Checklist, latency, failure modes |
|
|
118
|
+
| [docs/ROADMAP.md](docs/ROADMAP.md) | Milestones and history |
|
|
119
|
+
| [docs/adr/](docs/adr/) | Architecture decision records |
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Project layout
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
src/docintel/
|
|
127
|
+
routes/ HTTP API
|
|
128
|
+
capabilities/ Compliance, extraction, understanding
|
|
129
|
+
jobs/ Async queue (Redis + RQ)
|
|
130
|
+
auth/ API keys, OIDC, rate limits
|
|
131
|
+
storage/ Local or S3 artifacts
|
|
132
|
+
ops/ Logging and metrics
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -4,16 +4,16 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docintel-platform"
|
|
7
|
-
version = "1.0.2"
|
|
8
|
-
description = "
|
|
7
|
+
version = "1.2.0"
|
|
8
|
+
description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
11
11
|
license-files = []
|
|
12
12
|
requires-python = ">=3.9"
|
|
13
13
|
authors = [{name = "Babandeep Singh"}]
|
|
14
14
|
keywords = [
|
|
15
|
-
"nlp", "pdf", "flask", "document-ai",
|
|
16
|
-
"ocr", "pii", "presidio", "openapi", "document-intelligence",
|
|
15
|
+
"nlp", "pdf", "flask", "document-ai",
|
|
16
|
+
"ocr", "pii", "presidio", "openapi", "document-intelligence", "compliance",
|
|
17
17
|
]
|
|
18
18
|
classifiers = [
|
|
19
19
|
"Development Status :: 4 - Beta",
|
|
@@ -38,6 +38,7 @@ dependencies = [
|
|
|
38
38
|
"gunicorn>=23.0.0",
|
|
39
39
|
"pyyaml>=6.0.2",
|
|
40
40
|
"requests>=2.32.3",
|
|
41
|
+
"prometheus-client>=0.21.0",
|
|
41
42
|
]
|
|
42
43
|
|
|
43
44
|
[project.optional-dependencies]
|
|
@@ -46,6 +47,13 @@ dev = [
|
|
|
46
47
|
"build>=1.2.2",
|
|
47
48
|
"twine>=5.1.1",
|
|
48
49
|
"fakeredis>=2.26.2",
|
|
50
|
+
"prometheus-client>=0.21.0",
|
|
51
|
+
"python-docx>=1.1.2",
|
|
52
|
+
"openpyxl>=3.1.5",
|
|
53
|
+
]
|
|
54
|
+
documents = [
|
|
55
|
+
"python-docx>=1.1.2",
|
|
56
|
+
"openpyxl>=3.1.5",
|
|
49
57
|
]
|
|
50
58
|
ocr = [
|
|
51
59
|
"easyocr>=1.7.2",
|
|
@@ -65,6 +73,9 @@ jobs = [
|
|
|
65
73
|
"redis>=5.0.8",
|
|
66
74
|
"rq>=1.16.2",
|
|
67
75
|
]
|
|
76
|
+
storage = [
|
|
77
|
+
"boto3>=1.35.0",
|
|
78
|
+
]
|
|
68
79
|
auth = [
|
|
69
80
|
"flask-limiter>=3.8.0",
|
|
70
81
|
"PyJWT>=2.9.0",
|
|
@@ -79,10 +90,14 @@ all = [
|
|
|
79
90
|
"openai>=1.54.0",
|
|
80
91
|
"redis>=5.0.8",
|
|
81
92
|
"rq>=1.16.2",
|
|
93
|
+
"boto3>=1.35.0",
|
|
94
|
+
"prometheus-client>=0.21.0",
|
|
82
95
|
"flask-limiter>=3.8.0",
|
|
83
96
|
"PyJWT>=2.9.0",
|
|
84
97
|
"cryptography>=43.0.0",
|
|
85
98
|
"gradio>=4.44.0",
|
|
99
|
+
"python-docx>=1.1.2",
|
|
100
|
+
"openpyxl>=3.1.5",
|
|
86
101
|
]
|
|
87
102
|
|
|
88
103
|
[project.scripts]
|
|
@@ -8,9 +8,10 @@ from docintel.auth.limiter import init_limiter
|
|
|
8
8
|
from docintel.auth.middleware import register_auth
|
|
9
9
|
from docintel.ops.logging import configure_logging
|
|
10
10
|
from docintel.ops.middleware import register_request_hooks
|
|
11
|
+
from docintel.routes.documents import documents_bp
|
|
12
|
+
from docintel.routes.batch import batch_bp
|
|
11
13
|
from docintel.routes.jobs import jobs_bp
|
|
12
14
|
from docintel.routes.openapi_docs import docs_bp
|
|
13
|
-
from docintel.routes.match import match_bp
|
|
14
15
|
from docintel.routes.ops import ops_bp
|
|
15
16
|
from docintel.routes.pdf import pdf_bp
|
|
16
17
|
from docintel.routes.text import text_bp
|
|
@@ -36,9 +37,10 @@ def create_app(config: type[Config] = Config) -> Flask:
|
|
|
36
37
|
)
|
|
37
38
|
|
|
38
39
|
app.register_blueprint(docs_bp)
|
|
40
|
+
app.register_blueprint(documents_bp)
|
|
39
41
|
app.register_blueprint(pdf_bp)
|
|
40
42
|
app.register_blueprint(jobs_bp)
|
|
41
|
-
app.register_blueprint(match_bp)
|
|
43
|
+
app.register_blueprint(batch_bp)
|
|
42
44
|
app.register_blueprint(text_bp)
|
|
43
45
|
app.register_blueprint(ops_bp)
|
|
44
46
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Enterprise capability modules."""
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Compliance capabilities (PII detection, sensitive PDF scanning)."""
|
|
2
|
+
|
|
3
|
+
from docintel.capabilities.compliance.pii import PIIHit, detect_pii_in_text, list_supported_entities, mask_pii_in_text
|
|
4
|
+
from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES, MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"DEFAULT_PII_ENTITIES",
|
|
8
|
+
"MIN_NATIVE_TEXT_CHARS",
|
|
9
|
+
"OCR_RENDER_SCALE",
|
|
10
|
+
"PIIHit",
|
|
11
|
+
"detect_pii_in_text",
|
|
12
|
+
"list_supported_entities",
|
|
13
|
+
"mask_pii_in_text",
|
|
14
|
+
]
|
|
@@ -6,7 +6,7 @@ from dataclasses import dataclass
|
|
|
6
6
|
from functools import lru_cache
|
|
7
7
|
from typing import Sequence
|
|
8
8
|
|
|
9
|
-
from docintel.
|
|
9
|
+
from docintel.capabilities.compliance.presets import DEFAULT_PII_ENTITIES
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@dataclass(frozen=True)
|
|
@@ -89,7 +89,9 @@ def mask_pii_in_text(
|
|
|
89
89
|
|
|
90
90
|
Returns masked text and the number of entities redacted.
|
|
91
91
|
"""
|
|
92
|
-
|
|
92
|
+
from docintel.services.pdf import pii as pii_compat
|
|
93
|
+
|
|
94
|
+
hits = pii_compat.detect_pii_in_text(
|
|
93
95
|
text,
|
|
94
96
|
entities=entities,
|
|
95
97
|
language=language,
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Default Presidio entity presets (extend via API or custom recognizers)."""
|
|
2
|
+
|
|
3
|
+
# Core Presidio entity types suited to legal, finance, and compliance
# workflows. The tuple is immutable and its ordering is kept stable.
DEFAULT_PII_ENTITIES: tuple[str, ...] = (
    # Contact details
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    # Government-issued and financial identifiers
    "US_SSN",
    "CREDIT_CARD",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_ITIN",
    "US_PASSPORT",
    # People, places, and dates
    "PERSON",
    "LOCATION",
    "DATE_TIME",
    # Network, banking, licensing, and web identifiers
    "IP_ADDRESS",
    "IBAN_CODE",
    "MEDICAL_LICENSE",
    "URL",
)
|
|
21
|
+
|
|
22
|
+
# Named entity packs keyed by lowercase vertical name. Each value is an
# immutable tuple of Presidio entity type names.
VERTICAL_ENTITY_PRESETS: dict[str, tuple[str, ...]] = {
    # "general" reuses the default tuple rather than copying it.
    "general": DEFAULT_PII_ENTITIES,
    # Healthcare pack: includes MEDICAL_LICENSE, omits card/bank identifiers.
    "healthcare": (
        "PERSON",
        "PHONE_NUMBER",
        "EMAIL_ADDRESS",
        "DATE_TIME",
        "LOCATION",
        "US_SSN",
        "MEDICAL_LICENSE",
        "US_DRIVER_LICENSE",
        "URL",
    ),
    # Financial pack: includes card, bank account, IBAN, and ITIN identifiers.
    "financial": (
        "PERSON",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "US_SSN",
        "CREDIT_CARD",
        "US_BANK_NUMBER",
        "IBAN_CODE",
        "US_ITIN",
        "DATE_TIME",
        "LOCATION",
        "URL",
    ),
    # Legal pack: includes passport and driver-license identifiers.
    "legal": (
        "PERSON",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "LOCATION",
        "DATE_TIME",
        "US_PASSPORT",
        "US_DRIVER_LICENSE",
        "US_SSN",
        "URL",
    ),
}
|
|
60
|
+
|
|
61
|
+
# Minimum count of natively extracted characters; pages that fall short are
# treated as scanned and sent to the OCR fallback.
MIN_NATIVE_TEXT_CHARS = 20

# Render scale for EasyOCR. Raising it improves accuracy and increases memory
# use.
OCR_RENDER_SCALE = 2.0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def list_vertical_presets() -> dict[str, list[str]]:
    """Return every vertical preset as a mapping of name to entity list.

    Each tuple in ``VERTICAL_ENTITY_PRESETS`` is copied into a new list, so
    callers may mutate the result without touching the module-level presets.
    """
    presets: dict[str, list[str]] = {}
    for vertical, entity_names in VERTICAL_ENTITY_PRESETS.items():
        presets[vertical] = list(entity_names)
    return presets
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def entities_for_vertical(name: str) -> tuple[str, ...]:
    """Look up the Presidio entity tuple registered for a vertical preset.

    The name is matched after stripping surrounding whitespace and
    lowercasing, so ``" Legal "`` resolves to the ``"legal"`` preset.

    Raises ``ValueError`` (chained from the underlying ``KeyError``) when no
    preset matches; the message lists the valid preset names in sorted order.
    """
    normalized = name.strip().lower()
    try:
        preset = VERTICAL_ENTITY_PRESETS[normalized]
    except KeyError as missing:
        # Report the caller's original spelling, not the normalized key.
        valid = ", ".join(sorted(VERTICAL_ENTITY_PRESETS))
        message = f"Unknown vertical preset '{name}'. Choose from: {valid}"
        raise ValueError(message) from missing
    return preset
|
|
@@ -18,7 +18,7 @@ from docintel.services.pdf.ocr import (
|
|
|
18
18
|
page_has_native_text,
|
|
19
19
|
rects_for_char_range,
|
|
20
20
|
)
|
|
21
|
-
from docintel.
|
|
21
|
+
from docintel.capabilities.compliance.pii import PIIHit
|
|
22
22
|
from docintel.services.pdf.search import search_for_text
|
|
23
23
|
|
|
24
24
|
|
|
@@ -124,7 +124,9 @@ def detect_sensitive_pdf(
|
|
|
124
124
|
|
|
125
125
|
Uses native PDF text when available. Falls back to EasyOCR for scanned pages.
|
|
126
126
|
"""
|
|
127
|
-
|
|
127
|
+
from docintel.services.pdf import sensitive as sensitive_compat
|
|
128
|
+
|
|
129
|
+
sensitive_compat._ensure_ocr_stack()
|
|
128
130
|
selected_action = action if isinstance(action, Action) else Action.from_value(action)
|
|
129
131
|
if selected_action == Action.REMOVE:
|
|
130
132
|
raise ValueError("Action 'Remove' is not supported for sensitive detection.")
|
|
@@ -159,7 +161,7 @@ def detect_sensitive_pdf(
|
|
|
159
161
|
indexed = []
|
|
160
162
|
page_text = page.get_text("text")
|
|
161
163
|
|
|
162
|
-
hits = detect_pii_in_text(page_text, entities=entities, min_score=min_score)
|
|
164
|
+
hits = sensitive_compat.detect_pii_in_text(page_text, entities=entities, min_score=min_score)
|
|
163
165
|
if pattern:
|
|
164
166
|
hits.extend(_regex_hits(page_text, pattern))
|
|
165
167
|
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Document extraction capabilities (OCR, LLM structuring)."""
|
|
2
|
+
|
|
3
|
+
from docintel.capabilities.extraction.ocr import (
|
|
4
|
+
OCRSpan,
|
|
5
|
+
build_indexed_text,
|
|
6
|
+
embed_invisible_text_layer,
|
|
7
|
+
extract_page_ocr,
|
|
8
|
+
merge_rects,
|
|
9
|
+
page_has_native_text,
|
|
10
|
+
rects_for_char_range,
|
|
11
|
+
)
|
|
12
|
+
from docintel.capabilities.extraction.structure_schema import (
|
|
13
|
+
SectionBlock,
|
|
14
|
+
StructuredDocument,
|
|
15
|
+
StructuredPage,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"OCRSpan",
|
|
20
|
+
"SectionBlock",
|
|
21
|
+
"StructuredDocument",
|
|
22
|
+
"StructuredPage",
|
|
23
|
+
"build_indexed_text",
|
|
24
|
+
"embed_invisible_text_layer",
|
|
25
|
+
"extract_page_ocr",
|
|
26
|
+
"merge_rects",
|
|
27
|
+
"page_has_native_text",
|
|
28
|
+
"rects_for_char_range",
|
|
29
|
+
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Multi-format document identification and text extraction."""
|
|
2
|
+
|
|
3
|
+
from docintel.capabilities.extraction.formats.extract import extract_document_text
|
|
4
|
+
from docintel.capabilities.extraction.formats.models import (
|
|
5
|
+
DocumentKind,
|
|
6
|
+
DocumentProfile,
|
|
7
|
+
ExtractionResult,
|
|
8
|
+
IdentificationResult,
|
|
9
|
+
)
|
|
10
|
+
from docintel.capabilities.extraction.formats.registry import (
|
|
11
|
+
get_profile,
|
|
12
|
+
list_supported_types,
|
|
13
|
+
profiles_for_kind,
|
|
14
|
+
)
|
|
15
|
+
from docintel.capabilities.extraction.formats.sniff import identify_document
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"DocumentKind",
|
|
19
|
+
"DocumentProfile",
|
|
20
|
+
"ExtractionResult",
|
|
21
|
+
"IdentificationResult",
|
|
22
|
+
"extract_document_text",
|
|
23
|
+
"get_profile",
|
|
24
|
+
"identify_document",
|
|
25
|
+
"list_supported_types",
|
|
26
|
+
"profiles_for_kind",
|
|
27
|
+
]
|