docintel-platform 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docintel_platform-1.1.0/src/docintel_platform.egg-info → docintel_platform-1.2.0}/PKG-INFO +6 -3
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/README.md +5 -2
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/pyproject.toml +1 -1
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/__init__.py +1 -1
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/client.py +149 -57
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/models.py +5 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/queue.py +113 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/tasks.py +221 -0
- docintel_platform-1.2.0/src/docintel/routes/async_enqueue.py +29 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/document_upload.py +11 -1
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/documents.py +191 -10
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/pdf.py +4 -9
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/text.py +23 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ui.py +148 -24
- {docintel_platform-1.1.0 → docintel_platform-1.2.0/src/docintel_platform.egg-info}/PKG-INFO +6 -3
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/SOURCES.txt +3 -0
- docintel_platform-1.2.0/tests/test_documents_async_routes.py +115 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_health.py +1 -1
- docintel_platform-1.2.0/tests/test_ui_process.py +28 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/LICENSE +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/MANIFEST.in +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/setup.cfg +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/app.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/api_keys.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/limiter.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/middleware.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/oidc.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/pii.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/presets.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/sensitive.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/extract.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/models.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/registry.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/sniff.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/models.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/search.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pipeline/process.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/classify.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/compare.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/models.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/cli.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/config.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/helpers.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/store.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/webhooks.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/openapi/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/openapi/openapi.yaml +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/logging.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/metrics.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/middleware.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/prometheus.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/batch.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/jobs.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/openapi_docs.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/ops.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/annotator.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/models.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/ocr.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/pii.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/presets.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/search.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/sensitive.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_llm.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_render.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_schema.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/models.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/textrank.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/__init__.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/local.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/s3.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/wsgi.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/requires.txt +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_annotate_async.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_auth.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_batch.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_client.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_detect_sensitive_async.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_document_formats.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_classify.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_compare.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_compare_files.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_detect_pii.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_process.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_process_async.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_summarize.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_jobs.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_oidc.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_openapi.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_ops.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_routes.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_sensitive.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_service.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_structure.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pii_mask.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_storage.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_structure_pii.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_summary_routes.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_summary_service.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_ui.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_vertical_presets.py +0 -0
- {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_webhooks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docintel-platform
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
|
|
5
5
|
Author: Babandeep Singh
|
|
6
6
|
License-Expression: MIT
|
|
@@ -90,7 +90,7 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
|
|
|
90
90
|
|
|
91
91
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
92
92
|
|
|
93
|
-
**Version:** 1.1.0
|
|
93
|
+
**Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
94
94
|
|
|
95
95
|
---
|
|
96
96
|
|
|
@@ -112,6 +112,8 @@ make docker-up
|
|
|
112
112
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
113
113
|
| Health | http://127.0.0.1:5000/health |
|
|
114
114
|
|
|
115
|
+
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
116
|
+
|
|
115
117
|
**pip install:**
|
|
116
118
|
|
|
117
119
|
```bash
|
|
@@ -142,7 +144,7 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
142
144
|
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
143
145
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
144
146
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
145
|
-
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`) |
|
|
147
|
+
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
146
148
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
147
149
|
|
|
148
150
|
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
@@ -182,6 +184,7 @@ make run # API on :5000
|
|
|
182
184
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
183
185
|
make run-ui # Gradio on :7860
|
|
184
186
|
make test
|
|
187
|
+
make eval # offline quality report (summary, classify, process, PII)
|
|
185
188
|
```
|
|
186
189
|
|
|
187
190
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
|
|
9
9
|
|
|
10
|
-
**Version:** 1.1.0
|
|
10
|
+
**Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
@@ -29,6 +29,8 @@ make docker-up
|
|
|
29
29
|
| Gradio UI | http://127.0.0.1:7860 |
|
|
30
30
|
| Health | http://127.0.0.1:5000/health |
|
|
31
31
|
|
|
32
|
+
Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
|
|
33
|
+
|
|
32
34
|
**pip install:**
|
|
33
35
|
|
|
34
36
|
```bash
|
|
@@ -59,7 +61,7 @@ report = client.process_document("policy.docx", include_pii=True)
|
|
|
59
61
|
| Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
|
|
60
62
|
| Text | `POST /v1/text/summarize` | TextRank extractive summary |
|
|
61
63
|
| Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
|
|
62
|
-
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`) |
|
|
64
|
+
| Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
|
|
63
65
|
| Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
|
|
64
66
|
|
|
65
67
|
**Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
|
|
@@ -99,6 +101,7 @@ make run # API on :5000
|
|
|
99
101
|
make run-worker # RQ worker (separate terminal, needs Redis)
|
|
100
102
|
make run-ui # Gradio on :7860
|
|
101
103
|
make test
|
|
104
|
+
make eval # offline quality report (summary, classify, process, PII)
|
|
102
105
|
```
|
|
103
106
|
|
|
104
107
|
Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docintel-platform"
|
|
7
|
-
version = "1.1.0"
|
|
7
|
+
version = "1.2.0"
|
|
8
8
|
description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -164,15 +164,65 @@ class DocintelClient:
|
|
|
164
164
|
return response.content
|
|
165
165
|
return response.json()
|
|
166
166
|
|
|
167
|
-
def
|
|
167
|
+
def _post_async_json(
|
|
168
|
+
self,
|
|
169
|
+
path: str,
|
|
170
|
+
*,
|
|
171
|
+
json_body: dict[str, Any] | None = None,
|
|
172
|
+
params: dict[str, str] | None = None,
|
|
173
|
+
poll: bool = True,
|
|
174
|
+
) -> dict[str, Any]:
|
|
168
175
|
response = self._session.post(
|
|
169
|
-
self._url(
|
|
170
|
-
json=
|
|
176
|
+
self._url(path),
|
|
177
|
+
json=json_body,
|
|
178
|
+
params=params,
|
|
171
179
|
timeout=self.timeout,
|
|
172
180
|
)
|
|
181
|
+
if response.status_code == 202:
|
|
182
|
+
payload = response.json()
|
|
183
|
+
if not poll:
|
|
184
|
+
return payload
|
|
185
|
+
completed = self.poll_job(payload["job_id"])
|
|
186
|
+
result = completed.get("result") or {}
|
|
187
|
+
return {"status": "ok", **result}
|
|
173
188
|
self._raise_for_status(response)
|
|
174
189
|
return response.json()
|
|
175
190
|
|
|
191
|
+
def _post_async_multipart(
|
|
192
|
+
self,
|
|
193
|
+
path: str,
|
|
194
|
+
*,
|
|
195
|
+
files: dict,
|
|
196
|
+
data: dict[str, str] | None = None,
|
|
197
|
+
params: dict[str, str] | None = None,
|
|
198
|
+
poll: bool = True,
|
|
199
|
+
) -> dict[str, Any]:
|
|
200
|
+
response = self._session.post(
|
|
201
|
+
self._url(path),
|
|
202
|
+
params=params,
|
|
203
|
+
files=files,
|
|
204
|
+
data=data or {},
|
|
205
|
+
timeout=self.timeout,
|
|
206
|
+
)
|
|
207
|
+
if response.status_code == 202:
|
|
208
|
+
payload = response.json()
|
|
209
|
+
if not poll:
|
|
210
|
+
return payload
|
|
211
|
+
completed = self.poll_job(payload["job_id"])
|
|
212
|
+
result = completed.get("result") or {}
|
|
213
|
+
return {"status": "ok", **result}
|
|
214
|
+
self._raise_for_status(response)
|
|
215
|
+
return response.json()
|
|
216
|
+
|
|
217
|
+
def summarize(self, text: str, *, sentences: int = 3, async_job: bool = False, poll: bool = True) -> dict[str, Any]:
|
|
218
|
+
params = {"async": "true"} if async_job else {}
|
|
219
|
+
return self._post_async_json(
|
|
220
|
+
"/v1/text/summarize",
|
|
221
|
+
json_body={"text": text, "sentences": sentences},
|
|
222
|
+
params=params,
|
|
223
|
+
poll=poll,
|
|
224
|
+
)
|
|
225
|
+
|
|
176
226
|
def list_document_types(self) -> dict[str, Any]:
|
|
177
227
|
response = self._session.get(self._url("/v1/documents/types"), timeout=self.timeout)
|
|
178
228
|
self._raise_for_status(response)
|
|
@@ -189,16 +239,47 @@ class DocintelClient:
|
|
|
189
239
|
self._raise_for_status(response)
|
|
190
240
|
return response.json()
|
|
191
241
|
|
|
192
|
-
def extract_document_text(
|
|
242
|
+
def extract_document_text(
|
|
243
|
+
self,
|
|
244
|
+
path: str | Path,
|
|
245
|
+
*,
|
|
246
|
+
async_job: bool = False,
|
|
247
|
+
poll: bool = True,
|
|
248
|
+
) -> dict[str, Any]:
|
|
193
249
|
file_path = Path(path)
|
|
250
|
+
params = {"async": "true"} if async_job else {}
|
|
194
251
|
with file_path.open("rb") as handle:
|
|
195
|
-
|
|
196
|
-
|
|
252
|
+
return self._post_async_multipart(
|
|
253
|
+
"/v1/documents/extract-text",
|
|
254
|
+
params=params,
|
|
197
255
|
files={"file": (file_path.name, handle, "application/octet-stream")},
|
|
198
|
-
|
|
256
|
+
poll=poll,
|
|
199
257
|
)
|
|
200
|
-
|
|
201
|
-
|
|
258
|
+
|
|
259
|
+
def classify_document(
|
|
260
|
+
self,
|
|
261
|
+
path: str | Path | None = None,
|
|
262
|
+
*,
|
|
263
|
+
text: str | None = None,
|
|
264
|
+
async_job: bool = False,
|
|
265
|
+
poll: bool = True,
|
|
266
|
+
) -> dict[str, Any]:
|
|
267
|
+
params = {"async": "true"} if async_job else {}
|
|
268
|
+
if path is not None:
|
|
269
|
+
file_path = Path(path)
|
|
270
|
+
with file_path.open("rb") as handle:
|
|
271
|
+
return self._post_async_multipart(
|
|
272
|
+
"/v1/documents/classify",
|
|
273
|
+
params=params,
|
|
274
|
+
files={"file": (file_path.name, handle, "application/octet-stream")},
|
|
275
|
+
poll=poll,
|
|
276
|
+
)
|
|
277
|
+
return self._post_async_json(
|
|
278
|
+
"/v1/documents/classify",
|
|
279
|
+
json_body={"text": text or ""},
|
|
280
|
+
params=params,
|
|
281
|
+
poll=poll,
|
|
282
|
+
)
|
|
202
283
|
|
|
203
284
|
def summarize_document(
|
|
204
285
|
self,
|
|
@@ -206,24 +287,26 @@ class DocintelClient:
|
|
|
206
287
|
*,
|
|
207
288
|
text: str | None = None,
|
|
208
289
|
sentences: int = 3,
|
|
290
|
+
async_job: bool = False,
|
|
291
|
+
poll: bool = True,
|
|
209
292
|
) -> dict[str, Any]:
|
|
293
|
+
params = {"async": "true"} if async_job else {}
|
|
210
294
|
if path is not None:
|
|
211
295
|
file_path = Path(path)
|
|
212
296
|
with file_path.open("rb") as handle:
|
|
213
|
-
|
|
214
|
-
|
|
297
|
+
return self._post_async_multipart(
|
|
298
|
+
"/v1/documents/summarize",
|
|
299
|
+
params=params,
|
|
215
300
|
files={"file": (file_path.name, handle, "application/octet-stream")},
|
|
216
301
|
data={"sentences": str(sentences)},
|
|
217
|
-
|
|
302
|
+
poll=poll,
|
|
218
303
|
)
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
self._raise_for_status(response)
|
|
226
|
-
return response.json()
|
|
304
|
+
return self._post_async_json(
|
|
305
|
+
"/v1/documents/summarize",
|
|
306
|
+
json_body={"text": text or "", "sentences": sentences},
|
|
307
|
+
params=params,
|
|
308
|
+
poll=poll,
|
|
309
|
+
)
|
|
227
310
|
|
|
228
311
|
def detect_pii_document(
|
|
229
312
|
self,
|
|
@@ -233,34 +316,36 @@ class DocintelClient:
|
|
|
233
316
|
entities: str | None = None,
|
|
234
317
|
vertical: str | None = None,
|
|
235
318
|
min_score: float = 0.35,
|
|
319
|
+
async_job: bool = False,
|
|
320
|
+
poll: bool = True,
|
|
236
321
|
) -> dict[str, Any]:
|
|
237
|
-
|
|
238
|
-
if entities:
|
|
239
|
-
data["entities"] = entities
|
|
240
|
-
if vertical:
|
|
241
|
-
data["vertical"] = vertical
|
|
322
|
+
params = {"async": "true"} if async_job else {}
|
|
242
323
|
if path is not None:
|
|
243
324
|
file_path = Path(path)
|
|
325
|
+
data = {"min_score": str(min_score)}
|
|
326
|
+
if entities:
|
|
327
|
+
data["entities"] = entities
|
|
328
|
+
if vertical:
|
|
329
|
+
data["vertical"] = vertical
|
|
244
330
|
with file_path.open("rb") as handle:
|
|
245
|
-
|
|
246
|
-
|
|
331
|
+
return self._post_async_multipart(
|
|
332
|
+
"/v1/documents/detect-pii",
|
|
333
|
+
params=params,
|
|
247
334
|
files={"file": (file_path.name, handle, "application/octet-stream")},
|
|
248
335
|
data=data,
|
|
249
|
-
|
|
336
|
+
poll=poll,
|
|
250
337
|
)
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
self._raise_for_status(response)
|
|
263
|
-
return response.json()
|
|
338
|
+
payload: dict[str, Any] = {"text": text or "", "min_score": min_score}
|
|
339
|
+
if entities:
|
|
340
|
+
payload["entities"] = entities
|
|
341
|
+
if vertical:
|
|
342
|
+
payload["vertical"] = vertical
|
|
343
|
+
return self._post_async_json(
|
|
344
|
+
"/v1/documents/detect-pii",
|
|
345
|
+
json_body=payload,
|
|
346
|
+
params=params,
|
|
347
|
+
poll=poll,
|
|
348
|
+
)
|
|
264
349
|
|
|
265
350
|
def compare_documents(
|
|
266
351
|
self,
|
|
@@ -269,27 +354,29 @@ class DocintelClient:
|
|
|
269
354
|
text_b: str | None = None,
|
|
270
355
|
path_a: str | Path | None = None,
|
|
271
356
|
path_b: str | Path | None = None,
|
|
357
|
+
async_job: bool = False,
|
|
358
|
+
poll: bool = True,
|
|
272
359
|
) -> dict[str, Any]:
|
|
360
|
+
params = {"async": "true"} if async_job else {}
|
|
273
361
|
if path_a is not None and path_b is not None:
|
|
274
362
|
file_a = Path(path_a)
|
|
275
363
|
file_b = Path(path_b)
|
|
276
364
|
with file_a.open("rb") as handle_a, file_b.open("rb") as handle_b:
|
|
277
|
-
|
|
278
|
-
|
|
365
|
+
return self._post_async_multipart(
|
|
366
|
+
"/v1/documents/compare",
|
|
367
|
+
params=params,
|
|
279
368
|
files={
|
|
280
369
|
"file_a": (file_a.name, handle_a, "application/octet-stream"),
|
|
281
370
|
"file_b": (file_b.name, handle_b, "application/octet-stream"),
|
|
282
371
|
},
|
|
283
|
-
|
|
372
|
+
poll=poll,
|
|
284
373
|
)
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
self._raise_for_status(response)
|
|
292
|
-
return response.json()
|
|
374
|
+
return self._post_async_json(
|
|
375
|
+
"/v1/documents/compare",
|
|
376
|
+
json_body={"text_a": text_a or "", "text_b": text_b or ""},
|
|
377
|
+
params=params,
|
|
378
|
+
poll=poll,
|
|
379
|
+
)
|
|
293
380
|
|
|
294
381
|
def process_document(
|
|
295
382
|
self,
|
|
@@ -302,8 +389,12 @@ class DocintelClient:
|
|
|
302
389
|
entities: str | None = None,
|
|
303
390
|
vertical: str | None = None,
|
|
304
391
|
min_score: float = 0.35,
|
|
392
|
+
async_job: bool = False,
|
|
393
|
+
callback_url: str | None = None,
|
|
394
|
+
poll: bool = True,
|
|
305
395
|
) -> dict[str, Any]:
|
|
306
396
|
file_path = Path(path)
|
|
397
|
+
params = {"async": "true"} if async_job else {}
|
|
307
398
|
data = {
|
|
308
399
|
"sentences": str(sentences),
|
|
309
400
|
"include_summarize": str(include_summarize).lower(),
|
|
@@ -315,12 +406,13 @@ class DocintelClient:
|
|
|
315
406
|
data["entities"] = entities
|
|
316
407
|
if vertical:
|
|
317
408
|
data["vertical"] = vertical
|
|
409
|
+
if callback_url:
|
|
410
|
+
data["callback_url"] = callback_url
|
|
318
411
|
with file_path.open("rb") as handle:
|
|
319
|
-
|
|
320
|
-
|
|
412
|
+
return self._post_async_multipart(
|
|
413
|
+
"/v1/documents/process",
|
|
414
|
+
params=params,
|
|
321
415
|
files={"file": (file_path.name, handle, "application/octet-stream")},
|
|
322
416
|
data=data,
|
|
323
|
-
|
|
417
|
+
poll=poll,
|
|
324
418
|
)
|
|
325
|
-
self._raise_for_status(response)
|
|
326
|
-
return response.json()
|
|
@@ -30,6 +30,11 @@ class JobType(str, Enum):
|
|
|
30
30
|
TEXT_CLASSIFY = "text_classify"
|
|
31
31
|
TEXT_DETECT_PII = "text_detect_pii"
|
|
32
32
|
DOCUMENT_PROCESS = "document_process"
|
|
33
|
+
DOCUMENT_CLASSIFY = "document_classify"
|
|
34
|
+
DOCUMENT_SUMMARIZE = "document_summarize"
|
|
35
|
+
DOCUMENT_DETECT_PII = "document_detect_pii"
|
|
36
|
+
DOCUMENT_EXTRACT_TEXT = "document_extract_text"
|
|
37
|
+
DOCUMENT_COMPARE = "document_compare"
|
|
33
38
|
BATCH = "batch"
|
|
34
39
|
|
|
35
40
|
|
|
@@ -187,6 +187,119 @@ def enqueue_document_process_text_job(
|
|
|
187
187
|
)
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
def enqueue_classify_document_job(
|
|
191
|
+
job_id: str,
|
|
192
|
+
input_path: str,
|
|
193
|
+
filename: str,
|
|
194
|
+
content_type: str | None,
|
|
195
|
+
) -> None:
|
|
196
|
+
queue = get_queue()
|
|
197
|
+
queue.enqueue(
|
|
198
|
+
"docintel.jobs.tasks.run_classify_document_job",
|
|
199
|
+
job_id=job_id,
|
|
200
|
+
input_path=input_path,
|
|
201
|
+
filename=filename,
|
|
202
|
+
content_type=content_type,
|
|
203
|
+
job_timeout=600,
|
|
204
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
205
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def enqueue_summarize_document_job(
|
|
210
|
+
job_id: str,
|
|
211
|
+
input_path: str,
|
|
212
|
+
filename: str,
|
|
213
|
+
content_type: str | None,
|
|
214
|
+
sentences: int,
|
|
215
|
+
) -> None:
|
|
216
|
+
queue = get_queue()
|
|
217
|
+
queue.enqueue(
|
|
218
|
+
"docintel.jobs.tasks.run_summarize_document_job",
|
|
219
|
+
job_id=job_id,
|
|
220
|
+
input_path=input_path,
|
|
221
|
+
filename=filename,
|
|
222
|
+
content_type=content_type,
|
|
223
|
+
sentences=sentences,
|
|
224
|
+
job_timeout=600,
|
|
225
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
226
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def enqueue_detect_pii_document_job(
|
|
231
|
+
job_id: str,
|
|
232
|
+
input_path: str,
|
|
233
|
+
filename: str,
|
|
234
|
+
content_type: str | None,
|
|
235
|
+
*,
|
|
236
|
+
entities: list[str] | None = None,
|
|
237
|
+
min_score: float = 0.35,
|
|
238
|
+
) -> None:
|
|
239
|
+
queue = get_queue()
|
|
240
|
+
queue.enqueue(
|
|
241
|
+
"docintel.jobs.tasks.run_detect_pii_document_job",
|
|
242
|
+
job_id=job_id,
|
|
243
|
+
input_path=input_path,
|
|
244
|
+
filename=filename,
|
|
245
|
+
content_type=content_type,
|
|
246
|
+
entities=entities,
|
|
247
|
+
min_score=min_score,
|
|
248
|
+
job_timeout=600,
|
|
249
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
250
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def enqueue_extract_text_job(
|
|
255
|
+
job_id: str,
|
|
256
|
+
input_path: str,
|
|
257
|
+
filename: str,
|
|
258
|
+
content_type: str | None,
|
|
259
|
+
) -> None:
|
|
260
|
+
queue = get_queue()
|
|
261
|
+
queue.enqueue(
|
|
262
|
+
"docintel.jobs.tasks.run_extract_text_job",
|
|
263
|
+
job_id=job_id,
|
|
264
|
+
input_path=input_path,
|
|
265
|
+
filename=filename,
|
|
266
|
+
content_type=content_type,
|
|
267
|
+
job_timeout=600,
|
|
268
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
269
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def enqueue_compare_job(
|
|
274
|
+
job_id: str,
|
|
275
|
+
*,
|
|
276
|
+
text_a: str | None = None,
|
|
277
|
+
text_b: str | None = None,
|
|
278
|
+
path_a: str | None = None,
|
|
279
|
+
path_b: str | None = None,
|
|
280
|
+
filename_a: str | None = None,
|
|
281
|
+
filename_b: str | None = None,
|
|
282
|
+
content_type_a: str | None = None,
|
|
283
|
+
content_type_b: str | None = None,
|
|
284
|
+
) -> None:
|
|
285
|
+
queue = get_queue()
|
|
286
|
+
queue.enqueue(
|
|
287
|
+
"docintel.jobs.tasks.run_compare_job",
|
|
288
|
+
job_id=job_id,
|
|
289
|
+
text_a=text_a,
|
|
290
|
+
text_b=text_b,
|
|
291
|
+
path_a=path_a,
|
|
292
|
+
path_b=path_b,
|
|
293
|
+
filename_a=filename_a,
|
|
294
|
+
filename_b=filename_b,
|
|
295
|
+
content_type_a=content_type_a,
|
|
296
|
+
content_type_b=content_type_b,
|
|
297
|
+
job_timeout=600,
|
|
298
|
+
result_ttl=DEFAULT_RESULT_TTL,
|
|
299
|
+
failure_ttl=DEFAULT_FAILURE_TTL,
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
|
|
190
303
|
def queue_depth() -> int | None:
|
|
191
304
|
"""Return RQ queue length when Redis is reachable."""
|
|
192
305
|
try:
|