docintel-platform 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {docintel_platform-1.1.0/src/docintel_platform.egg-info → docintel_platform-1.2.0}/PKG-INFO +6 -3
  2. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/README.md +5 -2
  3. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/pyproject.toml +1 -1
  4. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/__init__.py +1 -1
  5. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/client.py +149 -57
  6. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/models.py +5 -0
  7. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/queue.py +113 -0
  8. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/tasks.py +221 -0
  9. docintel_platform-1.2.0/src/docintel/routes/async_enqueue.py +29 -0
  10. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/document_upload.py +11 -1
  11. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/documents.py +191 -10
  12. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/pdf.py +4 -9
  13. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/text.py +23 -0
  14. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ui.py +148 -24
  15. {docintel_platform-1.1.0 → docintel_platform-1.2.0/src/docintel_platform.egg-info}/PKG-INFO +6 -3
  16. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/SOURCES.txt +3 -0
  17. docintel_platform-1.2.0/tests/test_documents_async_routes.py +115 -0
  18. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_health.py +1 -1
  19. docintel_platform-1.2.0/tests/test_ui_process.py +28 -0
  20. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/LICENSE +0 -0
  21. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/MANIFEST.in +0 -0
  22. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/setup.cfg +0 -0
  23. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/app.py +0 -0
  24. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/__init__.py +0 -0
  25. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/api_keys.py +0 -0
  26. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/limiter.py +0 -0
  27. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/middleware.py +0 -0
  28. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/auth/oidc.py +0 -0
  29. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/__init__.py +0 -0
  30. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/__init__.py +0 -0
  31. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/pii.py +0 -0
  32. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/presets.py +0 -0
  33. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/compliance/sensitive.py +0 -0
  34. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/__init__.py +0 -0
  35. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/__init__.py +0 -0
  36. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/extract.py +0 -0
  37. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/models.py +0 -0
  38. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/registry.py +0 -0
  39. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/formats/sniff.py +0 -0
  40. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/ocr.py +0 -0
  41. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure.py +0 -0
  42. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_llm.py +0 -0
  43. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_render.py +0 -0
  44. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/extraction/structure_schema.py +0 -0
  45. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/__init__.py +0 -0
  46. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/annotator.py +0 -0
  47. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/models.py +0 -0
  48. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pdf/search.py +0 -0
  49. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pipeline/__init__.py +0 -0
  50. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/pipeline/process.py +0 -0
  51. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/__init__.py +0 -0
  52. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/classify.py +0 -0
  53. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/compare.py +0 -0
  54. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/models.py +0 -0
  55. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/capabilities/understanding/textrank.py +0 -0
  56. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/cli.py +0 -0
  57. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/config.py +0 -0
  58. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/__init__.py +0 -0
  59. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/helpers.py +0 -0
  60. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/store.py +0 -0
  61. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/jobs/webhooks.py +0 -0
  62. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/openapi/__init__.py +0 -0
  63. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/openapi/openapi.yaml +0 -0
  64. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/__init__.py +0 -0
  65. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/logging.py +0 -0
  66. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/metrics.py +0 -0
  67. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/middleware.py +0 -0
  68. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/ops/prometheus.py +0 -0
  69. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/__init__.py +0 -0
  70. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/batch.py +0 -0
  71. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/jobs.py +0 -0
  72. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/openapi_docs.py +0 -0
  73. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/routes/ops.py +0 -0
  74. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/__init__.py +0 -0
  75. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/__init__.py +0 -0
  76. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/annotator.py +0 -0
  77. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/models.py +0 -0
  78. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/ocr.py +0 -0
  79. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/pii.py +0 -0
  80. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/presets.py +0 -0
  81. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/search.py +0 -0
  82. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/sensitive.py +0 -0
  83. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure.py +0 -0
  84. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_llm.py +0 -0
  85. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_render.py +0 -0
  86. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/pdf/structure_schema.py +0 -0
  87. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/__init__.py +0 -0
  88. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/models.py +0 -0
  89. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/services/summary/textrank.py +0 -0
  90. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/__init__.py +0 -0
  91. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/local.py +0 -0
  92. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/storage/s3.py +0 -0
  93. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel/wsgi.py +0 -0
  94. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/dependency_links.txt +0 -0
  95. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/entry_points.txt +0 -0
  96. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/requires.txt +0 -0
  97. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/src/docintel_platform.egg-info/top_level.txt +0 -0
  98. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_annotate_async.py +0 -0
  99. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_auth.py +0 -0
  100. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_batch.py +0 -0
  101. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_client.py +0 -0
  102. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_detect_sensitive_async.py +0 -0
  103. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_document_formats.py +0 -0
  104. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_classify.py +0 -0
  105. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_compare.py +0 -0
  106. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_compare_files.py +0 -0
  107. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_detect_pii.py +0 -0
  108. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_process.py +0 -0
  109. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_process_async.py +0 -0
  110. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_documents_summarize.py +0 -0
  111. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_jobs.py +0 -0
  112. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_oidc.py +0 -0
  113. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_openapi.py +0 -0
  114. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_ops.py +0 -0
  115. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_routes.py +0 -0
  116. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_sensitive.py +0 -0
  117. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_service.py +0 -0
  118. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pdf_structure.py +0 -0
  119. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_pii_mask.py +0 -0
  120. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_storage.py +0 -0
  121. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_structure_pii.py +0 -0
  122. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_summary_routes.py +0 -0
  123. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_summary_service.py +0 -0
  124. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_ui.py +0 -0
  125. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_vertical_presets.py +0 -0
  126. {docintel_platform-1.1.0 → docintel_platform-1.2.0}/tests/test_webhooks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docintel-platform
3
- Version: 1.1.0
3
+ Version: 1.2.0
4
4
  Summary: Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization.
5
5
  Author: Babandeep Singh
6
6
  License-Expression: MIT
@@ -90,7 +90,7 @@ Requires-Dist: openpyxl>=3.1.5; extra == "all"
90
90
 
91
91
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
92
92
 
93
- **Version:** 1.1.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
93
+ **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
94
94
 
95
95
  ---
96
96
 
@@ -112,6 +112,8 @@ make docker-up
112
112
  | Gradio UI | http://127.0.0.1:7860 |
113
113
  | Health | http://127.0.0.1:5000/health |
114
114
 
115
+ Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
116
+
115
117
  **pip install:**
116
118
 
117
119
  ```bash
@@ -142,7 +144,7 @@ report = client.process_document("policy.docx", include_pii=True)
142
144
  | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
143
145
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
144
146
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
145
- | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true` on PDF and process routes) |
147
+ | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
146
148
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
147
149
 
148
150
  **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
@@ -182,6 +184,7 @@ make run # API on :5000
182
184
  make run-worker # RQ worker (separate terminal, needs Redis)
183
185
  make run-ui # Gradio on :7860
184
186
  make test
187
+ make eval # offline quality report (summary, classify, process, PII)
185
188
  ```
186
189
 
187
190
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
@@ -7,7 +7,7 @@
7
7
 
8
8
  Enterprise document intelligence API: PDF compliance (OCR, PII, redaction), LLM structuring, and multi-format text workflows (Word, Excel, CSV, plain text).
9
9
 
10
- **Version:** 1.1.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
10
+ **Version:** 1.2.0 | **PyPI:** [docintel-platform](https://pypi.org/project/docintel-platform/)
11
11
 
12
12
  ---
13
13
 
@@ -29,6 +29,8 @@ make docker-up
29
29
  | Gradio UI | http://127.0.0.1:7860 |
30
30
  | Health | http://127.0.0.1:5000/health |
31
31
 
32
+ Gradio includes a **Document process** tab (unified pipeline). It needs the API plus a Redis worker (`worker` service in compose, or `make run-worker` locally).
33
+
32
34
  **pip install:**
33
35
 
34
36
  ```bash
@@ -59,7 +61,7 @@ report = client.process_document("policy.docx", include_pii=True)
59
61
  | Documents | `POST /v1/documents/*` | Identify, extract, classify, summarize, PII, compare, **process** |
60
62
  | Text | `POST /v1/text/summarize` | TextRank extractive summary |
61
63
  | Batch | `POST /v1/batch` | Async summarize, classify, detect_pii, process |
62
- | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true` on PDF and process routes) |
64
+ | Jobs | `GET /v1/jobs/{id}` | Poll async work (`?async=true`; default in Docker when Redis is up) |
63
65
  | Ops | `GET /health`, `GET /metrics` | Health and Prometheus-friendly metrics |
64
66
 
65
67
  **Supported uploads (text workflows):** PDF, DOCX, XLSX, CSV, JSON, TXT, MD.
@@ -99,6 +101,7 @@ make run # API on :5000
99
101
  make run-worker # RQ worker (separate terminal, needs Redis)
100
102
  make run-ui # Gradio on :7860
101
103
  make test
104
+ make eval # offline quality report (summary, classify, process, PII)
102
105
  ```
103
106
 
104
107
  Copy `.env.example` to `.env` for `DOCINTEL_LLM_API_KEY`, auth keys, Redis, and S3. See comments in that file for all variables.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docintel-platform"
7
- version = "1.1.0"
7
+ version = "1.2.0"
8
8
  description = "Enterprise document intelligence API for PDF compliance, multi-format extraction, structuring, and summarization."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -2,5 +2,5 @@
2
2
 
3
3
  from docintel.client import DocintelClient, DocintelError
4
4
 
5
- __version__ = "1.1.0"
5
+ __version__ = "1.2.0"
6
6
  __all__ = ["DocintelClient", "DocintelError", "__version__"]
@@ -164,15 +164,65 @@ class DocintelClient:
164
164
  return response.content
165
165
  return response.json()
166
166
 
167
- def summarize(self, text: str, *, sentences: int = 3) -> dict[str, Any]:
167
+ def _post_async_json(
168
+ self,
169
+ path: str,
170
+ *,
171
+ json_body: dict[str, Any] | None = None,
172
+ params: dict[str, str] | None = None,
173
+ poll: bool = True,
174
+ ) -> dict[str, Any]:
168
175
  response = self._session.post(
169
- self._url("/v1/text/summarize"),
170
- json={"text": text, "sentences": sentences},
176
+ self._url(path),
177
+ json=json_body,
178
+ params=params,
171
179
  timeout=self.timeout,
172
180
  )
181
+ if response.status_code == 202:
182
+ payload = response.json()
183
+ if not poll:
184
+ return payload
185
+ completed = self.poll_job(payload["job_id"])
186
+ result = completed.get("result") or {}
187
+ return {"status": "ok", **result}
173
188
  self._raise_for_status(response)
174
189
  return response.json()
175
190
 
191
+ def _post_async_multipart(
192
+ self,
193
+ path: str,
194
+ *,
195
+ files: dict,
196
+ data: dict[str, str] | None = None,
197
+ params: dict[str, str] | None = None,
198
+ poll: bool = True,
199
+ ) -> dict[str, Any]:
200
+ response = self._session.post(
201
+ self._url(path),
202
+ params=params,
203
+ files=files,
204
+ data=data or {},
205
+ timeout=self.timeout,
206
+ )
207
+ if response.status_code == 202:
208
+ payload = response.json()
209
+ if not poll:
210
+ return payload
211
+ completed = self.poll_job(payload["job_id"])
212
+ result = completed.get("result") or {}
213
+ return {"status": "ok", **result}
214
+ self._raise_for_status(response)
215
+ return response.json()
216
+
217
+ def summarize(self, text: str, *, sentences: int = 3, async_job: bool = False, poll: bool = True) -> dict[str, Any]:
218
+ params = {"async": "true"} if async_job else {}
219
+ return self._post_async_json(
220
+ "/v1/text/summarize",
221
+ json_body={"text": text, "sentences": sentences},
222
+ params=params,
223
+ poll=poll,
224
+ )
225
+
176
226
  def list_document_types(self) -> dict[str, Any]:
177
227
  response = self._session.get(self._url("/v1/documents/types"), timeout=self.timeout)
178
228
  self._raise_for_status(response)
@@ -189,16 +239,47 @@ class DocintelClient:
189
239
  self._raise_for_status(response)
190
240
  return response.json()
191
241
 
192
- def extract_document_text(self, path: str | Path) -> dict[str, Any]:
242
+ def extract_document_text(
243
+ self,
244
+ path: str | Path,
245
+ *,
246
+ async_job: bool = False,
247
+ poll: bool = True,
248
+ ) -> dict[str, Any]:
193
249
  file_path = Path(path)
250
+ params = {"async": "true"} if async_job else {}
194
251
  with file_path.open("rb") as handle:
195
- response = self._session.post(
196
- self._url("/v1/documents/extract-text"),
252
+ return self._post_async_multipart(
253
+ "/v1/documents/extract-text",
254
+ params=params,
197
255
  files={"file": (file_path.name, handle, "application/octet-stream")},
198
- timeout=self.timeout,
256
+ poll=poll,
199
257
  )
200
- self._raise_for_status(response)
201
- return response.json()
258
+
259
+ def classify_document(
260
+ self,
261
+ path: str | Path | None = None,
262
+ *,
263
+ text: str | None = None,
264
+ async_job: bool = False,
265
+ poll: bool = True,
266
+ ) -> dict[str, Any]:
267
+ params = {"async": "true"} if async_job else {}
268
+ if path is not None:
269
+ file_path = Path(path)
270
+ with file_path.open("rb") as handle:
271
+ return self._post_async_multipart(
272
+ "/v1/documents/classify",
273
+ params=params,
274
+ files={"file": (file_path.name, handle, "application/octet-stream")},
275
+ poll=poll,
276
+ )
277
+ return self._post_async_json(
278
+ "/v1/documents/classify",
279
+ json_body={"text": text or ""},
280
+ params=params,
281
+ poll=poll,
282
+ )
202
283
 
203
284
  def summarize_document(
204
285
  self,
@@ -206,24 +287,26 @@ class DocintelClient:
206
287
  *,
207
288
  text: str | None = None,
208
289
  sentences: int = 3,
290
+ async_job: bool = False,
291
+ poll: bool = True,
209
292
  ) -> dict[str, Any]:
293
+ params = {"async": "true"} if async_job else {}
210
294
  if path is not None:
211
295
  file_path = Path(path)
212
296
  with file_path.open("rb") as handle:
213
- response = self._session.post(
214
- self._url("/v1/documents/summarize"),
297
+ return self._post_async_multipart(
298
+ "/v1/documents/summarize",
299
+ params=params,
215
300
  files={"file": (file_path.name, handle, "application/octet-stream")},
216
301
  data={"sentences": str(sentences)},
217
- timeout=self.timeout,
302
+ poll=poll,
218
303
  )
219
- else:
220
- response = self._session.post(
221
- self._url("/v1/documents/summarize"),
222
- json={"text": text or "", "sentences": sentences},
223
- timeout=self.timeout,
224
- )
225
- self._raise_for_status(response)
226
- return response.json()
304
+ return self._post_async_json(
305
+ "/v1/documents/summarize",
306
+ json_body={"text": text or "", "sentences": sentences},
307
+ params=params,
308
+ poll=poll,
309
+ )
227
310
 
228
311
  def detect_pii_document(
229
312
  self,
@@ -233,34 +316,36 @@ class DocintelClient:
233
316
  entities: str | None = None,
234
317
  vertical: str | None = None,
235
318
  min_score: float = 0.35,
319
+ async_job: bool = False,
320
+ poll: bool = True,
236
321
  ) -> dict[str, Any]:
237
- data = {"min_score": str(min_score)}
238
- if entities:
239
- data["entities"] = entities
240
- if vertical:
241
- data["vertical"] = vertical
322
+ params = {"async": "true"} if async_job else {}
242
323
  if path is not None:
243
324
  file_path = Path(path)
325
+ data = {"min_score": str(min_score)}
326
+ if entities:
327
+ data["entities"] = entities
328
+ if vertical:
329
+ data["vertical"] = vertical
244
330
  with file_path.open("rb") as handle:
245
- response = self._session.post(
246
- self._url("/v1/documents/detect-pii"),
331
+ return self._post_async_multipart(
332
+ "/v1/documents/detect-pii",
333
+ params=params,
247
334
  files={"file": (file_path.name, handle, "application/octet-stream")},
248
335
  data=data,
249
- timeout=self.timeout,
336
+ poll=poll,
250
337
  )
251
- else:
252
- payload = {"text": text or "", "min_score": min_score}
253
- if entities:
254
- payload["entities"] = entities
255
- if vertical:
256
- payload["vertical"] = vertical
257
- response = self._session.post(
258
- self._url("/v1/documents/detect-pii"),
259
- json=payload,
260
- timeout=self.timeout,
261
- )
262
- self._raise_for_status(response)
263
- return response.json()
338
+ payload: dict[str, Any] = {"text": text or "", "min_score": min_score}
339
+ if entities:
340
+ payload["entities"] = entities
341
+ if vertical:
342
+ payload["vertical"] = vertical
343
+ return self._post_async_json(
344
+ "/v1/documents/detect-pii",
345
+ json_body=payload,
346
+ params=params,
347
+ poll=poll,
348
+ )
264
349
 
265
350
  def compare_documents(
266
351
  self,
@@ -269,27 +354,29 @@ class DocintelClient:
269
354
  text_b: str | None = None,
270
355
  path_a: str | Path | None = None,
271
356
  path_b: str | Path | None = None,
357
+ async_job: bool = False,
358
+ poll: bool = True,
272
359
  ) -> dict[str, Any]:
360
+ params = {"async": "true"} if async_job else {}
273
361
  if path_a is not None and path_b is not None:
274
362
  file_a = Path(path_a)
275
363
  file_b = Path(path_b)
276
364
  with file_a.open("rb") as handle_a, file_b.open("rb") as handle_b:
277
- response = self._session.post(
278
- self._url("/v1/documents/compare"),
365
+ return self._post_async_multipart(
366
+ "/v1/documents/compare",
367
+ params=params,
279
368
  files={
280
369
  "file_a": (file_a.name, handle_a, "application/octet-stream"),
281
370
  "file_b": (file_b.name, handle_b, "application/octet-stream"),
282
371
  },
283
- timeout=self.timeout,
372
+ poll=poll,
284
373
  )
285
- else:
286
- response = self._session.post(
287
- self._url("/v1/documents/compare"),
288
- json={"text_a": text_a or "", "text_b": text_b or ""},
289
- timeout=self.timeout,
290
- )
291
- self._raise_for_status(response)
292
- return response.json()
374
+ return self._post_async_json(
375
+ "/v1/documents/compare",
376
+ json_body={"text_a": text_a or "", "text_b": text_b or ""},
377
+ params=params,
378
+ poll=poll,
379
+ )
293
380
 
294
381
  def process_document(
295
382
  self,
@@ -302,8 +389,12 @@ class DocintelClient:
302
389
  entities: str | None = None,
303
390
  vertical: str | None = None,
304
391
  min_score: float = 0.35,
392
+ async_job: bool = False,
393
+ callback_url: str | None = None,
394
+ poll: bool = True,
305
395
  ) -> dict[str, Any]:
306
396
  file_path = Path(path)
397
+ params = {"async": "true"} if async_job else {}
307
398
  data = {
308
399
  "sentences": str(sentences),
309
400
  "include_summarize": str(include_summarize).lower(),
@@ -315,12 +406,13 @@ class DocintelClient:
315
406
  data["entities"] = entities
316
407
  if vertical:
317
408
  data["vertical"] = vertical
409
+ if callback_url:
410
+ data["callback_url"] = callback_url
318
411
  with file_path.open("rb") as handle:
319
- response = self._session.post(
320
- self._url("/v1/documents/process"),
412
+ return self._post_async_multipart(
413
+ "/v1/documents/process",
414
+ params=params,
321
415
  files={"file": (file_path.name, handle, "application/octet-stream")},
322
416
  data=data,
323
- timeout=self.timeout,
417
+ poll=poll,
324
418
  )
325
- self._raise_for_status(response)
326
- return response.json()
@@ -30,6 +30,11 @@ class JobType(str, Enum):
30
30
  TEXT_CLASSIFY = "text_classify"
31
31
  TEXT_DETECT_PII = "text_detect_pii"
32
32
  DOCUMENT_PROCESS = "document_process"
33
+ DOCUMENT_CLASSIFY = "document_classify"
34
+ DOCUMENT_SUMMARIZE = "document_summarize"
35
+ DOCUMENT_DETECT_PII = "document_detect_pii"
36
+ DOCUMENT_EXTRACT_TEXT = "document_extract_text"
37
+ DOCUMENT_COMPARE = "document_compare"
33
38
  BATCH = "batch"
34
39
 
35
40
 
@@ -187,6 +187,119 @@ def enqueue_document_process_text_job(
187
187
  )
188
188
 
189
189
 
190
+ def enqueue_classify_document_job(
191
+ job_id: str,
192
+ input_path: str,
193
+ filename: str,
194
+ content_type: str | None,
195
+ ) -> None:
196
+ queue = get_queue()
197
+ queue.enqueue(
198
+ "docintel.jobs.tasks.run_classify_document_job",
199
+ job_id=job_id,
200
+ input_path=input_path,
201
+ filename=filename,
202
+ content_type=content_type,
203
+ job_timeout=600,
204
+ result_ttl=DEFAULT_RESULT_TTL,
205
+ failure_ttl=DEFAULT_FAILURE_TTL,
206
+ )
207
+
208
+
209
+ def enqueue_summarize_document_job(
210
+ job_id: str,
211
+ input_path: str,
212
+ filename: str,
213
+ content_type: str | None,
214
+ sentences: int,
215
+ ) -> None:
216
+ queue = get_queue()
217
+ queue.enqueue(
218
+ "docintel.jobs.tasks.run_summarize_document_job",
219
+ job_id=job_id,
220
+ input_path=input_path,
221
+ filename=filename,
222
+ content_type=content_type,
223
+ sentences=sentences,
224
+ job_timeout=600,
225
+ result_ttl=DEFAULT_RESULT_TTL,
226
+ failure_ttl=DEFAULT_FAILURE_TTL,
227
+ )
228
+
229
+
230
+ def enqueue_detect_pii_document_job(
231
+ job_id: str,
232
+ input_path: str,
233
+ filename: str,
234
+ content_type: str | None,
235
+ *,
236
+ entities: list[str] | None = None,
237
+ min_score: float = 0.35,
238
+ ) -> None:
239
+ queue = get_queue()
240
+ queue.enqueue(
241
+ "docintel.jobs.tasks.run_detect_pii_document_job",
242
+ job_id=job_id,
243
+ input_path=input_path,
244
+ filename=filename,
245
+ content_type=content_type,
246
+ entities=entities,
247
+ min_score=min_score,
248
+ job_timeout=600,
249
+ result_ttl=DEFAULT_RESULT_TTL,
250
+ failure_ttl=DEFAULT_FAILURE_TTL,
251
+ )
252
+
253
+
254
+ def enqueue_extract_text_job(
255
+ job_id: str,
256
+ input_path: str,
257
+ filename: str,
258
+ content_type: str | None,
259
+ ) -> None:
260
+ queue = get_queue()
261
+ queue.enqueue(
262
+ "docintel.jobs.tasks.run_extract_text_job",
263
+ job_id=job_id,
264
+ input_path=input_path,
265
+ filename=filename,
266
+ content_type=content_type,
267
+ job_timeout=600,
268
+ result_ttl=DEFAULT_RESULT_TTL,
269
+ failure_ttl=DEFAULT_FAILURE_TTL,
270
+ )
271
+
272
+
273
+ def enqueue_compare_job(
274
+ job_id: str,
275
+ *,
276
+ text_a: str | None = None,
277
+ text_b: str | None = None,
278
+ path_a: str | None = None,
279
+ path_b: str | None = None,
280
+ filename_a: str | None = None,
281
+ filename_b: str | None = None,
282
+ content_type_a: str | None = None,
283
+ content_type_b: str | None = None,
284
+ ) -> None:
285
+ queue = get_queue()
286
+ queue.enqueue(
287
+ "docintel.jobs.tasks.run_compare_job",
288
+ job_id=job_id,
289
+ text_a=text_a,
290
+ text_b=text_b,
291
+ path_a=path_a,
292
+ path_b=path_b,
293
+ filename_a=filename_a,
294
+ filename_b=filename_b,
295
+ content_type_a=content_type_a,
296
+ content_type_b=content_type_b,
297
+ job_timeout=600,
298
+ result_ttl=DEFAULT_RESULT_TTL,
299
+ failure_ttl=DEFAULT_FAILURE_TTL,
300
+ )
301
+
302
+
190
303
  def queue_depth() -> int | None:
191
304
  """Return RQ queue length when Redis is reachable."""
192
305
  try: