knowledge2 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge2-0.4.0.dist-info/METADATA +556 -0
- knowledge2-0.4.0.dist-info/RECORD +139 -0
- knowledge2-0.4.0.dist-info/WHEEL +5 -0
- knowledge2-0.4.0.dist-info/top_level.txt +1 -0
- sdk/__init__.py +70 -0
- sdk/_async_base.py +525 -0
- sdk/_async_paging.py +57 -0
- sdk/_base.py +541 -0
- sdk/_logging.py +41 -0
- sdk/_paging.py +73 -0
- sdk/_preview.py +70 -0
- sdk/_raw_response.py +25 -0
- sdk/_request_options.py +51 -0
- sdk/_transport.py +144 -0
- sdk/_validation.py +25 -0
- sdk/_validation_response.py +36 -0
- sdk/_version.py +3 -0
- sdk/async_client.py +320 -0
- sdk/async_resources/__init__.py +45 -0
- sdk/async_resources/_mixin_base.py +42 -0
- sdk/async_resources/a2a.py +230 -0
- sdk/async_resources/agents.py +489 -0
- sdk/async_resources/audit.py +145 -0
- sdk/async_resources/auth.py +133 -0
- sdk/async_resources/console.py +409 -0
- sdk/async_resources/corpora.py +276 -0
- sdk/async_resources/deployments.py +106 -0
- sdk/async_resources/documents.py +592 -0
- sdk/async_resources/feeds.py +248 -0
- sdk/async_resources/indexes.py +208 -0
- sdk/async_resources/jobs.py +165 -0
- sdk/async_resources/metadata.py +48 -0
- sdk/async_resources/models.py +102 -0
- sdk/async_resources/onboarding.py +538 -0
- sdk/async_resources/orgs.py +37 -0
- sdk/async_resources/pipelines.py +523 -0
- sdk/async_resources/projects.py +90 -0
- sdk/async_resources/search.py +262 -0
- sdk/async_resources/training.py +357 -0
- sdk/async_resources/usage.py +91 -0
- sdk/client.py +417 -0
- sdk/config.py +182 -0
- sdk/errors.py +178 -0
- sdk/examples/auth_factory.py +34 -0
- sdk/examples/batch_operations.py +57 -0
- sdk/examples/document_upload.py +56 -0
- sdk/examples/e2e_lifecycle.py +213 -0
- sdk/examples/error_handling.py +61 -0
- sdk/examples/pagination.py +64 -0
- sdk/examples/quickstart.py +36 -0
- sdk/examples/request_options.py +44 -0
- sdk/examples/search.py +64 -0
- sdk/integrations/__init__.py +57 -0
- sdk/integrations/_client.py +101 -0
- sdk/integrations/langchain/__init__.py +6 -0
- sdk/integrations/langchain/retriever.py +166 -0
- sdk/integrations/langchain/tools.py +108 -0
- sdk/integrations/llamaindex/__init__.py +11 -0
- sdk/integrations/llamaindex/filters.py +78 -0
- sdk/integrations/llamaindex/retriever.py +162 -0
- sdk/integrations/llamaindex/tools.py +109 -0
- sdk/integrations/llamaindex/vector_store.py +320 -0
- sdk/models/__init__.py +18 -0
- sdk/models/_base.py +24 -0
- sdk/models/_registry.py +457 -0
- sdk/models/a2a.py +92 -0
- sdk/models/agents.py +109 -0
- sdk/models/audit.py +28 -0
- sdk/models/auth.py +49 -0
- sdk/models/chunks.py +20 -0
- sdk/models/common.py +14 -0
- sdk/models/console.py +103 -0
- sdk/models/corpora.py +48 -0
- sdk/models/deployments.py +13 -0
- sdk/models/documents.py +126 -0
- sdk/models/embeddings.py +24 -0
- sdk/models/evaluation.py +17 -0
- sdk/models/feedback.py +9 -0
- sdk/models/feeds.py +57 -0
- sdk/models/indexes.py +36 -0
- sdk/models/jobs.py +52 -0
- sdk/models/models.py +26 -0
- sdk/models/onboarding.py +323 -0
- sdk/models/orgs.py +11 -0
- sdk/models/pipelines.py +147 -0
- sdk/models/projects.py +19 -0
- sdk/models/search.py +149 -0
- sdk/models/training.py +57 -0
- sdk/models/usage.py +39 -0
- sdk/namespaces.py +386 -0
- sdk/py.typed +0 -0
- sdk/resources/__init__.py +45 -0
- sdk/resources/_mixin_base.py +40 -0
- sdk/resources/a2a.py +230 -0
- sdk/resources/agents.py +487 -0
- sdk/resources/audit.py +144 -0
- sdk/resources/auth.py +138 -0
- sdk/resources/console.py +411 -0
- sdk/resources/corpora.py +269 -0
- sdk/resources/deployments.py +105 -0
- sdk/resources/documents.py +597 -0
- sdk/resources/feeds.py +246 -0
- sdk/resources/indexes.py +210 -0
- sdk/resources/jobs.py +164 -0
- sdk/resources/metadata.py +53 -0
- sdk/resources/models.py +99 -0
- sdk/resources/onboarding.py +542 -0
- sdk/resources/orgs.py +35 -0
- sdk/resources/pipeline_builder.py +257 -0
- sdk/resources/pipelines.py +520 -0
- sdk/resources/projects.py +87 -0
- sdk/resources/search.py +277 -0
- sdk/resources/training.py +358 -0
- sdk/resources/usage.py +92 -0
- sdk/types/__init__.py +366 -0
- sdk/types/a2a.py +88 -0
- sdk/types/agents.py +133 -0
- sdk/types/audit.py +26 -0
- sdk/types/auth.py +45 -0
- sdk/types/chunks.py +18 -0
- sdk/types/common.py +10 -0
- sdk/types/console.py +99 -0
- sdk/types/corpora.py +42 -0
- sdk/types/deployments.py +11 -0
- sdk/types/documents.py +104 -0
- sdk/types/embeddings.py +22 -0
- sdk/types/evaluation.py +15 -0
- sdk/types/feedback.py +7 -0
- sdk/types/feeds.py +61 -0
- sdk/types/indexes.py +30 -0
- sdk/types/jobs.py +50 -0
- sdk/types/models.py +22 -0
- sdk/types/onboarding.py +395 -0
- sdk/types/orgs.py +9 -0
- sdk/types/pipelines.py +177 -0
- sdk/types/projects.py +14 -0
- sdk/types/search.py +116 -0
- sdk/types/training.py +55 -0
- sdk/types/usage.py +37 -0
|
@@ -0,0 +1,597 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from sdk._paging import Page, SyncPager
|
|
9
|
+
from sdk._request_options import RequestOptions
|
|
10
|
+
from sdk._validation import require_str
|
|
11
|
+
from sdk.errors import ConfirmationRequiredError
|
|
12
|
+
from sdk.resources._mixin_base import RequesterMixin
|
|
13
|
+
from sdk.types import (
|
|
14
|
+
ChunkingConfig,
|
|
15
|
+
DocumentBatchUploadResponse,
|
|
16
|
+
DocumentCreateResponse,
|
|
17
|
+
DocumentDeleteResponse,
|
|
18
|
+
DocumentDetailResponse,
|
|
19
|
+
DocumentManifestIngestResponse,
|
|
20
|
+
DocumentUrlIngestResponse,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DocumentsMixin(RequesterMixin):
|
|
25
|
+
def upload_document(
    self,
    corpus_id: str,
    *,
    file_path: str | None = None,
    file_bytes: bytes | None = None,
    filename: str | None = None,
    raw_text: str | None = None,
    source_uri: str | None = None,
    metadata: dict[str, Any] | None = None,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    idempotency_key: str | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentCreateResponse:
    """Upload a single document to a corpus.

    Exactly one content source may be supplied. ``file_path`` and
    ``file_bytes`` are sent as a multipart upload; ``raw_text`` is sent
    as a JSON body. All three POST to ``/v1/corpora/{corpus_id}/documents``.

    Args:
        corpus_id: Target corpus ID.
        file_path: Path to a local file to upload. The file is opened in
            binary mode and sent under its base name.
        file_bytes: Raw file bytes to upload (requires ``filename``).
        filename: Filename to send when using ``file_bytes``.
        raw_text: Raw text content to upload.
        source_uri: Optional source URI for the document.
        metadata: Optional document metadata. JSON-encoded into a form
            field for multipart uploads, sent as-is for ``raw_text``.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use ``chunking`` instead. Ignored
            when ``chunking`` is provided.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        idempotency_key: Optional key for idempotent requests.
        request_options: Per-request options forwarded to the transport.

    Returns:
        The API response, passed through ``_maybe_validate`` as a
        ``DocumentCreateResponse``.

    Raises:
        ValueError: If more than one content source is given, if
            ``file_bytes`` is used without ``filename``, or if no content
            source is given at all.
        OSError: If ``file_path`` cannot be opened for reading.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    # Compare the in-memory sources against None (not truthiness): the
    # dispatch below treats b"" / "" as real content, so an empty value
    # must still count as "provided" when checking for conflicts.
    if file_path and (file_bytes is not None or raw_text is not None):
        raise ValueError("file_path cannot be combined with file_bytes or raw_text")
    if file_bytes is not None and raw_text is not None:
        raise ValueError("file_bytes cannot be combined with raw_text")
    headers = self._idempotency_headers(idempotency_key)
    endpoint = f"/v1/corpora/{corpus_id}/documents"

    def multipart_fields() -> dict[str, str]:
        # Multipart form values must be strings, so structured values are
        # JSON-encoded and the boolean is sent as "true"/"false".
        fields: dict[str, str] = {}
        if source_uri is not None:
            fields["source_uri"] = source_uri
        if metadata is not None:
            fields["metadata"] = json.dumps(metadata)
        if auto_index is not None:
            fields["auto_index"] = str(bool(auto_index)).lower()
        # ``chunking`` supersedes the deprecated ``chunk_strategy``.
        if chunking is not None:
            fields["chunking"] = json.dumps(chunking)
        elif chunk_strategy is not None:
            fields["chunk_strategy"] = chunk_strategy
        return fields

    if file_path:
        # Build the form first so serialization errors surface before the
        # file is opened.
        form = multipart_fields()
        with open(file_path, "rb") as handle:
            data = self._request(
                "POST",
                endpoint,
                data=form,
                files={"file": (os.path.basename(file_path), handle)},
                headers=headers,
                request_options=request_options,
            )
        return self._maybe_validate(data, "DocumentCreateResponse")

    if file_bytes is not None:
        if not filename:
            raise ValueError("filename is required when using file_bytes")
        data = self._request(
            "POST",
            endpoint,
            data=multipart_fields(),
            files={"file": (filename, file_bytes)},
            headers=headers,
            request_options=request_options,
        )
        return self._maybe_validate(data, "DocumentCreateResponse")

    if raw_text is None:
        raise ValueError("raw_text is required when no file is provided")
    # JSON path: values are sent natively rather than string-encoded.
    payload: dict[str, Any] = {"raw_text": raw_text}
    if source_uri is not None:
        payload["source_uri"] = source_uri
    if metadata is not None:
        payload["metadata"] = metadata
    if auto_index is not None:
        payload["auto_index"] = auto_index
    if chunking is not None:
        payload["chunking"] = chunking
    elif chunk_strategy is not None:
        payload["chunk_strategy"] = chunk_strategy
    data = self._request(
        "POST",
        endpoint,
        json=payload,
        headers=headers,
        request_options=request_options,
    )
    return self._maybe_validate(data, "DocumentCreateResponse")
|
|
132
|
+
|
|
133
|
+
def upload_documents_batch(
    self,
    corpus_id: str,
    documents: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Submit several raw-text documents in one batch request.

    Args:
        corpus_id: Target corpus ID.
        documents: List of document dicts with raw_text, source_uri, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Only sent when
            ``chunking`` is not provided.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, block until the batch job reported by the server
            has finished (only when the response carries a ``job_id``).
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to the transport.

    Returns:
        Response with ``doc_ids`` (list of created document IDs),
        ``job_id``, and ``count``. This is the initial submission
        response; the result of waiting is not merged into it.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # ``chunking`` takes precedence over the deprecated ``chunk_strategy``.
    chunk_field = (
        ("chunking", chunking)
        if chunking is not None
        else ("chunk_strategy", chunk_strategy)
    )
    body: dict[str, Any] = {"documents": documents}
    for key, value in (("auto_index", auto_index), chunk_field):
        if value is not None:
            body[key] = value

    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:batch",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )
    if wait and (job_id := response.get("job_id")):
        self._wait_for_job(job_id, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentBatchUploadResponse")
|
|
187
|
+
|
|
188
|
+
def upload_files_batch(
    self,
    corpus_id: str,
    files: list[tuple[str, bytes]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Upload multiple files in a single multipart request.

    Creates a single ingest_documents_batch job for all files,
    enabling batch processing with near-data optimization.

    Args:
        corpus_id: Target corpus ID.
        files: List of (filename, content_bytes) tuples.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Only sent when
            ``chunking`` is not provided.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, wait for the batch job to complete.
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to the transport.

    Returns:
        Response with job_id, doc_ids, and count.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    headers = self._idempotency_headers(idempotency_key)

    # httpx expects a list of (field, file) tuples when several files
    # share the same form key; a dict would keep only one entry.
    files_list = [("files", (filename, content)) for filename, content in files]

    # Multipart form values are strings: encode the flag as "true"/"false"
    # (same encoding as upload_document) and JSON-encode the chunking config.
    form_data: dict[str, Any] = {}
    if auto_index is not None:
        form_data["auto_index"] = str(bool(auto_index)).lower()
    if chunking is not None:
        form_data["chunking"] = json.dumps(chunking)
    elif chunk_strategy is not None:
        form_data["chunk_strategy"] = chunk_strategy

    data = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:upload_batch",
        data=form_data if form_data else None,
        files=files_list,
        headers=headers,
        request_options=request_options,
    )
    if wait:
        job_id = data.get("job_id")
        if job_id:
            self._wait_for_job(job_id, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(data, "DocumentBatchUploadResponse")
|
|
254
|
+
|
|
255
|
+
def upload_documents_parallel(
    self,
    corpus_id: str,
    file_paths: list[str],
    *,
    max_workers: int = 8,
    auto_index: bool | None = None,
    chunking: ChunkingConfig | None = None,
    metadata: dict[str, Any] | None = None,
    request_options: RequestOptions | None = None,
) -> list[DocumentCreateResponse]:
    """Upload many local files at once through a thread pool.

    Every path becomes its own :meth:`upload_document` call, so the
    parallelism is entirely client-side. This differs from
    :meth:`upload_files_batch`, which packs all files into one multipart
    request for server-side batching.

    .. warning:: **All-or-nothing semantics.** If any upload fails, an
       :class:`ExceptionGroup` is raised and the results of the uploads
       that succeeded are **not** returned. Callers who need
       partial-failure recovery should call :meth:`upload_document` per
       file and handle errors themselves.

    Args:
        corpus_id: Target corpus ID.
        file_paths: List of local file paths to upload.
        max_workers: Maximum number of concurrent upload threads
            (default 8, must be between 1 and 256). The pool never uses
            more threads than there are files.
        auto_index: Whether to auto-index after ingestion.
        chunking: Chunking configuration applied to each upload.
        metadata: Optional metadata applied to every document.
        request_options: Per-request options passed to each upload.

    Returns:
        List of upload responses (one per file, in input order). An empty
        ``file_paths`` returns an empty list without starting a pool.

    Raises:
        ExceptionGroup: If one or more uploads fail, containing all
            individual exceptions from failed uploads. Successful
            results from other uploads are discarded.
        ValueError: If *max_workers* is less than 1 or greater than 256.
    """
    # NOTE(review): ExceptionGroup is a builtin only on Python 3.11+;
    # confirm this matches the package's minimum supported version.
    corpus_id = require_str(corpus_id, "corpus_id")
    if max_workers < 1:
        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
    if max_workers > 256:
        raise ValueError(f"max_workers must be <= 256, got {max_workers}")
    if not file_paths:
        return []

    failures: list[Exception] = []
    uploaded: list[DocumentCreateResponse] = []
    worker_count = min(max_workers, len(file_paths))

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        pending: list[Future[DocumentCreateResponse]] = []
        for path in file_paths:
            pending.append(
                executor.submit(
                    self.upload_document,
                    corpus_id,
                    file_path=path,
                    auto_index=auto_index,
                    chunking=chunking,
                    metadata=metadata,
                    request_options=request_options,
                )
            )
        # Drain in submission order so results line up with file_paths.
        for task in pending:
            try:
                outcome = task.result()
            except Exception as exc:
                failures.append(exc)
            else:
                uploaded.append(outcome)

    if failures:
        raise ExceptionGroup(f"{len(failures)} of {len(file_paths)} uploads failed", failures)
    return uploaded
|
|
333
|
+
|
|
334
|
+
def ingest_urls(
    self,
    corpus_id: str,
    urls: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentUrlIngestResponse:
    """Ask the service to ingest documents from a list of URLs.

    Args:
        corpus_id: Target corpus ID.
        urls: List of URL dicts with url, title, tags, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Only sent when
            ``chunking`` is not provided.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, block until the job reported by the server has
            finished (only when the response carries a ``job_id``).
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to the transport.

    Returns:
        The initial submission response, passed through
        ``_maybe_validate`` as a ``DocumentUrlIngestResponse``.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # ``chunking`` takes precedence over the deprecated ``chunk_strategy``.
    chunk_field = (
        ("chunking", chunking)
        if chunking is not None
        else ("chunk_strategy", chunk_strategy)
    )
    body: dict[str, Any] = {"urls": urls}
    for key, value in (("auto_index", auto_index), chunk_field):
        if value is not None:
            body[key] = value

    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_urls",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )
    if wait and (job_id := response.get("job_id")):
        self._wait_for_job(job_id, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentUrlIngestResponse")
|
|
384
|
+
|
|
385
|
+
def ingest_manifest(
    self,
    corpus_id: str,
    manifest_uri: str,
    max_documents: int | None = None,
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentManifestIngestResponse:
    """Ask the service to ingest the documents listed in a manifest.

    Unlike the batch and URL ingestion methods, this call does not wait
    for any job; it returns as soon as the request completes.

    Args:
        corpus_id: Target corpus ID.
        manifest_uri: URI to manifest file (S3, HTTP, local).
        max_documents: Optional limit on documents to ingest.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Only sent when
            ``chunking`` is not provided.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        request_options: Per-request options forwarded to the transport.

    Returns:
        The API response, passed through ``_maybe_validate`` as a
        ``DocumentManifestIngestResponse``.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # ``chunking`` takes precedence over the deprecated ``chunk_strategy``.
    chunk_field = (
        ("chunking", chunking)
        if chunking is not None
        else ("chunk_strategy", chunk_strategy)
    )
    optional_fields = (
        ("max_documents", max_documents),
        ("auto_index", auto_index),
        chunk_field,
    )
    body: dict[str, Any] = {"manifest_uri": manifest_uri}
    for key, value in optional_fields:
        if value is not None:
            body[key] = value

    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_manifest",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )
    return self._maybe_validate(response, "DocumentManifestIngestResponse")
|
|
427
|
+
|
|
428
|
+
def list_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    offset: int = 0,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of documents from a corpus.

    Args:
        corpus_id: Corpus whose documents are listed.
        limit: Page size passed to the listing helper.
        offset: Offset passed to the listing helper.
        q: Optional ``q`` query parameter.
        status: Optional ``status`` query parameter.
        source: Optional ``source`` query parameter.
        tag: Optional ``tag`` query parameter.
        request_options: Accepted for signature parity with other methods.

    Returns:
        A page built from the ``documents`` key of the response.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # _list_page here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    # Only filters the caller actually set are sent as query parameters.
    query: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }
    return self._list_page(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=query or None,
        limit=limit,
        offset=offset,
    )
|
|
458
|
+
|
|
459
|
+
def iter_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> SyncPager[dict[str, Any]]:
    """Return a lazy pager over the documents of a corpus.

    Takes the same optional filters as :meth:`list_documents`; only the
    filters that are set are sent as query parameters.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # _paginate here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    query: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }
    return self._paginate(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=query or None,
        limit=limit,
    )
|
|
488
|
+
|
|
489
|
+
def get_document(
    self,
    doc_id: str,
    request_options: RequestOptions | None = None,
) -> DocumentDetailResponse:
    """Fetch a single document by ID.

    Args:
        doc_id: ID of the document to retrieve.
        request_options: Per-request options forwarded to the transport.

    Returns:
        The API response, passed through ``_maybe_validate`` as a
        ``DocumentDetailResponse``.
    """
    doc_id = require_str(doc_id, "doc_id")
    detail = self._request(
        "GET",
        f"/v1/documents/{doc_id}",
        request_options=request_options,
    )
    return self._maybe_validate(detail, "DocumentDetailResponse")
|
|
497
|
+
|
|
498
|
+
def update_document_metadata(
    self,
    doc_id: str,
    metadata: dict[str, Any],
    request_options: RequestOptions | None = None,
) -> dict[str, Any]:
    """Patch a document's customer metadata with merge semantics.

    - Keys with non-empty values are added or updated.
    - Keys with empty string or None values are removed.
    - Keys not in the request are left unchanged.

    Args:
        doc_id: Document ID to update
        metadata: Dict of metadata updates to apply
        request_options: Per-request options forwarded to the transport.

    Returns:
        Updated metadata dict with custom_metadata and system_metadata.
        The response is returned as received, without model validation.
    """
    doc_id = require_str(doc_id, "doc_id")
    return self._request(
        "PATCH",
        f"/v1/documents/{doc_id}/metadata",
        json=metadata,
        request_options=request_options,
    )
|
|
525
|
+
|
|
526
|
+
def delete_document(
    self,
    corpus_id: str,
    doc_id: str,
    *,
    confirm: bool = False,
    reindex: bool = False,
    request_options: RequestOptions | None = None,
) -> DocumentDeleteResponse:
    """Permanently remove a document from a corpus.

    Deletion cannot be undone, so the call is refused unless the caller
    explicitly passes ``confirm=True``.

    Args:
        corpus_id: The corpus containing the document.
        doc_id: Unique identifier of the document to delete.
        confirm: Safety guard - must be ``True`` to execute the
            deletion. Raises ``ConfirmationRequiredError`` when ``False``.
        reindex: If ``True``, trigger a re-index of the corpus after
            deletion. Sent as the ``reindex`` query parameter.
        request_options: Per-request options forwarded to the transport.

    Returns:
        Confirmation of the deletion.

    Raises:
        ConfirmationRequiredError: If *confirm* is not ``True``.
        Knowledge2Error: If the API request fails.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    doc_id = require_str(doc_id, "doc_id")
    # Both IDs are validated before the confirmation guard, so malformed
    # IDs are reported even when confirm is False.
    if not confirm:
        raise ConfirmationRequiredError("document", doc_id)

    target = f"/v1/corpora/{corpus_id}/documents/{doc_id}"
    query = {"reindex": reindex}
    outcome = self._request(
        "DELETE",
        target,
        params=query,
        request_options=request_options,
    )
    return self._maybe_validate(outcome, "DocumentDeleteResponse")
|
|
566
|
+
|
|
567
|
+
def list_chunks(
    self,
    corpus_id: str,
    limit: int = 100,
    offset: int = 0,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of chunks from a corpus.

    Args:
        corpus_id: Corpus whose chunks are listed.
        limit: Page size passed to the listing helper.
        offset: Offset passed to the listing helper.
        request_options: Accepted for signature parity with other methods.

    Returns:
        A page built from the ``chunks`` key of the response.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # _list_page here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    chunks_path = f"/v1/corpora/{corpus_id}/chunks"
    return self._list_page(
        "GET",
        chunks_path,
        items_key="chunks",
        limit=limit,
        offset=offset,
    )
|
|
582
|
+
|
|
583
|
+
def iter_chunks(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    request_options: RequestOptions | None = None,
) -> SyncPager[dict[str, Any]]:
    """Return a lazy pager over the chunks of a corpus.

    Args:
        corpus_id: Corpus whose chunks are iterated.
        limit: Page size passed to the pagination helper.
        request_options: Accepted for signature parity with other methods.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # _paginate here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    chunks_path = f"/v1/corpora/{corpus_id}/chunks"
    return self._paginate(
        "GET",
        chunks_path,
        items_key="chunks",
        limit=limit,
    )
|