knowledge2 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge2-0.4.0.dist-info/METADATA +556 -0
- knowledge2-0.4.0.dist-info/RECORD +139 -0
- knowledge2-0.4.0.dist-info/WHEEL +5 -0
- knowledge2-0.4.0.dist-info/top_level.txt +1 -0
- sdk/__init__.py +70 -0
- sdk/_async_base.py +525 -0
- sdk/_async_paging.py +57 -0
- sdk/_base.py +541 -0
- sdk/_logging.py +41 -0
- sdk/_paging.py +73 -0
- sdk/_preview.py +70 -0
- sdk/_raw_response.py +25 -0
- sdk/_request_options.py +51 -0
- sdk/_transport.py +144 -0
- sdk/_validation.py +25 -0
- sdk/_validation_response.py +36 -0
- sdk/_version.py +3 -0
- sdk/async_client.py +320 -0
- sdk/async_resources/__init__.py +45 -0
- sdk/async_resources/_mixin_base.py +42 -0
- sdk/async_resources/a2a.py +230 -0
- sdk/async_resources/agents.py +489 -0
- sdk/async_resources/audit.py +145 -0
- sdk/async_resources/auth.py +133 -0
- sdk/async_resources/console.py +409 -0
- sdk/async_resources/corpora.py +276 -0
- sdk/async_resources/deployments.py +106 -0
- sdk/async_resources/documents.py +592 -0
- sdk/async_resources/feeds.py +248 -0
- sdk/async_resources/indexes.py +208 -0
- sdk/async_resources/jobs.py +165 -0
- sdk/async_resources/metadata.py +48 -0
- sdk/async_resources/models.py +102 -0
- sdk/async_resources/onboarding.py +538 -0
- sdk/async_resources/orgs.py +37 -0
- sdk/async_resources/pipelines.py +523 -0
- sdk/async_resources/projects.py +90 -0
- sdk/async_resources/search.py +262 -0
- sdk/async_resources/training.py +357 -0
- sdk/async_resources/usage.py +91 -0
- sdk/client.py +417 -0
- sdk/config.py +182 -0
- sdk/errors.py +178 -0
- sdk/examples/auth_factory.py +34 -0
- sdk/examples/batch_operations.py +57 -0
- sdk/examples/document_upload.py +56 -0
- sdk/examples/e2e_lifecycle.py +213 -0
- sdk/examples/error_handling.py +61 -0
- sdk/examples/pagination.py +64 -0
- sdk/examples/quickstart.py +36 -0
- sdk/examples/request_options.py +44 -0
- sdk/examples/search.py +64 -0
- sdk/integrations/__init__.py +57 -0
- sdk/integrations/_client.py +101 -0
- sdk/integrations/langchain/__init__.py +6 -0
- sdk/integrations/langchain/retriever.py +166 -0
- sdk/integrations/langchain/tools.py +108 -0
- sdk/integrations/llamaindex/__init__.py +11 -0
- sdk/integrations/llamaindex/filters.py +78 -0
- sdk/integrations/llamaindex/retriever.py +162 -0
- sdk/integrations/llamaindex/tools.py +109 -0
- sdk/integrations/llamaindex/vector_store.py +320 -0
- sdk/models/__init__.py +18 -0
- sdk/models/_base.py +24 -0
- sdk/models/_registry.py +457 -0
- sdk/models/a2a.py +92 -0
- sdk/models/agents.py +109 -0
- sdk/models/audit.py +28 -0
- sdk/models/auth.py +49 -0
- sdk/models/chunks.py +20 -0
- sdk/models/common.py +14 -0
- sdk/models/console.py +103 -0
- sdk/models/corpora.py +48 -0
- sdk/models/deployments.py +13 -0
- sdk/models/documents.py +126 -0
- sdk/models/embeddings.py +24 -0
- sdk/models/evaluation.py +17 -0
- sdk/models/feedback.py +9 -0
- sdk/models/feeds.py +57 -0
- sdk/models/indexes.py +36 -0
- sdk/models/jobs.py +52 -0
- sdk/models/models.py +26 -0
- sdk/models/onboarding.py +323 -0
- sdk/models/orgs.py +11 -0
- sdk/models/pipelines.py +147 -0
- sdk/models/projects.py +19 -0
- sdk/models/search.py +149 -0
- sdk/models/training.py +57 -0
- sdk/models/usage.py +39 -0
- sdk/namespaces.py +386 -0
- sdk/py.typed +0 -0
- sdk/resources/__init__.py +45 -0
- sdk/resources/_mixin_base.py +40 -0
- sdk/resources/a2a.py +230 -0
- sdk/resources/agents.py +487 -0
- sdk/resources/audit.py +144 -0
- sdk/resources/auth.py +138 -0
- sdk/resources/console.py +411 -0
- sdk/resources/corpora.py +269 -0
- sdk/resources/deployments.py +105 -0
- sdk/resources/documents.py +597 -0
- sdk/resources/feeds.py +246 -0
- sdk/resources/indexes.py +210 -0
- sdk/resources/jobs.py +164 -0
- sdk/resources/metadata.py +53 -0
- sdk/resources/models.py +99 -0
- sdk/resources/onboarding.py +542 -0
- sdk/resources/orgs.py +35 -0
- sdk/resources/pipeline_builder.py +257 -0
- sdk/resources/pipelines.py +520 -0
- sdk/resources/projects.py +87 -0
- sdk/resources/search.py +277 -0
- sdk/resources/training.py +358 -0
- sdk/resources/usage.py +92 -0
- sdk/types/__init__.py +366 -0
- sdk/types/a2a.py +88 -0
- sdk/types/agents.py +133 -0
- sdk/types/audit.py +26 -0
- sdk/types/auth.py +45 -0
- sdk/types/chunks.py +18 -0
- sdk/types/common.py +10 -0
- sdk/types/console.py +99 -0
- sdk/types/corpora.py +42 -0
- sdk/types/deployments.py +11 -0
- sdk/types/documents.py +104 -0
- sdk/types/embeddings.py +22 -0
- sdk/types/evaluation.py +15 -0
- sdk/types/feedback.py +7 -0
- sdk/types/feeds.py +61 -0
- sdk/types/indexes.py +30 -0
- sdk/types/jobs.py +50 -0
- sdk/types/models.py +22 -0
- sdk/types/onboarding.py +395 -0
- sdk/types/orgs.py +9 -0
- sdk/types/pipelines.py +177 -0
- sdk/types/projects.py +14 -0
- sdk/types/search.py +116 -0
- sdk/types/training.py +55 -0
- sdk/types/usage.py +37 -0
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from sdk._async_paging import AsyncPager
|
|
9
|
+
from sdk._paging import Page
|
|
10
|
+
from sdk._request_options import RequestOptions
|
|
11
|
+
from sdk._validation import require_str
|
|
12
|
+
from sdk.async_resources._mixin_base import AsyncRequesterMixin
|
|
13
|
+
from sdk.errors import ConfirmationRequiredError
|
|
14
|
+
from sdk.types import (
|
|
15
|
+
ChunkingConfig,
|
|
16
|
+
DocumentBatchUploadResponse,
|
|
17
|
+
DocumentCreateResponse,
|
|
18
|
+
DocumentDeleteResponse,
|
|
19
|
+
DocumentDetailResponse,
|
|
20
|
+
DocumentManifestIngestResponse,
|
|
21
|
+
DocumentUrlIngestResponse,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AsyncDocumentsMixin(AsyncRequesterMixin):
|
|
26
|
+
async def upload_document(
    self,
    corpus_id: str,
    *,
    file_path: str | None = None,
    file_bytes: bytes | None = None,
    filename: str | None = None,
    raw_text: str | None = None,
    source_uri: str | None = None,
    metadata: dict[str, Any] | None = None,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    idempotency_key: str | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentCreateResponse:
    """Upload a single document to a corpus.

    Exactly one content source may be supplied: ``file_path``,
    ``file_bytes`` (together with ``filename``) or ``raw_text``. File
    sources are sent as a multipart form POST; ``raw_text`` is sent as a
    JSON POST to the same endpoint.

    Args:
        corpus_id: Target corpus ID.
        file_path: Path to a local file to upload. The uploaded filename
            is the basename of this path.
        file_bytes: Raw file bytes to upload.
        filename: Filename to send with ``file_bytes`` (required in that
            case; not used for ``file_path`` uploads).
        raw_text: Raw text content to upload.
        source_uri: Optional source URI for the document.
        metadata: Optional document metadata. JSON-encoded into a form
            field for file uploads, sent as-is for ``raw_text`` uploads.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use ``chunking`` instead. Only sent
            when ``chunking`` is ``None``.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        idempotency_key: Optional key for idempotent requests.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        The response data after ``_maybe_validate`` has been applied with
        the ``"DocumentCreateResponse"`` model name.

    Raises:
        ValueError: If more than one content source is supplied, if
            ``file_bytes`` is given without ``filename``, or if no
            content source is supplied at all.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # "Provided" must mean the same thing here as in the dispatch below
    # (``is not None``); otherwise an empty ``b""`` / ``""`` would slip past
    # the exclusivity checks and one of the sources would be silently dropped.
    has_bytes = file_bytes is not None
    has_text = raw_text is not None
    if file_path and (has_bytes or has_text):
        raise ValueError("file_path cannot be combined with file_bytes or raw_text")
    if has_bytes and has_text:
        raise ValueError("file_bytes cannot be combined with raw_text")

    headers = self._idempotency_headers(idempotency_key)
    endpoint = f"/v1/corpora/{corpus_id}/documents"

    def _multipart_fields() -> dict[str, str]:
        # Multipart form values must be strings, so structured values are
        # JSON-encoded and the boolean is lowered to "true"/"false".
        fields: dict[str, str] = {}
        if source_uri is not None:
            fields["source_uri"] = source_uri
        if metadata is not None:
            fields["metadata"] = json.dumps(metadata)
        if auto_index is not None:
            fields["auto_index"] = str(bool(auto_index)).lower()
        if chunking is not None:
            fields["chunking"] = json.dumps(chunking)
        elif chunk_strategy is not None:
            fields["chunk_strategy"] = chunk_strategy
        return fields

    if file_path:
        form = _multipart_fields()
        # The handle stays open for the whole request so the body can be
        # read from it while the request is in flight.
        with open(file_path, "rb") as handle:
            data = await self._request(
                "POST",
                endpoint,
                data=form,
                files={"file": (os.path.basename(file_path), handle)},
                headers=headers,
                request_options=request_options,
            )
        return self._maybe_validate(data, "DocumentCreateResponse")

    if has_bytes:
        if not filename:
            raise ValueError("filename is required when using file_bytes")
        data = await self._request(
            "POST",
            endpoint,
            data=_multipart_fields(),
            files={"file": (filename, file_bytes)},
            headers=headers,
            request_options=request_options,
        )
        return self._maybe_validate(data, "DocumentCreateResponse")

    if not has_text:
        raise ValueError("raw_text is required when no file is provided")

    # JSON path: values are sent with their native types (no string coercion).
    payload: dict[str, Any] = {"raw_text": raw_text}
    if source_uri is not None:
        payload["source_uri"] = source_uri
    if metadata is not None:
        payload["metadata"] = metadata
    if auto_index is not None:
        payload["auto_index"] = auto_index
    if chunking is not None:
        payload["chunking"] = chunking
    elif chunk_strategy is not None:
        payload["chunk_strategy"] = chunk_strategy
    data = await self._request(
        "POST",
        endpoint,
        json=payload,
        headers=headers,
        request_options=request_options,
    )
    return self._maybe_validate(data, "DocumentCreateResponse")
|
|
133
|
+
|
|
134
|
+
async def upload_documents_batch(
    self,
    corpus_id: str,
    documents: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Submit several raw-text documents to a corpus in one batch request.

    Args:
        corpus_id: Target corpus ID.
        documents: List of document dicts with raw_text, source_uri, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use ``chunking`` instead. Only sent
            when ``chunking`` is ``None``.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True and the response carries a ``job_id``, block until
            ``_wait_for_job`` returns for that job.
        poll_s: Polling interval passed to ``_wait_for_job``.
        timeout_s: Maximum seconds to wait, passed to ``_wait_for_job``.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        Response with ``doc_ids`` (list of created document IDs),
        ``job_id``, and ``count``. This is the initial submission
        response; it is not re-fetched after waiting.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"documents": documents}
    if auto_index is not None:
        body["auto_index"] = auto_index
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    response = await self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:batch",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )

    # Only look up the job when the caller asked to wait for it.
    pending_job = response.get("job_id") if wait else None
    if pending_job:
        await self._wait_for_job(pending_job, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentBatchUploadResponse")
|
|
186
|
+
|
|
187
|
+
async def upload_files_batch(
    self,
    corpus_id: str,
    files: list[tuple[str, bytes]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Upload multiple files in a single multipart request.

    Args:
        corpus_id: Target corpus ID.
        files: List of (filename, content_bytes) tuples. Each is sent as
            a part under the repeated ``files`` field.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion. Sent as the
            form string ``"true"`` or ``"false"``.
        chunk_strategy: Deprecated - use ``chunking`` instead. Only sent
            when ``chunking`` is ``None``.
        chunking: Chunking configuration (strategy, chunk_size, overlap,
            etc.), JSON-encoded into a form field.
        wait: If True and the response carries a ``job_id``, block until
            ``_wait_for_job`` returns for that job.
        poll_s: Polling interval passed to ``_wait_for_job``.
        timeout_s: Maximum seconds to wait, passed to ``_wait_for_job``.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        Response with job_id, doc_ids, and count.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    headers = self._idempotency_headers(idempotency_key)

    parts = [("files", (name, content)) for name, content in files]

    form_data: dict[str, Any] = {}
    if auto_index is not None:
        # Coerce through bool() first, as upload_document does, so truthy
        # non-bool values (e.g. 1) are sent as "true" rather than "1".
        form_data["auto_index"] = str(bool(auto_index)).lower()
    if chunking is not None:
        form_data["chunking"] = json.dumps(chunking)
    elif chunk_strategy is not None:
        form_data["chunk_strategy"] = chunk_strategy

    data = await self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:upload_batch",
        # No form fields at all is sent as ``None`` rather than an empty dict.
        data=form_data or None,
        files=parts,
        headers=headers,
        request_options=request_options,
    )
    if wait:
        job_id = data.get("job_id")
        if job_id:
            await self._wait_for_job(job_id, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(data, "DocumentBatchUploadResponse")
|
|
243
|
+
|
|
244
|
+
async def upload_documents_parallel(
    self,
    corpus_id: str,
    file_paths: list[str],
    *,
    max_workers: int = 8,
    auto_index: bool | None = None,
    chunking: ChunkingConfig | None = None,
    metadata: dict[str, Any] | None = None,
    request_options: RequestOptions | None = None,
) -> list[DocumentCreateResponse]:
    """Upload many local files concurrently, one request per file.

    Every path is uploaded through :meth:`upload_document`; an
    ``asyncio.Semaphore`` caps how many uploads are in flight at once and
    ``asyncio.gather`` collects the outcomes. This differs from
    :meth:`upload_files_batch`, which sends all files in one multipart
    request.

    .. warning:: **All-or-nothing result.** If any upload fails, an
       :class:`ExceptionGroup` is raised and the responses of the uploads
       that did succeed are **not** returned. Callers who need
       per-file recovery should call :meth:`upload_document` themselves
       and handle errors per file.

    Args:
        corpus_id: Target corpus ID.
        file_paths: List of local file paths to upload.
        max_workers: Maximum number of concurrent upload coroutines
            (default 8, must be between 1 and 256 inclusive).
        auto_index: Whether to auto-index after ingestion.
        chunking: Chunking configuration applied to each upload.
        metadata: Optional metadata applied to every document.
        request_options: Per-request options forwarded to each upload.

    Returns:
        List of upload responses (one per file, in input order). An empty
        ``file_paths`` returns an empty list without any request.

    Raises:
        ExceptionGroup: If one or more uploads fail, containing one
            exception per failed upload. Non-``Exception`` failures are
            wrapped in ``RuntimeError`` with the original as ``__cause__``.
        ValueError: If *max_workers* is less than 1 or greater than 256.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    if max_workers < 1:
        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
    if max_workers > 256:
        raise ValueError(f"max_workers must be <= 256, got {max_workers}")
    if not file_paths:
        return []

    gate = asyncio.Semaphore(max_workers)

    async def _upload_one(path: str) -> DocumentCreateResponse:
        # Hold a semaphore slot for the full duration of the upload.
        async with gate:
            return await self.upload_document(
                corpus_id,
                file_path=path,
                auto_index=auto_index,
                chunking=chunking,
                metadata=metadata,
                request_options=request_options,
            )

    # return_exceptions=True lets every upload run to completion instead of
    # surfacing only the first failure.
    outcomes = await asyncio.gather(
        *(_upload_one(path) for path in file_paths),
        return_exceptions=True,
    )

    uploaded: list[DocumentCreateResponse] = []
    failures: list[Exception] = []
    for outcome in outcomes:
        if not isinstance(outcome, BaseException):
            uploaded.append(outcome)
        elif isinstance(outcome, Exception):
            failures.append(outcome)
        else:
            # ExceptionGroup only accepts Exception instances, so anything
            # that is merely a BaseException is wrapped first.
            wrapped = RuntimeError(str(outcome))
            wrapped.__cause__ = outcome
            failures.append(wrapped)

    if failures:
        # NOTE(review): ExceptionGroup is a builtin only on Python 3.11+;
        # confirm this matches the package's minimum supported version.
        raise ExceptionGroup(f"{len(failures)} of {len(file_paths)} uploads failed", failures)
    return uploaded
|
|
328
|
+
|
|
329
|
+
async def ingest_urls(
    self,
    corpus_id: str,
    urls: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentUrlIngestResponse:
    """Ask the service to ingest documents from a list of URLs.

    Args:
        corpus_id: Target corpus ID.
        urls: List of URL dicts with url, title, tags, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use ``chunking`` instead. Only sent
            when ``chunking`` is ``None``.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True and the response carries a ``job_id``, block until
            ``_wait_for_job`` returns for that job.
        poll_s: Polling interval passed to ``_wait_for_job``.
        timeout_s: Maximum seconds to wait, passed to ``_wait_for_job``.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        The initial submission response after ``_maybe_validate`` has
        been applied with the ``"DocumentUrlIngestResponse"`` model name.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"urls": urls}
    if auto_index is not None:
        body["auto_index"] = auto_index
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    response = await self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_urls",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )

    # Only look up the job when the caller asked to wait for it.
    pending_job = response.get("job_id") if wait else None
    if pending_job:
        await self._wait_for_job(pending_job, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentUrlIngestResponse")
|
|
377
|
+
|
|
378
|
+
async def ingest_manifest(
    self,
    corpus_id: str,
    manifest_uri: str,
    max_documents: int | None = None,
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentManifestIngestResponse:
    """Ask the service to ingest the documents listed in a manifest file.

    Unlike the batch and URL ingestion methods, this method does not wait
    for any job; it returns as soon as the request completes.

    Args:
        corpus_id: Target corpus ID.
        manifest_uri: URI to manifest file (S3, HTTP, local).
        max_documents: Optional limit on documents to ingest.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use ``chunking`` instead. Only sent
            when ``chunking`` is ``None``.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        The response data after ``_maybe_validate`` has been applied with
        the ``"DocumentManifestIngestResponse"`` model name.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"manifest_uri": manifest_uri}
    # Optional scalar settings are only sent when explicitly provided.
    for key, value in (("max_documents", max_documents), ("auto_index", auto_index)):
        if value is not None:
            body[key] = value
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    response = await self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_manifest",
        json=body,
        headers=self._idempotency_headers(idempotency_key),
        request_options=request_options,
    )
    return self._maybe_validate(response, "DocumentManifestIngestResponse")
|
|
420
|
+
|
|
421
|
+
async def list_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    offset: int = 0,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of documents from a corpus.

    Args:
        corpus_id: Corpus whose documents are listed.
        limit: Page size passed to ``_list_page``.
        offset: Page offset passed to ``_list_page``.
        q: Optional ``q`` query parameter.
        status: Optional ``status`` query parameter.
        source: Optional ``source`` query parameter.
        tag: Optional ``tag`` query parameter.
        request_options: Accepted for signature parity.
            NOTE(review): this value is not forwarded to ``_list_page``
            here - confirm whether that is intentional.

    Returns:
        The page returned by ``_list_page`` for the ``documents`` key.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # Only filters the caller actually set become query parameters.
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    query: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }

    return await self._list_page(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=query if query else None,
        limit=limit,
        offset=offset,
    )
|
|
451
|
+
|
|
452
|
+
def iter_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> AsyncPager[dict[str, Any]]:
    """Return a lazy pager over a corpus's documents, item by item.

    This is a plain (non-async) method: it only builds the pager returned
    by ``_paginate``.

    Args:
        corpus_id: Corpus whose documents are iterated.
        limit: Page size passed to ``_paginate``.
        q: Optional ``q`` query parameter.
        status: Optional ``status`` query parameter.
        source: Optional ``source`` query parameter.
        tag: Optional ``tag`` query parameter.
        request_options: Accepted for signature parity.
            NOTE(review): this value is not forwarded to ``_paginate``
            here - confirm whether that is intentional.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    # Only filters the caller actually set become query parameters.
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    query: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }

    return self._paginate(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=query or None,
        limit=limit,
    )
|
|
481
|
+
|
|
482
|
+
async def get_document(
    self,
    doc_id: str,
    request_options: RequestOptions | None = None,
) -> DocumentDetailResponse:
    """Fetch the detail record for a single document.

    Args:
        doc_id: ID of the document to fetch.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        The response data after ``_maybe_validate`` has been applied with
        the ``"DocumentDetailResponse"`` model name.
    """
    doc_id = require_str(doc_id, "doc_id")
    endpoint = f"/v1/documents/{doc_id}"
    detail = await self._request("GET", endpoint, request_options=request_options)
    return self._maybe_validate(detail, "DocumentDetailResponse")
|
|
492
|
+
|
|
493
|
+
async def update_document_metadata(
    self,
    doc_id: str,
    metadata: dict[str, Any],
    request_options: RequestOptions | None = None,
) -> dict[str, Any]:
    """Patch a document's customer metadata (merge semantics).

    Keys with non-empty values are added or updated.
    Keys with empty string or None values are removed.
    Keys not in the request are left unchanged.

    Args:
        doc_id: Document ID to update.
        metadata: Dict of metadata updates to apply; sent as the JSON
            body of the PATCH request.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        Updated metadata dict with custom_metadata and system_metadata.
        The response is returned as received, without ``_maybe_validate``.
    """
    doc_id = require_str(doc_id, "doc_id")
    endpoint = f"/v1/documents/{doc_id}/metadata"
    return await self._request(
        "PATCH",
        endpoint,
        json=metadata,
        request_options=request_options,
    )
|
|
520
|
+
|
|
521
|
+
async def delete_document(
    self,
    corpus_id: str,
    doc_id: str,
    *,
    confirm: bool = False,
    reindex: bool = False,
    request_options: RequestOptions | None = None,
) -> DocumentDeleteResponse:
    """Permanently remove a document from a corpus.

    Deletion cannot be undone, so the call is refused unless the caller
    explicitly passes ``confirm=True``.

    Args:
        corpus_id: The corpus containing the document.
        doc_id: Unique identifier of the document to delete.
        confirm: Safety guard -- must be truthy for the DELETE request
            to be sent.
        reindex: Sent as the ``reindex`` query parameter; if ``True``,
            trigger a re-index of the corpus after deletion.
        request_options: Per-request options forwarded to ``_request``.

    Returns:
        Confirmation of the deletion.

    Raises:
        ConfirmationRequiredError: If *confirm* is not truthy. Both IDs
            are validated before this check.
        Knowledge2Error: If the API request fails.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    doc_id = require_str(doc_id, "doc_id")

    if confirm:
        outcome = await self._request(
            "DELETE",
            f"/v1/corpora/{corpus_id}/documents/{doc_id}",
            params={"reindex": reindex},
            request_options=request_options,
        )
        return self._maybe_validate(outcome, "DocumentDeleteResponse")

    # No confirmation given: refuse before any request is made.
    raise ConfirmationRequiredError("document", doc_id)
|
|
561
|
+
|
|
562
|
+
async def list_chunks(
    self,
    corpus_id: str,
    limit: int = 100,
    offset: int = 0,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of chunks from a corpus.

    Args:
        corpus_id: Corpus whose chunks are listed.
        limit: Page size passed to ``_list_page``.
        offset: Page offset passed to ``_list_page``.
        request_options: Accepted for signature parity.
            NOTE(review): this value is not forwarded to ``_list_page``
            here - confirm whether that is intentional.

    Returns:
        The page returned by ``_list_page`` for the ``chunks`` key.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    endpoint = f"/v1/corpora/{corpus_id}/chunks"
    page = await self._list_page(
        "GET", endpoint, items_key="chunks", limit=limit, offset=offset
    )
    return page
|
|
577
|
+
|
|
578
|
+
def iter_chunks(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    request_options: RequestOptions | None = None,
) -> AsyncPager[dict[str, Any]]:
    """Return a lazy pager over a corpus's chunks, item by item.

    This is a plain (non-async) method: it only builds the pager returned
    by ``_paginate``.

    Args:
        corpus_id: Corpus whose chunks are iterated.
        limit: Page size passed to ``_paginate``.
        request_options: Accepted for signature parity.
            NOTE(review): this value is not forwarded to ``_paginate``
            here - confirm whether that is intentional.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    endpoint = f"/v1/corpora/{corpus_id}/chunks"
    pager = self._paginate("GET", endpoint, items_key="chunks", limit=limit)
    return pager
|