knowledge2 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. knowledge2-0.4.0.dist-info/METADATA +556 -0
  2. knowledge2-0.4.0.dist-info/RECORD +139 -0
  3. knowledge2-0.4.0.dist-info/WHEEL +5 -0
  4. knowledge2-0.4.0.dist-info/top_level.txt +1 -0
  5. sdk/__init__.py +70 -0
  6. sdk/_async_base.py +525 -0
  7. sdk/_async_paging.py +57 -0
  8. sdk/_base.py +541 -0
  9. sdk/_logging.py +41 -0
  10. sdk/_paging.py +73 -0
  11. sdk/_preview.py +70 -0
  12. sdk/_raw_response.py +25 -0
  13. sdk/_request_options.py +51 -0
  14. sdk/_transport.py +144 -0
  15. sdk/_validation.py +25 -0
  16. sdk/_validation_response.py +36 -0
  17. sdk/_version.py +3 -0
  18. sdk/async_client.py +320 -0
  19. sdk/async_resources/__init__.py +45 -0
  20. sdk/async_resources/_mixin_base.py +42 -0
  21. sdk/async_resources/a2a.py +230 -0
  22. sdk/async_resources/agents.py +489 -0
  23. sdk/async_resources/audit.py +145 -0
  24. sdk/async_resources/auth.py +133 -0
  25. sdk/async_resources/console.py +409 -0
  26. sdk/async_resources/corpora.py +276 -0
  27. sdk/async_resources/deployments.py +106 -0
  28. sdk/async_resources/documents.py +592 -0
  29. sdk/async_resources/feeds.py +248 -0
  30. sdk/async_resources/indexes.py +208 -0
  31. sdk/async_resources/jobs.py +165 -0
  32. sdk/async_resources/metadata.py +48 -0
  33. sdk/async_resources/models.py +102 -0
  34. sdk/async_resources/onboarding.py +538 -0
  35. sdk/async_resources/orgs.py +37 -0
  36. sdk/async_resources/pipelines.py +523 -0
  37. sdk/async_resources/projects.py +90 -0
  38. sdk/async_resources/search.py +262 -0
  39. sdk/async_resources/training.py +357 -0
  40. sdk/async_resources/usage.py +91 -0
  41. sdk/client.py +417 -0
  42. sdk/config.py +182 -0
  43. sdk/errors.py +178 -0
  44. sdk/examples/auth_factory.py +34 -0
  45. sdk/examples/batch_operations.py +57 -0
  46. sdk/examples/document_upload.py +56 -0
  47. sdk/examples/e2e_lifecycle.py +213 -0
  48. sdk/examples/error_handling.py +61 -0
  49. sdk/examples/pagination.py +64 -0
  50. sdk/examples/quickstart.py +36 -0
  51. sdk/examples/request_options.py +44 -0
  52. sdk/examples/search.py +64 -0
  53. sdk/integrations/__init__.py +57 -0
  54. sdk/integrations/_client.py +101 -0
  55. sdk/integrations/langchain/__init__.py +6 -0
  56. sdk/integrations/langchain/retriever.py +166 -0
  57. sdk/integrations/langchain/tools.py +108 -0
  58. sdk/integrations/llamaindex/__init__.py +11 -0
  59. sdk/integrations/llamaindex/filters.py +78 -0
  60. sdk/integrations/llamaindex/retriever.py +162 -0
  61. sdk/integrations/llamaindex/tools.py +109 -0
  62. sdk/integrations/llamaindex/vector_store.py +320 -0
  63. sdk/models/__init__.py +18 -0
  64. sdk/models/_base.py +24 -0
  65. sdk/models/_registry.py +457 -0
  66. sdk/models/a2a.py +92 -0
  67. sdk/models/agents.py +109 -0
  68. sdk/models/audit.py +28 -0
  69. sdk/models/auth.py +49 -0
  70. sdk/models/chunks.py +20 -0
  71. sdk/models/common.py +14 -0
  72. sdk/models/console.py +103 -0
  73. sdk/models/corpora.py +48 -0
  74. sdk/models/deployments.py +13 -0
  75. sdk/models/documents.py +126 -0
  76. sdk/models/embeddings.py +24 -0
  77. sdk/models/evaluation.py +17 -0
  78. sdk/models/feedback.py +9 -0
  79. sdk/models/feeds.py +57 -0
  80. sdk/models/indexes.py +36 -0
  81. sdk/models/jobs.py +52 -0
  82. sdk/models/models.py +26 -0
  83. sdk/models/onboarding.py +323 -0
  84. sdk/models/orgs.py +11 -0
  85. sdk/models/pipelines.py +147 -0
  86. sdk/models/projects.py +19 -0
  87. sdk/models/search.py +149 -0
  88. sdk/models/training.py +57 -0
  89. sdk/models/usage.py +39 -0
  90. sdk/namespaces.py +386 -0
  91. sdk/py.typed +0 -0
  92. sdk/resources/__init__.py +45 -0
  93. sdk/resources/_mixin_base.py +40 -0
  94. sdk/resources/a2a.py +230 -0
  95. sdk/resources/agents.py +487 -0
  96. sdk/resources/audit.py +144 -0
  97. sdk/resources/auth.py +138 -0
  98. sdk/resources/console.py +411 -0
  99. sdk/resources/corpora.py +269 -0
  100. sdk/resources/deployments.py +105 -0
  101. sdk/resources/documents.py +597 -0
  102. sdk/resources/feeds.py +246 -0
  103. sdk/resources/indexes.py +210 -0
  104. sdk/resources/jobs.py +164 -0
  105. sdk/resources/metadata.py +53 -0
  106. sdk/resources/models.py +99 -0
  107. sdk/resources/onboarding.py +542 -0
  108. sdk/resources/orgs.py +35 -0
  109. sdk/resources/pipeline_builder.py +257 -0
  110. sdk/resources/pipelines.py +520 -0
  111. sdk/resources/projects.py +87 -0
  112. sdk/resources/search.py +277 -0
  113. sdk/resources/training.py +358 -0
  114. sdk/resources/usage.py +92 -0
  115. sdk/types/__init__.py +366 -0
  116. sdk/types/a2a.py +88 -0
  117. sdk/types/agents.py +133 -0
  118. sdk/types/audit.py +26 -0
  119. sdk/types/auth.py +45 -0
  120. sdk/types/chunks.py +18 -0
  121. sdk/types/common.py +10 -0
  122. sdk/types/console.py +99 -0
  123. sdk/types/corpora.py +42 -0
  124. sdk/types/deployments.py +11 -0
  125. sdk/types/documents.py +104 -0
  126. sdk/types/embeddings.py +22 -0
  127. sdk/types/evaluation.py +15 -0
  128. sdk/types/feedback.py +7 -0
  129. sdk/types/feeds.py +61 -0
  130. sdk/types/indexes.py +30 -0
  131. sdk/types/jobs.py +50 -0
  132. sdk/types/models.py +22 -0
  133. sdk/types/onboarding.py +395 -0
  134. sdk/types/orgs.py +9 -0
  135. sdk/types/pipelines.py +177 -0
  136. sdk/types/projects.py +14 -0
  137. sdk/types/search.py +116 -0
  138. sdk/types/training.py +55 -0
  139. sdk/types/usage.py +37 -0
@@ -0,0 +1,597 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from concurrent.futures import Future, ThreadPoolExecutor
6
+ from typing import Any
7
+
8
+ from sdk._paging import Page, SyncPager
9
+ from sdk._request_options import RequestOptions
10
+ from sdk._validation import require_str
11
+ from sdk.errors import ConfirmationRequiredError
12
+ from sdk.resources._mixin_base import RequesterMixin
13
+ from sdk.types import (
14
+ ChunkingConfig,
15
+ DocumentBatchUploadResponse,
16
+ DocumentCreateResponse,
17
+ DocumentDeleteResponse,
18
+ DocumentDetailResponse,
19
+ DocumentManifestIngestResponse,
20
+ DocumentUrlIngestResponse,
21
+ )
22
+
23
+
24
+ class DocumentsMixin(RequesterMixin):
25
def upload_document(
    self,
    corpus_id: str,
    *,
    file_path: str | None = None,
    file_bytes: bytes | None = None,
    filename: str | None = None,
    raw_text: str | None = None,
    source_uri: str | None = None,
    metadata: dict[str, Any] | None = None,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    idempotency_key: str | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentCreateResponse:
    """Upload a single document to a corpus.

    Exactly one content source is used, chosen in this order:
    ``file_path`` (multipart upload of the file on disk), then
    ``file_bytes`` (multipart upload of in-memory bytes), then
    ``raw_text`` (JSON body).

    Args:
        corpus_id: Target corpus ID.
        file_path: Path to file to upload.
        file_bytes: Raw file bytes to upload.
        filename: Filename when using file_bytes.
        raw_text: Raw text content to upload.
        source_uri: Optional source URI for the document.
        metadata: Optional document metadata. JSON-encoded into a form
            field for multipart uploads, sent as-is in the JSON body for
            ``raw_text`` uploads.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Ignored when
            ``chunking`` is given.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        idempotency_key: Optional key for idempotent requests.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        The response data passed through ``self._maybe_validate`` as a
        ``DocumentCreateResponse``.

    Raises:
        ValueError: If more than one content source is supplied, if
            ``file_bytes`` is given without ``filename``, or if no
            content source is supplied at all.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    # NOTE(review): these guards test truthiness while the dispatch below
    # tests ``file_bytes is not None``, so ``file_bytes=b""`` combined with
    # ``raw_text`` is not rejected and the empty file wins. Kept as-is for
    # backward compatibility.
    if file_path and (file_bytes or raw_text):
        raise ValueError("file_path cannot be combined with file_bytes or raw_text")
    if file_bytes and raw_text:
        raise ValueError("file_bytes cannot be combined with raw_text")
    headers = self._idempotency_headers(idempotency_key)
    endpoint = f"/v1/corpora/{corpus_id}/documents"

    if file_path or file_bytes is not None:
        if not file_path and not filename:
            raise ValueError("filename is required when using file_bytes")

        # Multipart form fields are all strings; structured values are
        # JSON-encoded and booleans are sent as "true"/"false".
        form: dict[str, str] = {}
        if source_uri is not None:
            form["source_uri"] = source_uri
        if metadata is not None:
            form["metadata"] = json.dumps(metadata)
        if auto_index is not None:
            form["auto_index"] = str(bool(auto_index)).lower()
        if chunking is not None:
            form["chunking"] = json.dumps(chunking)
        elif chunk_strategy is not None:
            form["chunk_strategy"] = chunk_strategy

        if file_path:
            # Keep the handle open for the whole request/validation so the
            # transport can stream the file.
            with open(file_path, "rb") as handle:
                data = self._request(
                    "POST",
                    endpoint,
                    data=form,
                    files={"file": (os.path.basename(file_path), handle)},
                    headers=headers,
                    request_options=request_options,
                )
                return self._maybe_validate(data, "DocumentCreateResponse")

        data = self._request(
            "POST",
            endpoint,
            data=form,
            files={"file": (filename, file_bytes)},
            headers=headers,
            request_options=request_options,
        )
        return self._maybe_validate(data, "DocumentCreateResponse")

    if raw_text is None:
        raise ValueError("raw_text is required when no file is provided")

    payload: dict[str, Any] = {"raw_text": raw_text}
    if source_uri is not None:
        payload["source_uri"] = source_uri
    if metadata is not None:
        payload["metadata"] = metadata
    if auto_index is not None:
        payload["auto_index"] = auto_index
    if chunking is not None:
        payload["chunking"] = chunking
    elif chunk_strategy is not None:
        payload["chunk_strategy"] = chunk_strategy
    data = self._request(
        "POST",
        endpoint,
        json=payload,
        headers=headers,
        request_options=request_options,
    )
    return self._maybe_validate(data, "DocumentCreateResponse")
132
+
133
def upload_documents_batch(
    self,
    corpus_id: str,
    documents: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Submit several raw-text documents to a corpus in one request.

    Args:
        corpus_id: Target corpus ID.
        documents: List of document dicts with raw_text, source_uri, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Ignored when
            ``chunking`` is given.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, wait for the batch job to complete.
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        Response with ``doc_ids`` (list of created document IDs),
        ``job_id``, and ``count``.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"documents": documents}
    if auto_index is not None:
        body["auto_index"] = auto_index
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    idempotency_headers = self._idempotency_headers(idempotency_key)
    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:batch",
        json=body,
        headers=idempotency_headers,
        request_options=request_options,
    )

    # Only look for a job to wait on when the caller asked to wait.
    pending_job = response.get("job_id") if wait else None
    if pending_job:
        self._wait_for_job(pending_job, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentBatchUploadResponse")
187
+
188
def upload_files_batch(
    self,
    corpus_id: str,
    files: list[tuple[str, bytes]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentBatchUploadResponse:
    """Upload multiple files in a single multipart request.

    Creates a single ingest_documents_batch job for all files,
    enabling batch processing with near-data optimization.

    Args:
        corpus_id: Target corpus ID.
        files: List of (filename, content_bytes) tuples.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Ignored when
            ``chunking`` is given.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, wait for the batch job to complete.
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        Response with job_id, doc_ids, and count.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    headers = self._idempotency_headers(idempotency_key)

    # httpx expects a list of (field, file) tuples when several files
    # share the same form field name.
    files_list = [("files", (filename, content)) for filename, content in files]

    # Multipart form fields are strings: booleans become "true"/"false"
    # (same encoding as upload_document) and chunking is JSON-encoded.
    form_data: dict[str, Any] = {}
    if auto_index is not None:
        form_data["auto_index"] = str(bool(auto_index)).lower()
    if chunking is not None:
        form_data["chunking"] = json.dumps(chunking)
    elif chunk_strategy is not None:
        form_data["chunk_strategy"] = chunk_strategy

    data = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:upload_batch",
        data=form_data or None,
        files=files_list,
        headers=headers,
        request_options=request_options,
    )
    if wait:
        job_id = data.get("job_id")
        if job_id:
            self._wait_for_job(job_id, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(data, "DocumentBatchUploadResponse")
254
+
255
def upload_documents_parallel(
    self,
    corpus_id: str,
    file_paths: list[str],
    *,
    max_workers: int = 8,
    auto_index: bool | None = None,
    chunking: ChunkingConfig | None = None,
    metadata: dict[str, Any] | None = None,
    request_options: RequestOptions | None = None,
) -> list[DocumentCreateResponse]:
    """Upload several local files concurrently through a thread pool.

    Every path is submitted as its own :meth:`upload_document` call, so
    each file is a separate HTTP request. This differs from
    :meth:`upload_files_batch`, which sends one multipart request for
    server-side batching.

    .. warning:: **All-or-nothing semantics.** On partial failure an
       :class:`ExceptionGroup` is raised and **no** successful results are
       returned. Callers who need partial-failure recovery should upload
       files individually via :meth:`upload_document` and handle errors
       per file.

    Args:
        corpus_id: Target corpus ID.
        file_paths: List of local file paths to upload.
        max_workers: Maximum number of concurrent upload threads
            (default 8, must be between 1 and 256). The pool never
            uses more threads than there are files.
        auto_index: Whether to auto-index after ingestion.
        chunking: Chunking configuration applied to each upload.
        metadata: Optional metadata applied to every document.
        request_options: Per-request options passed to every upload.

    Returns:
        List of upload responses (one per file, in input order). An
        empty ``file_paths`` yields an empty list without starting a pool.

    Raises:
        ExceptionGroup: If one or more uploads fail, containing all
            individual exceptions from failed uploads. Successful
            results from other uploads are discarded.
        ValueError: If *max_workers* is less than 1 or greater than 256.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    if max_workers < 1:
        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
    if max_workers > 256:
        raise ValueError(f"max_workers must be <= 256, got {max_workers}")
    if not file_paths:
        return []

    worker_count = min(max_workers, len(file_paths))
    uploaded: list[DocumentCreateResponse] = []
    failures: list[Exception] = []

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        pending: list[Future[DocumentCreateResponse]] = []
        for path in file_paths:
            task = executor.submit(
                self.upload_document,
                corpus_id,
                file_path=path,
                auto_index=auto_index,
                chunking=chunking,
                metadata=metadata,
                request_options=request_options,
            )
            pending.append(task)

        # Collect in submission order so results line up with file_paths;
        # every future is drained so all failures are reported together.
        for task in pending:
            try:
                outcome = task.result()
            except Exception as exc:
                failures.append(exc)
            else:
                uploaded.append(outcome)

    if failures:
        raise ExceptionGroup(f"{len(failures)} of {len(file_paths)} uploads failed", failures)
    return uploaded
333
+
334
def ingest_urls(
    self,
    corpus_id: str,
    urls: list[dict[str, Any]],
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    wait: bool = True,
    poll_s: int = 5,
    timeout_s: float | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentUrlIngestResponse:
    """Ask the service to ingest documents from a list of URLs.

    Args:
        corpus_id: Target corpus ID.
        urls: List of URL dicts with url, title, tags, metadata.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Ignored when
            ``chunking`` is given.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        wait: If True, wait for the batch job to complete.
        poll_s: Polling interval when waiting.
        timeout_s: Maximum seconds to wait for job completion.
            Use ``None`` to wait indefinitely. This timeout only bounds
            client-side waiting and does not cancel the backend job.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        The response data passed through ``self._maybe_validate`` as a
        ``DocumentUrlIngestResponse``.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"urls": urls}
    if auto_index is not None:
        body["auto_index"] = auto_index
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    idempotency_headers = self._idempotency_headers(idempotency_key)
    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_urls",
        json=body,
        headers=idempotency_headers,
        request_options=request_options,
    )

    # Only look for a job to wait on when the caller asked to wait.
    pending_job = response.get("job_id") if wait else None
    if pending_job:
        self._wait_for_job(pending_job, poll_s=poll_s, timeout_s=timeout_s)
    return self._maybe_validate(response, "DocumentUrlIngestResponse")
384
+
385
def ingest_manifest(
    self,
    corpus_id: str,
    manifest_uri: str,
    max_documents: int | None = None,
    idempotency_key: str | None = None,
    *,
    auto_index: bool | None = None,
    chunk_strategy: str | None = None,
    chunking: ChunkingConfig | None = None,
    request_options: RequestOptions | None = None,
) -> DocumentManifestIngestResponse:
    """Ask the service to ingest the documents listed in a manifest.

    Unlike the batch upload methods, this call returns as soon as the
    request completes; it does not wait on any job.

    Args:
        corpus_id: Target corpus ID.
        manifest_uri: URI to manifest file (S3, HTTP, local).
        max_documents: Optional limit on documents to ingest.
        idempotency_key: Optional key for idempotent requests.
        auto_index: Whether to auto-index after ingestion.
        chunk_strategy: Deprecated - use chunking instead. Ignored when
            ``chunking`` is given.
        chunking: Chunking configuration (strategy, chunk_size, overlap, etc.)
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        The response data passed through ``self._maybe_validate`` as a
        ``DocumentManifestIngestResponse``.
    """
    corpus_id = require_str(corpus_id, "corpus_id")

    body: dict[str, Any] = {"manifest_uri": manifest_uri}
    if max_documents is not None:
        body["max_documents"] = max_documents
    if auto_index is not None:
        body["auto_index"] = auto_index
    # ``chunking`` wins over the deprecated ``chunk_strategy``.
    if chunking is not None:
        body["chunking"] = chunking
    elif chunk_strategy is not None:
        body["chunk_strategy"] = chunk_strategy

    idempotency_headers = self._idempotency_headers(idempotency_key)
    response = self._request(
        "POST",
        f"/v1/corpora/{corpus_id}/documents:ingest_manifest",
        json=body,
        headers=idempotency_headers,
        request_options=request_options,
    )
    return self._maybe_validate(response, "DocumentManifestIngestResponse")
427
+
428
def list_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    offset: int = 0,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of documents from a corpus.

    Args:
        corpus_id: Corpus whose documents are listed.
        limit: Page size passed to ``self._list_page``.
        offset: Page offset passed to ``self._list_page``.
        q: Optional ``q`` query parameter.
        status: Optional ``status`` query parameter.
        source: Optional ``source`` query parameter.
        tag: Optional ``tag`` query parameter.
        request_options: Accepted for interface parity.

    Returns:
        The page produced by ``self._list_page`` for the ``documents`` key.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # self._list_page here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")

    # Only filters the caller actually set are sent as query parameters.
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    filters: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }
    return self._list_page(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=filters if filters else None,
        limit=limit,
        offset=offset,
    )
458
+
459
def iter_documents(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    q: str | None = None,
    status: str | None = None,
    source: str | None = None,
    tag: str | None = None,
    request_options: RequestOptions | None = None,
) -> SyncPager[dict[str, Any]]:
    """Lazily paginate documents, yielding individual document items.

    Args:
        corpus_id: Corpus whose documents are iterated.
        limit: Page size passed to ``self._paginate``.
        q: Optional ``q`` query parameter.
        status: Optional ``status`` query parameter.
        source: Optional ``source`` query parameter.
        tag: Optional ``tag`` query parameter.
        request_options: Accepted for interface parity.

    Returns:
        The pager produced by ``self._paginate`` for the ``documents`` key.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # self._paginate here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")

    # Only filters the caller actually set are sent as query parameters.
    candidates = {"q": q, "status": status, "source": source, "tag": tag}
    filters: dict[str, Any] = {
        name: value for name, value in candidates.items() if value is not None
    }
    return self._paginate(
        "GET",
        f"/v1/corpora/{corpus_id}/documents",
        items_key="documents",
        params=filters or None,
        limit=limit,
    )
488
+
489
def get_document(
    self,
    doc_id: str,
    request_options: RequestOptions | None = None,
) -> DocumentDetailResponse:
    """Fetch a single document by ID.

    Args:
        doc_id: ID of the document to fetch.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        The response data passed through ``self._maybe_validate`` as a
        ``DocumentDetailResponse``.
    """
    doc_id = require_str(doc_id, "doc_id")
    endpoint = f"/v1/documents/{doc_id}"
    detail = self._request("GET", endpoint, request_options=request_options)
    return self._maybe_validate(detail, "DocumentDetailResponse")
497
+
498
def update_document_metadata(
    self,
    doc_id: str,
    metadata: dict[str, Any],
    request_options: RequestOptions | None = None,
) -> dict[str, Any]:
    """Update customer metadata on a document using merge semantics.

    Keys with non-empty values are added or updated.
    Keys with empty string or None values are removed.
    Keys not in the request are left unchanged.

    Args:
        doc_id: Document ID to update
        metadata: Dict of metadata updates to apply
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        Updated metadata dict with custom_metadata and system_metadata.
        The response is returned as-is, without ``self._maybe_validate``.
    """
    doc_id = require_str(doc_id, "doc_id")
    endpoint = f"/v1/documents/{doc_id}/metadata"
    return self._request(
        "PATCH",
        endpoint,
        json=metadata,
        request_options=request_options,
    )
525
+
526
def delete_document(
    self,
    corpus_id: str,
    doc_id: str,
    *,
    confirm: bool = False,
    reindex: bool = False,
    request_options: RequestOptions | None = None,
) -> DocumentDeleteResponse:
    """Delete a document from a corpus.

    This is an irreversible operation. You must pass ``confirm=True``
    to acknowledge this and proceed.

    Args:
        corpus_id: The corpus containing the document.
        doc_id: Unique identifier of the document to delete.
        confirm: Safety guard — must be ``True`` to execute the
            deletion. Raises ``ConfirmationRequiredError`` when ``False``.
        reindex: If ``True``, trigger a re-index of the corpus after
            deletion. Sent as the ``reindex`` query parameter.
        request_options: Per-request options forwarded to ``self._request``.

    Returns:
        Confirmation of the deletion.

    Raises:
        ConfirmationRequiredError: If *confirm* is not ``True``.
        Knowledge2Error: If the API request fails.
    """
    corpus_id = require_str(corpus_id, "corpus_id")
    doc_id = require_str(doc_id, "doc_id")

    # Both IDs are validated before the confirmation guard, so malformed
    # IDs are reported even when confirm is False.
    if confirm:
        endpoint = f"/v1/corpora/{corpus_id}/documents/{doc_id}"
        outcome = self._request(
            "DELETE",
            endpoint,
            params={"reindex": reindex},
            request_options=request_options,
        )
        return self._maybe_validate(outcome, "DocumentDeleteResponse")
    raise ConfirmationRequiredError("document", doc_id)
566
+
567
def list_chunks(
    self,
    corpus_id: str,
    limit: int = 100,
    offset: int = 0,
    request_options: RequestOptions | None = None,
) -> Page[dict[str, Any]]:
    """Fetch one page of chunks from a corpus.

    Args:
        corpus_id: Corpus whose chunks are listed.
        limit: Page size passed to ``self._list_page``.
        offset: Page offset passed to ``self._list_page``.
        request_options: Accepted for interface parity.

    Returns:
        The page produced by ``self._list_page`` for the ``chunks`` key.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # self._list_page here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    endpoint = f"/v1/corpora/{corpus_id}/chunks"
    return self._list_page("GET", endpoint, items_key="chunks", limit=limit, offset=offset)
582
+
583
def iter_chunks(
    self,
    corpus_id: str,
    *,
    limit: int = 100,
    request_options: RequestOptions | None = None,
) -> SyncPager[dict[str, Any]]:
    """Lazily paginate chunks, yielding individual chunk items.

    Args:
        corpus_id: Corpus whose chunks are iterated.
        limit: Page size passed to ``self._paginate``.
        request_options: Accepted for interface parity.

    Returns:
        The pager produced by ``self._paginate`` for the ``chunks`` key.
    """
    # NOTE(review): request_options is accepted but not forwarded to
    # self._paginate here - confirm whether that is intentional.
    corpus_id = require_str(corpus_id, "corpus_id")
    endpoint = f"/v1/corpora/{corpus_id}/chunks"
    return self._paginate("GET", endpoint, items_key="chunks", limit=limit)