knowhere-python-sdk 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
knowhere/__init__.py CHANGED
@@ -35,8 +35,14 @@ from knowhere._exceptions import (
35
35
  )
36
36
  from knowhere._types import PollProgressCallback, UploadProgressCallback
37
37
  from knowhere._version import __version__
38
+ from knowhere.types.document import Document, DocumentListResponse
38
39
  from knowhere.types.job import Job, JobError, JobProgress, JobResult
39
40
  from knowhere.types.params import ParsingParams, WebhookConfig
41
+ from knowhere.types.retrieval import (
42
+ RetrievalSource,
43
+ RetrievalQueryResponse,
44
+ RetrievalResult,
45
+ )
40
46
  from knowhere.types.result import (
41
47
  BaseChunk,
42
48
  Checksum,
@@ -46,6 +52,10 @@ from knowhere.types.result import (
46
52
  ImageFileInfo,
47
53
  Manifest,
48
54
  ParseResult,
55
+ ProcessingCost,
56
+ ProcessingMetadata,
57
+ ProcessingTiming,
58
+ SlimChunk,
49
59
  Statistics,
50
60
  TableChunk,
51
61
  TableFileInfo,
@@ -83,6 +93,13 @@ __all__: list[str] = [
83
93
  "JobError",
84
94
  "JobProgress",
85
95
  "JobResult",
96
+ # Document types
97
+ "Document",
98
+ "DocumentListResponse",
99
+ # Retrieval types
100
+ "RetrievalSource",
101
+ "RetrievalQueryResponse",
102
+ "RetrievalResult",
86
103
  # Result types
87
104
  "ParseResult",
88
105
  "Manifest",
@@ -91,6 +108,10 @@ __all__: list[str] = [
91
108
  "FileIndex",
92
109
  "ImageFileInfo",
93
110
  "TableFileInfo",
111
+ "ProcessingCost",
112
+ "ProcessingMetadata",
113
+ "ProcessingTiming",
114
+ "SlimChunk",
94
115
  "BaseChunk",
95
116
  "TextChunk",
96
117
  "ImageChunk",
knowhere/_client.py CHANGED
@@ -19,7 +19,9 @@ from knowhere._types import (
19
19
  PollProgressCallback,
20
20
  UploadProgressCallback,
21
21
  )
22
+ from knowhere.resources.documents import AsyncDocuments, Documents
22
23
  from knowhere.resources.jobs import AsyncJobs, Jobs
24
+ from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
23
25
  from knowhere.types.job import Job, JobResult
24
26
  from knowhere.types.params import ParsingParams, WebhookConfig
25
27
  from knowhere.types.result import ParseResult
@@ -42,6 +44,16 @@ class Knowhere(SyncAPIClient):
42
44
  """Access the jobs resource namespace."""
43
45
  return Jobs(self)
44
46
 
47
+ @cached_property
48
+ def retrieval(self) -> Retrieval:
49
+ """Access the retrieval resource namespace."""
50
+ return Retrieval(self)
51
+
52
+ @cached_property
53
+ def documents(self) -> Documents:
54
+ """Access the documents resource namespace."""
55
+ return Documents(self)
56
+
45
57
  # -- overloaded parse signatures --
46
58
 
47
59
  @overload
@@ -50,6 +62,8 @@ class Knowhere(SyncAPIClient):
50
62
  *,
51
63
  url: str,
52
64
  data_id: Optional[str] = ...,
65
+ namespace: Optional[str] = ...,
66
+ document_id: Optional[str] = ...,
53
67
  parsing_params: Optional[ParsingParams] = ...,
54
68
  webhook: Optional[WebhookConfig] = ...,
55
69
  poll_interval: float = ...,
@@ -66,6 +80,8 @@ class Knowhere(SyncAPIClient):
66
80
  file: Union[Path, BinaryIO, bytes],
67
81
  file_name: Optional[str] = ...,
68
82
  data_id: Optional[str] = ...,
83
+ namespace: Optional[str] = ...,
84
+ document_id: Optional[str] = ...,
69
85
  parsing_params: Optional[ParsingParams] = ...,
70
86
  webhook: Optional[WebhookConfig] = ...,
71
87
  poll_interval: float = ...,
@@ -82,6 +98,8 @@ class Knowhere(SyncAPIClient):
82
98
  file: Optional[Union[Path, BinaryIO, bytes]] = None,
83
99
  file_name: Optional[str] = None,
84
100
  data_id: Optional[str] = None,
101
+ namespace: Optional[str] = None,
102
+ document_id: Optional[str] = None,
85
103
  parsing_params: Optional[ParsingParams] = None,
86
104
  webhook: Optional[WebhookConfig] = None,
87
105
  poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -105,6 +123,8 @@ class Knowhere(SyncAPIClient):
105
123
  source_type="url",
106
124
  source_url=url,
107
125
  data_id=data_id,
126
+ namespace=namespace,
127
+ document_id=document_id,
108
128
  parsing_params=parsing_params,
109
129
  webhook=webhook,
110
130
  )
@@ -116,6 +136,8 @@ class Knowhere(SyncAPIClient):
116
136
  source_type="file",
117
137
  file_name=resolved_name,
118
138
  data_id=data_id,
139
+ namespace=namespace,
140
+ document_id=document_id,
119
141
  parsing_params=parsing_params,
120
142
  webhook=webhook,
121
143
  )
@@ -149,12 +171,24 @@ class AsyncKnowhere(AsyncAPIClient):
149
171
  """Access the async jobs resource namespace."""
150
172
  return AsyncJobs(self)
151
173
 
174
+ @cached_property
175
+ def retrieval(self) -> AsyncRetrieval:
176
+ """Access the async retrieval resource namespace."""
177
+ return AsyncRetrieval(self)
178
+
179
+ @cached_property
180
+ def documents(self) -> AsyncDocuments:
181
+ """Access the async documents resource namespace."""
182
+ return AsyncDocuments(self)
183
+
152
184
  @overload
153
185
  async def parse(
154
186
  self,
155
187
  *,
156
188
  url: str,
157
189
  data_id: Optional[str] = ...,
190
+ namespace: Optional[str] = ...,
191
+ document_id: Optional[str] = ...,
158
192
  parsing_params: Optional[ParsingParams] = ...,
159
193
  webhook: Optional[WebhookConfig] = ...,
160
194
  poll_interval: float = ...,
@@ -171,6 +205,8 @@ class AsyncKnowhere(AsyncAPIClient):
171
205
  file: Union[Path, BinaryIO, bytes],
172
206
  file_name: Optional[str] = ...,
173
207
  data_id: Optional[str] = ...,
208
+ namespace: Optional[str] = ...,
209
+ document_id: Optional[str] = ...,
174
210
  parsing_params: Optional[ParsingParams] = ...,
175
211
  webhook: Optional[WebhookConfig] = ...,
176
212
  poll_interval: float = ...,
@@ -187,6 +223,8 @@ class AsyncKnowhere(AsyncAPIClient):
187
223
  file: Optional[Union[Path, BinaryIO, bytes]] = None,
188
224
  file_name: Optional[str] = None,
189
225
  data_id: Optional[str] = None,
226
+ namespace: Optional[str] = None,
227
+ document_id: Optional[str] = None,
190
228
  parsing_params: Optional[ParsingParams] = None,
191
229
  webhook: Optional[WebhookConfig] = None,
192
230
  poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -206,6 +244,8 @@ class AsyncKnowhere(AsyncAPIClient):
206
244
  source_type="url",
207
245
  source_url=url,
208
246
  data_id=data_id,
247
+ namespace=namespace,
248
+ document_id=document_id,
209
249
  parsing_params=parsing_params,
210
250
  webhook=webhook,
211
251
  )
@@ -217,6 +257,8 @@ class AsyncKnowhere(AsyncAPIClient):
217
257
  source_type="file",
218
258
  file_name=resolved_name,
219
259
  data_id=data_id,
260
+ namespace=namespace,
261
+ document_id=document_id,
220
262
  parsing_params=parsing_params,
221
263
  webhook=webhook,
222
264
  )
@@ -232,4 +274,4 @@ class AsyncKnowhere(AsyncAPIClient):
232
274
 
233
275
  return await self.jobs.load(
234
276
  job_result, verify_checksum=verify_checksum
235
- )
277
+ )
knowhere/_exceptions.py CHANGED
@@ -387,11 +387,29 @@ def makeStatusError(
387
387
  response=response,
388
388
  )
389
389
 
390
- if exception_class in (RateLimitError, ServiceUnavailableError, GatewayTimeoutError):
391
- return exception_class(
390
+ if exception_class is RateLimitError:
391
+ return RateLimitError(
392
392
  status_code,
393
393
  **common_kwargs,
394
- retry_after=retry_after, # type: ignore[call-arg]
394
+ retry_after=retry_after,
395
+ limit=limit,
396
+ period=period,
397
+ )
398
+
399
+ if exception_class is ServiceUnavailableError:
400
+ return ServiceUnavailableError(
401
+ status_code,
402
+ **common_kwargs,
403
+ retry_after=retry_after,
404
+ limit=limit,
405
+ period=period,
406
+ )
407
+
408
+ if exception_class is GatewayTimeoutError:
409
+ return GatewayTimeoutError(
410
+ status_code,
411
+ **common_kwargs,
412
+ retry_after=retry_after,
395
413
  limit=limit,
396
414
  period=period,
397
415
  )
knowhere/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.0" # x-release-please-version
1
+ __version__ = "0.3.0" # x-release-please-version
@@ -16,6 +16,7 @@ from knowhere.types.result import (
16
16
  ImageChunk,
17
17
  Manifest,
18
18
  ParseResult,
19
+ SlimChunk,
19
20
  TableChunk,
20
21
  TextChunk,
21
22
  TextChunkTokens,
@@ -134,6 +135,7 @@ def _buildChunks(
134
135
  type="image",
135
136
  content=raw.get("content", ""),
136
137
  path=raw.get("path"),
138
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
137
139
  length=metadata.get("length", raw.get("length", 0)),
138
140
  file_path=file_path,
139
141
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -151,6 +153,7 @@ def _buildChunks(
151
153
  type="table",
152
154
  content=raw.get("content", ""),
153
155
  path=raw.get("path"),
156
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
154
157
  length=metadata.get("length", raw.get("length", 0)),
155
158
  file_path=file_path,
156
159
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -167,10 +170,12 @@ def _buildChunks(
167
170
  type="text",
168
171
  content=raw.get("content", ""),
169
172
  path=raw.get("path"),
173
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
170
174
  length=metadata.get("length", raw.get("length", 0)),
171
175
  tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
172
176
  keywords=metadata.get("keywords", raw.get("keywords")),
173
177
  summary=metadata.get("summary", raw.get("summary")),
178
+ connect_to=metadata.get("connect_to", raw.get("connect_to")),
174
179
  relationships=metadata.get("relationships", raw.get("relationships")),
175
180
  )
176
181
 
@@ -230,12 +235,39 @@ def parseResultZip(
230
235
  json.loads(hierarchy_text) if hierarchy_text else None
231
236
  )
232
237
 
238
+ # -- Optimized sidecar files --
239
+ chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
240
+ parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
241
+ if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
242
+ raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
243
+ elif isinstance(parsed_chunks_slim, list):
244
+ raw_chunks_slim = parsed_chunks_slim
245
+ else:
246
+ raw_chunks_slim = []
247
+ chunks_slim: Optional[List[SlimChunk]] = (
248
+ [SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
249
+ if chunks_slim_text is not None
250
+ else None
251
+ )
252
+
253
+ toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
254
+ toc_hierarchies: Optional[Any] = (
255
+ json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
256
+ )
257
+
258
+ kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
259
+ hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
260
+
233
261
  zf.close()
234
262
 
235
263
  return ParseResult(
236
264
  manifest=manifest,
237
265
  chunks=chunks,
266
+ chunks_slim=chunks_slim,
238
267
  full_markdown=full_markdown,
239
268
  hierarchy=hierarchy,
269
+ toc_hierarchies=toc_hierarchies,
270
+ kb_csv=kb_csv,
271
+ hierarchy_view_html=hierarchy_view_html,
240
272
  raw_zip=zip_bytes,
241
273
  )
@@ -2,6 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from knowhere.resources.documents import AsyncDocuments, Documents
5
6
  from knowhere.resources.jobs import AsyncJobs, Jobs
7
+ from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
6
8
 
7
- __all__: list[str] = ["Jobs", "AsyncJobs"]
9
+ __all__: list[str] = [
10
+ "AsyncDocuments",
11
+ "AsyncJobs",
12
+ "AsyncRetrieval",
13
+ "Documents",
14
+ "Jobs",
15
+ "Retrieval",
16
+ ]
@@ -0,0 +1,74 @@
1
+ """Documents resource for canonical document lifecycle operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
+ from knowhere.types.document import Document, DocumentListResponse
9
+
10
+
11
+ class Documents(SyncAPIResource):
12
+ """Synchronous interface for ``/v1/documents`` endpoints."""
13
+
14
+ def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
15
+ """List canonical documents in a namespace."""
16
+ params: Dict[str, Any] = {}
17
+ if namespace is not None:
18
+ params["namespace"] = namespace
19
+
20
+ return self._request(
21
+ "GET",
22
+ "v1/documents",
23
+ params=params or None,
24
+ cast_to=DocumentListResponse,
25
+ )
26
+
27
+ def get(self, document_id: str) -> Document:
28
+ """Get one canonical document by ID."""
29
+ return self._request(
30
+ "GET",
31
+ f"v1/documents/{document_id}",
32
+ cast_to=Document,
33
+ )
34
+
35
+ def archive(self, document_id: str) -> Document:
36
+ """Archive one canonical document by ID."""
37
+ return self._request(
38
+ "POST",
39
+ f"v1/documents/{document_id}/archive",
40
+ cast_to=Document,
41
+ )
42
+
43
+
44
+ class AsyncDocuments(AsyncAPIResource):
45
+ """Asynchronous interface for ``/v1/documents`` endpoints."""
46
+
47
+ async def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
48
+ """List canonical documents in a namespace."""
49
+ params: Dict[str, Any] = {}
50
+ if namespace is not None:
51
+ params["namespace"] = namespace
52
+
53
+ return await self._request(
54
+ "GET",
55
+ "v1/documents",
56
+ params=params or None,
57
+ cast_to=DocumentListResponse,
58
+ )
59
+
60
+ async def get(self, document_id: str) -> Document:
61
+ """Get one canonical document by ID."""
62
+ return await self._request(
63
+ "GET",
64
+ f"v1/documents/{document_id}",
65
+ cast_to=Document,
66
+ )
67
+
68
+ async def archive(self, document_id: str) -> Document:
69
+ """Archive one canonical document by ID."""
70
+ return await self._request(
71
+ "POST",
72
+ f"v1/documents/{document_id}/archive",
73
+ cast_to=Document,
74
+ )
@@ -34,6 +34,8 @@ class Jobs(SyncAPIResource):
34
34
  source_type: str,
35
35
  source_url: Optional[str] = None,
36
36
  file_name: Optional[str] = None,
37
+ namespace: Optional[str] = None,
38
+ document_id: Optional[str] = None,
37
39
  data_id: Optional[str] = None,
38
40
  parsing_params: Optional[ParsingParams] = None,
39
41
  webhook: Optional[WebhookConfig] = None,
@@ -44,6 +46,8 @@ class Jobs(SyncAPIResource):
44
46
  source_type: ``"url"`` or ``"file"``.
45
47
  source_url: URL to parse (required when ``source_type="url"``).
46
48
  file_name: Original filename (used when ``source_type="file"``).
49
+ namespace: Retrieval namespace. Defaults to the server ``default``.
50
+ document_id: Existing document ID when creating an update job.
47
51
  data_id: Optional idempotency / correlation identifier.
48
52
  parsing_params: Optional parsing configuration.
49
53
  webhook: Optional webhook configuration.
@@ -56,6 +60,10 @@ class Jobs(SyncAPIResource):
56
60
  body["source_url"] = source_url
57
61
  if file_name is not None:
58
62
  body["file_name"] = file_name
63
+ if namespace is not None:
64
+ body["namespace"] = namespace
65
+ if document_id is not None:
66
+ body["document_id"] = document_id
59
67
  if data_id is not None:
60
68
  body["data_id"] = data_id
61
69
  if parsing_params is not None:
@@ -158,6 +166,8 @@ class AsyncJobs(AsyncAPIResource):
158
166
  source_type: str,
159
167
  source_url: Optional[str] = None,
160
168
  file_name: Optional[str] = None,
169
+ namespace: Optional[str] = None,
170
+ document_id: Optional[str] = None,
161
171
  data_id: Optional[str] = None,
162
172
  parsing_params: Optional[ParsingParams] = None,
163
173
  webhook: Optional[WebhookConfig] = None,
@@ -168,6 +178,10 @@ class AsyncJobs(AsyncAPIResource):
168
178
  body["source_url"] = source_url
169
179
  if file_name is not None:
170
180
  body["file_name"] = file_name
181
+ if namespace is not None:
182
+ body["namespace"] = namespace
183
+ if document_id is not None:
184
+ body["document_id"] = document_id
171
185
  if data_id is not None:
172
186
  body["data_id"] = data_id
173
187
  if parsing_params is not None:
@@ -0,0 +1,70 @@
1
+ """Retrieval resource for querying published documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
+ from knowhere.types.retrieval import RetrievalQueryResponse
9
+
10
+
11
+ class Retrieval(SyncAPIResource):
12
+ """Synchronous interface for ``/v1/retrieval`` endpoints."""
13
+
14
+ def query(
15
+ self,
16
+ *,
17
+ query: str,
18
+ namespace: Optional[str] = None,
19
+ top_k: Optional[int] = None,
20
+ exclude_document_ids: Optional[list[str]] = None,
21
+ exclude_sections: Optional[list[dict[str, str]]] = None,
22
+ ) -> RetrievalQueryResponse:
23
+ """Query published documents in a namespace."""
24
+ body: Dict[str, Any] = {"query": query}
25
+ if namespace is not None:
26
+ body["namespace"] = namespace
27
+ if top_k is not None:
28
+ body["top_k"] = top_k
29
+ if exclude_document_ids is not None:
30
+ body["exclude_document_ids"] = exclude_document_ids
31
+ if exclude_sections is not None:
32
+ body["exclude_sections"] = exclude_sections
33
+
34
+ return self._request(
35
+ "POST",
36
+ "v1/retrieval/query",
37
+ body=body,
38
+ cast_to=RetrievalQueryResponse,
39
+ )
40
+
41
+
42
+ class AsyncRetrieval(AsyncAPIResource):
43
+ """Asynchronous interface for ``/v1/retrieval`` endpoints."""
44
+
45
+ async def query(
46
+ self,
47
+ *,
48
+ query: str,
49
+ namespace: Optional[str] = None,
50
+ top_k: Optional[int] = None,
51
+ exclude_document_ids: Optional[list[str]] = None,
52
+ exclude_sections: Optional[list[dict[str, str]]] = None,
53
+ ) -> RetrievalQueryResponse:
54
+ """Query published documents in a namespace."""
55
+ body: Dict[str, Any] = {"query": query}
56
+ if namespace is not None:
57
+ body["namespace"] = namespace
58
+ if top_k is not None:
59
+ body["top_k"] = top_k
60
+ if exclude_document_ids is not None:
61
+ body["exclude_document_ids"] = exclude_document_ids
62
+ if exclude_sections is not None:
63
+ body["exclude_sections"] = exclude_sections
64
+
65
+ return await self._request(
66
+ "POST",
67
+ "v1/retrieval/query",
68
+ body=body,
69
+ cast_to=RetrievalQueryResponse,
70
+ )
@@ -2,8 +2,14 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from knowhere.types.document import Document, DocumentListResponse
5
6
  from knowhere.types.job import Job, JobError, JobResult
6
7
  from knowhere.types.params import ParsingParams, WebhookConfig
8
+ from knowhere.types.retrieval import (
9
+ RetrievalSource,
10
+ RetrievalQueryResponse,
11
+ RetrievalResult,
12
+ )
7
13
  from knowhere.types.result import (
8
14
  BaseChunk,
9
15
  Checksum,
@@ -13,6 +19,10 @@ from knowhere.types.result import (
13
19
  ImageFileInfo,
14
20
  Manifest,
15
21
  ParseResult,
22
+ ProcessingCost,
23
+ ProcessingMetadata,
24
+ ProcessingTiming,
25
+ SlimChunk,
16
26
  Statistics,
17
27
  TableChunk,
18
28
  TableFileInfo,
@@ -24,6 +34,13 @@ __all__: list[str] = [
24
34
  "Job",
25
35
  "JobError",
26
36
  "JobResult",
37
+ # document
38
+ "Document",
39
+ "DocumentListResponse",
40
+ # retrieval
41
+ "RetrievalSource",
42
+ "RetrievalQueryResponse",
43
+ "RetrievalResult",
27
44
  # params
28
45
  "ParsingParams",
29
46
  "WebhookConfig",
@@ -36,6 +53,10 @@ __all__: list[str] = [
36
53
  "ImageFileInfo",
37
54
  "Manifest",
38
55
  "ParseResult",
56
+ "ProcessingCost",
57
+ "ProcessingMetadata",
58
+ "ProcessingTiming",
59
+ "SlimChunk",
39
60
  "Statistics",
40
61
  "TableChunk",
41
62
  "TableFileInfo",
@@ -0,0 +1,28 @@
1
+ """Pydantic models for canonical document lifecycle responses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
+ class Document(BaseModel):
12
+ """Canonical document state returned by ``/v1/documents`` endpoints."""
13
+
14
+ document_id: str
15
+ namespace: str
16
+ status: str
17
+ current_job_result_id: Optional[str] = None
18
+ source_file_name: Optional[str] = None
19
+ created_at: Optional[datetime] = None
20
+ updated_at: Optional[datetime] = None
21
+ archived_at: Optional[datetime] = None
22
+
23
+
24
+ class DocumentListResponse(BaseModel):
25
+ """Response from ``GET /v1/documents``."""
26
+
27
+ namespace: str
28
+ documents: list[Document]
knowhere/types/job.py CHANGED
@@ -40,6 +40,8 @@ class Job(BaseModel):
40
40
  job_id: str
41
41
  status: str
42
42
  source_type: str
43
+ namespace: Optional[str] = None
44
+ document_id: Optional[str] = None
43
45
  data_id: Optional[str] = None
44
46
  created_at: Optional[datetime] = None
45
47
  upload_url: Optional[str] = None
@@ -53,6 +55,8 @@ class JobResult(BaseModel):
53
55
  job_id: str
54
56
  status: str
55
57
  source_type: str
58
+ namespace: Optional[str] = None
59
+ document_id: Optional[str] = None
56
60
  data_id: Optional[str] = None
57
61
  created_at: Optional[datetime] = None
58
62
  progress: Optional[Union[float, JobProgress]] = None
knowhere/types/result.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import os
6
+ import json
6
7
  import re
7
8
  from pathlib import Path
8
9
  from typing import Any, Dict, List, Optional, Union
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
92
93
 
93
94
  chunks: Optional[str] = None
94
95
  markdown: Optional[str] = None
96
+ chunks_slim: Optional[str] = None
95
97
  kb_csv: Optional[str] = None
96
98
  hierarchy: Optional[str] = None
99
+ toc_hierarchies: Optional[str] = None
100
+ hierarchy_view_html: Optional[str] = None
97
101
  images: List[ImageFileInfo] = Field(default_factory=list)
98
102
  tables: List[TableFileInfo] = Field(default_factory=list)
99
103
 
100
104
 
105
+ class ProcessingCost(BaseModel):
106
+ """Billing details emitted by manifest v2."""
107
+
108
+ micro_dollars: Optional[int] = None
109
+ credits: Optional[float] = None
110
+
111
+
112
+ class ProcessingTiming(BaseModel):
113
+ """Timing details emitted by manifest v2."""
114
+
115
+ started_at: Optional[str] = None
116
+ completed_at: Optional[str] = None
117
+ duration_ms: Optional[int] = None
118
+
119
+
120
+ class ProcessingMetadata(BaseModel):
121
+ """Worker-side processing metadata emitted by manifest v2."""
122
+
123
+ page_count: Optional[int] = None
124
+ billing_status: Optional[str] = None
125
+ cost: Optional[ProcessingCost] = None
126
+ timing: Optional[ProcessingTiming] = None
127
+
128
+
101
129
  class Manifest(BaseModel):
102
130
  """Top-level manifest describing the result ZIP contents."""
103
131
 
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
106
134
  data_id: Optional[str] = None
107
135
  source_file_name: Optional[str] = None
108
136
  processing_date: Optional[str] = None
137
+ processing: Optional[ProcessingMetadata] = None
109
138
  checksum: Optional[Checksum] = None
110
139
  statistics: Optional[Statistics] = None
111
140
  files: Optional[FileIndex] = None
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
123
152
  type: str
124
153
  content: str = ""
125
154
  path: Optional[str] = None
155
+ page_nums: Optional[List[int]] = None
126
156
 
127
157
 
128
158
  TextChunkTokens: TypeAlias = List[str]
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
136
166
  tokens: Optional[TextChunkTokens] = None
137
167
  keywords: Optional[List[str]] = None
138
168
  summary: Optional[str] = None
169
+ connect_to: Optional[List[Dict[str, Any]]] = None
139
170
  relationships: Optional[List[Union[Dict[str, Any], str]]] = None
140
171
 
141
172
 
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
210
241
  Chunk = Union[TextChunk, ImageChunk, TableChunk]
211
242
 
212
243
 
244
+ class SlimChunk(BaseModel):
245
+ """Minimal chunk entry emitted in chunks_slim.json."""
246
+
247
+ type: str
248
+ path: Optional[str] = None
249
+ content: str = ""
250
+ summary: Optional[str] = None
251
+
252
+
213
253
  # ---------------------------------------------------------------------------
214
254
  # ParseResult — the top-level object returned to the user
215
255
  # ---------------------------------------------------------------------------
@@ -225,8 +265,12 @@ class ParseResult:
225
265
 
226
266
  manifest: Manifest
227
267
  chunks: List[Chunk]
268
+ chunks_slim: Optional[List[SlimChunk]]
228
269
  full_markdown: str
229
270
  hierarchy: Optional[Any]
271
+ toc_hierarchies: Optional[Any]
272
+ kb_csv: Optional[str]
273
+ hierarchy_view_html: Optional[str]
230
274
  raw_zip: bytes
231
275
 
232
276
  def __init__(
@@ -234,14 +278,22 @@ class ParseResult:
234
278
  *,
235
279
  manifest: Manifest,
236
280
  chunks: List[Chunk],
281
+ chunks_slim: Optional[List[SlimChunk]],
237
282
  full_markdown: str,
238
283
  hierarchy: Optional[Any],
284
+ toc_hierarchies: Optional[Any],
285
+ kb_csv: Optional[str],
286
+ hierarchy_view_html: Optional[str],
239
287
  raw_zip: bytes,
240
288
  ) -> None:
241
289
  self.manifest = manifest
242
290
  self.chunks = chunks
291
+ self.chunks_slim = chunks_slim
243
292
  self.full_markdown = full_markdown
244
293
  self.hierarchy = hierarchy
294
+ self.toc_hierarchies = toc_hierarchies
295
+ self.kb_csv = kb_csv
296
+ self.hierarchy_view_html = hierarchy_view_html
245
297
  self.raw_zip = raw_zip
246
298
 
247
299
  # -- convenience properties --
@@ -296,10 +348,58 @@ class ParseResult:
296
348
  dir_path: Path = Path(directory)
297
349
  dir_path.mkdir(parents=True, exist_ok=True)
298
350
 
351
+ # Manifest / chunks
352
+ manifest_path: Path = dir_path / "manifest.json"
353
+ manifest_path.write_text(
354
+ self.manifest.model_dump_json(indent=2),
355
+ encoding="utf-8",
356
+ )
357
+
358
+ chunks_path: Path = dir_path / "chunks.json"
359
+ chunks_path.write_text(
360
+ json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
361
+ encoding="utf-8",
362
+ )
363
+
364
+ if self.chunks_slim is not None:
365
+ chunks_slim_path: Path = dir_path / "chunks_slim.json"
366
+ chunks_slim_path.write_text(
367
+ json.dumps(
368
+ {"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
369
+ indent=2,
370
+ ),
371
+ encoding="utf-8",
372
+ )
373
+
299
374
  # Full markdown
300
375
  md_path: Path = dir_path / "full.md"
301
376
  md_path.write_text(self.full_markdown, encoding="utf-8")
302
377
 
378
+ if self.hierarchy is not None:
379
+ hierarchy_path: Path = dir_path / "hierarchy.json"
380
+ hierarchy_path.write_text(
381
+ json.dumps(self.hierarchy, indent=2),
382
+ encoding="utf-8",
383
+ )
384
+
385
+ if self.toc_hierarchies is not None:
386
+ toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
387
+ toc_hierarchies_path.write_text(
388
+ json.dumps(self.toc_hierarchies, indent=2),
389
+ encoding="utf-8",
390
+ )
391
+
392
+ if self.kb_csv is not None:
393
+ kb_csv_path: Path = dir_path / "kb.csv"
394
+ kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
395
+
396
+ if self.hierarchy_view_html is not None:
397
+ hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
398
+ hierarchy_view_path.write_text(
399
+ self.hierarchy_view_html,
400
+ encoding="utf-8",
401
+ )
402
+
303
403
  # Images
304
404
  if self.image_chunks:
305
405
  images_dir: Path = dir_path / "images"
@@ -0,0 +1,33 @@
1
+ """Pydantic models for retrieval query responses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
+ class RetrievalSource(BaseModel):
11
+ """Caller-facing source reference attached to a retrieval result."""
12
+
13
+ document_id: Optional[str] = None
14
+ source_file_name: Optional[str] = None
15
+ section_path: Optional[str] = None
16
+
17
+
18
+ class RetrievalResult(BaseModel):
19
+ """Canonical chunk result returned by ``POST /v1/retrieval/query``."""
20
+
21
+ chunk_type: str
22
+ content: str
23
+ score: float
24
+ asset_url: Optional[str] = None
25
+ source: RetrievalSource
26
+
27
+
28
+ class RetrievalQueryResponse(BaseModel):
29
+ """Response from ``POST /v1/retrieval/query``."""
30
+
31
+ namespace: str
32
+ query: str
33
+ results: list[RetrievalResult]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -64,6 +64,74 @@ for chunk in result.text_chunks:
64
64
  print(chunk.content[:80])
65
65
  ```
66
66
 
67
+ ## Retrieval and document lifecycle
68
+
69
+ New documents are published into a retrieval namespace. The server returns a
70
+ stable `document_id` when you create a job; persist that value if you need to
71
+ update or archive the same document later.
72
+
73
+ ```python
74
+ job = client.jobs.create(
75
+ source_type="url",
76
+ source_url="https://example.com/manual.pdf",
77
+ namespace="support-center",
78
+ )
79
+
80
+ print(job.document_id) # "doc_..."
81
+ ```
82
+
83
+ After the job is done and published, query the canonical document content:
84
+
85
+ ```python
86
+ response = client.retrieval.query(
87
+ namespace="support-center",
88
+ query="How do I reset Bluetooth pairing?",
89
+ top_k=5,
90
+ )
91
+
92
+ for result in response.results:
93
+ print(result.content)
94
+ print(result.score)
95
+ print(result.source.source_file_name, result.source.section_path)
96
+ ```
97
+
98
+ Use `document_id` to update or archive a document:
99
+
100
+ ```python
101
+ update_job = client.jobs.create(
102
+ source_type="url",
103
+ source_url="https://example.com/manual-v2.pdf",
104
+ document_id=job.document_id,
105
+ )
106
+
107
+ document = client.documents.get(job.document_id)
108
+ print(document.status)
109
+
110
+ client.documents.archive(job.document_id)
111
+ ```
112
+
113
+ You can also list documents in a namespace:
114
+
115
+ ```python
116
+ documents = client.documents.list(namespace="support-center")
117
+ for document in documents.documents:
118
+ print(document.document_id, document.status)
119
+ ```
120
+
121
+ Retrieval supports exclusions when clients want follow-up results that avoid
122
+ previously used documents or sections:
123
+
124
+ ```python
125
+ response = client.retrieval.query(
126
+ namespace="support-center",
127
+ query="battery charging",
128
+ exclude_document_ids=["doc_old"],
129
+ exclude_sections=[
130
+ {"document_id": "doc_123", "section_path": "Appendix / Legal"}
131
+ ],
132
+ )
133
+ ```
134
+
67
135
  While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
68
136
 
69
137
  ### Parse a local file
@@ -137,9 +205,12 @@ from pathlib import Path
137
205
  job = client.jobs.create(
138
206
  source_type="file",
139
207
  file_name="report.pdf",
208
+ namespace="support-center",
140
209
  parsing_params={"model": "advanced", "ocr_enabled": True},
141
210
  )
142
211
 
212
+ print(job.document_id) # Persist this to update/archive the document later.
213
+
143
214
  # Step 2: Upload file to presigned URL
144
215
  client.jobs.upload(job, file=Path("report.pdf"))
145
216
 
@@ -0,0 +1,29 @@
1
+ knowhere/__init__.py,sha256=FLKrentC0o9j1GZTSTlx7A1S_mWmXWceomBScdPbXg8,2854
2
+ knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
3
+ knowhere/_client.py,sha256=WYb-Fhi3x3nQYNfQG9eCgOpLc_wVyAawfPZWdZhFESg,9586
4
+ knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
5
+ knowhere/_exceptions.py,sha256=NflH7phh_bNFOJmQ758V4mZCAFQskpGXACMz2JIfFNU,11896
6
+ knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
7
+ knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
8
+ knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
9
+ knowhere/_version.py,sha256=BW_DctcKYzNRp1g4_DgZOvYCUcP3tNHyQKvZG3uopBM,50
10
+ knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
12
+ knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
13
+ knowhere/lib/result_parser.py,sha256=dR3knoMq-AFMAe0M3l0YgOM-OrtSmofSLaKZO0tgYao,9882
14
+ knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
15
+ knowhere/resources/__init__.py,sha256=ClsR-yn_0E4KOopD_Yq13wbPHHjl9s15XpydN-d2Rzo,393
16
+ knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
17
+ knowhere/resources/documents.py,sha256=u_gmrElvpMOABaHkEuTyaYvh4D_CG4pHZt23r8tivaY,2314
18
+ knowhere/resources/jobs.py,sha256=IhcJIQ_jho6dSsdJLSS0VRB6xuWw12BRJrjO_4NjEMs,9099
19
+ knowhere/resources/retrieval.py,sha256=yVCUWlOg6_ZJhXfiy5_AjqLZZm2Zx8ltqhj1kJ1gKIM,2302
20
+ knowhere/types/__init__.py,sha256=fKMA0NA2lZ-eag1FIeScnwz2ImV6LD-T3YJVfUBsA98,1290
21
+ knowhere/types/document.py,sha256=LbFleglvm538vSDDho82j7fVxvgMXdIVm9wrWemLShY,711
22
+ knowhere/types/job.py,sha256=_ORhgn_tnvQm_gyrCS39EsDV3dOKImBeJXGjEq3JLag,2510
23
+ knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
24
+ knowhere/types/result.py,sha256=UmoxaFmxt2bhrP-2O6jYL89C2WuwZh2xcyyHl46Q1_Y,12925
25
+ knowhere/types/retrieval.py,sha256=-YzsKyusajVdGx4v1lR9Kts-Fh5D41uXf17lSL4ZyJM,777
26
+ knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
27
+ knowhere_python_sdk-0.3.0.dist-info/METADATA,sha256=T7MT_NBl2sqb_FcBuxU97Eacm8YDXn8jcP3DLRnLQH0,7922
28
+ knowhere_python_sdk-0.3.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
29
+ knowhere_python_sdk-0.3.0.dist-info/RECORD,,
@@ -1,25 +0,0 @@
1
- knowhere/__init__.py,sha256=EuIpP3FtDeszonVAXMxZimjRd9iUcQ8wA53h1f27S3k,2343
2
- knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
3
- knowhere/_client.py,sha256=MGU1QsyjKrzTiitm891wgNCq6JLf3DR7y7zhkil_p2E,8027
4
- knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
5
- knowhere/_exceptions.py,sha256=yg-4pK7AP6uUPxxyggxf8spQeXgFTpKRwELsHjCQycg,11489
6
- knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
7
- knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
8
- knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
9
- knowhere/_version.py,sha256=piZV5NEcs0VIotCxwaWvzWE2ASUv5tox5ye8ogIRiIk,50
10
- knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
12
- knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
13
- knowhere/lib/result_parser.py,sha256=U-DK3SDKrbUY0g_-ad04bsbra1mhYy9FJ2opa1n2bTU,8406
14
- knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
15
- knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
16
- knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
17
- knowhere/resources/jobs.py,sha256=45P4rZ9HMnTdgcso2AwQ6lDA9U80HGsgOU0jZLBIMFU,8460
18
- knowhere/types/__init__.py,sha256=OwTxpa9uo0GOEJ6Ds6rqEmXl86O49ByS6M7cscMwQo8,791
19
- knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
20
- knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
21
- knowhere/types/result.py,sha256=Lmtaa0wQymBzAm6hXoZZr6dlfwf0WCMEda6Gd8nDIdw,9628
22
- knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
23
- knowhere_python_sdk-0.2.0.dist-info/METADATA,sha256=10dnumfebnQ3VmPHmYuDexWTCdqdFLi-eAaF8FwcNpc,6115
24
- knowhere_python_sdk-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
25
- knowhere_python_sdk-0.2.0.dist-info/RECORD,,