knowhere-python-sdk 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
knowhere/__init__.py CHANGED
@@ -35,7 +35,15 @@ from knowhere._exceptions import (
35
35
  )
36
36
  from knowhere._types import PollProgressCallback, UploadProgressCallback
37
37
  from knowhere._version import __version__
38
- from knowhere.types.document import Document, DocumentListResponse
38
+ from knowhere.types.document import (
39
+ Document,
40
+ DocumentChunk,
41
+ DocumentChunkListResponse,
42
+ DocumentChunkPagination,
43
+ DocumentChunkResponse,
44
+ DocumentChunkType,
45
+ DocumentListResponse,
46
+ )
39
47
  from knowhere.types.job import Job, JobError, JobProgress, JobResult
40
48
  from knowhere.types.params import ParsingParams, WebhookConfig
41
49
  from knowhere.types.retrieval import (
@@ -98,6 +106,11 @@ __all__: list[str] = [
98
106
  "JobResult",
99
107
  # Document types
100
108
  "Document",
109
+ "DocumentChunk",
110
+ "DocumentChunkListResponse",
111
+ "DocumentChunkPagination",
112
+ "DocumentChunkResponse",
113
+ "DocumentChunkType",
101
114
  "DocumentListResponse",
102
115
  # Retrieval types
103
116
  "RetrievalChannel",
knowhere/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.2" # x-release-please-version
1
+ __version__ = "0.5.0" # x-release-please-version
@@ -13,13 +13,13 @@ from knowhere._exceptions import ChecksumError, KnowhereError
13
13
  from knowhere._logging import getLogger
14
14
  from knowhere.types.result import (
15
15
  Chunk,
16
+ DocNav,
16
17
  ImageChunk,
17
18
  Manifest,
18
19
  ParseResult,
19
20
  SlimChunk,
20
21
  TableChunk,
21
22
  TextChunk,
22
- TextChunkTokens,
23
23
  )
24
24
 
25
25
  _logger = getLogger()
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
81
81
  return fallback
82
82
 
83
83
 
84
- def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
85
- """Return a string-only token list with empty values removed."""
86
- normalized_tokens: List[str] = []
87
- for raw_token in raw_tokens:
88
- token_text: str = str(raw_token).strip()
89
- if token_text:
90
- normalized_tokens.append(token_text)
91
- return normalized_tokens
92
-
93
-
94
- def _parseTextChunkTokens(
95
- raw_tokens: Any,
96
- *,
97
- chunk_id: str,
98
- ) -> Optional[TextChunkTokens]:
99
- """Normalize text chunk tokens from the current backend payload."""
100
- if raw_tokens is None:
101
- return None
102
- if isinstance(raw_tokens, bool):
103
- raise KnowhereError(
104
- f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
105
- )
106
- if isinstance(raw_tokens, list):
107
- return _normalizeTokenList(raw_tokens)
108
-
109
- raise KnowhereError(
110
- "Invalid tokens payload for text chunk "
111
- f"'{chunk_id}': expected list[str], "
112
- f"got {type(raw_tokens).__name__}."
113
- )
114
-
115
-
116
84
  def _buildChunks(
117
85
  raw_chunks: List[Dict[str, Any]],
118
86
  zf: zipfile.ZipFile,
@@ -125,58 +93,39 @@ def _buildChunks(
125
93
 
126
94
  if chunk_type == "image":
127
95
  image_data: bytes = b""
128
- # file_path may be at top level, inside metadata, or use path as fallback
129
96
  file_path: Optional[str] = _extractFilePath(raw)
130
97
  if file_path:
131
98
  image_data = _readZipBytes(zf, file_path) or b""
132
- metadata: Dict[str, Any] = raw.get("metadata", {})
133
99
  chunk: Chunk = ImageChunk(
134
100
  chunk_id=raw.get("chunk_id", ""),
135
101
  type="image",
136
102
  content=raw.get("content", ""),
137
103
  path=raw.get("path"),
138
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
139
- length=metadata.get("length", raw.get("length", 0)),
140
104
  file_path=file_path,
141
- original_name=metadata.get("original_name", raw.get("original_name")),
142
- summary=metadata.get("summary", raw.get("summary")),
143
105
  data=image_data,
106
+ metadata=raw.get("metadata", {}),
144
107
  )
145
108
  elif chunk_type == "table":
146
109
  table_html: str = ""
147
110
  file_path = _extractFilePath(raw)
148
111
  if file_path:
149
112
  table_html = _readZipText(zf, file_path) or ""
150
- metadata = raw.get("metadata", {})
151
113
  chunk = TableChunk(
152
114
  chunk_id=raw.get("chunk_id", ""),
153
115
  type="table",
154
116
  content=raw.get("content", ""),
155
117
  path=raw.get("path"),
156
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
157
- length=metadata.get("length", raw.get("length", 0)),
158
118
  file_path=file_path,
159
- original_name=metadata.get("original_name", raw.get("original_name")),
160
- table_type=metadata.get("table_type", raw.get("table_type")),
161
- summary=metadata.get("summary", raw.get("summary")),
162
119
  html=table_html,
120
+ metadata=raw.get("metadata", {}),
163
121
  )
164
122
  else:
165
- metadata = raw.get("metadata", {})
166
- chunk_id: str = raw.get("chunk_id", "")
167
- raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
168
123
  chunk = TextChunk(
169
- chunk_id=chunk_id,
124
+ chunk_id=raw.get("chunk_id", ""),
170
125
  type="text",
171
126
  content=raw.get("content", ""),
172
127
  path=raw.get("path"),
173
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
174
- length=metadata.get("length", raw.get("length", 0)),
175
- tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
176
- keywords=metadata.get("keywords", raw.get("keywords")),
177
- summary=metadata.get("summary", raw.get("summary")),
178
- connect_to=metadata.get("connect_to", raw.get("connect_to")),
179
- relationships=metadata.get("relationships", raw.get("relationships")),
128
+ metadata=raw.get("metadata", {}),
180
129
  )
181
130
 
182
131
  chunks.append(chunk)
@@ -229,7 +178,15 @@ def parseResultZip(
229
178
  # -- Full markdown --
230
179
  full_markdown: str = _readZipText(zf, "full.md") or ""
231
180
 
232
- # -- Hierarchy --
181
+ # -- DocNav (current worker output) --
182
+ doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
183
+ doc_nav: Optional[DocNav] = (
184
+ DocNav.model_validate(json.loads(doc_nav_text))
185
+ if doc_nav_text
186
+ else None
187
+ )
188
+
189
+ # -- Hierarchy (legacy — current worker no longer emits this) --
233
190
  hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
234
191
  hierarchy: Optional[Any] = (
235
192
  json.loads(hierarchy_text) if hierarchy_text else None
@@ -263,11 +220,13 @@ def parseResultZip(
263
220
  return ParseResult(
264
221
  manifest=manifest,
265
222
  chunks=chunks,
266
- chunks_slim=chunks_slim,
267
223
  full_markdown=full_markdown,
224
+ raw_zip=zip_bytes,
225
+ doc_nav=doc_nav,
226
+ # Legacy — the current worker no longer emits these files
227
+ chunks_slim=chunks_slim,
268
228
  hierarchy=hierarchy,
269
229
  toc_hierarchies=toc_hierarchies,
270
230
  kb_csv=kb_csv,
271
231
  hierarchy_view_html=hierarchy_view_html,
272
- raw_zip=zip_bytes,
273
232
  )
@@ -5,7 +5,13 @@ from __future__ import annotations
5
5
  from typing import Any, Dict, Optional
6
6
 
7
7
  from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
- from knowhere.types.document import Document, DocumentListResponse
8
+ from knowhere.types.document import (
9
+ Document,
10
+ DocumentChunkListResponse,
11
+ DocumentChunkResponse,
12
+ DocumentChunkType,
13
+ DocumentListResponse,
14
+ )
9
15
 
10
16
 
11
17
  class Documents(SyncAPIResource):
@@ -32,6 +38,49 @@ class Documents(SyncAPIResource):
32
38
  cast_to=Document,
33
39
  )
34
40
 
41
+ def list_chunks(
42
+ self,
43
+ document_id: str,
44
+ *,
45
+ page: int = 1,
46
+ page_size: int = 50,
47
+ chunk_type: Optional[DocumentChunkType] = None,
48
+ include_asset_urls: bool = False,
49
+ ) -> DocumentChunkListResponse:
50
+ """List current-revision chunks for one canonical document."""
51
+ params: Dict[str, Any] = _build_chunk_list_params(
52
+ page=page,
53
+ page_size=page_size,
54
+ chunk_type=chunk_type,
55
+ include_asset_urls=include_asset_urls,
56
+ )
57
+
58
+ return self._request(
59
+ "GET",
60
+ f"v1/documents/{document_id}/chunks",
61
+ params=params or None,
62
+ cast_to=DocumentChunkListResponse,
63
+ )
64
+
65
+ def get_chunk(
66
+ self,
67
+ document_id: str,
68
+ document_chunk_id: str,
69
+ *,
70
+ include_asset_urls: bool = False,
71
+ ) -> DocumentChunkResponse:
72
+ """Get one current-revision chunk for one canonical document."""
73
+ params: Dict[str, Any] = _build_chunk_get_params(
74
+ include_asset_urls=include_asset_urls,
75
+ )
76
+
77
+ return self._request(
78
+ "GET",
79
+ f"v1/documents/{document_id}/chunks/{document_chunk_id}",
80
+ params=params or None,
81
+ cast_to=DocumentChunkResponse,
82
+ )
83
+
35
84
  def archive(self, document_id: str) -> Document:
36
85
  """Archive one canonical document by ID."""
37
86
  return self._request(
@@ -65,6 +114,49 @@ class AsyncDocuments(AsyncAPIResource):
65
114
  cast_to=Document,
66
115
  )
67
116
 
117
+ async def list_chunks(
118
+ self,
119
+ document_id: str,
120
+ *,
121
+ page: int = 1,
122
+ page_size: int = 50,
123
+ chunk_type: Optional[DocumentChunkType] = None,
124
+ include_asset_urls: bool = False,
125
+ ) -> DocumentChunkListResponse:
126
+ """List current-revision chunks for one canonical document."""
127
+ params: Dict[str, Any] = _build_chunk_list_params(
128
+ page=page,
129
+ page_size=page_size,
130
+ chunk_type=chunk_type,
131
+ include_asset_urls=include_asset_urls,
132
+ )
133
+
134
+ return await self._request(
135
+ "GET",
136
+ f"v1/documents/{document_id}/chunks",
137
+ params=params or None,
138
+ cast_to=DocumentChunkListResponse,
139
+ )
140
+
141
+ async def get_chunk(
142
+ self,
143
+ document_id: str,
144
+ document_chunk_id: str,
145
+ *,
146
+ include_asset_urls: bool = False,
147
+ ) -> DocumentChunkResponse:
148
+ """Get one current-revision chunk for one canonical document."""
149
+ params: Dict[str, Any] = _build_chunk_get_params(
150
+ include_asset_urls=include_asset_urls,
151
+ )
152
+
153
+ return await self._request(
154
+ "GET",
155
+ f"v1/documents/{document_id}/chunks/{document_chunk_id}",
156
+ params=params or None,
157
+ cast_to=DocumentChunkResponse,
158
+ )
159
+
68
160
  async def archive(self, document_id: str) -> Document:
69
161
  """Archive one canonical document by ID."""
70
162
  return await self._request(
@@ -72,3 +164,28 @@ class AsyncDocuments(AsyncAPIResource):
72
164
  f"v1/documents/{document_id}/archive",
73
165
  cast_to=Document,
74
166
  )
167
+
168
+
169
+ def _build_chunk_list_params(
170
+ *,
171
+ page: int,
172
+ page_size: int,
173
+ chunk_type: Optional[DocumentChunkType],
174
+ include_asset_urls: bool,
175
+ ) -> Dict[str, Any]:
176
+ params: Dict[str, Any] = {}
177
+ if page != 1:
178
+ params["page"] = page
179
+ if page_size != 50:
180
+ params["page_size"] = page_size
181
+ if chunk_type is not None:
182
+ params["chunk_type"] = chunk_type
183
+ if include_asset_urls:
184
+ params["include_asset_urls"] = True
185
+ return params
186
+
187
+
188
+ def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]:
189
+ if not include_asset_urls:
190
+ return {}
191
+ return {"include_asset_urls": True}
@@ -22,6 +22,7 @@ class Retrieval(SyncAPIResource):
22
22
  query: str,
23
23
  namespace: Optional[str] = None,
24
24
  top_k: Optional[int] = None,
25
+ use_agentic: Optional[bool] = None,
25
26
  data_type: Optional[int] = None,
26
27
  signal_paths: Optional[list[str]] = None,
27
28
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -39,6 +40,8 @@ class Retrieval(SyncAPIResource):
39
40
  body["namespace"] = namespace
40
41
  if top_k is not None:
41
42
  body["top_k"] = top_k
43
+ if use_agentic is not None:
44
+ body["use_agentic"] = use_agentic
42
45
  if data_type is not None:
43
46
  body["data_type"] = data_type
44
47
  if signal_paths is not None:
@@ -77,6 +80,7 @@ class AsyncRetrieval(AsyncAPIResource):
77
80
  query: str,
78
81
  namespace: Optional[str] = None,
79
82
  top_k: Optional[int] = None,
83
+ use_agentic: Optional[bool] = None,
80
84
  data_type: Optional[int] = None,
81
85
  signal_paths: Optional[list[str]] = None,
82
86
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -94,6 +98,8 @@ class AsyncRetrieval(AsyncAPIResource):
94
98
  body["namespace"] = namespace
95
99
  if top_k is not None:
96
100
  body["top_k"] = top_k
101
+ if use_agentic is not None:
102
+ body["use_agentic"] = use_agentic
97
103
  if data_type is not None:
98
104
  body["data_type"] = data_type
99
105
  if signal_paths is not None:
@@ -2,7 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from knowhere.types.document import Document, DocumentListResponse
5
+ from knowhere.types.document import (
6
+ Document,
7
+ DocumentChunk,
8
+ DocumentChunkListResponse,
9
+ DocumentChunkPagination,
10
+ DocumentChunkResponse,
11
+ DocumentChunkType,
12
+ DocumentListResponse,
13
+ )
6
14
  from knowhere.types.job import Job, JobError, JobResult
7
15
  from knowhere.types.params import ParsingParams, WebhookConfig
8
16
  from knowhere.types.retrieval import (
@@ -39,6 +47,11 @@ __all__: list[str] = [
39
47
  "JobResult",
40
48
  # document
41
49
  "Document",
50
+ "DocumentChunk",
51
+ "DocumentChunkListResponse",
52
+ "DocumentChunkPagination",
53
+ "DocumentChunkResponse",
54
+ "DocumentChunkType",
42
55
  "DocumentListResponse",
43
56
  # retrieval
44
57
  "RetrievalChannel",
@@ -3,7 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from datetime import datetime
6
- from typing import Optional
6
+ from typing import Any, Dict, Literal, Optional
7
7
 
8
8
  from pydantic import BaseModel
9
9
 
@@ -26,3 +26,53 @@ class DocumentListResponse(BaseModel):
26
26
 
27
27
  namespace: str
28
28
  documents: list[Document]
29
+
30
+
31
+ DocumentChunkType = Literal["text", "image", "table"]
32
+
33
+
34
+ class DocumentChunkPagination(BaseModel):
35
+ """Pagination metadata returned by document chunk list endpoints."""
36
+
37
+ page: int
38
+ page_size: int
39
+ total: int
40
+ total_pages: int
41
+
42
+
43
+ class DocumentChunk(BaseModel):
44
+ """One current-revision document chunk."""
45
+
46
+ id: str
47
+ chunk_id: str
48
+ chunk_type: DocumentChunkType
49
+ content: Optional[str] = None
50
+ section_id: Optional[str] = None
51
+ section_path: Optional[str] = None
52
+ source_chunk_path: Optional[str] = None
53
+ file_path: Optional[str] = None
54
+ sort_order: int
55
+ metadata: Dict[str, Any]
56
+ asset_url: Optional[str] = None
57
+ created_at: Optional[datetime] = None
58
+
59
+
60
+ class DocumentChunkListResponse(BaseModel):
61
+ """Response from ``GET /v1/documents/{document_id}/chunks``."""
62
+
63
+ document_id: str
64
+ namespace: str
65
+ job_result_id: Optional[str] = None
66
+ job_id: Optional[str] = None
67
+ chunks: list[DocumentChunk]
68
+ pagination: DocumentChunkPagination
69
+
70
+
71
+ class DocumentChunkResponse(BaseModel):
72
+ """Response from ``GET /v1/documents/{document_id}/chunks/{chunk_id}``."""
73
+
74
+ document_id: str
75
+ namespace: str
76
+ job_result_id: Optional[str] = None
77
+ job_id: Optional[str] = None
78
+ chunk: DocumentChunk
knowhere/types/result.py CHANGED
@@ -9,7 +9,6 @@ from pathlib import Path
9
9
  from typing import Any, Dict, List, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel, Field
12
- from typing_extensions import TypeAlias
13
12
 
14
13
  from knowhere._exceptions import ValidationError
15
14
 
@@ -138,6 +137,44 @@ class Manifest(BaseModel):
138
137
  checksum: Optional[Checksum] = None
139
138
  statistics: Optional[Statistics] = None
140
139
  files: Optional[FileIndex] = None
140
+ hierarchy: Optional[Any] = Field(default=None, alias="HIERARCHY")
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # DocNav models
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
+ class DocNavResourceItem(BaseModel):
149
+ """A single image or table resource entry in ``doc_nav.json``."""
150
+
151
+ path: str
152
+ summary: Optional[str] = None
153
+
154
+
155
+ class DocNavResources(BaseModel):
156
+ """Image and table resource summaries from ``doc_nav.json``."""
157
+
158
+ images: List[DocNavResourceItem] = Field(default_factory=list)
159
+ tables: List[DocNavResourceItem] = Field(default_factory=list)
160
+
161
+
162
+ class DocNavSection(BaseModel):
163
+ """A document section entry in the ``doc_nav.json`` navigation tree."""
164
+
165
+ title: str
166
+ path: str
167
+ level: int
168
+ summary: Optional[str] = None
169
+ chunk_count: int = 0
170
+ children: List["DocNavSection"] = Field(default_factory=list)
171
+
172
+
173
+ class DocNav(BaseModel):
174
+ """Top-level document navigation structure from ``doc_nav.json``."""
175
+
176
+ sections: List[DocNavSection] = Field(default_factory=list)
177
+ resources: Optional[DocNavResources] = None
141
178
 
142
179
 
143
180
  # ---------------------------------------------------------------------------
@@ -145,6 +182,27 @@ class Manifest(BaseModel):
145
182
  # ---------------------------------------------------------------------------
146
183
 
147
184
 
185
+ class ChunkMetadata(BaseModel):
186
+ """Known worker metadata fields for a chunk.
187
+
188
+ All fields are optional. Unknown fields added by future worker
189
+ versions are preserved thanks to ``model_config``.
190
+ """
191
+
192
+ model_config = {"extra": "allow"}
193
+
194
+ length: Optional[int] = None
195
+ page_nums: Optional[List[int]] = None
196
+ tokens: Optional[List[str]] = None
197
+ keywords: Optional[List[str]] = None
198
+ summary: Optional[str] = None
199
+ connect_to: Optional[List[Dict[str, Any]]] = None
200
+ file_path: Optional[str] = None
201
+ original_name: Optional[str] = None
202
+ table_type: Optional[str] = None
203
+ document_top_summary: Optional[str] = None
204
+
205
+
148
206
  class BaseChunk(BaseModel):
149
207
  """Fields shared by every chunk type."""
150
208
 
@@ -152,32 +210,20 @@ class BaseChunk(BaseModel):
152
210
  type: str
153
211
  content: str = ""
154
212
  path: Optional[str] = None
155
- page_nums: Optional[List[int]] = None
156
-
157
-
158
- TextChunkTokens: TypeAlias = List[str]
213
+ metadata: ChunkMetadata = Field(default_factory=ChunkMetadata)
159
214
 
160
215
 
161
216
  class TextChunk(BaseChunk):
162
217
  """A text chunk extracted from the document."""
163
218
 
164
219
  type: str = "text"
165
- length: int = 0
166
- tokens: Optional[TextChunkTokens] = None
167
- keywords: Optional[List[str]] = None
168
- summary: Optional[str] = None
169
- connect_to: Optional[List[Dict[str, Any]]] = None
170
- relationships: Optional[List[Union[Dict[str, Any], str]]] = None
171
220
 
172
221
 
173
222
  class ImageChunk(BaseChunk):
174
223
  """An image chunk — carries raw bytes loaded from the ZIP."""
175
224
 
176
225
  type: str = "image"
177
- length: int = 0
178
226
  file_path: Optional[str] = None
179
- original_name: Optional[str] = None
180
- summary: Optional[str] = None
181
227
  data: bytes = Field(default=b"", exclude=True)
182
228
 
183
229
  model_config = {"arbitrary_types_allowed": True}
@@ -193,13 +239,13 @@ class ImageChunk(BaseChunk):
193
239
  def save(self, directory: Union[str, Path]) -> Path:
194
240
  """Write the image bytes to *directory*, returning the output path.
195
241
 
196
- The filename is derived from ``original_name`` or ``file_path``,
197
- sanitised for cross-platform safety.
242
+ The filename is derived from ``file_path``, sanitised for
243
+ cross-platform safety.
198
244
  """
199
245
  dir_path: Path = Path(directory)
200
246
  dir_path.mkdir(parents=True, exist_ok=True)
201
247
 
202
- raw_name: str = self.original_name or os.path.basename(
248
+ raw_name: str = os.path.basename(
203
249
  self.file_path or f"{self.chunk_id}.bin"
204
250
  )
205
251
  safe_name: str = _sanitizeFilename(raw_name)
@@ -214,11 +260,7 @@ class TableChunk(BaseChunk):
214
260
  """A table chunk — carries HTML loaded from the ZIP."""
215
261
 
216
262
  type: str = "table"
217
- length: int = 0
218
263
  file_path: Optional[str] = None
219
- original_name: Optional[str] = None
220
- table_type: Optional[str] = None
221
- summary: Optional[str] = None
222
264
  html: str = Field(default="", exclude=True)
223
265
 
224
266
  def save(self, directory: Union[str, Path]) -> Path:
@@ -226,7 +268,7 @@ class TableChunk(BaseChunk):
226
268
  dir_path: Path = Path(directory)
227
269
  dir_path.mkdir(parents=True, exist_ok=True)
228
270
 
229
- raw_name: str = self.original_name or os.path.basename(
271
+ raw_name: str = os.path.basename(
230
272
  self.file_path or f"{self.chunk_id}.html"
231
273
  )
232
274
  safe_name: str = _sanitizeFilename(raw_name)
@@ -242,12 +284,11 @@ Chunk = Union[TextChunk, ImageChunk, TableChunk]
242
284
 
243
285
 
244
286
  class SlimChunk(BaseModel):
245
- """Minimal chunk entry emitted in chunks_slim.json."""
287
+ """Minimal chunk entry emitted in chunks_slim.json (legacy)."""
246
288
 
247
289
  type: str
248
290
  path: Optional[str] = None
249
291
  content: str = ""
250
- summary: Optional[str] = None
251
292
 
252
293
 
253
294
  # ---------------------------------------------------------------------------
@@ -259,48 +300,59 @@ class ParseResult:
259
300
  """Eagerly-loaded result of a document parsing job.
260
301
 
261
302
  Contains the manifest, all chunks (with image bytes and table HTML
262
- already loaded), the full markdown, hierarchy data, and the raw ZIP
263
- bytes for archival purposes.
303
+ already loaded), the full markdown, the document navigation tree,
304
+ and the raw ZIP bytes for archival purposes.
305
+
306
+ Legacy fields (``chunks_slim``, ``hierarchy``, ``toc_hierarchies``,
307
+ ``kb_csv``, ``hierarchy_view_html``) are kept for backward
308
+ compatibility with older result ZIPs. The current worker does not
309
+ emit ``chunks_slim.json`` or ``hierarchy.json``.
264
310
  """
265
311
 
266
312
  manifest: Manifest
267
313
  chunks: List[Chunk]
268
- chunks_slim: Optional[List[SlimChunk]]
269
314
  full_markdown: str
315
+ raw_zip: bytes
316
+ namespace: Optional[str]
317
+ document_id: Optional[str]
318
+ # Current worker output
319
+ doc_nav: Optional[DocNav]
320
+ # Legacy — the current worker no longer emits these files
321
+ chunks_slim: Optional[List[SlimChunk]]
270
322
  hierarchy: Optional[Any]
271
323
  toc_hierarchies: Optional[Any]
272
324
  kb_csv: Optional[str]
273
325
  hierarchy_view_html: Optional[str]
274
- raw_zip: bytes
275
- namespace: Optional[str]
276
- document_id: Optional[str]
277
326
 
278
327
  def __init__(
279
328
  self,
280
329
  *,
281
330
  manifest: Manifest,
282
331
  chunks: List[Chunk],
283
- chunks_slim: Optional[List[SlimChunk]],
284
332
  full_markdown: str,
285
- hierarchy: Optional[Any],
286
- toc_hierarchies: Optional[Any],
287
- kb_csv: Optional[str],
288
- hierarchy_view_html: Optional[str],
289
333
  raw_zip: bytes,
334
+ doc_nav: Optional[DocNav] = None,
290
335
  namespace: Optional[str] = None,
291
336
  document_id: Optional[str] = None,
337
+ # Legacy — the current worker no longer emits these files
338
+ chunks_slim: Optional[List[SlimChunk]] = None,
339
+ hierarchy: Optional[Any] = None,
340
+ toc_hierarchies: Optional[Any] = None,
341
+ kb_csv: Optional[str] = None,
342
+ hierarchy_view_html: Optional[str] = None,
292
343
  ) -> None:
293
344
  self.manifest = manifest
294
345
  self.chunks = chunks
295
- self.chunks_slim = chunks_slim
296
346
  self.full_markdown = full_markdown
347
+ self.raw_zip = raw_zip
348
+ self.doc_nav = doc_nav
349
+ self.namespace = namespace
350
+ self.document_id = document_id
351
+ self.chunks_slim = chunks_slim
297
352
  self.hierarchy = hierarchy
298
353
  self.toc_hierarchies = toc_hierarchies
299
354
  self.kb_csv = kb_csv
300
355
  self.hierarchy_view_html = hierarchy_view_html
301
- self.raw_zip = raw_zip
302
- self.namespace = namespace
303
- self.document_id = document_id
304
356
 
305
357
  # -- convenience properties --
306
358
 
@@ -344,11 +396,17 @@ class ParseResult:
344
396
  """Save the full result to *directory*.
345
397
 
346
398
  Creates the directory if needed and writes:
399
+ * ``manifest.json`` — result manifest
400
+ * ``chunks.json`` — all chunks
401
+ * ``doc_nav.json`` — document navigation tree (if present)
347
402
  * ``full.md`` — the full markdown
348
403
  * ``images/`` — all image chunks
349
404
  * ``tables/`` — all table chunks
350
405
  * ``result.zip`` — the raw ZIP archive
351
406
 
407
+ Legacy files (``chunks_slim.json``, ``hierarchy.json``, etc.) are
408
+ also written when present for backward compatibility.
409
+
352
410
  Returns the resolved directory path.
353
411
  """
354
412
  dir_path: Path = Path(directory)
@@ -357,7 +415,7 @@ class ParseResult:
357
415
  # Manifest / chunks
358
416
  manifest_path: Path = dir_path / "manifest.json"
359
417
  manifest_path.write_text(
360
- self.manifest.model_dump_json(indent=2),
418
+ self.manifest.model_dump_json(indent=2, by_alias=True),
361
419
  encoding="utf-8",
362
420
  )
363
421
 
@@ -367,6 +425,13 @@ class ParseResult:
367
425
  encoding="utf-8",
368
426
  )
369
427
 
428
+ if self.doc_nav is not None:
429
+ doc_nav_path: Path = dir_path / "doc_nav.json"
430
+ doc_nav_path.write_text(
431
+ self.doc_nav.model_dump_json(indent=2),
432
+ encoding="utf-8",
433
+ )
434
+
370
435
  if self.chunks_slim is not None:
371
436
  chunks_slim_path: Path = dir_path / "chunks_slim.json"
372
437
  chunks_slim_path.write_text(
@@ -2,9 +2,9 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Literal, Optional, TypedDict
5
+ from typing import Any, Dict, List, Literal, Optional, TypedDict
6
6
 
7
- from pydantic import BaseModel
7
+ from pydantic import BaseModel, Field
8
8
 
9
9
 
10
10
  RetrievalChannel = Literal["path", "content", "term"]
@@ -37,9 +37,16 @@ class RetrievalResult(BaseModel):
37
37
 
38
38
 
39
39
  class RetrievalQueryResponse(BaseModel):
40
- """Response from ``POST /v1/retrieval/query``."""
40
+ """Response from ``POST /v1/retrieval/query``.
41
+
42
+ Agentic fields (``answer_text``, ``referenced_chunks``) are only
43
+ populated when ``use_agentic=True``. In legacy retrieval mode they
44
+ default to ``None`` and ``[]`` respectively.
45
+ """
41
46
 
42
47
  namespace: str
43
48
  query: str
44
49
  router_used: Optional[str] = None
50
+ answer_text: Optional[str] = None
51
+ referenced_chunks: List[Dict[str, Any]] = Field(default_factory=list)
45
52
  results: list[RetrievalResult]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.3.2
3
+ Version: 0.5.0
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -118,6 +118,21 @@ update_job = client.jobs.create(
118
118
  document = client.documents.get(document_id)
119
119
  print(document.status)
120
120
 
121
+ chunks = client.documents.list_chunks(
122
+ document_id,
123
+ page=1,
124
+ page_size=50,
125
+ chunk_type="text",
126
+ )
127
+ print(chunks.pagination.total)
128
+ if chunks.chunks:
129
+ chunk = client.documents.get_chunk(
130
+ document_id,
131
+ chunks.chunks[0].id,
132
+ include_asset_urls=True,
133
+ )
134
+ print(chunk.chunk.content)
135
+
121
136
  client.documents.archive(document_id)
122
137
  ```
123
138
 
@@ -1,4 +1,4 @@
1
- knowhere/__init__.py,sha256=wicVid8SW7a3AqabHmHI6iIxpY5Tm732eMyQgBQ7zDM,3016
1
+ knowhere/__init__.py,sha256=pucs7krCP306K1iW7_3X-6kY81qJs9FT9H_jly3ZaSA,3297
2
2
  knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
3
3
  knowhere/_client.py,sha256=WYb-Fhi3x3nQYNfQG9eCgOpLc_wVyAawfPZWdZhFESg,9586
4
4
  knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
@@ -6,25 +6,25 @@ knowhere/_exceptions.py,sha256=NflH7phh_bNFOJmQ758V4mZCAFQskpGXACMz2JIfFNU,11896
6
6
  knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
7
7
  knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
8
8
  knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
9
- knowhere/_version.py,sha256=eN28KXRy0VvgUkjSTUYstuIdAhhpG6cgufP7uWuf12w,50
9
+ knowhere/_version.py,sha256=HeU50Kgj5bAqiHZ4FeMdYY8FZgliNX2Ol-kt2p2JREI,50
10
10
  knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
12
12
  knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
13
- knowhere/lib/result_parser.py,sha256=dR3knoMq-AFMAe0M3l0YgOM-OrtSmofSLaKZO0tgYao,9882
13
+ knowhere/lib/result_parser.py,sha256=t7504xKxwYgcPcJDrdSjtGKuNVGUnj49m1PvM1NOzKo,7849
14
14
  knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
15
15
  knowhere/resources/__init__.py,sha256=ClsR-yn_0E4KOopD_Yq13wbPHHjl9s15XpydN-d2Rzo,393
16
16
  knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
17
- knowhere/resources/documents.py,sha256=u_gmrElvpMOABaHkEuTyaYvh4D_CG4pHZt23r8tivaY,2314
17
+ knowhere/resources/documents.py,sha256=itBkO3oud-ilo2tDOIeSB517OPVDVyfwSnPfHYYu23I,5695
18
18
  knowhere/resources/jobs.py,sha256=xYhgYP3Vz7SgGEckmXOvZocNru_4nsS4BoqquojncNw,9727
19
- knowhere/resources/retrieval.py,sha256=t_jFY-7wYfYVSH6e3WYgn0IaoaPcABXaeZoqcs-pUIo,4543
20
- knowhere/types/__init__.py,sha256=-T1Rx90y1W3kSW63v6QbXDgTO9aE097vx98xvRaYejU,1452
21
- knowhere/types/document.py,sha256=LbFleglvm538vSDDho82j7fVxvgMXdIVm9wrWemLShY,711
19
+ knowhere/resources/retrieval.py,sha256=E789ZJsJwk6uEHitZfZjsmZ2I-gp4NF1lBCP66gapYk,4795
20
+ knowhere/types/__init__.py,sha256=qsfiUolOzimMMen6DkhqW9htAYBunWBwv0r1O3_Hatg,1733
21
+ knowhere/types/document.py,sha256=iWK528fjGNyW36GhNAz0rq3164JzaPpkA0_UiQwbESE,1997
22
22
  knowhere/types/job.py,sha256=VsLUFuELZo8rRemuekTbliTIwaD6CR_dAjgdSriPmw4,2472
23
23
  knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
24
- knowhere/types/result.py,sha256=uSpvOadmKOF5-n_uBTkmWAho2eDsOAUZoK_W96X2jeU,13143
25
- knowhere/types/retrieval.py,sha256=EopqmAx2DeO9AmEbd50emdu2mTbTxrhGoJ6DwvvoUCI,1090
24
+ knowhere/types/result.py,sha256=hpubTz95PLdoeXKpEc9YJ7Sqfla_3T6wOCJGE7YdC_s,15421
25
+ knowhere/types/retrieval.py,sha256=cm7ks_OESi6F7fGCjiKG-RJGx1iu2zee-qCFUCUa0-Y,1422
26
26
  knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
27
- knowhere_python_sdk-0.3.2.dist-info/METADATA,sha256=Z12Y7vX6r6HeKg1DRw-J1isGwCo3Dt_PmSeJ5BRXU8g,8635
28
- knowhere_python_sdk-0.3.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
29
- knowhere_python_sdk-0.3.2.dist-info/licenses/LICENSE,sha256=jrRlxQDHyd_fTtIkQ_LlJV5AdlM_k_RFVPiJ3bTO6FQ,1070
30
- knowhere_python_sdk-0.3.2.dist-info/RECORD,,
27
+ knowhere_python_sdk-0.5.0.dist-info/METADATA,sha256=T1DBVJ3TWFFNCta2YMadQOcVeX_LvVvdIadRSTr8F04,8956
28
+ knowhere_python_sdk-0.5.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
29
+ knowhere_python_sdk-0.5.0.dist-info/licenses/LICENSE,sha256=jrRlxQDHyd_fTtIkQ_LlJV5AdlM_k_RFVPiJ3bTO6FQ,1070
30
+ knowhere_python_sdk-0.5.0.dist-info/RECORD,,