knowhere-python-sdk 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. knowhere_python_sdk-0.5.0/.release-please-manifest.json +3 -0
  2. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CHANGELOG.md +7 -0
  3. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/PKG-INFO +1 -1
  4. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/docs/usage.md +48 -29
  5. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/pyproject.toml +1 -1
  6. knowhere_python_sdk-0.5.0/src/knowhere/_version.py +1 -0
  7. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/result_parser.py +18 -59
  8. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/retrieval.py +6 -0
  9. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/result.py +105 -40
  10. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/retrieval.py +10 -3
  11. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_models.py +10 -47
  12. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_result_parser.py +200 -105
  13. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_retrieval.py +93 -0
  14. knowhere_python_sdk-0.4.0/.release-please-manifest.json +0 -3
  15. knowhere_python_sdk-0.4.0/src/knowhere/_version.py +0 -1
  16. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
  17. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  18. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
  19. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/pull_request_template.md +0 -0
  20. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/ci.yml +0 -0
  21. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/publish-pypi.yml +0 -0
  22. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/publish.yml +0 -0
  23. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.gitignore +0 -0
  24. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CODE_OF_CONDUCT.md +0 -0
  25. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CONTRIBUTING.md +0 -0
  26. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/LICENSE +0 -0
  27. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/README.md +0 -0
  28. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/SECURITY.md +0 -0
  29. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/async_usage.py +0 -0
  30. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/error_handling.py +0 -0
  31. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/parse_file.py +0 -0
  32. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/parse_url.py +0 -0
  33. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/step_by_step.py +0 -0
  34. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/release-please-config.json +0 -0
  35. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/__init__.py +0 -0
  36. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_base_client.py +0 -0
  37. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_client.py +0 -0
  38. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_constants.py +0 -0
  39. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_exceptions.py +0 -0
  40. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_logging.py +0 -0
  41. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_response.py +0 -0
  42. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_types.py +0 -0
  43. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/__init__.py +0 -0
  44. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/polling.py +0 -0
  45. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/upload.py +0 -0
  46. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/py.typed +0 -0
  47. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/__init__.py +0 -0
  48. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/_base.py +0 -0
  49. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/documents.py +0 -0
  50. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/jobs.py +0 -0
  51. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/__init__.py +0 -0
  52. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/document.py +0 -0
  53. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/job.py +0 -0
  54. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/params.py +0 -0
  55. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/shared.py +0 -0
  56. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/__init__.py +0 -0
  57. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/conftest.py +0 -0
  58. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/fixtures/real_result.zip +0 -0
  59. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_client.py +0 -0
  60. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_documents.py +0 -0
  61. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_exceptions.py +0 -0
  62. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_jobs.py +0 -0
  63. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_logging.py +0 -0
  64. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_parse.py +0 -0
  65. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_polling.py +0 -0
  66. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_retry.py +0 -0
  67. {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_upload.py +0 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.5.0"
3
+ }
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.5.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.4.0...v0.5.0) (2026-05-15)
4
+
5
+
6
+ ### Features
7
+
8
+ * sync SDK with current worker ZIP contract and agentic retrieval API ([ad8db2e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/ad8db2e87c77978928d046c95565e9e60c1b1f4e))
9
+
3
10
  ## [0.4.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.2...v0.4.0) (2026-04-27)
4
11
 
5
12
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -1,5 +1,9 @@
1
1
  # Knowhere Python SDK — Usage Guide
2
2
 
3
+ > **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`,
4
+ > `length`, etc.) are no longer flattened to the chunk surface. Access them
5
+ > through `chunk.metadata` instead. See [Chunk Types](#chunk-types).
6
+
3
7
  Comprehensive reference for every feature, parameter, and pattern in the SDK.
4
8
 
5
9
  ## Table of Contents
@@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk]
219
223
  # Lookup by ID
220
224
  chunk = result.getChunk("chunk_42")
221
225
 
222
- # Hierarchy data (document structure tree, if available)
223
- result.hierarchy
226
+ # Document navigation tree (from doc_nav.json, current worker output)
227
+ result.doc_nav # DocNav | None
228
+ result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels
229
+ result.doc_nav.resources # DocNavResources | None — image/table resource summaries
230
+
231
+ # Legacy hierarchy (from hierarchy.json, older worker output)
232
+ result.hierarchy # Any | None
224
233
 
225
234
  # Raw ZIP bytes (for archival)
226
235
  result.raw_zip
@@ -239,49 +248,48 @@ result.save("./output/report/")
239
248
 
240
249
  ## Chunk Types
241
250
 
242
- Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`). Each type adds its own fields.
251
+ Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`,
252
+ `metadata`). Worker metadata is kept in the `metadata` dict — it is **not**
253
+ flattened to top-level chunk properties.
243
254
 
244
- ### TextChunk
255
+ ### Base fields (all chunk types)
245
256
 
246
257
  | Field | Type | Description |
247
258
  |-------|------|-------------|
248
259
  | `chunk_id` | `str` | Unique identifier |
249
- | `type` | `str` | Always `"text"` |
250
- | `content` | `str` | The text content |
251
- | `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) |
252
- | `length` | `int` | Character count |
253
- | `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline |
254
- | `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) |
255
- | `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
256
- | `relationships` | `List \| None` | Relationships to other chunks |
260
+ | `type` | `str` | `"text"`, `"image"`, or `"table"` |
261
+ | `content` | `str` | Text content or placeholder |
262
+ | `path` | `str \| None` | Document structure path |
263
+ | `metadata` | `ChunkMetadata` | Worker metadata model (tokens, keywords, summary, length, page_nums, etc.; unknown fields are preserved) |
264
+
265
+ ### TextChunk
257
266
 
258
267
  ```python
259
268
  for chunk in result.text_chunks:
260
269
  print(f"[{chunk.chunk_id}] {chunk.content[:60]}...")
261
- if chunk.keywords:
262
- print(f" Keywords: {', '.join(chunk.keywords)}")
263
- if chunk.summary:
264
- print(f" Summary: {chunk.summary}")
270
+ # Metadata is in chunk.metadata, not flattened:
271
+ keywords = chunk.metadata.keywords or []
272
+ summary = chunk.metadata.summary
273
+ if keywords:
274
+ print(f" Keywords: {', '.join(keywords)}")
275
+ if summary:
276
+ print(f" Summary: {summary}")
265
277
  ```
266
278
 
267
279
  ### ImageChunk
268
280
 
269
281
  | Field | Type | Description |
270
282
  |-------|------|-------------|
271
- | `chunk_id` | `str` | Unique identifier |
272
- | `type` | `str` | Always `"image"` |
273
- | `content` | `str` | Text content associated with the image |
274
283
  | `file_path` | `str \| None` | Path within the ZIP |
275
- | `original_name` | `str \| None` | Original filename |
276
- | `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) |
277
284
  | `data` | `bytes` | Raw image bytes (loaded from ZIP) |
278
285
  | `format` | `str \| None` | Image format inferred from extension (property) |
279
286
 
280
287
  ```python
281
288
  for img in result.image_chunks:
282
289
  print(f"{img.file_path} ({len(img.data)} bytes, {img.format})")
283
- if img.summary:
284
- print(f" Description: {img.summary}")
290
+ summary = img.metadata.summary
291
+ if summary:
292
+ print(f" Description: {summary}")
285
293
  img.save("./output/images/") # writes to disk
286
294
  ```
287
295
 
@@ -289,13 +297,7 @@ for img in result.image_chunks:
289
297
 
290
298
  | Field | Type | Description |
291
299
  |-------|------|-------------|
292
- | `chunk_id` | `str` | Unique identifier |
293
- | `type` | `str` | Always `"table"` |
294
- | `content` | `str` | Text representation of the table |
295
300
  | `file_path` | `str \| None` | Path within the ZIP |
296
- | `original_name` | `str \| None` | Original filename |
297
- | `table_type` | `str \| None` | Table classification |
298
- | `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) |
299
301
  | `html` | `str` | Full HTML of the table (loaded from ZIP) |
300
302
 
301
303
  ```python
@@ -471,6 +473,19 @@ response = client.retrieval.query(
471
473
  top_k=5,
472
474
  )
473
475
 
476
+ # Agentic mode (LLM navigation + answer synthesis)
477
+ response = client.retrieval.query(
478
+ namespace="support-center",
479
+ query="How do I pair a Bluetooth headset?",
480
+ use_agentic=True,
481
+ top_k=5,
482
+ )
483
+ print(response.answer_text) # LLM-generated natural-language answer
484
+ print(response.router_used) # "workflow_single_step", "small_kb_all", etc.
485
+ for ref in response.referenced_chunks:
486
+ print(ref.get("chunk_id"), ref.get("asset_url"))
487
+
488
+ # Legacy results are always available
474
489
  for result in response.results:
475
490
  print(result.content)
476
491
  print(result.score)
@@ -479,6 +494,10 @@ for result in response.results:
479
494
  print(result.source.section_path)
480
495
  ```
481
496
 
497
+ | Parameter | Type | Default | Description |
498
+ |-----------|------|---------|-------------|
499
+ | `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. |
500
+
482
501
  Retrieval results expose `content`, not the older parse-result `text` field.
483
502
  Media results may include `asset_url` when the server can sign the referenced
484
503
  artifact.
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowhere-python-sdk"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "Official Python SDK for the Knowhere document parsing API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -0,0 +1 @@
1
+ __version__ = "0.5.0" # x-release-please-version
@@ -13,13 +13,13 @@ from knowhere._exceptions import ChecksumError, KnowhereError
13
13
  from knowhere._logging import getLogger
14
14
  from knowhere.types.result import (
15
15
  Chunk,
16
+ DocNav,
16
17
  ImageChunk,
17
18
  Manifest,
18
19
  ParseResult,
19
20
  SlimChunk,
20
21
  TableChunk,
21
22
  TextChunk,
22
- TextChunkTokens,
23
23
  )
24
24
 
25
25
  _logger = getLogger()
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
81
81
  return fallback
82
82
 
83
83
 
84
- def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
85
- """Return a string-only token list with empty values removed."""
86
- normalized_tokens: List[str] = []
87
- for raw_token in raw_tokens:
88
- token_text: str = str(raw_token).strip()
89
- if token_text:
90
- normalized_tokens.append(token_text)
91
- return normalized_tokens
92
-
93
-
94
- def _parseTextChunkTokens(
95
- raw_tokens: Any,
96
- *,
97
- chunk_id: str,
98
- ) -> Optional[TextChunkTokens]:
99
- """Normalize text chunk tokens from the current backend payload."""
100
- if raw_tokens is None:
101
- return None
102
- if isinstance(raw_tokens, bool):
103
- raise KnowhereError(
104
- f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
105
- )
106
- if isinstance(raw_tokens, list):
107
- return _normalizeTokenList(raw_tokens)
108
-
109
- raise KnowhereError(
110
- "Invalid tokens payload for text chunk "
111
- f"'{chunk_id}': expected list[str], "
112
- f"got {type(raw_tokens).__name__}."
113
- )
114
-
115
-
116
84
  def _buildChunks(
117
85
  raw_chunks: List[Dict[str, Any]],
118
86
  zf: zipfile.ZipFile,
@@ -125,58 +93,39 @@ def _buildChunks(
125
93
 
126
94
  if chunk_type == "image":
127
95
  image_data: bytes = b""
128
- # file_path may be at top level, inside metadata, or use path as fallback
129
96
  file_path: Optional[str] = _extractFilePath(raw)
130
97
  if file_path:
131
98
  image_data = _readZipBytes(zf, file_path) or b""
132
- metadata: Dict[str, Any] = raw.get("metadata", {})
133
99
  chunk: Chunk = ImageChunk(
134
100
  chunk_id=raw.get("chunk_id", ""),
135
101
  type="image",
136
102
  content=raw.get("content", ""),
137
103
  path=raw.get("path"),
138
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
139
- length=metadata.get("length", raw.get("length", 0)),
140
104
  file_path=file_path,
141
- original_name=metadata.get("original_name", raw.get("original_name")),
142
- summary=metadata.get("summary", raw.get("summary")),
143
105
  data=image_data,
106
+ metadata=raw.get("metadata", {}),
144
107
  )
145
108
  elif chunk_type == "table":
146
109
  table_html: str = ""
147
110
  file_path = _extractFilePath(raw)
148
111
  if file_path:
149
112
  table_html = _readZipText(zf, file_path) or ""
150
- metadata = raw.get("metadata", {})
151
113
  chunk = TableChunk(
152
114
  chunk_id=raw.get("chunk_id", ""),
153
115
  type="table",
154
116
  content=raw.get("content", ""),
155
117
  path=raw.get("path"),
156
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
157
- length=metadata.get("length", raw.get("length", 0)),
158
118
  file_path=file_path,
159
- original_name=metadata.get("original_name", raw.get("original_name")),
160
- table_type=metadata.get("table_type", raw.get("table_type")),
161
- summary=metadata.get("summary", raw.get("summary")),
162
119
  html=table_html,
120
+ metadata=raw.get("metadata", {}),
163
121
  )
164
122
  else:
165
- metadata = raw.get("metadata", {})
166
- chunk_id: str = raw.get("chunk_id", "")
167
- raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
168
123
  chunk = TextChunk(
169
- chunk_id=chunk_id,
124
+ chunk_id=raw.get("chunk_id", ""),
170
125
  type="text",
171
126
  content=raw.get("content", ""),
172
127
  path=raw.get("path"),
173
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
174
- length=metadata.get("length", raw.get("length", 0)),
175
- tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
176
- keywords=metadata.get("keywords", raw.get("keywords")),
177
- summary=metadata.get("summary", raw.get("summary")),
178
- connect_to=metadata.get("connect_to", raw.get("connect_to")),
179
- relationships=metadata.get("relationships", raw.get("relationships")),
128
+ metadata=raw.get("metadata", {}),
180
129
  )
181
130
 
182
131
  chunks.append(chunk)
@@ -229,7 +178,15 @@ def parseResultZip(
229
178
  # -- Full markdown --
230
179
  full_markdown: str = _readZipText(zf, "full.md") or ""
231
180
 
232
- # -- Hierarchy --
181
+ # -- DocNav (current worker output) --
182
+ doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
183
+ doc_nav: Optional[DocNav] = (
184
+ DocNav.model_validate(json.loads(doc_nav_text))
185
+ if doc_nav_text
186
+ else None
187
+ )
188
+
189
+ # -- Hierarchy (legacy — current worker no longer emits this) --
233
190
  hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
234
191
  hierarchy: Optional[Any] = (
235
192
  json.loads(hierarchy_text) if hierarchy_text else None
@@ -263,11 +220,13 @@ def parseResultZip(
263
220
  return ParseResult(
264
221
  manifest=manifest,
265
222
  chunks=chunks,
266
- chunks_slim=chunks_slim,
267
223
  full_markdown=full_markdown,
224
+ raw_zip=zip_bytes,
225
+ doc_nav=doc_nav,
226
+ # Legacy — the current worker no longer emits these files
227
+ chunks_slim=chunks_slim,
268
228
  hierarchy=hierarchy,
269
229
  toc_hierarchies=toc_hierarchies,
270
230
  kb_csv=kb_csv,
271
231
  hierarchy_view_html=hierarchy_view_html,
272
- raw_zip=zip_bytes,
273
232
  )
@@ -22,6 +22,7 @@ class Retrieval(SyncAPIResource):
22
22
  query: str,
23
23
  namespace: Optional[str] = None,
24
24
  top_k: Optional[int] = None,
25
+ use_agentic: Optional[bool] = None,
25
26
  data_type: Optional[int] = None,
26
27
  signal_paths: Optional[list[str]] = None,
27
28
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -39,6 +40,8 @@ class Retrieval(SyncAPIResource):
39
40
  body["namespace"] = namespace
40
41
  if top_k is not None:
41
42
  body["top_k"] = top_k
43
+ if use_agentic is not None:
44
+ body["use_agentic"] = use_agentic
42
45
  if data_type is not None:
43
46
  body["data_type"] = data_type
44
47
  if signal_paths is not None:
@@ -77,6 +80,7 @@ class AsyncRetrieval(AsyncAPIResource):
77
80
  query: str,
78
81
  namespace: Optional[str] = None,
79
82
  top_k: Optional[int] = None,
83
+ use_agentic: Optional[bool] = None,
80
84
  data_type: Optional[int] = None,
81
85
  signal_paths: Optional[list[str]] = None,
82
86
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -94,6 +98,8 @@ class AsyncRetrieval(AsyncAPIResource):
94
98
  body["namespace"] = namespace
95
99
  if top_k is not None:
96
100
  body["top_k"] = top_k
101
+ if use_agentic is not None:
102
+ body["use_agentic"] = use_agentic
97
103
  if data_type is not None:
98
104
  body["data_type"] = data_type
99
105
  if signal_paths is not None:
@@ -9,7 +9,6 @@ from pathlib import Path
9
9
  from typing import Any, Dict, List, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel, Field
12
- from typing_extensions import TypeAlias
13
12
 
14
13
  from knowhere._exceptions import ValidationError
15
14
 
@@ -138,6 +137,44 @@ class Manifest(BaseModel):
138
137
  checksum: Optional[Checksum] = None
139
138
  statistics: Optional[Statistics] = None
140
139
  files: Optional[FileIndex] = None
140
+ hierarchy: Optional[Any] = Field(default=None, alias="HIERARCHY")
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # DocNav models
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
+ class DocNavResourceItem(BaseModel):
149
+ """A single image or table resource entry in ``doc_nav.json``."""
150
+
151
+ path: str
152
+ summary: Optional[str] = None
153
+
154
+
155
+ class DocNavResources(BaseModel):
156
+ """Image and table resource summaries from ``doc_nav.json``."""
157
+
158
+ images: List[DocNavResourceItem] = Field(default_factory=list)
159
+ tables: List[DocNavResourceItem] = Field(default_factory=list)
160
+
161
+
162
+ class DocNavSection(BaseModel):
163
+ """A document section entry in the ``doc_nav.json`` navigation tree."""
164
+
165
+ title: str
166
+ path: str
167
+ level: int
168
+ summary: Optional[str] = None
169
+ chunk_count: int = 0
170
+ children: List["DocNavSection"] = Field(default_factory=list)
171
+
172
+
173
+ class DocNav(BaseModel):
174
+ """Top-level document navigation structure from ``doc_nav.json``."""
175
+
176
+ sections: List[DocNavSection] = Field(default_factory=list)
177
+ resources: Optional[DocNavResources] = None
141
178
 
142
179
 
143
180
  # ---------------------------------------------------------------------------
@@ -145,6 +182,27 @@ class Manifest(BaseModel):
145
182
  # ---------------------------------------------------------------------------
146
183
 
147
184
 
185
+ class ChunkMetadata(BaseModel):
186
+ """Known worker metadata fields for a chunk.
187
+
188
+ All fields are optional. Unknown fields added by future worker
189
+ versions are preserved thanks to ``model_config``.
190
+ """
191
+
192
+ model_config = {"extra": "allow"}
193
+
194
+ length: Optional[int] = None
195
+ page_nums: Optional[List[int]] = None
196
+ tokens: Optional[List[str]] = None
197
+ keywords: Optional[List[str]] = None
198
+ summary: Optional[str] = None
199
+ connect_to: Optional[List[Dict[str, Any]]] = None
200
+ file_path: Optional[str] = None
201
+ original_name: Optional[str] = None
202
+ table_type: Optional[str] = None
203
+ document_top_summary: Optional[str] = None
204
+
205
+
148
206
  class BaseChunk(BaseModel):
149
207
  """Fields shared by every chunk type."""
150
208
 
@@ -152,32 +210,20 @@ class BaseChunk(BaseModel):
152
210
  type: str
153
211
  content: str = ""
154
212
  path: Optional[str] = None
155
- page_nums: Optional[List[int]] = None
156
-
157
-
158
- TextChunkTokens: TypeAlias = List[str]
213
+ metadata: ChunkMetadata = Field(default_factory=ChunkMetadata)
159
214
 
160
215
 
161
216
  class TextChunk(BaseChunk):
162
217
  """A text chunk extracted from the document."""
163
218
 
164
219
  type: str = "text"
165
- length: int = 0
166
- tokens: Optional[TextChunkTokens] = None
167
- keywords: Optional[List[str]] = None
168
- summary: Optional[str] = None
169
- connect_to: Optional[List[Dict[str, Any]]] = None
170
- relationships: Optional[List[Union[Dict[str, Any], str]]] = None
171
220
 
172
221
 
173
222
  class ImageChunk(BaseChunk):
174
223
  """An image chunk — carries raw bytes loaded from the ZIP."""
175
224
 
176
225
  type: str = "image"
177
- length: int = 0
178
226
  file_path: Optional[str] = None
179
- original_name: Optional[str] = None
180
- summary: Optional[str] = None
181
227
  data: bytes = Field(default=b"", exclude=True)
182
228
 
183
229
  model_config = {"arbitrary_types_allowed": True}
@@ -193,13 +239,13 @@ class ImageChunk(BaseChunk):
193
239
  def save(self, directory: Union[str, Path]) -> Path:
194
240
  """Write the image bytes to *directory*, returning the output path.
195
241
 
196
- The filename is derived from ``original_name`` or ``file_path``,
197
- sanitised for cross-platform safety.
242
+ The filename is derived from ``file_path``, sanitised for
243
+ cross-platform safety.
198
244
  """
199
245
  dir_path: Path = Path(directory)
200
246
  dir_path.mkdir(parents=True, exist_ok=True)
201
247
 
202
- raw_name: str = self.original_name or os.path.basename(
248
+ raw_name: str = os.path.basename(
203
249
  self.file_path or f"{self.chunk_id}.bin"
204
250
  )
205
251
  safe_name: str = _sanitizeFilename(raw_name)
@@ -214,11 +260,7 @@ class TableChunk(BaseChunk):
214
260
  """A table chunk — carries HTML loaded from the ZIP."""
215
261
 
216
262
  type: str = "table"
217
- length: int = 0
218
263
  file_path: Optional[str] = None
219
- original_name: Optional[str] = None
220
- table_type: Optional[str] = None
221
- summary: Optional[str] = None
222
264
  html: str = Field(default="", exclude=True)
223
265
 
224
266
  def save(self, directory: Union[str, Path]) -> Path:
@@ -226,7 +268,7 @@ class TableChunk(BaseChunk):
226
268
  dir_path: Path = Path(directory)
227
269
  dir_path.mkdir(parents=True, exist_ok=True)
228
270
 
229
- raw_name: str = self.original_name or os.path.basename(
271
+ raw_name: str = os.path.basename(
230
272
  self.file_path or f"{self.chunk_id}.html"
231
273
  )
232
274
  safe_name: str = _sanitizeFilename(raw_name)
@@ -242,12 +284,11 @@ Chunk = Union[TextChunk, ImageChunk, TableChunk]
242
284
 
243
285
 
244
286
  class SlimChunk(BaseModel):
245
- """Minimal chunk entry emitted in chunks_slim.json."""
287
+ """Minimal chunk entry emitted in chunks_slim.json (legacy)."""
246
288
 
247
289
  type: str
248
290
  path: Optional[str] = None
249
291
  content: str = ""
250
- summary: Optional[str] = None
251
292
 
252
293
 
253
294
  # ---------------------------------------------------------------------------
@@ -259,48 +300,59 @@ class ParseResult:
259
300
  """Eagerly-loaded result of a document parsing job.
260
301
 
261
302
  Contains the manifest, all chunks (with image bytes and table HTML
262
- already loaded), the full markdown, hierarchy data, and the raw ZIP
263
- bytes for archival purposes.
303
+ already loaded), the full markdown, the document navigation tree,
304
+ and the raw ZIP bytes for archival purposes.
305
+
306
+ Legacy fields (``chunks_slim``, ``hierarchy``, ``toc_hierarchies``,
307
+ ``kb_csv``, ``hierarchy_view_html``) are kept for backward
308
+ compatibility with older result ZIPs. The current worker does not
309
+ emit ``chunks_slim.json`` or ``hierarchy.json``.
264
310
  """
265
311
 
266
312
  manifest: Manifest
267
313
  chunks: List[Chunk]
268
- chunks_slim: Optional[List[SlimChunk]]
269
314
  full_markdown: str
315
+ raw_zip: bytes
316
+ namespace: Optional[str]
317
+ document_id: Optional[str]
318
+ # Current worker output
319
+ doc_nav: Optional[DocNav]
320
+ # Legacy — the current worker no longer emits these files
321
+ chunks_slim: Optional[List[SlimChunk]]
270
322
  hierarchy: Optional[Any]
271
323
  toc_hierarchies: Optional[Any]
272
324
  kb_csv: Optional[str]
273
325
  hierarchy_view_html: Optional[str]
274
- raw_zip: bytes
275
- namespace: Optional[str]
276
- document_id: Optional[str]
277
326
 
278
327
  def __init__(
279
328
  self,
280
329
  *,
281
330
  manifest: Manifest,
282
331
  chunks: List[Chunk],
283
- chunks_slim: Optional[List[SlimChunk]],
284
332
  full_markdown: str,
285
- hierarchy: Optional[Any],
286
- toc_hierarchies: Optional[Any],
287
- kb_csv: Optional[str],
288
- hierarchy_view_html: Optional[str],
289
333
  raw_zip: bytes,
334
+ doc_nav: Optional[DocNav] = None,
290
335
  namespace: Optional[str] = None,
291
336
  document_id: Optional[str] = None,
337
+ # Legacy — the current worker no longer emits these files
338
+ chunks_slim: Optional[List[SlimChunk]] = None,
339
+ hierarchy: Optional[Any] = None,
340
+ toc_hierarchies: Optional[Any] = None,
341
+ kb_csv: Optional[str] = None,
342
+ hierarchy_view_html: Optional[str] = None,
292
343
  ) -> None:
293
344
  self.manifest = manifest
294
345
  self.chunks = chunks
295
- self.chunks_slim = chunks_slim
296
346
  self.full_markdown = full_markdown
347
+ self.raw_zip = raw_zip
348
+ self.doc_nav = doc_nav
349
+ self.namespace = namespace
350
+ self.document_id = document_id
351
+ self.chunks_slim = chunks_slim
297
352
  self.hierarchy = hierarchy
298
353
  self.toc_hierarchies = toc_hierarchies
299
354
  self.kb_csv = kb_csv
300
355
  self.hierarchy_view_html = hierarchy_view_html
301
- self.raw_zip = raw_zip
302
- self.namespace = namespace
303
- self.document_id = document_id
304
356
 
305
357
  # -- convenience properties --
306
358
 
@@ -344,11 +396,17 @@ class ParseResult:
344
396
  """Save the full result to *directory*.
345
397
 
346
398
  Creates the directory if needed and writes:
399
+ * ``manifest.json`` — result manifest
400
+ * ``chunks.json`` — all chunks
401
+ * ``doc_nav.json`` — document navigation tree (if present)
347
402
  * ``full.md`` — the full markdown
348
403
  * ``images/`` — all image chunks
349
404
  * ``tables/`` — all table chunks
350
405
  * ``result.zip`` — the raw ZIP archive
351
406
 
407
+ Legacy files (``chunks_slim.json``, ``hierarchy.json``, etc.) are
408
+ also written when present for backward compatibility.
409
+
352
410
  Returns the resolved directory path.
353
411
  """
354
412
  dir_path: Path = Path(directory)
@@ -357,7 +415,7 @@ class ParseResult:
357
415
  # Manifest / chunks
358
416
  manifest_path: Path = dir_path / "manifest.json"
359
417
  manifest_path.write_text(
360
- self.manifest.model_dump_json(indent=2),
418
+ self.manifest.model_dump_json(indent=2, by_alias=True),
361
419
  encoding="utf-8",
362
420
  )
363
421
 
@@ -367,6 +425,13 @@ class ParseResult:
367
425
  encoding="utf-8",
368
426
  )
369
427
 
428
+ if self.doc_nav is not None:
429
+ doc_nav_path: Path = dir_path / "doc_nav.json"
430
+ doc_nav_path.write_text(
431
+ self.doc_nav.model_dump_json(indent=2),
432
+ encoding="utf-8",
433
+ )
434
+
370
435
  if self.chunks_slim is not None:
371
436
  chunks_slim_path: Path = dir_path / "chunks_slim.json"
372
437
  chunks_slim_path.write_text(
@@ -2,9 +2,9 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Literal, Optional, TypedDict
5
+ from typing import Any, Dict, List, Literal, Optional, TypedDict
6
6
 
7
- from pydantic import BaseModel
7
+ from pydantic import BaseModel, Field
8
8
 
9
9
 
10
10
  RetrievalChannel = Literal["path", "content", "term"]
@@ -37,9 +37,16 @@ class RetrievalResult(BaseModel):
37
37
 
38
38
 
39
39
  class RetrievalQueryResponse(BaseModel):
40
- """Response from ``POST /v1/retrieval/query``."""
40
+ """Response from ``POST /v1/retrieval/query``.
41
+
42
+ Agentic fields (``answer_text``, ``referenced_chunks``) are only
43
+ populated when ``use_agentic=True``. In legacy retrieval mode they
44
+ default to ``None`` and ``[]`` respectively.
45
+ """
41
46
 
42
47
  namespace: str
43
48
  query: str
44
49
  router_used: Optional[str] = None
50
+ answer_text: Optional[str] = None
51
+ referenced_chunks: List[Dict[str, Any]] = Field(default_factory=list)
45
52
  results: list[RetrievalResult]