knowhere-python-sdk 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere_python_sdk-0.5.0/.release-please-manifest.json +3 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CHANGELOG.md +7 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/PKG-INFO +1 -1
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/docs/usage.md +48 -29
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/pyproject.toml +1 -1
- knowhere_python_sdk-0.5.0/src/knowhere/_version.py +1 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/result_parser.py +18 -59
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/retrieval.py +6 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/result.py +105 -40
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/retrieval.py +10 -3
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_models.py +10 -47
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_result_parser.py +200 -105
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_retrieval.py +93 -0
- knowhere_python_sdk-0.4.0/.release-please-manifest.json +0 -3
- knowhere_python_sdk-0.4.0/src/knowhere/_version.py +0 -1
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/pull_request_template.md +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/ci.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/publish-pypi.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.github/workflows/publish.yml +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/.gitignore +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CODE_OF_CONDUCT.md +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/CONTRIBUTING.md +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/LICENSE +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/README.md +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/SECURITY.md +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/async_usage.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/error_handling.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/parse_file.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/parse_url.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/examples/step_by_step.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/release-please-config.json +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/__init__.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_base_client.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_client.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_constants.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_exceptions.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_logging.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_response.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/_types.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/__init__.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/polling.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/upload.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/py.typed +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/__init__.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/_base.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/documents.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/jobs.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/__init__.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/document.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/job.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/params.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/src/knowhere/types/shared.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/__init__.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/conftest.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/fixtures/real_result.zip +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_client.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_documents.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_exceptions.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_jobs.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_logging.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_parse.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_polling.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_retry.py +0 -0
- {knowhere_python_sdk-0.4.0 → knowhere_python_sdk-0.5.0}/tests/test_upload.py +0 -0
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.4.0...v0.5.0) (2026-05-15)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* sync SDK with current worker ZIP contract and agentic retrieval API ([ad8db2e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/ad8db2e87c77978928d046c95565e9e60c1b1f4e))
|
|
9
|
+
|
|
3
10
|
## [0.4.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.2...v0.4.0) (2026-04-27)
|
|
4
11
|
|
|
5
12
|
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# Knowhere Python SDK — Usage Guide
|
|
2
2
|
|
|
3
|
+
> **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`,
|
|
4
|
+
> `length`, etc.) are no longer flattened to the chunk surface. Access them
|
|
5
|
+
> through `chunk.metadata` instead. See [Chunk Types](#chunk-types).
|
|
6
|
+
|
|
3
7
|
Comprehensive reference for every feature, parameter, and pattern in the SDK.
|
|
4
8
|
|
|
5
9
|
## Table of Contents
|
|
@@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk]
|
|
|
219
223
|
# Lookup by ID
|
|
220
224
|
chunk = result.getChunk("chunk_42")
|
|
221
225
|
|
|
222
|
-
#
|
|
223
|
-
result.
|
|
226
|
+
# Document navigation tree (from doc_nav.json, current worker output)
|
|
227
|
+
result.doc_nav # DocNav | None
|
|
228
|
+
result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels
|
|
229
|
+
result.doc_nav.resources # DocNavResources | None — image/table resource summaries
|
|
230
|
+
|
|
231
|
+
# Legacy hierarchy (from hierarchy.json, older worker output)
|
|
232
|
+
result.hierarchy # Any | None
|
|
224
233
|
|
|
225
234
|
# Raw ZIP bytes (for archival)
|
|
226
235
|
result.raw_zip
|
|
@@ -239,49 +248,48 @@ result.save("./output/report/")
|
|
|
239
248
|
|
|
240
249
|
## Chunk Types
|
|
241
250
|
|
|
242
|
-
Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path
|
|
251
|
+
Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`,
|
|
252
|
+
`metadata`). Worker metadata is kept in the `metadata` field (a `ChunkMetadata` model) — it is **not**
|
|
253
|
+
flattened to top-level chunk properties.
|
|
243
254
|
|
|
244
|
-
###
|
|
255
|
+
### Base fields (all chunk types)
|
|
245
256
|
|
|
246
257
|
| Field | Type | Description |
|
|
247
258
|
|-------|------|-------------|
|
|
248
259
|
| `chunk_id` | `str` | Unique identifier |
|
|
249
|
-
| `type` | `str` |
|
|
250
|
-
| `content` | `str` |
|
|
251
|
-
| `path` | `str \| None` | Document structure path
|
|
252
|
-
| `
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
| `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
|
|
256
|
-
| `relationships` | `List \| None` | Relationships to other chunks |
|
|
260
|
+
| `type` | `str` | `"text"`, `"image"`, or `"table"` |
|
|
261
|
+
| `content` | `str` | Text content or placeholder |
|
|
262
|
+
| `path` | `str \| None` | Document structure path |
|
|
263
|
+
| `metadata` | `ChunkMetadata` | Worker metadata model with optional attributes (tokens, keywords, summary, length, page_nums, etc.); unknown worker fields are preserved |
|
|
264
|
+
|
|
265
|
+
### TextChunk
|
|
257
266
|
|
|
258
267
|
```python
|
|
259
268
|
for chunk in result.text_chunks:
|
|
260
269
|
print(f"[{chunk.chunk_id}] {chunk.content[:60]}...")
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
270
|
+
# Metadata is in chunk.metadata, not flattened:
|
|
271
|
+
keywords = chunk.metadata.keywords or []
|
|
272
|
+
summary = chunk.metadata.summary
|
|
273
|
+
if keywords:
|
|
274
|
+
print(f" Keywords: {', '.join(keywords)}")
|
|
275
|
+
if summary:
|
|
276
|
+
print(f" Summary: {summary}")
|
|
265
277
|
```
|
|
266
278
|
|
|
267
279
|
### ImageChunk
|
|
268
280
|
|
|
269
281
|
| Field | Type | Description |
|
|
270
282
|
|-------|------|-------------|
|
|
271
|
-
| `chunk_id` | `str` | Unique identifier |
|
|
272
|
-
| `type` | `str` | Always `"image"` |
|
|
273
|
-
| `content` | `str` | Text content associated with the image |
|
|
274
283
|
| `file_path` | `str \| None` | Path within the ZIP |
|
|
275
|
-
| `original_name` | `str \| None` | Original filename |
|
|
276
|
-
| `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) |
|
|
277
284
|
| `data` | `bytes` | Raw image bytes (loaded from ZIP) |
|
|
278
285
|
| `format` | `str \| None` | Image format inferred from extension (property) |
|
|
279
286
|
|
|
280
287
|
```python
|
|
281
288
|
for img in result.image_chunks:
|
|
282
289
|
print(f"{img.file_path} ({len(img.data)} bytes, {img.format})")
|
|
283
|
-
|
|
284
|
-
|
|
290
|
+
summary = img.metadata.summary
|
|
291
|
+
if summary:
|
|
292
|
+
print(f" Description: {summary}")
|
|
285
293
|
img.save("./output/images/") # writes to disk
|
|
286
294
|
```
|
|
287
295
|
|
|
@@ -289,13 +297,7 @@ for img in result.image_chunks:
|
|
|
289
297
|
|
|
290
298
|
| Field | Type | Description |
|
|
291
299
|
|-------|------|-------------|
|
|
292
|
-
| `chunk_id` | `str` | Unique identifier |
|
|
293
|
-
| `type` | `str` | Always `"table"` |
|
|
294
|
-
| `content` | `str` | Text representation of the table |
|
|
295
300
|
| `file_path` | `str \| None` | Path within the ZIP |
|
|
296
|
-
| `original_name` | `str \| None` | Original filename |
|
|
297
|
-
| `table_type` | `str \| None` | Table classification |
|
|
298
|
-
| `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) |
|
|
299
301
|
| `html` | `str` | Full HTML of the table (loaded from ZIP) |
|
|
300
302
|
|
|
301
303
|
```python
|
|
@@ -471,6 +473,19 @@ response = client.retrieval.query(
|
|
|
471
473
|
top_k=5,
|
|
472
474
|
)
|
|
473
475
|
|
|
476
|
+
# Agentic mode (LLM navigation + answer synthesis)
|
|
477
|
+
response = client.retrieval.query(
|
|
478
|
+
namespace="support-center",
|
|
479
|
+
query="How do I pair a Bluetooth headset?",
|
|
480
|
+
use_agentic=True,
|
|
481
|
+
top_k=5,
|
|
482
|
+
)
|
|
483
|
+
print(response.answer_text) # LLM-generated natural-language answer
|
|
484
|
+
print(response.router_used) # "workflow_single_step", "small_kb_all", etc.
|
|
485
|
+
for ref in response.referenced_chunks:
|
|
486
|
+
print(ref.get("chunk_id"), ref.get("asset_url"))
|
|
487
|
+
|
|
488
|
+
# Legacy results are always available
|
|
474
489
|
for result in response.results:
|
|
475
490
|
print(result.content)
|
|
476
491
|
print(result.score)
|
|
@@ -479,6 +494,10 @@ for result in response.results:
|
|
|
479
494
|
print(result.source.section_path)
|
|
480
495
|
```
|
|
481
496
|
|
|
497
|
+
| Parameter | Type | Default | Description |
|
|
498
|
+
|-----------|------|---------|-------------|
|
|
499
|
+
| `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. |
|
|
500
|
+
|
|
482
501
|
Retrieval results expose `content`, not the older parse-result `text` field.
|
|
483
502
|
Media results may include `asset_url` when the server can sign the referenced
|
|
484
503
|
artifact.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0" # x-release-please-version
|
|
@@ -13,13 +13,13 @@ from knowhere._exceptions import ChecksumError, KnowhereError
|
|
|
13
13
|
from knowhere._logging import getLogger
|
|
14
14
|
from knowhere.types.result import (
|
|
15
15
|
Chunk,
|
|
16
|
+
DocNav,
|
|
16
17
|
ImageChunk,
|
|
17
18
|
Manifest,
|
|
18
19
|
ParseResult,
|
|
19
20
|
SlimChunk,
|
|
20
21
|
TableChunk,
|
|
21
22
|
TextChunk,
|
|
22
|
-
TextChunkTokens,
|
|
23
23
|
)
|
|
24
24
|
|
|
25
25
|
_logger = getLogger()
|
|
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
|
|
|
81
81
|
return fallback
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
|
|
85
|
-
"""Return a string-only token list with empty values removed."""
|
|
86
|
-
normalized_tokens: List[str] = []
|
|
87
|
-
for raw_token in raw_tokens:
|
|
88
|
-
token_text: str = str(raw_token).strip()
|
|
89
|
-
if token_text:
|
|
90
|
-
normalized_tokens.append(token_text)
|
|
91
|
-
return normalized_tokens
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def _parseTextChunkTokens(
|
|
95
|
-
raw_tokens: Any,
|
|
96
|
-
*,
|
|
97
|
-
chunk_id: str,
|
|
98
|
-
) -> Optional[TextChunkTokens]:
|
|
99
|
-
"""Normalize text chunk tokens from the current backend payload."""
|
|
100
|
-
if raw_tokens is None:
|
|
101
|
-
return None
|
|
102
|
-
if isinstance(raw_tokens, bool):
|
|
103
|
-
raise KnowhereError(
|
|
104
|
-
f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
|
|
105
|
-
)
|
|
106
|
-
if isinstance(raw_tokens, list):
|
|
107
|
-
return _normalizeTokenList(raw_tokens)
|
|
108
|
-
|
|
109
|
-
raise KnowhereError(
|
|
110
|
-
"Invalid tokens payload for text chunk "
|
|
111
|
-
f"'{chunk_id}': expected list[str], "
|
|
112
|
-
f"got {type(raw_tokens).__name__}."
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
|
|
116
84
|
def _buildChunks(
|
|
117
85
|
raw_chunks: List[Dict[str, Any]],
|
|
118
86
|
zf: zipfile.ZipFile,
|
|
@@ -125,58 +93,39 @@ def _buildChunks(
|
|
|
125
93
|
|
|
126
94
|
if chunk_type == "image":
|
|
127
95
|
image_data: bytes = b""
|
|
128
|
-
# file_path may be at top level, inside metadata, or use path as fallback
|
|
129
96
|
file_path: Optional[str] = _extractFilePath(raw)
|
|
130
97
|
if file_path:
|
|
131
98
|
image_data = _readZipBytes(zf, file_path) or b""
|
|
132
|
-
metadata: Dict[str, Any] = raw.get("metadata", {})
|
|
133
99
|
chunk: Chunk = ImageChunk(
|
|
134
100
|
chunk_id=raw.get("chunk_id", ""),
|
|
135
101
|
type="image",
|
|
136
102
|
content=raw.get("content", ""),
|
|
137
103
|
path=raw.get("path"),
|
|
138
|
-
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
139
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
140
104
|
file_path=file_path,
|
|
141
|
-
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
142
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
143
105
|
data=image_data,
|
|
106
|
+
metadata=raw.get("metadata", {}),
|
|
144
107
|
)
|
|
145
108
|
elif chunk_type == "table":
|
|
146
109
|
table_html: str = ""
|
|
147
110
|
file_path = _extractFilePath(raw)
|
|
148
111
|
if file_path:
|
|
149
112
|
table_html = _readZipText(zf, file_path) or ""
|
|
150
|
-
metadata = raw.get("metadata", {})
|
|
151
113
|
chunk = TableChunk(
|
|
152
114
|
chunk_id=raw.get("chunk_id", ""),
|
|
153
115
|
type="table",
|
|
154
116
|
content=raw.get("content", ""),
|
|
155
117
|
path=raw.get("path"),
|
|
156
|
-
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
157
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
158
118
|
file_path=file_path,
|
|
159
|
-
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
160
|
-
table_type=metadata.get("table_type", raw.get("table_type")),
|
|
161
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
162
119
|
html=table_html,
|
|
120
|
+
metadata=raw.get("metadata", {}),
|
|
163
121
|
)
|
|
164
122
|
else:
|
|
165
|
-
metadata = raw.get("metadata", {})
|
|
166
|
-
chunk_id: str = raw.get("chunk_id", "")
|
|
167
|
-
raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
|
|
168
123
|
chunk = TextChunk(
|
|
169
|
-
chunk_id=chunk_id,
|
|
124
|
+
chunk_id=raw.get("chunk_id", ""),
|
|
170
125
|
type="text",
|
|
171
126
|
content=raw.get("content", ""),
|
|
172
127
|
path=raw.get("path"),
|
|
173
|
-
|
|
174
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
175
|
-
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
176
|
-
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
177
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
-
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
179
|
-
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
128
|
+
metadata=raw.get("metadata", {}),
|
|
180
129
|
)
|
|
181
130
|
|
|
182
131
|
chunks.append(chunk)
|
|
@@ -229,7 +178,15 @@ def parseResultZip(
|
|
|
229
178
|
# -- Full markdown --
|
|
230
179
|
full_markdown: str = _readZipText(zf, "full.md") or ""
|
|
231
180
|
|
|
232
|
-
# --
|
|
181
|
+
# -- DocNav (current worker output) --
|
|
182
|
+
doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
|
|
183
|
+
doc_nav: Optional[DocNav] = (
|
|
184
|
+
DocNav.model_validate(json.loads(doc_nav_text))
|
|
185
|
+
if doc_nav_text
|
|
186
|
+
else None
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
# -- Hierarchy (legacy — current worker no longer emits this) --
|
|
233
190
|
hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
|
|
234
191
|
hierarchy: Optional[Any] = (
|
|
235
192
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
@@ -263,11 +220,13 @@ def parseResultZip(
|
|
|
263
220
|
return ParseResult(
|
|
264
221
|
manifest=manifest,
|
|
265
222
|
chunks=chunks,
|
|
266
|
-
chunks_slim=chunks_slim,
|
|
267
223
|
full_markdown=full_markdown,
|
|
224
|
+
raw_zip=zip_bytes,
|
|
225
|
+
doc_nav=doc_nav,
|
|
226
|
+
# Legacy — the current worker no longer emits these files
|
|
227
|
+
chunks_slim=chunks_slim,
|
|
268
228
|
hierarchy=hierarchy,
|
|
269
229
|
toc_hierarchies=toc_hierarchies,
|
|
270
230
|
kb_csv=kb_csv,
|
|
271
231
|
hierarchy_view_html=hierarchy_view_html,
|
|
272
|
-
raw_zip=zip_bytes,
|
|
273
232
|
)
|
|
@@ -22,6 +22,7 @@ class Retrieval(SyncAPIResource):
|
|
|
22
22
|
query: str,
|
|
23
23
|
namespace: Optional[str] = None,
|
|
24
24
|
top_k: Optional[int] = None,
|
|
25
|
+
use_agentic: Optional[bool] = None,
|
|
25
26
|
data_type: Optional[int] = None,
|
|
26
27
|
signal_paths: Optional[list[str]] = None,
|
|
27
28
|
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
@@ -39,6 +40,8 @@ class Retrieval(SyncAPIResource):
|
|
|
39
40
|
body["namespace"] = namespace
|
|
40
41
|
if top_k is not None:
|
|
41
42
|
body["top_k"] = top_k
|
|
43
|
+
if use_agentic is not None:
|
|
44
|
+
body["use_agentic"] = use_agentic
|
|
42
45
|
if data_type is not None:
|
|
43
46
|
body["data_type"] = data_type
|
|
44
47
|
if signal_paths is not None:
|
|
@@ -77,6 +80,7 @@ class AsyncRetrieval(AsyncAPIResource):
|
|
|
77
80
|
query: str,
|
|
78
81
|
namespace: Optional[str] = None,
|
|
79
82
|
top_k: Optional[int] = None,
|
|
83
|
+
use_agentic: Optional[bool] = None,
|
|
80
84
|
data_type: Optional[int] = None,
|
|
81
85
|
signal_paths: Optional[list[str]] = None,
|
|
82
86
|
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
@@ -94,6 +98,8 @@ class AsyncRetrieval(AsyncAPIResource):
|
|
|
94
98
|
body["namespace"] = namespace
|
|
95
99
|
if top_k is not None:
|
|
96
100
|
body["top_k"] = top_k
|
|
101
|
+
if use_agentic is not None:
|
|
102
|
+
body["use_agentic"] = use_agentic
|
|
97
103
|
if data_type is not None:
|
|
98
104
|
body["data_type"] = data_type
|
|
99
105
|
if signal_paths is not None:
|
|
@@ -9,7 +9,6 @@ from pathlib import Path
|
|
|
9
9
|
from typing import Any, Dict, List, Optional, Union
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, Field
|
|
12
|
-
from typing_extensions import TypeAlias
|
|
13
12
|
|
|
14
13
|
from knowhere._exceptions import ValidationError
|
|
15
14
|
|
|
@@ -138,6 +137,44 @@ class Manifest(BaseModel):
|
|
|
138
137
|
checksum: Optional[Checksum] = None
|
|
139
138
|
statistics: Optional[Statistics] = None
|
|
140
139
|
files: Optional[FileIndex] = None
|
|
140
|
+
hierarchy: Optional[Any] = Field(default=None, alias="HIERARCHY")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# DocNav models
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class DocNavResourceItem(BaseModel):
|
|
149
|
+
"""A single image or table resource entry in ``doc_nav.json``."""
|
|
150
|
+
|
|
151
|
+
path: str
|
|
152
|
+
summary: Optional[str] = None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class DocNavResources(BaseModel):
|
|
156
|
+
"""Image and table resource summaries from ``doc_nav.json``."""
|
|
157
|
+
|
|
158
|
+
images: List[DocNavResourceItem] = Field(default_factory=list)
|
|
159
|
+
tables: List[DocNavResourceItem] = Field(default_factory=list)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class DocNavSection(BaseModel):
|
|
163
|
+
"""A document section entry in the ``doc_nav.json`` navigation tree."""
|
|
164
|
+
|
|
165
|
+
title: str
|
|
166
|
+
path: str
|
|
167
|
+
level: int
|
|
168
|
+
summary: Optional[str] = None
|
|
169
|
+
chunk_count: int = 0
|
|
170
|
+
children: List["DocNavSection"] = Field(default_factory=list)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class DocNav(BaseModel):
|
|
174
|
+
"""Top-level document navigation structure from ``doc_nav.json``."""
|
|
175
|
+
|
|
176
|
+
sections: List[DocNavSection] = Field(default_factory=list)
|
|
177
|
+
resources: Optional[DocNavResources] = None
|
|
141
178
|
|
|
142
179
|
|
|
143
180
|
# ---------------------------------------------------------------------------
|
|
@@ -145,6 +182,27 @@ class Manifest(BaseModel):
|
|
|
145
182
|
# ---------------------------------------------------------------------------
|
|
146
183
|
|
|
147
184
|
|
|
185
|
+
class ChunkMetadata(BaseModel):
|
|
186
|
+
"""Known worker metadata fields for a chunk.
|
|
187
|
+
|
|
188
|
+
All fields are optional. Unknown fields added by future worker
|
|
189
|
+
versions are preserved thanks to ``model_config``.
|
|
190
|
+
"""
|
|
191
|
+
|
|
192
|
+
model_config = {"extra": "allow"}
|
|
193
|
+
|
|
194
|
+
length: Optional[int] = None
|
|
195
|
+
page_nums: Optional[List[int]] = None
|
|
196
|
+
tokens: Optional[List[str]] = None
|
|
197
|
+
keywords: Optional[List[str]] = None
|
|
198
|
+
summary: Optional[str] = None
|
|
199
|
+
connect_to: Optional[List[Dict[str, Any]]] = None
|
|
200
|
+
file_path: Optional[str] = None
|
|
201
|
+
original_name: Optional[str] = None
|
|
202
|
+
table_type: Optional[str] = None
|
|
203
|
+
document_top_summary: Optional[str] = None
|
|
204
|
+
|
|
205
|
+
|
|
148
206
|
class BaseChunk(BaseModel):
|
|
149
207
|
"""Fields shared by every chunk type."""
|
|
150
208
|
|
|
@@ -152,32 +210,20 @@ class BaseChunk(BaseModel):
|
|
|
152
210
|
type: str
|
|
153
211
|
content: str = ""
|
|
154
212
|
path: Optional[str] = None
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
TextChunkTokens: TypeAlias = List[str]
|
|
213
|
+
metadata: ChunkMetadata = Field(default_factory=ChunkMetadata)
|
|
159
214
|
|
|
160
215
|
|
|
161
216
|
class TextChunk(BaseChunk):
|
|
162
217
|
"""A text chunk extracted from the document."""
|
|
163
218
|
|
|
164
219
|
type: str = "text"
|
|
165
|
-
length: int = 0
|
|
166
|
-
tokens: Optional[TextChunkTokens] = None
|
|
167
|
-
keywords: Optional[List[str]] = None
|
|
168
|
-
summary: Optional[str] = None
|
|
169
|
-
connect_to: Optional[List[Dict[str, Any]]] = None
|
|
170
|
-
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
|
|
171
220
|
|
|
172
221
|
|
|
173
222
|
class ImageChunk(BaseChunk):
|
|
174
223
|
"""An image chunk — carries raw bytes loaded from the ZIP."""
|
|
175
224
|
|
|
176
225
|
type: str = "image"
|
|
177
|
-
length: int = 0
|
|
178
226
|
file_path: Optional[str] = None
|
|
179
|
-
original_name: Optional[str] = None
|
|
180
|
-
summary: Optional[str] = None
|
|
181
227
|
data: bytes = Field(default=b"", exclude=True)
|
|
182
228
|
|
|
183
229
|
model_config = {"arbitrary_types_allowed": True}
|
|
@@ -193,13 +239,13 @@ class ImageChunk(BaseChunk):
|
|
|
193
239
|
def save(self, directory: Union[str, Path]) -> Path:
|
|
194
240
|
"""Write the image bytes to *directory*, returning the output path.
|
|
195
241
|
|
|
196
|
-
The filename is derived from ``
|
|
197
|
-
|
|
242
|
+
The filename is derived from ``file_path``, sanitised for
|
|
243
|
+
cross-platform safety.
|
|
198
244
|
"""
|
|
199
245
|
dir_path: Path = Path(directory)
|
|
200
246
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
201
247
|
|
|
202
|
-
raw_name: str =
|
|
248
|
+
raw_name: str = os.path.basename(
|
|
203
249
|
self.file_path or f"{self.chunk_id}.bin"
|
|
204
250
|
)
|
|
205
251
|
safe_name: str = _sanitizeFilename(raw_name)
|
|
@@ -214,11 +260,7 @@ class TableChunk(BaseChunk):
|
|
|
214
260
|
"""A table chunk — carries HTML loaded from the ZIP."""
|
|
215
261
|
|
|
216
262
|
type: str = "table"
|
|
217
|
-
length: int = 0
|
|
218
263
|
file_path: Optional[str] = None
|
|
219
|
-
original_name: Optional[str] = None
|
|
220
|
-
table_type: Optional[str] = None
|
|
221
|
-
summary: Optional[str] = None
|
|
222
264
|
html: str = Field(default="", exclude=True)
|
|
223
265
|
|
|
224
266
|
def save(self, directory: Union[str, Path]) -> Path:
|
|
@@ -226,7 +268,7 @@ class TableChunk(BaseChunk):
|
|
|
226
268
|
dir_path: Path = Path(directory)
|
|
227
269
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
228
270
|
|
|
229
|
-
raw_name: str =
|
|
271
|
+
raw_name: str = os.path.basename(
|
|
230
272
|
self.file_path or f"{self.chunk_id}.html"
|
|
231
273
|
)
|
|
232
274
|
safe_name: str = _sanitizeFilename(raw_name)
|
|
@@ -242,12 +284,11 @@ Chunk = Union[TextChunk, ImageChunk, TableChunk]
|
|
|
242
284
|
|
|
243
285
|
|
|
244
286
|
class SlimChunk(BaseModel):
|
|
245
|
-
"""Minimal chunk entry emitted in chunks_slim.json."""
|
|
287
|
+
"""Minimal chunk entry emitted in chunks_slim.json (legacy)."""
|
|
246
288
|
|
|
247
289
|
type: str
|
|
248
290
|
path: Optional[str] = None
|
|
249
291
|
content: str = ""
|
|
250
|
-
summary: Optional[str] = None
|
|
251
292
|
|
|
252
293
|
|
|
253
294
|
# ---------------------------------------------------------------------------
|
|
@@ -259,48 +300,59 @@ class ParseResult:
|
|
|
259
300
|
"""Eagerly-loaded result of a document parsing job.
|
|
260
301
|
|
|
261
302
|
Contains the manifest, all chunks (with image bytes and table HTML
|
|
262
|
-
already loaded), the full markdown,
|
|
263
|
-
bytes for archival purposes.
|
|
303
|
+
already loaded), the full markdown, the document navigation tree,
|
|
304
|
+
and the raw ZIP bytes for archival purposes.
|
|
305
|
+
|
|
306
|
+
Legacy fields (``chunks_slim``, ``hierarchy``, ``toc_hierarchies``,
|
|
307
|
+
``kb_csv``, ``hierarchy_view_html``) are kept for backward
|
|
308
|
+
compatibility with older result ZIPs. The current worker does not
|
|
309
|
+
emit ``chunks_slim.json`` or ``hierarchy.json``.
|
|
264
310
|
"""
|
|
265
311
|
|
|
266
312
|
manifest: Manifest
|
|
267
313
|
chunks: List[Chunk]
|
|
268
|
-
chunks_slim: Optional[List[SlimChunk]]
|
|
269
314
|
full_markdown: str
|
|
315
|
+
raw_zip: bytes
|
|
316
|
+
namespace: Optional[str]
|
|
317
|
+
document_id: Optional[str]
|
|
318
|
+
# Current worker output
|
|
319
|
+
doc_nav: Optional[DocNav]
|
|
320
|
+
# Legacy — the current worker no longer emits these files
|
|
321
|
+
chunks_slim: Optional[List[SlimChunk]]
|
|
270
322
|
hierarchy: Optional[Any]
|
|
271
323
|
toc_hierarchies: Optional[Any]
|
|
272
324
|
kb_csv: Optional[str]
|
|
273
325
|
hierarchy_view_html: Optional[str]
|
|
274
|
-
raw_zip: bytes
|
|
275
|
-
namespace: Optional[str]
|
|
276
|
-
document_id: Optional[str]
|
|
277
326
|
|
|
278
327
|
def __init__(
|
|
279
328
|
self,
|
|
280
329
|
*,
|
|
281
330
|
manifest: Manifest,
|
|
282
331
|
chunks: List[Chunk],
|
|
283
|
-
chunks_slim: Optional[List[SlimChunk]],
|
|
284
332
|
full_markdown: str,
|
|
285
|
-
hierarchy: Optional[Any],
|
|
286
|
-
toc_hierarchies: Optional[Any],
|
|
287
|
-
kb_csv: Optional[str],
|
|
288
|
-
hierarchy_view_html: Optional[str],
|
|
289
333
|
raw_zip: bytes,
|
|
334
|
+
doc_nav: Optional[DocNav] = None,
|
|
290
335
|
namespace: Optional[str] = None,
|
|
291
336
|
document_id: Optional[str] = None,
|
|
337
|
+
# Legacy — the current worker no longer emits these files
|
|
338
|
+
chunks_slim: Optional[List[SlimChunk]] = None,
|
|
339
|
+
hierarchy: Optional[Any] = None,
|
|
340
|
+
toc_hierarchies: Optional[Any] = None,
|
|
341
|
+
kb_csv: Optional[str] = None,
|
|
342
|
+
hierarchy_view_html: Optional[str] = None,
|
|
292
343
|
) -> None:
|
|
293
344
|
self.manifest = manifest
|
|
294
345
|
self.chunks = chunks
|
|
295
|
-
self.chunks_slim = chunks_slim
|
|
296
346
|
self.full_markdown = full_markdown
|
|
347
|
+
self.raw_zip = raw_zip
|
|
348
|
+
self.doc_nav = doc_nav
|
|
349
|
+
self.namespace = namespace
|
|
350
|
+
self.document_id = document_id
|
|
351
|
+
self.chunks_slim = chunks_slim
|
|
297
352
|
self.hierarchy = hierarchy
|
|
298
353
|
self.toc_hierarchies = toc_hierarchies
|
|
299
354
|
self.kb_csv = kb_csv
|
|
300
355
|
self.hierarchy_view_html = hierarchy_view_html
|
|
301
|
-
self.raw_zip = raw_zip
|
|
302
|
-
self.namespace = namespace
|
|
303
|
-
self.document_id = document_id
|
|
304
356
|
|
|
305
357
|
# -- convenience properties --
|
|
306
358
|
|
|
@@ -344,11 +396,17 @@ class ParseResult:
|
|
|
344
396
|
"""Save the full result to *directory*.
|
|
345
397
|
|
|
346
398
|
Creates the directory if needed and writes:
|
|
399
|
+
* ``manifest.json`` — result manifest
|
|
400
|
+
* ``chunks.json`` — all chunks
|
|
401
|
+
* ``doc_nav.json`` — document navigation tree (if present)
|
|
347
402
|
* ``full.md`` — the full markdown
|
|
348
403
|
* ``images/`` — all image chunks
|
|
349
404
|
* ``tables/`` — all table chunks
|
|
350
405
|
* ``result.zip`` — the raw ZIP archive
|
|
351
406
|
|
|
407
|
+
Legacy files (``chunks_slim.json``, ``hierarchy.json``, etc.) are
|
|
408
|
+
also written when present for backward compatibility.
|
|
409
|
+
|
|
352
410
|
Returns the resolved directory path.
|
|
353
411
|
"""
|
|
354
412
|
dir_path: Path = Path(directory)
|
|
@@ -357,7 +415,7 @@ class ParseResult:
|
|
|
357
415
|
# Manifest / chunks
|
|
358
416
|
manifest_path: Path = dir_path / "manifest.json"
|
|
359
417
|
manifest_path.write_text(
|
|
360
|
-
self.manifest.model_dump_json(indent=2),
|
|
418
|
+
self.manifest.model_dump_json(indent=2, by_alias=True),
|
|
361
419
|
encoding="utf-8",
|
|
362
420
|
)
|
|
363
421
|
|
|
@@ -367,6 +425,13 @@ class ParseResult:
|
|
|
367
425
|
encoding="utf-8",
|
|
368
426
|
)
|
|
369
427
|
|
|
428
|
+
if self.doc_nav is not None:
|
|
429
|
+
doc_nav_path: Path = dir_path / "doc_nav.json"
|
|
430
|
+
doc_nav_path.write_text(
|
|
431
|
+
self.doc_nav.model_dump_json(indent=2),
|
|
432
|
+
encoding="utf-8",
|
|
433
|
+
)
|
|
434
|
+
|
|
370
435
|
if self.chunks_slim is not None:
|
|
371
436
|
chunks_slim_path: Path = dir_path / "chunks_slim.json"
|
|
372
437
|
chunks_slim_path.write_text(
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from typing import Literal, Optional, TypedDict
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
|
6
6
|
|
|
7
|
-
from pydantic import BaseModel
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
RetrievalChannel = Literal["path", "content", "term"]
|
|
@@ -37,9 +37,16 @@ class RetrievalResult(BaseModel):
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class RetrievalQueryResponse(BaseModel):
|
|
40
|
-
"""Response from ``POST /v1/retrieval/query``.
|
|
40
|
+
"""Response from ``POST /v1/retrieval/query``.
|
|
41
|
+
|
|
42
|
+
Agentic fields (``answer_text``, ``referenced_chunks``) are only
|
|
43
|
+
populated when ``use_agentic=True``. In legacy retrieval mode they
|
|
44
|
+
default to ``None`` and ``[]`` respectively.
|
|
45
|
+
"""
|
|
41
46
|
|
|
42
47
|
namespace: str
|
|
43
48
|
query: str
|
|
44
49
|
router_used: Optional[str] = None
|
|
50
|
+
answer_text: Optional[str] = None
|
|
51
|
+
referenced_chunks: List[Dict[str, Any]] = Field(default_factory=list)
|
|
45
52
|
results: list[RetrievalResult]
|