knowhere-python-sdk 0.3.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere_python_sdk-0.5.0/.release-please-manifest.json +3 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CHANGELOG.md +14 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/PKG-INFO +16 -1
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/README.md +15 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/docs/usage.md +64 -29
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/pyproject.toml +1 -1
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/__init__.py +14 -1
- knowhere_python_sdk-0.5.0/src/knowhere/_version.py +1 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/result_parser.py +18 -59
- knowhere_python_sdk-0.5.0/src/knowhere/resources/documents.py +191 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/retrieval.py +6 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/__init__.py +14 -1
- knowhere_python_sdk-0.5.0/src/knowhere/types/document.py +78 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/result.py +105 -40
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/retrieval.py +10 -3
- knowhere_python_sdk-0.5.0/tests/test_documents.py +219 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_models.py +10 -47
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_result_parser.py +200 -105
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_retrieval.py +93 -0
- knowhere_python_sdk-0.3.2/.release-please-manifest.json +0 -3
- knowhere_python_sdk-0.3.2/src/knowhere/_version.py +0 -1
- knowhere_python_sdk-0.3.2/src/knowhere/resources/documents.py +0 -74
- knowhere_python_sdk-0.3.2/src/knowhere/types/document.py +0 -28
- knowhere_python_sdk-0.3.2/tests/test_documents.py +0 -106
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/pull_request_template.md +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/ci.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/publish-pypi.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/publish.yml +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.gitignore +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CODE_OF_CONDUCT.md +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CONTRIBUTING.md +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/LICENSE +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/SECURITY.md +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/async_usage.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/error_handling.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/parse_file.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/parse_url.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/step_by_step.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/release-please-config.json +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_base_client.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_client.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_constants.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_exceptions.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_logging.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_response.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_types.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/__init__.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/polling.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/upload.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/py.typed +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/__init__.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/_base.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/jobs.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/job.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/params.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/shared.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/__init__.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/conftest.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/fixtures/real_result.zip +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_client.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_exceptions.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_jobs.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_logging.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_parse.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_polling.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_retry.py +0 -0
- {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_upload.py +0 -0
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.4.0...v0.5.0) (2026-05-15)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* sync SDK with current worker ZIP contract and agentic retrieval API ([ad8db2e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/ad8db2e87c77978928d046c95565e9e60c1b1f4e))
|
|
9
|
+
|
|
10
|
+
## [0.4.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.2...v0.4.0) (2026-04-27)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
### Features
|
|
14
|
+
|
|
15
|
+
* add document chunks resource methods ([73094d4](https://github.com/Ontos-AI/knowhere-python-sdk/commit/73094d4f95ef693785fa3965f6f2a223dfd2a350))
|
|
16
|
+
|
|
3
17
|
## [0.3.2](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.1...v0.3.2) (2026-04-23)
|
|
4
18
|
|
|
5
19
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowhere-python-sdk
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Official Python SDK for the Knowhere document parsing API
|
|
5
5
|
Project-URL: Homepage, https://knowhereto.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.knowhereto.ai
|
|
@@ -118,6 +118,21 @@ update_job = client.jobs.create(
|
|
|
118
118
|
document = client.documents.get(document_id)
|
|
119
119
|
print(document.status)
|
|
120
120
|
|
|
121
|
+
chunks = client.documents.list_chunks(
|
|
122
|
+
document_id,
|
|
123
|
+
page=1,
|
|
124
|
+
page_size=50,
|
|
125
|
+
chunk_type="text",
|
|
126
|
+
)
|
|
127
|
+
print(chunks.pagination.total)
|
|
128
|
+
if chunks.chunks:
|
|
129
|
+
chunk = client.documents.get_chunk(
|
|
130
|
+
document_id,
|
|
131
|
+
chunks.chunks[0].id,
|
|
132
|
+
include_asset_urls=True,
|
|
133
|
+
)
|
|
134
|
+
print(chunk.chunk.content)
|
|
135
|
+
|
|
121
136
|
client.documents.archive(document_id)
|
|
122
137
|
```
|
|
123
138
|
|
|
@@ -85,6 +85,21 @@ update_job = client.jobs.create(
|
|
|
85
85
|
document = client.documents.get(document_id)
|
|
86
86
|
print(document.status)
|
|
87
87
|
|
|
88
|
+
chunks = client.documents.list_chunks(
|
|
89
|
+
document_id,
|
|
90
|
+
page=1,
|
|
91
|
+
page_size=50,
|
|
92
|
+
chunk_type="text",
|
|
93
|
+
)
|
|
94
|
+
print(chunks.pagination.total)
|
|
95
|
+
if chunks.chunks:
|
|
96
|
+
chunk = client.documents.get_chunk(
|
|
97
|
+
document_id,
|
|
98
|
+
chunks.chunks[0].id,
|
|
99
|
+
include_asset_urls=True,
|
|
100
|
+
)
|
|
101
|
+
print(chunk.chunk.content)
|
|
102
|
+
|
|
88
103
|
client.documents.archive(document_id)
|
|
89
104
|
```
|
|
90
105
|
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# Knowhere Python SDK — Usage Guide
|
|
2
2
|
|
|
3
|
+
> **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`,
|
|
4
|
+
> `length`, etc.) are no longer flattened to the chunk surface. Access them
|
|
5
|
+
> through `chunk.metadata` instead. See [Chunk Types](#chunk-types).
|
|
6
|
+
|
|
3
7
|
Comprehensive reference for every feature, parameter, and pattern in the SDK.
|
|
4
8
|
|
|
5
9
|
## Table of Contents
|
|
@@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk]
|
|
|
219
223
|
# Lookup by ID
|
|
220
224
|
chunk = result.getChunk("chunk_42")
|
|
221
225
|
|
|
222
|
-
#
|
|
223
|
-
result.
|
|
226
|
+
# Document navigation tree (from doc_nav.json, current worker output)
|
|
227
|
+
result.doc_nav # DocNav | None
|
|
228
|
+
result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels
|
|
229
|
+
result.doc_nav.resources # DocNavResources — image/table resource summaries
|
|
230
|
+
|
|
231
|
+
# Legacy hierarchy (from hierarchy.json, older worker output)
|
|
232
|
+
result.hierarchy # Any | None
|
|
224
233
|
|
|
225
234
|
# Raw ZIP bytes (for archival)
|
|
226
235
|
result.raw_zip
|
|
@@ -239,49 +248,48 @@ result.save("./output/report/")
|
|
|
239
248
|
|
|
240
249
|
## Chunk Types
|
|
241
250
|
|
|
242
|
-
Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path
|
|
251
|
+
Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`,
|
|
252
|
+
`metadata`). Worker metadata is kept in the `metadata` dict — it is **not**
|
|
253
|
+
flattened to top-level chunk properties.
|
|
243
254
|
|
|
244
|
-
###
|
|
255
|
+
### Base fields (all chunk types)
|
|
245
256
|
|
|
246
257
|
| Field | Type | Description |
|
|
247
258
|
|-------|------|-------------|
|
|
248
259
|
| `chunk_id` | `str` | Unique identifier |
|
|
249
|
-
| `type` | `str` |
|
|
250
|
-
| `content` | `str` |
|
|
251
|
-
| `path` | `str \| None` | Document structure path
|
|
252
|
-
| `
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
| `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
|
|
256
|
-
| `relationships` | `List \| None` | Relationships to other chunks |
|
|
260
|
+
| `type` | `str` | `"text"`, `"image"`, or `"table"` |
|
|
261
|
+
| `content` | `str` | Text content or placeholder |
|
|
262
|
+
| `path` | `str \| None` | Document structure path |
|
|
263
|
+
| `metadata` | `dict` | Raw worker metadata (tokens, keywords, summary, length, page_nums, etc.) |
|
|
264
|
+
|
|
265
|
+
### TextChunk
|
|
257
266
|
|
|
258
267
|
```python
|
|
259
268
|
for chunk in result.text_chunks:
|
|
260
269
|
print(f"[{chunk.chunk_id}] {chunk.content[:60]}...")
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
270
|
+
# Metadata is in chunk.metadata, not flattened:
|
|
271
|
+
keywords = chunk.metadata.get("keywords", [])
|
|
272
|
+
summary = chunk.metadata.get("summary")
|
|
273
|
+
if keywords:
|
|
274
|
+
print(f" Keywords: {', '.join(keywords)}")
|
|
275
|
+
if summary:
|
|
276
|
+
print(f" Summary: {summary}")
|
|
265
277
|
```
|
|
266
278
|
|
|
267
279
|
### ImageChunk
|
|
268
280
|
|
|
269
281
|
| Field | Type | Description |
|
|
270
282
|
|-------|------|-------------|
|
|
271
|
-
| `chunk_id` | `str` | Unique identifier |
|
|
272
|
-
| `type` | `str` | Always `"image"` |
|
|
273
|
-
| `content` | `str` | Text content associated with the image |
|
|
274
283
|
| `file_path` | `str \| None` | Path within the ZIP |
|
|
275
|
-
| `original_name` | `str \| None` | Original filename |
|
|
276
|
-
| `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) |
|
|
277
284
|
| `data` | `bytes` | Raw image bytes (loaded from ZIP) |
|
|
278
285
|
| `format` | `str \| None` | Image format inferred from extension (property) |
|
|
279
286
|
|
|
280
287
|
```python
|
|
281
288
|
for img in result.image_chunks:
|
|
282
289
|
print(f"{img.file_path} ({len(img.data)} bytes, {img.format})")
|
|
283
|
-
|
|
284
|
-
|
|
290
|
+
summary = img.metadata.get("summary")
|
|
291
|
+
if summary:
|
|
292
|
+
print(f" Description: {summary}")
|
|
285
293
|
img.save("./output/images/") # writes to disk
|
|
286
294
|
```
|
|
287
295
|
|
|
@@ -289,13 +297,7 @@ for img in result.image_chunks:
|
|
|
289
297
|
|
|
290
298
|
| Field | Type | Description |
|
|
291
299
|
|-------|------|-------------|
|
|
292
|
-
| `chunk_id` | `str` | Unique identifier |
|
|
293
|
-
| `type` | `str` | Always `"table"` |
|
|
294
|
-
| `content` | `str` | Text representation of the table |
|
|
295
300
|
| `file_path` | `str \| None` | Path within the ZIP |
|
|
296
|
-
| `original_name` | `str \| None` | Original filename |
|
|
297
|
-
| `table_type` | `str \| None` | Table classification |
|
|
298
|
-
| `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) |
|
|
299
301
|
| `html` | `str` | Full HTML of the table (loaded from ZIP) |
|
|
300
302
|
|
|
301
303
|
```python
|
|
@@ -471,6 +473,19 @@ response = client.retrieval.query(
|
|
|
471
473
|
top_k=5,
|
|
472
474
|
)
|
|
473
475
|
|
|
476
|
+
# Agentic mode (LLM navigation + answer synthesis)
|
|
477
|
+
response = client.retrieval.query(
|
|
478
|
+
namespace="support-center",
|
|
479
|
+
query="How do I pair a Bluetooth headset?",
|
|
480
|
+
use_agentic=True,
|
|
481
|
+
top_k=5,
|
|
482
|
+
)
|
|
483
|
+
print(response.answer_text) # LLM-generated natural-language answer
|
|
484
|
+
print(response.router_used) # "workflow_single_step", "small_kb_all", etc.
|
|
485
|
+
for ref in response.referenced_chunks:
|
|
486
|
+
print(ref.get("chunk_id"), ref.get("asset_url"))
|
|
487
|
+
|
|
488
|
+
# Legacy results are always available
|
|
474
489
|
for result in response.results:
|
|
475
490
|
print(result.content)
|
|
476
491
|
print(result.score)
|
|
@@ -479,6 +494,10 @@ for result in response.results:
|
|
|
479
494
|
print(result.source.section_path)
|
|
480
495
|
```
|
|
481
496
|
|
|
497
|
+
| Parameter | Type | Default | Description |
|
|
498
|
+
|-----------|------|---------|-------------|
|
|
499
|
+
| `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. |
|
|
500
|
+
|
|
482
501
|
Retrieval results expose `content`, not the older parse-result `text` field.
|
|
483
502
|
Media results may include `asset_url` when the server can sign the referenced
|
|
484
503
|
artifact.
|
|
@@ -521,6 +540,22 @@ for document in document_list.documents:
|
|
|
521
540
|
document = client.documents.get("doc_123")
|
|
522
541
|
print(document.current_job_result_id)
|
|
523
542
|
|
|
543
|
+
chunks = client.documents.list_chunks(
|
|
544
|
+
"doc_123",
|
|
545
|
+
page=1,
|
|
546
|
+
page_size=50,
|
|
547
|
+
chunk_type="text",
|
|
548
|
+
)
|
|
549
|
+
for chunk in chunks.chunks:
|
|
550
|
+
print(chunk.id, chunk.content)
|
|
551
|
+
|
|
552
|
+
image_chunk = client.documents.get_chunk(
|
|
553
|
+
"doc_123",
|
|
554
|
+
"dchk_123",
|
|
555
|
+
include_asset_urls=True,
|
|
556
|
+
)
|
|
557
|
+
print(image_chunk.chunk.asset_url)
|
|
558
|
+
|
|
524
559
|
archived = client.documents.archive("doc_123")
|
|
525
560
|
print(archived.status) # "archived"
|
|
526
561
|
```
|
|
@@ -35,7 +35,15 @@ from knowhere._exceptions import (
|
|
|
35
35
|
)
|
|
36
36
|
from knowhere._types import PollProgressCallback, UploadProgressCallback
|
|
37
37
|
from knowhere._version import __version__
|
|
38
|
-
from knowhere.types.document import
|
|
38
|
+
from knowhere.types.document import (
|
|
39
|
+
Document,
|
|
40
|
+
DocumentChunk,
|
|
41
|
+
DocumentChunkListResponse,
|
|
42
|
+
DocumentChunkPagination,
|
|
43
|
+
DocumentChunkResponse,
|
|
44
|
+
DocumentChunkType,
|
|
45
|
+
DocumentListResponse,
|
|
46
|
+
)
|
|
39
47
|
from knowhere.types.job import Job, JobError, JobProgress, JobResult
|
|
40
48
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
41
49
|
from knowhere.types.retrieval import (
|
|
@@ -98,6 +106,11 @@ __all__: list[str] = [
|
|
|
98
106
|
"JobResult",
|
|
99
107
|
# Document types
|
|
100
108
|
"Document",
|
|
109
|
+
"DocumentChunk",
|
|
110
|
+
"DocumentChunkListResponse",
|
|
111
|
+
"DocumentChunkPagination",
|
|
112
|
+
"DocumentChunkResponse",
|
|
113
|
+
"DocumentChunkType",
|
|
101
114
|
"DocumentListResponse",
|
|
102
115
|
# Retrieval types
|
|
103
116
|
"RetrievalChannel",
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0" # x-release-please-version
|
|
@@ -13,13 +13,13 @@ from knowhere._exceptions import ChecksumError, KnowhereError
|
|
|
13
13
|
from knowhere._logging import getLogger
|
|
14
14
|
from knowhere.types.result import (
|
|
15
15
|
Chunk,
|
|
16
|
+
DocNav,
|
|
16
17
|
ImageChunk,
|
|
17
18
|
Manifest,
|
|
18
19
|
ParseResult,
|
|
19
20
|
SlimChunk,
|
|
20
21
|
TableChunk,
|
|
21
22
|
TextChunk,
|
|
22
|
-
TextChunkTokens,
|
|
23
23
|
)
|
|
24
24
|
|
|
25
25
|
_logger = getLogger()
|
|
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
|
|
|
81
81
|
return fallback
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
|
|
85
|
-
"""Return a string-only token list with empty values removed."""
|
|
86
|
-
normalized_tokens: List[str] = []
|
|
87
|
-
for raw_token in raw_tokens:
|
|
88
|
-
token_text: str = str(raw_token).strip()
|
|
89
|
-
if token_text:
|
|
90
|
-
normalized_tokens.append(token_text)
|
|
91
|
-
return normalized_tokens
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def _parseTextChunkTokens(
|
|
95
|
-
raw_tokens: Any,
|
|
96
|
-
*,
|
|
97
|
-
chunk_id: str,
|
|
98
|
-
) -> Optional[TextChunkTokens]:
|
|
99
|
-
"""Normalize text chunk tokens from the current backend payload."""
|
|
100
|
-
if raw_tokens is None:
|
|
101
|
-
return None
|
|
102
|
-
if isinstance(raw_tokens, bool):
|
|
103
|
-
raise KnowhereError(
|
|
104
|
-
f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
|
|
105
|
-
)
|
|
106
|
-
if isinstance(raw_tokens, list):
|
|
107
|
-
return _normalizeTokenList(raw_tokens)
|
|
108
|
-
|
|
109
|
-
raise KnowhereError(
|
|
110
|
-
"Invalid tokens payload for text chunk "
|
|
111
|
-
f"'{chunk_id}': expected list[str], "
|
|
112
|
-
f"got {type(raw_tokens).__name__}."
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
|
|
116
84
|
def _buildChunks(
|
|
117
85
|
raw_chunks: List[Dict[str, Any]],
|
|
118
86
|
zf: zipfile.ZipFile,
|
|
@@ -125,58 +93,39 @@ def _buildChunks(
|
|
|
125
93
|
|
|
126
94
|
if chunk_type == "image":
|
|
127
95
|
image_data: bytes = b""
|
|
128
|
-
# file_path may be at top level, inside metadata, or use path as fallback
|
|
129
96
|
file_path: Optional[str] = _extractFilePath(raw)
|
|
130
97
|
if file_path:
|
|
131
98
|
image_data = _readZipBytes(zf, file_path) or b""
|
|
132
|
-
metadata: Dict[str, Any] = raw.get("metadata", {})
|
|
133
99
|
chunk: Chunk = ImageChunk(
|
|
134
100
|
chunk_id=raw.get("chunk_id", ""),
|
|
135
101
|
type="image",
|
|
136
102
|
content=raw.get("content", ""),
|
|
137
103
|
path=raw.get("path"),
|
|
138
|
-
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
139
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
140
104
|
file_path=file_path,
|
|
141
|
-
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
142
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
143
105
|
data=image_data,
|
|
106
|
+
metadata=raw.get("metadata", {}),
|
|
144
107
|
)
|
|
145
108
|
elif chunk_type == "table":
|
|
146
109
|
table_html: str = ""
|
|
147
110
|
file_path = _extractFilePath(raw)
|
|
148
111
|
if file_path:
|
|
149
112
|
table_html = _readZipText(zf, file_path) or ""
|
|
150
|
-
metadata = raw.get("metadata", {})
|
|
151
113
|
chunk = TableChunk(
|
|
152
114
|
chunk_id=raw.get("chunk_id", ""),
|
|
153
115
|
type="table",
|
|
154
116
|
content=raw.get("content", ""),
|
|
155
117
|
path=raw.get("path"),
|
|
156
|
-
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
157
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
158
118
|
file_path=file_path,
|
|
159
|
-
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
160
|
-
table_type=metadata.get("table_type", raw.get("table_type")),
|
|
161
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
162
119
|
html=table_html,
|
|
120
|
+
metadata=raw.get("metadata", {}),
|
|
163
121
|
)
|
|
164
122
|
else:
|
|
165
|
-
metadata = raw.get("metadata", {})
|
|
166
|
-
chunk_id: str = raw.get("chunk_id", "")
|
|
167
|
-
raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
|
|
168
123
|
chunk = TextChunk(
|
|
169
|
-
chunk_id=chunk_id,
|
|
124
|
+
chunk_id=raw.get("chunk_id", ""),
|
|
170
125
|
type="text",
|
|
171
126
|
content=raw.get("content", ""),
|
|
172
127
|
path=raw.get("path"),
|
|
173
|
-
|
|
174
|
-
length=metadata.get("length", raw.get("length", 0)),
|
|
175
|
-
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
176
|
-
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
177
|
-
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
-
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
179
|
-
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
128
|
+
metadata=raw.get("metadata", {}),
|
|
180
129
|
)
|
|
181
130
|
|
|
182
131
|
chunks.append(chunk)
|
|
@@ -229,7 +178,15 @@ def parseResultZip(
|
|
|
229
178
|
# -- Full markdown --
|
|
230
179
|
full_markdown: str = _readZipText(zf, "full.md") or ""
|
|
231
180
|
|
|
232
|
-
# --
|
|
181
|
+
# -- DocNav (current worker output) --
|
|
182
|
+
doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
|
|
183
|
+
doc_nav: Optional[DocNav] = (
|
|
184
|
+
DocNav.model_validate(json.loads(doc_nav_text))
|
|
185
|
+
if doc_nav_text
|
|
186
|
+
else None
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
# -- Hierarchy (legacy — current worker no longer emits this) --
|
|
233
190
|
hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
|
|
234
191
|
hierarchy: Optional[Any] = (
|
|
235
192
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
@@ -263,11 +220,13 @@ def parseResultZip(
|
|
|
263
220
|
return ParseResult(
|
|
264
221
|
manifest=manifest,
|
|
265
222
|
chunks=chunks,
|
|
266
|
-
chunks_slim=chunks_slim,
|
|
267
223
|
full_markdown=full_markdown,
|
|
224
|
+
raw_zip=zip_bytes,
|
|
225
|
+
doc_nav=doc_nav,
|
|
226
|
+
# Legacy — the current worker no longer emits these files
|
|
227
|
+
chunks_slim=chunks_slim,
|
|
268
228
|
hierarchy=hierarchy,
|
|
269
229
|
toc_hierarchies=toc_hierarchies,
|
|
270
230
|
kb_csv=kb_csv,
|
|
271
231
|
hierarchy_view_html=hierarchy_view_html,
|
|
272
|
-
raw_zip=zip_bytes,
|
|
273
232
|
)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Documents resource for canonical document lifecycle operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
|
|
8
|
+
from knowhere.types.document import (
|
|
9
|
+
Document,
|
|
10
|
+
DocumentChunkListResponse,
|
|
11
|
+
DocumentChunkResponse,
|
|
12
|
+
DocumentChunkType,
|
|
13
|
+
DocumentListResponse,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Documents(SyncAPIResource):
|
|
18
|
+
"""Synchronous interface for ``/v1/documents`` endpoints."""
|
|
19
|
+
|
|
20
|
+
def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
|
|
21
|
+
"""List canonical documents in a namespace."""
|
|
22
|
+
params: Dict[str, Any] = {}
|
|
23
|
+
if namespace is not None:
|
|
24
|
+
params["namespace"] = namespace
|
|
25
|
+
|
|
26
|
+
return self._request(
|
|
27
|
+
"GET",
|
|
28
|
+
"v1/documents",
|
|
29
|
+
params=params or None,
|
|
30
|
+
cast_to=DocumentListResponse,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
def get(self, document_id: str) -> Document:
|
|
34
|
+
"""Get one canonical document by ID."""
|
|
35
|
+
return self._request(
|
|
36
|
+
"GET",
|
|
37
|
+
f"v1/documents/{document_id}",
|
|
38
|
+
cast_to=Document,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
def list_chunks(
|
|
42
|
+
self,
|
|
43
|
+
document_id: str,
|
|
44
|
+
*,
|
|
45
|
+
page: int = 1,
|
|
46
|
+
page_size: int = 50,
|
|
47
|
+
chunk_type: Optional[DocumentChunkType] = None,
|
|
48
|
+
include_asset_urls: bool = False,
|
|
49
|
+
) -> DocumentChunkListResponse:
|
|
50
|
+
"""List current-revision chunks for one canonical document."""
|
|
51
|
+
params: Dict[str, Any] = _build_chunk_list_params(
|
|
52
|
+
page=page,
|
|
53
|
+
page_size=page_size,
|
|
54
|
+
chunk_type=chunk_type,
|
|
55
|
+
include_asset_urls=include_asset_urls,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
return self._request(
|
|
59
|
+
"GET",
|
|
60
|
+
f"v1/documents/{document_id}/chunks",
|
|
61
|
+
params=params or None,
|
|
62
|
+
cast_to=DocumentChunkListResponse,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
def get_chunk(
|
|
66
|
+
self,
|
|
67
|
+
document_id: str,
|
|
68
|
+
document_chunk_id: str,
|
|
69
|
+
*,
|
|
70
|
+
include_asset_urls: bool = False,
|
|
71
|
+
) -> DocumentChunkResponse:
|
|
72
|
+
"""Get one current-revision chunk for one canonical document."""
|
|
73
|
+
params: Dict[str, Any] = _build_chunk_get_params(
|
|
74
|
+
include_asset_urls=include_asset_urls,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return self._request(
|
|
78
|
+
"GET",
|
|
79
|
+
f"v1/documents/{document_id}/chunks/{document_chunk_id}",
|
|
80
|
+
params=params or None,
|
|
81
|
+
cast_to=DocumentChunkResponse,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
def archive(self, document_id: str) -> Document:
|
|
85
|
+
"""Archive one canonical document by ID."""
|
|
86
|
+
return self._request(
|
|
87
|
+
"POST",
|
|
88
|
+
f"v1/documents/{document_id}/archive",
|
|
89
|
+
cast_to=Document,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class AsyncDocuments(AsyncAPIResource):
|
|
94
|
+
"""Asynchronous interface for ``/v1/documents`` endpoints."""
|
|
95
|
+
|
|
96
|
+
async def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
|
|
97
|
+
"""List canonical documents in a namespace."""
|
|
98
|
+
params: Dict[str, Any] = {}
|
|
99
|
+
if namespace is not None:
|
|
100
|
+
params["namespace"] = namespace
|
|
101
|
+
|
|
102
|
+
return await self._request(
|
|
103
|
+
"GET",
|
|
104
|
+
"v1/documents",
|
|
105
|
+
params=params or None,
|
|
106
|
+
cast_to=DocumentListResponse,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
async def get(self, document_id: str) -> Document:
|
|
110
|
+
"""Get one canonical document by ID."""
|
|
111
|
+
return await self._request(
|
|
112
|
+
"GET",
|
|
113
|
+
f"v1/documents/{document_id}",
|
|
114
|
+
cast_to=Document,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
async def list_chunks(
|
|
118
|
+
self,
|
|
119
|
+
document_id: str,
|
|
120
|
+
*,
|
|
121
|
+
page: int = 1,
|
|
122
|
+
page_size: int = 50,
|
|
123
|
+
chunk_type: Optional[DocumentChunkType] = None,
|
|
124
|
+
include_asset_urls: bool = False,
|
|
125
|
+
) -> DocumentChunkListResponse:
|
|
126
|
+
"""List current-revision chunks for one canonical document."""
|
|
127
|
+
params: Dict[str, Any] = _build_chunk_list_params(
|
|
128
|
+
page=page,
|
|
129
|
+
page_size=page_size,
|
|
130
|
+
chunk_type=chunk_type,
|
|
131
|
+
include_asset_urls=include_asset_urls,
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
return await self._request(
|
|
135
|
+
"GET",
|
|
136
|
+
f"v1/documents/{document_id}/chunks",
|
|
137
|
+
params=params or None,
|
|
138
|
+
cast_to=DocumentChunkListResponse,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
async def get_chunk(
|
|
142
|
+
self,
|
|
143
|
+
document_id: str,
|
|
144
|
+
document_chunk_id: str,
|
|
145
|
+
*,
|
|
146
|
+
include_asset_urls: bool = False,
|
|
147
|
+
) -> DocumentChunkResponse:
|
|
148
|
+
"""Get one current-revision chunk for one canonical document."""
|
|
149
|
+
params: Dict[str, Any] = _build_chunk_get_params(
|
|
150
|
+
include_asset_urls=include_asset_urls,
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
return await self._request(
|
|
154
|
+
"GET",
|
|
155
|
+
f"v1/documents/{document_id}/chunks/{document_chunk_id}",
|
|
156
|
+
params=params or None,
|
|
157
|
+
cast_to=DocumentChunkResponse,
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
async def archive(self, document_id: str) -> Document:
|
|
161
|
+
"""Archive one canonical document by ID."""
|
|
162
|
+
return await self._request(
|
|
163
|
+
"POST",
|
|
164
|
+
f"v1/documents/{document_id}/archive",
|
|
165
|
+
cast_to=Document,
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _build_chunk_list_params(
|
|
170
|
+
*,
|
|
171
|
+
page: int,
|
|
172
|
+
page_size: int,
|
|
173
|
+
chunk_type: Optional[DocumentChunkType],
|
|
174
|
+
include_asset_urls: bool,
|
|
175
|
+
) -> Dict[str, Any]:
|
|
176
|
+
params: Dict[str, Any] = {}
|
|
177
|
+
if page != 1:
|
|
178
|
+
params["page"] = page
|
|
179
|
+
if page_size != 50:
|
|
180
|
+
params["page_size"] = page_size
|
|
181
|
+
if chunk_type is not None:
|
|
182
|
+
params["chunk_type"] = chunk_type
|
|
183
|
+
if include_asset_urls:
|
|
184
|
+
params["include_asset_urls"] = True
|
|
185
|
+
return params
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]:
|
|
189
|
+
if not include_asset_urls:
|
|
190
|
+
return {}
|
|
191
|
+
return {"include_asset_urls": True}
|
|
@@ -22,6 +22,7 @@ class Retrieval(SyncAPIResource):
|
|
|
22
22
|
query: str,
|
|
23
23
|
namespace: Optional[str] = None,
|
|
24
24
|
top_k: Optional[int] = None,
|
|
25
|
+
use_agentic: Optional[bool] = None,
|
|
25
26
|
data_type: Optional[int] = None,
|
|
26
27
|
signal_paths: Optional[list[str]] = None,
|
|
27
28
|
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
@@ -39,6 +40,8 @@ class Retrieval(SyncAPIResource):
|
|
|
39
40
|
body["namespace"] = namespace
|
|
40
41
|
if top_k is not None:
|
|
41
42
|
body["top_k"] = top_k
|
|
43
|
+
if use_agentic is not None:
|
|
44
|
+
body["use_agentic"] = use_agentic
|
|
42
45
|
if data_type is not None:
|
|
43
46
|
body["data_type"] = data_type
|
|
44
47
|
if signal_paths is not None:
|
|
@@ -77,6 +80,7 @@ class AsyncRetrieval(AsyncAPIResource):
|
|
|
77
80
|
query: str,
|
|
78
81
|
namespace: Optional[str] = None,
|
|
79
82
|
top_k: Optional[int] = None,
|
|
83
|
+
use_agentic: Optional[bool] = None,
|
|
80
84
|
data_type: Optional[int] = None,
|
|
81
85
|
signal_paths: Optional[list[str]] = None,
|
|
82
86
|
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
@@ -94,6 +98,8 @@ class AsyncRetrieval(AsyncAPIResource):
|
|
|
94
98
|
body["namespace"] = namespace
|
|
95
99
|
if top_k is not None:
|
|
96
100
|
body["top_k"] = top_k
|
|
101
|
+
if use_agentic is not None:
|
|
102
|
+
body["use_agentic"] = use_agentic
|
|
97
103
|
if data_type is not None:
|
|
98
104
|
body["data_type"] = data_type
|
|
99
105
|
if signal_paths is not None:
|
|
@@ -2,7 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from knowhere.types.document import Document, DocumentListResponse
|
|
5
|
+
from knowhere.types.document import (
|
|
6
|
+
Document,
|
|
7
|
+
DocumentChunk,
|
|
8
|
+
DocumentChunkListResponse,
|
|
9
|
+
DocumentChunkPagination,
|
|
10
|
+
DocumentChunkResponse,
|
|
11
|
+
DocumentChunkType,
|
|
12
|
+
DocumentListResponse,
|
|
13
|
+
)
|
|
6
14
|
from knowhere.types.job import Job, JobError, JobResult
|
|
7
15
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
8
16
|
from knowhere.types.retrieval import (
|
|
@@ -39,6 +47,11 @@ __all__: list[str] = [
|
|
|
39
47
|
"JobResult",
|
|
40
48
|
# document
|
|
41
49
|
"Document",
|
|
50
|
+
"DocumentChunk",
|
|
51
|
+
"DocumentChunkListResponse",
|
|
52
|
+
"DocumentChunkPagination",
|
|
53
|
+
"DocumentChunkResponse",
|
|
54
|
+
"DocumentChunkType",
|
|
42
55
|
"DocumentListResponse",
|
|
43
56
|
# retrieval
|
|
44
57
|
"RetrievalChannel",
|