knowhere-python-sdk 0.2.0__tar.gz → 0.3.0__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- knowhere_python_sdk-0.3.0/.release-please-manifest.json +3 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/CHANGELOG.md +18 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/PKG-INFO +72 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/README.md +71 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/docs/usage.md +127 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/pyproject.toml +1 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/__init__.py +21 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_client.py +43 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_exceptions.py +21 -3
- knowhere_python_sdk-0.3.0/src/knowhere/_version.py +1 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/result_parser.py +32 -0
- knowhere_python_sdk-0.3.0/src/knowhere/resources/__init__.py +16 -0
- knowhere_python_sdk-0.3.0/src/knowhere/resources/documents.py +74 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/resources/jobs.py +14 -0
- knowhere_python_sdk-0.3.0/src/knowhere/resources/retrieval.py +70 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/__init__.py +21 -0
- knowhere_python_sdk-0.3.0/src/knowhere/types/document.py +28 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/job.py +4 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/result.py +100 -0
- knowhere_python_sdk-0.3.0/src/knowhere/types/retrieval.py +33 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/conftest.py +4 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_client.py +34 -0
- knowhere_python_sdk-0.3.0/tests/test_documents.py +106 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_jobs.py +11 -2
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_models.py +73 -2
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_polling.py +1 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_result_parser.py +195 -0
- knowhere_python_sdk-0.3.0/tests/test_retrieval.py +110 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_retry.py +0 -1
- knowhere_python_sdk-0.2.0/.release-please-manifest.json +0 -3
- knowhere_python_sdk-0.2.0/src/knowhere/_version.py +0 -1
- knowhere_python_sdk-0.2.0/src/knowhere/resources/__init__.py +0 -7
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/ci.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/publish-pypi.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/publish.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.gitignore +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/async_usage.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/error_handling.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/parse_file.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/parse_url.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/step_by_step.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/release-please-config.json +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_base_client.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_constants.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_logging.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_response.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_types.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/__init__.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/polling.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/upload.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/py.typed +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/resources/_base.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/params.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/shared.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/__init__.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/fixtures/real_result.zip +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_exceptions.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_logging.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_parse.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_upload.py +0 -0
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.1...v0.3.0) (2026-04-21)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* add retrieval service sdk clients ([bceef5c](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bceef5cf379dba39543244bd6ca86262a536fb9b))
|
|
9
|
+
* integrate retrieval service v1 in Python SDK ([bce7aa8](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bce7aa8dbf069d5880b92c6f9d8996878251f7cb))
|
|
10
|
+
|
|
11
|
+
## [0.2.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.0...v0.2.1) (2026-04-09)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
### Bug Fixes
|
|
15
|
+
|
|
16
|
+
* narrow status error constructors ([c8fc035](https://github.com/Ontos-AI/knowhere-python-sdk/commit/c8fc035dade768c5364e50de890bde0fb280586e))
|
|
17
|
+
* remove stale mypy ignore ([150336a](https://github.com/Ontos-AI/knowhere-python-sdk/commit/150336a5dc0497b287437dffa6e1506f4bcf8fbf))
|
|
18
|
+
* sync optimized parse result payload ([a7903ad](https://github.com/Ontos-AI/knowhere-python-sdk/commit/a7903ad53fb5ab142c5835134c9a942eb5cdfe21))
|
|
19
|
+
* sync parse result payload with current API schema ([430b067](https://github.com/Ontos-AI/knowhere-python-sdk/commit/430b067b37ce0b2eb8bd3c81cfca56b1df657376))
|
|
20
|
+
|
|
3
21
|
## [0.2.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.1.0...v0.2.0) (2026-03-18)
|
|
4
22
|
|
|
5
23
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowhere-python-sdk
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Official Python SDK for the Knowhere document parsing API
|
|
5
5
|
Project-URL: Homepage, https://knowhereto.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.knowhereto.ai
|
|
@@ -64,6 +64,74 @@ for chunk in result.text_chunks:
|
|
|
64
64
|
print(chunk.content[:80])
|
|
65
65
|
```
|
|
66
66
|
|
|
67
|
+
## Retrieval and document lifecycle
|
|
68
|
+
|
|
69
|
+
New documents are published into a retrieval namespace. The server returns a
|
|
70
|
+
stable `document_id` when you create a job; persist that value if you need to
|
|
71
|
+
update or archive the same document later.
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
job = client.jobs.create(
|
|
75
|
+
source_type="url",
|
|
76
|
+
source_url="https://example.com/manual.pdf",
|
|
77
|
+
namespace="support-center",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
print(job.document_id) # "doc_..."
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
After the job is done and published, query the canonical document content:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
response = client.retrieval.query(
|
|
87
|
+
namespace="support-center",
|
|
88
|
+
query="How do I reset Bluetooth pairing?",
|
|
89
|
+
top_k=5,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
for result in response.results:
|
|
93
|
+
print(result.content)
|
|
94
|
+
print(result.score)
|
|
95
|
+
print(result.source.source_file_name, result.source.section_path)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Use `document_id` to update or archive a document:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
update_job = client.jobs.create(
|
|
102
|
+
source_type="url",
|
|
103
|
+
source_url="https://example.com/manual-v2.pdf",
|
|
104
|
+
document_id=job.document_id,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
document = client.documents.get(job.document_id)
|
|
108
|
+
print(document.status)
|
|
109
|
+
|
|
110
|
+
client.documents.archive(job.document_id)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
You can also list documents in a namespace:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
documents = client.documents.list(namespace="support-center")
|
|
117
|
+
for document in documents.documents:
|
|
118
|
+
print(document.document_id, document.status)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Retrieval supports exclusions when clients want follow-up results that avoid
|
|
122
|
+
previously used documents or sections:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
response = client.retrieval.query(
|
|
126
|
+
namespace="support-center",
|
|
127
|
+
query="battery charging",
|
|
128
|
+
exclude_document_ids=["doc_old"],
|
|
129
|
+
exclude_sections=[
|
|
130
|
+
{"document_id": "doc_123", "section_path": "Appendix / Legal"}
|
|
131
|
+
],
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
67
135
|
While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
|
|
68
136
|
|
|
69
137
|
### Parse a local file
|
|
@@ -137,9 +205,12 @@ from pathlib import Path
|
|
|
137
205
|
job = client.jobs.create(
|
|
138
206
|
source_type="file",
|
|
139
207
|
file_name="report.pdf",
|
|
208
|
+
namespace="support-center",
|
|
140
209
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
141
210
|
)
|
|
142
211
|
|
|
212
|
+
print(job.document_id) # Persist this to update/archive the document later.
|
|
213
|
+
|
|
143
214
|
# Step 2: Upload file to presigned URL
|
|
144
215
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
145
216
|
|
|
@@ -32,6 +32,74 @@ for chunk in result.text_chunks:
|
|
|
32
32
|
print(chunk.content[:80])
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
## Retrieval and document lifecycle
|
|
36
|
+
|
|
37
|
+
New documents are published into a retrieval namespace. The server returns a
|
|
38
|
+
stable `document_id` when you create a job; persist that value if you need to
|
|
39
|
+
update or archive the same document later.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
job = client.jobs.create(
|
|
43
|
+
source_type="url",
|
|
44
|
+
source_url="https://example.com/manual.pdf",
|
|
45
|
+
namespace="support-center",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
print(job.document_id) # "doc_..."
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
After the job is done and published, query the canonical document content:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
response = client.retrieval.query(
|
|
55
|
+
namespace="support-center",
|
|
56
|
+
query="How do I reset Bluetooth pairing?",
|
|
57
|
+
top_k=5,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
for result in response.results:
|
|
61
|
+
print(result.content)
|
|
62
|
+
print(result.score)
|
|
63
|
+
print(result.source.source_file_name, result.source.section_path)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Use `document_id` to update or archive a document:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
update_job = client.jobs.create(
|
|
70
|
+
source_type="url",
|
|
71
|
+
source_url="https://example.com/manual-v2.pdf",
|
|
72
|
+
document_id=job.document_id,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
document = client.documents.get(job.document_id)
|
|
76
|
+
print(document.status)
|
|
77
|
+
|
|
78
|
+
client.documents.archive(job.document_id)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
You can also list documents in a namespace:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
documents = client.documents.list(namespace="support-center")
|
|
85
|
+
for document in documents.documents:
|
|
86
|
+
print(document.document_id, document.status)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Retrieval supports exclusions when clients want follow-up results that avoid
|
|
90
|
+
previously used documents or sections:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
response = client.retrieval.query(
|
|
94
|
+
namespace="support-center",
|
|
95
|
+
query="battery charging",
|
|
96
|
+
exclude_document_ids=["doc_old"],
|
|
97
|
+
exclude_sections=[
|
|
98
|
+
{"document_id": "doc_123", "section_path": "Appendix / Legal"}
|
|
99
|
+
],
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
35
103
|
While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
|
|
36
104
|
|
|
37
105
|
### Parse a local file
|
|
@@ -105,9 +173,12 @@ from pathlib import Path
|
|
|
105
173
|
job = client.jobs.create(
|
|
106
174
|
source_type="file",
|
|
107
175
|
file_name="report.pdf",
|
|
176
|
+
namespace="support-center",
|
|
108
177
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
109
178
|
)
|
|
110
179
|
|
|
180
|
+
print(job.document_id) # Persist this to update/archive the document later.
|
|
181
|
+
|
|
111
182
|
# Step 2: Upload file to presigned URL
|
|
112
183
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
113
184
|
|
|
@@ -12,6 +12,7 @@ Comprehensive reference for every feature, parameter, and pattern in the SDK.
|
|
|
12
12
|
- [Working with Results](#working-with-results)
|
|
13
13
|
- [Chunk Types](#chunk-types)
|
|
14
14
|
- [Step-by-Step Control (Jobs API)](#step-by-step-control-jobs-api)
|
|
15
|
+
- [Retrieval and Document Lifecycle](#retrieval-and-document-lifecycle)
|
|
15
16
|
- [Async Usage](#async-usage)
|
|
16
17
|
- [Progress Callbacks](#progress-callbacks)
|
|
17
18
|
- [Error Handling](#error-handling)
|
|
@@ -316,8 +317,10 @@ from pathlib import Path
|
|
|
316
317
|
job = client.jobs.create(
|
|
317
318
|
source_type="file",
|
|
318
319
|
file_name="report.pdf",
|
|
320
|
+
namespace="support-center",
|
|
319
321
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
320
322
|
)
|
|
323
|
+
print(job.document_id) # Persist this value for update/archive flows.
|
|
321
324
|
|
|
322
325
|
# Step 2: Upload file to the presigned URL
|
|
323
326
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
@@ -341,6 +344,8 @@ print(result.statistics)
|
|
|
341
344
|
| `source_type` | `"url" \| "file"` | — | Required. Whether parsing from URL or uploaded file. |
|
|
342
345
|
| `source_url` | `str \| None` | `None` | URL to parse (required when `source_type="url"`). |
|
|
343
346
|
| `file_name` | `str \| None` | `None` | Original filename (used when `source_type="file"`). |
|
|
347
|
+
| `namespace` | `str \| None` | `None` | Retrieval namespace. The server defaults to `"default"` when omitted. |
|
|
348
|
+
| `document_id` | `str \| None` | `None` | Existing document ID when creating an update job. Omit for a new document. |
|
|
344
349
|
| `data_id` | `str \| None` | `None` | Your own correlation/idempotency identifier. |
|
|
345
350
|
| `parsing_params` | `ParsingParams \| None` | `None` | Parsing configuration. |
|
|
346
351
|
| `webhook` | `WebhookConfig \| None` | `None` | Webhook for completion notification. |
|
|
@@ -351,6 +356,8 @@ Returns a `Job` object:
|
|
|
351
356
|
job.job_id # "abc-123"
|
|
352
357
|
job.status # "pending"
|
|
353
358
|
job.source_type # "file"
|
|
359
|
+
job.namespace # "support-center"
|
|
360
|
+
job.document_id # "doc_..." — persist this for updates and archive calls
|
|
354
361
|
job.upload_url # presigned URL (for file uploads)
|
|
355
362
|
job.upload_headers # headers to include in the upload request
|
|
356
363
|
job.expires_in # seconds until upload URL expires
|
|
@@ -407,6 +414,119 @@ result = client.jobs.load("https://storage.example.com/result.zip")
|
|
|
407
414
|
|
|
408
415
|
---
|
|
409
416
|
|
|
417
|
+
## Retrieval and Document Lifecycle
|
|
418
|
+
|
|
419
|
+
The retrieval APIs operate on canonical documents that are published after a
|
|
420
|
+
job completes. For new documents, the server generates `document_id` during
|
|
421
|
+
`jobs.create()`. Store that ID in your application if you need to update or
|
|
422
|
+
archive the same document later.
|
|
423
|
+
|
|
424
|
+
### Create a retrievable document
|
|
425
|
+
|
|
426
|
+
```python
|
|
427
|
+
job = client.jobs.create(
|
|
428
|
+
source_type="url",
|
|
429
|
+
source_url="https://example.com/manual.pdf",
|
|
430
|
+
namespace="support-center",
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
print(job.document_id) # "doc_..."
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
For file uploads, the flow is the same except that you upload the file before
|
|
437
|
+
polling:
|
|
438
|
+
|
|
439
|
+
```python
|
|
440
|
+
job = client.jobs.create(
|
|
441
|
+
source_type="file",
|
|
442
|
+
file_name="manual.pdf",
|
|
443
|
+
namespace="support-center",
|
|
444
|
+
)
|
|
445
|
+
client.jobs.upload(job, file=Path("manual.pdf"))
|
|
446
|
+
job_result = client.jobs.wait(job.job_id)
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
### Update an existing document
|
|
450
|
+
|
|
451
|
+
Pass the prior `document_id` to create an update job. If `namespace` is omitted,
|
|
452
|
+
the API resolves the namespace from the existing document.
|
|
453
|
+
|
|
454
|
+
```python
|
|
455
|
+
update_job = client.jobs.create(
|
|
456
|
+
source_type="url",
|
|
457
|
+
source_url="https://example.com/manual-v2.pdf",
|
|
458
|
+
document_id=job.document_id,
|
|
459
|
+
)
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
The API rejects concurrent non-terminal jobs for the same document with a
|
|
463
|
+
retryable `ConflictError` using the server error code `ABORTED`.
|
|
464
|
+
|
|
465
|
+
### Query retrieval results
|
|
466
|
+
|
|
467
|
+
```python
|
|
468
|
+
response = client.retrieval.query(
|
|
469
|
+
namespace="support-center",
|
|
470
|
+
query="How do I pair a Bluetooth headset?",
|
|
471
|
+
top_k=5,
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
for result in response.results:
|
|
475
|
+
print(result.content)
|
|
476
|
+
print(result.score)
|
|
477
|
+
print(result.source.document_id)
|
|
478
|
+
print(result.source.source_file_name)
|
|
479
|
+
print(result.source.section_path)
|
|
480
|
+
```
|
|
481
|
+
|
|
482
|
+
Retrieval results expose `content`, not the older parse-result `text` field.
|
|
483
|
+
Media results may include `asset_url` when the server can sign the referenced
|
|
484
|
+
artifact.
|
|
485
|
+
|
|
486
|
+
Each retrieval result has the same shape, including a single canonical `source` reference:
|
|
487
|
+
|
|
488
|
+
```python
|
|
489
|
+
result.content
|
|
490
|
+
result.chunk_type
|
|
491
|
+
result.score
|
|
492
|
+
result.asset_url # Optional[str]
|
|
493
|
+
result.source.document_id
|
|
494
|
+
result.source.source_file_name
|
|
495
|
+
result.source.section_path
|
|
496
|
+
```
|
|
497
|
+
|
|
498
|
+
### Exclude documents or sections
|
|
499
|
+
|
|
500
|
+
Use exclusions for follow-up queries that should avoid already-used context.
|
|
501
|
+
|
|
502
|
+
```python
|
|
503
|
+
response = client.retrieval.query(
|
|
504
|
+
namespace="support-center",
|
|
505
|
+
query="battery charging",
|
|
506
|
+
top_k=10,
|
|
507
|
+
exclude_document_ids=["doc_old"],
|
|
508
|
+
exclude_sections=[
|
|
509
|
+
{"document_id": "doc_123", "section_path": "Appendix / Legal"}
|
|
510
|
+
],
|
|
511
|
+
)
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
### List, get, and archive documents
|
|
515
|
+
|
|
516
|
+
```python
|
|
517
|
+
document_list = client.documents.list(namespace="support-center")
|
|
518
|
+
for document in document_list.documents:
|
|
519
|
+
print(document.document_id, document.status, document.source_file_name)
|
|
520
|
+
|
|
521
|
+
document = client.documents.get("doc_123")
|
|
522
|
+
print(document.current_job_result_id)
|
|
523
|
+
|
|
524
|
+
archived = client.documents.archive("doc_123")
|
|
525
|
+
print(archived.status) # "archived"
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
---
|
|
529
|
+
|
|
410
530
|
## Async Usage
|
|
411
531
|
|
|
412
532
|
Every method available on `Knowhere` has an async counterpart on `AsyncKnowhere`:
|
|
@@ -429,6 +549,13 @@ async def main():
|
|
|
429
549
|
job_result = await client.jobs.wait(job.job_id)
|
|
430
550
|
result = await client.jobs.load(job_result)
|
|
431
551
|
|
|
552
|
+
retrieval = await client.retrieval.query(
|
|
553
|
+
namespace="support-center",
|
|
554
|
+
query="refund policy",
|
|
555
|
+
top_k=5,
|
|
556
|
+
)
|
|
557
|
+
print(retrieval.results[0].content)
|
|
558
|
+
|
|
432
559
|
asyncio.run(main())
|
|
433
560
|
```
|
|
434
561
|
|
|
@@ -35,8 +35,14 @@ from knowhere._exceptions import (
|
|
|
35
35
|
)
|
|
36
36
|
from knowhere._types import PollProgressCallback, UploadProgressCallback
|
|
37
37
|
from knowhere._version import __version__
|
|
38
|
+
from knowhere.types.document import Document, DocumentListResponse
|
|
38
39
|
from knowhere.types.job import Job, JobError, JobProgress, JobResult
|
|
39
40
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
41
|
+
from knowhere.types.retrieval import (
|
|
42
|
+
RetrievalSource,
|
|
43
|
+
RetrievalQueryResponse,
|
|
44
|
+
RetrievalResult,
|
|
45
|
+
)
|
|
40
46
|
from knowhere.types.result import (
|
|
41
47
|
BaseChunk,
|
|
42
48
|
Checksum,
|
|
@@ -46,6 +52,10 @@ from knowhere.types.result import (
|
|
|
46
52
|
ImageFileInfo,
|
|
47
53
|
Manifest,
|
|
48
54
|
ParseResult,
|
|
55
|
+
ProcessingCost,
|
|
56
|
+
ProcessingMetadata,
|
|
57
|
+
ProcessingTiming,
|
|
58
|
+
SlimChunk,
|
|
49
59
|
Statistics,
|
|
50
60
|
TableChunk,
|
|
51
61
|
TableFileInfo,
|
|
@@ -83,6 +93,13 @@ __all__: list[str] = [
|
|
|
83
93
|
"JobError",
|
|
84
94
|
"JobProgress",
|
|
85
95
|
"JobResult",
|
|
96
|
+
# Document types
|
|
97
|
+
"Document",
|
|
98
|
+
"DocumentListResponse",
|
|
99
|
+
# Retrieval types
|
|
100
|
+
"RetrievalSource",
|
|
101
|
+
"RetrievalQueryResponse",
|
|
102
|
+
"RetrievalResult",
|
|
86
103
|
# Result types
|
|
87
104
|
"ParseResult",
|
|
88
105
|
"Manifest",
|
|
@@ -91,6 +108,10 @@ __all__: list[str] = [
|
|
|
91
108
|
"FileIndex",
|
|
92
109
|
"ImageFileInfo",
|
|
93
110
|
"TableFileInfo",
|
|
111
|
+
"ProcessingCost",
|
|
112
|
+
"ProcessingMetadata",
|
|
113
|
+
"ProcessingTiming",
|
|
114
|
+
"SlimChunk",
|
|
94
115
|
"BaseChunk",
|
|
95
116
|
"TextChunk",
|
|
96
117
|
"ImageChunk",
|
|
@@ -19,7 +19,9 @@ from knowhere._types import (
|
|
|
19
19
|
PollProgressCallback,
|
|
20
20
|
UploadProgressCallback,
|
|
21
21
|
)
|
|
22
|
+
from knowhere.resources.documents import AsyncDocuments, Documents
|
|
22
23
|
from knowhere.resources.jobs import AsyncJobs, Jobs
|
|
24
|
+
from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
|
|
23
25
|
from knowhere.types.job import Job, JobResult
|
|
24
26
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
25
27
|
from knowhere.types.result import ParseResult
|
|
@@ -42,6 +44,16 @@ class Knowhere(SyncAPIClient):
|
|
|
42
44
|
"""Access the jobs resource namespace."""
|
|
43
45
|
return Jobs(self)
|
|
44
46
|
|
|
47
|
+
@cached_property
|
|
48
|
+
def retrieval(self) -> Retrieval:
|
|
49
|
+
"""Access the retrieval resource namespace."""
|
|
50
|
+
return Retrieval(self)
|
|
51
|
+
|
|
52
|
+
@cached_property
|
|
53
|
+
def documents(self) -> Documents:
|
|
54
|
+
"""Access the documents resource namespace."""
|
|
55
|
+
return Documents(self)
|
|
56
|
+
|
|
45
57
|
# -- overloaded parse signatures --
|
|
46
58
|
|
|
47
59
|
@overload
|
|
@@ -50,6 +62,8 @@ class Knowhere(SyncAPIClient):
|
|
|
50
62
|
*,
|
|
51
63
|
url: str,
|
|
52
64
|
data_id: Optional[str] = ...,
|
|
65
|
+
namespace: Optional[str] = ...,
|
|
66
|
+
document_id: Optional[str] = ...,
|
|
53
67
|
parsing_params: Optional[ParsingParams] = ...,
|
|
54
68
|
webhook: Optional[WebhookConfig] = ...,
|
|
55
69
|
poll_interval: float = ...,
|
|
@@ -66,6 +80,8 @@ class Knowhere(SyncAPIClient):
|
|
|
66
80
|
file: Union[Path, BinaryIO, bytes],
|
|
67
81
|
file_name: Optional[str] = ...,
|
|
68
82
|
data_id: Optional[str] = ...,
|
|
83
|
+
namespace: Optional[str] = ...,
|
|
84
|
+
document_id: Optional[str] = ...,
|
|
69
85
|
parsing_params: Optional[ParsingParams] = ...,
|
|
70
86
|
webhook: Optional[WebhookConfig] = ...,
|
|
71
87
|
poll_interval: float = ...,
|
|
@@ -82,6 +98,8 @@ class Knowhere(SyncAPIClient):
|
|
|
82
98
|
file: Optional[Union[Path, BinaryIO, bytes]] = None,
|
|
83
99
|
file_name: Optional[str] = None,
|
|
84
100
|
data_id: Optional[str] = None,
|
|
101
|
+
namespace: Optional[str] = None,
|
|
102
|
+
document_id: Optional[str] = None,
|
|
85
103
|
parsing_params: Optional[ParsingParams] = None,
|
|
86
104
|
webhook: Optional[WebhookConfig] = None,
|
|
87
105
|
poll_interval: float = DEFAULT_POLL_INTERVAL,
|
|
@@ -105,6 +123,8 @@ class Knowhere(SyncAPIClient):
|
|
|
105
123
|
source_type="url",
|
|
106
124
|
source_url=url,
|
|
107
125
|
data_id=data_id,
|
|
126
|
+
namespace=namespace,
|
|
127
|
+
document_id=document_id,
|
|
108
128
|
parsing_params=parsing_params,
|
|
109
129
|
webhook=webhook,
|
|
110
130
|
)
|
|
@@ -116,6 +136,8 @@ class Knowhere(SyncAPIClient):
|
|
|
116
136
|
source_type="file",
|
|
117
137
|
file_name=resolved_name,
|
|
118
138
|
data_id=data_id,
|
|
139
|
+
namespace=namespace,
|
|
140
|
+
document_id=document_id,
|
|
119
141
|
parsing_params=parsing_params,
|
|
120
142
|
webhook=webhook,
|
|
121
143
|
)
|
|
@@ -149,12 +171,24 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
149
171
|
"""Access the async jobs resource namespace."""
|
|
150
172
|
return AsyncJobs(self)
|
|
151
173
|
|
|
174
|
+
@cached_property
|
|
175
|
+
def retrieval(self) -> AsyncRetrieval:
|
|
176
|
+
"""Access the async retrieval resource namespace."""
|
|
177
|
+
return AsyncRetrieval(self)
|
|
178
|
+
|
|
179
|
+
@cached_property
|
|
180
|
+
def documents(self) -> AsyncDocuments:
|
|
181
|
+
"""Access the async documents resource namespace."""
|
|
182
|
+
return AsyncDocuments(self)
|
|
183
|
+
|
|
152
184
|
@overload
|
|
153
185
|
async def parse(
|
|
154
186
|
self,
|
|
155
187
|
*,
|
|
156
188
|
url: str,
|
|
157
189
|
data_id: Optional[str] = ...,
|
|
190
|
+
namespace: Optional[str] = ...,
|
|
191
|
+
document_id: Optional[str] = ...,
|
|
158
192
|
parsing_params: Optional[ParsingParams] = ...,
|
|
159
193
|
webhook: Optional[WebhookConfig] = ...,
|
|
160
194
|
poll_interval: float = ...,
|
|
@@ -171,6 +205,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
171
205
|
file: Union[Path, BinaryIO, bytes],
|
|
172
206
|
file_name: Optional[str] = ...,
|
|
173
207
|
data_id: Optional[str] = ...,
|
|
208
|
+
namespace: Optional[str] = ...,
|
|
209
|
+
document_id: Optional[str] = ...,
|
|
174
210
|
parsing_params: Optional[ParsingParams] = ...,
|
|
175
211
|
webhook: Optional[WebhookConfig] = ...,
|
|
176
212
|
poll_interval: float = ...,
|
|
@@ -187,6 +223,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
187
223
|
file: Optional[Union[Path, BinaryIO, bytes]] = None,
|
|
188
224
|
file_name: Optional[str] = None,
|
|
189
225
|
data_id: Optional[str] = None,
|
|
226
|
+
namespace: Optional[str] = None,
|
|
227
|
+
document_id: Optional[str] = None,
|
|
190
228
|
parsing_params: Optional[ParsingParams] = None,
|
|
191
229
|
webhook: Optional[WebhookConfig] = None,
|
|
192
230
|
poll_interval: float = DEFAULT_POLL_INTERVAL,
|
|
@@ -206,6 +244,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
206
244
|
source_type="url",
|
|
207
245
|
source_url=url,
|
|
208
246
|
data_id=data_id,
|
|
247
|
+
namespace=namespace,
|
|
248
|
+
document_id=document_id,
|
|
209
249
|
parsing_params=parsing_params,
|
|
210
250
|
webhook=webhook,
|
|
211
251
|
)
|
|
@@ -217,6 +257,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
217
257
|
source_type="file",
|
|
218
258
|
file_name=resolved_name,
|
|
219
259
|
data_id=data_id,
|
|
260
|
+
namespace=namespace,
|
|
261
|
+
document_id=document_id,
|
|
220
262
|
parsing_params=parsing_params,
|
|
221
263
|
webhook=webhook,
|
|
222
264
|
)
|
|
@@ -232,4 +274,4 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
232
274
|
|
|
233
275
|
return await self.jobs.load(
|
|
234
276
|
job_result, verify_checksum=verify_checksum
|
|
235
|
-
)
|
|
277
|
+
)
|
|
@@ -387,11 +387,29 @@ def makeStatusError(
|
|
|
387
387
|
response=response,
|
|
388
388
|
)
|
|
389
389
|
|
|
390
|
-
if exception_class
|
|
391
|
-
return
|
|
390
|
+
if exception_class is RateLimitError:
|
|
391
|
+
return RateLimitError(
|
|
392
392
|
status_code,
|
|
393
393
|
**common_kwargs,
|
|
394
|
-
retry_after=retry_after,
|
|
394
|
+
retry_after=retry_after,
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if exception_class is ServiceUnavailableError:
|
|
400
|
+
return ServiceUnavailableError(
|
|
401
|
+
status_code,
|
|
402
|
+
**common_kwargs,
|
|
403
|
+
retry_after=retry_after,
|
|
404
|
+
limit=limit,
|
|
405
|
+
period=period,
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
if exception_class is GatewayTimeoutError:
|
|
409
|
+
return GatewayTimeoutError(
|
|
410
|
+
status_code,
|
|
411
|
+
**common_kwargs,
|
|
412
|
+
retry_after=retry_after,
|
|
395
413
|
limit=limit,
|
|
396
414
|
period=period,
|
|
397
415
|
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.0" # x-release-please-version
|
|
@@ -16,6 +16,7 @@ from knowhere.types.result import (
|
|
|
16
16
|
ImageChunk,
|
|
17
17
|
Manifest,
|
|
18
18
|
ParseResult,
|
|
19
|
+
SlimChunk,
|
|
19
20
|
TableChunk,
|
|
20
21
|
TextChunk,
|
|
21
22
|
TextChunkTokens,
|
|
@@ -134,6 +135,7 @@ def _buildChunks(
|
|
|
134
135
|
type="image",
|
|
135
136
|
content=raw.get("content", ""),
|
|
136
137
|
path=raw.get("path"),
|
|
138
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
137
139
|
length=metadata.get("length", raw.get("length", 0)),
|
|
138
140
|
file_path=file_path,
|
|
139
141
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -151,6 +153,7 @@ def _buildChunks(
|
|
|
151
153
|
type="table",
|
|
152
154
|
content=raw.get("content", ""),
|
|
153
155
|
path=raw.get("path"),
|
|
156
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
154
157
|
length=metadata.get("length", raw.get("length", 0)),
|
|
155
158
|
file_path=file_path,
|
|
156
159
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -167,10 +170,12 @@ def _buildChunks(
|
|
|
167
170
|
type="text",
|
|
168
171
|
content=raw.get("content", ""),
|
|
169
172
|
path=raw.get("path"),
|
|
173
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
170
174
|
length=metadata.get("length", raw.get("length", 0)),
|
|
171
175
|
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
172
176
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
173
177
|
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
+
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
174
179
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
175
180
|
)
|
|
176
181
|
|
|
@@ -230,12 +235,39 @@ def parseResultZip(
|
|
|
230
235
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
231
236
|
)
|
|
232
237
|
|
|
238
|
+
# -- Optimized sidecar files --
|
|
239
|
+
chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
|
|
240
|
+
parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
|
|
241
|
+
if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
|
|
242
|
+
raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
|
|
243
|
+
elif isinstance(parsed_chunks_slim, list):
|
|
244
|
+
raw_chunks_slim = parsed_chunks_slim
|
|
245
|
+
else:
|
|
246
|
+
raw_chunks_slim = []
|
|
247
|
+
chunks_slim: Optional[List[SlimChunk]] = (
|
|
248
|
+
[SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
|
|
249
|
+
if chunks_slim_text is not None
|
|
250
|
+
else None
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
|
|
254
|
+
toc_hierarchies: Optional[Any] = (
|
|
255
|
+
json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
|
|
259
|
+
hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
|
|
260
|
+
|
|
233
261
|
zf.close()
|
|
234
262
|
|
|
235
263
|
return ParseResult(
|
|
236
264
|
manifest=manifest,
|
|
237
265
|
chunks=chunks,
|
|
266
|
+
chunks_slim=chunks_slim,
|
|
238
267
|
full_markdown=full_markdown,
|
|
239
268
|
hierarchy=hierarchy,
|
|
269
|
+
toc_hierarchies=toc_hierarchies,
|
|
270
|
+
kb_csv=kb_csv,
|
|
271
|
+
hierarchy_view_html=hierarchy_view_html,
|
|
240
272
|
raw_zip=zip_bytes,
|
|
241
273
|
)
|