knowhere-python-sdk 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere_python_sdk-0.3.1/.release-please-manifest.json +3 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/CHANGELOG.md +8 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/PKG-INFO +21 -9
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/README.md +20 -8
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/pyproject.toml +1 -1
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/__init__.py +6 -0
- knowhere_python_sdk-0.3.1/src/knowhere/_version.py +1 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/jobs.py +16 -2
- knowhere_python_sdk-0.3.1/src/knowhere/resources/retrieval.py +123 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/__init__.py +6 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/job.py +0 -1
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/result.py +6 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/retrieval.py +13 -1
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/conftest.py +0 -1
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_jobs.py +5 -3
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_models.py +6 -1
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_parse.py +4 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_retrieval.py +19 -0
- knowhere_python_sdk-0.3.0/.release-please-manifest.json +0 -3
- knowhere_python_sdk-0.3.0/src/knowhere/_version.py +0 -1
- knowhere_python_sdk-0.3.0/src/knowhere/resources/retrieval.py +0 -70
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/ci.yml +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/publish-pypi.yml +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/publish.yml +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.gitignore +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/docs/usage.md +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/async_usage.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/error_handling.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/parse_file.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/parse_url.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/step_by_step.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/release-please-config.json +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_base_client.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_client.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_constants.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_exceptions.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_logging.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_response.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_types.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/__init__.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/polling.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/result_parser.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/upload.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/py.typed +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/__init__.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/_base.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/documents.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/document.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/params.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/shared.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/__init__.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/fixtures/real_result.zip +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_client.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_documents.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_exceptions.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_logging.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_polling.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_result_parser.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_retry.py +0 -0
- {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_upload.py +0 -0
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.0...v0.3.1) (2026-04-22)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Documentation
|
|
7
|
+
|
|
8
|
+
* clarify ParseResult document scope ([861084e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/861084e34144987994fa618ac0db262ce681b5a8))
|
|
9
|
+
* clarify ParseResult document scope ([bb14ad4](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bb14ad4077c41cbe74a5dd155995d6f9937962b8))
|
|
10
|
+
|
|
3
11
|
## [0.3.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.1...v0.3.0) (2026-04-21)
|
|
4
12
|
|
|
5
13
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowhere-python-sdk
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Official Python SDK for the Knowhere document parsing API
|
|
5
5
|
Project-URL: Homepage, https://knowhereto.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.knowhereto.ai
|
|
@@ -67,8 +67,9 @@ for chunk in result.text_chunks:
|
|
|
67
67
|
## Retrieval and document lifecycle
|
|
68
68
|
|
|
69
69
|
New documents are published into a retrieval namespace. The server returns a
|
|
70
|
-
stable `document_id`
|
|
71
|
-
|
|
70
|
+
stable `document_id` after the job is published. `client.jobs.create(...)`
|
|
71
|
+
does not return a usable `document_id`; persist `job_result.document_id` if you
|
|
72
|
+
need to update or archive the same document later.
|
|
72
73
|
|
|
73
74
|
```python
|
|
74
75
|
job = client.jobs.create(
|
|
@@ -77,7 +78,11 @@ job = client.jobs.create(
|
|
|
77
78
|
namespace="support-center",
|
|
78
79
|
)
|
|
79
80
|
|
|
80
|
-
|
|
81
|
+
job_result = client.jobs.wait(job.job_id)
|
|
82
|
+
document_id = job_result.document_id
|
|
83
|
+
|
|
84
|
+
if document_id is None:
|
|
85
|
+
raise RuntimeError("Expected document_id after successful publication.")
|
|
81
86
|
```
|
|
82
87
|
|
|
83
88
|
After the job is done and published, query the canonical document content:
|
|
@@ -87,8 +92,13 @@ response = client.retrieval.query(
|
|
|
87
92
|
namespace="support-center",
|
|
88
93
|
query="How do I reset Bluetooth pairing?",
|
|
89
94
|
top_k=5,
|
|
95
|
+
channels=["path", "term"],
|
|
96
|
+
filter_mode="keep",
|
|
97
|
+
signal_paths=["Bluetooth", "Pairing"],
|
|
90
98
|
)
|
|
91
99
|
|
|
100
|
+
print(response.router_used)
|
|
101
|
+
|
|
92
102
|
for result in response.results:
|
|
93
103
|
print(result.content)
|
|
94
104
|
print(result.score)
|
|
@@ -101,13 +111,13 @@ Use `document_id` to update or archive a document:
|
|
|
101
111
|
update_job = client.jobs.create(
|
|
102
112
|
source_type="url",
|
|
103
113
|
source_url="https://example.com/manual-v2.pdf",
|
|
104
|
-
document_id=job.document_id,
|
|
114
|
+
document_id=document_id,
|
|
105
115
|
)
|
|
106
116
|
|
|
107
|
-
document = client.documents.get(job.document_id)
|
|
117
|
+
document = client.documents.get(document_id)
|
|
108
118
|
print(document.status)
|
|
109
119
|
|
|
110
|
-
client.documents.archive(job.document_id)
|
|
120
|
+
client.documents.archive(document_id)
|
|
111
121
|
```
|
|
112
122
|
|
|
113
123
|
You can also list documents in a namespace:
|
|
@@ -146,6 +156,8 @@ result = client.parse(
|
|
|
146
156
|
|
|
147
157
|
print(result.manifest.source_file_name) # "report.pdf"
|
|
148
158
|
print(len(result.chunks)) # 152
|
|
159
|
+
print(result.namespace) # "default" or your explicit namespace
|
|
160
|
+
print(result.document_id) # Published canonical document id
|
|
149
161
|
```
|
|
150
162
|
|
|
151
163
|
### Access different chunk types
|
|
@@ -209,14 +221,14 @@ job = client.jobs.create(
|
|
|
209
221
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
210
222
|
)
|
|
211
223
|
|
|
212
|
-
print(job.document_id) # Persist this to update/archive the document later.
|
|
213
|
-
|
|
214
224
|
# Step 2: Upload file to presigned URL
|
|
215
225
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
216
226
|
|
|
217
227
|
# Step 3: Poll until done (adaptive backoff)
|
|
218
228
|
job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
|
|
219
229
|
|
|
230
|
+
print(job_result.document_id) # Persist this to update/archive the document later.
|
|
231
|
+
|
|
220
232
|
# Step 4: Download and parse results
|
|
221
233
|
result = client.jobs.load(job_result)
|
|
222
234
|
print(result.statistics)
|
|
@@ -35,8 +35,9 @@ for chunk in result.text_chunks:
|
|
|
35
35
|
## Retrieval and document lifecycle
|
|
36
36
|
|
|
37
37
|
New documents are published into a retrieval namespace. The server returns a
|
|
38
|
-
stable `document_id`
|
|
39
|
-
|
|
38
|
+
stable `document_id` after the job is published. `client.jobs.create(...)`
|
|
39
|
+
does not return a usable `document_id`; persist `job_result.document_id` if you
|
|
40
|
+
need to update or archive the same document later.
|
|
40
41
|
|
|
41
42
|
```python
|
|
42
43
|
job = client.jobs.create(
|
|
@@ -45,7 +46,11 @@ job = client.jobs.create(
|
|
|
45
46
|
namespace="support-center",
|
|
46
47
|
)
|
|
47
48
|
|
|
48
|
-
|
|
49
|
+
job_result = client.jobs.wait(job.job_id)
|
|
50
|
+
document_id = job_result.document_id
|
|
51
|
+
|
|
52
|
+
if document_id is None:
|
|
53
|
+
raise RuntimeError("Expected document_id after successful publication.")
|
|
49
54
|
```
|
|
50
55
|
|
|
51
56
|
After the job is done and published, query the canonical document content:
|
|
@@ -55,8 +60,13 @@ response = client.retrieval.query(
|
|
|
55
60
|
namespace="support-center",
|
|
56
61
|
query="How do I reset Bluetooth pairing?",
|
|
57
62
|
top_k=5,
|
|
63
|
+
channels=["path", "term"],
|
|
64
|
+
filter_mode="keep",
|
|
65
|
+
signal_paths=["Bluetooth", "Pairing"],
|
|
58
66
|
)
|
|
59
67
|
|
|
68
|
+
print(response.router_used)
|
|
69
|
+
|
|
60
70
|
for result in response.results:
|
|
61
71
|
print(result.content)
|
|
62
72
|
print(result.score)
|
|
@@ -69,13 +79,13 @@ Use `document_id` to update or archive a document:
|
|
|
69
79
|
update_job = client.jobs.create(
|
|
70
80
|
source_type="url",
|
|
71
81
|
source_url="https://example.com/manual-v2.pdf",
|
|
72
|
-
document_id=job.document_id,
|
|
82
|
+
document_id=document_id,
|
|
73
83
|
)
|
|
74
84
|
|
|
75
|
-
document = client.documents.get(job.document_id)
|
|
85
|
+
document = client.documents.get(document_id)
|
|
76
86
|
print(document.status)
|
|
77
87
|
|
|
78
|
-
client.documents.archive(job.document_id)
|
|
88
|
+
client.documents.archive(document_id)
|
|
79
89
|
```
|
|
80
90
|
|
|
81
91
|
You can also list documents in a namespace:
|
|
@@ -114,6 +124,8 @@ result = client.parse(
|
|
|
114
124
|
|
|
115
125
|
print(result.manifest.source_file_name) # "report.pdf"
|
|
116
126
|
print(len(result.chunks)) # 152
|
|
127
|
+
print(result.namespace) # "default" or your explicit namespace
|
|
128
|
+
print(result.document_id) # Published canonical document id
|
|
117
129
|
```
|
|
118
130
|
|
|
119
131
|
### Access different chunk types
|
|
@@ -177,14 +189,14 @@ job = client.jobs.create(
|
|
|
177
189
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
178
190
|
)
|
|
179
191
|
|
|
180
|
-
print(job.document_id) # Persist this to update/archive the document later.
|
|
181
|
-
|
|
182
192
|
# Step 2: Upload file to presigned URL
|
|
183
193
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
184
194
|
|
|
185
195
|
# Step 3: Poll until done (adaptive backoff)
|
|
186
196
|
job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
|
|
187
197
|
|
|
198
|
+
print(job_result.document_id) # Persist this to update/archive the document later.
|
|
199
|
+
|
|
188
200
|
# Step 4: Download and parse results
|
|
189
201
|
result = client.jobs.load(job_result)
|
|
190
202
|
print(result.statistics)
|
|
@@ -39,6 +39,9 @@ from knowhere.types.document import Document, DocumentListResponse
|
|
|
39
39
|
from knowhere.types.job import Job, JobError, JobProgress, JobResult
|
|
40
40
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
41
41
|
from knowhere.types.retrieval import (
|
|
42
|
+
RetrievalChannel,
|
|
43
|
+
RetrievalFilterMode,
|
|
44
|
+
RetrievalSectionExclusion,
|
|
42
45
|
RetrievalSource,
|
|
43
46
|
RetrievalQueryResponse,
|
|
44
47
|
RetrievalResult,
|
|
@@ -97,6 +100,9 @@ __all__: list[str] = [
|
|
|
97
100
|
"Document",
|
|
98
101
|
"DocumentListResponse",
|
|
99
102
|
# Retrieval types
|
|
103
|
+
"RetrievalChannel",
|
|
104
|
+
"RetrievalFilterMode",
|
|
105
|
+
"RetrievalSectionExclusion",
|
|
100
106
|
"RetrievalSource",
|
|
101
107
|
"RetrievalQueryResponse",
|
|
102
108
|
"RetrievalResult",
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.1" # x-release-please-version
|
|
@@ -145,8 +145,12 @@ class Jobs(SyncAPIResource):
|
|
|
145
145
|
if not job_result.result_url:
|
|
146
146
|
raise InvalidStateError("JobResult does not have a result_url.")
|
|
147
147
|
result_url: str = job_result.result_url
|
|
148
|
+
namespace: Optional[str] = job_result.namespace
|
|
149
|
+
document_id: Optional[str] = job_result.document_id
|
|
148
150
|
else:
|
|
149
151
|
result_url = job_result
|
|
152
|
+
namespace = None
|
|
153
|
+
document_id = None
|
|
150
154
|
|
|
151
155
|
response: httpx.Response = self._client._client.get(
|
|
152
156
|
result_url, timeout=self._client.upload_timeout
|
|
@@ -154,7 +158,10 @@ class Jobs(SyncAPIResource):
|
|
|
154
158
|
response.raise_for_status()
|
|
155
159
|
zip_bytes: bytes = response.content
|
|
156
160
|
|
|
157
|
-
|
|
161
|
+
parsed_result = parseResultZip(zip_bytes, verify_checksum=verify_checksum)
|
|
162
|
+
parsed_result.namespace = namespace
|
|
163
|
+
parsed_result.document_id = document_id
|
|
164
|
+
return parsed_result
|
|
158
165
|
|
|
159
166
|
|
|
160
167
|
class AsyncJobs(AsyncAPIResource):
|
|
@@ -251,8 +258,12 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
251
258
|
if not job_result.result_url:
|
|
252
259
|
raise InvalidStateError("JobResult does not have a result_url.")
|
|
253
260
|
result_url: str = job_result.result_url
|
|
261
|
+
namespace: Optional[str] = job_result.namespace
|
|
262
|
+
document_id: Optional[str] = job_result.document_id
|
|
254
263
|
else:
|
|
255
264
|
result_url = job_result
|
|
265
|
+
namespace = None
|
|
266
|
+
document_id = None
|
|
256
267
|
|
|
257
268
|
response: httpx.Response = await self._client._client.get(
|
|
258
269
|
result_url, timeout=self._client.upload_timeout
|
|
@@ -260,4 +271,7 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
260
271
|
response.raise_for_status()
|
|
261
272
|
zip_bytes: bytes = response.content
|
|
262
273
|
|
|
263
|
-
|
|
274
|
+
parsed_result = parseResultZip(zip_bytes, verify_checksum=verify_checksum)
|
|
275
|
+
parsed_result.namespace = namespace
|
|
276
|
+
parsed_result.document_id = document_id
|
|
277
|
+
return parsed_result
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Retrieval resource for querying published documents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
|
|
8
|
+
from knowhere.types.retrieval import (
|
|
9
|
+
RetrievalChannel,
|
|
10
|
+
RetrievalFilterMode,
|
|
11
|
+
RetrievalQueryResponse,
|
|
12
|
+
RetrievalSectionExclusion,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Retrieval(SyncAPIResource):
|
|
17
|
+
"""Synchronous interface for ``/v1/retrieval`` endpoints."""
|
|
18
|
+
|
|
19
|
+
def query(
|
|
20
|
+
self,
|
|
21
|
+
*,
|
|
22
|
+
query: str,
|
|
23
|
+
namespace: Optional[str] = None,
|
|
24
|
+
top_k: Optional[int] = None,
|
|
25
|
+
data_type: Optional[int] = None,
|
|
26
|
+
signal_paths: Optional[list[str]] = None,
|
|
27
|
+
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
28
|
+
channels: Optional[list[RetrievalChannel]] = None,
|
|
29
|
+
channel_weights: Optional[dict[RetrievalChannel, float]] = None,
|
|
30
|
+
rerank: Optional[bool] = None,
|
|
31
|
+
threshold: Optional[float] = None,
|
|
32
|
+
internal_recall_k: Optional[int] = None,
|
|
33
|
+
exclude_document_ids: Optional[list[str]] = None,
|
|
34
|
+
exclude_sections: Optional[list[RetrievalSectionExclusion]] = None,
|
|
35
|
+
) -> RetrievalQueryResponse:
|
|
36
|
+
"""Query published documents in a namespace."""
|
|
37
|
+
body: Dict[str, Any] = {"query": query}
|
|
38
|
+
if namespace is not None:
|
|
39
|
+
body["namespace"] = namespace
|
|
40
|
+
if top_k is not None:
|
|
41
|
+
body["top_k"] = top_k
|
|
42
|
+
if data_type is not None:
|
|
43
|
+
body["data_type"] = data_type
|
|
44
|
+
if signal_paths is not None:
|
|
45
|
+
body["signal_paths"] = signal_paths
|
|
46
|
+
if filter_mode is not None:
|
|
47
|
+
body["filter_mode"] = filter_mode
|
|
48
|
+
if channels is not None:
|
|
49
|
+
body["channels"] = channels
|
|
50
|
+
if channel_weights is not None:
|
|
51
|
+
body["channel_weights"] = channel_weights
|
|
52
|
+
if rerank is not None:
|
|
53
|
+
body["rerank"] = rerank
|
|
54
|
+
if threshold is not None:
|
|
55
|
+
body["threshold"] = threshold
|
|
56
|
+
if internal_recall_k is not None:
|
|
57
|
+
body["internal_recall_k"] = internal_recall_k
|
|
58
|
+
if exclude_document_ids is not None:
|
|
59
|
+
body["exclude_document_ids"] = exclude_document_ids
|
|
60
|
+
if exclude_sections is not None:
|
|
61
|
+
body["exclude_sections"] = exclude_sections
|
|
62
|
+
|
|
63
|
+
return self._request(
|
|
64
|
+
"POST",
|
|
65
|
+
"v1/retrieval/query",
|
|
66
|
+
body=body,
|
|
67
|
+
cast_to=RetrievalQueryResponse,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class AsyncRetrieval(AsyncAPIResource):
|
|
72
|
+
"""Asynchronous interface for ``/v1/retrieval`` endpoints."""
|
|
73
|
+
|
|
74
|
+
async def query(
|
|
75
|
+
self,
|
|
76
|
+
*,
|
|
77
|
+
query: str,
|
|
78
|
+
namespace: Optional[str] = None,
|
|
79
|
+
top_k: Optional[int] = None,
|
|
80
|
+
data_type: Optional[int] = None,
|
|
81
|
+
signal_paths: Optional[list[str]] = None,
|
|
82
|
+
filter_mode: Optional[RetrievalFilterMode] = None,
|
|
83
|
+
channels: Optional[list[RetrievalChannel]] = None,
|
|
84
|
+
channel_weights: Optional[dict[RetrievalChannel, float]] = None,
|
|
85
|
+
rerank: Optional[bool] = None,
|
|
86
|
+
threshold: Optional[float] = None,
|
|
87
|
+
internal_recall_k: Optional[int] = None,
|
|
88
|
+
exclude_document_ids: Optional[list[str]] = None,
|
|
89
|
+
exclude_sections: Optional[list[RetrievalSectionExclusion]] = None,
|
|
90
|
+
) -> RetrievalQueryResponse:
|
|
91
|
+
"""Query published documents in a namespace."""
|
|
92
|
+
body: Dict[str, Any] = {"query": query}
|
|
93
|
+
if namespace is not None:
|
|
94
|
+
body["namespace"] = namespace
|
|
95
|
+
if top_k is not None:
|
|
96
|
+
body["top_k"] = top_k
|
|
97
|
+
if data_type is not None:
|
|
98
|
+
body["data_type"] = data_type
|
|
99
|
+
if signal_paths is not None:
|
|
100
|
+
body["signal_paths"] = signal_paths
|
|
101
|
+
if filter_mode is not None:
|
|
102
|
+
body["filter_mode"] = filter_mode
|
|
103
|
+
if channels is not None:
|
|
104
|
+
body["channels"] = channels
|
|
105
|
+
if channel_weights is not None:
|
|
106
|
+
body["channel_weights"] = channel_weights
|
|
107
|
+
if rerank is not None:
|
|
108
|
+
body["rerank"] = rerank
|
|
109
|
+
if threshold is not None:
|
|
110
|
+
body["threshold"] = threshold
|
|
111
|
+
if internal_recall_k is not None:
|
|
112
|
+
body["internal_recall_k"] = internal_recall_k
|
|
113
|
+
if exclude_document_ids is not None:
|
|
114
|
+
body["exclude_document_ids"] = exclude_document_ids
|
|
115
|
+
if exclude_sections is not None:
|
|
116
|
+
body["exclude_sections"] = exclude_sections
|
|
117
|
+
|
|
118
|
+
return await self._request(
|
|
119
|
+
"POST",
|
|
120
|
+
"v1/retrieval/query",
|
|
121
|
+
body=body,
|
|
122
|
+
cast_to=RetrievalQueryResponse,
|
|
123
|
+
)
|
|
@@ -6,6 +6,9 @@ from knowhere.types.document import Document, DocumentListResponse
|
|
|
6
6
|
from knowhere.types.job import Job, JobError, JobResult
|
|
7
7
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
8
8
|
from knowhere.types.retrieval import (
|
|
9
|
+
RetrievalChannel,
|
|
10
|
+
RetrievalFilterMode,
|
|
11
|
+
RetrievalSectionExclusion,
|
|
9
12
|
RetrievalSource,
|
|
10
13
|
RetrievalQueryResponse,
|
|
11
14
|
RetrievalResult,
|
|
@@ -38,6 +41,9 @@ __all__: list[str] = [
|
|
|
38
41
|
"Document",
|
|
39
42
|
"DocumentListResponse",
|
|
40
43
|
# retrieval
|
|
44
|
+
"RetrievalChannel",
|
|
45
|
+
"RetrievalFilterMode",
|
|
46
|
+
"RetrievalSectionExclusion",
|
|
41
47
|
"RetrievalSource",
|
|
42
48
|
"RetrievalQueryResponse",
|
|
43
49
|
"RetrievalResult",
|
|
@@ -272,6 +272,8 @@ class ParseResult:
|
|
|
272
272
|
kb_csv: Optional[str]
|
|
273
273
|
hierarchy_view_html: Optional[str]
|
|
274
274
|
raw_zip: bytes
|
|
275
|
+
namespace: Optional[str]
|
|
276
|
+
document_id: Optional[str]
|
|
275
277
|
|
|
276
278
|
def __init__(
|
|
277
279
|
self,
|
|
@@ -285,6 +287,8 @@ class ParseResult:
|
|
|
285
287
|
kb_csv: Optional[str],
|
|
286
288
|
hierarchy_view_html: Optional[str],
|
|
287
289
|
raw_zip: bytes,
|
|
290
|
+
namespace: Optional[str] = None,
|
|
291
|
+
document_id: Optional[str] = None,
|
|
288
292
|
) -> None:
|
|
289
293
|
self.manifest = manifest
|
|
290
294
|
self.chunks = chunks
|
|
@@ -295,6 +299,8 @@ class ParseResult:
|
|
|
295
299
|
self.kb_csv = kb_csv
|
|
296
300
|
self.hierarchy_view_html = hierarchy_view_html
|
|
297
301
|
self.raw_zip = raw_zip
|
|
302
|
+
self.namespace = namespace
|
|
303
|
+
self.document_id = document_id
|
|
298
304
|
|
|
299
305
|
# -- convenience properties --
|
|
300
306
|
|
|
@@ -2,11 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from typing import Optional
|
|
5
|
+
from typing import Literal, Optional, TypedDict
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
RetrievalChannel = Literal["path", "content", "term"]
|
|
11
|
+
RetrievalFilterMode = Literal["delete", "keep"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RetrievalSectionExclusion(TypedDict):
|
|
15
|
+
"""Section exclusion for follow-up retrieval queries."""
|
|
16
|
+
|
|
17
|
+
document_id: str
|
|
18
|
+
section_path: str
|
|
19
|
+
|
|
20
|
+
|
|
10
21
|
class RetrievalSource(BaseModel):
|
|
11
22
|
"""Caller-facing source reference attached to a retrieval result."""
|
|
12
23
|
|
|
@@ -30,4 +41,5 @@ class RetrievalQueryResponse(BaseModel):
|
|
|
30
41
|
|
|
31
42
|
namespace: str
|
|
32
43
|
query: str
|
|
44
|
+
router_used: Optional[str] = None
|
|
33
45
|
results: list[RetrievalResult]
|
|
@@ -72,7 +72,6 @@ def mock_job_response() -> Dict[str, Any]:
|
|
|
72
72
|
"status": "waiting-file",
|
|
73
73
|
"source_type": "file",
|
|
74
74
|
"namespace": "default",
|
|
75
|
-
"document_id": "doc_test123",
|
|
76
75
|
"data_id": None,
|
|
77
76
|
"created_at": "2025-01-01T00:00:00Z",
|
|
78
77
|
"upload_url": "https://storage.example.com/upload?token=abc",
|
|
@@ -36,7 +36,6 @@ class TestJobsCreate:
|
|
|
36
36
|
"status": "pending",
|
|
37
37
|
"source_type": "url",
|
|
38
38
|
"namespace": "support-center",
|
|
39
|
-
"document_id": "doc_123",
|
|
40
39
|
}
|
|
41
40
|
|
|
42
41
|
route = respx.post(JOBS_URL).mock(
|
|
@@ -53,7 +52,7 @@ class TestJobsCreate:
|
|
|
53
52
|
assert job.source_type == "url"
|
|
54
53
|
assert job.status == "pending"
|
|
55
54
|
assert job.namespace == "support-center"
|
|
56
|
-
assert job.document_id == "doc_123"
|
|
55
|
+
assert not hasattr(job, "document_id")
|
|
57
56
|
|
|
58
57
|
@respx.mock
|
|
59
58
|
def test_create_with_file_source(
|
|
@@ -87,7 +86,6 @@ class TestJobsCreate:
|
|
|
87
86
|
"status": "pending",
|
|
88
87
|
"source_type": "url",
|
|
89
88
|
"namespace": "support-center",
|
|
90
|
-
"document_id": "doc_123",
|
|
91
89
|
}
|
|
92
90
|
|
|
93
91
|
route = respx.post(JOBS_URL).mock(
|
|
@@ -284,6 +282,8 @@ class TestJobsLoad:
|
|
|
284
282
|
job_id="job_load",
|
|
285
283
|
status="done",
|
|
286
284
|
source_type="url",
|
|
285
|
+
namespace="support-center",
|
|
286
|
+
document_id="doc_123",
|
|
287
287
|
result_url=result_url,
|
|
288
288
|
)
|
|
289
289
|
|
|
@@ -293,3 +293,5 @@ class TestJobsLoad:
|
|
|
293
293
|
|
|
294
294
|
assert route.called
|
|
295
295
|
assert parse_result.manifest is not None
|
|
296
|
+
assert parse_result.namespace == "support-center"
|
|
297
|
+
assert parse_result.document_id == "doc_123"
|
|
@@ -55,7 +55,7 @@ class TestJobModel:
|
|
|
55
55
|
}
|
|
56
56
|
job: Job = Job(**data)
|
|
57
57
|
assert job.namespace == "support-center"
|
|
58
|
-
assert
|
|
58
|
+
assert "document_id" not in job.model_dump()
|
|
59
59
|
|
|
60
60
|
def test_from_dict_with_upload(self) -> None:
|
|
61
61
|
data: Dict[str, Any] = {
|
|
@@ -717,6 +717,11 @@ class TestParseResult:
|
|
|
717
717
|
assert stats.total_chunks == 3
|
|
718
718
|
assert stats.text_chunks == 1
|
|
719
719
|
|
|
720
|
+
def test_document_scope_defaults_to_none(self) -> None:
|
|
721
|
+
result: ParseResult = _build_parse_result()
|
|
722
|
+
assert result.namespace is None
|
|
723
|
+
assert result.document_id is None
|
|
724
|
+
|
|
720
725
|
def test_raw_zip_accessible(self) -> None:
|
|
721
726
|
result: ParseResult = _build_parse_result()
|
|
722
727
|
assert result.raw_zip == b"fake zip bytes"
|
|
@@ -42,6 +42,8 @@ def _make_done_response(job_id: str, result_url: str) -> Dict[str, Any]:
|
|
|
42
42
|
"job_id": job_id,
|
|
43
43
|
"status": "done",
|
|
44
44
|
"source_type": "url",
|
|
45
|
+
"namespace": "support-center",
|
|
46
|
+
"document_id": "doc_123",
|
|
45
47
|
"result_url": result_url,
|
|
46
48
|
}
|
|
47
49
|
|
|
@@ -96,6 +98,8 @@ class TestParseWithUrl:
|
|
|
96
98
|
|
|
97
99
|
assert parse_result.manifest is not None
|
|
98
100
|
assert parse_result.manifest.job_id == "job_test123"
|
|
101
|
+
assert parse_result.namespace == "support-center"
|
|
102
|
+
assert parse_result.document_id == "doc_123"
|
|
99
103
|
|
|
100
104
|
|
|
101
105
|
# ---------------------------------------------------------------------------
|
|
@@ -19,6 +19,7 @@ def _make_retrieval_response() -> Dict[str, Any]:
|
|
|
19
19
|
return {
|
|
20
20
|
"namespace": "support-center",
|
|
21
21
|
"query": "refund policy",
|
|
22
|
+
"router_used": "discovery+agent",
|
|
22
23
|
"results": [
|
|
23
24
|
{
|
|
24
25
|
"chunk_type": "text",
|
|
@@ -47,6 +48,14 @@ class TestRetrievalQuery:
|
|
|
47
48
|
query="refund policy",
|
|
48
49
|
namespace="support-center",
|
|
49
50
|
top_k=5,
|
|
51
|
+
data_type=6,
|
|
52
|
+
signal_paths=["Billing", "Refunds"],
|
|
53
|
+
filter_mode="keep",
|
|
54
|
+
channels=["path", "term"],
|
|
55
|
+
channel_weights={"path": 2.0, "term": 0.5},
|
|
56
|
+
rerank=True,
|
|
57
|
+
threshold=0.2,
|
|
58
|
+
internal_recall_k=25,
|
|
50
59
|
exclude_document_ids=["doc_old"],
|
|
51
60
|
exclude_sections=[
|
|
52
61
|
{
|
|
@@ -62,6 +71,14 @@ class TestRetrievalQuery:
|
|
|
62
71
|
"query": "refund policy",
|
|
63
72
|
"namespace": "support-center",
|
|
64
73
|
"top_k": 5,
|
|
74
|
+
"data_type": 6,
|
|
75
|
+
"signal_paths": ["Billing", "Refunds"],
|
|
76
|
+
"filter_mode": "keep",
|
|
77
|
+
"channels": ["path", "term"],
|
|
78
|
+
"channel_weights": {"path": 2.0, "term": 0.5},
|
|
79
|
+
"rerank": True,
|
|
80
|
+
"threshold": 0.2,
|
|
81
|
+
"internal_recall_k": 25,
|
|
65
82
|
"exclude_document_ids": ["doc_old"],
|
|
66
83
|
"exclude_sections": [
|
|
67
84
|
{
|
|
@@ -71,6 +88,7 @@ class TestRetrievalQuery:
|
|
|
71
88
|
],
|
|
72
89
|
}
|
|
73
90
|
assert response.namespace == "support-center"
|
|
91
|
+
assert response.router_used == "discovery+agent"
|
|
74
92
|
assert response.results[0].content == "Annual plans may be refunded within 30 days."
|
|
75
93
|
assert response.results[0].source.document_id == "doc_123"
|
|
76
94
|
assert response.results[0].source.source_file_name == "refund-policy.md"
|
|
@@ -107,4 +125,5 @@ class TestRetrievalQuery:
|
|
|
107
125
|
)
|
|
108
126
|
|
|
109
127
|
assert route.called
|
|
128
|
+
assert response.router_used == "discovery+agent"
|
|
110
129
|
assert response.results[0].source.document_id == "doc_123"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.0" # x-release-please-version
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
"""Retrieval resource for querying published documents."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from typing import Any, Dict, Optional
|
|
6
|
-
|
|
7
|
-
from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
|
|
8
|
-
from knowhere.types.retrieval import RetrievalQueryResponse
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class Retrieval(SyncAPIResource):
|
|
12
|
-
"""Synchronous interface for ``/v1/retrieval`` endpoints."""
|
|
13
|
-
|
|
14
|
-
def query(
|
|
15
|
-
self,
|
|
16
|
-
*,
|
|
17
|
-
query: str,
|
|
18
|
-
namespace: Optional[str] = None,
|
|
19
|
-
top_k: Optional[int] = None,
|
|
20
|
-
exclude_document_ids: Optional[list[str]] = None,
|
|
21
|
-
exclude_sections: Optional[list[dict[str, str]]] = None,
|
|
22
|
-
) -> RetrievalQueryResponse:
|
|
23
|
-
"""Query published documents in a namespace."""
|
|
24
|
-
body: Dict[str, Any] = {"query": query}
|
|
25
|
-
if namespace is not None:
|
|
26
|
-
body["namespace"] = namespace
|
|
27
|
-
if top_k is not None:
|
|
28
|
-
body["top_k"] = top_k
|
|
29
|
-
if exclude_document_ids is not None:
|
|
30
|
-
body["exclude_document_ids"] = exclude_document_ids
|
|
31
|
-
if exclude_sections is not None:
|
|
32
|
-
body["exclude_sections"] = exclude_sections
|
|
33
|
-
|
|
34
|
-
return self._request(
|
|
35
|
-
"POST",
|
|
36
|
-
"v1/retrieval/query",
|
|
37
|
-
body=body,
|
|
38
|
-
cast_to=RetrievalQueryResponse,
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
class AsyncRetrieval(AsyncAPIResource):
|
|
43
|
-
"""Asynchronous interface for ``/v1/retrieval`` endpoints."""
|
|
44
|
-
|
|
45
|
-
async def query(
|
|
46
|
-
self,
|
|
47
|
-
*,
|
|
48
|
-
query: str,
|
|
49
|
-
namespace: Optional[str] = None,
|
|
50
|
-
top_k: Optional[int] = None,
|
|
51
|
-
exclude_document_ids: Optional[list[str]] = None,
|
|
52
|
-
exclude_sections: Optional[list[dict[str, str]]] = None,
|
|
53
|
-
) -> RetrievalQueryResponse:
|
|
54
|
-
"""Query published documents in a namespace."""
|
|
55
|
-
body: Dict[str, Any] = {"query": query}
|
|
56
|
-
if namespace is not None:
|
|
57
|
-
body["namespace"] = namespace
|
|
58
|
-
if top_k is not None:
|
|
59
|
-
body["top_k"] = top_k
|
|
60
|
-
if exclude_document_ids is not None:
|
|
61
|
-
body["exclude_document_ids"] = exclude_document_ids
|
|
62
|
-
if exclude_sections is not None:
|
|
63
|
-
body["exclude_sections"] = exclude_sections
|
|
64
|
-
|
|
65
|
-
return await self._request(
|
|
66
|
-
"POST",
|
|
67
|
-
"v1/retrieval/query",
|
|
68
|
-
body=body,
|
|
69
|
-
cast_to=RetrievalQueryResponse,
|
|
70
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|