knowhere-python-sdk 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. knowhere_python_sdk-0.3.1/.release-please-manifest.json +3 -0
  2. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/CHANGELOG.md +8 -0
  3. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/PKG-INFO +21 -9
  4. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/README.md +20 -8
  5. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/pyproject.toml +1 -1
  6. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/__init__.py +6 -0
  7. knowhere_python_sdk-0.3.1/src/knowhere/_version.py +1 -0
  8. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/jobs.py +16 -2
  9. knowhere_python_sdk-0.3.1/src/knowhere/resources/retrieval.py +123 -0
  10. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/__init__.py +6 -0
  11. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/job.py +0 -1
  12. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/result.py +6 -0
  13. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/retrieval.py +13 -1
  14. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/conftest.py +0 -1
  15. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_jobs.py +5 -3
  16. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_models.py +6 -1
  17. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_parse.py +4 -0
  18. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_retrieval.py +19 -0
  19. knowhere_python_sdk-0.3.0/.release-please-manifest.json +0 -3
  20. knowhere_python_sdk-0.3.0/src/knowhere/_version.py +0 -1
  21. knowhere_python_sdk-0.3.0/src/knowhere/resources/retrieval.py +0 -70
  22. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/ci.yml +0 -0
  23. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/publish-pypi.yml +0 -0
  24. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.github/workflows/publish.yml +0 -0
  25. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/.gitignore +0 -0
  26. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/docs/usage.md +0 -0
  27. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/async_usage.py +0 -0
  28. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/error_handling.py +0 -0
  29. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/parse_file.py +0 -0
  30. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/parse_url.py +0 -0
  31. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/examples/step_by_step.py +0 -0
  32. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/release-please-config.json +0 -0
  33. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_base_client.py +0 -0
  34. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_client.py +0 -0
  35. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_constants.py +0 -0
  36. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_exceptions.py +0 -0
  37. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_logging.py +0 -0
  38. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_response.py +0 -0
  39. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/_types.py +0 -0
  40. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/__init__.py +0 -0
  41. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/polling.py +0 -0
  42. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/result_parser.py +0 -0
  43. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/lib/upload.py +0 -0
  44. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/py.typed +0 -0
  45. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/__init__.py +0 -0
  46. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/_base.py +0 -0
  47. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/resources/documents.py +0 -0
  48. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/document.py +0 -0
  49. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/params.py +0 -0
  50. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/src/knowhere/types/shared.py +0 -0
  51. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/__init__.py +0 -0
  52. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/fixtures/real_result.zip +0 -0
  53. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_client.py +0 -0
  54. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_documents.py +0 -0
  55. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_exceptions.py +0 -0
  56. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_logging.py +0 -0
  57. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_polling.py +0 -0
  58. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_result_parser.py +0 -0
  59. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_retry.py +0 -0
  60. {knowhere_python_sdk-0.3.0 → knowhere_python_sdk-0.3.1}/tests/test_upload.py +0 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.3.1"
3
+ }
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.3.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.0...v0.3.1) (2026-04-22)
4
+
5
+
6
+ ### Documentation
7
+
8
+ * clarify ParseResult document scope ([861084e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/861084e34144987994fa618ac0db262ce681b5a8))
9
+ * clarify ParseResult document scope ([bb14ad4](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bb14ad4077c41cbe74a5dd155995d6f9937962b8))
10
+
3
11
  ## [0.3.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.1...v0.3.0) (2026-04-21)
4
12
 
5
13
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -67,8 +67,9 @@ for chunk in result.text_chunks:
67
67
  ## Retrieval and document lifecycle
68
68
 
69
69
  New documents are published into a retrieval namespace. The server returns a
70
- stable `document_id` when you create a job; persist that value if you need to
71
- update or archive the same document later.
70
+ stable `document_id` after the job is published. `client.jobs.create(...)`
71
+ does not return a usable `document_id`; persist `job_result.document_id` if you
72
+ need to update or archive the same document later.
72
73
 
73
74
  ```python
74
75
  job = client.jobs.create(
@@ -77,7 +78,11 @@ job = client.jobs.create(
77
78
  namespace="support-center",
78
79
  )
79
80
 
80
- print(job.document_id) # "doc_..."
81
+ job_result = client.jobs.wait(job.job_id)
82
+ document_id = job_result.document_id
83
+
84
+ if document_id is None:
85
+ raise RuntimeError("Expected document_id after successful publication.")
81
86
  ```
82
87
 
83
88
  After the job is done and published, query the canonical document content:
@@ -87,8 +92,13 @@ response = client.retrieval.query(
87
92
  namespace="support-center",
88
93
  query="How do I reset Bluetooth pairing?",
89
94
  top_k=5,
95
+ channels=["path", "term"],
96
+ filter_mode="keep",
97
+ signal_paths=["Bluetooth", "Pairing"],
90
98
  )
91
99
 
100
+ print(response.router_used)
101
+
92
102
  for result in response.results:
93
103
  print(result.content)
94
104
  print(result.score)
@@ -101,13 +111,13 @@ Use `document_id` to update or archive a document:
101
111
  update_job = client.jobs.create(
102
112
  source_type="url",
103
113
  source_url="https://example.com/manual-v2.pdf",
104
- document_id=job.document_id,
114
+ document_id=document_id,
105
115
  )
106
116
 
107
- document = client.documents.get(job.document_id)
117
+ document = client.documents.get(document_id)
108
118
  print(document.status)
109
119
 
110
- client.documents.archive(job.document_id)
120
+ client.documents.archive(document_id)
111
121
  ```
112
122
 
113
123
  You can also list documents in a namespace:
@@ -146,6 +156,8 @@ result = client.parse(
146
156
 
147
157
  print(result.manifest.source_file_name) # "report.pdf"
148
158
  print(len(result.chunks)) # 152
159
+ print(result.namespace) # "default" or your explicit namespace
160
+ print(result.document_id) # Published canonical document id
149
161
  ```
150
162
 
151
163
  ### Access different chunk types
@@ -209,14 +221,14 @@ job = client.jobs.create(
209
221
  parsing_params={"model": "advanced", "ocr_enabled": True},
210
222
  )
211
223
 
212
- print(job.document_id) # Persist this to update/archive the document later.
213
-
214
224
  # Step 2: Upload file to presigned URL
215
225
  client.jobs.upload(job, file=Path("report.pdf"))
216
226
 
217
227
  # Step 3: Poll until done (adaptive backoff)
218
228
  job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
219
229
 
230
+ print(job_result.document_id) # Persist this to update/archive the document later.
231
+
220
232
  # Step 4: Download and parse results
221
233
  result = client.jobs.load(job_result)
222
234
  print(result.statistics)
@@ -35,8 +35,9 @@ for chunk in result.text_chunks:
35
35
  ## Retrieval and document lifecycle
36
36
 
37
37
  New documents are published into a retrieval namespace. The server returns a
38
- stable `document_id` when you create a job; persist that value if you need to
39
- update or archive the same document later.
38
+ stable `document_id` after the job is published. `client.jobs.create(...)`
39
+ does not return a usable `document_id`; persist `job_result.document_id` if you
40
+ need to update or archive the same document later.
40
41
 
41
42
  ```python
42
43
  job = client.jobs.create(
@@ -45,7 +46,11 @@ job = client.jobs.create(
45
46
  namespace="support-center",
46
47
  )
47
48
 
48
- print(job.document_id) # "doc_..."
49
+ job_result = client.jobs.wait(job.job_id)
50
+ document_id = job_result.document_id
51
+
52
+ if document_id is None:
53
+ raise RuntimeError("Expected document_id after successful publication.")
49
54
  ```
50
55
 
51
56
  After the job is done and published, query the canonical document content:
@@ -55,8 +60,13 @@ response = client.retrieval.query(
55
60
  namespace="support-center",
56
61
  query="How do I reset Bluetooth pairing?",
57
62
  top_k=5,
63
+ channels=["path", "term"],
64
+ filter_mode="keep",
65
+ signal_paths=["Bluetooth", "Pairing"],
58
66
  )
59
67
 
68
+ print(response.router_used)
69
+
60
70
  for result in response.results:
61
71
  print(result.content)
62
72
  print(result.score)
@@ -69,13 +79,13 @@ Use `document_id` to update or archive a document:
69
79
  update_job = client.jobs.create(
70
80
  source_type="url",
71
81
  source_url="https://example.com/manual-v2.pdf",
72
- document_id=job.document_id,
82
+ document_id=document_id,
73
83
  )
74
84
 
75
- document = client.documents.get(job.document_id)
85
+ document = client.documents.get(document_id)
76
86
  print(document.status)
77
87
 
78
- client.documents.archive(job.document_id)
88
+ client.documents.archive(document_id)
79
89
  ```
80
90
 
81
91
  You can also list documents in a namespace:
@@ -114,6 +124,8 @@ result = client.parse(
114
124
 
115
125
  print(result.manifest.source_file_name) # "report.pdf"
116
126
  print(len(result.chunks)) # 152
127
+ print(result.namespace) # "default" or your explicit namespace
128
+ print(result.document_id) # Published canonical document id
117
129
  ```
118
130
 
119
131
  ### Access different chunk types
@@ -177,14 +189,14 @@ job = client.jobs.create(
177
189
  parsing_params={"model": "advanced", "ocr_enabled": True},
178
190
  )
179
191
 
180
- print(job.document_id) # Persist this to update/archive the document later.
181
-
182
192
  # Step 2: Upload file to presigned URL
183
193
  client.jobs.upload(job, file=Path("report.pdf"))
184
194
 
185
195
  # Step 3: Poll until done (adaptive backoff)
186
196
  job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
187
197
 
198
+ print(job_result.document_id) # Persist this to update/archive the document later.
199
+
188
200
  # Step 4: Download and parse results
189
201
  result = client.jobs.load(job_result)
190
202
  print(result.statistics)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowhere-python-sdk"
7
- version = "0.3.0"
7
+ version = "0.3.1"
8
8
  description = "Official Python SDK for the Knowhere document parsing API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -39,6 +39,9 @@ from knowhere.types.document import Document, DocumentListResponse
39
39
  from knowhere.types.job import Job, JobError, JobProgress, JobResult
40
40
  from knowhere.types.params import ParsingParams, WebhookConfig
41
41
  from knowhere.types.retrieval import (
42
+ RetrievalChannel,
43
+ RetrievalFilterMode,
44
+ RetrievalSectionExclusion,
42
45
  RetrievalSource,
43
46
  RetrievalQueryResponse,
44
47
  RetrievalResult,
@@ -97,6 +100,9 @@ __all__: list[str] = [
97
100
  "Document",
98
101
  "DocumentListResponse",
99
102
  # Retrieval types
103
+ "RetrievalChannel",
104
+ "RetrievalFilterMode",
105
+ "RetrievalSectionExclusion",
100
106
  "RetrievalSource",
101
107
  "RetrievalQueryResponse",
102
108
  "RetrievalResult",
@@ -0,0 +1 @@
1
+ __version__ = "0.3.1" # x-release-please-version
@@ -145,8 +145,12 @@ class Jobs(SyncAPIResource):
145
145
  if not job_result.result_url:
146
146
  raise InvalidStateError("JobResult does not have a result_url.")
147
147
  result_url: str = job_result.result_url
148
+ namespace: Optional[str] = job_result.namespace
149
+ document_id: Optional[str] = job_result.document_id
148
150
  else:
149
151
  result_url = job_result
152
+ namespace = None
153
+ document_id = None
150
154
 
151
155
  response: httpx.Response = self._client._client.get(
152
156
  result_url, timeout=self._client.upload_timeout
@@ -154,7 +158,10 @@ class Jobs(SyncAPIResource):
154
158
  response.raise_for_status()
155
159
  zip_bytes: bytes = response.content
156
160
 
157
- return parseResultZip(zip_bytes, verify_checksum=verify_checksum)
161
+ parsed_result = parseResultZip(zip_bytes, verify_checksum=verify_checksum)
162
+ parsed_result.namespace = namespace
163
+ parsed_result.document_id = document_id
164
+ return parsed_result
158
165
 
159
166
 
160
167
  class AsyncJobs(AsyncAPIResource):
@@ -251,8 +258,12 @@ class AsyncJobs(AsyncAPIResource):
251
258
  if not job_result.result_url:
252
259
  raise InvalidStateError("JobResult does not have a result_url.")
253
260
  result_url: str = job_result.result_url
261
+ namespace: Optional[str] = job_result.namespace
262
+ document_id: Optional[str] = job_result.document_id
254
263
  else:
255
264
  result_url = job_result
265
+ namespace = None
266
+ document_id = None
256
267
 
257
268
  response: httpx.Response = await self._client._client.get(
258
269
  result_url, timeout=self._client.upload_timeout
@@ -260,4 +271,7 @@ class AsyncJobs(AsyncAPIResource):
260
271
  response.raise_for_status()
261
272
  zip_bytes: bytes = response.content
262
273
 
263
- return parseResultZip(zip_bytes, verify_checksum=verify_checksum)
274
+ parsed_result = parseResultZip(zip_bytes, verify_checksum=verify_checksum)
275
+ parsed_result.namespace = namespace
276
+ parsed_result.document_id = document_id
277
+ return parsed_result
@@ -0,0 +1,123 @@
1
+ """Retrieval resource for querying published documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
+ from knowhere.types.retrieval import (
9
+ RetrievalChannel,
10
+ RetrievalFilterMode,
11
+ RetrievalQueryResponse,
12
+ RetrievalSectionExclusion,
13
+ )
14
+
15
+
16
+ class Retrieval(SyncAPIResource):
17
+ """Synchronous interface for ``/v1/retrieval`` endpoints."""
18
+
19
+ def query(
20
+ self,
21
+ *,
22
+ query: str,
23
+ namespace: Optional[str] = None,
24
+ top_k: Optional[int] = None,
25
+ data_type: Optional[int] = None,
26
+ signal_paths: Optional[list[str]] = None,
27
+ filter_mode: Optional[RetrievalFilterMode] = None,
28
+ channels: Optional[list[RetrievalChannel]] = None,
29
+ channel_weights: Optional[dict[RetrievalChannel, float]] = None,
30
+ rerank: Optional[bool] = None,
31
+ threshold: Optional[float] = None,
32
+ internal_recall_k: Optional[int] = None,
33
+ exclude_document_ids: Optional[list[str]] = None,
34
+ exclude_sections: Optional[list[RetrievalSectionExclusion]] = None,
35
+ ) -> RetrievalQueryResponse:
36
+ """Query published documents in a namespace."""
37
+ body: Dict[str, Any] = {"query": query}
38
+ if namespace is not None:
39
+ body["namespace"] = namespace
40
+ if top_k is not None:
41
+ body["top_k"] = top_k
42
+ if data_type is not None:
43
+ body["data_type"] = data_type
44
+ if signal_paths is not None:
45
+ body["signal_paths"] = signal_paths
46
+ if filter_mode is not None:
47
+ body["filter_mode"] = filter_mode
48
+ if channels is not None:
49
+ body["channels"] = channels
50
+ if channel_weights is not None:
51
+ body["channel_weights"] = channel_weights
52
+ if rerank is not None:
53
+ body["rerank"] = rerank
54
+ if threshold is not None:
55
+ body["threshold"] = threshold
56
+ if internal_recall_k is not None:
57
+ body["internal_recall_k"] = internal_recall_k
58
+ if exclude_document_ids is not None:
59
+ body["exclude_document_ids"] = exclude_document_ids
60
+ if exclude_sections is not None:
61
+ body["exclude_sections"] = exclude_sections
62
+
63
+ return self._request(
64
+ "POST",
65
+ "v1/retrieval/query",
66
+ body=body,
67
+ cast_to=RetrievalQueryResponse,
68
+ )
69
+
70
+
71
+ class AsyncRetrieval(AsyncAPIResource):
72
+ """Asynchronous interface for ``/v1/retrieval`` endpoints."""
73
+
74
+ async def query(
75
+ self,
76
+ *,
77
+ query: str,
78
+ namespace: Optional[str] = None,
79
+ top_k: Optional[int] = None,
80
+ data_type: Optional[int] = None,
81
+ signal_paths: Optional[list[str]] = None,
82
+ filter_mode: Optional[RetrievalFilterMode] = None,
83
+ channels: Optional[list[RetrievalChannel]] = None,
84
+ channel_weights: Optional[dict[RetrievalChannel, float]] = None,
85
+ rerank: Optional[bool] = None,
86
+ threshold: Optional[float] = None,
87
+ internal_recall_k: Optional[int] = None,
88
+ exclude_document_ids: Optional[list[str]] = None,
89
+ exclude_sections: Optional[list[RetrievalSectionExclusion]] = None,
90
+ ) -> RetrievalQueryResponse:
91
+ """Query published documents in a namespace."""
92
+ body: Dict[str, Any] = {"query": query}
93
+ if namespace is not None:
94
+ body["namespace"] = namespace
95
+ if top_k is not None:
96
+ body["top_k"] = top_k
97
+ if data_type is not None:
98
+ body["data_type"] = data_type
99
+ if signal_paths is not None:
100
+ body["signal_paths"] = signal_paths
101
+ if filter_mode is not None:
102
+ body["filter_mode"] = filter_mode
103
+ if channels is not None:
104
+ body["channels"] = channels
105
+ if channel_weights is not None:
106
+ body["channel_weights"] = channel_weights
107
+ if rerank is not None:
108
+ body["rerank"] = rerank
109
+ if threshold is not None:
110
+ body["threshold"] = threshold
111
+ if internal_recall_k is not None:
112
+ body["internal_recall_k"] = internal_recall_k
113
+ if exclude_document_ids is not None:
114
+ body["exclude_document_ids"] = exclude_document_ids
115
+ if exclude_sections is not None:
116
+ body["exclude_sections"] = exclude_sections
117
+
118
+ return await self._request(
119
+ "POST",
120
+ "v1/retrieval/query",
121
+ body=body,
122
+ cast_to=RetrievalQueryResponse,
123
+ )
@@ -6,6 +6,9 @@ from knowhere.types.document import Document, DocumentListResponse
6
6
  from knowhere.types.job import Job, JobError, JobResult
7
7
  from knowhere.types.params import ParsingParams, WebhookConfig
8
8
  from knowhere.types.retrieval import (
9
+ RetrievalChannel,
10
+ RetrievalFilterMode,
11
+ RetrievalSectionExclusion,
9
12
  RetrievalSource,
10
13
  RetrievalQueryResponse,
11
14
  RetrievalResult,
@@ -38,6 +41,9 @@ __all__: list[str] = [
38
41
  "Document",
39
42
  "DocumentListResponse",
40
43
  # retrieval
44
+ "RetrievalChannel",
45
+ "RetrievalFilterMode",
46
+ "RetrievalSectionExclusion",
41
47
  "RetrievalSource",
42
48
  "RetrievalQueryResponse",
43
49
  "RetrievalResult",
@@ -41,7 +41,6 @@ class Job(BaseModel):
41
41
  status: str
42
42
  source_type: str
43
43
  namespace: Optional[str] = None
44
- document_id: Optional[str] = None
45
44
  data_id: Optional[str] = None
46
45
  created_at: Optional[datetime] = None
47
46
  upload_url: Optional[str] = None
@@ -272,6 +272,8 @@ class ParseResult:
272
272
  kb_csv: Optional[str]
273
273
  hierarchy_view_html: Optional[str]
274
274
  raw_zip: bytes
275
+ namespace: Optional[str]
276
+ document_id: Optional[str]
275
277
 
276
278
  def __init__(
277
279
  self,
@@ -285,6 +287,8 @@ class ParseResult:
285
287
  kb_csv: Optional[str],
286
288
  hierarchy_view_html: Optional[str],
287
289
  raw_zip: bytes,
290
+ namespace: Optional[str] = None,
291
+ document_id: Optional[str] = None,
288
292
  ) -> None:
289
293
  self.manifest = manifest
290
294
  self.chunks = chunks
@@ -295,6 +299,8 @@ class ParseResult:
295
299
  self.kb_csv = kb_csv
296
300
  self.hierarchy_view_html = hierarchy_view_html
297
301
  self.raw_zip = raw_zip
302
+ self.namespace = namespace
303
+ self.document_id = document_id
298
304
 
299
305
  # -- convenience properties --
300
306
 
@@ -2,11 +2,22 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Optional
5
+ from typing import Literal, Optional, TypedDict
6
6
 
7
7
  from pydantic import BaseModel
8
8
 
9
9
 
10
+ RetrievalChannel = Literal["path", "content", "term"]
11
+ RetrievalFilterMode = Literal["delete", "keep"]
12
+
13
+
14
+ class RetrievalSectionExclusion(TypedDict):
15
+ """Section exclusion for follow-up retrieval queries."""
16
+
17
+ document_id: str
18
+ section_path: str
19
+
20
+
10
21
  class RetrievalSource(BaseModel):
11
22
  """Caller-facing source reference attached to a retrieval result."""
12
23
 
@@ -30,4 +41,5 @@ class RetrievalQueryResponse(BaseModel):
30
41
 
31
42
  namespace: str
32
43
  query: str
44
+ router_used: Optional[str] = None
33
45
  results: list[RetrievalResult]
@@ -72,7 +72,6 @@ def mock_job_response() -> Dict[str, Any]:
72
72
  "status": "waiting-file",
73
73
  "source_type": "file",
74
74
  "namespace": "default",
75
- "document_id": "doc_test123",
76
75
  "data_id": None,
77
76
  "created_at": "2025-01-01T00:00:00Z",
78
77
  "upload_url": "https://storage.example.com/upload?token=abc",
@@ -36,7 +36,6 @@ class TestJobsCreate:
36
36
  "status": "pending",
37
37
  "source_type": "url",
38
38
  "namespace": "support-center",
39
- "document_id": "doc_123",
40
39
  }
41
40
 
42
41
  route = respx.post(JOBS_URL).mock(
@@ -53,7 +52,7 @@ class TestJobsCreate:
53
52
  assert job.source_type == "url"
54
53
  assert job.status == "pending"
55
54
  assert job.namespace == "support-center"
56
- assert job.document_id == "doc_123"
55
+ assert not hasattr(job, "document_id")
57
56
 
58
57
  @respx.mock
59
58
  def test_create_with_file_source(
@@ -87,7 +86,6 @@ class TestJobsCreate:
87
86
  "status": "pending",
88
87
  "source_type": "url",
89
88
  "namespace": "support-center",
90
- "document_id": "doc_123",
91
89
  }
92
90
 
93
91
  route = respx.post(JOBS_URL).mock(
@@ -284,6 +282,8 @@ class TestJobsLoad:
284
282
  job_id="job_load",
285
283
  status="done",
286
284
  source_type="url",
285
+ namespace="support-center",
286
+ document_id="doc_123",
287
287
  result_url=result_url,
288
288
  )
289
289
 
@@ -293,3 +293,5 @@ class TestJobsLoad:
293
293
 
294
294
  assert route.called
295
295
  assert parse_result.manifest is not None
296
+ assert parse_result.namespace == "support-center"
297
+ assert parse_result.document_id == "doc_123"
@@ -55,7 +55,7 @@ class TestJobModel:
55
55
  }
56
56
  job: Job = Job(**data)
57
57
  assert job.namespace == "support-center"
58
- assert job.document_id == "doc_123"
58
+ assert "document_id" not in job.model_dump()
59
59
 
60
60
  def test_from_dict_with_upload(self) -> None:
61
61
  data: Dict[str, Any] = {
@@ -717,6 +717,11 @@ class TestParseResult:
717
717
  assert stats.total_chunks == 3
718
718
  assert stats.text_chunks == 1
719
719
 
720
+ def test_document_scope_defaults_to_none(self) -> None:
721
+ result: ParseResult = _build_parse_result()
722
+ assert result.namespace is None
723
+ assert result.document_id is None
724
+
720
725
  def test_raw_zip_accessible(self) -> None:
721
726
  result: ParseResult = _build_parse_result()
722
727
  assert result.raw_zip == b"fake zip bytes"
@@ -42,6 +42,8 @@ def _make_done_response(job_id: str, result_url: str) -> Dict[str, Any]:
42
42
  "job_id": job_id,
43
43
  "status": "done",
44
44
  "source_type": "url",
45
+ "namespace": "support-center",
46
+ "document_id": "doc_123",
45
47
  "result_url": result_url,
46
48
  }
47
49
 
@@ -96,6 +98,8 @@ class TestParseWithUrl:
96
98
 
97
99
  assert parse_result.manifest is not None
98
100
  assert parse_result.manifest.job_id == "job_test123"
101
+ assert parse_result.namespace == "support-center"
102
+ assert parse_result.document_id == "doc_123"
99
103
 
100
104
 
101
105
  # ---------------------------------------------------------------------------
@@ -19,6 +19,7 @@ def _make_retrieval_response() -> Dict[str, Any]:
19
19
  return {
20
20
  "namespace": "support-center",
21
21
  "query": "refund policy",
22
+ "router_used": "discovery+agent",
22
23
  "results": [
23
24
  {
24
25
  "chunk_type": "text",
@@ -47,6 +48,14 @@ class TestRetrievalQuery:
47
48
  query="refund policy",
48
49
  namespace="support-center",
49
50
  top_k=5,
51
+ data_type=6,
52
+ signal_paths=["Billing", "Refunds"],
53
+ filter_mode="keep",
54
+ channels=["path", "term"],
55
+ channel_weights={"path": 2.0, "term": 0.5},
56
+ rerank=True,
57
+ threshold=0.2,
58
+ internal_recall_k=25,
50
59
  exclude_document_ids=["doc_old"],
51
60
  exclude_sections=[
52
61
  {
@@ -62,6 +71,14 @@ class TestRetrievalQuery:
62
71
  "query": "refund policy",
63
72
  "namespace": "support-center",
64
73
  "top_k": 5,
74
+ "data_type": 6,
75
+ "signal_paths": ["Billing", "Refunds"],
76
+ "filter_mode": "keep",
77
+ "channels": ["path", "term"],
78
+ "channel_weights": {"path": 2.0, "term": 0.5},
79
+ "rerank": True,
80
+ "threshold": 0.2,
81
+ "internal_recall_k": 25,
65
82
  "exclude_document_ids": ["doc_old"],
66
83
  "exclude_sections": [
67
84
  {
@@ -71,6 +88,7 @@ class TestRetrievalQuery:
71
88
  ],
72
89
  }
73
90
  assert response.namespace == "support-center"
91
+ assert response.router_used == "discovery+agent"
74
92
  assert response.results[0].content == "Annual plans may be refunded within 30 days."
75
93
  assert response.results[0].source.document_id == "doc_123"
76
94
  assert response.results[0].source.source_file_name == "refund-policy.md"
@@ -107,4 +125,5 @@ class TestRetrievalQuery:
107
125
  )
108
126
 
109
127
  assert route.called
128
+ assert response.router_used == "discovery+agent"
110
129
  assert response.results[0].source.document_id == "doc_123"
@@ -1,3 +0,0 @@
1
- {
2
- ".": "0.3.0"
3
- }
@@ -1 +0,0 @@
1
- __version__ = "0.3.0" # x-release-please-version
@@ -1,70 +0,0 @@
1
- """Retrieval resource for querying published documents."""
2
-
3
- from __future__ import annotations
4
-
5
- from typing import Any, Dict, Optional
6
-
7
- from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
- from knowhere.types.retrieval import RetrievalQueryResponse
9
-
10
-
11
- class Retrieval(SyncAPIResource):
12
- """Synchronous interface for ``/v1/retrieval`` endpoints."""
13
-
14
- def query(
15
- self,
16
- *,
17
- query: str,
18
- namespace: Optional[str] = None,
19
- top_k: Optional[int] = None,
20
- exclude_document_ids: Optional[list[str]] = None,
21
- exclude_sections: Optional[list[dict[str, str]]] = None,
22
- ) -> RetrievalQueryResponse:
23
- """Query published documents in a namespace."""
24
- body: Dict[str, Any] = {"query": query}
25
- if namespace is not None:
26
- body["namespace"] = namespace
27
- if top_k is not None:
28
- body["top_k"] = top_k
29
- if exclude_document_ids is not None:
30
- body["exclude_document_ids"] = exclude_document_ids
31
- if exclude_sections is not None:
32
- body["exclude_sections"] = exclude_sections
33
-
34
- return self._request(
35
- "POST",
36
- "v1/retrieval/query",
37
- body=body,
38
- cast_to=RetrievalQueryResponse,
39
- )
40
-
41
-
42
- class AsyncRetrieval(AsyncAPIResource):
43
- """Asynchronous interface for ``/v1/retrieval`` endpoints."""
44
-
45
- async def query(
46
- self,
47
- *,
48
- query: str,
49
- namespace: Optional[str] = None,
50
- top_k: Optional[int] = None,
51
- exclude_document_ids: Optional[list[str]] = None,
52
- exclude_sections: Optional[list[dict[str, str]]] = None,
53
- ) -> RetrievalQueryResponse:
54
- """Query published documents in a namespace."""
55
- body: Dict[str, Any] = {"query": query}
56
- if namespace is not None:
57
- body["namespace"] = namespace
58
- if top_k is not None:
59
- body["top_k"] = top_k
60
- if exclude_document_ids is not None:
61
- body["exclude_document_ids"] = exclude_document_ids
62
- if exclude_sections is not None:
63
- body["exclude_sections"] = exclude_sections
64
-
65
- return await self._request(
66
- "POST",
67
- "v1/retrieval/query",
68
- body=body,
69
- cast_to=RetrievalQueryResponse,
70
- )