knowhere-python-sdk 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere/__init__.py +21 -0
- knowhere/_client.py +43 -1
- knowhere/_exceptions.py +21 -3
- knowhere/_version.py +1 -1
- knowhere/lib/result_parser.py +32 -0
- knowhere/resources/__init__.py +10 -1
- knowhere/resources/documents.py +74 -0
- knowhere/resources/jobs.py +14 -0
- knowhere/resources/retrieval.py +70 -0
- knowhere/types/__init__.py +21 -0
- knowhere/types/document.py +28 -0
- knowhere/types/job.py +4 -0
- knowhere/types/result.py +100 -0
- knowhere/types/retrieval.py +33 -0
- {knowhere_python_sdk-0.2.0.dist-info → knowhere_python_sdk-0.3.0.dist-info}/METADATA +72 -1
- knowhere_python_sdk-0.3.0.dist-info/RECORD +29 -0
- knowhere_python_sdk-0.2.0.dist-info/RECORD +0 -25
- {knowhere_python_sdk-0.2.0.dist-info → knowhere_python_sdk-0.3.0.dist-info}/WHEEL +0 -0
knowhere/__init__.py
CHANGED
|
@@ -35,8 +35,14 @@ from knowhere._exceptions import (
|
|
|
35
35
|
)
|
|
36
36
|
from knowhere._types import PollProgressCallback, UploadProgressCallback
|
|
37
37
|
from knowhere._version import __version__
|
|
38
|
+
from knowhere.types.document import Document, DocumentListResponse
|
|
38
39
|
from knowhere.types.job import Job, JobError, JobProgress, JobResult
|
|
39
40
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
41
|
+
from knowhere.types.retrieval import (
|
|
42
|
+
RetrievalSource,
|
|
43
|
+
RetrievalQueryResponse,
|
|
44
|
+
RetrievalResult,
|
|
45
|
+
)
|
|
40
46
|
from knowhere.types.result import (
|
|
41
47
|
BaseChunk,
|
|
42
48
|
Checksum,
|
|
@@ -46,6 +52,10 @@ from knowhere.types.result import (
|
|
|
46
52
|
ImageFileInfo,
|
|
47
53
|
Manifest,
|
|
48
54
|
ParseResult,
|
|
55
|
+
ProcessingCost,
|
|
56
|
+
ProcessingMetadata,
|
|
57
|
+
ProcessingTiming,
|
|
58
|
+
SlimChunk,
|
|
49
59
|
Statistics,
|
|
50
60
|
TableChunk,
|
|
51
61
|
TableFileInfo,
|
|
@@ -83,6 +93,13 @@ __all__: list[str] = [
|
|
|
83
93
|
"JobError",
|
|
84
94
|
"JobProgress",
|
|
85
95
|
"JobResult",
|
|
96
|
+
# Document types
|
|
97
|
+
"Document",
|
|
98
|
+
"DocumentListResponse",
|
|
99
|
+
# Retrieval types
|
|
100
|
+
"RetrievalSource",
|
|
101
|
+
"RetrievalQueryResponse",
|
|
102
|
+
"RetrievalResult",
|
|
86
103
|
# Result types
|
|
87
104
|
"ParseResult",
|
|
88
105
|
"Manifest",
|
|
@@ -91,6 +108,10 @@ __all__: list[str] = [
|
|
|
91
108
|
"FileIndex",
|
|
92
109
|
"ImageFileInfo",
|
|
93
110
|
"TableFileInfo",
|
|
111
|
+
"ProcessingCost",
|
|
112
|
+
"ProcessingMetadata",
|
|
113
|
+
"ProcessingTiming",
|
|
114
|
+
"SlimChunk",
|
|
94
115
|
"BaseChunk",
|
|
95
116
|
"TextChunk",
|
|
96
117
|
"ImageChunk",
|
knowhere/_client.py
CHANGED
|
@@ -19,7 +19,9 @@ from knowhere._types import (
|
|
|
19
19
|
PollProgressCallback,
|
|
20
20
|
UploadProgressCallback,
|
|
21
21
|
)
|
|
22
|
+
from knowhere.resources.documents import AsyncDocuments, Documents
|
|
22
23
|
from knowhere.resources.jobs import AsyncJobs, Jobs
|
|
24
|
+
from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
|
|
23
25
|
from knowhere.types.job import Job, JobResult
|
|
24
26
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
25
27
|
from knowhere.types.result import ParseResult
|
|
@@ -42,6 +44,16 @@ class Knowhere(SyncAPIClient):
|
|
|
42
44
|
"""Access the jobs resource namespace."""
|
|
43
45
|
return Jobs(self)
|
|
44
46
|
|
|
47
|
+
@cached_property
|
|
48
|
+
def retrieval(self) -> Retrieval:
|
|
49
|
+
"""Access the retrieval resource namespace."""
|
|
50
|
+
return Retrieval(self)
|
|
51
|
+
|
|
52
|
+
@cached_property
|
|
53
|
+
def documents(self) -> Documents:
|
|
54
|
+
"""Access the documents resource namespace."""
|
|
55
|
+
return Documents(self)
|
|
56
|
+
|
|
45
57
|
# -- overloaded parse signatures --
|
|
46
58
|
|
|
47
59
|
@overload
|
|
@@ -50,6 +62,8 @@ class Knowhere(SyncAPIClient):
|
|
|
50
62
|
*,
|
|
51
63
|
url: str,
|
|
52
64
|
data_id: Optional[str] = ...,
|
|
65
|
+
namespace: Optional[str] = ...,
|
|
66
|
+
document_id: Optional[str] = ...,
|
|
53
67
|
parsing_params: Optional[ParsingParams] = ...,
|
|
54
68
|
webhook: Optional[WebhookConfig] = ...,
|
|
55
69
|
poll_interval: float = ...,
|
|
@@ -66,6 +80,8 @@ class Knowhere(SyncAPIClient):
|
|
|
66
80
|
file: Union[Path, BinaryIO, bytes],
|
|
67
81
|
file_name: Optional[str] = ...,
|
|
68
82
|
data_id: Optional[str] = ...,
|
|
83
|
+
namespace: Optional[str] = ...,
|
|
84
|
+
document_id: Optional[str] = ...,
|
|
69
85
|
parsing_params: Optional[ParsingParams] = ...,
|
|
70
86
|
webhook: Optional[WebhookConfig] = ...,
|
|
71
87
|
poll_interval: float = ...,
|
|
@@ -82,6 +98,8 @@ class Knowhere(SyncAPIClient):
|
|
|
82
98
|
file: Optional[Union[Path, BinaryIO, bytes]] = None,
|
|
83
99
|
file_name: Optional[str] = None,
|
|
84
100
|
data_id: Optional[str] = None,
|
|
101
|
+
namespace: Optional[str] = None,
|
|
102
|
+
document_id: Optional[str] = None,
|
|
85
103
|
parsing_params: Optional[ParsingParams] = None,
|
|
86
104
|
webhook: Optional[WebhookConfig] = None,
|
|
87
105
|
poll_interval: float = DEFAULT_POLL_INTERVAL,
|
|
@@ -105,6 +123,8 @@ class Knowhere(SyncAPIClient):
|
|
|
105
123
|
source_type="url",
|
|
106
124
|
source_url=url,
|
|
107
125
|
data_id=data_id,
|
|
126
|
+
namespace=namespace,
|
|
127
|
+
document_id=document_id,
|
|
108
128
|
parsing_params=parsing_params,
|
|
109
129
|
webhook=webhook,
|
|
110
130
|
)
|
|
@@ -116,6 +136,8 @@ class Knowhere(SyncAPIClient):
|
|
|
116
136
|
source_type="file",
|
|
117
137
|
file_name=resolved_name,
|
|
118
138
|
data_id=data_id,
|
|
139
|
+
namespace=namespace,
|
|
140
|
+
document_id=document_id,
|
|
119
141
|
parsing_params=parsing_params,
|
|
120
142
|
webhook=webhook,
|
|
121
143
|
)
|
|
@@ -149,12 +171,24 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
149
171
|
"""Access the async jobs resource namespace."""
|
|
150
172
|
return AsyncJobs(self)
|
|
151
173
|
|
|
174
|
+
@cached_property
|
|
175
|
+
def retrieval(self) -> AsyncRetrieval:
|
|
176
|
+
"""Access the async retrieval resource namespace."""
|
|
177
|
+
return AsyncRetrieval(self)
|
|
178
|
+
|
|
179
|
+
@cached_property
|
|
180
|
+
def documents(self) -> AsyncDocuments:
|
|
181
|
+
"""Access the async documents resource namespace."""
|
|
182
|
+
return AsyncDocuments(self)
|
|
183
|
+
|
|
152
184
|
@overload
|
|
153
185
|
async def parse(
|
|
154
186
|
self,
|
|
155
187
|
*,
|
|
156
188
|
url: str,
|
|
157
189
|
data_id: Optional[str] = ...,
|
|
190
|
+
namespace: Optional[str] = ...,
|
|
191
|
+
document_id: Optional[str] = ...,
|
|
158
192
|
parsing_params: Optional[ParsingParams] = ...,
|
|
159
193
|
webhook: Optional[WebhookConfig] = ...,
|
|
160
194
|
poll_interval: float = ...,
|
|
@@ -171,6 +205,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
171
205
|
file: Union[Path, BinaryIO, bytes],
|
|
172
206
|
file_name: Optional[str] = ...,
|
|
173
207
|
data_id: Optional[str] = ...,
|
|
208
|
+
namespace: Optional[str] = ...,
|
|
209
|
+
document_id: Optional[str] = ...,
|
|
174
210
|
parsing_params: Optional[ParsingParams] = ...,
|
|
175
211
|
webhook: Optional[WebhookConfig] = ...,
|
|
176
212
|
poll_interval: float = ...,
|
|
@@ -187,6 +223,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
187
223
|
file: Optional[Union[Path, BinaryIO, bytes]] = None,
|
|
188
224
|
file_name: Optional[str] = None,
|
|
189
225
|
data_id: Optional[str] = None,
|
|
226
|
+
namespace: Optional[str] = None,
|
|
227
|
+
document_id: Optional[str] = None,
|
|
190
228
|
parsing_params: Optional[ParsingParams] = None,
|
|
191
229
|
webhook: Optional[WebhookConfig] = None,
|
|
192
230
|
poll_interval: float = DEFAULT_POLL_INTERVAL,
|
|
@@ -206,6 +244,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
206
244
|
source_type="url",
|
|
207
245
|
source_url=url,
|
|
208
246
|
data_id=data_id,
|
|
247
|
+
namespace=namespace,
|
|
248
|
+
document_id=document_id,
|
|
209
249
|
parsing_params=parsing_params,
|
|
210
250
|
webhook=webhook,
|
|
211
251
|
)
|
|
@@ -217,6 +257,8 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
217
257
|
source_type="file",
|
|
218
258
|
file_name=resolved_name,
|
|
219
259
|
data_id=data_id,
|
|
260
|
+
namespace=namespace,
|
|
261
|
+
document_id=document_id,
|
|
220
262
|
parsing_params=parsing_params,
|
|
221
263
|
webhook=webhook,
|
|
222
264
|
)
|
|
@@ -232,4 +274,4 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
232
274
|
|
|
233
275
|
return await self.jobs.load(
|
|
234
276
|
job_result, verify_checksum=verify_checksum
|
|
235
|
-
)
|
|
277
|
+
)
|
knowhere/_exceptions.py
CHANGED
|
@@ -387,11 +387,29 @@ def makeStatusError(
|
|
|
387
387
|
response=response,
|
|
388
388
|
)
|
|
389
389
|
|
|
390
|
-
if exception_class
|
|
391
|
-
return
|
|
390
|
+
if exception_class is RateLimitError:
|
|
391
|
+
return RateLimitError(
|
|
392
392
|
status_code,
|
|
393
393
|
**common_kwargs,
|
|
394
|
-
retry_after=retry_after,
|
|
394
|
+
retry_after=retry_after,
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if exception_class is ServiceUnavailableError:
|
|
400
|
+
return ServiceUnavailableError(
|
|
401
|
+
status_code,
|
|
402
|
+
**common_kwargs,
|
|
403
|
+
retry_after=retry_after,
|
|
404
|
+
limit=limit,
|
|
405
|
+
period=period,
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
if exception_class is GatewayTimeoutError:
|
|
409
|
+
return GatewayTimeoutError(
|
|
410
|
+
status_code,
|
|
411
|
+
**common_kwargs,
|
|
412
|
+
retry_after=retry_after,
|
|
395
413
|
limit=limit,
|
|
396
414
|
period=period,
|
|
397
415
|
)
|
knowhere/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.0" # x-release-please-version
|
|
1
|
+
__version__ = "0.3.0" # x-release-please-version
|
knowhere/lib/result_parser.py
CHANGED
|
@@ -16,6 +16,7 @@ from knowhere.types.result import (
|
|
|
16
16
|
ImageChunk,
|
|
17
17
|
Manifest,
|
|
18
18
|
ParseResult,
|
|
19
|
+
SlimChunk,
|
|
19
20
|
TableChunk,
|
|
20
21
|
TextChunk,
|
|
21
22
|
TextChunkTokens,
|
|
@@ -134,6 +135,7 @@ def _buildChunks(
|
|
|
134
135
|
type="image",
|
|
135
136
|
content=raw.get("content", ""),
|
|
136
137
|
path=raw.get("path"),
|
|
138
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
137
139
|
length=metadata.get("length", raw.get("length", 0)),
|
|
138
140
|
file_path=file_path,
|
|
139
141
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -151,6 +153,7 @@ def _buildChunks(
|
|
|
151
153
|
type="table",
|
|
152
154
|
content=raw.get("content", ""),
|
|
153
155
|
path=raw.get("path"),
|
|
156
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
154
157
|
length=metadata.get("length", raw.get("length", 0)),
|
|
155
158
|
file_path=file_path,
|
|
156
159
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -167,10 +170,12 @@ def _buildChunks(
|
|
|
167
170
|
type="text",
|
|
168
171
|
content=raw.get("content", ""),
|
|
169
172
|
path=raw.get("path"),
|
|
173
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
170
174
|
length=metadata.get("length", raw.get("length", 0)),
|
|
171
175
|
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
172
176
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
173
177
|
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
+
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
174
179
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
175
180
|
)
|
|
176
181
|
|
|
@@ -230,12 +235,39 @@ def parseResultZip(
|
|
|
230
235
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
231
236
|
)
|
|
232
237
|
|
|
238
|
+
# -- Optimized sidecar files --
|
|
239
|
+
chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
|
|
240
|
+
parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
|
|
241
|
+
if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
|
|
242
|
+
raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
|
|
243
|
+
elif isinstance(parsed_chunks_slim, list):
|
|
244
|
+
raw_chunks_slim = parsed_chunks_slim
|
|
245
|
+
else:
|
|
246
|
+
raw_chunks_slim = []
|
|
247
|
+
chunks_slim: Optional[List[SlimChunk]] = (
|
|
248
|
+
[SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
|
|
249
|
+
if chunks_slim_text is not None
|
|
250
|
+
else None
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
|
|
254
|
+
toc_hierarchies: Optional[Any] = (
|
|
255
|
+
json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
|
|
259
|
+
hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
|
|
260
|
+
|
|
233
261
|
zf.close()
|
|
234
262
|
|
|
235
263
|
return ParseResult(
|
|
236
264
|
manifest=manifest,
|
|
237
265
|
chunks=chunks,
|
|
266
|
+
chunks_slim=chunks_slim,
|
|
238
267
|
full_markdown=full_markdown,
|
|
239
268
|
hierarchy=hierarchy,
|
|
269
|
+
toc_hierarchies=toc_hierarchies,
|
|
270
|
+
kb_csv=kb_csv,
|
|
271
|
+
hierarchy_view_html=hierarchy_view_html,
|
|
240
272
|
raw_zip=zip_bytes,
|
|
241
273
|
)
|
knowhere/resources/__init__.py
CHANGED
|
@@ -2,6 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from knowhere.resources.documents import AsyncDocuments, Documents
|
|
5
6
|
from knowhere.resources.jobs import AsyncJobs, Jobs
|
|
7
|
+
from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
|
|
6
8
|
|
|
7
|
-
__all__: list[str] = [
|
|
9
|
+
__all__: list[str] = [
|
|
10
|
+
"AsyncDocuments",
|
|
11
|
+
"AsyncJobs",
|
|
12
|
+
"AsyncRetrieval",
|
|
13
|
+
"Documents",
|
|
14
|
+
"Jobs",
|
|
15
|
+
"Retrieval",
|
|
16
|
+
]
|
|
knowhere/resources/documents.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Documents resource for canonical document lifecycle operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
|
|
8
|
+
from knowhere.types.document import Document, DocumentListResponse
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Documents(SyncAPIResource):
    """Blocking client for the ``/v1/documents`` endpoints.

    Each method performs exactly one call to ``self._request`` and passes the
    expected response model through ``cast_to``.
    """

    def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
        """List the canonical documents of a namespace.

        Args:
            namespace: Namespace to list. When ``None`` no query parameters
                are sent with the request.

        Returns:
            The result of ``GET v1/documents`` requested as a
            ``DocumentListResponse``.
        """
        # Only build a params mapping when there is something to send;
        # otherwise hand ``None`` to the transport.
        query_params: Optional[Dict[str, Any]] = (
            None if namespace is None else {"namespace": namespace}
        )
        return self._request(
            "GET",
            "v1/documents",
            params=query_params,
            cast_to=DocumentListResponse,
        )

    def get(self, document_id: str) -> Document:
        """Fetch a single canonical document.

        Args:
            document_id: Identifier placed verbatim into the request path
                (no escaping is applied here).

        Returns:
            The result of ``GET v1/documents/{document_id}`` requested as a
            ``Document``.
        """
        endpoint = f"v1/documents/{document_id}"
        return self._request("GET", endpoint, cast_to=Document)

    def archive(self, document_id: str) -> Document:
        """Archive a single canonical document.

        Args:
            document_id: Identifier placed verbatim into the request path
                (no escaping is applied here).

        Returns:
            The result of ``POST v1/documents/{document_id}/archive``
            requested as a ``Document``.
        """
        endpoint = f"v1/documents/{document_id}/archive"
        return self._request("POST", endpoint, cast_to=Document)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class AsyncDocuments(AsyncAPIResource):
    """Awaitable client for the ``/v1/documents`` endpoints.

    Each coroutine awaits exactly one call to ``self._request`` and passes the
    expected response model through ``cast_to``.
    """

    async def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
        """List the canonical documents of a namespace.

        Args:
            namespace: Namespace to list. When ``None`` no query parameters
                are sent with the request.

        Returns:
            The result of ``GET v1/documents`` requested as a
            ``DocumentListResponse``.
        """
        # Only build a params mapping when there is something to send;
        # otherwise hand ``None`` to the transport.
        query_params: Optional[Dict[str, Any]] = (
            None if namespace is None else {"namespace": namespace}
        )
        return await self._request(
            "GET",
            "v1/documents",
            params=query_params,
            cast_to=DocumentListResponse,
        )

    async def get(self, document_id: str) -> Document:
        """Fetch a single canonical document.

        Args:
            document_id: Identifier placed verbatim into the request path
                (no escaping is applied here).

        Returns:
            The result of ``GET v1/documents/{document_id}`` requested as a
            ``Document``.
        """
        endpoint = f"v1/documents/{document_id}"
        return await self._request("GET", endpoint, cast_to=Document)

    async def archive(self, document_id: str) -> Document:
        """Archive a single canonical document.

        Args:
            document_id: Identifier placed verbatim into the request path
                (no escaping is applied here).

        Returns:
            The result of ``POST v1/documents/{document_id}/archive``
            requested as a ``Document``.
        """
        endpoint = f"v1/documents/{document_id}/archive"
        return await self._request("POST", endpoint, cast_to=Document)
|
knowhere/resources/jobs.py
CHANGED
|
@@ -34,6 +34,8 @@ class Jobs(SyncAPIResource):
|
|
|
34
34
|
source_type: str,
|
|
35
35
|
source_url: Optional[str] = None,
|
|
36
36
|
file_name: Optional[str] = None,
|
|
37
|
+
namespace: Optional[str] = None,
|
|
38
|
+
document_id: Optional[str] = None,
|
|
37
39
|
data_id: Optional[str] = None,
|
|
38
40
|
parsing_params: Optional[ParsingParams] = None,
|
|
39
41
|
webhook: Optional[WebhookConfig] = None,
|
|
@@ -44,6 +46,8 @@ class Jobs(SyncAPIResource):
|
|
|
44
46
|
source_type: ``"url"`` or ``"file"``.
|
|
45
47
|
source_url: URL to parse (required when ``source_type="url"``).
|
|
46
48
|
file_name: Original filename (used when ``source_type="file"``).
|
|
49
|
+
namespace: Retrieval namespace. Defaults to the server ``default``.
|
|
50
|
+
document_id: Existing document ID when creating an update job.
|
|
47
51
|
data_id: Optional idempotency / correlation identifier.
|
|
48
52
|
parsing_params: Optional parsing configuration.
|
|
49
53
|
webhook: Optional webhook configuration.
|
|
@@ -56,6 +60,10 @@ class Jobs(SyncAPIResource):
|
|
|
56
60
|
body["source_url"] = source_url
|
|
57
61
|
if file_name is not None:
|
|
58
62
|
body["file_name"] = file_name
|
|
63
|
+
if namespace is not None:
|
|
64
|
+
body["namespace"] = namespace
|
|
65
|
+
if document_id is not None:
|
|
66
|
+
body["document_id"] = document_id
|
|
59
67
|
if data_id is not None:
|
|
60
68
|
body["data_id"] = data_id
|
|
61
69
|
if parsing_params is not None:
|
|
@@ -158,6 +166,8 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
158
166
|
source_type: str,
|
|
159
167
|
source_url: Optional[str] = None,
|
|
160
168
|
file_name: Optional[str] = None,
|
|
169
|
+
namespace: Optional[str] = None,
|
|
170
|
+
document_id: Optional[str] = None,
|
|
161
171
|
data_id: Optional[str] = None,
|
|
162
172
|
parsing_params: Optional[ParsingParams] = None,
|
|
163
173
|
webhook: Optional[WebhookConfig] = None,
|
|
@@ -168,6 +178,10 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
168
178
|
body["source_url"] = source_url
|
|
169
179
|
if file_name is not None:
|
|
170
180
|
body["file_name"] = file_name
|
|
181
|
+
if namespace is not None:
|
|
182
|
+
body["namespace"] = namespace
|
|
183
|
+
if document_id is not None:
|
|
184
|
+
body["document_id"] = document_id
|
|
171
185
|
if data_id is not None:
|
|
172
186
|
body["data_id"] = data_id
|
|
173
187
|
if parsing_params is not None:
|
|
knowhere/resources/retrieval.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Retrieval resource for querying published documents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
|
|
8
|
+
from knowhere.types.retrieval import RetrievalQueryResponse
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Retrieval(SyncAPIResource):
    """Blocking client for the ``/v1/retrieval`` endpoints."""

    def query(
        self,
        *,
        query: str,
        namespace: Optional[str] = None,
        top_k: Optional[int] = None,
        exclude_document_ids: Optional[list[str]] = None,
        exclude_sections: Optional[list[dict[str, str]]] = None,
    ) -> RetrievalQueryResponse:
        """Run a retrieval query against published documents.

        Args:
            query: Query text; always sent in the request body.
            namespace: Sent as ``namespace`` only when not ``None``.
            top_k: Sent as ``top_k`` only when not ``None``.
            exclude_document_ids: Sent as ``exclude_document_ids`` only when
                not ``None``.
            exclude_sections: Sent as ``exclude_sections`` only when not
                ``None``.

        Returns:
            The result of ``POST v1/retrieval/query`` requested as a
            ``RetrievalQueryResponse``.
        """
        # Optional fields in the order they appear in the request body;
        # anything left at ``None`` is omitted entirely.
        optional_fields: Dict[str, Any] = {
            "namespace": namespace,
            "top_k": top_k,
            "exclude_document_ids": exclude_document_ids,
            "exclude_sections": exclude_sections,
        }
        payload: Dict[str, Any] = {"query": query}
        payload.update(
            (key, value)
            for key, value in optional_fields.items()
            if value is not None
        )

        return self._request(
            "POST",
            "v1/retrieval/query",
            body=payload,
            cast_to=RetrievalQueryResponse,
        )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class AsyncRetrieval(AsyncAPIResource):
    """Awaitable client for the ``/v1/retrieval`` endpoints."""

    async def query(
        self,
        *,
        query: str,
        namespace: Optional[str] = None,
        top_k: Optional[int] = None,
        exclude_document_ids: Optional[list[str]] = None,
        exclude_sections: Optional[list[dict[str, str]]] = None,
    ) -> RetrievalQueryResponse:
        """Run a retrieval query against published documents.

        Args:
            query: Query text; always sent in the request body.
            namespace: Sent as ``namespace`` only when not ``None``.
            top_k: Sent as ``top_k`` only when not ``None``.
            exclude_document_ids: Sent as ``exclude_document_ids`` only when
                not ``None``.
            exclude_sections: Sent as ``exclude_sections`` only when not
                ``None``.

        Returns:
            The result of ``POST v1/retrieval/query`` requested as a
            ``RetrievalQueryResponse``.
        """
        # Optional fields in the order they appear in the request body;
        # anything left at ``None`` is omitted entirely.
        optional_fields: Dict[str, Any] = {
            "namespace": namespace,
            "top_k": top_k,
            "exclude_document_ids": exclude_document_ids,
            "exclude_sections": exclude_sections,
        }
        payload: Dict[str, Any] = {"query": query}
        payload.update(
            (key, value)
            for key, value in optional_fields.items()
            if value is not None
        )

        return await self._request(
            "POST",
            "v1/retrieval/query",
            body=payload,
            cast_to=RetrievalQueryResponse,
        )
|
knowhere/types/__init__.py
CHANGED
|
@@ -2,8 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from knowhere.types.document import Document, DocumentListResponse
|
|
5
6
|
from knowhere.types.job import Job, JobError, JobResult
|
|
6
7
|
from knowhere.types.params import ParsingParams, WebhookConfig
|
|
8
|
+
from knowhere.types.retrieval import (
|
|
9
|
+
RetrievalSource,
|
|
10
|
+
RetrievalQueryResponse,
|
|
11
|
+
RetrievalResult,
|
|
12
|
+
)
|
|
7
13
|
from knowhere.types.result import (
|
|
8
14
|
BaseChunk,
|
|
9
15
|
Checksum,
|
|
@@ -13,6 +19,10 @@ from knowhere.types.result import (
|
|
|
13
19
|
ImageFileInfo,
|
|
14
20
|
Manifest,
|
|
15
21
|
ParseResult,
|
|
22
|
+
ProcessingCost,
|
|
23
|
+
ProcessingMetadata,
|
|
24
|
+
ProcessingTiming,
|
|
25
|
+
SlimChunk,
|
|
16
26
|
Statistics,
|
|
17
27
|
TableChunk,
|
|
18
28
|
TableFileInfo,
|
|
@@ -24,6 +34,13 @@ __all__: list[str] = [
|
|
|
24
34
|
"Job",
|
|
25
35
|
"JobError",
|
|
26
36
|
"JobResult",
|
|
37
|
+
# document
|
|
38
|
+
"Document",
|
|
39
|
+
"DocumentListResponse",
|
|
40
|
+
# retrieval
|
|
41
|
+
"RetrievalSource",
|
|
42
|
+
"RetrievalQueryResponse",
|
|
43
|
+
"RetrievalResult",
|
|
27
44
|
# params
|
|
28
45
|
"ParsingParams",
|
|
29
46
|
"WebhookConfig",
|
|
@@ -36,6 +53,10 @@ __all__: list[str] = [
|
|
|
36
53
|
"ImageFileInfo",
|
|
37
54
|
"Manifest",
|
|
38
55
|
"ParseResult",
|
|
56
|
+
"ProcessingCost",
|
|
57
|
+
"ProcessingMetadata",
|
|
58
|
+
"ProcessingTiming",
|
|
59
|
+
"SlimChunk",
|
|
39
60
|
"Statistics",
|
|
40
61
|
"TableChunk",
|
|
41
62
|
"TableFileInfo",
|
|
knowhere/types/document.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Pydantic models for canonical document lifecycle responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Document(BaseModel):
    """State of one canonical document as returned by ``/v1/documents`` endpoints.

    ``document_id``, ``namespace`` and ``status`` are required; every other
    field is optional and defaults to ``None``.
    """

    # -- required --
    document_id: str
    namespace: str
    status: str  # plain string; the set of possible values is not defined here

    # -- optional --
    current_job_result_id: Optional[str] = None
    source_file_name: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    archived_at: Optional[datetime] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DocumentListResponse(BaseModel):
    """Payload of ``GET /v1/documents``: a namespace and its documents."""

    # Both fields are required.
    namespace: str
    documents: list[Document]
|
knowhere/types/job.py
CHANGED
|
@@ -40,6 +40,8 @@ class Job(BaseModel):
|
|
|
40
40
|
job_id: str
|
|
41
41
|
status: str
|
|
42
42
|
source_type: str
|
|
43
|
+
namespace: Optional[str] = None
|
|
44
|
+
document_id: Optional[str] = None
|
|
43
45
|
data_id: Optional[str] = None
|
|
44
46
|
created_at: Optional[datetime] = None
|
|
45
47
|
upload_url: Optional[str] = None
|
|
@@ -53,6 +55,8 @@ class JobResult(BaseModel):
|
|
|
53
55
|
job_id: str
|
|
54
56
|
status: str
|
|
55
57
|
source_type: str
|
|
58
|
+
namespace: Optional[str] = None
|
|
59
|
+
document_id: Optional[str] = None
|
|
56
60
|
data_id: Optional[str] = None
|
|
57
61
|
created_at: Optional[datetime] = None
|
|
58
62
|
progress: Optional[Union[float, JobProgress]] = None
|
knowhere/types/result.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
import json
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any, Dict, List, Optional, Union
|
|
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
|
|
|
92
93
|
|
|
93
94
|
chunks: Optional[str] = None
|
|
94
95
|
markdown: Optional[str] = None
|
|
96
|
+
chunks_slim: Optional[str] = None
|
|
95
97
|
kb_csv: Optional[str] = None
|
|
96
98
|
hierarchy: Optional[str] = None
|
|
99
|
+
toc_hierarchies: Optional[str] = None
|
|
100
|
+
hierarchy_view_html: Optional[str] = None
|
|
97
101
|
images: List[ImageFileInfo] = Field(default_factory=list)
|
|
98
102
|
tables: List[TableFileInfo] = Field(default_factory=list)
|
|
99
103
|
|
|
100
104
|
|
|
105
|
+
class ProcessingCost(BaseModel):
    """Billing section of the processing metadata emitted by manifest v2.

    Both amounts are optional and default to ``None``.
    """

    micro_dollars: Optional[int] = None  # NOTE(review): unit presumably 1e-6 USD — confirm
    credits: Optional[float] = None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ProcessingTiming(BaseModel):
    """Timing section of the processing metadata emitted by manifest v2.

    All fields are optional and default to ``None``.
    """

    # Timestamps are kept as plain strings; no datetime parsing happens here.
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    duration_ms: Optional[int] = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ProcessingMetadata(BaseModel):
    """Worker-side processing details emitted by manifest v2.

    All fields are optional and default to ``None``; ``cost`` and ``timing``
    are nested ``ProcessingCost`` / ``ProcessingTiming`` models.
    """

    page_count: Optional[int] = None
    billing_status: Optional[str] = None
    cost: Optional[ProcessingCost] = None
    timing: Optional[ProcessingTiming] = None
|
|
127
|
+
|
|
128
|
+
|
|
101
129
|
class Manifest(BaseModel):
|
|
102
130
|
"""Top-level manifest describing the result ZIP contents."""
|
|
103
131
|
|
|
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
|
|
|
106
134
|
data_id: Optional[str] = None
|
|
107
135
|
source_file_name: Optional[str] = None
|
|
108
136
|
processing_date: Optional[str] = None
|
|
137
|
+
processing: Optional[ProcessingMetadata] = None
|
|
109
138
|
checksum: Optional[Checksum] = None
|
|
110
139
|
statistics: Optional[Statistics] = None
|
|
111
140
|
files: Optional[FileIndex] = None
|
|
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
|
|
|
123
152
|
type: str
|
|
124
153
|
content: str = ""
|
|
125
154
|
path: Optional[str] = None
|
|
155
|
+
page_nums: Optional[List[int]] = None
|
|
126
156
|
|
|
127
157
|
|
|
128
158
|
TextChunkTokens: TypeAlias = List[str]
|
|
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
|
|
|
136
166
|
tokens: Optional[TextChunkTokens] = None
|
|
137
167
|
keywords: Optional[List[str]] = None
|
|
138
168
|
summary: Optional[str] = None
|
|
169
|
+
connect_to: Optional[List[Dict[str, Any]]] = None
|
|
139
170
|
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
|
|
140
171
|
|
|
141
172
|
|
|
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
|
|
|
210
241
|
Chunk = Union[TextChunk, ImageChunk, TableChunk]
|
|
211
242
|
|
|
212
243
|
|
|
244
|
+
class SlimChunk(BaseModel):
    """One reduced chunk record as stored in ``chunks_slim.json``.

    Only ``type`` is required; ``content`` falls back to an empty string and
    the remaining fields to ``None``.
    """

    type: str
    path: Optional[str] = None
    content: str = ""
    summary: Optional[str] = None
|
|
251
|
+
|
|
252
|
+
|
|
213
253
|
# ---------------------------------------------------------------------------
|
|
214
254
|
# ParseResult — the top-level object returned to the user
|
|
215
255
|
# ---------------------------------------------------------------------------
|
|
@@ -225,8 +265,12 @@ class ParseResult:
|
|
|
225
265
|
|
|
226
266
|
manifest: Manifest
|
|
227
267
|
chunks: List[Chunk]
|
|
268
|
+
chunks_slim: Optional[List[SlimChunk]]
|
|
228
269
|
full_markdown: str
|
|
229
270
|
hierarchy: Optional[Any]
|
|
271
|
+
toc_hierarchies: Optional[Any]
|
|
272
|
+
kb_csv: Optional[str]
|
|
273
|
+
hierarchy_view_html: Optional[str]
|
|
230
274
|
raw_zip: bytes
|
|
231
275
|
|
|
232
276
|
def __init__(
|
|
@@ -234,14 +278,22 @@ class ParseResult:
|
|
|
234
278
|
*,
|
|
235
279
|
manifest: Manifest,
|
|
236
280
|
chunks: List[Chunk],
|
|
281
|
+
chunks_slim: Optional[List[SlimChunk]],
|
|
237
282
|
full_markdown: str,
|
|
238
283
|
hierarchy: Optional[Any],
|
|
284
|
+
toc_hierarchies: Optional[Any],
|
|
285
|
+
kb_csv: Optional[str],
|
|
286
|
+
hierarchy_view_html: Optional[str],
|
|
239
287
|
raw_zip: bytes,
|
|
240
288
|
) -> None:
|
|
241
289
|
self.manifest = manifest
|
|
242
290
|
self.chunks = chunks
|
|
291
|
+
self.chunks_slim = chunks_slim
|
|
243
292
|
self.full_markdown = full_markdown
|
|
244
293
|
self.hierarchy = hierarchy
|
|
294
|
+
self.toc_hierarchies = toc_hierarchies
|
|
295
|
+
self.kb_csv = kb_csv
|
|
296
|
+
self.hierarchy_view_html = hierarchy_view_html
|
|
245
297
|
self.raw_zip = raw_zip
|
|
246
298
|
|
|
247
299
|
# -- convenience properties --
|
|
@@ -296,10 +348,58 @@ class ParseResult:
|
|
|
296
348
|
dir_path: Path = Path(directory)
|
|
297
349
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
298
350
|
|
|
351
|
+
# Manifest / chunks
|
|
352
|
+
manifest_path: Path = dir_path / "manifest.json"
|
|
353
|
+
manifest_path.write_text(
|
|
354
|
+
self.manifest.model_dump_json(indent=2),
|
|
355
|
+
encoding="utf-8",
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
chunks_path: Path = dir_path / "chunks.json"
|
|
359
|
+
chunks_path.write_text(
|
|
360
|
+
json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
|
|
361
|
+
encoding="utf-8",
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
if self.chunks_slim is not None:
|
|
365
|
+
chunks_slim_path: Path = dir_path / "chunks_slim.json"
|
|
366
|
+
chunks_slim_path.write_text(
|
|
367
|
+
json.dumps(
|
|
368
|
+
{"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
|
|
369
|
+
indent=2,
|
|
370
|
+
),
|
|
371
|
+
encoding="utf-8",
|
|
372
|
+
)
|
|
373
|
+
|
|
299
374
|
# Full markdown
|
|
300
375
|
md_path: Path = dir_path / "full.md"
|
|
301
376
|
md_path.write_text(self.full_markdown, encoding="utf-8")
|
|
302
377
|
|
|
378
|
+
if self.hierarchy is not None:
|
|
379
|
+
hierarchy_path: Path = dir_path / "hierarchy.json"
|
|
380
|
+
hierarchy_path.write_text(
|
|
381
|
+
json.dumps(self.hierarchy, indent=2),
|
|
382
|
+
encoding="utf-8",
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
if self.toc_hierarchies is not None:
|
|
386
|
+
toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
|
|
387
|
+
toc_hierarchies_path.write_text(
|
|
388
|
+
json.dumps(self.toc_hierarchies, indent=2),
|
|
389
|
+
encoding="utf-8",
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
if self.kb_csv is not None:
|
|
393
|
+
kb_csv_path: Path = dir_path / "kb.csv"
|
|
394
|
+
kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
|
|
395
|
+
|
|
396
|
+
if self.hierarchy_view_html is not None:
|
|
397
|
+
hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
|
|
398
|
+
hierarchy_view_path.write_text(
|
|
399
|
+
self.hierarchy_view_html,
|
|
400
|
+
encoding="utf-8",
|
|
401
|
+
)
|
|
402
|
+
|
|
303
403
|
# Images
|
|
304
404
|
if self.image_chunks:
|
|
305
405
|
images_dir: Path = dir_path / "images"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Pydantic models for retrieval query responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RetrievalSource(BaseModel):
|
|
11
|
+
"""Caller-facing source reference attached to a retrieval result."""
|
|
12
|
+
|
|
13
|
+
document_id: Optional[str] = None
|
|
14
|
+
source_file_name: Optional[str] = None
|
|
15
|
+
section_path: Optional[str] = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RetrievalResult(BaseModel):
|
|
19
|
+
"""Canonical chunk result returned by ``POST /v1/retrieval/query``."""
|
|
20
|
+
|
|
21
|
+
chunk_type: str
|
|
22
|
+
content: str
|
|
23
|
+
score: float
|
|
24
|
+
asset_url: Optional[str] = None
|
|
25
|
+
source: RetrievalSource
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RetrievalQueryResponse(BaseModel):
|
|
29
|
+
"""Response from ``POST /v1/retrieval/query``."""
|
|
30
|
+
|
|
31
|
+
namespace: str
|
|
32
|
+
query: str
|
|
33
|
+
results: list[RetrievalResult]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowhere-python-sdk
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Official Python SDK for the Knowhere document parsing API
|
|
5
5
|
Project-URL: Homepage, https://knowhereto.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.knowhereto.ai
|
|
@@ -64,6 +64,74 @@ for chunk in result.text_chunks:
|
|
|
64
64
|
print(chunk.content[:80])
|
|
65
65
|
```
|
|
66
66
|
|
|
67
|
+
## Retrieval and document lifecycle
|
|
68
|
+
|
|
69
|
+
New documents are published into a retrieval namespace. The server returns a
|
|
70
|
+
stable `document_id` when you create a job; persist that value if you need to
|
|
71
|
+
update or archive the same document later.
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
job = client.jobs.create(
|
|
75
|
+
source_type="url",
|
|
76
|
+
source_url="https://example.com/manual.pdf",
|
|
77
|
+
namespace="support-center",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
print(job.document_id) # "doc_..."
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
After the job is done and published, query the canonical document content:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
response = client.retrieval.query(
|
|
87
|
+
namespace="support-center",
|
|
88
|
+
query="How do I reset Bluetooth pairing?",
|
|
89
|
+
top_k=5,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
for result in response.results:
|
|
93
|
+
print(result.content)
|
|
94
|
+
print(result.score)
|
|
95
|
+
print(result.source.source_file_name, result.source.section_path)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Use `document_id` to update or archive a document:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
update_job = client.jobs.create(
|
|
102
|
+
source_type="url",
|
|
103
|
+
source_url="https://example.com/manual-v2.pdf",
|
|
104
|
+
document_id=job.document_id,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
document = client.documents.get(job.document_id)
|
|
108
|
+
print(document.status)
|
|
109
|
+
|
|
110
|
+
client.documents.archive(job.document_id)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
You can also list documents in a namespace:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
documents = client.documents.list(namespace="support-center")
|
|
117
|
+
for document in documents.documents:
|
|
118
|
+
print(document.document_id, document.status)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Retrieval supports exclusions when clients want follow-up results that avoid
|
|
122
|
+
previously used documents or sections:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
response = client.retrieval.query(
|
|
126
|
+
namespace="support-center",
|
|
127
|
+
query="battery charging",
|
|
128
|
+
exclude_document_ids=["doc_old"],
|
|
129
|
+
exclude_sections=[
|
|
130
|
+
{"document_id": "doc_123", "section_path": "Appendix / Legal"}
|
|
131
|
+
],
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
67
135
|
While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
|
|
68
136
|
|
|
69
137
|
### Parse a local file
|
|
@@ -137,9 +205,12 @@ from pathlib import Path
|
|
|
137
205
|
job = client.jobs.create(
|
|
138
206
|
source_type="file",
|
|
139
207
|
file_name="report.pdf",
|
|
208
|
+
namespace="support-center",
|
|
140
209
|
parsing_params={"model": "advanced", "ocr_enabled": True},
|
|
141
210
|
)
|
|
142
211
|
|
|
212
|
+
print(job.document_id) # Persist this to update/archive the document later.
|
|
213
|
+
|
|
143
214
|
# Step 2: Upload file to presigned URL
|
|
144
215
|
client.jobs.upload(job, file=Path("report.pdf"))
|
|
145
216
|
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
knowhere/__init__.py,sha256=FLKrentC0o9j1GZTSTlx7A1S_mWmXWceomBScdPbXg8,2854
|
|
2
|
+
knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
|
|
3
|
+
knowhere/_client.py,sha256=WYb-Fhi3x3nQYNfQG9eCgOpLc_wVyAawfPZWdZhFESg,9586
|
|
4
|
+
knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
|
|
5
|
+
knowhere/_exceptions.py,sha256=NflH7phh_bNFOJmQ758V4mZCAFQskpGXACMz2JIfFNU,11896
|
|
6
|
+
knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
|
|
7
|
+
knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
|
|
8
|
+
knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
|
|
9
|
+
knowhere/_version.py,sha256=BW_DctcKYzNRp1g4_DgZOvYCUcP3tNHyQKvZG3uopBM,50
|
|
10
|
+
knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
|
|
12
|
+
knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
|
|
13
|
+
knowhere/lib/result_parser.py,sha256=dR3knoMq-AFMAe0M3l0YgOM-OrtSmofSLaKZO0tgYao,9882
|
|
14
|
+
knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
|
|
15
|
+
knowhere/resources/__init__.py,sha256=ClsR-yn_0E4KOopD_Yq13wbPHHjl9s15XpydN-d2Rzo,393
|
|
16
|
+
knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
|
|
17
|
+
knowhere/resources/documents.py,sha256=u_gmrElvpMOABaHkEuTyaYvh4D_CG4pHZt23r8tivaY,2314
|
|
18
|
+
knowhere/resources/jobs.py,sha256=IhcJIQ_jho6dSsdJLSS0VRB6xuWw12BRJrjO_4NjEMs,9099
|
|
19
|
+
knowhere/resources/retrieval.py,sha256=yVCUWlOg6_ZJhXfiy5_AjqLZZm2Zx8ltqhj1kJ1gKIM,2302
|
|
20
|
+
knowhere/types/__init__.py,sha256=fKMA0NA2lZ-eag1FIeScnwz2ImV6LD-T3YJVfUBsA98,1290
|
|
21
|
+
knowhere/types/document.py,sha256=LbFleglvm538vSDDho82j7fVxvgMXdIVm9wrWemLShY,711
|
|
22
|
+
knowhere/types/job.py,sha256=_ORhgn_tnvQm_gyrCS39EsDV3dOKImBeJXGjEq3JLag,2510
|
|
23
|
+
knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
|
|
24
|
+
knowhere/types/result.py,sha256=UmoxaFmxt2bhrP-2O6jYL89C2WuwZh2xcyyHl46Q1_Y,12925
|
|
25
|
+
knowhere/types/retrieval.py,sha256=-YzsKyusajVdGx4v1lR9Kts-Fh5D41uXf17lSL4ZyJM,777
|
|
26
|
+
knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
|
|
27
|
+
knowhere_python_sdk-0.3.0.dist-info/METADATA,sha256=T7MT_NBl2sqb_FcBuxU97Eacm8YDXn8jcP3DLRnLQH0,7922
|
|
28
|
+
knowhere_python_sdk-0.3.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
29
|
+
knowhere_python_sdk-0.3.0.dist-info/RECORD,,
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
knowhere/__init__.py,sha256=EuIpP3FtDeszonVAXMxZimjRd9iUcQ8wA53h1f27S3k,2343
|
|
2
|
-
knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
|
|
3
|
-
knowhere/_client.py,sha256=MGU1QsyjKrzTiitm891wgNCq6JLf3DR7y7zhkil_p2E,8027
|
|
4
|
-
knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
|
|
5
|
-
knowhere/_exceptions.py,sha256=yg-4pK7AP6uUPxxyggxf8spQeXgFTpKRwELsHjCQycg,11489
|
|
6
|
-
knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
|
|
7
|
-
knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
|
|
8
|
-
knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
|
|
9
|
-
knowhere/_version.py,sha256=piZV5NEcs0VIotCxwaWvzWE2ASUv5tox5ye8ogIRiIk,50
|
|
10
|
-
knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
|
|
12
|
-
knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
|
|
13
|
-
knowhere/lib/result_parser.py,sha256=U-DK3SDKrbUY0g_-ad04bsbra1mhYy9FJ2opa1n2bTU,8406
|
|
14
|
-
knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
|
|
15
|
-
knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
|
|
16
|
-
knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
|
|
17
|
-
knowhere/resources/jobs.py,sha256=45P4rZ9HMnTdgcso2AwQ6lDA9U80HGsgOU0jZLBIMFU,8460
|
|
18
|
-
knowhere/types/__init__.py,sha256=OwTxpa9uo0GOEJ6Ds6rqEmXl86O49ByS6M7cscMwQo8,791
|
|
19
|
-
knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
|
|
20
|
-
knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
|
|
21
|
-
knowhere/types/result.py,sha256=Lmtaa0wQymBzAm6hXoZZr6dlfwf0WCMEda6Gd8nDIdw,9628
|
|
22
|
-
knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
|
|
23
|
-
knowhere_python_sdk-0.2.0.dist-info/METADATA,sha256=10dnumfebnQ3VmPHmYuDexWTCdqdFLi-eAaF8FwcNpc,6115
|
|
24
|
-
knowhere_python_sdk-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
25
|
-
knowhere_python_sdk-0.2.0.dist-info/RECORD,,
|
|
File without changes
|