knowhere-python-sdk 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere/__init__.py +4 -0
- knowhere/_base_client.py +96 -31
- knowhere/_client.py +5 -4
- knowhere/_constants.py +1 -0
- knowhere/_exceptions.py +86 -11
- knowhere/_version.py +1 -1
- knowhere/lib/result_parser.py +37 -2
- knowhere/lib/upload.py +118 -42
- knowhere/resources/jobs.py +5 -4
- knowhere/types/result.py +9 -3
- {knowhere_python_sdk-0.1.0.dist-info → knowhere_python_sdk-0.2.0.dist-info}/METADATA +47 -134
- knowhere_python_sdk-0.2.0.dist-info/RECORD +25 -0
- {knowhere_python_sdk-0.1.0.dist-info → knowhere_python_sdk-0.2.0.dist-info}/WHEEL +1 -1
- knowhere_python_sdk-0.1.0.dist-info/RECORD +0 -25
knowhere/__init__.py
CHANGED
|
@@ -22,6 +22,7 @@ from knowhere._exceptions import (
|
|
|
22
22
|
ConflictError,
|
|
23
23
|
GatewayTimeoutError,
|
|
24
24
|
InternalServerError,
|
|
25
|
+
InvalidStateError,
|
|
25
26
|
JobFailedError,
|
|
26
27
|
KnowhereError,
|
|
27
28
|
NotFoundError,
|
|
@@ -30,6 +31,7 @@ from knowhere._exceptions import (
|
|
|
30
31
|
PollingTimeoutError,
|
|
31
32
|
RateLimitError,
|
|
32
33
|
ServiceUnavailableError,
|
|
34
|
+
ValidationError,
|
|
33
35
|
)
|
|
34
36
|
from knowhere._types import PollProgressCallback, UploadProgressCallback
|
|
35
37
|
from knowhere._version import __version__
|
|
@@ -58,6 +60,8 @@ __all__: list[str] = [
|
|
|
58
60
|
"__version__",
|
|
59
61
|
# Exceptions
|
|
60
62
|
"KnowhereError",
|
|
63
|
+
"ValidationError",
|
|
64
|
+
"InvalidStateError",
|
|
61
65
|
"APIConnectionError",
|
|
62
66
|
"APITimeoutError",
|
|
63
67
|
"APIStatusError",
|
knowhere/_base_client.py
CHANGED
|
@@ -25,6 +25,7 @@ from knowhere._constants import (
|
|
|
25
25
|
from knowhere._exceptions import (
|
|
26
26
|
APIConnectionError,
|
|
27
27
|
APITimeoutError,
|
|
28
|
+
ValidationError,
|
|
28
29
|
makeStatusError,
|
|
29
30
|
)
|
|
30
31
|
from knowhere._logging import getLogger, redactSensitiveHeaders
|
|
@@ -35,17 +36,23 @@ T = TypeVar("T")
|
|
|
35
36
|
|
|
36
37
|
_logger = getLogger()
|
|
37
38
|
|
|
38
|
-
# Error codes that are safe to retry
|
|
39
|
-
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"internal_server_error",
|
|
44
|
-
"timeout",
|
|
39
|
+
# Error codes that are always safe to retry (matches server ALWAYS_RETRYABLE_ERROR_CODES)
|
|
40
|
+
_ALWAYS_RETRYABLE_ERROR_CODES: frozenset[str] = frozenset({
|
|
41
|
+
"ABORTED", # 409 - Concurrency conflict
|
|
42
|
+
"UNAVAILABLE", # 503 - Service temporarily down
|
|
43
|
+
"DEADLINE_EXCEEDED", # 504 - Timeout
|
|
45
44
|
})
|
|
46
45
|
|
|
47
|
-
#
|
|
48
|
-
|
|
46
|
+
# RESOURCE_EXHAUSTED (429) is conditionally retryable:
|
|
47
|
+
# - Rate limit: details.retry_after present → RETRY
|
|
48
|
+
# - Quota exceeded: no retry_after → DO NOT RETRY
|
|
49
|
+
_CONDITIONALLY_RETRYABLE_ERROR_CODE: str = "RESOURCE_EXHAUSTED"
|
|
50
|
+
|
|
51
|
+
# HTTP status codes that are always safe to retry
|
|
52
|
+
_ALWAYS_RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({409, 502, 503, 504})
|
|
53
|
+
|
|
54
|
+
# HTTP status code that is conditionally retryable (only with retry_after)
|
|
55
|
+
_CONDITIONALLY_RETRYABLE_STATUS_CODE: int = 429
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
class BaseClient:
|
|
@@ -71,7 +78,7 @@ class BaseClient:
|
|
|
71
78
|
# Resolve: arg > env > default
|
|
72
79
|
resolved_key: Optional[str] = api_key or os.environ.get(ENV_API_KEY)
|
|
73
80
|
if not resolved_key:
|
|
74
|
-
raise
|
|
81
|
+
raise ValidationError(
|
|
75
82
|
"An API key must be provided via the 'api_key' argument "
|
|
76
83
|
f"or the {ENV_API_KEY} environment variable."
|
|
77
84
|
)
|
|
@@ -122,12 +129,68 @@ class BaseClient:
|
|
|
122
129
|
self,
|
|
123
130
|
status_code: int,
|
|
124
131
|
error_code: Optional[str] = None,
|
|
125
|
-
details: Optional[Any] = None,
|
|
132
|
+
details: Optional[Dict[str, Any]] = None,
|
|
126
133
|
) -> bool:
|
|
127
|
-
"""Decide whether a request should be retried.
|
|
128
|
-
|
|
134
|
+
"""Decide whether a request should be retried.
|
|
135
|
+
|
|
136
|
+
Follows server-side retry semantics:
|
|
137
|
+
- ABORTED, UNAVAILABLE, DEADLINE_EXCEEDED → always retry
|
|
138
|
+
- RESOURCE_EXHAUSTED (429) → retry only if details.retry_after present
|
|
139
|
+
- All other errors → never retry
|
|
140
|
+
"""
|
|
141
|
+
if error_code:
|
|
142
|
+
if error_code in _ALWAYS_RETRYABLE_ERROR_CODES:
|
|
143
|
+
return True
|
|
144
|
+
if error_code == _CONDITIONALLY_RETRYABLE_ERROR_CODE:
|
|
145
|
+
return self._hasRetryAfter(details)
|
|
146
|
+
return False
|
|
147
|
+
|
|
148
|
+
# Fallback to status code when error_code is unavailable
|
|
149
|
+
if status_code in _ALWAYS_RETRYABLE_STATUS_CODES:
|
|
129
150
|
return True
|
|
130
|
-
|
|
151
|
+
if status_code == _CONDITIONALLY_RETRYABLE_STATUS_CODE:
|
|
152
|
+
return self._hasRetryAfter(details)
|
|
153
|
+
return False
|
|
154
|
+
|
|
155
|
+
@staticmethod
|
|
156
|
+
def _hasRetryAfter(details: Optional[Dict[str, Any]]) -> bool:
|
|
157
|
+
"""Check if details contains a retry_after hint."""
|
|
158
|
+
if not isinstance(details, dict):
|
|
159
|
+
return False
|
|
160
|
+
retry_after: Any = details.get("retry_after")
|
|
161
|
+
return retry_after is not None
|
|
162
|
+
|
|
163
|
+
@staticmethod
|
|
164
|
+
def _extractRetryAfter(
|
|
165
|
+
error_body: Optional[Dict[str, Any]],
|
|
166
|
+
response: httpx.Response,
|
|
167
|
+
) -> Optional[float]:
|
|
168
|
+
"""Extract retry_after from the response body or Retry-After header.
|
|
169
|
+
|
|
170
|
+
The server puts retry_after in ``error.details.retry_after``.
|
|
171
|
+
Falls back to the HTTP ``Retry-After`` header.
|
|
172
|
+
"""
|
|
173
|
+
# Prefer body: error.details.retry_after
|
|
174
|
+
if isinstance(error_body, dict):
|
|
175
|
+
err_obj: Any = error_body.get("error", error_body)
|
|
176
|
+
if isinstance(err_obj, dict):
|
|
177
|
+
details: Any = err_obj.get("details")
|
|
178
|
+
if isinstance(details, dict):
|
|
179
|
+
raw: Any = details.get("retry_after")
|
|
180
|
+
if raw is not None:
|
|
181
|
+
try:
|
|
182
|
+
return float(raw)
|
|
183
|
+
except (ValueError, TypeError):
|
|
184
|
+
pass
|
|
185
|
+
|
|
186
|
+
# Fallback: HTTP Retry-After header
|
|
187
|
+
header_raw: Optional[str] = response.headers.get("retry-after")
|
|
188
|
+
if header_raw is not None:
|
|
189
|
+
try:
|
|
190
|
+
return float(header_raw)
|
|
191
|
+
except (ValueError, TypeError):
|
|
192
|
+
pass
|
|
193
|
+
return None
|
|
131
194
|
|
|
132
195
|
def _calculateRetryDelay(
|
|
133
196
|
self,
|
|
@@ -257,24 +320,24 @@ class SyncAPIClient(BaseClient):
|
|
|
257
320
|
response
|
|
258
321
|
)
|
|
259
322
|
error_code: Optional[str] = None
|
|
323
|
+
error_details: Optional[Dict[str, Any]] = None
|
|
260
324
|
if isinstance(error_body, dict):
|
|
261
325
|
err_obj: Any = error_body.get("error", error_body)
|
|
262
326
|
if isinstance(err_obj, dict):
|
|
263
327
|
error_code = err_obj.get("code")
|
|
328
|
+
raw_details: Any = err_obj.get("details")
|
|
329
|
+
if isinstance(raw_details, dict):
|
|
330
|
+
error_details = raw_details
|
|
264
331
|
|
|
265
332
|
if (
|
|
266
333
|
attempt < self.max_retries
|
|
267
|
-
and self._shouldRetry(
|
|
334
|
+
and self._shouldRetry(
|
|
335
|
+
response.status_code, error_code, error_details
|
|
336
|
+
)
|
|
268
337
|
):
|
|
269
|
-
|
|
270
|
-
|
|
338
|
+
retry_after_val: Optional[float] = self._extractRetryAfter(
|
|
339
|
+
error_body, response
|
|
271
340
|
)
|
|
272
|
-
retry_after_val: Optional[float] = None
|
|
273
|
-
if retry_after_raw:
|
|
274
|
-
try:
|
|
275
|
-
retry_after_val = float(retry_after_raw)
|
|
276
|
-
except (ValueError, TypeError):
|
|
277
|
-
pass
|
|
278
341
|
delay = self._calculateRetryDelay(attempt, retry_after_val)
|
|
279
342
|
_logger.warning(
|
|
280
343
|
"Retryable error %d on attempt %d/%d, retrying in %.1fs",
|
|
@@ -404,22 +467,24 @@ class AsyncAPIClient(BaseClient):
|
|
|
404
467
|
|
|
405
468
|
error_body: Optional[Dict[str, Any]] = self._parseErrorResponse(response)
|
|
406
469
|
error_code: Optional[str] = None
|
|
470
|
+
error_details: Optional[Dict[str, Any]] = None
|
|
407
471
|
if isinstance(error_body, dict):
|
|
408
472
|
err_obj: Any = error_body.get("error", error_body)
|
|
409
473
|
if isinstance(err_obj, dict):
|
|
410
474
|
error_code = err_obj.get("code")
|
|
475
|
+
raw_details: Any = err_obj.get("details")
|
|
476
|
+
if isinstance(raw_details, dict):
|
|
477
|
+
error_details = raw_details
|
|
411
478
|
|
|
412
479
|
if (
|
|
413
480
|
attempt < self.max_retries
|
|
414
|
-
and self._shouldRetry(
|
|
481
|
+
and self._shouldRetry(
|
|
482
|
+
response.status_code, error_code, error_details
|
|
483
|
+
)
|
|
415
484
|
):
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
try:
|
|
420
|
-
retry_after_val = float(retry_after_raw)
|
|
421
|
-
except (ValueError, TypeError):
|
|
422
|
-
pass
|
|
485
|
+
retry_after_val: Optional[float] = self._extractRetryAfter(
|
|
486
|
+
error_body, response
|
|
487
|
+
)
|
|
423
488
|
delay = self._calculateRetryDelay(attempt, retry_after_val)
|
|
424
489
|
_logger.warning(
|
|
425
490
|
"Retryable error %d on attempt %d/%d, retrying in %.1fs",
|
knowhere/_client.py
CHANGED
|
@@ -13,6 +13,7 @@ from typing import BinaryIO, Optional, Union, overload
|
|
|
13
13
|
|
|
14
14
|
from knowhere._base_client import AsyncAPIClient, SyncAPIClient
|
|
15
15
|
from knowhere._constants import DEFAULT_POLL_INTERVAL, DEFAULT_POLL_TIMEOUT
|
|
16
|
+
from knowhere._exceptions import ValidationError
|
|
16
17
|
from knowhere._logging import getLogger
|
|
17
18
|
from knowhere._types import (
|
|
18
19
|
PollProgressCallback,
|
|
@@ -94,9 +95,9 @@ class Knowhere(SyncAPIClient):
|
|
|
94
95
|
Provide exactly one of *url* or *file*.
|
|
95
96
|
"""
|
|
96
97
|
if url and file:
|
|
97
|
-
raise
|
|
98
|
+
raise ValidationError("Provide either 'url' or 'file', not both.")
|
|
98
99
|
if not url and file is None:
|
|
99
|
-
raise
|
|
100
|
+
raise ValidationError("Provide either 'url' or 'file'.")
|
|
100
101
|
|
|
101
102
|
# Determine source type and create job
|
|
102
103
|
if url:
|
|
@@ -196,9 +197,9 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
196
197
|
) -> ParseResult:
|
|
197
198
|
"""Parse a document end-to-end (async version)."""
|
|
198
199
|
if url and file:
|
|
199
|
-
raise
|
|
200
|
+
raise ValidationError("Provide either 'url' or 'file', not both.")
|
|
200
201
|
if not url and file is None:
|
|
201
|
-
raise
|
|
202
|
+
raise ValidationError("Provide either 'url' or 'file'.")
|
|
202
203
|
|
|
203
204
|
if url:
|
|
204
205
|
job: Job = await self.jobs.create(
|
knowhere/_constants.py
CHANGED
knowhere/_exceptions.py
CHANGED
|
@@ -41,6 +41,19 @@ class APITimeoutError(APIConnectionError):
|
|
|
41
41
|
super().__init__(message)
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Validation / state
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ValidationError(KnowhereError):
|
|
50
|
+
"""Raised when the caller provides invalid arguments."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class InvalidStateError(KnowhereError):
|
|
54
|
+
"""Raised when an object is in an unexpected state for the operation."""
|
|
55
|
+
|
|
56
|
+
|
|
44
57
|
# ---------------------------------------------------------------------------
|
|
45
58
|
# Polling / job errors
|
|
46
59
|
# ---------------------------------------------------------------------------
|
|
@@ -161,9 +174,17 @@ class ConflictError(APIStatusError):
|
|
|
161
174
|
|
|
162
175
|
|
|
163
176
|
class RateLimitError(APIStatusError):
|
|
164
|
-
"""HTTP 429 — includes optional
|
|
177
|
+
"""HTTP 429 — includes optional rate limit hints from the server.
|
|
178
|
+
|
|
179
|
+
Attributes:
|
|
180
|
+
retry_after: Seconds to wait before retrying (``None`` for quota exceeded).
|
|
181
|
+
limit: Maximum allowed requests in the rate window.
|
|
182
|
+
period: Rate window unit (``"second"``, ``"minute"``, ``"hour"``, ``"day"``).
|
|
183
|
+
"""
|
|
165
184
|
|
|
166
185
|
retry_after: Optional[float]
|
|
186
|
+
limit: Optional[int]
|
|
187
|
+
period: Optional[str]
|
|
167
188
|
|
|
168
189
|
def __init__(
|
|
169
190
|
self,
|
|
@@ -176,6 +197,8 @@ class RateLimitError(APIStatusError):
|
|
|
176
197
|
body: Optional[Any] = None,
|
|
177
198
|
response: httpx.Response,
|
|
178
199
|
retry_after: Optional[float] = None,
|
|
200
|
+
limit: Optional[int] = None,
|
|
201
|
+
period: Optional[str] = None,
|
|
179
202
|
) -> None:
|
|
180
203
|
super().__init__(
|
|
181
204
|
status_code,
|
|
@@ -187,6 +210,8 @@ class RateLimitError(APIStatusError):
|
|
|
187
210
|
response=response,
|
|
188
211
|
)
|
|
189
212
|
self.retry_after = retry_after
|
|
213
|
+
self.limit = limit
|
|
214
|
+
self.period = period
|
|
190
215
|
|
|
191
216
|
|
|
192
217
|
class InternalServerError(APIStatusError):
|
|
@@ -194,9 +219,17 @@ class InternalServerError(APIStatusError):
|
|
|
194
219
|
|
|
195
220
|
|
|
196
221
|
class ServiceUnavailableError(APIStatusError):
|
|
197
|
-
"""HTTP 502 / 503 — includes optional
|
|
222
|
+
"""HTTP 502 / 503 — includes optional rate limit hints from the server.
|
|
223
|
+
|
|
224
|
+
Attributes:
|
|
225
|
+
retry_after: Seconds to wait before retrying.
|
|
226
|
+
limit: Maximum allowed requests in the rate window (optional).
|
|
227
|
+
period: Rate window unit (optional).
|
|
228
|
+
"""
|
|
198
229
|
|
|
199
230
|
retry_after: Optional[float]
|
|
231
|
+
limit: Optional[int]
|
|
232
|
+
period: Optional[str]
|
|
200
233
|
|
|
201
234
|
def __init__(
|
|
202
235
|
self,
|
|
@@ -209,6 +242,8 @@ class ServiceUnavailableError(APIStatusError):
|
|
|
209
242
|
body: Optional[Any] = None,
|
|
210
243
|
response: httpx.Response,
|
|
211
244
|
retry_after: Optional[float] = None,
|
|
245
|
+
limit: Optional[int] = None,
|
|
246
|
+
period: Optional[str] = None,
|
|
212
247
|
) -> None:
|
|
213
248
|
super().__init__(
|
|
214
249
|
status_code,
|
|
@@ -220,12 +255,22 @@ class ServiceUnavailableError(APIStatusError):
|
|
|
220
255
|
response=response,
|
|
221
256
|
)
|
|
222
257
|
self.retry_after = retry_after
|
|
258
|
+
self.limit = limit
|
|
259
|
+
self.period = period
|
|
223
260
|
|
|
224
261
|
|
|
225
262
|
class GatewayTimeoutError(APIStatusError):
|
|
226
|
-
"""HTTP 504 — includes optional
|
|
263
|
+
"""HTTP 504 — includes optional rate limit hints from the server.
|
|
264
|
+
|
|
265
|
+
Attributes:
|
|
266
|
+
retry_after: Seconds to wait before retrying.
|
|
267
|
+
limit: Maximum allowed requests in the rate window (optional).
|
|
268
|
+
period: Rate window unit (optional).
|
|
269
|
+
"""
|
|
227
270
|
|
|
228
271
|
retry_after: Optional[float]
|
|
272
|
+
limit: Optional[int]
|
|
273
|
+
period: Optional[str]
|
|
229
274
|
|
|
230
275
|
def __init__(
|
|
231
276
|
self,
|
|
@@ -238,6 +283,8 @@ class GatewayTimeoutError(APIStatusError):
|
|
|
238
283
|
body: Optional[Any] = None,
|
|
239
284
|
response: httpx.Response,
|
|
240
285
|
retry_after: Optional[float] = None,
|
|
286
|
+
limit: Optional[int] = None,
|
|
287
|
+
period: Optional[str] = None,
|
|
241
288
|
) -> None:
|
|
242
289
|
super().__init__(
|
|
243
290
|
status_code,
|
|
@@ -249,6 +296,8 @@ class GatewayTimeoutError(APIStatusError):
|
|
|
249
296
|
response=response,
|
|
250
297
|
)
|
|
251
298
|
self.retry_after = retry_after
|
|
299
|
+
self.limit = limit
|
|
300
|
+
self.period = period
|
|
252
301
|
|
|
253
302
|
|
|
254
303
|
# ---------------------------------------------------------------------------
|
|
@@ -298,14 +347,36 @@ def makeStatusError(
|
|
|
298
347
|
status_code, APIStatusError
|
|
299
348
|
)
|
|
300
349
|
|
|
301
|
-
# Extract
|
|
350
|
+
# Extract retry hints for classes that support them
|
|
351
|
+
# Prefer body: error.details.retry_after, fallback to HTTP header
|
|
302
352
|
retry_after: Optional[float] = None
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
353
|
+
limit: Optional[int] = None
|
|
354
|
+
period: Optional[str] = None
|
|
355
|
+
|
|
356
|
+
if isinstance(details, dict):
|
|
357
|
+
raw_body_retry: Any = details.get("retry_after")
|
|
358
|
+
if raw_body_retry is not None:
|
|
359
|
+
try:
|
|
360
|
+
retry_after = float(raw_body_retry)
|
|
361
|
+
except (ValueError, TypeError):
|
|
362
|
+
pass
|
|
363
|
+
raw_limit: Any = details.get("limit")
|
|
364
|
+
if raw_limit is not None:
|
|
365
|
+
try:
|
|
366
|
+
limit = int(raw_limit)
|
|
367
|
+
except (ValueError, TypeError):
|
|
368
|
+
pass
|
|
369
|
+
raw_period: Any = details.get("period")
|
|
370
|
+
if isinstance(raw_period, str):
|
|
371
|
+
period = raw_period
|
|
372
|
+
|
|
373
|
+
if retry_after is None:
|
|
374
|
+
raw_header_retry: Optional[str] = response.headers.get("retry-after")
|
|
375
|
+
if raw_header_retry is not None:
|
|
376
|
+
try:
|
|
377
|
+
retry_after = float(raw_header_retry)
|
|
378
|
+
except (ValueError, TypeError):
|
|
379
|
+
pass
|
|
309
380
|
|
|
310
381
|
common_kwargs: Dict[str, Any] = dict(
|
|
311
382
|
code=code,
|
|
@@ -318,7 +389,11 @@ def makeStatusError(
|
|
|
318
389
|
|
|
319
390
|
if exception_class in (RateLimitError, ServiceUnavailableError, GatewayTimeoutError):
|
|
320
391
|
return exception_class(
|
|
321
|
-
status_code,
|
|
392
|
+
status_code,
|
|
393
|
+
**common_kwargs,
|
|
394
|
+
retry_after=retry_after, # type: ignore[call-arg]
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
322
397
|
)
|
|
323
398
|
|
|
324
399
|
return exception_class(status_code, **common_kwargs)
|
knowhere/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.0"
|
|
1
|
+
__version__ = "0.2.0" # x-release-please-version
|
knowhere/lib/result_parser.py
CHANGED
|
@@ -18,6 +18,7 @@ from knowhere.types.result import (
|
|
|
18
18
|
ParseResult,
|
|
19
19
|
TableChunk,
|
|
20
20
|
TextChunk,
|
|
21
|
+
TextChunkTokens,
|
|
21
22
|
)
|
|
22
23
|
|
|
23
24
|
_logger = getLogger()
|
|
@@ -79,6 +80,38 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
|
|
|
79
80
|
return fallback
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
|
|
84
|
+
"""Return a string-only token list with empty values removed."""
|
|
85
|
+
normalized_tokens: List[str] = []
|
|
86
|
+
for raw_token in raw_tokens:
|
|
87
|
+
token_text: str = str(raw_token).strip()
|
|
88
|
+
if token_text:
|
|
89
|
+
normalized_tokens.append(token_text)
|
|
90
|
+
return normalized_tokens
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _parseTextChunkTokens(
|
|
94
|
+
raw_tokens: Any,
|
|
95
|
+
*,
|
|
96
|
+
chunk_id: str,
|
|
97
|
+
) -> Optional[TextChunkTokens]:
|
|
98
|
+
"""Normalize text chunk tokens from the current backend payload."""
|
|
99
|
+
if raw_tokens is None:
|
|
100
|
+
return None
|
|
101
|
+
if isinstance(raw_tokens, bool):
|
|
102
|
+
raise KnowhereError(
|
|
103
|
+
f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
|
|
104
|
+
)
|
|
105
|
+
if isinstance(raw_tokens, list):
|
|
106
|
+
return _normalizeTokenList(raw_tokens)
|
|
107
|
+
|
|
108
|
+
raise KnowhereError(
|
|
109
|
+
"Invalid tokens payload for text chunk "
|
|
110
|
+
f"'{chunk_id}': expected list[str], "
|
|
111
|
+
f"got {type(raw_tokens).__name__}."
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
|
|
82
115
|
def _buildChunks(
|
|
83
116
|
raw_chunks: List[Dict[str, Any]],
|
|
84
117
|
zf: zipfile.ZipFile,
|
|
@@ -127,13 +160,15 @@ def _buildChunks(
|
|
|
127
160
|
)
|
|
128
161
|
else:
|
|
129
162
|
metadata = raw.get("metadata", {})
|
|
163
|
+
chunk_id: str = raw.get("chunk_id", "")
|
|
164
|
+
raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
|
|
130
165
|
chunk = TextChunk(
|
|
131
|
-
chunk_id=
|
|
166
|
+
chunk_id=chunk_id,
|
|
132
167
|
type="text",
|
|
133
168
|
content=raw.get("content", ""),
|
|
134
169
|
path=raw.get("path"),
|
|
135
170
|
length=metadata.get("length", raw.get("length", 0)),
|
|
136
|
-
tokens=
|
|
171
|
+
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
137
172
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
138
173
|
summary=metadata.get("summary", raw.get("summary")),
|
|
139
174
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
knowhere/lib/upload.py
CHANGED
|
@@ -2,11 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import asyncio
|
|
6
|
+
import random
|
|
7
|
+
import time
|
|
5
8
|
from pathlib import Path
|
|
6
9
|
from typing import BinaryIO, Dict, Optional, Union
|
|
7
10
|
|
|
8
11
|
import httpx
|
|
9
12
|
|
|
13
|
+
from knowhere._constants import DEFAULT_UPLOAD_MAX_RETRIES
|
|
10
14
|
from knowhere._exceptions import APIConnectionError, APITimeoutError
|
|
11
15
|
from knowhere._logging import getLogger
|
|
12
16
|
from knowhere._types import UploadProgressCallback
|
|
@@ -16,6 +20,26 @@ _logger = getLogger()
|
|
|
16
20
|
# Chunk size for streaming uploads (256 KiB)
|
|
17
21
|
_UPLOAD_CHUNK_SIZE: int = 256 * 1024
|
|
18
22
|
|
|
23
|
+
# Storage-provider HTTP status codes that are safe to retry.
|
|
24
|
+
# These are transient errors from S3/GCS/Azure Blob, not Knowhere API codes.
|
|
25
|
+
_UPLOAD_RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({500, 502, 503, 504})
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _calculateUploadRetryDelay(attempt: int) -> float:
|
|
29
|
+
"""Exponential backoff with jitter for upload retries."""
|
|
30
|
+
base_delay: float = min(1.0 * (2 ** attempt), 16.0)
|
|
31
|
+
jitter: float = random.uniform(0, base_delay * 0.25)
|
|
32
|
+
return base_delay + jitter
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _isRetryableUploadError(exc: Exception) -> bool:
|
|
36
|
+
"""Return True if the upload error is transient and worth retrying."""
|
|
37
|
+
if isinstance(exc, (httpx.ConnectError, httpx.TimeoutException)):
|
|
38
|
+
return True
|
|
39
|
+
if isinstance(exc, httpx.HTTPStatusError):
|
|
40
|
+
return exc.response.status_code in _UPLOAD_RETRYABLE_STATUS_CODES
|
|
41
|
+
return False
|
|
42
|
+
|
|
19
43
|
|
|
20
44
|
def _prepareFileContent(
|
|
21
45
|
file: Union[Path, BinaryIO, bytes],
|
|
@@ -66,41 +90,68 @@ def syncUploadFile(
|
|
|
66
90
|
on_progress: Optional[UploadProgressCallback] = None,
|
|
67
91
|
*,
|
|
68
92
|
timeout: float = 600.0,
|
|
93
|
+
max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
|
|
69
94
|
) -> None:
|
|
70
|
-
"""Upload *file* to *upload_url* using a synchronous PUT request.
|
|
95
|
+
"""Upload *file* to *upload_url* using a synchronous PUT request.
|
|
96
|
+
|
|
97
|
+
Retries on connection errors, timeouts, and transient storage HTTP errors
|
|
98
|
+
(500/502/503/504) up to *max_retries* times.
|
|
99
|
+
"""
|
|
71
100
|
content, total_bytes = _prepareFileContent(file)
|
|
72
101
|
headers: Dict[str, str] = _buildUploadHeaders(upload_headers, total_bytes)
|
|
73
102
|
|
|
74
|
-
_logger.debug("Uploading %s bytes to %s", total_bytes, upload_url)
|
|
75
|
-
|
|
76
103
|
if isinstance(content, bytes):
|
|
77
104
|
data: bytes = content
|
|
78
105
|
else:
|
|
79
|
-
# BinaryIO — read all for simplicity (already measured size)
|
|
80
106
|
pos: int = content.tell()
|
|
81
107
|
data = content.read()
|
|
82
108
|
content.seek(pos)
|
|
83
109
|
|
|
84
|
-
|
|
85
|
-
on_progress(0, total_bytes)
|
|
110
|
+
last_exc: Optional[Exception] = None
|
|
86
111
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
headers=headers,
|
|
92
|
-
timeout=timeout,
|
|
112
|
+
for attempt in range(max_retries + 1):
|
|
113
|
+
_logger.debug(
|
|
114
|
+
"Upload attempt %d/%d — %s bytes to %s",
|
|
115
|
+
attempt + 1, max_retries + 1, total_bytes, upload_url,
|
|
93
116
|
)
|
|
94
|
-
response.raise_for_status()
|
|
95
|
-
except httpx.TimeoutException as exc:
|
|
96
|
-
raise APITimeoutError(f"Upload timed out: {exc}") from exc
|
|
97
|
-
except httpx.HTTPError as exc:
|
|
98
|
-
raise APIConnectionError(f"Upload failed: {exc}") from exc
|
|
99
|
-
|
|
100
|
-
if on_progress:
|
|
101
|
-
on_progress(len(data), total_bytes)
|
|
102
117
|
|
|
103
|
-
|
|
118
|
+
if on_progress and attempt == 0:
|
|
119
|
+
on_progress(0, total_bytes)
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
response: httpx.Response = client.put(
|
|
123
|
+
upload_url,
|
|
124
|
+
content=data,
|
|
125
|
+
headers=headers,
|
|
126
|
+
timeout=timeout,
|
|
127
|
+
)
|
|
128
|
+
response.raise_for_status()
|
|
129
|
+
except (httpx.HTTPError, httpx.TimeoutException) as exc:
|
|
130
|
+
last_exc = exc
|
|
131
|
+
if attempt < max_retries and _isRetryableUploadError(exc):
|
|
132
|
+
delay: float = _calculateUploadRetryDelay(attempt)
|
|
133
|
+
_logger.warning(
|
|
134
|
+
"Upload attempt %d/%d failed (%s), retrying in %.1fs",
|
|
135
|
+
attempt + 1, max_retries + 1, exc, delay,
|
|
136
|
+
)
|
|
137
|
+
time.sleep(delay)
|
|
138
|
+
continue
|
|
139
|
+
# Non-retryable or exhausted retries
|
|
140
|
+
if isinstance(exc, httpx.TimeoutException):
|
|
141
|
+
raise APITimeoutError(f"Upload timed out: {exc}") from exc
|
|
142
|
+
raise APIConnectionError(f"Upload failed: {exc}") from exc
|
|
143
|
+
|
|
144
|
+
# Success
|
|
145
|
+
if on_progress:
|
|
146
|
+
on_progress(len(data), total_bytes)
|
|
147
|
+
_logger.debug("Upload complete: %d", response.status_code)
|
|
148
|
+
return
|
|
149
|
+
|
|
150
|
+
# Should not reach here, but guard against it
|
|
151
|
+
if last_exc is not None:
|
|
152
|
+
if isinstance(last_exc, httpx.TimeoutException):
|
|
153
|
+
raise APITimeoutError(f"Upload timed out: {last_exc}") from last_exc
|
|
154
|
+
raise APIConnectionError(f"Upload failed: {last_exc}") from last_exc
|
|
104
155
|
|
|
105
156
|
|
|
106
157
|
async def asyncUploadFile(
|
|
@@ -111,37 +162,62 @@ async def asyncUploadFile(
|
|
|
111
162
|
on_progress: Optional[UploadProgressCallback] = None,
|
|
112
163
|
*,
|
|
113
164
|
timeout: float = 600.0,
|
|
165
|
+
max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
|
|
114
166
|
) -> None:
|
|
115
|
-
"""Upload *file* to *upload_url* using an async PUT request.
|
|
167
|
+
"""Upload *file* to *upload_url* using an async PUT request.
|
|
168
|
+
|
|
169
|
+
Retries on connection errors, timeouts, and transient storage HTTP errors
|
|
170
|
+
(500/502/503/504) up to *max_retries* times.
|
|
171
|
+
"""
|
|
116
172
|
content, total_bytes = _prepareFileContent(file)
|
|
117
173
|
headers: Dict[str, str] = _buildUploadHeaders(upload_headers, total_bytes)
|
|
118
174
|
|
|
119
|
-
_logger.debug("Async uploading %s bytes to %s", total_bytes, upload_url)
|
|
120
|
-
|
|
121
175
|
if isinstance(content, bytes):
|
|
122
176
|
data: bytes = content
|
|
123
177
|
else:
|
|
124
|
-
pos = content.tell()
|
|
178
|
+
pos: int = content.tell()
|
|
125
179
|
data = content.read()
|
|
126
180
|
content.seek(pos)
|
|
127
181
|
|
|
128
|
-
|
|
129
|
-
on_progress(0, total_bytes)
|
|
182
|
+
last_exc: Optional[Exception] = None
|
|
130
183
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
headers=headers,
|
|
136
|
-
timeout=timeout,
|
|
184
|
+
for attempt in range(max_retries + 1):
|
|
185
|
+
_logger.debug(
|
|
186
|
+
"Async upload attempt %d/%d — %s bytes to %s",
|
|
187
|
+
attempt + 1, max_retries + 1, total_bytes, upload_url,
|
|
137
188
|
)
|
|
138
|
-
response.raise_for_status()
|
|
139
|
-
except httpx.TimeoutException as exc:
|
|
140
|
-
raise APITimeoutError(f"Upload timed out: {exc}") from exc
|
|
141
|
-
except httpx.HTTPError as exc:
|
|
142
|
-
raise APIConnectionError(f"Upload failed: {exc}") from exc
|
|
143
|
-
|
|
144
|
-
if on_progress:
|
|
145
|
-
on_progress(len(data), total_bytes)
|
|
146
189
|
|
|
147
|
-
|
|
190
|
+
if on_progress and attempt == 0:
|
|
191
|
+
on_progress(0, total_bytes)
|
|
192
|
+
|
|
193
|
+
try:
|
|
194
|
+
response: httpx.Response = await client.put(
|
|
195
|
+
upload_url,
|
|
196
|
+
content=data,
|
|
197
|
+
headers=headers,
|
|
198
|
+
timeout=timeout,
|
|
199
|
+
)
|
|
200
|
+
response.raise_for_status()
|
|
201
|
+
except (httpx.HTTPError, httpx.TimeoutException) as exc:
|
|
202
|
+
last_exc = exc
|
|
203
|
+
if attempt < max_retries and _isRetryableUploadError(exc):
|
|
204
|
+
delay: float = _calculateUploadRetryDelay(attempt)
|
|
205
|
+
_logger.warning(
|
|
206
|
+
"Async upload attempt %d/%d failed (%s), retrying in %.1fs",
|
|
207
|
+
attempt + 1, max_retries + 1, exc, delay,
|
|
208
|
+
)
|
|
209
|
+
await asyncio.sleep(delay)
|
|
210
|
+
continue
|
|
211
|
+
if isinstance(exc, httpx.TimeoutException):
|
|
212
|
+
raise APITimeoutError(f"Upload timed out: {exc}") from exc
|
|
213
|
+
raise APIConnectionError(f"Upload failed: {exc}") from exc
|
|
214
|
+
|
|
215
|
+
if on_progress:
|
|
216
|
+
on_progress(len(data), total_bytes)
|
|
217
|
+
_logger.debug("Async upload complete: %d", response.status_code)
|
|
218
|
+
return
|
|
219
|
+
|
|
220
|
+
if last_exc is not None:
|
|
221
|
+
if isinstance(last_exc, httpx.TimeoutException):
|
|
222
|
+
raise APITimeoutError(f"Upload timed out: {last_exc}") from last_exc
|
|
223
|
+
raise APIConnectionError(f"Upload failed: {last_exc}") from last_exc
|
knowhere/resources/jobs.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Any, BinaryIO, Dict, Optional, Union
|
|
|
8
8
|
import httpx
|
|
9
9
|
|
|
10
10
|
from knowhere._constants import DEFAULT_POLL_INTERVAL, DEFAULT_POLL_TIMEOUT
|
|
11
|
+
from knowhere._exceptions import InvalidStateError
|
|
11
12
|
from knowhere._logging import getLogger
|
|
12
13
|
from knowhere._types import (
|
|
13
14
|
PollProgressCallback,
|
|
@@ -84,7 +85,7 @@ class Jobs(SyncAPIResource):
|
|
|
84
85
|
"""
|
|
85
86
|
if isinstance(job, Job):
|
|
86
87
|
if not job.upload_url:
|
|
87
|
-
raise
|
|
88
|
+
raise InvalidStateError("Job does not have an upload URL.")
|
|
88
89
|
upload_url: str = job.upload_url
|
|
89
90
|
upload_headers: Optional[Dict[str, str]] = job.upload_headers
|
|
90
91
|
else:
|
|
@@ -134,7 +135,7 @@ class Jobs(SyncAPIResource):
|
|
|
134
135
|
"""
|
|
135
136
|
if isinstance(job_result, JobResult):
|
|
136
137
|
if not job_result.result_url:
|
|
137
|
-
raise
|
|
138
|
+
raise InvalidStateError("JobResult does not have a result_url.")
|
|
138
139
|
result_url: str = job_result.result_url
|
|
139
140
|
else:
|
|
140
141
|
result_url = job_result
|
|
@@ -192,7 +193,7 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
192
193
|
"""Upload a file for a job (async)."""
|
|
193
194
|
if isinstance(job, Job):
|
|
194
195
|
if not job.upload_url:
|
|
195
|
-
raise
|
|
196
|
+
raise InvalidStateError("Job does not have an upload URL.")
|
|
196
197
|
upload_url: str = job.upload_url
|
|
197
198
|
upload_headers: Optional[Dict[str, str]] = job.upload_headers
|
|
198
199
|
else:
|
|
@@ -234,7 +235,7 @@ class AsyncJobs(AsyncAPIResource):
|
|
|
234
235
|
"""Download and parse the result ZIP (async)."""
|
|
235
236
|
if isinstance(job_result, JobResult):
|
|
236
237
|
if not job_result.result_url:
|
|
237
|
-
raise
|
|
238
|
+
raise InvalidStateError("JobResult does not have a result_url.")
|
|
238
239
|
result_url: str = job_result.result_url
|
|
239
240
|
else:
|
|
240
241
|
result_url = job_result
|
knowhere/types/result.py
CHANGED
|
@@ -8,6 +8,9 @@ from pathlib import Path
|
|
|
8
8
|
from typing import Any, Dict, List, Optional, Union
|
|
9
9
|
|
|
10
10
|
from pydantic import BaseModel, Field
|
|
11
|
+
from typing_extensions import TypeAlias
|
|
12
|
+
|
|
13
|
+
from knowhere._exceptions import ValidationError
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
# ---------------------------------------------------------------------------
|
|
@@ -30,11 +33,11 @@ def _sanitizeFilename(name: str) -> str:
|
|
|
30
33
|
|
|
31
34
|
|
|
32
35
|
def _ensurePathWithinDirectory(base: Path, target: Path) -> Path:
|
|
33
|
-
"""Raise ``
|
|
36
|
+
"""Raise ``ValidationError`` if *target* escapes *base* (Zip Slip prevention)."""
|
|
34
37
|
resolved_base: Path = base.resolve()
|
|
35
38
|
resolved_target: Path = target.resolve()
|
|
36
39
|
if not str(resolved_target).startswith(str(resolved_base)):
|
|
37
|
-
raise
|
|
40
|
+
raise ValidationError(
|
|
38
41
|
f"Path '{resolved_target}' escapes output directory '{resolved_base}'."
|
|
39
42
|
)
|
|
40
43
|
return resolved_target
|
|
@@ -122,12 +125,15 @@ class BaseChunk(BaseModel):
|
|
|
122
125
|
path: Optional[str] = None
|
|
123
126
|
|
|
124
127
|
|
|
128
|
+
TextChunkTokens: TypeAlias = List[str]
|
|
129
|
+
|
|
130
|
+
|
|
125
131
|
class TextChunk(BaseChunk):
|
|
126
132
|
"""A text chunk extracted from the document."""
|
|
127
133
|
|
|
128
134
|
type: str = "text"
|
|
129
135
|
length: int = 0
|
|
130
|
-
tokens: Optional[
|
|
136
|
+
tokens: Optional[TextChunkTokens] = None
|
|
131
137
|
keywords: Optional[List[str]] = None
|
|
132
138
|
summary: Optional[str] = None
|
|
133
139
|
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowhere-python-sdk
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Official Python SDK for the Knowhere document parsing API
|
|
5
5
|
Project-URL: Homepage, https://knowhereto.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.knowhereto.ai
|
|
@@ -32,38 +32,41 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
|
|
33
33
|
# Knowhere Python SDK
|
|
34
34
|
|
|
35
|
+
[](https://pypi.org/project/knowhere-python-sdk/)
|
|
36
|
+
|
|
35
37
|
Official Python SDK for the [Knowhere](https://knowhereto.ai) document parsing API.
|
|
36
38
|
|
|
37
39
|
## Installation
|
|
38
40
|
|
|
39
|
-
```
|
|
41
|
+
```sh
|
|
40
42
|
pip install knowhere-python-sdk
|
|
41
43
|
```
|
|
42
44
|
|
|
43
45
|
Or with [uv](https://docs.astral.sh/uv/):
|
|
44
46
|
|
|
45
|
-
```
|
|
47
|
+
```sh
|
|
46
48
|
uv add knowhere-python-sdk
|
|
47
49
|
```
|
|
48
50
|
|
|
49
|
-
##
|
|
51
|
+
## Usage
|
|
50
52
|
|
|
51
53
|
```python
|
|
52
54
|
import knowhere
|
|
53
55
|
|
|
54
56
|
client = knowhere.Knowhere(api_key="sk_...")
|
|
55
57
|
|
|
56
|
-
# Parse a document from URL
|
|
57
58
|
result = client.parse(url="https://example.com/report.pdf")
|
|
58
59
|
|
|
59
|
-
print(result.statistics.total_chunks)
|
|
60
|
-
print(result.full_markdown[:200])
|
|
60
|
+
print(result.statistics.total_chunks)
|
|
61
|
+
print(result.full_markdown[:200])
|
|
61
62
|
|
|
62
63
|
for chunk in result.text_chunks:
|
|
63
64
|
print(chunk.content[:80])
|
|
64
65
|
```
|
|
65
66
|
|
|
66
|
-
|
|
67
|
+
While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
|
|
68
|
+
|
|
69
|
+
### Parse a local file
|
|
67
70
|
|
|
68
71
|
```python
|
|
69
72
|
from pathlib import Path
|
|
@@ -77,7 +80,7 @@ print(result.manifest.source_file_name) # "report.pdf"
|
|
|
77
80
|
print(len(result.chunks)) # 152
|
|
78
81
|
```
|
|
79
82
|
|
|
80
|
-
### Access
|
|
83
|
+
### Access different chunk types
|
|
81
84
|
|
|
82
85
|
```python
|
|
83
86
|
result = client.parse(url="https://example.com/report.pdf")
|
|
@@ -99,14 +102,14 @@ for chunk in result.table_chunks:
|
|
|
99
102
|
print(chunk.html[:100])
|
|
100
103
|
```
|
|
101
104
|
|
|
102
|
-
### Save
|
|
105
|
+
### Save all results to disk
|
|
103
106
|
|
|
104
107
|
```python
|
|
105
108
|
result = client.parse(file=Path("report.pdf"))
|
|
106
109
|
result.save("./output/report/")
|
|
107
110
|
```
|
|
108
111
|
|
|
109
|
-
## Async
|
|
112
|
+
## Async usage
|
|
110
113
|
|
|
111
114
|
```python
|
|
112
115
|
import asyncio
|
|
@@ -123,7 +126,7 @@ async def main():
|
|
|
123
126
|
asyncio.run(main())
|
|
124
127
|
```
|
|
125
128
|
|
|
126
|
-
## Step-by-
|
|
129
|
+
## Step-by-step control
|
|
127
130
|
|
|
128
131
|
For granular control over the parsing workflow, use the `jobs` resource directly:
|
|
129
132
|
|
|
@@ -148,6 +151,22 @@ result = client.jobs.load(job_result)
|
|
|
148
151
|
print(result.statistics)
|
|
149
152
|
```
|
|
150
153
|
|
|
154
|
+
## Handling errors
|
|
155
|
+
|
|
156
|
+
All errors inherit from `knowhere.KnowhereError`.
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
import knowhere
|
|
161
|
+
|
|
162
|
+
try:
|
|
163
|
+
result = client.parse(url="https://example.com/report.pdf")
|
|
164
|
+
except knowhere.AuthenticationError:
|
|
165
|
+
print("Invalid API key")
|
|
166
|
+
except knowhere.APIStatusError as e:
|
|
167
|
+
print(f"{e.status_code}: {e.message}")
|
|
168
|
+
```
|
|
169
|
+
|
|
151
170
|
## Configuration
|
|
152
171
|
|
|
153
172
|
The SDK reads configuration from constructor arguments, environment variables, or defaults (in that priority order):
|
|
@@ -172,50 +191,30 @@ client = knowhere.Knowhere(
|
|
|
172
191
|
)
|
|
173
192
|
```
|
|
174
193
|
|
|
175
|
-
###
|
|
194
|
+
### Retries
|
|
176
195
|
|
|
177
|
-
|
|
178
|
-
# Sync — ensures httpx.Client is properly closed
|
|
179
|
-
with knowhere.Knowhere(api_key="sk_...") as client:
|
|
180
|
-
result = client.parse(url="https://example.com/report.pdf")
|
|
196
|
+
Connection errors, 429 Rate Limit, and >=500 Internal errors are automatically retried with exponential backoff.
|
|
181
197
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
198
|
+
```python
|
|
199
|
+
client = knowhere.Knowhere(
|
|
200
|
+
api_key="sk_...",
|
|
201
|
+
max_retries=3, # default is 5
|
|
202
|
+
)
|
|
185
203
|
```
|
|
186
204
|
|
|
187
|
-
|
|
205
|
+
### Determining the installed version
|
|
188
206
|
|
|
189
207
|
```python
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
AuthenticationError,
|
|
193
|
-
NotFoundError,
|
|
194
|
-
RateLimitError,
|
|
195
|
-
BadRequestError,
|
|
196
|
-
APIStatusError,
|
|
197
|
-
PollingTimeoutError,
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
try:
|
|
201
|
-
result = client.parse(url="https://example.com/report.pdf")
|
|
202
|
-
except BadRequestError as e:
|
|
203
|
-
print(e.status_code) # 400
|
|
204
|
-
print(e.code) # "INVALID_ARGUMENT"
|
|
205
|
-
print(e.message) # "Unsupported file format"
|
|
206
|
-
print(e.request_id) # "req_abc123"
|
|
207
|
-
except NotFoundError as e:
|
|
208
|
-
print(e.message) # "Job not found"
|
|
209
|
-
except RateLimitError as e:
|
|
210
|
-
print(e.retry_after) # seconds to wait
|
|
211
|
-
except AuthenticationError:
|
|
212
|
-
print("Invalid API key")
|
|
213
|
-
except PollingTimeoutError:
|
|
214
|
-
print("Job did not complete within timeout")
|
|
215
|
-
except APIStatusError as e:
|
|
216
|
-
print(f"API error {e.status_code}: {e.message}")
|
|
208
|
+
import knowhere
|
|
209
|
+
print(knowhere.__version__)
|
|
217
210
|
```
|
|
218
211
|
|
|
212
|
+
## Versioning
|
|
213
|
+
|
|
214
|
+
This package follows [Semantic Versioning](https://semver.org/).
|
|
215
|
+
|
|
216
|
+
We publish stable releases to [PyPI](https://pypi.org/project/knowhere-python-sdk/). To install the latest unreleased changes directly from the repository: https://github.com/Ontos-AI/knowhere-python-sdk
|
|
217
|
+
|
|
219
218
|
## Requirements
|
|
220
219
|
|
|
221
220
|
- Python 3.9+
|
|
@@ -223,92 +222,6 @@ except APIStatusError as e:
|
|
|
223
222
|
- [pydantic](https://docs.pydantic.dev/) `>=2.0.0,<3.0`
|
|
224
223
|
- [typing-extensions](https://pypi.org/project/typing-extensions/) `>=4.7.0`
|
|
225
224
|
|
|
226
|
-
## Building from Source
|
|
227
|
-
|
|
228
|
-
### Prerequisites
|
|
229
|
-
|
|
230
|
-
- Python 3.9 or later
|
|
231
|
-
- [uv](https://docs.astral.sh/uv/) (recommended) or pip
|
|
232
|
-
|
|
233
|
-
### Build
|
|
234
|
-
|
|
235
|
-
```bash
|
|
236
|
-
git clone https://github.com/Ontos-AI/knowhere-python-sdk.git
|
|
237
|
-
cd knowhere-python-sdk
|
|
238
|
-
|
|
239
|
-
# Install uv if you don't have it
|
|
240
|
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
241
|
-
|
|
242
|
-
# Build sdist + wheel
|
|
243
|
-
uv build
|
|
244
|
-
|
|
245
|
-
# Install the built wheel
|
|
246
|
-
pip install dist/knowhere_python_sdk-*.whl
|
|
247
|
-
```
|
|
248
|
-
|
|
249
|
-
## Development
|
|
250
|
-
|
|
251
|
-
### Setup
|
|
252
|
-
|
|
253
|
-
```bash
|
|
254
|
-
git clone https://github.com/Ontos-AI/knowhere-python-sdk.git
|
|
255
|
-
cd knowhere-python-sdk
|
|
256
|
-
|
|
257
|
-
# Create venv and install all dependencies (including dev)
|
|
258
|
-
uv sync --all-extras
|
|
259
|
-
```
|
|
260
|
-
|
|
261
|
-
### Running Tests
|
|
262
|
-
|
|
263
|
-
```bash
|
|
264
|
-
# Run all unit tests
|
|
265
|
-
uv run pytest tests/ -v
|
|
266
|
-
|
|
267
|
-
# Run with coverage
|
|
268
|
-
uv run coverage run -m pytest tests/ -v
|
|
269
|
-
uv run coverage report -m
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
### Linting and Type Checking
|
|
273
|
-
|
|
274
|
-
```bash
|
|
275
|
-
# Lint
|
|
276
|
-
uv run ruff check src/
|
|
277
|
-
|
|
278
|
-
# Type check
|
|
279
|
-
uv run mypy src/knowhere/
|
|
280
|
-
```
|
|
281
|
-
|
|
282
|
-
### Project Structure
|
|
283
|
-
|
|
284
|
-
```
|
|
285
|
-
knowhere-python-sdk/
|
|
286
|
-
├── src/knowhere/
|
|
287
|
-
│ ├── __init__.py # Public API surface
|
|
288
|
-
│ ├── _client.py # Knowhere + AsyncKnowhere clients
|
|
289
|
-
│ ├── _base_client.py # HTTP logic, retry, error parsing
|
|
290
|
-
│ ├── _exceptions.py # Exception hierarchy
|
|
291
|
-
│ ├── _constants.py # Default URLs, timeouts, env var names
|
|
292
|
-
│ ├── _types.py # Sentinel types, callback type aliases
|
|
293
|
-
│ ├── _logging.py # Logger setup, header redaction
|
|
294
|
-
│ ├── _response.py # APIResponse wrapper
|
|
295
|
-
│ ├── _version.py # __version__
|
|
296
|
-
│ ├── py.typed # PEP 561 marker
|
|
297
|
-
│ ├── types/
|
|
298
|
-
│ │ ├── job.py # Job, JobResult, JobError
|
|
299
|
-
│ │ ├── result.py # ParseResult, Manifest, Chunk types
|
|
300
|
-
│ │ └── params.py # ParsingParams, WebhookConfig
|
|
301
|
-
│ ├── resources/
|
|
302
|
-
│ │ └── jobs.py # Jobs + AsyncJobs resource
|
|
303
|
-
│ └── lib/
|
|
304
|
-
│ ├── polling.py # Adaptive polling loop
|
|
305
|
-
│ ├── upload.py # Streaming file upload
|
|
306
|
-
│ └── result_parser.py # ZIP parsing, checksum verification
|
|
307
|
-
├── tests/ # Unit tests (respx-mocked HTTP)
|
|
308
|
-
├── examples/ # Usage examples
|
|
309
|
-
└── pyproject.toml
|
|
310
|
-
```
|
|
311
|
-
|
|
312
225
|
## License
|
|
313
226
|
|
|
314
227
|
MIT
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
knowhere/__init__.py,sha256=EuIpP3FtDeszonVAXMxZimjRd9iUcQ8wA53h1f27S3k,2343
|
|
2
|
+
knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
|
|
3
|
+
knowhere/_client.py,sha256=MGU1QsyjKrzTiitm891wgNCq6JLf3DR7y7zhkil_p2E,8027
|
|
4
|
+
knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
|
|
5
|
+
knowhere/_exceptions.py,sha256=yg-4pK7AP6uUPxxyggxf8spQeXgFTpKRwELsHjCQycg,11489
|
|
6
|
+
knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
|
|
7
|
+
knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
|
|
8
|
+
knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
|
|
9
|
+
knowhere/_version.py,sha256=piZV5NEcs0VIotCxwaWvzWE2ASUv5tox5ye8ogIRiIk,50
|
|
10
|
+
knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
|
|
12
|
+
knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
|
|
13
|
+
knowhere/lib/result_parser.py,sha256=U-DK3SDKrbUY0g_-ad04bsbra1mhYy9FJ2opa1n2bTU,8406
|
|
14
|
+
knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
|
|
15
|
+
knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
|
|
16
|
+
knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
|
|
17
|
+
knowhere/resources/jobs.py,sha256=45P4rZ9HMnTdgcso2AwQ6lDA9U80HGsgOU0jZLBIMFU,8460
|
|
18
|
+
knowhere/types/__init__.py,sha256=OwTxpa9uo0GOEJ6Ds6rqEmXl86O49ByS6M7cscMwQo8,791
|
|
19
|
+
knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
|
|
20
|
+
knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
|
|
21
|
+
knowhere/types/result.py,sha256=Lmtaa0wQymBzAm6hXoZZr6dlfwf0WCMEda6Gd8nDIdw,9628
|
|
22
|
+
knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
|
|
23
|
+
knowhere_python_sdk-0.2.0.dist-info/METADATA,sha256=10dnumfebnQ3VmPHmYuDexWTCdqdFLi-eAaF8FwcNpc,6115
|
|
24
|
+
knowhere_python_sdk-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
25
|
+
knowhere_python_sdk-0.2.0.dist-info/RECORD,,
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
knowhere/__init__.py,sha256=CGMrMT1Ujmv8-9Kq_Imfch_lFzxkfbBebefd2nMLIo8,2251
|
|
2
|
-
knowhere/_base_client.py,sha256=Rt9rbERMVOFGHU-Tyou4CaUCqQ3VRUvA8siDlp8bcDM,15222
|
|
3
|
-
knowhere/_client.py,sha256=-Bv5BOinvvnHyLregcGfMFMSbwpFGzWFVnfIN8uq5qY,7958
|
|
4
|
-
knowhere/_constants.py,sha256=zD1WuJz77LbGE3NXP1zc0eSXkaiibStSVLyN34D-lYc,849
|
|
5
|
-
knowhere/_exceptions.py,sha256=vH2b_shTBElkeHoQ30QZp-zNucSVaTQ7QHDuy6tEyb0,9012
|
|
6
|
-
knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
|
|
7
|
-
knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
|
|
8
|
-
knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
|
|
9
|
-
knowhere/_version.py,sha256=IOEzhCbX916JIkvZr03GFY4bl9YqSkfjdqLO4qJ2FOs,50
|
|
10
|
-
knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
|
|
12
|
-
knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
|
|
13
|
-
knowhere/lib/result_parser.py,sha256=KnRDhW8G_frH5-FMrpoeG3_188_Hk2kSzDixrec6_fg,7257
|
|
14
|
-
knowhere/lib/upload.py,sha256=AEDLJH1UANPU96JbDdstomE__xuu5DfW7YTZQk1g0lg,4544
|
|
15
|
-
knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
|
|
16
|
-
knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
|
|
17
|
-
knowhere/resources/jobs.py,sha256=k6ZNAdEr6CKmHT-bgtaliF5gjOF7IjLYnDM5dqPCKhw,8381
|
|
18
|
-
knowhere/types/__init__.py,sha256=OwTxpa9uo0GOEJ6Ds6rqEmXl86O49ByS6M7cscMwQo8,791
|
|
19
|
-
knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
|
|
20
|
-
knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
|
|
21
|
-
knowhere/types/result.py,sha256=9uCiTm2-X6jOMK5eyNbbzMj11G9vfCuCF_yoowLJ2JQ,9475
|
|
22
|
-
knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
|
|
23
|
-
knowhere_python_sdk-0.1.0.dist-info/METADATA,sha256=sf2ro4Ya8-r1wHCtWhpTtt-xQLcXQY04wAyjHn-msss,8536
|
|
24
|
-
knowhere_python_sdk-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
25
|
-
knowhere_python_sdk-0.1.0.dist-info/RECORD,,
|