knowhere-python-sdk 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere/__init__.py +12 -0
- knowhere/_base_client.py +96 -31
- knowhere/_client.py +5 -4
- knowhere/_constants.py +1 -0
- knowhere/_exceptions.py +106 -13
- knowhere/_version.py +1 -1
- knowhere/lib/result_parser.py +69 -2
- knowhere/lib/upload.py +118 -42
- knowhere/resources/jobs.py +5 -4
- knowhere/types/__init__.py +8 -0
- knowhere/types/result.py +109 -3
- {knowhere_python_sdk-0.1.0.dist-info → knowhere_python_sdk-0.2.1.dist-info}/METADATA +47 -134
- knowhere_python_sdk-0.2.1.dist-info/RECORD +25 -0
- {knowhere_python_sdk-0.1.0.dist-info → knowhere_python_sdk-0.2.1.dist-info}/WHEEL +1 -1
- knowhere_python_sdk-0.1.0.dist-info/RECORD +0 -25
knowhere/__init__.py
CHANGED
|
@@ -22,6 +22,7 @@ from knowhere._exceptions import (
|
|
|
22
22
|
ConflictError,
|
|
23
23
|
GatewayTimeoutError,
|
|
24
24
|
InternalServerError,
|
|
25
|
+
InvalidStateError,
|
|
25
26
|
JobFailedError,
|
|
26
27
|
KnowhereError,
|
|
27
28
|
NotFoundError,
|
|
@@ -30,6 +31,7 @@ from knowhere._exceptions import (
|
|
|
30
31
|
PollingTimeoutError,
|
|
31
32
|
RateLimitError,
|
|
32
33
|
ServiceUnavailableError,
|
|
34
|
+
ValidationError,
|
|
33
35
|
)
|
|
34
36
|
from knowhere._types import PollProgressCallback, UploadProgressCallback
|
|
35
37
|
from knowhere._version import __version__
|
|
@@ -44,6 +46,10 @@ from knowhere.types.result import (
|
|
|
44
46
|
ImageFileInfo,
|
|
45
47
|
Manifest,
|
|
46
48
|
ParseResult,
|
|
49
|
+
ProcessingCost,
|
|
50
|
+
ProcessingMetadata,
|
|
51
|
+
ProcessingTiming,
|
|
52
|
+
SlimChunk,
|
|
47
53
|
Statistics,
|
|
48
54
|
TableChunk,
|
|
49
55
|
TableFileInfo,
|
|
@@ -58,6 +64,8 @@ __all__: list[str] = [
|
|
|
58
64
|
"__version__",
|
|
59
65
|
# Exceptions
|
|
60
66
|
"KnowhereError",
|
|
67
|
+
"ValidationError",
|
|
68
|
+
"InvalidStateError",
|
|
61
69
|
"APIConnectionError",
|
|
62
70
|
"APITimeoutError",
|
|
63
71
|
"APIStatusError",
|
|
@@ -87,6 +95,10 @@ __all__: list[str] = [
|
|
|
87
95
|
"FileIndex",
|
|
88
96
|
"ImageFileInfo",
|
|
89
97
|
"TableFileInfo",
|
|
98
|
+
"ProcessingCost",
|
|
99
|
+
"ProcessingMetadata",
|
|
100
|
+
"ProcessingTiming",
|
|
101
|
+
"SlimChunk",
|
|
90
102
|
"BaseChunk",
|
|
91
103
|
"TextChunk",
|
|
92
104
|
"ImageChunk",
|
knowhere/_base_client.py
CHANGED
|
@@ -25,6 +25,7 @@ from knowhere._constants import (
|
|
|
25
25
|
from knowhere._exceptions import (
|
|
26
26
|
APIConnectionError,
|
|
27
27
|
APITimeoutError,
|
|
28
|
+
ValidationError,
|
|
28
29
|
makeStatusError,
|
|
29
30
|
)
|
|
30
31
|
from knowhere._logging import getLogger, redactSensitiveHeaders
|
|
@@ -35,17 +36,23 @@ T = TypeVar("T")
|
|
|
35
36
|
|
|
36
37
|
_logger = getLogger()
|
|
37
38
|
|
|
38
|
-
# Error codes that are safe to retry
|
|
39
|
-
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"internal_server_error",
|
|
44
|
-
"timeout",
|
|
39
|
+
# Error codes that are always safe to retry (matches server ALWAYS_RETRYABLE_ERROR_CODES)
|
|
40
|
+
_ALWAYS_RETRYABLE_ERROR_CODES: frozenset[str] = frozenset({
|
|
41
|
+
"ABORTED", # 409 - Concurrency conflict
|
|
42
|
+
"UNAVAILABLE", # 503 - Service temporarily down
|
|
43
|
+
"DEADLINE_EXCEEDED", # 504 - Timeout
|
|
45
44
|
})
|
|
46
45
|
|
|
47
|
-
#
|
|
48
|
-
|
|
46
|
+
# RESOURCE_EXHAUSTED (429) is conditionally retryable:
|
|
47
|
+
# - Rate limit: details.retry_after present → RETRY
|
|
48
|
+
# - Quota exceeded: no retry_after → DO NOT RETRY
|
|
49
|
+
_CONDITIONALLY_RETRYABLE_ERROR_CODE: str = "RESOURCE_EXHAUSTED"
|
|
50
|
+
|
|
51
|
+
# HTTP status codes that are always safe to retry
|
|
52
|
+
_ALWAYS_RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({409, 502, 503, 504})
|
|
53
|
+
|
|
54
|
+
# HTTP status code that is conditionally retryable (only with retry_after)
|
|
55
|
+
_CONDITIONALLY_RETRYABLE_STATUS_CODE: int = 429
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
class BaseClient:
|
|
@@ -71,7 +78,7 @@ class BaseClient:
|
|
|
71
78
|
# Resolve: arg > env > default
|
|
72
79
|
resolved_key: Optional[str] = api_key or os.environ.get(ENV_API_KEY)
|
|
73
80
|
if not resolved_key:
|
|
74
|
-
raise
|
|
81
|
+
raise ValidationError(
|
|
75
82
|
"An API key must be provided via the 'api_key' argument "
|
|
76
83
|
f"or the {ENV_API_KEY} environment variable."
|
|
77
84
|
)
|
|
@@ -122,12 +129,68 @@ class BaseClient:
|
|
|
122
129
|
self,
|
|
123
130
|
status_code: int,
|
|
124
131
|
error_code: Optional[str] = None,
|
|
125
|
-
details: Optional[Any] = None,
|
|
132
|
+
details: Optional[Dict[str, Any]] = None,
|
|
126
133
|
) -> bool:
|
|
127
|
-
"""Decide whether a request should be retried.
|
|
128
|
-
|
|
134
|
+
"""Decide whether a request should be retried.
|
|
135
|
+
|
|
136
|
+
Follows server-side retry semantics:
|
|
137
|
+
- ABORTED, UNAVAILABLE, DEADLINE_EXCEEDED → always retry
|
|
138
|
+
- RESOURCE_EXHAUSTED (429) → retry only if details.retry_after present
|
|
139
|
+
- All other errors → never retry
|
|
140
|
+
"""
|
|
141
|
+
if error_code:
|
|
142
|
+
if error_code in _ALWAYS_RETRYABLE_ERROR_CODES:
|
|
143
|
+
return True
|
|
144
|
+
if error_code == _CONDITIONALLY_RETRYABLE_ERROR_CODE:
|
|
145
|
+
return self._hasRetryAfter(details)
|
|
146
|
+
return False
|
|
147
|
+
|
|
148
|
+
# Fallback to status code when error_code is unavailable
|
|
149
|
+
if status_code in _ALWAYS_RETRYABLE_STATUS_CODES:
|
|
129
150
|
return True
|
|
130
|
-
|
|
151
|
+
if status_code == _CONDITIONALLY_RETRYABLE_STATUS_CODE:
|
|
152
|
+
return self._hasRetryAfter(details)
|
|
153
|
+
return False
|
|
154
|
+
|
|
155
|
+
@staticmethod
|
|
156
|
+
def _hasRetryAfter(details: Optional[Dict[str, Any]]) -> bool:
|
|
157
|
+
"""Check if details contains a retry_after hint."""
|
|
158
|
+
if not isinstance(details, dict):
|
|
159
|
+
return False
|
|
160
|
+
retry_after: Any = details.get("retry_after")
|
|
161
|
+
return retry_after is not None
|
|
162
|
+
|
|
163
|
+
@staticmethod
|
|
164
|
+
def _extractRetryAfter(
|
|
165
|
+
error_body: Optional[Dict[str, Any]],
|
|
166
|
+
response: httpx.Response,
|
|
167
|
+
) -> Optional[float]:
|
|
168
|
+
"""Extract retry_after from the response body or Retry-After header.
|
|
169
|
+
|
|
170
|
+
The server puts retry_after in ``error.details.retry_after``.
|
|
171
|
+
Falls back to the HTTP ``Retry-After`` header.
|
|
172
|
+
"""
|
|
173
|
+
# Prefer body: error.details.retry_after
|
|
174
|
+
if isinstance(error_body, dict):
|
|
175
|
+
err_obj: Any = error_body.get("error", error_body)
|
|
176
|
+
if isinstance(err_obj, dict):
|
|
177
|
+
details: Any = err_obj.get("details")
|
|
178
|
+
if isinstance(details, dict):
|
|
179
|
+
raw: Any = details.get("retry_after")
|
|
180
|
+
if raw is not None:
|
|
181
|
+
try:
|
|
182
|
+
return float(raw)
|
|
183
|
+
except (ValueError, TypeError):
|
|
184
|
+
pass
|
|
185
|
+
|
|
186
|
+
# Fallback: HTTP Retry-After header
|
|
187
|
+
header_raw: Optional[str] = response.headers.get("retry-after")
|
|
188
|
+
if header_raw is not None:
|
|
189
|
+
try:
|
|
190
|
+
return float(header_raw)
|
|
191
|
+
except (ValueError, TypeError):
|
|
192
|
+
pass
|
|
193
|
+
return None
|
|
131
194
|
|
|
132
195
|
def _calculateRetryDelay(
|
|
133
196
|
self,
|
|
@@ -257,24 +320,24 @@ class SyncAPIClient(BaseClient):
|
|
|
257
320
|
response
|
|
258
321
|
)
|
|
259
322
|
error_code: Optional[str] = None
|
|
323
|
+
error_details: Optional[Dict[str, Any]] = None
|
|
260
324
|
if isinstance(error_body, dict):
|
|
261
325
|
err_obj: Any = error_body.get("error", error_body)
|
|
262
326
|
if isinstance(err_obj, dict):
|
|
263
327
|
error_code = err_obj.get("code")
|
|
328
|
+
raw_details: Any = err_obj.get("details")
|
|
329
|
+
if isinstance(raw_details, dict):
|
|
330
|
+
error_details = raw_details
|
|
264
331
|
|
|
265
332
|
if (
|
|
266
333
|
attempt < self.max_retries
|
|
267
|
-
and self._shouldRetry(
|
|
334
|
+
and self._shouldRetry(
|
|
335
|
+
response.status_code, error_code, error_details
|
|
336
|
+
)
|
|
268
337
|
):
|
|
269
|
-
|
|
270
|
-
|
|
338
|
+
retry_after_val: Optional[float] = self._extractRetryAfter(
|
|
339
|
+
error_body, response
|
|
271
340
|
)
|
|
272
|
-
retry_after_val: Optional[float] = None
|
|
273
|
-
if retry_after_raw:
|
|
274
|
-
try:
|
|
275
|
-
retry_after_val = float(retry_after_raw)
|
|
276
|
-
except (ValueError, TypeError):
|
|
277
|
-
pass
|
|
278
341
|
delay = self._calculateRetryDelay(attempt, retry_after_val)
|
|
279
342
|
_logger.warning(
|
|
280
343
|
"Retryable error %d on attempt %d/%d, retrying in %.1fs",
|
|
@@ -404,22 +467,24 @@ class AsyncAPIClient(BaseClient):
|
|
|
404
467
|
|
|
405
468
|
error_body: Optional[Dict[str, Any]] = self._parseErrorResponse(response)
|
|
406
469
|
error_code: Optional[str] = None
|
|
470
|
+
error_details: Optional[Dict[str, Any]] = None
|
|
407
471
|
if isinstance(error_body, dict):
|
|
408
472
|
err_obj: Any = error_body.get("error", error_body)
|
|
409
473
|
if isinstance(err_obj, dict):
|
|
410
474
|
error_code = err_obj.get("code")
|
|
475
|
+
raw_details: Any = err_obj.get("details")
|
|
476
|
+
if isinstance(raw_details, dict):
|
|
477
|
+
error_details = raw_details
|
|
411
478
|
|
|
412
479
|
if (
|
|
413
480
|
attempt < self.max_retries
|
|
414
|
-
and self._shouldRetry(
|
|
481
|
+
and self._shouldRetry(
|
|
482
|
+
response.status_code, error_code, error_details
|
|
483
|
+
)
|
|
415
484
|
):
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
try:
|
|
420
|
-
retry_after_val = float(retry_after_raw)
|
|
421
|
-
except (ValueError, TypeError):
|
|
422
|
-
pass
|
|
485
|
+
retry_after_val: Optional[float] = self._extractRetryAfter(
|
|
486
|
+
error_body, response
|
|
487
|
+
)
|
|
423
488
|
delay = self._calculateRetryDelay(attempt, retry_after_val)
|
|
424
489
|
_logger.warning(
|
|
425
490
|
"Retryable error %d on attempt %d/%d, retrying in %.1fs",
|
knowhere/_client.py
CHANGED
|
@@ -13,6 +13,7 @@ from typing import BinaryIO, Optional, Union, overload
|
|
|
13
13
|
|
|
14
14
|
from knowhere._base_client import AsyncAPIClient, SyncAPIClient
|
|
15
15
|
from knowhere._constants import DEFAULT_POLL_INTERVAL, DEFAULT_POLL_TIMEOUT
|
|
16
|
+
from knowhere._exceptions import ValidationError
|
|
16
17
|
from knowhere._logging import getLogger
|
|
17
18
|
from knowhere._types import (
|
|
18
19
|
PollProgressCallback,
|
|
@@ -94,9 +95,9 @@ class Knowhere(SyncAPIClient):
|
|
|
94
95
|
Provide exactly one of *url* or *file*.
|
|
95
96
|
"""
|
|
96
97
|
if url and file:
|
|
97
|
-
raise
|
|
98
|
+
raise ValidationError("Provide either 'url' or 'file', not both.")
|
|
98
99
|
if not url and file is None:
|
|
99
|
-
raise
|
|
100
|
+
raise ValidationError("Provide either 'url' or 'file'.")
|
|
100
101
|
|
|
101
102
|
# Determine source type and create job
|
|
102
103
|
if url:
|
|
@@ -196,9 +197,9 @@ class AsyncKnowhere(AsyncAPIClient):
|
|
|
196
197
|
) -> ParseResult:
|
|
197
198
|
"""Parse a document end-to-end (async version)."""
|
|
198
199
|
if url and file:
|
|
199
|
-
raise
|
|
200
|
+
raise ValidationError("Provide either 'url' or 'file', not both.")
|
|
200
201
|
if not url and file is None:
|
|
201
|
-
raise
|
|
202
|
+
raise ValidationError("Provide either 'url' or 'file'.")
|
|
202
203
|
|
|
203
204
|
if url:
|
|
204
205
|
job: Job = await self.jobs.create(
|
knowhere/_constants.py
CHANGED
knowhere/_exceptions.py
CHANGED
|
@@ -41,6 +41,19 @@ class APITimeoutError(APIConnectionError):
|
|
|
41
41
|
super().__init__(message)
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Validation / state
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ValidationError(KnowhereError):
|
|
50
|
+
"""Raised when the caller provides invalid arguments."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class InvalidStateError(KnowhereError):
|
|
54
|
+
"""Raised when an object is in an unexpected state for the operation."""
|
|
55
|
+
|
|
56
|
+
|
|
44
57
|
# ---------------------------------------------------------------------------
|
|
45
58
|
# Polling / job errors
|
|
46
59
|
# ---------------------------------------------------------------------------
|
|
@@ -161,9 +174,17 @@ class ConflictError(APIStatusError):
|
|
|
161
174
|
|
|
162
175
|
|
|
163
176
|
class RateLimitError(APIStatusError):
|
|
164
|
-
"""HTTP 429 — includes optional
|
|
177
|
+
"""HTTP 429 — includes optional rate limit hints from the server.
|
|
178
|
+
|
|
179
|
+
Attributes:
|
|
180
|
+
retry_after: Seconds to wait before retrying (``None`` for quota exceeded).
|
|
181
|
+
limit: Maximum allowed requests in the rate window.
|
|
182
|
+
period: Rate window unit (``"second"``, ``"minute"``, ``"hour"``, ``"day"``).
|
|
183
|
+
"""
|
|
165
184
|
|
|
166
185
|
retry_after: Optional[float]
|
|
186
|
+
limit: Optional[int]
|
|
187
|
+
period: Optional[str]
|
|
167
188
|
|
|
168
189
|
def __init__(
|
|
169
190
|
self,
|
|
@@ -176,6 +197,8 @@ class RateLimitError(APIStatusError):
|
|
|
176
197
|
body: Optional[Any] = None,
|
|
177
198
|
response: httpx.Response,
|
|
178
199
|
retry_after: Optional[float] = None,
|
|
200
|
+
limit: Optional[int] = None,
|
|
201
|
+
period: Optional[str] = None,
|
|
179
202
|
) -> None:
|
|
180
203
|
super().__init__(
|
|
181
204
|
status_code,
|
|
@@ -187,6 +210,8 @@ class RateLimitError(APIStatusError):
|
|
|
187
210
|
response=response,
|
|
188
211
|
)
|
|
189
212
|
self.retry_after = retry_after
|
|
213
|
+
self.limit = limit
|
|
214
|
+
self.period = period
|
|
190
215
|
|
|
191
216
|
|
|
192
217
|
class InternalServerError(APIStatusError):
|
|
@@ -194,9 +219,17 @@ class InternalServerError(APIStatusError):
|
|
|
194
219
|
|
|
195
220
|
|
|
196
221
|
class ServiceUnavailableError(APIStatusError):
|
|
197
|
-
"""HTTP 502 / 503 — includes optional
|
|
222
|
+
"""HTTP 502 / 503 — includes optional rate limit hints from the server.
|
|
223
|
+
|
|
224
|
+
Attributes:
|
|
225
|
+
retry_after: Seconds to wait before retrying.
|
|
226
|
+
limit: Maximum allowed requests in the rate window (optional).
|
|
227
|
+
period: Rate window unit (optional).
|
|
228
|
+
"""
|
|
198
229
|
|
|
199
230
|
retry_after: Optional[float]
|
|
231
|
+
limit: Optional[int]
|
|
232
|
+
period: Optional[str]
|
|
200
233
|
|
|
201
234
|
def __init__(
|
|
202
235
|
self,
|
|
@@ -209,6 +242,8 @@ class ServiceUnavailableError(APIStatusError):
|
|
|
209
242
|
body: Optional[Any] = None,
|
|
210
243
|
response: httpx.Response,
|
|
211
244
|
retry_after: Optional[float] = None,
|
|
245
|
+
limit: Optional[int] = None,
|
|
246
|
+
period: Optional[str] = None,
|
|
212
247
|
) -> None:
|
|
213
248
|
super().__init__(
|
|
214
249
|
status_code,
|
|
@@ -220,12 +255,22 @@ class ServiceUnavailableError(APIStatusError):
|
|
|
220
255
|
response=response,
|
|
221
256
|
)
|
|
222
257
|
self.retry_after = retry_after
|
|
258
|
+
self.limit = limit
|
|
259
|
+
self.period = period
|
|
223
260
|
|
|
224
261
|
|
|
225
262
|
class GatewayTimeoutError(APIStatusError):
|
|
226
|
-
"""HTTP 504 — includes optional
|
|
263
|
+
"""HTTP 504 — includes optional rate limit hints from the server.
|
|
264
|
+
|
|
265
|
+
Attributes:
|
|
266
|
+
retry_after: Seconds to wait before retrying.
|
|
267
|
+
limit: Maximum allowed requests in the rate window (optional).
|
|
268
|
+
period: Rate window unit (optional).
|
|
269
|
+
"""
|
|
227
270
|
|
|
228
271
|
retry_after: Optional[float]
|
|
272
|
+
limit: Optional[int]
|
|
273
|
+
period: Optional[str]
|
|
229
274
|
|
|
230
275
|
def __init__(
|
|
231
276
|
self,
|
|
@@ -238,6 +283,8 @@ class GatewayTimeoutError(APIStatusError):
|
|
|
238
283
|
body: Optional[Any] = None,
|
|
239
284
|
response: httpx.Response,
|
|
240
285
|
retry_after: Optional[float] = None,
|
|
286
|
+
limit: Optional[int] = None,
|
|
287
|
+
period: Optional[str] = None,
|
|
241
288
|
) -> None:
|
|
242
289
|
super().__init__(
|
|
243
290
|
status_code,
|
|
@@ -249,6 +296,8 @@ class GatewayTimeoutError(APIStatusError):
|
|
|
249
296
|
response=response,
|
|
250
297
|
)
|
|
251
298
|
self.retry_after = retry_after
|
|
299
|
+
self.limit = limit
|
|
300
|
+
self.period = period
|
|
252
301
|
|
|
253
302
|
|
|
254
303
|
# ---------------------------------------------------------------------------
|
|
@@ -298,14 +347,36 @@ def makeStatusError(
|
|
|
298
347
|
status_code, APIStatusError
|
|
299
348
|
)
|
|
300
349
|
|
|
301
|
-
# Extract
|
|
350
|
+
# Extract retry hints for classes that support them
|
|
351
|
+
# Prefer body: error.details.retry_after, fallback to HTTP header
|
|
302
352
|
retry_after: Optional[float] = None
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
353
|
+
limit: Optional[int] = None
|
|
354
|
+
period: Optional[str] = None
|
|
355
|
+
|
|
356
|
+
if isinstance(details, dict):
|
|
357
|
+
raw_body_retry: Any = details.get("retry_after")
|
|
358
|
+
if raw_body_retry is not None:
|
|
359
|
+
try:
|
|
360
|
+
retry_after = float(raw_body_retry)
|
|
361
|
+
except (ValueError, TypeError):
|
|
362
|
+
pass
|
|
363
|
+
raw_limit: Any = details.get("limit")
|
|
364
|
+
if raw_limit is not None:
|
|
365
|
+
try:
|
|
366
|
+
limit = int(raw_limit)
|
|
367
|
+
except (ValueError, TypeError):
|
|
368
|
+
pass
|
|
369
|
+
raw_period: Any = details.get("period")
|
|
370
|
+
if isinstance(raw_period, str):
|
|
371
|
+
period = raw_period
|
|
372
|
+
|
|
373
|
+
if retry_after is None:
|
|
374
|
+
raw_header_retry: Optional[str] = response.headers.get("retry-after")
|
|
375
|
+
if raw_header_retry is not None:
|
|
376
|
+
try:
|
|
377
|
+
retry_after = float(raw_header_retry)
|
|
378
|
+
except (ValueError, TypeError):
|
|
379
|
+
pass
|
|
309
380
|
|
|
310
381
|
common_kwargs: Dict[str, Any] = dict(
|
|
311
382
|
code=code,
|
|
@@ -316,9 +387,31 @@ def makeStatusError(
|
|
|
316
387
|
response=response,
|
|
317
388
|
)
|
|
318
389
|
|
|
319
|
-
if exception_class
|
|
320
|
-
return
|
|
321
|
-
status_code,
|
|
390
|
+
if exception_class is RateLimitError:
|
|
391
|
+
return RateLimitError(
|
|
392
|
+
status_code,
|
|
393
|
+
**common_kwargs,
|
|
394
|
+
retry_after=retry_after,
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if exception_class is ServiceUnavailableError:
|
|
400
|
+
return ServiceUnavailableError(
|
|
401
|
+
status_code,
|
|
402
|
+
**common_kwargs,
|
|
403
|
+
retry_after=retry_after,
|
|
404
|
+
limit=limit,
|
|
405
|
+
period=period,
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
if exception_class is GatewayTimeoutError:
|
|
409
|
+
return GatewayTimeoutError(
|
|
410
|
+
status_code,
|
|
411
|
+
**common_kwargs,
|
|
412
|
+
retry_after=retry_after,
|
|
413
|
+
limit=limit,
|
|
414
|
+
period=period,
|
|
322
415
|
)
|
|
323
416
|
|
|
324
417
|
return exception_class(status_code, **common_kwargs)
|
knowhere/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1
|
|
1
|
+
__version__ = "0.2.1" # x-release-please-version
|
knowhere/lib/result_parser.py
CHANGED
|
@@ -16,8 +16,10 @@ from knowhere.types.result import (
|
|
|
16
16
|
ImageChunk,
|
|
17
17
|
Manifest,
|
|
18
18
|
ParseResult,
|
|
19
|
+
SlimChunk,
|
|
19
20
|
TableChunk,
|
|
20
21
|
TextChunk,
|
|
22
|
+
TextChunkTokens,
|
|
21
23
|
)
|
|
22
24
|
|
|
23
25
|
_logger = getLogger()
|
|
@@ -79,6 +81,38 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
|
|
|
79
81
|
return fallback
|
|
80
82
|
|
|
81
83
|
|
|
84
|
+
def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
|
|
85
|
+
"""Return a string-only token list with empty values removed."""
|
|
86
|
+
normalized_tokens: List[str] = []
|
|
87
|
+
for raw_token in raw_tokens:
|
|
88
|
+
token_text: str = str(raw_token).strip()
|
|
89
|
+
if token_text:
|
|
90
|
+
normalized_tokens.append(token_text)
|
|
91
|
+
return normalized_tokens
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _parseTextChunkTokens(
|
|
95
|
+
raw_tokens: Any,
|
|
96
|
+
*,
|
|
97
|
+
chunk_id: str,
|
|
98
|
+
) -> Optional[TextChunkTokens]:
|
|
99
|
+
"""Normalize text chunk tokens from the current backend payload."""
|
|
100
|
+
if raw_tokens is None:
|
|
101
|
+
return None
|
|
102
|
+
if isinstance(raw_tokens, bool):
|
|
103
|
+
raise KnowhereError(
|
|
104
|
+
f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
|
|
105
|
+
)
|
|
106
|
+
if isinstance(raw_tokens, list):
|
|
107
|
+
return _normalizeTokenList(raw_tokens)
|
|
108
|
+
|
|
109
|
+
raise KnowhereError(
|
|
110
|
+
"Invalid tokens payload for text chunk "
|
|
111
|
+
f"'{chunk_id}': expected list[str], "
|
|
112
|
+
f"got {type(raw_tokens).__name__}."
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
|
|
82
116
|
def _buildChunks(
|
|
83
117
|
raw_chunks: List[Dict[str, Any]],
|
|
84
118
|
zf: zipfile.ZipFile,
|
|
@@ -101,6 +135,7 @@ def _buildChunks(
|
|
|
101
135
|
type="image",
|
|
102
136
|
content=raw.get("content", ""),
|
|
103
137
|
path=raw.get("path"),
|
|
138
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
104
139
|
length=metadata.get("length", raw.get("length", 0)),
|
|
105
140
|
file_path=file_path,
|
|
106
141
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -118,6 +153,7 @@ def _buildChunks(
|
|
|
118
153
|
type="table",
|
|
119
154
|
content=raw.get("content", ""),
|
|
120
155
|
path=raw.get("path"),
|
|
156
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
121
157
|
length=metadata.get("length", raw.get("length", 0)),
|
|
122
158
|
file_path=file_path,
|
|
123
159
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -127,15 +163,19 @@ def _buildChunks(
|
|
|
127
163
|
)
|
|
128
164
|
else:
|
|
129
165
|
metadata = raw.get("metadata", {})
|
|
166
|
+
chunk_id: str = raw.get("chunk_id", "")
|
|
167
|
+
raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
|
|
130
168
|
chunk = TextChunk(
|
|
131
|
-
chunk_id=
|
|
169
|
+
chunk_id=chunk_id,
|
|
132
170
|
type="text",
|
|
133
171
|
content=raw.get("content", ""),
|
|
134
172
|
path=raw.get("path"),
|
|
173
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
135
174
|
length=metadata.get("length", raw.get("length", 0)),
|
|
136
|
-
tokens=
|
|
175
|
+
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
137
176
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
138
177
|
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
+
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
139
179
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
140
180
|
)
|
|
141
181
|
|
|
@@ -195,12 +235,39 @@ def parseResultZip(
|
|
|
195
235
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
196
236
|
)
|
|
197
237
|
|
|
238
|
+
# -- Optimized sidecar files --
|
|
239
|
+
chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
|
|
240
|
+
parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
|
|
241
|
+
if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
|
|
242
|
+
raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
|
|
243
|
+
elif isinstance(parsed_chunks_slim, list):
|
|
244
|
+
raw_chunks_slim = parsed_chunks_slim
|
|
245
|
+
else:
|
|
246
|
+
raw_chunks_slim = []
|
|
247
|
+
chunks_slim: Optional[List[SlimChunk]] = (
|
|
248
|
+
[SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
|
|
249
|
+
if chunks_slim_text is not None
|
|
250
|
+
else None
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
|
|
254
|
+
toc_hierarchies: Optional[Any] = (
|
|
255
|
+
json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
|
|
259
|
+
hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
|
|
260
|
+
|
|
198
261
|
zf.close()
|
|
199
262
|
|
|
200
263
|
return ParseResult(
|
|
201
264
|
manifest=manifest,
|
|
202
265
|
chunks=chunks,
|
|
266
|
+
chunks_slim=chunks_slim,
|
|
203
267
|
full_markdown=full_markdown,
|
|
204
268
|
hierarchy=hierarchy,
|
|
269
|
+
toc_hierarchies=toc_hierarchies,
|
|
270
|
+
kb_csv=kb_csv,
|
|
271
|
+
hierarchy_view_html=hierarchy_view_html,
|
|
205
272
|
raw_zip=zip_bytes,
|
|
206
273
|
)
|