knowhere-python-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
knowhere/__init__.py ADDED
@@ -0,0 +1,101 @@
1
+ """Knowhere Python SDK — official client for the Knowhere document parsing API.
2
+
3
+ Quick start::
4
+
5
+ from knowhere import Knowhere
6
+
7
+ client = Knowhere(api_key="sk_...")
8
+ result = client.parse(url="https://example.com/document.pdf")
9
+ print(result.full_markdown)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from knowhere._client import AsyncKnowhere, Knowhere
15
+ from knowhere._exceptions import (
16
+ APIConnectionError,
17
+ APIStatusError,
18
+ APITimeoutError,
19
+ AuthenticationError,
20
+ BadRequestError,
21
+ ChecksumError,
22
+ ConflictError,
23
+ GatewayTimeoutError,
24
+ InternalServerError,
25
+ JobFailedError,
26
+ KnowhereError,
27
+ NotFoundError,
28
+ PaymentRequiredError,
29
+ PermissionDeniedError,
30
+ PollingTimeoutError,
31
+ RateLimitError,
32
+ ServiceUnavailableError,
33
+ )
34
+ from knowhere._types import PollProgressCallback, UploadProgressCallback
35
+ from knowhere._version import __version__
36
+ from knowhere.types.job import Job, JobError, JobProgress, JobResult
37
+ from knowhere.types.params import ParsingParams, WebhookConfig
38
+ from knowhere.types.result import (
39
+ BaseChunk,
40
+ Checksum,
41
+ Chunk,
42
+ FileIndex,
43
+ ImageChunk,
44
+ ImageFileInfo,
45
+ Manifest,
46
+ ParseResult,
47
+ Statistics,
48
+ TableChunk,
49
+ TableFileInfo,
50
+ TextChunk,
51
+ )
52
+
53
+ __all__: list[str] = [
54
+ # Clients
55
+ "Knowhere",
56
+ "AsyncKnowhere",
57
+ # Version
58
+ "__version__",
59
+ # Exceptions
60
+ "KnowhereError",
61
+ "APIConnectionError",
62
+ "APITimeoutError",
63
+ "APIStatusError",
64
+ "BadRequestError",
65
+ "AuthenticationError",
66
+ "PaymentRequiredError",
67
+ "PermissionDeniedError",
68
+ "NotFoundError",
69
+ "ConflictError",
70
+ "RateLimitError",
71
+ "InternalServerError",
72
+ "ServiceUnavailableError",
73
+ "GatewayTimeoutError",
74
+ "PollingTimeoutError",
75
+ "JobFailedError",
76
+ "ChecksumError",
77
+ # Job types
78
+ "Job",
79
+ "JobError",
80
+ "JobProgress",
81
+ "JobResult",
82
+ # Result types
83
+ "ParseResult",
84
+ "Manifest",
85
+ "Statistics",
86
+ "Checksum",
87
+ "FileIndex",
88
+ "ImageFileInfo",
89
+ "TableFileInfo",
90
+ "BaseChunk",
91
+ "TextChunk",
92
+ "ImageChunk",
93
+ "TableChunk",
94
+ "Chunk",
95
+ # Param types
96
+ "ParsingParams",
97
+ "WebhookConfig",
98
+ # Callback types
99
+ "UploadProgressCallback",
100
+ "PollProgressCallback",
101
+ ]
@@ -0,0 +1,443 @@
1
+ """Base HTTP client classes for the Knowhere SDK.
2
+
3
+ Provides ``BaseClient`` (shared config), ``SyncAPIClient`` (httpx.Client),
4
+ and ``AsyncAPIClient`` (httpx.AsyncClient) with retry logic and error handling.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import random
11
+ import time
12
+ from typing import Any, Dict, Optional, Type, TypeVar
13
+
14
+ import httpx
15
+
16
+ from knowhere._constants import (
17
+ API_VERSION,
18
+ DEFAULT_BASE_URL,
19
+ DEFAULT_MAX_RETRIES,
20
+ DEFAULT_TIMEOUT,
21
+ DEFAULT_UPLOAD_TIMEOUT,
22
+ ENV_API_KEY,
23
+ ENV_BASE_URL,
24
+ )
25
+ from knowhere._exceptions import (
26
+ APIConnectionError,
27
+ APITimeoutError,
28
+ makeStatusError,
29
+ )
30
+ from knowhere._logging import getLogger, redactSensitiveHeaders
31
+ from knowhere._response import APIResponse
32
+ from knowhere._version import __version__
33
+
34
+ T = TypeVar("T")
35
+
36
+ _logger = getLogger()
37
+
38
+ # Error codes that are safe to retry
39
+ _RETRYABLE_ERROR_CODES: frozenset[str] = frozenset({
40
+ "rate_limit_exceeded",
41
+ "service_unavailable",
42
+ "gateway_timeout",
43
+ "internal_server_error",
44
+ "timeout",
45
+ })
46
+
47
+ # Status codes that are safe to retry
48
+ _RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({408, 429, 500, 502, 503, 504})
49
+
50
+
51
class BaseClient:
    """Shared configuration and helper methods for sync/async clients.

    Holds the resolved API key, base URL, timeouts, retry budget and extra
    headers, and provides the URL, header and retry helpers used by the
    request loops of the concrete clients.
    """

    api_key: str
    base_url: str
    timeout: float
    upload_timeout: float
    max_retries: int
    _default_headers: Dict[str, str]

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: Optional[float] = None,
        upload_timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
        default_headers: Optional[Dict[str, str]] = None,
    ) -> None:
        """Resolve and store the client configuration.

        Args:
            api_key: API key; falls back to the ``ENV_API_KEY`` environment
                variable when omitted or empty.
            base_url: API root; falls back to the ``ENV_BASE_URL`` environment
                variable, then ``DEFAULT_BASE_URL``. Trailing slashes are
                stripped.
            timeout: Per-request timeout; ``DEFAULT_TIMEOUT`` when ``None``.
            upload_timeout: Upload timeout; ``DEFAULT_UPLOAD_TIMEOUT`` when
                ``None``.
            max_retries: Retry budget; ``DEFAULT_MAX_RETRIES`` when ``None``.
            default_headers: Extra headers merged into every request.

        Raises:
            ValueError: If no API key is given and the environment variable
                is unset or empty.
        """
        # Resolve: arg > env > default
        resolved_key: Optional[str] = api_key or os.environ.get(ENV_API_KEY)
        if not resolved_key:
            raise ValueError(
                "An API key must be provided via the 'api_key' argument "
                f"or the {ENV_API_KEY} environment variable."
            )
        self.api_key = resolved_key
        self.base_url = (
            base_url
            or os.environ.get(ENV_BASE_URL)
            or DEFAULT_BASE_URL
        ).rstrip("/")
        self.timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
        self.upload_timeout = (
            upload_timeout if upload_timeout is not None else DEFAULT_UPLOAD_TIMEOUT
        )
        self.max_retries = (
            max_retries if max_retries is not None else DEFAULT_MAX_RETRIES
        )
        # Copy so later mutation of the caller's dict cannot silently change
        # the headers this client sends.
        self._default_headers = dict(default_headers) if default_headers else {}

    def _buildHeaders(self) -> Dict[str, str]:
        """Return headers including auth and user-agent.

        The configured default headers are applied last, so they override the
        built-in ``Authorization``, ``User-Agent`` and ``Accept`` values when
        the same key is present.
        """
        headers: Dict[str, str] = {
            "Authorization": f"Bearer {self.api_key}",
            "User-Agent": f"knowhere-python/{__version__}",
            "Accept": "application/json",
        }
        headers.update(self._default_headers)
        return headers

    def _buildRequestUrl(self, path: str) -> str:
        """Join ``base_url`` with *path*, inserting the API version prefix.

        Absolute ``http://`` / ``https://`` URLs are returned unchanged. For
        relative paths the ``API_VERSION`` prefix is added unless the path
        already starts with that exact path segment.
        """
        if path.startswith(("http://", "https://")):
            return path
        clean_path: str = path.lstrip("/")
        # Match on a segment boundary: a bare prefix test would treat e.g.
        # "<version>beta/..." as already versioned.
        already_versioned: bool = (
            clean_path == API_VERSION
            or clean_path.startswith(f"{API_VERSION}/")
        )
        if API_VERSION and not already_versioned:
            clean_path = f"{API_VERSION}/{clean_path}"
        return f"{self.base_url}/{clean_path}"

    def _parseErrorResponse(
        self, response: httpx.Response
    ) -> Optional[Dict[str, Any]]:
        """Try to parse a JSON error body; return ``None`` on failure.

        Deliberately best-effort: any exception raised while decoding is
        swallowed so a malformed error body never masks the HTTP error itself.
        The decoded value is returned as-is, so callers should still check
        that it is a ``dict`` before indexing into it.
        """
        try:
            return response.json()  # type: ignore[no-any-return]
        except Exception:
            return None

    def _shouldRetry(
        self,
        status_code: int,
        error_code: Optional[str] = None,
        details: Optional[Any] = None,
    ) -> bool:
        """Decide whether a request should be retried.

        Returns ``True`` when *error_code* is one of the retryable API error
        codes or *status_code* is one of the retryable HTTP status codes.
        *details* is accepted but not consulted.
        """
        if error_code and error_code in _RETRYABLE_ERROR_CODES:
            return True
        return status_code in _RETRYABLE_STATUS_CODES

    def _calculateRetryDelay(
        self,
        attempt: int,
        retry_after: Optional[float] = None,
    ) -> float:
        """Exponential backoff with jitter, respecting ``Retry-After``.

        Args:
            attempt: Zero-based attempt index.
            retry_after: Server-requested delay in seconds, if any. Used
                verbatim when it is a positive, finite number.

        Returns:
            The number of seconds to wait before the next attempt.
        """
        # The chained comparison is False for NaN and excludes +inf, so a
        # bogus header value falls back to backoff instead of an endless or
        # failing sleep.
        if retry_after is not None and 0 < retry_after < float("inf"):
            return retry_after
        # Exponential backoff: 0.5 * 2^attempt, capped at 30s. The exponent is
        # clamped first (0.5 * 2**6 already exceeds the cap) so very large
        # attempt numbers cannot overflow the int -> float conversion.
        base_delay: float = min(0.5 * (2 ** min(attempt, 6)), 30.0)
        jitter: float = random.uniform(0, base_delay * 0.25)
        return base_delay + jitter
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # Synchronous client
148
+ # ---------------------------------------------------------------------------
149
+
150
+
151
class SyncAPIClient(BaseClient):
    """Synchronous HTTP client backed by ``httpx.Client``.

    Adds a blocking ``_request`` method with a retry loop on top of the
    configuration and helpers inherited from ``BaseClient``. Usable as a
    context manager; leaving the ``with`` block closes the HTTP client.
    """

    _client: httpx.Client

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: Optional[float] = None,
        upload_timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
        default_headers: Optional[Dict[str, str]] = None,
    ) -> None:
        """Resolve configuration via ``BaseClient`` and open an ``httpx.Client``.

        The underlying client uses the resolved default timeout and follows
        redirects.
        """
        super().__init__(
            api_key=api_key,
            base_url=base_url,
            timeout=timeout,
            upload_timeout=upload_timeout,
            max_retries=max_retries,
            default_headers=default_headers,
        )
        self._client = httpx.Client(
            timeout=httpx.Timeout(self.timeout),
            follow_redirects=True,
        )

    @staticmethod
    def _parseRetryAfter(raw: Optional[str]) -> Optional[float]:
        """Convert a ``Retry-After`` header value into a delay in seconds.

        Accepts both forms the header may take: a number of seconds, or an
        HTTP-date (converted to the time remaining from now). Returns ``None``
        when the header is absent, unparseable or not a finite number. The
        result may be zero or negative (e.g. a date in the past); the caller's
        delay calculation decides whether to honour it.
        """
        if not raw:
            return None
        try:
            seconds: float = float(raw)
        except (ValueError, TypeError):
            pass
        else:
            # Chained comparison is False for NaN and excludes +/-inf.
            if float("-inf") < seconds < float("inf"):
                return seconds
            return None

        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            when = parsedate_to_datetime(raw)
        except (ValueError, TypeError):
            # Older Python versions raise TypeError for unparseable dates.
            return None
        if when.tzinfo is None:
            # A date without usable zone information is treated as UTC.
            when = when.replace(tzinfo=timezone.utc)
        return (when - datetime.now(timezone.utc)).total_seconds()

    # -- request with retry loop --

    def _request(
        self,
        method: str,
        path: str,
        *,
        body: Optional[Any] = None,
        params: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
        cast_to: Type[T],
        headers: Optional[Dict[str, str]] = None,
    ) -> T:
        """Execute an HTTP request with automatic retries and error handling.

        Args:
            method: HTTP method.
            path: Relative API path or absolute URL (see ``_buildRequestUrl``).
            body: Value sent as the JSON request body.
            params: Query-string parameters.
            timeout: Per-call timeout; the client default when ``None``.
            cast_to: Type handed to ``APIResponse`` to parse a successful
                response into.
            headers: Extra headers; they override the client-level headers.

        Returns:
            The parsed response produced by ``APIResponse.parse()``.

        Raises:
            APITimeoutError: The final attempt timed out.
            APIConnectionError: The final attempt failed with another httpx
                error, or no attempt was made at all.
            Exception: The error built by ``makeStatusError`` for a
                non-success response that is not (or no longer) retryable.
        """
        url: str = self._buildRequestUrl(path)
        request_headers: Dict[str, str] = self._buildHeaders()
        if headers:
            request_headers.update(headers)

        effective_timeout: float = timeout if timeout is not None else self.timeout

        _logger.debug(
            "Request: %s %s headers=%s",
            method,
            url,
            redactSensitiveHeaders(request_headers),
        )

        for attempt in range(self.max_retries + 1):
            try:
                response: httpx.Response = self._client.request(
                    method,
                    url,
                    json=body,
                    params=params,
                    headers=request_headers,
                    timeout=effective_timeout,
                )
            except httpx.TimeoutException as exc:
                # Handled before the broader httpx.HTTPError clause so that
                # timeouts surface as APITimeoutError.
                if attempt < self.max_retries:
                    delay: float = self._calculateRetryDelay(attempt)
                    _logger.warning(
                        "Timeout on attempt %d/%d, retrying in %.1fs",
                        attempt + 1,
                        self.max_retries + 1,
                        delay,
                    )
                    time.sleep(delay)
                    continue
                raise APITimeoutError(
                    f"Request to {url} timed out after {effective_timeout}s."
                ) from exc
            except httpx.HTTPError as exc:
                if attempt < self.max_retries:
                    delay = self._calculateRetryDelay(attempt)
                    _logger.warning(
                        "Connection error on attempt %d/%d, retrying in %.1fs",
                        attempt + 1,
                        self.max_retries + 1,
                        delay,
                    )
                    time.sleep(delay)
                    continue
                raise APIConnectionError(str(exc)) from exc

            _logger.debug(
                "Response: %d %s", response.status_code, url
            )

            # Success
            if response.is_success:
                api_response: APIResponse[T] = APIResponse(
                    response, cast_to
                )
                return api_response.parse()

            # Error — decide whether to retry
            error_body: Optional[Dict[str, Any]] = self._parseErrorResponse(
                response
            )
            error_code: Optional[str] = None
            if isinstance(error_body, dict):
                # The code may sit under an "error" envelope or at top level.
                err_obj: Any = error_body.get("error", error_body)
                if isinstance(err_obj, dict):
                    error_code = err_obj.get("code")

            if (
                attempt < self.max_retries
                and self._shouldRetry(response.status_code, error_code)
            ):
                retry_after_val: Optional[float] = self._parseRetryAfter(
                    response.headers.get("retry-after")
                )
                delay = self._calculateRetryDelay(attempt, retry_after_val)
                _logger.warning(
                    "Retryable error %d on attempt %d/%d, retrying in %.1fs",
                    response.status_code,
                    attempt + 1,
                    self.max_retries + 1,
                    delay,
                )
                time.sleep(delay)
                continue

            raise makeStatusError(response.status_code, response, error_body)

        # Every loop iteration returns, raises or continues, so this line is
        # only reached when the loop body never ran (negative max_retries).
        # It also satisfies the type checker.
        raise APIConnectionError("Max retries exceeded.")

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> SyncAPIClient:
        """Return the client itself for use in a ``with`` block."""
        return self

    def __exit__(self, *args: Any) -> None:
        """Close the client on leaving the ``with`` block."""
        self.close()
303
+
304
+
305
+ # ---------------------------------------------------------------------------
306
+ # Asynchronous client
307
+ # ---------------------------------------------------------------------------
308
+
309
+
310
class AsyncAPIClient(BaseClient):
    """Asynchronous HTTP client backed by ``httpx.AsyncClient``.

    Adds an awaitable ``_request`` method with a retry loop on top of the
    configuration and helpers inherited from ``BaseClient``. Usable as an
    async context manager; leaving the ``async with`` block closes the HTTP
    client.
    """

    _client: httpx.AsyncClient

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: Optional[float] = None,
        upload_timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
        default_headers: Optional[Dict[str, str]] = None,
    ) -> None:
        """Resolve configuration via ``BaseClient`` and open an ``httpx.AsyncClient``.

        The underlying client uses the resolved default timeout and follows
        redirects.
        """
        super().__init__(
            api_key=api_key,
            base_url=base_url,
            timeout=timeout,
            upload_timeout=upload_timeout,
            max_retries=max_retries,
            default_headers=default_headers,
        )
        self._client = httpx.AsyncClient(
            timeout=httpx.Timeout(self.timeout),
            follow_redirects=True,
        )

    @staticmethod
    def _parseRetryAfter(raw: Optional[str]) -> Optional[float]:
        """Convert a ``Retry-After`` header value into a delay in seconds.

        Accepts both forms the header may take: a number of seconds, or an
        HTTP-date (converted to the time remaining from now). Returns ``None``
        when the header is absent, unparseable or not a finite number. The
        result may be zero or negative (e.g. a date in the past); the caller's
        delay calculation decides whether to honour it.
        """
        if not raw:
            return None
        try:
            seconds: float = float(raw)
        except (ValueError, TypeError):
            pass
        else:
            # Chained comparison is False for NaN and excludes +/-inf.
            if float("-inf") < seconds < float("inf"):
                return seconds
            return None

        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            when = parsedate_to_datetime(raw)
        except (ValueError, TypeError):
            # Older Python versions raise TypeError for unparseable dates.
            return None
        if when.tzinfo is None:
            # A date without usable zone information is treated as UTC.
            when = when.replace(tzinfo=timezone.utc)
        return (when - datetime.now(timezone.utc)).total_seconds()

    async def _request(
        self,
        method: str,
        path: str,
        *,
        body: Optional[Any] = None,
        params: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
        cast_to: Type[T],
        headers: Optional[Dict[str, str]] = None,
    ) -> T:
        """Execute an async HTTP request with automatic retries.

        Args:
            method: HTTP method.
            path: Relative API path or absolute URL (see ``_buildRequestUrl``).
            body: Value sent as the JSON request body.
            params: Query-string parameters.
            timeout: Per-call timeout; the client default when ``None``.
            cast_to: Type handed to ``APIResponse`` to parse a successful
                response into.
            headers: Extra headers; they override the client-level headers.

        Returns:
            The parsed response produced by ``APIResponse.parse()``.

        Raises:
            APITimeoutError: The final attempt timed out.
            APIConnectionError: The final attempt failed with another httpx
                error, or no attempt was made at all.
            Exception: The error built by ``makeStatusError`` for a
                non-success response that is not (or no longer) retryable.
        """
        import asyncio

        url: str = self._buildRequestUrl(path)
        request_headers: Dict[str, str] = self._buildHeaders()
        if headers:
            request_headers.update(headers)

        effective_timeout: float = timeout if timeout is not None else self.timeout

        _logger.debug(
            "Async request: %s %s headers=%s",
            method,
            url,
            redactSensitiveHeaders(request_headers),
        )

        for attempt in range(self.max_retries + 1):
            try:
                response: httpx.Response = await self._client.request(
                    method,
                    url,
                    json=body,
                    params=params,
                    headers=request_headers,
                    timeout=effective_timeout,
                )
            except httpx.TimeoutException as exc:
                # Handled before the broader httpx.HTTPError clause so that
                # timeouts surface as APITimeoutError.
                if attempt < self.max_retries:
                    delay: float = self._calculateRetryDelay(attempt)
                    _logger.warning(
                        "Timeout on attempt %d/%d, retrying in %.1fs",
                        attempt + 1, self.max_retries + 1, delay,
                    )
                    await asyncio.sleep(delay)
                    continue
                raise APITimeoutError(
                    f"Request to {url} timed out after {effective_timeout}s."
                ) from exc
            except httpx.HTTPError as exc:
                if attempt < self.max_retries:
                    delay = self._calculateRetryDelay(attempt)
                    _logger.warning(
                        "Connection error on attempt %d/%d, retrying in %.1fs",
                        attempt + 1, self.max_retries + 1, delay,
                    )
                    await asyncio.sleep(delay)
                    continue
                raise APIConnectionError(str(exc)) from exc

            _logger.debug("Async response: %d %s", response.status_code, url)

            if response.is_success:
                api_response: APIResponse[T] = APIResponse(response, cast_to)
                return api_response.parse()

            error_body: Optional[Dict[str, Any]] = self._parseErrorResponse(response)
            error_code: Optional[str] = None
            if isinstance(error_body, dict):
                # The code may sit under an "error" envelope or at top level.
                err_obj: Any = error_body.get("error", error_body)
                if isinstance(err_obj, dict):
                    error_code = err_obj.get("code")

            if (
                attempt < self.max_retries
                and self._shouldRetry(response.status_code, error_code)
            ):
                retry_after_val: Optional[float] = self._parseRetryAfter(
                    response.headers.get("retry-after")
                )
                delay = self._calculateRetryDelay(attempt, retry_after_val)
                _logger.warning(
                    "Retryable error %d on attempt %d/%d, retrying in %.1fs",
                    response.status_code, attempt + 1, self.max_retries + 1, delay,
                )
                await asyncio.sleep(delay)
                continue

            raise makeStatusError(response.status_code, response, error_body)

        # Every loop iteration returns, raises or continues, so this line is
        # only reached when the loop body never ran (negative max_retries).
        raise APIConnectionError("Max retries exceeded.")

    async def close(self) -> None:
        """Close the underlying async HTTP client."""
        await self._client.aclose()

    async def __aenter__(self) -> AsyncAPIClient:
        """Return the client itself for use in an ``async with`` block."""
        return self

    async def __aexit__(self, *args: Any) -> None:
        """Close the client on leaving the ``async with`` block."""
        await self.close()