knowhere-python-sdk 0.2.0__tar.gz → 0.3.0__tar.gz

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were released to. It is provided for informational purposes only.
Files changed (60)
  1. knowhere_python_sdk-0.3.0/.release-please-manifest.json +3 -0
  2. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/CHANGELOG.md +18 -0
  3. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/PKG-INFO +72 -1
  4. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/README.md +71 -0
  5. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/docs/usage.md +127 -0
  6. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/pyproject.toml +1 -1
  7. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/__init__.py +21 -0
  8. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_client.py +43 -1
  9. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_exceptions.py +21 -3
  10. knowhere_python_sdk-0.3.0/src/knowhere/_version.py +1 -0
  11. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/result_parser.py +32 -0
  12. knowhere_python_sdk-0.3.0/src/knowhere/resources/__init__.py +16 -0
  13. knowhere_python_sdk-0.3.0/src/knowhere/resources/documents.py +74 -0
  14. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/resources/jobs.py +14 -0
  15. knowhere_python_sdk-0.3.0/src/knowhere/resources/retrieval.py +70 -0
  16. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/__init__.py +21 -0
  17. knowhere_python_sdk-0.3.0/src/knowhere/types/document.py +28 -0
  18. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/job.py +4 -0
  19. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/result.py +100 -0
  20. knowhere_python_sdk-0.3.0/src/knowhere/types/retrieval.py +33 -0
  21. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/conftest.py +4 -1
  22. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_client.py +34 -0
  23. knowhere_python_sdk-0.3.0/tests/test_documents.py +106 -0
  24. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_jobs.py +11 -2
  25. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_models.py +73 -2
  26. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_polling.py +1 -1
  27. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_result_parser.py +195 -0
  28. knowhere_python_sdk-0.3.0/tests/test_retrieval.py +110 -0
  29. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_retry.py +0 -1
  30. knowhere_python_sdk-0.2.0/.release-please-manifest.json +0 -3
  31. knowhere_python_sdk-0.2.0/src/knowhere/_version.py +0 -1
  32. knowhere_python_sdk-0.2.0/src/knowhere/resources/__init__.py +0 -7
  33. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/ci.yml +0 -0
  34. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/publish-pypi.yml +0 -0
  35. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.github/workflows/publish.yml +0 -0
  36. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/.gitignore +0 -0
  37. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/async_usage.py +0 -0
  38. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/error_handling.py +0 -0
  39. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/parse_file.py +0 -0
  40. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/parse_url.py +0 -0
  41. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/examples/step_by_step.py +0 -0
  42. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/release-please-config.json +0 -0
  43. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_base_client.py +0 -0
  44. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_constants.py +0 -0
  45. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_logging.py +0 -0
  46. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_response.py +0 -0
  47. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/_types.py +0 -0
  48. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/__init__.py +0 -0
  49. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/polling.py +0 -0
  50. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/lib/upload.py +0 -0
  51. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/py.typed +0 -0
  52. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/resources/_base.py +0 -0
  53. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/params.py +0 -0
  54. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/src/knowhere/types/shared.py +0 -0
  55. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/__init__.py +0 -0
  56. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/fixtures/real_result.zip +0 -0
  57. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_exceptions.py +0 -0
  58. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_logging.py +0 -0
  59. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_parse.py +0 -0
  60. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.3.0}/tests/test_upload.py +0 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.3.0"
3
+ }
@@ -1,5 +1,23 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.3.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.1...v0.3.0) (2026-04-21)
4
+
5
+
6
+ ### Features
7
+
8
+ * add retrieval service sdk clients ([bceef5c](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bceef5cf379dba39543244bd6ca86262a536fb9b))
9
+ * integrate retrieval service v1 in Python SDK ([bce7aa8](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bce7aa8dbf069d5880b92c6f9d8996878251f7cb))
10
+
11
+ ## [0.2.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.0...v0.2.1) (2026-04-09)
12
+
13
+
14
+ ### Bug Fixes
15
+
16
+ * narrow status error constructors ([c8fc035](https://github.com/Ontos-AI/knowhere-python-sdk/commit/c8fc035dade768c5364e50de890bde0fb280586e))
17
+ * remove stale mypy ignore ([150336a](https://github.com/Ontos-AI/knowhere-python-sdk/commit/150336a5dc0497b287437dffa6e1506f4bcf8fbf))
18
+ * sync optimized parse result payload ([a7903ad](https://github.com/Ontos-AI/knowhere-python-sdk/commit/a7903ad53fb5ab142c5835134c9a942eb5cdfe21))
19
+ * sync parse result payload with current API schema ([430b067](https://github.com/Ontos-AI/knowhere-python-sdk/commit/430b067b37ce0b2eb8bd3c81cfca56b1df657376))
20
+
3
21
  ## [0.2.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.1.0...v0.2.0) (2026-03-18)
4
22
 
5
23
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -64,6 +64,74 @@ for chunk in result.text_chunks:
64
64
  print(chunk.content[:80])
65
65
  ```
66
66
 
67
+ ## Retrieval and document lifecycle
68
+
69
+ New documents are published into a retrieval namespace. The server returns a
70
+ stable `document_id` when you create a job; persist that value if you need to
71
+ update or archive the same document later.
72
+
73
+ ```python
74
+ job = client.jobs.create(
75
+ source_type="url",
76
+ source_url="https://example.com/manual.pdf",
77
+ namespace="support-center",
78
+ )
79
+
80
+ print(job.document_id) # "doc_..."
81
+ ```
82
+
83
+ After the job is done and published, query the canonical document content:
84
+
85
+ ```python
86
+ response = client.retrieval.query(
87
+ namespace="support-center",
88
+ query="How do I reset Bluetooth pairing?",
89
+ top_k=5,
90
+ )
91
+
92
+ for result in response.results:
93
+ print(result.content)
94
+ print(result.score)
95
+ print(result.source.source_file_name, result.source.section_path)
96
+ ```
97
+
98
+ Use `document_id` to update or archive a document:
99
+
100
+ ```python
101
+ update_job = client.jobs.create(
102
+ source_type="url",
103
+ source_url="https://example.com/manual-v2.pdf",
104
+ document_id=job.document_id,
105
+ )
106
+
107
+ document = client.documents.get(job.document_id)
108
+ print(document.status)
109
+
110
+ client.documents.archive(job.document_id)
111
+ ```
112
+
113
+ You can also list documents in a namespace:
114
+
115
+ ```python
116
+ documents = client.documents.list(namespace="support-center")
117
+ for document in documents.documents:
118
+ print(document.document_id, document.status)
119
+ ```
120
+
121
+ Retrieval supports exclusions when clients want follow-up results that avoid
122
+ previously used documents or sections:
123
+
124
+ ```python
125
+ response = client.retrieval.query(
126
+ namespace="support-center",
127
+ query="battery charging",
128
+ exclude_document_ids=["doc_old"],
129
+ exclude_sections=[
130
+ {"document_id": "doc_123", "section_path": "Appendix / Legal"}
131
+ ],
132
+ )
133
+ ```
134
+
67
135
  While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
68
136
 
69
137
  ### Parse a local file
@@ -137,9 +205,12 @@ from pathlib import Path
137
205
  job = client.jobs.create(
138
206
  source_type="file",
139
207
  file_name="report.pdf",
208
+ namespace="support-center",
140
209
  parsing_params={"model": "advanced", "ocr_enabled": True},
141
210
  )
142
211
 
212
+ print(job.document_id) # Persist this to update/archive the document later.
213
+
143
214
  # Step 2: Upload file to presigned URL
144
215
  client.jobs.upload(job, file=Path("report.pdf"))
145
216
 
@@ -32,6 +32,74 @@ for chunk in result.text_chunks:
32
32
  print(chunk.content[:80])
33
33
  ```
34
34
 
35
+ ## Retrieval and document lifecycle
36
+
37
+ New documents are published into a retrieval namespace. The server returns a
38
+ stable `document_id` when you create a job; persist that value if you need to
39
+ update or archive the same document later.
40
+
41
+ ```python
42
+ job = client.jobs.create(
43
+ source_type="url",
44
+ source_url="https://example.com/manual.pdf",
45
+ namespace="support-center",
46
+ )
47
+
48
+ print(job.document_id) # "doc_..."
49
+ ```
50
+
51
+ After the job is done and published, query the canonical document content:
52
+
53
+ ```python
54
+ response = client.retrieval.query(
55
+ namespace="support-center",
56
+ query="How do I reset Bluetooth pairing?",
57
+ top_k=5,
58
+ )
59
+
60
+ for result in response.results:
61
+ print(result.content)
62
+ print(result.score)
63
+ print(result.source.source_file_name, result.source.section_path)
64
+ ```
65
+
66
+ Use `document_id` to update or archive a document:
67
+
68
+ ```python
69
+ update_job = client.jobs.create(
70
+ source_type="url",
71
+ source_url="https://example.com/manual-v2.pdf",
72
+ document_id=job.document_id,
73
+ )
74
+
75
+ document = client.documents.get(job.document_id)
76
+ print(document.status)
77
+
78
+ client.documents.archive(job.document_id)
79
+ ```
80
+
81
+ You can also list documents in a namespace:
82
+
83
+ ```python
84
+ documents = client.documents.list(namespace="support-center")
85
+ for document in documents.documents:
86
+ print(document.document_id, document.status)
87
+ ```
88
+
89
+ Retrieval supports exclusions when clients want follow-up results that avoid
90
+ previously used documents or sections:
91
+
92
+ ```python
93
+ response = client.retrieval.query(
94
+ namespace="support-center",
95
+ query="battery charging",
96
+ exclude_document_ids=["doc_old"],
97
+ exclude_sections=[
98
+ {"document_id": "doc_123", "section_path": "Appendix / Legal"}
99
+ ],
100
+ )
101
+ ```
102
+
35
103
  While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
36
104
 
37
105
  ### Parse a local file
@@ -105,9 +173,12 @@ from pathlib import Path
105
173
  job = client.jobs.create(
106
174
  source_type="file",
107
175
  file_name="report.pdf",
176
+ namespace="support-center",
108
177
  parsing_params={"model": "advanced", "ocr_enabled": True},
109
178
  )
110
179
 
180
+ print(job.document_id) # Persist this to update/archive the document later.
181
+
111
182
  # Step 2: Upload file to presigned URL
112
183
  client.jobs.upload(job, file=Path("report.pdf"))
113
184
 
@@ -12,6 +12,7 @@ Comprehensive reference for every feature, parameter, and pattern in the SDK.
12
12
  - [Working with Results](#working-with-results)
13
13
  - [Chunk Types](#chunk-types)
14
14
  - [Step-by-Step Control (Jobs API)](#step-by-step-control-jobs-api)
15
+ - [Retrieval and Document Lifecycle](#retrieval-and-document-lifecycle)
15
16
  - [Async Usage](#async-usage)
16
17
  - [Progress Callbacks](#progress-callbacks)
17
18
  - [Error Handling](#error-handling)
@@ -316,8 +317,10 @@ from pathlib import Path
316
317
  job = client.jobs.create(
317
318
  source_type="file",
318
319
  file_name="report.pdf",
320
+ namespace="support-center",
319
321
  parsing_params={"model": "advanced", "ocr_enabled": True},
320
322
  )
323
+ print(job.document_id) # Persist this value for update/archive flows.
321
324
 
322
325
  # Step 2: Upload file to the presigned URL
323
326
  client.jobs.upload(job, file=Path("report.pdf"))
@@ -341,6 +344,8 @@ print(result.statistics)
341
344
  | `source_type` | `"url" \| "file"` | — | Required. Whether parsing from URL or uploaded file. |
342
345
  | `source_url` | `str \| None` | `None` | URL to parse (required when `source_type="url"`). |
343
346
  | `file_name` | `str \| None` | `None` | Original filename (used when `source_type="file"`). |
347
+ | `namespace` | `str \| None` | `None` | Retrieval namespace. The server defaults to `"default"` when omitted. |
348
+ | `document_id` | `str \| None` | `None` | Existing document ID when creating an update job. Omit for a new document. |
344
349
  | `data_id` | `str \| None` | `None` | Your own correlation/idempotency identifier. |
345
350
  | `parsing_params` | `ParsingParams \| None` | `None` | Parsing configuration. |
346
351
  | `webhook` | `WebhookConfig \| None` | `None` | Webhook for completion notification. |
@@ -351,6 +356,8 @@ Returns a `Job` object:
351
356
  job.job_id # "abc-123"
352
357
  job.status # "pending"
353
358
  job.source_type # "file"
359
+ job.namespace # "support-center"
360
+ job.document_id # "doc_..." — persist this for updates and archive calls
354
361
  job.upload_url # presigned URL (for file uploads)
355
362
  job.upload_headers # headers to include in the upload request
356
363
  job.expires_in # seconds until upload URL expires
@@ -407,6 +414,119 @@ result = client.jobs.load("https://storage.example.com/result.zip")
407
414
 
408
415
  ---
409
416
 
417
+ ## Retrieval and Document Lifecycle
418
+
419
+ The retrieval APIs operate on canonical documents that are published after a
420
+ job completes. For new documents, the server generates `document_id` during
421
+ `jobs.create()`. Store that ID in your application if you need to update or
422
+ archive the same document later.
423
+
424
+ ### Create a retrievable document
425
+
426
+ ```python
427
+ job = client.jobs.create(
428
+ source_type="url",
429
+ source_url="https://example.com/manual.pdf",
430
+ namespace="support-center",
431
+ )
432
+
433
+ print(job.document_id) # "doc_..."
434
+ ```
435
+
436
+ For file uploads, the flow is the same except that you upload the file before
437
+ polling:
438
+
439
+ ```python
440
+ job = client.jobs.create(
441
+ source_type="file",
442
+ file_name="manual.pdf",
443
+ namespace="support-center",
444
+ )
445
+ client.jobs.upload(job, file=Path("manual.pdf"))
446
+ job_result = client.jobs.wait(job.job_id)
447
+ ```
448
+
449
+ ### Update an existing document
450
+
451
+ Pass the prior `document_id` to create an update job. If `namespace` is omitted,
452
+ the API resolves the namespace from the existing document.
453
+
454
+ ```python
455
+ update_job = client.jobs.create(
456
+ source_type="url",
457
+ source_url="https://example.com/manual-v2.pdf",
458
+ document_id=job.document_id,
459
+ )
460
+ ```
461
+
462
+ The API rejects concurrent non-terminal jobs for the same document with a
463
+ retryable `ConflictError` using the server error code `ABORTED`.
464
+
465
+ ### Query retrieval results
466
+
467
+ ```python
468
+ response = client.retrieval.query(
469
+ namespace="support-center",
470
+ query="How do I pair a Bluetooth headset?",
471
+ top_k=5,
472
+ )
473
+
474
+ for result in response.results:
475
+ print(result.content)
476
+ print(result.score)
477
+ print(result.source.document_id)
478
+ print(result.source.source_file_name)
479
+ print(result.source.section_path)
480
+ ```
481
+
482
+ Retrieval results expose `content`, not the older parse-result `text` field.
483
+ Media results may include `asset_url` when the server can sign the referenced
484
+ artifact.
485
+
486
+ Each retrieval result uses one canonical source reference shape:
487
+
488
+ ```python
489
+ result.content
490
+ result.chunk_type
491
+ result.score
492
+ result.asset_url # Optional[str]
493
+ result.source.document_id
494
+ result.source.source_file_name
495
+ result.source.section_path
496
+ ```
497
+
498
+ ### Exclude documents or sections
499
+
500
+ Use exclusions for follow-up queries that should avoid already-used context.
501
+
502
+ ```python
503
+ response = client.retrieval.query(
504
+ namespace="support-center",
505
+ query="battery charging",
506
+ top_k=10,
507
+ exclude_document_ids=["doc_old"],
508
+ exclude_sections=[
509
+ {"document_id": "doc_123", "section_path": "Appendix / Legal"}
510
+ ],
511
+ )
512
+ ```
513
+
514
+ ### List, get, and archive documents
515
+
516
+ ```python
517
+ document_list = client.documents.list(namespace="support-center")
518
+ for document in document_list.documents:
519
+ print(document.document_id, document.status, document.source_file_name)
520
+
521
+ document = client.documents.get("doc_123")
522
+ print(document.current_job_result_id)
523
+
524
+ archived = client.documents.archive("doc_123")
525
+ print(archived.status) # "archived"
526
+ ```
527
+
528
+ ---
529
+
410
530
  ## Async Usage
411
531
 
412
532
  Every method available on `Knowhere` has an async counterpart on `AsyncKnowhere`:
@@ -429,6 +549,13 @@ async def main():
429
549
  job_result = await client.jobs.wait(job.job_id)
430
550
  result = await client.jobs.load(job_result)
431
551
 
552
+ retrieval = await client.retrieval.query(
553
+ namespace="support-center",
554
+ query="refund policy",
555
+ top_k=5,
556
+ )
557
+ print(retrieval.results[0].content)
558
+
432
559
  asyncio.run(main())
433
560
  ```
434
561
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowhere-python-sdk"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Official Python SDK for the Knowhere document parsing API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -35,8 +35,14 @@ from knowhere._exceptions import (
35
35
  )
36
36
  from knowhere._types import PollProgressCallback, UploadProgressCallback
37
37
  from knowhere._version import __version__
38
+ from knowhere.types.document import Document, DocumentListResponse
38
39
  from knowhere.types.job import Job, JobError, JobProgress, JobResult
39
40
  from knowhere.types.params import ParsingParams, WebhookConfig
41
+ from knowhere.types.retrieval import (
42
+ RetrievalSource,
43
+ RetrievalQueryResponse,
44
+ RetrievalResult,
45
+ )
40
46
  from knowhere.types.result import (
41
47
  BaseChunk,
42
48
  Checksum,
@@ -46,6 +52,10 @@ from knowhere.types.result import (
46
52
  ImageFileInfo,
47
53
  Manifest,
48
54
  ParseResult,
55
+ ProcessingCost,
56
+ ProcessingMetadata,
57
+ ProcessingTiming,
58
+ SlimChunk,
49
59
  Statistics,
50
60
  TableChunk,
51
61
  TableFileInfo,
@@ -83,6 +93,13 @@ __all__: list[str] = [
83
93
  "JobError",
84
94
  "JobProgress",
85
95
  "JobResult",
96
+ # Document types
97
+ "Document",
98
+ "DocumentListResponse",
99
+ # Retrieval types
100
+ "RetrievalSource",
101
+ "RetrievalQueryResponse",
102
+ "RetrievalResult",
86
103
  # Result types
87
104
  "ParseResult",
88
105
  "Manifest",
@@ -91,6 +108,10 @@ __all__: list[str] = [
91
108
  "FileIndex",
92
109
  "ImageFileInfo",
93
110
  "TableFileInfo",
111
+ "ProcessingCost",
112
+ "ProcessingMetadata",
113
+ "ProcessingTiming",
114
+ "SlimChunk",
94
115
  "BaseChunk",
95
116
  "TextChunk",
96
117
  "ImageChunk",
@@ -19,7 +19,9 @@ from knowhere._types import (
19
19
  PollProgressCallback,
20
20
  UploadProgressCallback,
21
21
  )
22
+ from knowhere.resources.documents import AsyncDocuments, Documents
22
23
  from knowhere.resources.jobs import AsyncJobs, Jobs
24
+ from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
23
25
  from knowhere.types.job import Job, JobResult
24
26
  from knowhere.types.params import ParsingParams, WebhookConfig
25
27
  from knowhere.types.result import ParseResult
@@ -42,6 +44,16 @@ class Knowhere(SyncAPIClient):
42
44
  """Access the jobs resource namespace."""
43
45
  return Jobs(self)
44
46
 
47
+ @cached_property
48
+ def retrieval(self) -> Retrieval:
49
+ """Access the retrieval resource namespace."""
50
+ return Retrieval(self)
51
+
52
+ @cached_property
53
+ def documents(self) -> Documents:
54
+ """Access the documents resource namespace."""
55
+ return Documents(self)
56
+
45
57
  # -- overloaded parse signatures --
46
58
 
47
59
  @overload
@@ -50,6 +62,8 @@ class Knowhere(SyncAPIClient):
50
62
  *,
51
63
  url: str,
52
64
  data_id: Optional[str] = ...,
65
+ namespace: Optional[str] = ...,
66
+ document_id: Optional[str] = ...,
53
67
  parsing_params: Optional[ParsingParams] = ...,
54
68
  webhook: Optional[WebhookConfig] = ...,
55
69
  poll_interval: float = ...,
@@ -66,6 +80,8 @@ class Knowhere(SyncAPIClient):
66
80
  file: Union[Path, BinaryIO, bytes],
67
81
  file_name: Optional[str] = ...,
68
82
  data_id: Optional[str] = ...,
83
+ namespace: Optional[str] = ...,
84
+ document_id: Optional[str] = ...,
69
85
  parsing_params: Optional[ParsingParams] = ...,
70
86
  webhook: Optional[WebhookConfig] = ...,
71
87
  poll_interval: float = ...,
@@ -82,6 +98,8 @@ class Knowhere(SyncAPIClient):
82
98
  file: Optional[Union[Path, BinaryIO, bytes]] = None,
83
99
  file_name: Optional[str] = None,
84
100
  data_id: Optional[str] = None,
101
+ namespace: Optional[str] = None,
102
+ document_id: Optional[str] = None,
85
103
  parsing_params: Optional[ParsingParams] = None,
86
104
  webhook: Optional[WebhookConfig] = None,
87
105
  poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -105,6 +123,8 @@ class Knowhere(SyncAPIClient):
105
123
  source_type="url",
106
124
  source_url=url,
107
125
  data_id=data_id,
126
+ namespace=namespace,
127
+ document_id=document_id,
108
128
  parsing_params=parsing_params,
109
129
  webhook=webhook,
110
130
  )
@@ -116,6 +136,8 @@ class Knowhere(SyncAPIClient):
116
136
  source_type="file",
117
137
  file_name=resolved_name,
118
138
  data_id=data_id,
139
+ namespace=namespace,
140
+ document_id=document_id,
119
141
  parsing_params=parsing_params,
120
142
  webhook=webhook,
121
143
  )
@@ -149,12 +171,24 @@ class AsyncKnowhere(AsyncAPIClient):
149
171
  """Access the async jobs resource namespace."""
150
172
  return AsyncJobs(self)
151
173
 
174
+ @cached_property
175
+ def retrieval(self) -> AsyncRetrieval:
176
+ """Access the async retrieval resource namespace."""
177
+ return AsyncRetrieval(self)
178
+
179
+ @cached_property
180
+ def documents(self) -> AsyncDocuments:
181
+ """Access the async documents resource namespace."""
182
+ return AsyncDocuments(self)
183
+
152
184
  @overload
153
185
  async def parse(
154
186
  self,
155
187
  *,
156
188
  url: str,
157
189
  data_id: Optional[str] = ...,
190
+ namespace: Optional[str] = ...,
191
+ document_id: Optional[str] = ...,
158
192
  parsing_params: Optional[ParsingParams] = ...,
159
193
  webhook: Optional[WebhookConfig] = ...,
160
194
  poll_interval: float = ...,
@@ -171,6 +205,8 @@ class AsyncKnowhere(AsyncAPIClient):
171
205
  file: Union[Path, BinaryIO, bytes],
172
206
  file_name: Optional[str] = ...,
173
207
  data_id: Optional[str] = ...,
208
+ namespace: Optional[str] = ...,
209
+ document_id: Optional[str] = ...,
174
210
  parsing_params: Optional[ParsingParams] = ...,
175
211
  webhook: Optional[WebhookConfig] = ...,
176
212
  poll_interval: float = ...,
@@ -187,6 +223,8 @@ class AsyncKnowhere(AsyncAPIClient):
187
223
  file: Optional[Union[Path, BinaryIO, bytes]] = None,
188
224
  file_name: Optional[str] = None,
189
225
  data_id: Optional[str] = None,
226
+ namespace: Optional[str] = None,
227
+ document_id: Optional[str] = None,
190
228
  parsing_params: Optional[ParsingParams] = None,
191
229
  webhook: Optional[WebhookConfig] = None,
192
230
  poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -206,6 +244,8 @@ class AsyncKnowhere(AsyncAPIClient):
206
244
  source_type="url",
207
245
  source_url=url,
208
246
  data_id=data_id,
247
+ namespace=namespace,
248
+ document_id=document_id,
209
249
  parsing_params=parsing_params,
210
250
  webhook=webhook,
211
251
  )
@@ -217,6 +257,8 @@ class AsyncKnowhere(AsyncAPIClient):
217
257
  source_type="file",
218
258
  file_name=resolved_name,
219
259
  data_id=data_id,
260
+ namespace=namespace,
261
+ document_id=document_id,
220
262
  parsing_params=parsing_params,
221
263
  webhook=webhook,
222
264
  )
@@ -232,4 +274,4 @@ class AsyncKnowhere(AsyncAPIClient):
232
274
 
233
275
  return await self.jobs.load(
234
276
  job_result, verify_checksum=verify_checksum
235
- )
277
+ )
@@ -387,11 +387,29 @@ def makeStatusError(
387
387
  response=response,
388
388
  )
389
389
 
390
- if exception_class in (RateLimitError, ServiceUnavailableError, GatewayTimeoutError):
391
- return exception_class(
390
+ if exception_class is RateLimitError:
391
+ return RateLimitError(
392
392
  status_code,
393
393
  **common_kwargs,
394
- retry_after=retry_after, # type: ignore[call-arg]
394
+ retry_after=retry_after,
395
+ limit=limit,
396
+ period=period,
397
+ )
398
+
399
+ if exception_class is ServiceUnavailableError:
400
+ return ServiceUnavailableError(
401
+ status_code,
402
+ **common_kwargs,
403
+ retry_after=retry_after,
404
+ limit=limit,
405
+ period=period,
406
+ )
407
+
408
+ if exception_class is GatewayTimeoutError:
409
+ return GatewayTimeoutError(
410
+ status_code,
411
+ **common_kwargs,
412
+ retry_after=retry_after,
395
413
  limit=limit,
396
414
  period=period,
397
415
  )
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0" # x-release-please-version
@@ -16,6 +16,7 @@ from knowhere.types.result import (
16
16
  ImageChunk,
17
17
  Manifest,
18
18
  ParseResult,
19
+ SlimChunk,
19
20
  TableChunk,
20
21
  TextChunk,
21
22
  TextChunkTokens,
@@ -134,6 +135,7 @@ def _buildChunks(
134
135
  type="image",
135
136
  content=raw.get("content", ""),
136
137
  path=raw.get("path"),
138
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
137
139
  length=metadata.get("length", raw.get("length", 0)),
138
140
  file_path=file_path,
139
141
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -151,6 +153,7 @@ def _buildChunks(
151
153
  type="table",
152
154
  content=raw.get("content", ""),
153
155
  path=raw.get("path"),
156
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
154
157
  length=metadata.get("length", raw.get("length", 0)),
155
158
  file_path=file_path,
156
159
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -167,10 +170,12 @@ def _buildChunks(
167
170
  type="text",
168
171
  content=raw.get("content", ""),
169
172
  path=raw.get("path"),
173
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
170
174
  length=metadata.get("length", raw.get("length", 0)),
171
175
  tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
172
176
  keywords=metadata.get("keywords", raw.get("keywords")),
173
177
  summary=metadata.get("summary", raw.get("summary")),
178
+ connect_to=metadata.get("connect_to", raw.get("connect_to")),
174
179
  relationships=metadata.get("relationships", raw.get("relationships")),
175
180
  )
176
181
 
@@ -230,12 +235,39 @@ def parseResultZip(
230
235
  json.loads(hierarchy_text) if hierarchy_text else None
231
236
  )
232
237
 
238
+ # -- Optimized sidecar files --
239
+ chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
240
+ parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
241
+ if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
242
+ raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
243
+ elif isinstance(parsed_chunks_slim, list):
244
+ raw_chunks_slim = parsed_chunks_slim
245
+ else:
246
+ raw_chunks_slim = []
247
+ chunks_slim: Optional[List[SlimChunk]] = (
248
+ [SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
249
+ if chunks_slim_text is not None
250
+ else None
251
+ )
252
+
253
+ toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
254
+ toc_hierarchies: Optional[Any] = (
255
+ json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
256
+ )
257
+
258
+ kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
259
+ hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
260
+
233
261
  zf.close()
234
262
 
235
263
  return ParseResult(
236
264
  manifest=manifest,
237
265
  chunks=chunks,
266
+ chunks_slim=chunks_slim,
238
267
  full_markdown=full_markdown,
239
268
  hierarchy=hierarchy,
269
+ toc_hierarchies=toc_hierarchies,
270
+ kb_csv=kb_csv,
271
+ hierarchy_view_html=hierarchy_view_html,
240
272
  raw_zip=zip_bytes,
241
273
  )