PyPI - knowhere-python-sdk - Versions diffs - 0.2.1__tar.gz → 0.3.1__tar.gz - Mend

knowhere-python-sdk 0.2.1__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

knowhere_python_sdk-0.3.1/.release-please-manifest.json ADDED Viewed

@@ -0,0 +1,3 @@
+{
+  ".": "0.3.1"
+}

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,21 @@
 # Changelog
+## [0.3.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.0...v0.3.1) (2026-04-22)
+### Documentation
+* clarify ParseResult document scope ([861084e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/861084e34144987994fa618ac0db262ce681b5a8))
+* clarify ParseResult document scope ([bb14ad4](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bb14ad4077c41cbe74a5dd155995d6f9937962b8))
+## [0.3.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.1...v0.3.0) (2026-04-21)
+### Features
+* add retrieval service sdk clients ([bceef5c](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bceef5cf379dba39543244bd6ca86262a536fb9b))
+* integrate retrieval service v1 in Python SDK ([bce7aa8](https://github.com/Ontos-AI/knowhere-python-sdk/commit/bce7aa8dbf069d5880b92c6f9d8996878251f7cb))
 ## [0.2.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.0...v0.2.1) (2026-04-09)

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: knowhere-python-sdk
-Version: 0.2.1
+Version: 0.3.1
 Summary: Official Python SDK for the Knowhere document parsing API
 Project-URL: Homepage, https://knowhereto.ai
 Project-URL: Documentation, https://docs.knowhereto.ai
@@ -64,6 +64,84 @@ for chunk in result.text_chunks:
     print(chunk.content[:80])
 ```
+## Retrieval and document lifecycle
+New documents are published into a retrieval namespace. The server returns a
+stable `document_id` after the job is published. `client.jobs.create(...)`
+does not return a usable `document_id`; persist `job_result.document_id` if you
+need to update or archive the same document later.
+```python
+job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual.pdf",
+    namespace="support-center",
+)
+job_result = client.jobs.wait(job.job_id)
+document_id = job_result.document_id
+if document_id is None:
+    raise RuntimeError("Expected document_id after successful publication.")
+```
+After the job is done and published, query the canonical document content:
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="How do I reset Bluetooth pairing?",
+    top_k=5,
+    channels=["path", "term"],
+    filter_mode="keep",
+    signal_paths=["Bluetooth", "Pairing"],
+)
+print(response.router_used)
+for result in response.results:
+    print(result.content)
+    print(result.score)
+    print(result.source.source_file_name, result.source.section_path)
+```
+Use `document_id` to update or archive a document:
+```python
+update_job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual-v2.pdf",
+    document_id=document_id,
+)
+document = client.documents.get(document_id)
+print(document.status)
+client.documents.archive(document_id)
+```
+You can also list documents in a namespace:
+```python
+documents = client.documents.list(namespace="support-center")
+for document in documents.documents:
+    print(document.document_id, document.status)
+```
+Retrieval supports exclusions when clients want follow-up results that avoid
+previously used documents or sections:
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="battery charging",
+    exclude_document_ids=["doc_old"],
+    exclude_sections=[
+        {"document_id": "doc_123", "section_path": "Appendix / Legal"}
+    ],
+)
+```
 While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
 ### Parse a local file
@@ -78,6 +156,8 @@ result = client.parse(
 print(result.manifest.source_file_name)  # "report.pdf"
 print(len(result.chunks))                # 152
+print(result.namespace)                  # "default" or your explicit namespace
+print(result.document_id)                # Published canonical document id
 ```
 ### Access different chunk types
@@ -137,6 +217,7 @@ from pathlib import Path
 job = client.jobs.create(
     source_type="file",
     file_name="report.pdf",
+    namespace="support-center",
     parsing_params={"model": "advanced", "ocr_enabled": True},
 )
@@ -146,6 +227,8 @@ client.jobs.upload(job, file=Path("report.pdf"))
 # Step 3: Poll until done (adaptive backoff)
 job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
+print(job_result.document_id)  # Persist this to update/archive the document later.
 # Step 4: Download and parse results
 result = client.jobs.load(job_result)
 print(result.statistics)

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/README.md RENAMED Viewed

@@ -32,6 +32,84 @@ for chunk in result.text_chunks:
     print(chunk.content[:80])
 ```
+## Retrieval and document lifecycle
+New documents are published into a retrieval namespace. The server returns a
+stable `document_id` after the job is published. `client.jobs.create(...)`
+does not return a usable `document_id`; persist `job_result.document_id` if you
+need to update or archive the same document later.
+```python
+job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual.pdf",
+    namespace="support-center",
+)
+job_result = client.jobs.wait(job.job_id)
+document_id = job_result.document_id
+if document_id is None:
+    raise RuntimeError("Expected document_id after successful publication.")
+```
+After the job is done and published, query the canonical document content:
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="How do I reset Bluetooth pairing?",
+    top_k=5,
+    channels=["path", "term"],
+    filter_mode="keep",
+    signal_paths=["Bluetooth", "Pairing"],
+)
+print(response.router_used)
+for result in response.results:
+    print(result.content)
+    print(result.score)
+    print(result.source.source_file_name, result.source.section_path)
+```
+Use `document_id` to update or archive a document:
+```python
+update_job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual-v2.pdf",
+    document_id=document_id,
+)
+document = client.documents.get(document_id)
+print(document.status)
+client.documents.archive(document_id)
+```
+You can also list documents in a namespace:
+```python
+documents = client.documents.list(namespace="support-center")
+for document in documents.documents:
+    print(document.document_id, document.status)
+```
+Retrieval supports exclusions when clients want follow-up results that avoid
+previously used documents or sections:
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="battery charging",
+    exclude_document_ids=["doc_old"],
+    exclude_sections=[
+        {"document_id": "doc_123", "section_path": "Appendix / Legal"}
+    ],
+)
+```
 While you can provide an `api_key` keyword argument, we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/) to add `KNOWHERE_API_KEY="sk_..."` to your `.env` file so that your API key is not stored in source control.
 ### Parse a local file
@@ -46,6 +124,8 @@ result = client.parse(
 print(result.manifest.source_file_name)  # "report.pdf"
 print(len(result.chunks))                # 152
+print(result.namespace)                  # "default" or your explicit namespace
+print(result.document_id)                # Published canonical document id
 ```
 ### Access different chunk types
@@ -105,6 +185,7 @@ from pathlib import Path
 job = client.jobs.create(
     source_type="file",
     file_name="report.pdf",
+    namespace="support-center",
     parsing_params={"model": "advanced", "ocr_enabled": True},
 )
@@ -114,6 +195,8 @@ client.jobs.upload(job, file=Path("report.pdf"))
 # Step 3: Poll until done (adaptive backoff)
 job_result = client.jobs.wait(job.job_id, poll_interval=10.0, poll_timeout=1800.0)
+print(job_result.document_id)  # Persist this to update/archive the document later.
 # Step 4: Download and parse results
 result = client.jobs.load(job_result)
 print(result.statistics)

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/docs/usage.md RENAMED Viewed

@@ -12,6 +12,7 @@ Comprehensive reference for every feature, parameter, and pattern in the SDK.
 - [Working with Results](#working-with-results)
 - [Chunk Types](#chunk-types)
 - [Step-by-Step Control (Jobs API)](#step-by-step-control-jobs-api)
+- [Retrieval and Document Lifecycle](#retrieval-and-document-lifecycle)
 - [Async Usage](#async-usage)
 - [Progress Callbacks](#progress-callbacks)
 - [Error Handling](#error-handling)
@@ -316,8 +317,10 @@ from pathlib import Path
 job = client.jobs.create(
     source_type="file",
     file_name="report.pdf",
+    namespace="support-center",
     parsing_params={"model": "advanced", "ocr_enabled": True},
 )
+print(job.document_id)  # Not usable yet for a new document; persist job_result.document_id after jobs.wait().
 # Step 2: Upload file to the presigned URL
 client.jobs.upload(job, file=Path("report.pdf"))
@@ -341,6 +344,8 @@ print(result.statistics)
 | `source_type` | `"url" \| "file"` | — | Required. Whether parsing from URL or uploaded file. |
 | `source_url` | `str \| None` | `None` | URL to parse (required when `source_type="url"`). |
 | `file_name` | `str \| None` | `None` | Original filename (used when `source_type="file"`). |
+| `namespace` | `str \| None` | `None` | Retrieval namespace. The server defaults to `"default"` when omitted. |
+| `document_id` | `str \| None` | `None` | Existing document ID when creating an update job. Omit for a new document. |
 | `data_id` | `str \| None` | `None` | Your own correlation/idempotency identifier. |
 | `parsing_params` | `ParsingParams \| None` | `None` | Parsing configuration. |
 | `webhook` | `WebhookConfig \| None` | `None` | Webhook for completion notification. |
@@ -351,6 +356,8 @@ Returns a `Job` object:
 job.job_id          # "abc-123"
 job.status          # "pending"
 job.source_type     # "file"
+job.namespace       # "support-center"
+job.document_id     # not usable for new documents until published — persist job_result.document_id instead
 job.upload_url      # presigned URL (for file uploads)
 job.upload_headers  # headers to include in the upload request
 job.expires_in      # seconds until upload URL expires
@@ -407,6 +414,119 @@ result = client.jobs.load("https://storage.example.com/result.zip")
 ---
+## Retrieval and Document Lifecycle
+The retrieval APIs operate on canonical documents that are published after a
+job completes. For new documents, the server returns a stable `document_id`
+once the job is published; `jobs.create()` does not return a usable one. Store
+`job_result.document_id` in your application if you need to update or archive
+the same document later.
+### Create a retrievable document
+```python
+job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual.pdf",
+    namespace="support-center",
+)
+job_result = client.jobs.wait(job.job_id)
+print(job_result.document_id)  # "doc_..." — available once the job is published
+```
+For file uploads, the flow is the same except that you upload the file before
+polling:
+```python
+job = client.jobs.create(
+    source_type="file",
+    file_name="manual.pdf",
+    namespace="support-center",
+)
+client.jobs.upload(job, file=Path("manual.pdf"))
+job_result = client.jobs.wait(job.job_id)
+```
+### Update an existing document
+Pass the prior `document_id` to create an update job. If `namespace` is omitted,
+the API resolves the namespace from the existing document.
+```python
+update_job = client.jobs.create(
+    source_type="url",
+    source_url="https://example.com/manual-v2.pdf",
+    document_id=job_result.document_id,
+)
+```
+The API rejects concurrent non-terminal jobs for the same document with a
+retryable `ConflictError` using the server error code `ABORTED`.
+### Query retrieval results
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="How do I pair a Bluetooth headset?",
+    top_k=5,
+)
+for result in response.results:
+    print(result.content)
+    print(result.score)
+    print(result.source.document_id)
+    print(result.source.source_file_name)
+    print(result.source.section_path)
+```
+Retrieval results expose `content`, not the older parse-result `text` field.
+Media results may include `asset_url` when the server can sign the referenced
+artifact.
+Each retrieval result uses one canonical source reference shape:
+```python
+result.content
+result.chunk_type
+result.score
+result.asset_url  # Optional[str]
+result.source.document_id
+result.source.source_file_name
+result.source.section_path
+```
+### Exclude documents or sections
+Use exclusions for follow-up queries that should avoid already-used context.
+```python
+response = client.retrieval.query(
+    namespace="support-center",
+    query="battery charging",
+    top_k=10,
+    exclude_document_ids=["doc_old"],
+    exclude_sections=[
+        {"document_id": "doc_123", "section_path": "Appendix / Legal"}
+    ],
+)
+```
+### List, get, and archive documents
+```python
+document_list = client.documents.list(namespace="support-center")
+for document in document_list.documents:
+    print(document.document_id, document.status, document.source_file_name)
+document = client.documents.get("doc_123")
+print(document.current_job_result_id)
+archived = client.documents.archive("doc_123")
+print(archived.status)  # "archived"
+```
+---
 ## Async Usage
 Every method available on `Knowhere` has an async counterpart on `AsyncKnowhere`:
@@ -429,6 +549,13 @@ async def main():
         job_result = await client.jobs.wait(job.job_id)
         result = await client.jobs.load(job_result)
+        retrieval = await client.retrieval.query(
+            namespace="support-center",
+            query="refund policy",
+            top_k=5,
+        )
+        print(retrieval.results[0].content)
 asyncio.run(main())
 ```

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "knowhere-python-sdk"
-version = "0.2.1"
+version = "0.3.1"
 description = "Official Python SDK for the Knowhere document parsing API"
 readme = "README.md"
 license = "MIT"

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/src/knowhere/__init__.py RENAMED Viewed

@@ -35,8 +35,17 @@ from knowhere._exceptions import (
 )
 from knowhere._types import PollProgressCallback, UploadProgressCallback
 from knowhere._version import __version__
+from knowhere.types.document import Document, DocumentListResponse
 from knowhere.types.job import Job, JobError, JobProgress, JobResult
 from knowhere.types.params import ParsingParams, WebhookConfig
+from knowhere.types.retrieval import (
+    RetrievalChannel,
+    RetrievalFilterMode,
+    RetrievalSectionExclusion,
+    RetrievalSource,
+    RetrievalQueryResponse,
+    RetrievalResult,
+)
 from knowhere.types.result import (
     BaseChunk,
     Checksum,
@@ -87,6 +96,16 @@ __all__: list[str] = [
     "JobError",
     "JobProgress",
     "JobResult",
+    # Document types
+    "Document",
+    "DocumentListResponse",
+    # Retrieval types
+    "RetrievalChannel",
+    "RetrievalFilterMode",
+    "RetrievalSectionExclusion",
+    "RetrievalSource",
+    "RetrievalQueryResponse",
+    "RetrievalResult",
     # Result types
     "ParseResult",
     "Manifest",

{knowhere_python_sdk-0.2.1 → knowhere_python_sdk-0.3.1}/src/knowhere/_client.py RENAMED Viewed

@@ -19,7 +19,9 @@ from knowhere._types import (
     PollProgressCallback,
     UploadProgressCallback,
 )
+from knowhere.resources.documents import AsyncDocuments, Documents
 from knowhere.resources.jobs import AsyncJobs, Jobs
+from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
 from knowhere.types.job import Job, JobResult
 from knowhere.types.params import ParsingParams, WebhookConfig
 from knowhere.types.result import ParseResult
@@ -42,6 +44,16 @@ class Knowhere(SyncAPIClient):
         """Access the jobs resource namespace."""
         return Jobs(self)
+    @cached_property
+    def retrieval(self) -> Retrieval:
+        """Access the retrieval resource namespace."""
+        return Retrieval(self)
+    @cached_property
+    def documents(self) -> Documents:
+        """Access the documents resource namespace."""
+        return Documents(self)
     # -- overloaded parse signatures --
     @overload
@@ -50,6 +62,8 @@ class Knowhere(SyncAPIClient):
         *,
         url: str,
         data_id: Optional[str] = ...,
+        namespace: Optional[str] = ...,
+        document_id: Optional[str] = ...,
         parsing_params: Optional[ParsingParams] = ...,
         webhook: Optional[WebhookConfig] = ...,
         poll_interval: float = ...,
@@ -66,6 +80,8 @@ class Knowhere(SyncAPIClient):
         file: Union[Path, BinaryIO, bytes],
         file_name: Optional[str] = ...,
         data_id: Optional[str] = ...,
+        namespace: Optional[str] = ...,
+        document_id: Optional[str] = ...,
         parsing_params: Optional[ParsingParams] = ...,
         webhook: Optional[WebhookConfig] = ...,
         poll_interval: float = ...,
@@ -82,6 +98,8 @@ class Knowhere(SyncAPIClient):
         file: Optional[Union[Path, BinaryIO, bytes]] = None,
         file_name: Optional[str] = None,
         data_id: Optional[str] = None,
+        namespace: Optional[str] = None,
+        document_id: Optional[str] = None,
         parsing_params: Optional[ParsingParams] = None,
         webhook: Optional[WebhookConfig] = None,
         poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -105,6 +123,8 @@ class Knowhere(SyncAPIClient):
                 source_type="url",
                 source_url=url,
                 data_id=data_id,
+                namespace=namespace,
+                document_id=document_id,
                 parsing_params=parsing_params,
                 webhook=webhook,
             )
@@ -116,6 +136,8 @@ class Knowhere(SyncAPIClient):
                 source_type="file",
                 file_name=resolved_name,
                 data_id=data_id,
+                namespace=namespace,
+                document_id=document_id,
                 parsing_params=parsing_params,
                 webhook=webhook,
             )
@@ -149,12 +171,24 @@ class AsyncKnowhere(AsyncAPIClient):
         """Access the async jobs resource namespace."""
         return AsyncJobs(self)
+    @cached_property
+    def retrieval(self) -> AsyncRetrieval:
+        """Access the async retrieval resource namespace."""
+        return AsyncRetrieval(self)
+    @cached_property
+    def documents(self) -> AsyncDocuments:
+        """Access the async documents resource namespace."""
+        return AsyncDocuments(self)
     @overload
     async def parse(
         self,
         *,
         url: str,
         data_id: Optional[str] = ...,
+        namespace: Optional[str] = ...,
+        document_id: Optional[str] = ...,
         parsing_params: Optional[ParsingParams] = ...,
         webhook: Optional[WebhookConfig] = ...,
         poll_interval: float = ...,
@@ -171,6 +205,8 @@ class AsyncKnowhere(AsyncAPIClient):
         file: Union[Path, BinaryIO, bytes],
         file_name: Optional[str] = ...,
         data_id: Optional[str] = ...,
+        namespace: Optional[str] = ...,
+        document_id: Optional[str] = ...,
         parsing_params: Optional[ParsingParams] = ...,
         webhook: Optional[WebhookConfig] = ...,
         poll_interval: float = ...,
@@ -187,6 +223,8 @@ class AsyncKnowhere(AsyncAPIClient):
         file: Optional[Union[Path, BinaryIO, bytes]] = None,
         file_name: Optional[str] = None,
         data_id: Optional[str] = None,
+        namespace: Optional[str] = None,
+        document_id: Optional[str] = None,
         parsing_params: Optional[ParsingParams] = None,
         webhook: Optional[WebhookConfig] = None,
         poll_interval: float = DEFAULT_POLL_INTERVAL,
@@ -206,6 +244,8 @@ class AsyncKnowhere(AsyncAPIClient):
                 source_type="url",
                 source_url=url,
                 data_id=data_id,
+                namespace=namespace,
+                document_id=document_id,
                 parsing_params=parsing_params,
                 webhook=webhook,
             )
@@ -217,6 +257,8 @@ class AsyncKnowhere(AsyncAPIClient):
                 source_type="file",
                 file_name=resolved_name,
                 data_id=data_id,
+                namespace=namespace,
+                document_id=document_id,
                 parsing_params=parsing_params,
                 webhook=webhook,
             )
@@ -232,4 +274,4 @@ class AsyncKnowhere(AsyncAPIClient):
         return await self.jobs.load(
             job_result, verify_checksum=verify_checksum
-        )
+        )

knowhere_python_sdk-0.3.1/src/knowhere/_version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.3.1"  # x-release-please-version

knowhere_python_sdk-0.3.1/src/knowhere/resources/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Resource namespace re-exports."""
+from __future__ import annotations
+from knowhere.resources.documents import AsyncDocuments, Documents
+from knowhere.resources.jobs import AsyncJobs, Jobs
+from knowhere.resources.retrieval import AsyncRetrieval, Retrieval
+__all__: list[str] = [
+    "AsyncDocuments",
+    "AsyncJobs",
+    "AsyncRetrieval",
+    "Documents",
+    "Jobs",
+    "Retrieval",
+]

knowhere-python-sdk 0.2.1__tar.gz → 0.3.1__tar.gz

knowhere-python-sdk 0.2.1__tar.gz → 0.3.1__tar.gz