knowledge2 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge2-0.4.0.dist-info/METADATA +556 -0
- knowledge2-0.4.0.dist-info/RECORD +139 -0
- knowledge2-0.4.0.dist-info/WHEEL +5 -0
- knowledge2-0.4.0.dist-info/top_level.txt +1 -0
- sdk/__init__.py +70 -0
- sdk/_async_base.py +525 -0
- sdk/_async_paging.py +57 -0
- sdk/_base.py +541 -0
- sdk/_logging.py +41 -0
- sdk/_paging.py +73 -0
- sdk/_preview.py +70 -0
- sdk/_raw_response.py +25 -0
- sdk/_request_options.py +51 -0
- sdk/_transport.py +144 -0
- sdk/_validation.py +25 -0
- sdk/_validation_response.py +36 -0
- sdk/_version.py +3 -0
- sdk/async_client.py +320 -0
- sdk/async_resources/__init__.py +45 -0
- sdk/async_resources/_mixin_base.py +42 -0
- sdk/async_resources/a2a.py +230 -0
- sdk/async_resources/agents.py +489 -0
- sdk/async_resources/audit.py +145 -0
- sdk/async_resources/auth.py +133 -0
- sdk/async_resources/console.py +409 -0
- sdk/async_resources/corpora.py +276 -0
- sdk/async_resources/deployments.py +106 -0
- sdk/async_resources/documents.py +592 -0
- sdk/async_resources/feeds.py +248 -0
- sdk/async_resources/indexes.py +208 -0
- sdk/async_resources/jobs.py +165 -0
- sdk/async_resources/metadata.py +48 -0
- sdk/async_resources/models.py +102 -0
- sdk/async_resources/onboarding.py +538 -0
- sdk/async_resources/orgs.py +37 -0
- sdk/async_resources/pipelines.py +523 -0
- sdk/async_resources/projects.py +90 -0
- sdk/async_resources/search.py +262 -0
- sdk/async_resources/training.py +357 -0
- sdk/async_resources/usage.py +91 -0
- sdk/client.py +417 -0
- sdk/config.py +182 -0
- sdk/errors.py +178 -0
- sdk/examples/auth_factory.py +34 -0
- sdk/examples/batch_operations.py +57 -0
- sdk/examples/document_upload.py +56 -0
- sdk/examples/e2e_lifecycle.py +213 -0
- sdk/examples/error_handling.py +61 -0
- sdk/examples/pagination.py +64 -0
- sdk/examples/quickstart.py +36 -0
- sdk/examples/request_options.py +44 -0
- sdk/examples/search.py +64 -0
- sdk/integrations/__init__.py +57 -0
- sdk/integrations/_client.py +101 -0
- sdk/integrations/langchain/__init__.py +6 -0
- sdk/integrations/langchain/retriever.py +166 -0
- sdk/integrations/langchain/tools.py +108 -0
- sdk/integrations/llamaindex/__init__.py +11 -0
- sdk/integrations/llamaindex/filters.py +78 -0
- sdk/integrations/llamaindex/retriever.py +162 -0
- sdk/integrations/llamaindex/tools.py +109 -0
- sdk/integrations/llamaindex/vector_store.py +320 -0
- sdk/models/__init__.py +18 -0
- sdk/models/_base.py +24 -0
- sdk/models/_registry.py +457 -0
- sdk/models/a2a.py +92 -0
- sdk/models/agents.py +109 -0
- sdk/models/audit.py +28 -0
- sdk/models/auth.py +49 -0
- sdk/models/chunks.py +20 -0
- sdk/models/common.py +14 -0
- sdk/models/console.py +103 -0
- sdk/models/corpora.py +48 -0
- sdk/models/deployments.py +13 -0
- sdk/models/documents.py +126 -0
- sdk/models/embeddings.py +24 -0
- sdk/models/evaluation.py +17 -0
- sdk/models/feedback.py +9 -0
- sdk/models/feeds.py +57 -0
- sdk/models/indexes.py +36 -0
- sdk/models/jobs.py +52 -0
- sdk/models/models.py +26 -0
- sdk/models/onboarding.py +323 -0
- sdk/models/orgs.py +11 -0
- sdk/models/pipelines.py +147 -0
- sdk/models/projects.py +19 -0
- sdk/models/search.py +149 -0
- sdk/models/training.py +57 -0
- sdk/models/usage.py +39 -0
- sdk/namespaces.py +386 -0
- sdk/py.typed +0 -0
- sdk/resources/__init__.py +45 -0
- sdk/resources/_mixin_base.py +40 -0
- sdk/resources/a2a.py +230 -0
- sdk/resources/agents.py +487 -0
- sdk/resources/audit.py +144 -0
- sdk/resources/auth.py +138 -0
- sdk/resources/console.py +411 -0
- sdk/resources/corpora.py +269 -0
- sdk/resources/deployments.py +105 -0
- sdk/resources/documents.py +597 -0
- sdk/resources/feeds.py +246 -0
- sdk/resources/indexes.py +210 -0
- sdk/resources/jobs.py +164 -0
- sdk/resources/metadata.py +53 -0
- sdk/resources/models.py +99 -0
- sdk/resources/onboarding.py +542 -0
- sdk/resources/orgs.py +35 -0
- sdk/resources/pipeline_builder.py +257 -0
- sdk/resources/pipelines.py +520 -0
- sdk/resources/projects.py +87 -0
- sdk/resources/search.py +277 -0
- sdk/resources/training.py +358 -0
- sdk/resources/usage.py +92 -0
- sdk/types/__init__.py +366 -0
- sdk/types/a2a.py +88 -0
- sdk/types/agents.py +133 -0
- sdk/types/audit.py +26 -0
- sdk/types/auth.py +45 -0
- sdk/types/chunks.py +18 -0
- sdk/types/common.py +10 -0
- sdk/types/console.py +99 -0
- sdk/types/corpora.py +42 -0
- sdk/types/deployments.py +11 -0
- sdk/types/documents.py +104 -0
- sdk/types/embeddings.py +22 -0
- sdk/types/evaluation.py +15 -0
- sdk/types/feedback.py +7 -0
- sdk/types/feeds.py +61 -0
- sdk/types/indexes.py +30 -0
- sdk/types/jobs.py +50 -0
- sdk/types/models.py +22 -0
- sdk/types/onboarding.py +395 -0
- sdk/types/orgs.py +9 -0
- sdk/types/pipelines.py +177 -0
- sdk/types/projects.py +14 -0
- sdk/types/search.py +116 -0
- sdk/types/training.py +55 -0
- sdk/types/usage.py +37 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Mapping
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from sdk import AsyncKnowledge2, Knowledge2
|
|
7
|
+
from sdk.integrations._client import (
|
|
8
|
+
merge_return_config,
|
|
9
|
+
resolve_async_client,
|
|
10
|
+
resolve_client,
|
|
11
|
+
resolve_corpus_id,
|
|
12
|
+
)
|
|
13
|
+
from sdk.integrations.llamaindex.filters import llama_filters_to_k2
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from llama_index.core import QueryBundle
|
|
17
|
+
from llama_index.core.retrievers import BaseRetriever
|
|
18
|
+
from llama_index.core.schema import NodeWithScore, TextNode
|
|
19
|
+
from llama_index.core.vector_stores.types import MetadataFilters
|
|
20
|
+
except ImportError as exc: # pragma: no cover - import-time dependency guard
|
|
21
|
+
raise ImportError(
|
|
22
|
+
"LlamaIndex integration requires llama-index-core. Install with `pip install .[llamaindex]`."
|
|
23
|
+
) from exc
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class K2LlamaIndexRetriever(BaseRetriever):
    """LlamaIndex retriever backed by Knowledge2 search.

    Each retrieval issues one K2 ``search`` call against a fixed corpus and maps
    every returned chunk onto a ``NodeWithScore`` wrapping a ``TextNode``.
    The synchronous path (``_retrieve``) uses the client resolved at construction
    time; the asynchronous path (``_aretrieve``) lazily creates an
    ``AsyncKnowledge2`` from that client. Call ``aclose()`` to release the async
    client once it is no longer needed.
    """

    def __init__(
        self,
        *,
        corpus_id: str | None = None,
        client: Knowledge2 | None = None,
        api_key: str | None = None,
        api_host: str | None = None,
        top_k: int = 10,
        filters: MetadataFilters | None = None,
        hybrid: dict[str, Any] | None = None,
        rerank: dict[str, Any] | None = None,
        return_config: dict[str, Any] | None = None,
    ) -> None:
        """Configure the retriever.

        Args:
            corpus_id: Corpus to search; resolved through ``resolve_corpus_id``.
            client: Existing ``Knowledge2`` client; handed to ``resolve_client``
                together with ``api_key`` and ``api_host``.
            api_key: Credential forwarded to ``resolve_client``.
            api_host: Host forwarded to ``resolve_client``.
            top_k: Forwarded as ``top_k`` on every search.
            filters: LlamaIndex filters, converted with ``llama_filters_to_k2``
                on every search.
            hybrid: Forwarded unchanged as the ``hybrid`` search argument.
            rerank: Forwarded unchanged as the ``rerank`` search argument.
            return_config: Base return configuration, combined through
                ``merge_return_config`` with text, scores and provenance
                switched on.
        """
        super().__init__()
        self._client = resolve_client(client=client, api_key=api_key, api_host=api_host)
        self._async_client: AsyncKnowledge2 | None = None
        self._corpus_id = resolve_corpus_id(corpus_id)
        self._top_k = top_k
        self._filters = filters
        self._hybrid = hybrid
        self._rerank = rerank
        self._return_config = return_config

    def _ensure_async_client(self) -> AsyncKnowledge2:
        """Lazily create an AsyncKnowledge2 sharing the sync client's credentials."""
        if self._async_client is None:
            self._async_client = resolve_async_client(sync_client=self._client)
        return self._async_client

    async def aclose(self) -> None:
        """Close the lazily-created async client, releasing connections.

        Does nothing when no async retrieval has happened yet. After closing,
        the next async retrieval creates a fresh client.
        """
        if self._async_client is not None:
            await self._async_client.close()
            self._async_client = None

    @staticmethod
    def _result_to_node_with_score(
        result: Mapping[str, Any], corpus_id: str
    ) -> NodeWithScore | None:
        """Convert a single K2 search result to a LlamaIndex NodeWithScore.

        Args:
            result: One entry of the search response's ``results`` list.
            corpus_id: Corpus identifier recorded in the node metadata.

        Returns:
            The converted node, or ``None`` when the result has no ``chunk_id``.
        """
        chunk_id = result.get("chunk_id")
        if not chunk_id:
            # Without an id there is nothing to key the node on; skip the result
            # before doing any metadata work.
            return None

        custom_meta = result.get("custom_metadata") or {}
        system_meta = result.get("system_metadata") or {}
        if not custom_meta and not system_meta:
            # Fall back to the single legacy ``metadata`` field when neither of
            # the split metadata fields carries anything.
            legacy = result.get("metadata")
            if isinstance(legacy, dict):
                custom_meta = legacy
        if not isinstance(custom_meta, dict):
            custom_meta = {}
        if not isinstance(system_meta, dict):
            system_meta = {}

        node = TextNode(
            id_=chunk_id,
            text=result.get("text") or "",
            metadata={
                # Custom metadata wins over system metadata; the explicit keys
                # below win over both.
                **system_meta,
                **custom_meta,
                "chunk_id": chunk_id,
                "corpus_id": corpus_id,
                "raw_score": result.get("raw_score"),
                "offset_start": result.get("offset_start"),
                "offset_end": result.get("offset_end"),
                "page_start": result.get("page_start"),
                "page_end": result.get("page_end"),
            },
        )
        score = result.get("score")
        if score is None:
            score = result.get("raw_score")
        return NodeWithScore(node=node, score=score)

    @staticmethod
    def _k2_query_text(query_bundle: QueryBundle) -> str:
        """Return the bundle's query string.

        Raises:
            ValueError: If the bundle has no ``query_str`` or it is empty.
        """
        query_text = getattr(query_bundle, "query_str", None)
        if not query_text:
            raise ValueError("K2LlamaIndexRetriever requires a text query")
        return query_text

    def _k2_search_kwargs(self) -> dict[str, Any]:
        """Build the keyword arguments shared by the sync and async search calls."""
        return {
            "top_k": self._top_k,
            "filters": llama_filters_to_k2(self._filters),
            "hybrid": self._hybrid,
            "rerank": self._rerank,
            "return_config": merge_return_config(
                base=self._return_config,
                override=None,
                include_text=True,
                include_scores=True,
                include_provenance=True,
            ),
        }

    def _k2_nodes_from_response(self, response: Mapping[str, Any]) -> list[NodeWithScore]:
        """Convert a search response into nodes, dropping results without a chunk id."""
        # ``or []`` also covers a response carrying ``"results": null``.
        converted = (
            self._result_to_node_with_score(result, self._corpus_id)
            for result in response.get("results") or []
        )
        return [node for node in converted if node is not None]

    def _retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
        """Run a synchronous K2 search for the bundle's query text.

        Raises:
            ValueError: If the bundle carries no text query.
        """
        query_text = self._k2_query_text(query_bundle)
        response = self._client.search(
            self._corpus_id,
            query_text,
            **self._k2_search_kwargs(),
        )
        return self._k2_nodes_from_response(response)

    async def _aretrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
        """Async variant for event-loop-safe LlamaIndex integration.

        Raises:
            ValueError: If the bundle carries no text query.
        """
        query_text = self._k2_query_text(query_bundle)
        async_client = self._ensure_async_client()
        response = await async_client.search(
            self._corpus_id,
            query_text,
            **self._k2_search_kwargs(),
        )
        return self._k2_nodes_from_response(response)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from sdk import Knowledge2
|
|
6
|
+
from sdk.integrations._client import merge_return_config, resolve_client, resolve_corpus_id
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from llama_index.core.tools import FunctionTool
|
|
10
|
+
except ImportError as exc: # pragma: no cover - import-time dependency guard
|
|
11
|
+
raise ImportError(
|
|
12
|
+
"LlamaIndex integration requires llama-index-core. Install with `pip install .[llamaindex]`."
|
|
13
|
+
) from exc
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_k2_llamaindex_tools(
    *,
    corpus_id: str | None = None,
    client: Knowledge2 | None = None,
    api_key: str | None = None,
    api_host: str | None = None,
    default_top_k: int = 10,
    default_hybrid: dict[str, Any] | None = None,
    default_generation: dict[str, Any] | None = None,
) -> list[FunctionTool]:
    """Build the LlamaIndex ``FunctionTool`` set for one K2 corpus.

    The client and corpus id are resolved once, up front, and captured by four
    tool functions: ``k2_search``, ``k2_ingest_text``, ``k2_build_indexes`` and
    ``k2_generate_answer``.

    Args:
        corpus_id: Corpus the tools operate on; resolved via ``resolve_corpus_id``.
        client: Existing client; passed to ``resolve_client`` with ``api_key``
            and ``api_host``.
        api_key: Credential forwarded to ``resolve_client``.
        api_host: Host forwarded to ``resolve_client``.
        default_top_k: Default ``top_k`` of the search and answer tools.
        default_hybrid: ``hybrid`` argument sent with every search/answer call.
        default_generation: ``generation`` argument used by the answer tool when
            the caller supplies none.

    Returns:
        The four tools, in the order listed above.
    """
    k2 = resolve_client(client=client, api_key=api_key, api_host=api_host)
    target_corpus = resolve_corpus_id(corpus_id)

    def k2_search(
        query: str,
        top_k: int = default_top_k,
        filters: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Search a K2 corpus and return scored chunks."""
        payload = k2.search(
            target_corpus,
            query,
            top_k=top_k,
            filters=filters,
            hybrid=default_hybrid,
            return_config=merge_return_config(base=None, override=None),
        )
        return cast("dict[str, Any]", payload)

    def k2_ingest_text(
        raw_text: str,
        source_uri: str | None = None,
        metadata: dict[str, Any] | None = None,
        auto_index: bool = False,
    ) -> dict[str, Any]:
        """Ingest a text document into K2."""
        receipt = k2.upload_document(
            target_corpus,
            raw_text=raw_text,
            source_uri=source_uri,
            metadata=metadata,
            auto_index=auto_index,
        )
        return cast("dict[str, Any]", receipt)

    def k2_build_indexes(
        dense: bool = True,
        sparse: bool = True,
        mode: str = "incremental",
        wait: bool = True,
    ) -> dict[str, Any]:
        """Trigger K2 index build for the current corpus."""
        outcome = k2.build_indexes(
            target_corpus,
            dense=dense,
            sparse=sparse,
            mode=mode,
            wait=wait,
        )
        return cast("dict[str, Any]", outcome)

    def k2_generate_answer(
        query: str,
        top_k: int = default_top_k,
        filters: dict[str, Any] | None = None,
        generation: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Generate a grounded answer using K2 retrieval + server-side LLM generation."""
        # An explicit per-call generation config beats the factory-level default.
        effective_generation = default_generation if generation is None else generation
        answer = k2.search_generate(
            target_corpus,
            query,
            top_k=top_k,
            filters=filters,
            hybrid=default_hybrid,
            generation=effective_generation,
            return_config=merge_return_config(base=None, override=None),
        )
        return cast("dict[str, Any]", answer)

    # Each tool is registered under its function's own name.
    tool_functions = (k2_search, k2_ingest_text, k2_build_indexes, k2_generate_answer)
    return [FunctionTool.from_defaults(fn=fn, name=fn.__name__) for fn in tool_functions]
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, ClassVar, Sequence
|
|
6
|
+
|
|
7
|
+
from pydantic import ConfigDict, Field, PrivateAttr
|
|
8
|
+
|
|
9
|
+
from sdk import Knowledge2
|
|
10
|
+
from sdk.integrations._client import merge_return_config, resolve_client, resolve_corpus_id
|
|
11
|
+
from sdk.integrations.llamaindex.filters import llama_filters_to_k2
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from llama_index.core.schema import BaseNode, MetadataMode, TextNode
|
|
15
|
+
from llama_index.core.vector_stores.types import (
|
|
16
|
+
BasePydanticVectorStore,
|
|
17
|
+
VectorStoreQuery,
|
|
18
|
+
VectorStoreQueryResult,
|
|
19
|
+
)
|
|
20
|
+
except ImportError as exc: # pragma: no cover - import-time dependency guard
|
|
21
|
+
raise ImportError(
|
|
22
|
+
"LlamaIndex integration requires llama-index-core. Install with `pip install .[llamaindex]`."
|
|
23
|
+
) from exc
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _node_text(node: BaseNode) -> str:
    """Return the node's text without metadata, or ``""`` when it has none.

    ``get_content`` is tried first; if that call fails for any reason the plain
    ``text`` attribute is used instead.
    """
    try:
        content = node.get_content(metadata_mode=MetadataMode.NONE)
    except Exception:  # pragma: no cover - defensive for node variants
        content = getattr(node, "text", "")
    if content:
        return content
    # Normalise None and other empty values to an empty string.
    return ""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _resolve_source_uri(
    *,
    node: BaseNode,
    ref_doc_id: str | None,
    source_uri_prefix: str,
) -> str:
    """Pick the source URI under which a node is ingested.

    Preference order: the node's ``source_node`` identifier, then
    ``ref_doc_id``, then the node's own ``node_id``. A source identifier that
    already contains ``://`` is returned unchanged; every other identifier is
    appended to ``source_uri_prefix``.
    """
    origin = getattr(node, "source_node", None)
    origin_id: str | None = None

    if isinstance(origin, str):
        origin_id = origin.strip() or None
    elif origin is not None:
        # Usually a RelatedNodeInfo, occasionally a BaseNode-like object:
        # take ``node_id`` when it is set, otherwise ``id_``.
        raw_id = getattr(origin, "node_id", None) or getattr(origin, "id_", None)
        if raw_id:
            origin_id = str(raw_id).strip() or None

    if origin_id and "://" in origin_id:
        return origin_id

    suffix = origin_id or ref_doc_id or node.node_id
    return f"{source_uri_prefix}{suffix}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _resolve_result_doc_id(*, chunk_id: str, metadata: dict[str, Any]) -> str:
    """Find the document id a search result belongs to.

    Looks at ``document_id`` then ``doc_id``, first inside
    ``metadata["provenance"]`` (when that is a dict) and then in ``metadata``
    itself. The first value that is non-empty after ``str()`` and stripping is
    returned; ``chunk_id`` is the fallback.
    """
    provenance = metadata.get("provenance")
    # Provenance entries take precedence over top-level metadata entries.
    lookups = [provenance, metadata] if isinstance(provenance, dict) else [metadata]
    for source in lookups:
        for key in ("document_id", "doc_id"):
            raw = source.get(key)
            if raw is None:
                continue
            cleaned = str(raw).strip()
            if cleaned:
                return cleaned
    return chunk_id
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class K2LlamaIndexVectorStore(BasePydanticVectorStore):
    """Doc-centric LlamaIndex VectorStore adapter for Knowledge2.

    This adapter maps LlamaIndex vector-store operations onto K2 document/search APIs:
    ``add`` uploads every node as a K2 document, ``delete`` removes a K2 document,
    and ``query`` runs a K2 text search. Embedding lookup (``get``) is not
    supported and ``persist`` is a no-op.
    """

    stores_text: bool = True

    # Connection settings, handed to ``resolve_client`` / ``resolve_corpus_id`` in
    # ``model_post_init``. ``client`` is the constructor alias of ``k2_client``.
    k2_client: Any | None = Field(default=None, alias="client", exclude=True)
    api_key: str | None = Field(default=None, exclude=True, repr=False)
    api_host: str | None = None
    corpus_id: str | None = None

    # Search defaults used by ``query`` when the query itself does not override them.
    top_k: int = 10
    filters: dict[str, Any] | None = None
    hybrid: dict[str, Any] | None = None
    rerank: dict[str, Any] | None = None
    return_config: dict[str, Any] | None = None

    # When set, ``add`` triggers an incremental dense+sparse index build after uploading.
    auto_index_on_add: bool = False
    # LlamaIndex vector-store APIs are typically synchronous. K2 ingestion happens via
    # background jobs, so we optionally wait for ingest completion on add to avoid
    # surprising "no chunks to index" failures when users build indexes immediately.
    wait_for_ingest_on_add: bool = True
    # Seconds between job-status polls, and the overall wait budget (None = no limit).
    ingest_poll_s: int = 2
    ingest_timeout_s: float | None = 300.0
    # Prefix for source URIs that are derived from node identifiers.
    source_uri_prefix: str = "llamaindex://node/"

    _client: Knowledge2 | Any = PrivateAttr()
    _corpus_id: str = PrivateAttr()
    # Maps LlamaIndex node ids and ref_doc_ids to the K2 document id returned on upload.
    _node_to_doc_id: dict[str, str] = PrivateAttr(default_factory=dict)

    model_config: ClassVar[ConfigDict] = ConfigDict(
        arbitrary_types_allowed=True,
        populate_by_name=True,
    )

    def model_post_init(self, __context: Any) -> None:
        """Resolve the K2 client and corpus id once the fields are populated."""
        self._client = resolve_client(
            client=self.k2_client, api_key=self.api_key, api_host=self.api_host
        )
        self._corpus_id = resolve_corpus_id(self.corpus_id)

    @property
    def client(self) -> Any:
        """Expose the underlying Knowledge2 client per BasePydanticVectorStore contract."""
        return self._client

    def get(self, text_id: str) -> list[float]:
        """K2 does not expose direct embedding lookup by id.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("K2 does not support vector lookup by text_id")

    def _wait_for_ingest_job(self, job_id: str) -> None:
        """Block until the ingest job reaches a terminal status.

        The job is polled through ``client.get_job`` every ``ingest_poll_s``
        seconds. When the client has no ``get_job`` attribute the wait is skipped.

        Raises:
            RuntimeError: If the job ends as ``failed`` or ``canceled``; the
                message is the job's ``error_message`` when it has one.
            TimeoutError: If ``ingest_timeout_s`` is set and has elapsed while
                the job is still not terminal.
        """
        if not hasattr(self._client, "get_job"):
            logger.debug(
                "Skipping ingest wait; no public get_job method on client for job=%s",
                job_id,
            )
            return

        start = time.monotonic()
        while True:
            job = self._client.get_job(job_id)
            status = job.get("status")
            if status in {"succeeded", "failed", "canceled"}:
                if status != "succeeded":
                    error_message = job.get("error_message")
                    if not error_message:
                        error_message = f"Job {job_id} ended with status={status}"
                    raise RuntimeError(error_message)
                return

            # The deadline is checked after each poll, so at least one status
            # check always happens even with a tiny timeout.
            if (
                self.ingest_timeout_s is not None
                and (time.monotonic() - start) > self.ingest_timeout_s
            ):
                raise TimeoutError(f"Timed out waiting for ingest job {job_id}")

            time.sleep(self.ingest_poll_s)

    def add(self, nodes: Sequence[BaseNode], **add_kwargs: Any) -> list[str]:
        """Add nodes by ingesting documents into K2.

        Every node is uploaded as its own K2 document, with ``llama_node_id``
        (and ``llama_ref_doc_id`` when available) added to the metadata unless
        already present.

        Args:
            nodes: Nodes to upload.
            **add_kwargs: ``wait`` overrides ``wait_for_ingest_on_add`` for this
                call; ``log_jobs`` logs each ingest job that is waited on.

        Returns:
            The K2 document ids from the upload responses, in upload order.
        """
        added_doc_ids: list[str] = []
        wait_for_ingest = add_kwargs.get("wait")
        if wait_for_ingest is None:
            wait_for_ingest = self.wait_for_ingest_on_add
        wait_for_ingest = bool(wait_for_ingest)
        log_jobs = bool(add_kwargs.get("log_jobs", False))

        for node in nodes:
            node_id = node.node_id
            ref_doc_id = getattr(node, "ref_doc_id", None)
            source_uri = _resolve_source_uri(
                node=node,
                ref_doc_id=ref_doc_id,
                source_uri_prefix=self.source_uri_prefix,
            )

            # Copy so the caller's node metadata is never mutated.
            metadata = dict(getattr(node, "metadata", {}) or {})
            metadata.setdefault("llama_node_id", node_id)
            if ref_doc_id:
                metadata.setdefault("llama_ref_doc_id", ref_doc_id)

            response = self._client.upload_document(
                self._corpus_id,
                raw_text=_node_text(node),
                source_uri=source_uri,
                metadata=metadata,
                auto_index=False,
            )
            doc_id = response["id"]
            added_doc_ids.append(doc_id)
            # Remember both identifiers so ``delete`` accepts either one.
            # NOTE(review): nodes sharing a ref_doc_id overwrite each other's
            # entry here, so only the last upload stays reachable by ref_doc_id.
            self._node_to_doc_id[node_id] = doc_id
            if ref_doc_id:
                self._node_to_doc_id[ref_doc_id] = doc_id

            job_id = response.get("job_id")
            if wait_for_ingest and job_id:
                if log_jobs:
                    # Avoid noisy polling here; the smoke runner already prints job transitions.
                    # This just makes job creation visible in logs when desired.
                    logger.info(
                        "[job] job_id=%s job_type=ingest_document status=created doc_id=%s",
                        job_id,
                        doc_id,
                    )
                self._wait_for_ingest_job(job_id)

        if self.auto_index_on_add and added_doc_ids:
            self._client.build_indexes(
                self._corpus_id, dense=True, sparse=True, mode="incremental", wait=True
            )

        return added_doc_ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete a document from K2 by mapped ref_doc_id or raw doc_id.

        Args:
            ref_doc_id: A node id / ref_doc_id recorded by ``add``, or a K2
                document id (used as-is when no mapping exists).
            **delete_kwargs: ``reindex`` is forwarded to ``delete_document``
                (defaults to ``False``).
        """
        doc_id = self._node_to_doc_id.get(ref_doc_id, ref_doc_id)
        reindex = bool(delete_kwargs.get("reindex", False))
        self._client.delete_document(self._corpus_id, doc_id, confirm=True, reindex=reindex)

        # Forget every local alias that pointed at the deleted document.
        drop_keys = [key for key, value in self._node_to_doc_id.items() if value == doc_id]
        for key in drop_keys:
            self._node_to_doc_id.pop(key, None)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query K2 and return LlamaIndex vector-store query results.

        Args:
            query: Must carry ``query_str`` (or ``query_str`` must be given in
                ``kwargs``); embeddings are ignored.
            **kwargs: Optional ``query_str``, ``similarity_top_k`` and ``filters``
                (already in K2 form) overrides.

        Returns:
            One node per result that has a ``chunk_id``. ``ids`` holds the
            resolved document id of each node, ``similarities`` its score.

        Raises:
            ValueError: If no text query is available.
        """
        query_str = query.query_str or kwargs.get("query_str")
        if not query_str:
            raise ValueError(
                "K2LlamaIndexVectorStore requires text queries; embedding-only VectorStoreQuery is unsupported"
            )

        query_top_k = query.similarity_top_k or kwargs.get("similarity_top_k") or self.top_k

        # Filter precedence: explicit kwarg, then the query's own filters, then
        # the store-level default.
        query_filters = kwargs.get("filters")
        if query_filters is None and query.filters is not None:
            query_filters = llama_filters_to_k2(query.filters)
        if query_filters is None:
            query_filters = self.filters

        response = self._client.search(
            self._corpus_id,
            query_str,
            top_k=int(query_top_k),
            filters=query_filters,
            hybrid=self.hybrid,
            rerank=self.rerank,
            return_config=merge_return_config(
                base=self.return_config,
                override=None,
                include_text=True,
                include_scores=True,
                include_provenance=True,
            ),
        )

        ids: list[str] = []
        nodes: list[BaseNode] = []
        similarities: list[float] = []

        # ``or []`` also covers a response carrying ``"results": null``.
        for result in response.get("results") or []:
            chunk_id = result.get("chunk_id")
            if not chunk_id:
                continue

            custom_meta = result.get("custom_metadata") or {}
            system_meta = result.get("system_metadata") or {}
            if not custom_meta and not system_meta:
                # Fall back to the single legacy ``metadata`` field.
                legacy = result.get("metadata")
                if isinstance(legacy, dict):
                    custom_meta = legacy
            if not isinstance(custom_meta, dict):
                custom_meta = {}
            if not isinstance(system_meta, dict):
                system_meta = {}
            # Custom metadata wins over system metadata on key clashes.
            chunk_metadata = {**system_meta, **custom_meta}
            doc_id = _resolve_result_doc_id(chunk_id=chunk_id, metadata=chunk_metadata)

            score = result.get("score")
            if score is None:
                score = result.get("raw_score")
            if score is None:
                score = 0.0

            node = TextNode(
                id_=chunk_id,
                text=result.get("text") or "",
                metadata={
                    **chunk_metadata,
                    "document_id": doc_id,
                    "chunk_id": chunk_id,
                    "corpus_id": self._corpus_id,
                    "raw_score": result.get("raw_score"),
                },
            )

            # Keep query IDs delete-compatible with doc-centric write/delete semantics.
            ids.append(doc_id)
            nodes.append(node)
            similarities.append(float(score))

        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=similarities)

    def persist(self, persist_path: str, fs: Any = None) -> None:
        """No-op: K2 persists state remotely in the K2 backend."""
        return
|
sdk/models/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Pydantic response models for the Knowledge2 SDK.
|
|
2
|
+
|
|
3
|
+
Requires: pip install knowledge2[pydantic]
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def __getattr__(name: str):
    """Provide a helpful error when importing model classes without pydantic.

    Called for any attribute that is not found on this module.

    Raises:
        ImportError: If ``name`` is a regular (non-dunder) name and pydantic is
            not installed; the message explains how to install the extra.
        AttributeError: In every other case, as module attribute lookup expects.
    """
    # Dunder probes such as ``__all__`` or ``__wrapped__`` come from the import
    # system and introspection tools via hasattr()/getattr(). Those only
    # tolerate AttributeError, so never answer them with ImportError.
    is_dunder = name.startswith("__") and name.endswith("__")
    if not is_dunder:
        try:
            import pydantic  # noqa: F401 - availability probe only
        except ImportError:
            raise ImportError(
                f"Cannot import '{name}' — Pydantic response models require the optional "
                "'pydantic' dependency. Install with: pip install 'knowledge2[pydantic]'"
            ) from None
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
sdk/models/_base.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Base Pydantic model for Knowledge2 API responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class K2BaseModel(BaseModel):
    """Shared parent of every Knowledge2 API response model.

    The ``model_config`` below applies these pydantic settings to all subclasses:

    - ``extra="allow"``: fields the SDK does not declare are kept on the model
      instead of being rejected.
    - ``validate_assignment=True``: values assigned after construction are validated.
    - ``validate_by_alias=True`` and ``populate_by_name=True``: an aliased field
      can be populated through its alias or through its Python field name.
    - ``use_enum_values=True``: enum-typed fields hold the enum's value, not the member.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        validate_by_alias=True,
        use_enum_values=True,
        validate_assignment=True,
        extra="allow",
    )
|