cloud-dog-vdb 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloud_dog_vdb/__init__.py +45 -0
- cloud_dog_vdb/access/__init__.py +15 -0
- cloud_dog_vdb/access/enforcement.py +32 -0
- cloud_dog_vdb/access/policy.py +38 -0
- cloud_dog_vdb/adapters/__init__.py +39 -0
- cloud_dog_vdb/adapters/base.py +94 -0
- cloud_dog_vdb/adapters/chroma.py +329 -0
- cloud_dog_vdb/adapters/factory.py +51 -0
- cloud_dog_vdb/adapters/infinity.py +404 -0
- cloud_dog_vdb/adapters/opensearch.py +281 -0
- cloud_dog_vdb/adapters/pgvector.py +300 -0
- cloud_dog_vdb/adapters/qdrant.py +315 -0
- cloud_dog_vdb/adapters/registry.py +38 -0
- cloud_dog_vdb/adapters/vector_utils.py +35 -0
- cloud_dog_vdb/adapters/weaviate.py +291 -0
- cloud_dog_vdb/capabilities/__init__.py +15 -0
- cloud_dog_vdb/capabilities/models.py +28 -0
- cloud_dog_vdb/capabilities/planner.py +27 -0
- cloud_dog_vdb/collections/__init__.py +15 -0
- cloud_dog_vdb/collections/manager.py +44 -0
- cloud_dog_vdb/collections/specs.py +34 -0
- cloud_dog_vdb/compat/__init__.py +20 -0
- cloud_dog_vdb/compat/response_normaliser.py +194 -0
- cloud_dog_vdb/config/__init__.py +17 -0
- cloud_dog_vdb/config/models.py +38 -0
- cloud_dog_vdb/domain/__init__.py +25 -0
- cloud_dog_vdb/domain/enums.py +35 -0
- cloud_dog_vdb/domain/errors.py +45 -0
- cloud_dog_vdb/domain/models.py +108 -0
- cloud_dog_vdb/embeddings/__init__.py +18 -0
- cloud_dog_vdb/embeddings/base.py +28 -0
- cloud_dog_vdb/embeddings/providers.py +86 -0
- cloud_dog_vdb/factory.py +27 -0
- cloud_dog_vdb/ingestion/__init__.py +29 -0
- cloud_dog_vdb/ingestion/acquire.py +35 -0
- cloud_dog_vdb/ingestion/checkpoints.py +34 -0
- cloud_dog_vdb/ingestion/chunk/__init__.py +15 -0
- cloud_dog_vdb/ingestion/chunk/base.py +33 -0
- cloud_dog_vdb/ingestion/chunk/boundary.py +23 -0
- cloud_dog_vdb/ingestion/chunk/fixed.py +28 -0
- cloud_dog_vdb/ingestion/chunk/recursive.py +33 -0
- cloud_dog_vdb/ingestion/chunk/semantic.py +40 -0
- cloud_dog_vdb/ingestion/convert/__init__.py +15 -0
- cloud_dog_vdb/ingestion/convert/base.py +34 -0
- cloud_dog_vdb/ingestion/convert/deepdoc_conv.py +26 -0
- cloud_dog_vdb/ingestion/convert/mineru_conv.py +25 -0
- cloud_dog_vdb/ingestion/convert/pandas_conv.py +32 -0
- cloud_dog_vdb/ingestion/embed.py +30 -0
- cloud_dog_vdb/ingestion/ocr/__init__.py +26 -0
- cloud_dog_vdb/ingestion/ocr/base.py +52 -0
- cloud_dog_vdb/ingestion/ocr/heuristics.py +31 -0
- cloud_dog_vdb/ingestion/ocr/planner.py +43 -0
- cloud_dog_vdb/ingestion/ocr/providers/__init__.py +23 -0
- cloud_dog_vdb/ingestion/ocr/providers/external_service.py +69 -0
- cloud_dog_vdb/ingestion/ocr/providers/llm.py +94 -0
- cloud_dog_vdb/ingestion/ocr/providers/local.py +78 -0
- cloud_dog_vdb/ingestion/ocr/registry.py +36 -0
- cloud_dog_vdb/ingestion/parse/__init__.py +46 -0
- cloud_dog_vdb/ingestion/parse/async_runner.py +215 -0
- cloud_dog_vdb/ingestion/parse/base.py +52 -0
- cloud_dog_vdb/ingestion/parse/capabilities.py +32 -0
- cloud_dog_vdb/ingestion/parse/ir.py +57 -0
- cloud_dog_vdb/ingestion/parse/planner.py +31 -0
- cloud_dog_vdb/ingestion/parse/providers/__init__.py +29 -0
- cloud_dog_vdb/ingestion/parse/providers/deepdoc.py +101 -0
- cloud_dog_vdb/ingestion/parse/providers/docling.py +101 -0
- cloud_dog_vdb/ingestion/parse/providers/internal.py +83 -0
- cloud_dog_vdb/ingestion/parse/providers/marker_mcp.py +643 -0
- cloud_dog_vdb/ingestion/parse/providers/mineru.py +703 -0
- cloud_dog_vdb/ingestion/parse/providers/transformers.py +176 -0
- cloud_dog_vdb/ingestion/parse/quality.py +21 -0
- cloud_dog_vdb/ingestion/parse/registry.py +36 -0
- cloud_dog_vdb/ingestion/pipeline.py +433 -0
- cloud_dog_vdb/ingestion/table/__init__.py +25 -0
- cloud_dog_vdb/ingestion/table/policy.py +31 -0
- cloud_dog_vdb/ingestion/table/renderers.py +74 -0
- cloud_dog_vdb/ingestion/table/schema.py +40 -0
- cloud_dog_vdb/ingestion/verify.py +30 -0
- cloud_dog_vdb/integrations/__init__.py +15 -0
- cloud_dog_vdb/integrations/langchain.py +32 -0
- cloud_dog_vdb/integrations/llamaindex.py +32 -0
- cloud_dog_vdb/isolation/__init__.py +15 -0
- cloud_dog_vdb/isolation/manager.py +36 -0
- cloud_dog_vdb/jobs/__init__.py +15 -0
- cloud_dog_vdb/jobs/models.py +28 -0
- cloud_dog_vdb/jobs/queue.py +45 -0
- cloud_dog_vdb/jobs/status.py +32 -0
- cloud_dog_vdb/jobs/worker.py +28 -0
- cloud_dog_vdb/lifecycle/__init__.py +25 -0
- cloud_dog_vdb/lifecycle/manager.py +53 -0
- cloud_dog_vdb/lifecycle/retention.py +83 -0
- cloud_dog_vdb/metadata/__init__.py +46 -0
- cloud_dog_vdb/metadata/filters.py +130 -0
- cloud_dog_vdb/metadata/identity.py +72 -0
- cloud_dog_vdb/metadata/normalise.py +35 -0
- cloud_dog_vdb/metadata/provenance.py +102 -0
- cloud_dog_vdb/metadata/schema.py +166 -0
- cloud_dog_vdb/observability/__init__.py +15 -0
- cloud_dog_vdb/observability/audit.py +32 -0
- cloud_dog_vdb/observability/metrics.py +37 -0
- cloud_dog_vdb/observability/otel.py +32 -0
- cloud_dog_vdb/options/__init__.py +15 -0
- cloud_dog_vdb/options/chroma.py +28 -0
- cloud_dog_vdb/options/common.py +35 -0
- cloud_dog_vdb/options/manager.py +34 -0
- cloud_dog_vdb/options/opensearch.py +28 -0
- cloud_dog_vdb/options/pgvector.py +28 -0
- cloud_dog_vdb/options/qdrant.py +28 -0
- cloud_dog_vdb/options/weaviate.py +28 -0
- cloud_dog_vdb/remote/__init__.py +20 -0
- cloud_dog_vdb/remote/client.py +105 -0
- cloud_dog_vdb/runtime/__init__.py +18 -0
- cloud_dog_vdb/runtime/client.py +362 -0
- cloud_dog_vdb/runtime/factory.py +113 -0
- cloud_dog_vdb/search/__init__.py +15 -0
- cloud_dog_vdb/search/engine.py +44 -0
- cloud_dog_vdb/search/rerank.py +29 -0
- cloud_dog_vdb/testing/__init__.py +22 -0
- cloud_dog_vdb/testing/comparison.py +424 -0
- cloud_dog_vdb/testing/comparison_report.py +89 -0
- cloud_dog_vdb/testing/conformance.py +32 -0
- cloud_dog_vdb/testing/fixtures.py +30 -0
- cloud_dog_vdb/testing/mock_adapters.py +32 -0
- cloud_dog_vdb/versioning/__init__.py +24 -0
- cloud_dog_vdb/versioning/schema_version.py +151 -0
- cloud_dog_vdb-0.5.4.dist-info/METADATA +43 -0
- cloud_dog_vdb-0.5.4.dist-info/RECORD +131 -0
- cloud_dog_vdb-0.5.4.dist-info/WHEEL +4 -0
- cloud_dog_vdb-0.5.4.dist-info/licenses/LICENCE +190 -0
- cloud_dog_vdb-0.5.4.dist-info/licenses/LICENSE +176 -0
- cloud_dog_vdb-0.5.4.dist-info/licenses/NOTICE +7 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from cloud_dog_vdb.domain.models import (
|
|
18
|
+
CapabilityDescriptor,
|
|
19
|
+
CollectionSpec,
|
|
20
|
+
Job,
|
|
21
|
+
Record,
|
|
22
|
+
SearchRequest,
|
|
23
|
+
SearchResponse,
|
|
24
|
+
SearchResult,
|
|
25
|
+
)
|
|
26
|
+
from cloud_dog_vdb.factory import get_vdb_client
|
|
27
|
+
from cloud_dog_vdb.ingestion.pipeline import IngestionPipeline, ParserIngestionOptions, ingest_document
|
|
28
|
+
from cloud_dog_vdb.runtime.client import VDBClient
|
|
29
|
+
|
|
30
|
+
__version__ = "0.5.4"
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"VDBClient",
|
|
34
|
+
"CapabilityDescriptor",
|
|
35
|
+
"CollectionSpec",
|
|
36
|
+
"Record",
|
|
37
|
+
"SearchRequest",
|
|
38
|
+
"SearchResult",
|
|
39
|
+
"SearchResponse",
|
|
40
|
+
"Job",
|
|
41
|
+
"get_vdb_client",
|
|
42
|
+
"ingest_document",
|
|
43
|
+
"ParserIngestionOptions",
|
|
44
|
+
"IngestionPipeline",
|
|
45
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# cloud_dog_vdb access
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from cloud_dog_vdb.access.policy import AccessPolicy
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def can_read(role: str, policy: AccessPolicy) -> bool:
    """Return whether ``role`` may read under ``policy``.

    A role may read when it is listed among the policy's readers, writers,
    or admins. The three groups are checked in that order.
    """
    if role in policy.readers:
        return True
    if role in policy.writers:
        return True
    return role in policy.admins
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def can_write(role: str, policy: AccessPolicy) -> bool:
    """Return whether ``role`` may write under ``policy``.

    A role may write when it is listed among the policy's writers or admins.
    Membership of the readers group alone does not grant write access.
    """
    if role in policy.writers:
        return True
    return role in policy.admins
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def can_admin(role: str, policy: AccessPolicy) -> bool:
    """Return whether ``role`` is listed among the admins of ``policy``."""
    admin_roles = policy.admins
    return role in admin_roles
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True, slots=True)
|
|
21
|
+
class AccessPolicy:
|
|
22
|
+
"""Represent access policy."""
|
|
23
|
+
|
|
24
|
+
readers: set[str] = field(default_factory=set)
|
|
25
|
+
writers: set[str] = field(default_factory=set)
|
|
26
|
+
admins: set[str] = field(default_factory=set)
|
|
27
|
+
|
|
28
|
+
def can_read(self, role: str) -> bool:
|
|
29
|
+
"""Handle can read."""
|
|
30
|
+
return role in self.readers or self.can_write(role)
|
|
31
|
+
|
|
32
|
+
def can_write(self, role: str) -> bool:
|
|
33
|
+
"""Handle can write."""
|
|
34
|
+
return role in self.writers or self.can_admin(role)
|
|
35
|
+
|
|
36
|
+
def can_admin(self, role: str) -> bool:
|
|
37
|
+
"""Handle can admin."""
|
|
38
|
+
return role in self.admins
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from cloud_dog_vdb.adapters.base import VDBAdapter
|
|
16
|
+
from cloud_dog_vdb.adapters.chroma import ChromaAdapter
|
|
17
|
+
from cloud_dog_vdb.adapters.factory import build_adapter
|
|
18
|
+
from cloud_dog_vdb.adapters.infinity import InfinityAdapter
|
|
19
|
+
from cloud_dog_vdb.adapters.opensearch import OpenSearchAdapter
|
|
20
|
+
from cloud_dog_vdb.adapters.qdrant import QdrantAdapter
|
|
21
|
+
from cloud_dog_vdb.adapters.registry import AdapterRegistry
|
|
22
|
+
from cloud_dog_vdb.adapters.weaviate import WeaviateAdapter
|
|
23
|
+
|
|
24
|
+
try:  # optional dependency (asyncpg)
    from cloud_dog_vdb.adapters.pgvector import PGVectorAdapter
except ImportError:  # pragma: no cover
    # Only a missing optional dependency should disable the adapter. Any other
    # error raised while importing the module is a genuine defect and is
    # allowed to propagate instead of silently turning the adapter into None.
    PGVectorAdapter = None  # type: ignore[assignment]
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"VDBAdapter",
|
|
31
|
+
"AdapterRegistry",
|
|
32
|
+
"build_adapter",
|
|
33
|
+
"ChromaAdapter",
|
|
34
|
+
"QdrantAdapter",
|
|
35
|
+
"WeaviateAdapter",
|
|
36
|
+
"OpenSearchAdapter",
|
|
37
|
+
"InfinityAdapter",
|
|
38
|
+
"PGVectorAdapter",
|
|
39
|
+
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from cloud_dog_vdb.domain.models import CapabilityDescriptor, CollectionSpec
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class VDBAdapter(ABC):
    """Abstract contract that every vector-database adapter implements.

    All operations except ``capabilities`` are coroutines. Each method here
    raises ``NotImplementedError``; concrete adapters override every one of
    them.
    """

    @abstractmethod
    async def initialize(self, config: dict[str, Any] | None = None) -> bool:
        """Prepare the adapter for use, optionally with extra configuration.

        Returns a boolean result.
        """
        raise NotImplementedError

    @abstractmethod
    async def health_check(self) -> bool:
        """Report whether the backing store is currently healthy."""
        raise NotImplementedError

    @abstractmethod
    async def create_collection(self, spec: CollectionSpec) -> dict:
        """Create the collection described by ``spec`` and return a dict describing it."""
        raise NotImplementedError

    @abstractmethod
    async def get_collection(self, name: str) -> dict | None:
        """Look up the collection called ``name``; the result may be ``None``."""
        raise NotImplementedError

    @abstractmethod
    async def delete_collection(self, name: str) -> bool:
        """Delete the collection called ``name`` and return a boolean result."""
        raise NotImplementedError

    @abstractmethod
    async def add_documents(
        self,
        collection: str,
        documents: list[str],
        metadatas: list[dict[str, Any]] | None = None,
        ids: list[str] | None = None,
    ) -> list[str]:
        """Store ``documents`` in ``collection`` and return their identifiers.

        ``metadatas`` and ``ids`` are optional lists supplied alongside
        ``documents``.
        """
        raise NotImplementedError

    @abstractmethod
    async def search(
        self,
        collection: str,
        query: str,
        n_results: int,
        filter: dict[str, Any] | None = None,
        search_options: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Query ``collection`` with ``query`` and return the matching records.

        ``n_results`` is the requested number of results, ``filter`` is an
        optional metadata filter, and ``search_options`` carries optional
        adapter-specific settings.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete_document(self, collection: str, doc_id: str) -> bool:
        """Delete the document ``doc_id`` from ``collection`` and return a boolean result."""
        raise NotImplementedError

    @abstractmethod
    async def update_document(
        self, collection: str, doc_id: str, content: str, metadata: dict[str, Any] | None = None
    ) -> bool:
        """Update document ``doc_id`` in ``collection`` with ``content`` and optional ``metadata``."""
        raise NotImplementedError

    @abstractmethod
    async def count_documents(self, collection: str, filter: dict[str, Any] | None = None) -> int:
        """Return the number of documents in ``collection``, with an optional ``filter``."""
        raise NotImplementedError

    @abstractmethod
    def capabilities(self) -> CapabilityDescriptor:
        """Describe what this adapter supports (synchronous, unlike the other methods)."""
        raise NotImplementedError
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import uuid
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
|
|
23
|
+
from cloud_dog_vdb.adapters.base import VDBAdapter
|
|
24
|
+
from cloud_dog_vdb.adapters.vector_utils import deterministic_vector
|
|
25
|
+
from cloud_dog_vdb.config.models import ProviderConfig
|
|
26
|
+
from cloud_dog_vdb.domain.models import CapabilityDescriptor, CollectionSpec
|
|
27
|
+
from cloud_dog_vdb.embeddings.base import EmbeddingProvider
|
|
28
|
+
from cloud_dog_vdb.embeddings.providers import build_embedding_provider
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ChromaAdapter(VDBAdapter):
    """Adapter that talks to a Chroma server over HTTP, or to an in-memory store.

    When ``local_mode`` is true no HTTP requests are made: collections and
    documents live in ``self._local`` and embeddings come from
    ``deterministic_vector``. Otherwise requests go to the ``/api/v2``
    endpoints under ``config.base_url`` and embeddings come from the provider
    returned by ``build_embedding_provider`` (falling back to
    ``deterministic_vector`` when there is no provider).

    The adapter owns an ``httpx.AsyncClient``; call ``aclose()`` or use the
    adapter as an async context manager to release it.
    """

    # Embedding dimension assumed for collections this instance has not recorded.
    _DEFAULT_DIM = 1024
    # Maximum number of hits requested when counting with a filter on a server.
    _FILTERED_COUNT_LIMIT = 10000

    def __init__(self, config: ProviderConfig, *, local_mode: bool = False) -> None:
        """Store the configuration and create the HTTP client and local state."""
        self.config = config
        self.local_mode = local_mode
        self._client = httpx.AsyncClient(timeout=config.timeout_seconds)
        self._local: dict[str, dict] = {}
        self._dims: dict[str, int] = {}
        self._embedding_provider: EmbeddingProvider | None = None if local_mode else build_embedding_provider(config)

    async def aclose(self) -> None:
        """Close the underlying HTTP client and release its connections."""
        await self._client.aclose()

    async def __aenter__(self) -> ChromaAdapter:
        """Return the adapter itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        """Close the HTTP client when leaving an ``async with`` block."""
        await self.aclose()

    async def initialize(self, config: dict[str, Any] | None = None) -> bool:
        """Initialise the adapter; ``config`` is ignored and the health check result is returned."""
        _ = config
        return await self.health_check()

    def _headers(self) -> dict[str, str]:
        """Build request headers, adding a bearer token when an API key is configured."""
        headers = {"Content-Type": "application/json"}
        raw_key = self.config.api_key
        # str(None) is the truthy string "None"; without this guard a missing
        # key would be sent as "Authorization: Bearer None".
        api_key = "" if raw_key is None else str(raw_key)
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        return headers

    def _root(self) -> str:
        """Return the collections endpoint for the default tenant and database."""
        base_url = str(self.config.base_url)
        return f"{base_url.rstrip('/')}/api/v2/tenants/default_tenant/databases/default_database/collections"

    @staticmethod
    def _json_fallback(value: Any) -> Any:
        """Convert a value ``json.dumps`` cannot encode into one it can.

        Sets become lists (sorted when their elements are orderable, otherwise
        ordered by ``repr`` so the output stays deterministic); anything else
        becomes its ``str()`` form.
        """
        if isinstance(value, (set, frozenset)):
            try:
                return sorted(value)
            except TypeError:
                return sorted(value, key=repr)
        return str(value)

    @staticmethod
    def _serialise_metadata_value(value: Any) -> str | int | float | bool:
        """Flatten one metadata value to a ``str``, ``int``, ``float`` or ``bool``.

        Scalars pass through unchanged, ``None`` becomes ``""``, containers are
        encoded as a JSON string, and any other object becomes ``str(value)``.
        """
        if isinstance(value, bool):
            return value
        if isinstance(value, (str, int, float)):
            return value
        if value is None:
            return ""
        if isinstance(value, (list, tuple, set, frozenset, dict)):
            # json cannot encode sets natively (json.dumps({1}) raises
            # TypeError), so un-encodable values go through _json_fallback.
            return json.dumps(
                value,
                ensure_ascii=True,
                sort_keys=True,
                default=ChromaAdapter._json_fallback,
            )
        return str(value)

    @classmethod
    def _serialise_metadata(cls, metadata: dict[str, Any] | None) -> dict[str, str | int | float | bool]:
        """Return a copy of ``metadata`` with string keys and flattened values."""
        source = metadata or {}
        return {str(key): cls._serialise_metadata_value(value) for key, value in source.items()}

    async def _embed_text(self, text: str, dim: int) -> list[float]:
        """Embed ``text`` with the provider, or deterministically when there is none.

        ``dim`` is only used by the deterministic fallback.

        Raises:
            ValueError: If the embedding provider returns an empty vector.
        """
        if self._embedding_provider is None:
            return deterministic_vector(text, dim)
        vector = await self._embedding_provider.embed(text)
        if not vector:
            raise ValueError("Embedding provider returned an empty vector")
        return vector

    async def _embed_many(self, texts: list[str], dim: int) -> list[list[float]]:
        """Embed each text in order, one request at a time."""
        return [await self._embed_text(text, dim) for text in texts]

    async def health_check(self) -> bool:
        """Return ``True`` in local mode, else whether the heartbeat endpoint answers 200."""
        if self.local_mode:
            return True
        try:
            resp = await self._client.get(
                f"{str(self.config.base_url).rstrip('/')}/api/v2/heartbeat",
                headers=self._headers(),
            )
            return resp.status_code == 200
        except Exception:
            # Deliberately best-effort: any failure to reach the server is
            # reported as "unhealthy" rather than raised.
            return False

    async def create_collection(self, spec: CollectionSpec) -> dict:
        """Create the collection described by ``spec`` and remember its embedding dimension.

        Returns the stored dict in local mode, ``{"name": ..., "status": "exists"}``
        when the server answers 409, and the server's JSON response otherwise.
        """
        if self.local_mode:
            data = {"name": spec.name, "id": spec.name, "local": True, "metadata": dict(spec.metadata)}
            self._local[spec.name] = data
            self._dims[spec.name] = spec.embedding_dim
            return data
        payload = {"name": spec.name, "configuration": {"hnsw": {"space": spec.distance_metric.value}}}
        if spec.metadata:
            payload["metadata"] = spec.metadata
        resp = await self._client.post(self._root(), headers=self._headers(), json=payload)
        if resp.status_code == 409:
            # Keep a dimension for the existing collection so the deterministic
            # fallback does not silently use the default; a dimension recorded
            # earlier by this instance is left untouched.
            self._dims.setdefault(spec.name, spec.embedding_dim)
            return {"name": spec.name, "status": "exists"}
        resp.raise_for_status()
        self._dims[spec.name] = spec.embedding_dim
        return resp.json()

    async def get_collection(self, name: str) -> dict | None:
        """Return the collection called ``name``, or ``None`` when it is not found."""
        if self.local_mode:
            return self._local.get(name)
        return await self._find_collection(name)

    async def delete_collection(self, name: str) -> bool:
        """Delete the collection called ``name``.

        On a server, a 404 or a collection missing from the listing counts as
        success.
        """
        self._dims.pop(name, None)
        if self.local_mode:
            return self._local.pop(name, None) is not None
        # Newer Chroma deployments accept collection name in delete path,
        # while some older deployments may still rely on collection ID.
        resp = await self._client.delete(f"{self._root()}/{name}", headers=self._headers())
        if resp.status_code < 300 or resp.status_code == 404:
            return True

        cid = await self._collection_id(name)
        if not cid:
            return True
        resp = await self._client.delete(f"{self._root()}/{cid}", headers=self._headers())
        return resp.status_code < 300 or resp.status_code == 404

    async def add_documents(
        self,
        collection: str,
        documents: list[str],
        metadatas: list[dict[str, Any]] | None = None,
        ids: list[str] | None = None,
    ) -> list[str]:
        """Embed and upsert ``documents`` into ``collection``; return their ids.

        A document without a matching entry in ``ids`` gets a random hex UUID,
        and one without a matching entry in ``metadatas`` gets empty metadata.

        Raises:
            ValueError: If the collection is not found on the server.
        """
        dim = self._dims.get(collection, self._DEFAULT_DIM)
        embeddings = await self._embed_many(documents, dim)
        doc_ids: list[str] = []
        metas: list[dict[str, Any]] = []
        for i in range(len(documents)):
            doc_ids.append(ids[i] if ids and i < len(ids) else uuid.uuid4().hex)
            metas.append(self._serialise_metadata(metadatas[i] if metadatas and i < len(metadatas) else {}))
        if self.local_mode:
            local = self._local.setdefault(collection, {"docs": {}})
            docs_state = local.setdefault("docs", {})
            for doc_id, content, meta, embedding in zip(doc_ids, documents, metas, embeddings):
                docs_state[doc_id] = {"content": content, "metadata": meta, "embedding": embedding}
            return doc_ids
        cid = await self._collection_id(collection)
        if not cid:
            raise ValueError(f"Collection not found: {collection}")
        payload = {"ids": doc_ids, "documents": list(documents), "metadatas": metas, "embeddings": embeddings}
        resp = await self._client.post(f"{self._root()}/{cid}/upsert", headers=self._headers(), json=payload)
        if resp.status_code == 404:
            # Fall back to the add endpoint when upsert is not available.
            resp = await self._client.post(f"{self._root()}/{cid}/add", headers=self._headers(), json=payload)
        resp.raise_for_status()
        return doc_ids

    async def search(
        self,
        collection: str,
        query: str,
        n_results: int,
        filter: dict[str, Any] | None = None,
        search_options: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Return up to ``n_results`` records as ``id``/``score``/``content``/``metadata`` dicts.

        An empty ``query`` is embedded as ``"*"``. In local mode the score is
        the dot product of the stored and query embeddings and ``filter`` is
        an exact-match test on metadata. On a server the score is
        ``1.0 - distance`` and an unknown collection yields an empty list.
        ``search_options`` is accepted but not used.
        """
        _ = search_options
        dim = self._dims.get(collection, self._DEFAULT_DIM)
        if self.local_mode:
            docs_state = (self._local.get(collection) or {}).get("docs", {})
            target = deterministic_vector(query or "*", dim)
            hits: list[dict[str, Any]] = []
            for doc_id, value in docs_state.items():
                metadata = value.get("metadata") or {}
                if filter and any(metadata.get(k) != v for k, v in filter.items()):
                    continue
                emb = value.get("embedding") or target
                score = sum(a * b for a, b in zip(emb, target))
                hits.append(
                    {
                        "id": doc_id,
                        "score": float(score),
                        "content": value.get("content", ""),
                        "metadata": metadata,
                    }
                )
            hits.sort(key=lambda hit: hit["score"], reverse=True)
            # Clamp so a negative n_results yields no hits instead of
            # "everything except the last |n| entries".
            return hits[: max(n_results, 0)]
        cid = await self._collection_id(collection)
        if not cid:
            return []
        payload: dict[str, Any] = {
            "query_embeddings": [await self._embed_text(query or "*", dim)],
            "n_results": n_results,
            "include": ["distances", "metadatas", "documents"],
        }
        if filter:
            payload["where"] = self._build_where(filter)
        resp = await self._client.post(f"{self._root()}/{cid}/query", headers=self._headers(), json=payload)
        resp.raise_for_status()
        body = resp.json()
        ids = (body.get("ids") or [[]])[0]
        documents = (body.get("documents") or [[]])[0]
        metadatas = (body.get("metadatas") or [[]])[0]
        distances = (body.get("distances") or [[]])[0]
        results: list[dict[str, Any]] = []
        for i, doc_id in enumerate(ids):
            dist = float(distances[i]) if i < len(distances) else 1.0
            # Null entries are normalised so callers always receive a string
            # and a dict, matching what local mode returns.
            content = (documents[i] if i < len(documents) else "") or ""
            metadata = (metadatas[i] if i < len(metadatas) else {}) or {}
            results.append(
                {
                    "id": str(doc_id),
                    "score": 1.0 - dist,
                    "content": content,
                    "metadata": metadata,
                }
            )
        return results

    async def delete_document(self, collection: str, doc_id: str) -> bool:
        """Delete ``doc_id`` from ``collection``.

        Local mode returns whether the document existed. On a server the
        result is ``False`` for an unknown collection, else whether the delete
        request returned a status below 300.
        """
        if self.local_mode:
            docs_state = (self._local.get(collection) or {}).get("docs", {})
            return docs_state.pop(doc_id, None) is not None
        cid = await self._collection_id(collection)
        if not cid:
            return False
        resp = await self._client.post(
            f"{self._root()}/{cid}/delete",
            headers=self._headers(),
            json={"ids": [doc_id]},
        )
        return resp.status_code < 300

    async def update_document(
        self, collection: str, doc_id: str, content: str, metadata: dict[str, Any] | None = None
    ) -> bool:
        """Replace the content, metadata and embedding of ``doc_id``.

        Local mode returns ``False`` when the document does not exist. On a
        server the result is ``False`` for an unknown collection, else whether
        the request returned a status below 300.
        """
        dim = self._dims.get(collection, self._DEFAULT_DIM)
        # Metadata is flattened exactly as in add_documents, so updated
        # documents are stored and filtered the same way as added ones.
        meta = self._serialise_metadata(metadata)
        if self.local_mode:
            docs_state = (self._local.get(collection) or {}).setdefault("docs", {})
            if doc_id not in docs_state:
                return False
            docs_state[doc_id] = {
                "content": content,
                "metadata": meta,
                "embedding": deterministic_vector(content, dim),
            }
            return True
        cid = await self._collection_id(collection)
        if not cid:
            return False
        payload = {
            "ids": [doc_id],
            "documents": [content],
            "metadatas": [meta],
            "embeddings": [await self._embed_text(content, dim)],
        }
        resp = await self._client.post(f"{self._root()}/{cid}/upsert", headers=self._headers(), json=payload)
        if resp.status_code == 404:
            # Fall back to the update endpoint when upsert is not available.
            resp = await self._client.post(f"{self._root()}/{cid}/update", headers=self._headers(), json=payload)
        return resp.status_code < 300

    async def count_documents(self, collection: str, filter: dict[str, Any] | None = None) -> int:
        """Count the documents in ``collection``, optionally only those matching ``filter``.

        On a server, an unknown collection or a 404 from the count endpoint
        yields 0. A filtered count is the number of hits from one search
        limited to ``_FILTERED_COUNT_LIMIT`` results, so it cannot exceed
        that limit.
        """
        if self.local_mode:
            docs_state = (self._local.get(collection) or {}).get("docs", {})
            if not filter:
                return len(docs_state)
            return sum(
                1
                for value in docs_state.values()
                if all((value.get("metadata") or {}).get(k) == v for k, v in filter.items())
            )
        cid = await self._collection_id(collection)
        if not cid:
            return 0
        if filter:
            docs = await self.search(collection, "*", self._FILTERED_COUNT_LIMIT, filter, {})
            return len(docs)
        resp = await self._client.get(f"{self._root()}/{cid}/count", headers=self._headers())
        if resp.status_code == 404:
            return 0
        resp.raise_for_status()
        body = resp.json()
        # The response body is either a bare integer or an object with "count".
        return int(body if isinstance(body, int) else body.get("count", 0))

    def capabilities(self) -> CapabilityDescriptor:
        """Return the descriptor for this adapter: filtering only, no hybrid search or delete-by-filter."""
        return CapabilityDescriptor(provider_id="chroma", filtering=True, hybrid_search=False, delete_by_filter=False)

    async def _find_collection(self, name: str) -> dict | None:
        """Fetch the collection listing and return the entry named ``name``, if any."""
        resp = await self._client.get(self._root(), headers=self._headers())
        resp.raise_for_status()
        for col in resp.json() or []:
            if col.get("name") == name:
                return col
        return None

    async def _collection_id(self, name: str) -> str | None:
        """Return the listed ``id`` of collection ``name`` (or its name when no id is given)."""
        col = await self._find_collection(name)
        if col is None:
            return None
        return str(col.get("id") or col.get("name"))

    @staticmethod
    def _build_where(filter: dict[str, Any]) -> dict[str, Any]:
        """Turn a flat equality filter into a ``where`` clause.

        A filter with at most one key is passed through as a copy; several
        keys are combined under ``$and``.
        """
        if len(filter) <= 1:
            return dict(filter)
        return {"$and": [{k: v} for k, v in filter.items()]}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Copyright 2026 Cloud-Dog, Viewdeck Engineering Limited
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# cloud_dog_vdb — Adapter factory
|
|
16
|
+
"""Factory helpers for constructing VDB adapters from provider config."""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from cloud_dog_vdb.adapters.base import VDBAdapter
|
|
21
|
+
from cloud_dog_vdb.config.models import ProviderConfig
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_adapter(config: ProviderConfig, *, local_mode: bool = False) -> VDBAdapter:
    """Construct the adapter selected by ``config.provider_id``.

    The provider id is matched case-insensitively against ``chroma``,
    ``qdrant``, ``weaviate``, ``opensearch``, ``pgvector`` and ``infinity``.
    Only the matching adapter module is imported, and that import happens
    inside this function.

    Raises:
        ValueError: If the provider id is not one of the supported values.
    """
    provider = config.provider_id.lower()
    if provider == "chroma":
        from cloud_dog_vdb.adapters.chroma import ChromaAdapter as adapter_cls
    elif provider == "qdrant":
        from cloud_dog_vdb.adapters.qdrant import QdrantAdapter as adapter_cls
    elif provider == "weaviate":
        from cloud_dog_vdb.adapters.weaviate import WeaviateAdapter as adapter_cls
    elif provider == "opensearch":
        from cloud_dog_vdb.adapters.opensearch import OpenSearchAdapter as adapter_cls
    elif provider == "pgvector":
        from cloud_dog_vdb.adapters.pgvector import PGVectorAdapter as adapter_cls
    elif provider == "infinity":
        from cloud_dog_vdb.adapters.infinity import InfinityAdapter as adapter_cls
    else:
        raise ValueError(f"Unsupported provider: {config.provider_id}")
    return adapter_cls(config, local_mode=local_mode)
|