linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from collections import namedtuple
|
|
2
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
# Inclusive value range used in Query where-clauses to express
# range constraints; ``min`` and ``max`` are the range bounds.
Between = namedtuple("Between", ["min", "max"])

# A single facet value: either a scalar or a Between range.
FACET_GROUP_ATOM = Union[str, int, float, Between]

# A facet grouping: one atom, or a tuple of atoms for compound facets.
FACET_GROUP = Union[FACET_GROUP_ATOM, Tuple[FACET_GROUP_ATOM, ...]]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Query(BaseModel):
    """
    A query object.

    - In SQL this would be a SQL query string
    """

    from_table: Optional[str] = None  # source table/collection name (SQL FROM)
    select_cols: Optional[List[str]] = None  # columns to project; None means all columns
    where_clause: Optional[Union[str, List[str], Dict[str, Any]]] = None  # raw SQL string, list of conjuncts, or field->value map
    sort_by: Optional[List[str]] = None  # sort keys, in priority order
    limit: Optional[int] = None  # maximum number of rows to return
    offset: Optional[int] = None  # number of rows to skip before returning results
    include_facet_counts: bool = False  # if True, compute facet counts alongside rows
    facet_slots: Optional[List[str]] = None  # slots to facet on when include_facet_counts is set
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FacetCountResult(BaseModel):
    """
    A facet count result.

    Maps each facet group to its ranked list of (value, count) pairs.
    """

    as_dict: Dict[FACET_GROUP, List[Tuple[FACET_GROUP, int]]]  # facet group -> (value, count) pairs
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class QueryResult(BaseModel):
    """
    A query result.

    Holds the matching rows (or score-ranked rows), the query that produced
    them, and optional facet counts.

    TODO: make this a subclass of Collection
    """

    query: Optional[Query] = None  # the query that produced this result, if any
    search_term: Optional[str] = None  # free-text term, for search-style queries
    num_rows: int  # total number of matching rows (may exceed len(rows) when paging)
    offset: Optional[int] = 0  # offset into the full result set
    rows: Optional[List[Dict[str, Any]]] = None  # materialized result rows
    ranked_rows: Optional[List[Tuple[float, Dict[str, Any]]]] = None  # (score, row) pairs for ranked results
    _rows_dataframe: Optional[pd.DataFrame] = None  # lazily built cache for rows_dataframe
    facet_counts: Optional[Dict[str, List[Tuple[FACET_GROUP, int]]]] = None  # facet column -> (value, count) pairs

    @property
    def rows_dataframe(self) -> pd.DataFrame:
        """
        Rows as a pandas DataFrame, built lazily and cached.

        Ranked rows take precedence over plain rows; each (score, row) pair is
        flattened into a row with a leading "score" column. Returns None when
        there is nothing to materialize.
        """
        # Previously the ranked_rows branch rebuilt the DataFrame on EVERY
        # access, bypassing the cache (and clobbering a frame installed via
        # set_rows); now the cache is consulted first.
        if self._rows_dataframe is None:
            if self.ranked_rows is not None:
                self._rows_dataframe = pd.DataFrame([{"score": score, **row} for score, row in self.ranked_rows])
            elif self.rows:
                self._rows_dataframe = pd.DataFrame(self.rows)
        return self._rows_dataframe

    def set_rows(self, rows: pd.DataFrame):
        """Explicitly set the DataFrame returned by :attr:`rows_dataframe`."""
        self._rows_dataframe = rows

    class Config:
        # allow pd.DataFrame as a (private) field type
        arbitrary_types_allowed = True
|
|
File without changes
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChromaDB Collection
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
|
|
8
|
+
from chromadb.api.models.Collection import Collection as ChromaCollection
|
|
9
|
+
from linkml_runtime.linkml_model import SlotDefinition
|
|
10
|
+
|
|
11
|
+
from linkml_store.api import Collection
|
|
12
|
+
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
|
|
13
|
+
from linkml_store.api.queries import Query, QueryResult
|
|
14
|
+
from linkml_store.index import Indexer
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChromaDBCollection(Collection):
    """
    A wrapper for ChromaDB collections.

    Objects are stored as (document text, metadata, id) triples in the
    underlying chromadb collection; the document text is produced by an
    Indexer so objects are text-searchable.
    """

    @property
    def native_collection(self) -> ChromaCollection:
        """The underlying chromadb collection, looked up by name on each access."""
        return self.parent.client.get_collection(self.name)

    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Insert one or more objects.

        Each object becomes a text document (via a default Indexer), a
        metadata dict, and an identifier derived from the object.
        """
        if not isinstance(objs, list):
            objs = [objs]

        documents = []
        metadatas = []
        ids = []
        indexer = Indexer()

        for obj in objs:
            obj_id = self.object_identifier(obj)
            ids.append(obj_id)
            doc_text = indexer.object_to_text(obj)
            documents.append(doc_text)
            # TODO: handle nesting
            # NOTE(review): chromadb metadata values must be scalars; nested
            # dicts/lists are passed through here unchanged — confirm.
            metadata = {k: v for k, v in obj.items()}
            metadatas.append(metadata)

        self.native_collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        """
        Delete the given objects by their "id" field.

        :return: the number of ids submitted for deletion
        """
        if not isinstance(objs, list):
            objs = [objs]
        # assumes each object carries its identifier under the "id" key —
        # TODO confirm this matches object_identifier() used at insert time
        ids = [obj["id"] for obj in objs]
        self.native_collection.delete(ids=ids)
        return len(ids)

    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
        """
        Delete all objects matching ``where`` (all objects when None).

        :return: the number of ids submitted for deletion
        """
        logger.info(f"Deleting from {self.target_class_name} where: {where}")
        if where is None:
            where = {}
        results = self.native_collection.get(where=where)
        # NOTE(review): chromadb's get() returns a dict of parallel lists
        # (keys like "ids", "documents", ...); iterating it yields key strings,
        # so result["id"] looks wrong here — verify against the chromadb API.
        ids = [result["id"] for result in results]
        self.native_collection.delete(ids=ids)
        return len(ids)

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Execute a Query against the chromadb collection.

        Only the query's where_clause and limit are honored.
        """
        chroma_filter = self._build_chroma_filter(query.where_clause)
        if query.limit:
            results = self.native_collection.get(where=chroma_filter, limit=query.limit)
        else:
            results = self.native_collection.get(where=chroma_filter)

        # NOTE(review): get() returns a dict of parallel lists, so len(results)
        # counts its keys rather than matching rows, and rows= receives a dict
        # rather than a list of row dicts — verify.
        count = len(results)
        return QueryResult(query=query, num_rows=count, rows=results)

    def query_facets(
        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, List[Tuple[Any, int]]]:
        """
        Compute facet counts for the given columns.

        :param where: filter restricting which objects are counted
        :param facet_columns: columns to facet on; defaults to all class attributes
        :param facet_limit: maximum number of facet values returned per column
        :return: map of column name to (value, count) pairs
        """
        results = {}
        cd = self.class_definition()
        if not facet_columns:
            facet_columns = list(self.class_definition().attributes.keys())

        for col in facet_columns:
            logger.debug(f"Faceting on {col}")
            if isinstance(col, tuple):
                # compound facet columns have no single slot definition
                sd = SlotDefinition(name="PLACEHOLDER")
            else:
                sd = cd.attributes[col]

            # NOTE(review): an aggregate() pipeline with $match/$unwind/$group
            # is MongoDB-style; chromadb collections do not expose aggregate()
            # — confirm this code path is actually exercised.
            if sd.multivalued:
                # unwind multivalued slots so each value is counted separately
                facet_results = self.native_collection.aggregate(
                    aggregation=[
                        {"$match": where} if where else {"$match": {}},
                        {"$unwind": f"${col}"},
                        {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
                        {"$sort": {"count": -1}},
                        {"$limit": facet_limit},
                    ]
                )
            else:
                facet_results = self.native_collection.aggregate(
                    aggregation=[
                        {"$match": where} if where else {"$match": {}},
                        {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
                        {"$sort": {"count": -1}},
                        {"$limit": facet_limit},
                    ]
                )

            results[col] = [(result["_id"], result["count"]) for result in facet_results]

        return results

    def _build_chroma_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
        """Translate a field->value where clause into a chromadb filter dict."""
        # NOTE(review): raises AttributeError if where_clause is None even
        # though Query.where_clause is Optional — verify callers always pass
        # at least an empty dict.
        chroma_filter = {}
        for field, value in where_clause.items():
            chroma_filter[field] = value
        return chroma_filter
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# chromadb_database.py
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import chromadb
|
|
7
|
+
from chromadb.config import Settings
|
|
8
|
+
from linkml_runtime import SchemaView
|
|
9
|
+
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
|
|
10
|
+
from linkml_runtime.utils.schema_builder import SchemaBuilder
|
|
11
|
+
|
|
12
|
+
from linkml_store.api import Database
|
|
13
|
+
from linkml_store.api.queries import Query, QueryResult
|
|
14
|
+
from linkml_store.api.stores.chromadb.chromadb_collection import ChromaDBCollection
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChromaDBDatabase(Database):
    """
    A Database adapter backed by a local, persisted ChromaDB instance.
    """

    # lazily created chromadb client; see the ``client`` property
    _client: chromadb.Client = None
    collection_class = ChromaDBCollection

    def __init__(self, handle: Optional[str] = None, **kwargs):
        """
        :param handle: persistence directory for chromadb; defaults to ".chromadb"
        """
        if handle is None:
            handle = ".chromadb"
        super().__init__(handle=handle, **kwargs)

    @property
    def client(self) -> chromadb.Client:
        """The chromadb client, created on first access and persisted under ``handle``."""
        if self._client is None:
            # NOTE(review): chroma_db_impl="duckdb+parquet" is a legacy setting
            # removed in chromadb >= 0.4 (PersistentClient replaced it) —
            # verify the pinned chromadb version still supports it.
            self._client = chromadb.Client(
                Settings(
                    chroma_db_impl="duckdb+parquet",
                    persist_directory=self.handle,
                )
            )
        return self._client

    def commit(self, **kwargs):
        # chromadb persists writes itself; nothing to do here
        pass

    def close(self, **kwargs):
        # NOTE(review): chromadb.Client does not expose close() in most
        # releases (older ones used persist()/reset()) — confirm.
        if self._client:
            self._client.close()

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Delegate a query to the collection named by ``query.from_table``.

        NOTE(review): implicitly returns None when from_table is unset, despite
        the QueryResult return annotation — confirm callers tolerate this.
        """
        if query.from_table:
            collection = self.get_collection(query.from_table)
            return collection.query(query, **kwargs)

    def init_collections(self):
        """Populate the collection registry from the chromadb server, idempotently."""
        if self._collections is None:
            self._collections = {}

        # NOTE(review): in chromadb < 0.6 list_collections() yields Collection
        # objects rather than name strings — verify against the pinned version.
        for collection_name in self.client.list_collections():
            if collection_name not in self._collections:
                collection = ChromaDBCollection(name=collection_name, parent=self)
                self._collections[collection_name] = collection

    def induce_schema_view(self) -> SchemaView:
        """
        Build a LinkML SchemaView by sampling one document from each collection.

        Slot multivaluedness/inlining is inferred from the sampled value types;
        explicit attribute metadata in ``self.metadata.collections`` replaces
        the induced class for that collection.
        """
        logger.info(f"Inducing schema view for {self.handle}")
        sb = SchemaBuilder()
        schema = sb.schema

        for collection_name in self.client.list_collections():
            sb.add_class(collection_name)
            chroma_collection = self.client.get_collection(collection_name)
            sample_doc = chroma_collection.peek(1)
            if sample_doc:
                # NOTE(review): peek() returns a dict of parallel lists in
                # recent chromadb; sample_doc[0].items() assumes a list of row
                # dicts — verify.
                for field, value in sample_doc[0].items():
                    if field == "_id":
                        continue
                    sd = SlotDefinition(field)
                    if isinstance(value, list):
                        sd.multivalued = True
                    if isinstance(value, dict):
                        sd.inlined = True
                    sb.schema.classes[collection_name].attributes[sd.name] = sd

        sb.add_defaults()
        # NOTE(review): deleting/adding entries while iterating schema.classes
        # mutates the dict during iteration and can raise RuntimeError —
        # should iterate over list(schema.classes); flagged, not changed.
        for cls_name in schema.classes:
            if cls_name in self.metadata.collections:
                collection_metadata = self.metadata.collections[cls_name]
                if collection_metadata.attributes:
                    del schema.classes[cls_name]
                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
                    schema.classes[cls.name] = cls

        return SchemaView(schema)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Dremio database adapter for linkml-store.
|
|
2
|
+
|
|
3
|
+
This module provides a Dremio adapter that uses Arrow Flight SQL for high-performance
|
|
4
|
+
data access to Dremio data lakehouse.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from linkml_store.api.stores.dremio.dremio_collection import DremioCollection
|
|
8
|
+
from linkml_store.api.stores.dremio.dremio_database import DremioDatabase
|
|
9
|
+
|
|
10
|
+
__all__ = ["DremioDatabase", "DremioCollection"]
|