linkml-store 0.0.0__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +108 -7
- linkml_store/api/collection.py +221 -30
- linkml_store/api/config.py +97 -0
- linkml_store/api/database.py +207 -17
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +0 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +114 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +35 -44
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +86 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/solr_collection.py +132 -0
- linkml_store/api/stores/solr/solr_database.py +82 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +73 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/METADATA +17 -6
- linkml_store-0.1.6.dist-info/RECORD +41 -0
- linkml_store-0.1.6.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.6.dist-info}/WHEEL +0 -0
linkml_store/api/__init__.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# flake8: noqa: E402
|
|
2
2
|
from linkml_store.api.collection import Collection
|
|
3
3
|
from linkml_store.api.database import Database
|
|
4
|
-
from linkml_store.api.metadata import MetaData
|
|
5
4
|
from linkml_store.api.client import Client
|
|
5
|
+
|
|
6
6
|
# flake8: noqa
|
|
7
7
|
|
|
8
|
-
__all__ = ["Client", "Database", "
|
|
8
|
+
__all__ = ["Client", "Database", "Collection"]
|
linkml_store/api/client.py
CHANGED
|
@@ -1,17 +1,24 @@
|
|
|
1
|
-
from
|
|
2
|
-
from typing import Dict, Optional
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Dict, Optional, Union
|
|
3
3
|
|
|
4
|
+
import yaml
|
|
4
5
|
from linkml_runtime import SchemaView
|
|
5
6
|
|
|
6
7
|
from linkml_store.api import Database
|
|
8
|
+
from linkml_store.api.config import ClientConfig
|
|
9
|
+
from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
|
|
7
10
|
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
|
|
11
|
+
from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
|
|
12
|
+
from linkml_store.api.stores.solr.solr_database import SolrDatabase
|
|
8
13
|
|
|
9
14
|
HANDLE_MAP = {
|
|
10
15
|
"duckdb": DuckDBDatabase,
|
|
16
|
+
"solr": SolrDatabase,
|
|
17
|
+
"mongodb": MongoDBDatabase,
|
|
18
|
+
"chromadb": ChromaDBDatabase,
|
|
11
19
|
}
|
|
12
20
|
|
|
13
21
|
|
|
14
|
-
@dataclass
|
|
15
22
|
class Client:
|
|
16
23
|
"""
|
|
17
24
|
A client provides access to named collections.
|
|
@@ -22,7 +29,7 @@ class Client:
|
|
|
22
29
|
>>> db = client.attach_database("duckdb", alias="test")
|
|
23
30
|
>>> collection = db.create_collection("Person")
|
|
24
31
|
>>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
|
|
25
|
-
>>> collection.
|
|
32
|
+
>>> collection.insert(objs)
|
|
26
33
|
>>> qr = collection.find()
|
|
27
34
|
>>> len(qr.rows)
|
|
28
35
|
2
|
|
@@ -38,9 +45,76 @@ class Client:
|
|
|
38
45
|
|
|
39
46
|
"""
|
|
40
47
|
|
|
41
|
-
|
|
48
|
+
metadata: Optional[ClientConfig] = None
|
|
42
49
|
_databases: Optional[Dict[str, Database]] = None
|
|
43
50
|
|
|
51
|
+
def __init__(self, handle: Optional[str] = None, metadata: Optional[ClientConfig] = None):
|
|
52
|
+
"""
|
|
53
|
+
Initialize a client.
|
|
54
|
+
|
|
55
|
+
:param handle:
|
|
56
|
+
:param metadata:
|
|
57
|
+
"""
|
|
58
|
+
self.metadata = metadata
|
|
59
|
+
if not self.metadata:
|
|
60
|
+
self.metadata = ClientConfig()
|
|
61
|
+
self.metadata.handle = handle
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def handle(self) -> Optional[str]:
|
|
65
|
+
return self.metadata.handle
|
|
66
|
+
|
|
67
|
+
@property
|
|
68
|
+
def base_dir(self) -> Optional[str]:
|
|
69
|
+
"""
|
|
70
|
+
Get the base directory for the client.
|
|
71
|
+
|
|
72
|
+
Wraps metadata.base_dir.
|
|
73
|
+
|
|
74
|
+
:return:
|
|
75
|
+
"""
|
|
76
|
+
return self.metadata.base_dir
|
|
77
|
+
|
|
78
|
+
def from_config(self, config: Union[ClientConfig, str, Path], base_dir=None, **kwargs):
|
|
79
|
+
"""
|
|
80
|
+
Create a client from a configuration.
|
|
81
|
+
|
|
82
|
+
Examples
|
|
83
|
+
--------
|
|
84
|
+
>>> from linkml_store.api.config import ClientConfig
|
|
85
|
+
>>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
|
|
86
|
+
>>> len(client.databases)
|
|
87
|
+
1
|
|
88
|
+
>>> "test" in client.databases
|
|
89
|
+
True
|
|
90
|
+
>>> client.databases["test"].handle
|
|
91
|
+
'duckdb:///:memory:'
|
|
92
|
+
|
|
93
|
+
:param config:
|
|
94
|
+
:param kwargs:
|
|
95
|
+
:return:
|
|
96
|
+
|
|
97
|
+
"""
|
|
98
|
+
if isinstance(config, Path):
|
|
99
|
+
config = str(config)
|
|
100
|
+
if isinstance(config, str):
|
|
101
|
+
if not base_dir:
|
|
102
|
+
base_dir = Path(config).parent
|
|
103
|
+
parsed_obj = yaml.safe_load(open(config))
|
|
104
|
+
config = ClientConfig(**parsed_obj)
|
|
105
|
+
self.metadata = config
|
|
106
|
+
if base_dir:
|
|
107
|
+
self.metadata.base_dir = base_dir
|
|
108
|
+
self._initialize_databases(**kwargs)
|
|
109
|
+
return self
|
|
110
|
+
|
|
111
|
+
def _initialize_databases(self, **kwargs):
|
|
112
|
+
for name, db_config in self.metadata.databases.items():
|
|
113
|
+
handle = db_config.handle.format(base_dir=self.base_dir)
|
|
114
|
+
db_config.handle = handle
|
|
115
|
+
db = self.attach_database(handle, alias=name, **kwargs)
|
|
116
|
+
db.from_config(db_config)
|
|
117
|
+
|
|
44
118
|
def attach_database(
|
|
45
119
|
self,
|
|
46
120
|
handle: str,
|
|
@@ -69,7 +143,6 @@ class Client:
|
|
|
69
143
|
:param schema_view: schema view to associate with the database
|
|
70
144
|
:param kwargs:
|
|
71
145
|
:return:
|
|
72
|
-
|
|
73
146
|
"""
|
|
74
147
|
if ":" not in handle:
|
|
75
148
|
scheme = handle
|
|
@@ -87,6 +160,7 @@ class Client:
|
|
|
87
160
|
if not self._databases:
|
|
88
161
|
self._databases = {}
|
|
89
162
|
self._databases[alias] = db
|
|
163
|
+
db.parent = self
|
|
90
164
|
return db
|
|
91
165
|
|
|
92
166
|
def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
|
|
@@ -101,7 +175,7 @@ class Client:
|
|
|
101
175
|
>>> db == retrieved_db
|
|
102
176
|
True
|
|
103
177
|
|
|
104
|
-
:param name:
|
|
178
|
+
:param name: if None, there must be a single database attached
|
|
105
179
|
:param create_if_not_exists:
|
|
106
180
|
:param kwargs:
|
|
107
181
|
:return:
|
|
@@ -149,3 +223,30 @@ class Client:
|
|
|
149
223
|
if not self._databases:
|
|
150
224
|
self._databases = {}
|
|
151
225
|
return self._databases
|
|
226
|
+
|
|
227
|
+
def drop_database(self, name: str, missing_ok=False, **kwargs):
|
|
228
|
+
"""
|
|
229
|
+
Drop a database.
|
|
230
|
+
|
|
231
|
+
:param name:
|
|
232
|
+
:param missing_ok:
|
|
233
|
+
:return:
|
|
234
|
+
"""
|
|
235
|
+
if name in self._databases:
|
|
236
|
+
db = self._databases[name]
|
|
237
|
+
db.drop(**kwargs)
|
|
238
|
+
del self._databases[name]
|
|
239
|
+
else:
|
|
240
|
+
if not missing_ok:
|
|
241
|
+
raise ValueError(f"Database {name} not found")
|
|
242
|
+
|
|
243
|
+
def drop_all_databases(self, **kwargs):
|
|
244
|
+
"""
|
|
245
|
+
Drop all databases.
|
|
246
|
+
|
|
247
|
+
:param missing_ok:
|
|
248
|
+
:return:
|
|
249
|
+
"""
|
|
250
|
+
for name in list(self._databases.keys()):
|
|
251
|
+
self.drop_database(name, missing_ok=False, **kwargs)
|
|
252
|
+
self._databases = {}
|
linkml_store/api/collection.py
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
import logging
|
|
2
3
|
from collections import defaultdict
|
|
3
|
-
from dataclasses import dataclass
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Type, Union
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
|
|
9
9
|
from linkml_runtime.linkml_model.meta import ArrayExpression
|
|
10
10
|
from pydantic import BaseModel
|
|
11
11
|
|
|
12
|
+
from linkml_store.index import get_indexer
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from linkml.validator.report import ValidationResult
|
|
16
|
+
except ImportError:
|
|
17
|
+
ValidationResult = None
|
|
18
|
+
|
|
19
|
+
from linkml_store.api.config import CollectionConfig
|
|
12
20
|
from linkml_store.api.queries import Query, QueryResult
|
|
13
|
-
from linkml_store.index.
|
|
21
|
+
from linkml_store.index.indexer import Indexer
|
|
14
22
|
|
|
15
23
|
if TYPE_CHECKING:
|
|
16
24
|
from linkml_store.api.database import Database
|
|
@@ -19,11 +27,11 @@ logger = logging.getLogger(__name__)
|
|
|
19
27
|
|
|
20
28
|
OBJECT = Union[Dict[str, Any], BaseModel, Type]
|
|
21
29
|
|
|
30
|
+
DEFAULT_FACET_LIMIT = 100
|
|
22
31
|
IDENTIFIER = str
|
|
23
32
|
FIELD_NAME = str
|
|
24
33
|
|
|
25
34
|
|
|
26
|
-
@dataclass
|
|
27
35
|
class Collection:
|
|
28
36
|
"""
|
|
29
37
|
A collection is an organized set of objects of the same or similar type.
|
|
@@ -33,12 +41,80 @@ class Collection:
|
|
|
33
41
|
- For a file system, a collection could be a single tabular file such as Parquet or CSV
|
|
34
42
|
"""
|
|
35
43
|
|
|
36
|
-
name: str
|
|
44
|
+
# name: str
|
|
37
45
|
parent: Optional["Database"] = None
|
|
38
|
-
|
|
39
|
-
hidden: Optional[bool] = False
|
|
46
|
+
_indexers: Optional[Dict[str, Indexer]] = None
|
|
47
|
+
# hidden: Optional[bool] = False
|
|
48
|
+
|
|
49
|
+
metadata: Optional[CollectionConfig] = None
|
|
50
|
+
|
|
51
|
+
def __init__(
|
|
52
|
+
self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
|
|
53
|
+
):
|
|
54
|
+
self.parent = parent
|
|
55
|
+
if metadata:
|
|
56
|
+
self.metadata = metadata
|
|
57
|
+
else:
|
|
58
|
+
self.metadata = CollectionConfig(name=name, **kwargs)
|
|
59
|
+
if name is not None and self.metadata.name is not None and name != self.metadata.name:
|
|
60
|
+
raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
|
|
61
|
+
|
|
62
|
+
@property
|
|
63
|
+
def name(self) -> str:
|
|
64
|
+
return self.metadata.name
|
|
65
|
+
|
|
66
|
+
@property
|
|
67
|
+
def hidden(self) -> bool:
|
|
68
|
+
return self.metadata.hidden
|
|
69
|
+
|
|
70
|
+
@property
|
|
71
|
+
def _target_class_name(self):
|
|
72
|
+
"""
|
|
73
|
+
Return the name of the class that this collection represents
|
|
74
|
+
|
|
75
|
+
This MUST be a LinkML class name
|
|
76
|
+
|
|
77
|
+
:return:
|
|
78
|
+
"""
|
|
79
|
+
# TODO: this is a shim layer until we can normalize on this
|
|
80
|
+
if self.metadata.type:
|
|
81
|
+
return self.metadata.type
|
|
82
|
+
return self.name
|
|
40
83
|
|
|
41
|
-
|
|
84
|
+
@property
|
|
85
|
+
def _alias(self):
|
|
86
|
+
"""
|
|
87
|
+
Return the primary name/alias used for the collection.
|
|
88
|
+
|
|
89
|
+
This MAY be the name of the LinkML class, but it may be desirable
|
|
90
|
+
to have an alias, for example "persons" which collects all instances
|
|
91
|
+
of class Person.
|
|
92
|
+
|
|
93
|
+
The _alias SHOULD be used for Table names in SQL.
|
|
94
|
+
|
|
95
|
+
For nested data, the alias SHOULD be used as the key; e.g
|
|
96
|
+
|
|
97
|
+
``{ "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }``
|
|
98
|
+
|
|
99
|
+
:return:
|
|
100
|
+
"""
|
|
101
|
+
# TODO: this is a shim layer until we can normalize on this
|
|
102
|
+
if self.metadata.alias:
|
|
103
|
+
return self.metadata.alias
|
|
104
|
+
return self.name
|
|
105
|
+
|
|
106
|
+
def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
|
|
107
|
+
"""
|
|
108
|
+
Replace entire collection with objects.
|
|
109
|
+
|
|
110
|
+
:param objs:
|
|
111
|
+
:param kwargs:
|
|
112
|
+
:return:
|
|
113
|
+
"""
|
|
114
|
+
self.delete_where({})
|
|
115
|
+
self.insert(objs, **kwargs)
|
|
116
|
+
|
|
117
|
+
def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
|
|
42
118
|
"""
|
|
43
119
|
Add one or more objects to the collection
|
|
44
120
|
|
|
@@ -58,13 +134,14 @@ class Collection:
|
|
|
58
134
|
"""
|
|
59
135
|
raise NotImplementedError
|
|
60
136
|
|
|
61
|
-
def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
|
|
137
|
+
def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
|
|
62
138
|
"""
|
|
63
139
|
Delete objects that match a query
|
|
64
140
|
|
|
65
|
-
:param where:
|
|
141
|
+
:param where: where conditions
|
|
142
|
+
:param missing_ok: if True, do not raise an error if the collection does not exist
|
|
66
143
|
:param kwargs:
|
|
67
|
-
:return:
|
|
144
|
+
:return: number of objects deleted (or -1 if unsupported)
|
|
68
145
|
"""
|
|
69
146
|
raise NotImplementedError
|
|
70
147
|
|
|
@@ -79,7 +156,7 @@ class Collection:
|
|
|
79
156
|
raise NotImplementedError
|
|
80
157
|
|
|
81
158
|
def _create_query(self, **kwargs) -> Query:
|
|
82
|
-
return Query(from_table=self.
|
|
159
|
+
return Query(from_table=self._alias, **kwargs)
|
|
83
160
|
|
|
84
161
|
def query(self, query: Query, **kwargs) -> QueryResult:
|
|
85
162
|
"""
|
|
@@ -91,7 +168,9 @@ class Collection:
|
|
|
91
168
|
"""
|
|
92
169
|
return self.parent.query(query, **kwargs)
|
|
93
170
|
|
|
94
|
-
def query_facets(
|
|
171
|
+
def query_facets(
|
|
172
|
+
self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
|
|
173
|
+
) -> Dict[str, Dict[str, int]]:
|
|
95
174
|
"""
|
|
96
175
|
Run a query to get facet counts for one or more columns.
|
|
97
176
|
|
|
@@ -108,17 +187,32 @@ class Collection:
|
|
|
108
187
|
:param con: A DuckDB database connection.
|
|
109
188
|
:param query: A Query object representing the base query.
|
|
110
189
|
:param facet_columns: A list of column names to get facet counts for.
|
|
190
|
+
:param facet_limit:
|
|
111
191
|
:return: A dictionary where keys are column names and values are pandas DataFrames
|
|
112
192
|
containing the facet counts for each unique value in the respective column.
|
|
113
193
|
"""
|
|
114
194
|
raise NotImplementedError
|
|
115
195
|
|
|
116
196
|
def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
|
|
197
|
+
"""
|
|
198
|
+
Get one or more objects by ID.
|
|
199
|
+
|
|
200
|
+
:param ids:
|
|
201
|
+
:param kwargs:
|
|
202
|
+
:return:
|
|
203
|
+
"""
|
|
117
204
|
id_field = self.identifier_field
|
|
118
205
|
q = self._create_query(where_clause={id_field: ids})
|
|
119
206
|
return self.query(q, **kwargs)
|
|
120
207
|
|
|
121
208
|
def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
|
|
209
|
+
"""
|
|
210
|
+
Find objects in the collection using a where query.
|
|
211
|
+
|
|
212
|
+
:param where:
|
|
213
|
+
:param kwargs:
|
|
214
|
+
:return:
|
|
215
|
+
"""
|
|
122
216
|
query = self._create_query(where_clause=where)
|
|
123
217
|
return self.query(query, **kwargs)
|
|
124
218
|
|
|
@@ -141,66 +235,122 @@ class Collection:
|
|
|
141
235
|
:return:
|
|
142
236
|
"""
|
|
143
237
|
if index_name is None:
|
|
144
|
-
if len(self.
|
|
145
|
-
index_name = list(self.
|
|
238
|
+
if len(self._indexers) == 1:
|
|
239
|
+
index_name = list(self._indexers.keys())[0]
|
|
146
240
|
else:
|
|
147
241
|
raise ValueError("Multiple indexes found. Please specify an index name.")
|
|
148
242
|
ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
|
|
149
|
-
ix = self.
|
|
243
|
+
ix = self._indexers.get(index_name)
|
|
150
244
|
if not ix:
|
|
151
245
|
raise ValueError(f"No index named {index_name}")
|
|
152
246
|
qr = ix_coll.find(where=where, limit=-1, **kwargs)
|
|
153
247
|
index_col = ix.index_field
|
|
154
248
|
vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
|
|
155
249
|
results = ix.search(query, vector_pairs, limit=limit)
|
|
250
|
+
for r in results:
|
|
251
|
+
del r[1][index_col]
|
|
156
252
|
new_qr = QueryResult(num_rows=len(results))
|
|
157
253
|
new_qr.ranked_rows = results
|
|
158
254
|
return new_qr
|
|
159
255
|
|
|
160
|
-
|
|
256
|
+
@property
|
|
257
|
+
def is_internal(self) -> bool:
|
|
258
|
+
"""
|
|
259
|
+
Check if the collection is internal
|
|
260
|
+
|
|
261
|
+
:return:
|
|
262
|
+
"""
|
|
263
|
+
if not self.name:
|
|
264
|
+
raise ValueError(f"Collection has no name: {self} // {self.metadata}")
|
|
265
|
+
return self.name.startswith("internal__")
|
|
266
|
+
|
|
267
|
+
def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = True, auto_index=True, **kwargs):
|
|
161
268
|
"""
|
|
162
269
|
Attach an index to the collection.
|
|
163
270
|
|
|
164
271
|
:param index:
|
|
165
|
-
:param
|
|
272
|
+
:param name:
|
|
273
|
+
:param auto_index: Automatically index all objects in the collection
|
|
166
274
|
:param kwargs:
|
|
167
275
|
:return:
|
|
168
276
|
"""
|
|
277
|
+
if isinstance(index, str):
|
|
278
|
+
index = get_indexer(index)
|
|
279
|
+
if name:
|
|
280
|
+
index.name = name
|
|
281
|
+
if not index.name:
|
|
282
|
+
index.name = type(index).__name__.lower()
|
|
169
283
|
index_name = index.name
|
|
170
284
|
if not index_name:
|
|
171
285
|
raise ValueError("Index must have a name")
|
|
172
|
-
if not self.
|
|
173
|
-
self.
|
|
174
|
-
self.
|
|
286
|
+
if not self._indexers:
|
|
287
|
+
self._indexers = {}
|
|
288
|
+
self._indexers[index_name] = index
|
|
175
289
|
if auto_index:
|
|
176
290
|
all_objs = self.find(limit=-1).rows
|
|
177
|
-
self.index_objects(all_objs, index_name, **kwargs)
|
|
291
|
+
self.index_objects(all_objs, index_name, replace=True, **kwargs)
|
|
178
292
|
|
|
179
293
|
def _index_collection_name(self, index_name: str) -> str:
|
|
180
|
-
|
|
294
|
+
"""
|
|
295
|
+
Create a name for a special collection that holds index data
|
|
296
|
+
|
|
297
|
+
:param index_name:
|
|
298
|
+
:return:
|
|
299
|
+
"""
|
|
300
|
+
return f"internal__index__{self.name}__{index_name}"
|
|
181
301
|
|
|
182
|
-
def index_objects(self, objs: List[OBJECT], index_name: str, **kwargs):
|
|
302
|
+
def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
|
|
183
303
|
"""
|
|
184
304
|
Index a list of objects
|
|
185
305
|
|
|
186
306
|
:param objs:
|
|
187
307
|
:param index_name:
|
|
308
|
+
:param replace:
|
|
188
309
|
:param kwargs:
|
|
189
310
|
:return:
|
|
190
311
|
"""
|
|
191
|
-
ix = self.
|
|
312
|
+
ix = self._indexers.get(index_name)
|
|
192
313
|
if not ix:
|
|
193
314
|
raise ValueError(f"No index named {index_name}")
|
|
194
|
-
|
|
315
|
+
ix_coll_name = self._index_collection_name(index_name)
|
|
316
|
+
ix_coll = self.parent.get_collection(ix_coll_name, create_if_not_exists=True)
|
|
195
317
|
vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]
|
|
196
318
|
objects_with_ix = []
|
|
197
319
|
index_col = ix.index_field
|
|
198
320
|
for obj, vector in zip(objs, vectors):
|
|
199
321
|
# TODO: id field
|
|
200
322
|
objects_with_ix.append({**obj, **{index_col: vector}})
|
|
201
|
-
|
|
323
|
+
if replace:
|
|
324
|
+
schema = self.parent.schema_view.schema
|
|
325
|
+
logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
|
|
326
|
+
if ix_coll_name in schema.classes:
|
|
327
|
+
ix_coll.delete_where()
|
|
328
|
+
ix_coll.insert(objects_with_ix, **kwargs)
|
|
329
|
+
|
|
330
|
+
def list_index_names(self) -> List[str]:
|
|
331
|
+
"""
|
|
332
|
+
Return a list of index names
|
|
333
|
+
|
|
334
|
+
:return:
|
|
335
|
+
"""
|
|
336
|
+
return list(self._indexers.keys())
|
|
337
|
+
|
|
338
|
+
@property
|
|
339
|
+
def indexers(self) -> Dict[str, Indexer]:
|
|
340
|
+
"""
|
|
341
|
+
Return a list of indexers
|
|
342
|
+
|
|
343
|
+
:return:
|
|
344
|
+
"""
|
|
345
|
+
return self._indexers if self._indexers else {}
|
|
202
346
|
|
|
203
347
|
def peek(self, limit: Optional[int] = None) -> QueryResult:
|
|
348
|
+
"""
|
|
349
|
+
Return the first N objects in the collection
|
|
350
|
+
|
|
351
|
+
:param limit:
|
|
352
|
+
:return:
|
|
353
|
+
"""
|
|
204
354
|
q = self._create_query()
|
|
205
355
|
return self.query(q, limit=limit)
|
|
206
356
|
|
|
@@ -212,13 +362,16 @@ class Collection:
|
|
|
212
362
|
"""
|
|
213
363
|
sv = self.parent.schema_view
|
|
214
364
|
if sv:
|
|
215
|
-
|
|
365
|
+
cls = sv.get_class(self._target_class_name)
|
|
366
|
+
return cls
|
|
216
367
|
return None
|
|
217
368
|
|
|
218
369
|
def identifier_attribute_name(self) -> Optional[str]:
|
|
219
370
|
"""
|
|
220
371
|
Return the name of the identifier attribute for the collection.
|
|
221
372
|
|
|
373
|
+
AKA the primary key.
|
|
374
|
+
|
|
222
375
|
:return: The name of the identifier attribute, if one exists.
|
|
223
376
|
"""
|
|
224
377
|
cd = self.class_definition()
|
|
@@ -228,6 +381,25 @@ class Collection:
|
|
|
228
381
|
return att.name
|
|
229
382
|
return None
|
|
230
383
|
|
|
384
|
+
def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
|
|
385
|
+
"""
|
|
386
|
+
Return the identifier for an object.
|
|
387
|
+
|
|
388
|
+
:param obj:
|
|
389
|
+
:param auto: If True, generate an identifier if one does not exist.
|
|
390
|
+
:return:
|
|
391
|
+
"""
|
|
392
|
+
pk = self.identifier_attribute_name
|
|
393
|
+
if pk in obj:
|
|
394
|
+
return obj[pk]
|
|
395
|
+
elif auto:
|
|
396
|
+
# TODO: use other unique keys if no primary key
|
|
397
|
+
as_str = str(obj)
|
|
398
|
+
md5 = hashlib.md5(as_str.encode()).hexdigest()
|
|
399
|
+
return md5
|
|
400
|
+
else:
|
|
401
|
+
return None
|
|
402
|
+
|
|
231
403
|
def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_size=10) -> ClassDefinition:
|
|
232
404
|
"""
|
|
233
405
|
Induce a class definition from a list of objects.
|
|
@@ -239,7 +411,7 @@ class Collection:
|
|
|
239
411
|
:param max_sample_size:
|
|
240
412
|
:return:
|
|
241
413
|
"""
|
|
242
|
-
cd = ClassDefinition(self.
|
|
414
|
+
cd = ClassDefinition(self._target_class_name)
|
|
243
415
|
keys = defaultdict(list)
|
|
244
416
|
for obj in objs[0:max_sample_size]:
|
|
245
417
|
if isinstance(obj, BaseModel):
|
|
@@ -302,7 +474,7 @@ class Collection:
|
|
|
302
474
|
array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
|
|
303
475
|
cd.attributes[k].array = array_expr
|
|
304
476
|
sv = self.parent.schema_view
|
|
305
|
-
sv.schema.classes[self.
|
|
477
|
+
sv.schema.classes[self._target_class_name] = cd
|
|
306
478
|
sv.set_modified()
|
|
307
479
|
return cd
|
|
308
480
|
|
|
@@ -325,3 +497,22 @@ class Collection:
|
|
|
325
497
|
:return:
|
|
326
498
|
"""
|
|
327
499
|
raise NotImplementedError
|
|
500
|
+
|
|
501
|
+
def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
|
|
502
|
+
"""
|
|
503
|
+
Validate the contents of the collection
|
|
504
|
+
|
|
505
|
+
:param kwargs:
|
|
506
|
+
:return: iterator over validation results
|
|
507
|
+
"""
|
|
508
|
+
from linkml.validator import JsonschemaValidationPlugin, Validator
|
|
509
|
+
|
|
510
|
+
validation_plugins = [JsonschemaValidationPlugin(closed=True)]
|
|
511
|
+
validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
|
|
512
|
+
cd = self.class_definition()
|
|
513
|
+
if not cd:
|
|
514
|
+
raise ValueError(f"Cannot find class definition for {self._target_class_name}")
|
|
515
|
+
class_name = cd.name
|
|
516
|
+
result = self.find(**kwargs)
|
|
517
|
+
for obj in result.rows:
|
|
518
|
+
yield from validator.iter_results(obj, class_name)
|
|
linkml_store/api/config.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CollectionConfig(BaseModel):
|
|
7
|
+
name: Optional[str] = Field(
|
|
8
|
+
default=None,
|
|
9
|
+
description="An optional name for the collection",
|
|
10
|
+
)
|
|
11
|
+
alias: Optional[str] = Field(
|
|
12
|
+
default=None,
|
|
13
|
+
description="An optional alias for the collection",
|
|
14
|
+
)
|
|
15
|
+
type: Optional[str] = Field(
|
|
16
|
+
default=None,
|
|
17
|
+
description="The type of object in the collection. TODO; use this instead of name",
|
|
18
|
+
)
|
|
19
|
+
metadata: Optional[Dict] = Field(
|
|
20
|
+
default=None,
|
|
21
|
+
description="Optional metadata for the collection",
|
|
22
|
+
)
|
|
23
|
+
attributes: Optional[Dict[str, Dict]] = Field(
|
|
24
|
+
default=None,
|
|
25
|
+
description="Optional attributes for the collection, following LinkML schema",
|
|
26
|
+
)
|
|
27
|
+
indexers: Optional[Dict[str, Dict]] = Field(
|
|
28
|
+
default=None,
|
|
29
|
+
description="Optional configuration for indexers",
|
|
30
|
+
)
|
|
31
|
+
hidden: Optional[bool] = Field(
|
|
32
|
+
default=False,
|
|
33
|
+
description="Whether the collection is hidden",
|
|
34
|
+
)
|
|
35
|
+
is_prepopulated: Optional[bool] = Field(
|
|
36
|
+
default=False,
|
|
37
|
+
description="Whether the collection is prepopulated",
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DatabaseConfig(BaseModel):
|
|
42
|
+
handle: str = Field(
|
|
43
|
+
default="duckdb:///:memory:",
|
|
44
|
+
description="The database handle, e.g., 'duckdb:///:memory:' or 'mongodb://localhost:27017'",
|
|
45
|
+
)
|
|
46
|
+
alias: Optional[str] = Field(
|
|
47
|
+
default=None,
|
|
48
|
+
description="An optional alias for the database",
|
|
49
|
+
)
|
|
50
|
+
schema_location: Optional[str] = Field(
|
|
51
|
+
default=None,
|
|
52
|
+
description="The location of the schema file, either a path on disk or URL",
|
|
53
|
+
)
|
|
54
|
+
schema_dict: Optional[Dict[str, Any]] = Field(
|
|
55
|
+
default=None,
|
|
56
|
+
description="The LinkML schema as a dictionary",
|
|
57
|
+
)
|
|
58
|
+
collections: Dict[str, CollectionConfig] = Field(
|
|
59
|
+
default={},
|
|
60
|
+
description="A dictionary of collection configurations",
|
|
61
|
+
)
|
|
62
|
+
recreate_if_exists: bool = Field(
|
|
63
|
+
default=False,
|
|
64
|
+
description="Whether to recreate the database if it already exists",
|
|
65
|
+
)
|
|
66
|
+
collection_type_slot: Optional[str] = Field(
|
|
67
|
+
default=None,
|
|
68
|
+
description=(
|
|
69
|
+
"For databases that combine multiple collections into a single space, this field"
|
|
70
|
+
"specifies the field that contains the collection type. An example of this is a Solr"
|
|
71
|
+
"index that does not use cores for collections, and instead uses a single global"
|
|
72
|
+
"document space; if this has a field 'document_type', then this field should be set"
|
|
73
|
+
),
|
|
74
|
+
)
|
|
75
|
+
searchable_slots: Optional[List[str]] = Field(
|
|
76
|
+
default=None,
|
|
77
|
+
description="Optional configuration for search fields",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ClientConfig(BaseModel):
|
|
82
|
+
handle: Optional[str] = Field(
|
|
83
|
+
default=None,
|
|
84
|
+
description="The client handle",
|
|
85
|
+
)
|
|
86
|
+
databases: Dict[str, DatabaseConfig] = Field(
|
|
87
|
+
default={},
|
|
88
|
+
description="A dictionary of database configurations",
|
|
89
|
+
)
|
|
90
|
+
schema_path: Optional[str] = Field(
|
|
91
|
+
default=None,
|
|
92
|
+
description="The path to the LinkML schema file",
|
|
93
|
+
)
|
|
94
|
+
base_dir: Optional[str] = Field(
|
|
95
|
+
default=None,
|
|
96
|
+
description="The base directory for the client",
|
|
97
|
+
)
|