linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +113 -8
- linkml_store/api/collection.py +272 -34
- linkml_store/api/config.py +101 -0
- linkml_store/api/database.py +282 -18
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +3 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/__init__.py +7 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +133 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +81 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
- linkml_store-0.1.7.dist-info/RECORD +42 -0
- linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
linkml_store/api/__init__.py
CHANGED
@@ -1,8 +1,8 @@
 # flake8: noqa: E402
 from linkml_store.api.collection import Collection
 from linkml_store.api.database import Database
-from linkml_store.api.metadata import MetaData
 from linkml_store.api.client import Client
+
 # flake8: noqa

-__all__ = ["Client", "Database", "
+__all__ = ["Client", "Database", "Collection"]
linkml_store/api/client.py
CHANGED
@@ -1,20 +1,31 @@
-from
-from typing import Dict, Optional
+from pathlib import Path
+from typing import Dict, Optional, Union

+import yaml
 from linkml_runtime import SchemaView

 from linkml_store.api import Database
+from linkml_store.api.config import ClientConfig
+from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
 from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
+from linkml_store.api.stores.solr.solr_database import SolrDatabase

 HANDLE_MAP = {
     "duckdb": DuckDBDatabase,
+    "solr": SolrDatabase,
+    "mongodb": MongoDBDatabase,
+    "chromadb": ChromaDBDatabase,
 }


-@dataclass
 class Client:
     """
-    A client
+    A client is the top-level object for interacting with databases.
+
+    A client has access to one or more :class:`Database` objects.
+
+    Each database consists of a number of :class:`.Collection` objects.

     Examples
     --------
@@ -22,7 +33,7 @@ class Client:
     >>> db = client.attach_database("duckdb", alias="test")
     >>> collection = db.create_collection("Person")
     >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
-    >>> collection.
+    >>> collection.insert(objs)
     >>> qr = collection.find()
     >>> len(qr.rows)
     2
@@ -38,9 +49,76 @@ class Client:

     """

-
+    metadata: Optional[ClientConfig] = None
     _databases: Optional[Dict[str, Database]] = None

+    def __init__(self, handle: Optional[str] = None, metadata: Optional[ClientConfig] = None):
+        """
+        Initialize a client.
+
+        :param handle:
+        :param metadata:
+        """
+        self.metadata = metadata
+        if not self.metadata:
+            self.metadata = ClientConfig()
+        self.metadata.handle = handle
+
+    @property
+    def handle(self) -> Optional[str]:
+        return self.metadata.handle
+
+    @property
+    def base_dir(self) -> Optional[str]:
+        """
+        Get the base directory for the client.
+
+        Wraps metadata.base_dir.
+
+        :return:
+        """
+        return self.metadata.base_dir
+
+    def from_config(self, config: Union[ClientConfig, str, Path], base_dir=None, **kwargs):
+        """
+        Create a client from a configuration.
+
+        Examples
+        --------
+        >>> from linkml_store.api.config import ClientConfig
+        >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
+        >>> len(client.databases)
+        1
+        >>> "test" in client.databases
+        True
+        >>> client.databases["test"].handle
+        'duckdb:///:memory:'
+
+        :param config:
+        :param kwargs:
+        :return:
+
+        """
+        if isinstance(config, Path):
+            config = str(config)
+        if isinstance(config, str):
+            if not base_dir:
+                base_dir = Path(config).parent
+            parsed_obj = yaml.safe_load(open(config))
+            config = ClientConfig(**parsed_obj)
+        self.metadata = config
+        if base_dir:
+            self.metadata.base_dir = base_dir
+        self._initialize_databases(**kwargs)
+        return self
+
+    def _initialize_databases(self, **kwargs):
+        for name, db_config in self.metadata.databases.items():
+            handle = db_config.handle.format(base_dir=self.base_dir)
+            db_config.handle = handle
+            db = self.attach_database(handle, alias=name, **kwargs)
+            db.from_config(db_config)
+
     def attach_database(
         self,
         handle: str,
@@ -69,7 +147,6 @@ class Client:
         :param schema_view: schema view to associate with the database
         :param kwargs:
         :return:
-
         """
         if ":" not in handle:
             scheme = handle
@@ -87,6 +164,7 @@ class Client:
         if not self._databases:
             self._databases = {}
         self._databases[alias] = db
+        db.parent = self
         return db

     def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -101,7 +179,7 @@ class Client:
        >>> db == retrieved_db
        True

-       :param name:
+       :param name: if None, there must be a single database attached
        :param create_if_not_exists:
        :param kwargs:
        :return:
@@ -149,3 +227,30 @@ class Client:
         if not self._databases:
             self._databases = {}
         return self._databases
+
+    def drop_database(self, name: str, missing_ok=False, **kwargs):
+        """
+        Drop a database.
+
+        :param name:
+        :param missing_ok:
+        :return:
+        """
+        if name in self._databases:
+            db = self._databases[name]
+            db.drop(**kwargs)
+            del self._databases[name]
+        else:
+            if not missing_ok:
+                raise ValueError(f"Database {name} not found")
+
+    def drop_all_databases(self, **kwargs):
+        """
+        Drop all databases.
+
+        :param missing_ok:
+        :return:
+        """
+        for name in list(self._databases.keys()):
+            self.drop_database(name, missing_ok=False, **kwargs)
+        self._databases = {}
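
Taken together, these changes replace the dataclass-style Client with an explicit constructor, register Solr, MongoDB, and ChromaDB handles, and add configuration-driven setup plus drop_database/drop_all_databases. A rough usage sketch assembled from the doctests shown in this diff (illustrative only, not verified against the released wheel):

from linkml_store.api.client import Client
from linkml_store.api.config import ClientConfig

# Attach an in-memory DuckDB database and insert a couple of objects,
# following the Client class docstring above.
client = Client()
db = client.attach_database("duckdb", alias="test")
collection = db.create_collection("Person")
collection.insert([
    {"id": "P1", "name": "John", "age_in_years": 30},
    {"id": "P2", "name": "Alice", "age_in_years": 25},
])
assert len(collection.find().rows) == 2

# Configuration-driven setup, following the from_config doctest above.
client2 = Client().from_config(
    ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}})
)
assert client2.databases["test"].handle == "duckdb:///:memory:"

# New teardown helpers added in this release.
client2.drop_database("test")
client.drop_all_databases()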
linkml_store/api/collection.py
CHANGED
@@ -1,16 +1,25 @@
+import hashlib
 import logging
 from collections import defaultdict
-from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, TextIO, Type, Union

 import numpy as np
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.linkml_model.meta import ArrayExpression
 from pydantic import BaseModel

+from linkml_store.index import get_indexer
+from linkml_store.utils.object_utils import clean_empties
+
+try:
+    from linkml.validator.report import ValidationResult
+except ImportError:
+    ValidationResult = None
+
+from linkml_store.api.config import CollectionConfig
 from linkml_store.api.queries import Query, QueryResult
-from linkml_store.index.
+from linkml_store.index.indexer import Indexer

 if TYPE_CHECKING:
     from linkml_store.api.database import Database
@@ -19,11 +28,11 @@ logger = logging.getLogger(__name__)

 OBJECT = Union[Dict[str, Any], BaseModel, Type]

+DEFAULT_FACET_LIMIT = 100
 IDENTIFIER = str
 FIELD_NAME = str


-@dataclass
 class Collection:
     """
     A collection is an organized set of objects of the same or similar type.
@@ -33,12 +42,93 @@ class Collection:
     - For a file system, a collection could be a single tabular file such as Parquet or CSV
     """

-    name: str
+    # name: str
     parent: Optional["Database"] = None
-
-    hidden: Optional[bool] = False
+    _indexers: Optional[Dict[str, Indexer]] = None
+    # hidden: Optional[bool] = False
+
+    metadata: Optional[CollectionConfig] = None
+
+    def __init__(
+        self, name: str, parent: Optional["Database"] = None, metadata: Optional[CollectionConfig] = None, **kwargs
+    ):
+        self.parent = parent
+        if metadata:
+            self.metadata = metadata
+        else:
+            self.metadata = CollectionConfig(name=name, **kwargs)
+        if name is not None and self.metadata.name is not None and name != self.metadata.name:
+            raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
+
+    @property
+    def name(self) -> str:
+        """
+        Return the name of the collection

-
+        :return:
+        """
+        return self.metadata.name
+
+    @property
+    def hidden(self) -> bool:
+        """
+        True if the collection is hidden.
+
+        An example of a hidden collection is a collection that indexes another
+        collection
+
+        :return: True if the collection is hidden
+        """
+        return self.metadata.hidden
+
+    @property
+    def target_class_name(self):
+        """
+        Return the name of the class that this collection represents
+
+        This MUST be a LinkML class name
+
+        :return:
+        """
+        # TODO: this is a shim layer until we can normalize on this
+        if self.metadata.type:
+            return self.metadata.type
+        return self.name
+
+    @property
+    def alias(self):
+        """
+        Return the primary name/alias used for the collection.
+
+        This MAY be the name of the LinkML class, but it may be desirable
+        to have an alias, for example "persons" which collects all instances
+        of class Person.
+
+        The _alias SHOULD be used for Table names in SQL.
+
+        For nested data, the alias SHOULD be used as the key; e.g
+
+        ``{ "persons": [ { "name": "Alice" }, { "name": "Bob" } ] }``
+
+        :return:
+        """
+        # TODO: this is a shim layer until we can normalize on this
+        if self.metadata.alias:
+            return self.metadata.alias
+        return self.name
+
+    def replace(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        """
+        Replace entire collection with objects.
+
+        :param objs:
+        :param kwargs:
+        :return:
+        """
+        self.delete_where({})
+        self.insert(objs, **kwargs)
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         """
         Add one or more objects to the collection

@@ -58,13 +148,14 @@ class Collection:
         """
         raise NotImplementedError

-    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
         """
         Delete objects that match a query

-        :param where:
+        :param where: where conditions
+        :param missing_ok: if True, do not raise an error if the collection does not exist
         :param kwargs:
-        :return:
+        :return: number of objects deleted (or -1 if unsupported)
         """
         raise NotImplementedError

@@ -79,7 +170,7 @@ class Collection:
         raise NotImplementedError

     def _create_query(self, **kwargs) -> Query:
-        return Query(from_table=self.
+        return Query(from_table=self.alias, **kwargs)

     def query(self, query: Query, **kwargs) -> QueryResult:
         """
@@ -91,7 +182,9 @@ class Collection:
         """
         return self.parent.query(query, **kwargs)

-    def query_facets(
+    def query_facets(
+        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, Dict[str, int]]:
         """
         Run a query to get facet counts for one or more columns.

@@ -108,20 +201,66 @@ class Collection:
         :param con: A DuckDB database connection.
         :param query: A Query object representing the base query.
         :param facet_columns: A list of column names to get facet counts for.
+        :param facet_limit:
         :return: A dictionary where keys are column names and values are pandas DataFrames
                  containing the facet counts for each unique value in the respective column.
         """
         raise NotImplementedError

     def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
-
-
-
+        """
+        Get one or more objects by ID.
+
+        :param ids:
+        :param kwargs:
+        :return:
+        """
+        # TODO
+        id_field = self.identifier_attribute_name
+        return self.find({id_field: ids})
+
+    def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
+        """
+        Get one object by ID.
+
+        :param id:
+        :param kwargs:
+        :return:
+        """
+        if not id:
+            raise ValueError("Must pass an ID")
+        id_field = self.identifier_attribute_name
+        if not id_field:
+            raise ValueError(f"No identifier for {self.name}")
+        w = {id_field: id}
+        qr = self.find(w)
+        if qr.num_rows == 1:
+            return qr.rows[0]
+        return None

     def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
+        """
+        Find objects in the collection using a where query.
+
+        :param where:
+        :param kwargs:
+        :return:
+        """
         query = self._create_query(where_clause=where)
         return self.query(query, **kwargs)

+    def find_iter(self, where: Optional[Any] = None, **kwargs) -> Iterator[OBJECT]:
+        """
+        Find objects in the collection using a where query.
+
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        qr = self.find(where=where, limit=-1, **kwargs)
+        for row in qr.rows:
+            yield row
+
     def search(
         self,
         query: str,
@@ -141,66 +280,122 @@ class Collection:
         :return:
         """
         if index_name is None:
-            if len(self.
-                index_name = list(self.
+            if len(self._indexers) == 1:
+                index_name = list(self._indexers.keys())[0]
             else:
                 raise ValueError("Multiple indexes found. Please specify an index name.")
         ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
-        ix = self.
+        ix = self._indexers.get(index_name)
         if not ix:
             raise ValueError(f"No index named {index_name}")
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit)
+        for r in results:
+            del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         return new_qr

-
+    @property
+    def is_internal(self) -> bool:
+        """
+        Check if the collection is internal
+
+        :return:
+        """
+        if not self.name:
+            raise ValueError(f"Collection has no name: {self} // {self.metadata}")
+        return self.name.startswith("internal__")
+
+    def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = True, auto_index=True, **kwargs):
         """
         Attach an index to the collection.

         :param index:
-        :param
+        :param name:
+        :param auto_index: Automatically index all objects in the collection
         :param kwargs:
         :return:
         """
+        if isinstance(index, str):
+            index = get_indexer(index)
+        if name:
+            index.name = name
+        if not index.name:
+            index.name = type(index).__name__.lower()
         index_name = index.name
         if not index_name:
             raise ValueError("Index must have a name")
-        if not self.
-            self.
-        self.
+        if not self._indexers:
+            self._indexers = {}
+        self._indexers[index_name] = index
         if auto_index:
             all_objs = self.find(limit=-1).rows
-            self.index_objects(all_objs, index_name, **kwargs)
+            self.index_objects(all_objs, index_name, replace=True, **kwargs)

     def _index_collection_name(self, index_name: str) -> str:
-
+        """
+        Create a name for a special collection that holds index data
+
+        :param index_name:
+        :return:
+        """
+        return f"internal__index__{self.name}__{index_name}"

-    def index_objects(self, objs: List[OBJECT], index_name: str, **kwargs):
+    def index_objects(self, objs: List[OBJECT], index_name: str, replace=False, **kwargs):
         """
         Index a list of objects

         :param objs:
         :param index_name:
+        :param replace:
         :param kwargs:
         :return:
         """
-        ix = self.
+        ix = self._indexers.get(index_name)
         if not ix:
             raise ValueError(f"No index named {index_name}")
-
+        ix_coll_name = self._index_collection_name(index_name)
+        ix_coll = self.parent.get_collection(ix_coll_name, create_if_not_exists=True)
         vectors = [list(float(e) for e in v) for v in ix.objects_to_vectors(objs)]
         objects_with_ix = []
         index_col = ix.index_field
         for obj, vector in zip(objs, vectors):
             # TODO: id field
             objects_with_ix.append({**obj, **{index_col: vector}})
-
+        if replace:
+            schema = self.parent.schema_view.schema
+            logger.info(f"Checking if {ix_coll_name} is in {schema.classes.keys()}")
+            if ix_coll_name in schema.classes:
+                ix_coll.delete_where()
+        ix_coll.insert(objects_with_ix, **kwargs)
+
+    def list_index_names(self) -> List[str]:
+        """
+        Return a list of index names
+
+        :return:
+        """
+        return list(self._indexers.keys())
+
+    @property
+    def indexers(self) -> Dict[str, Indexer]:
+        """
+        Return a list of indexers
+
+        :return:
+        """
+        return self._indexers if self._indexers else {}

     def peek(self, limit: Optional[int] = None) -> QueryResult:
+        """
+        Return the first N objects in the collection
+
+        :param limit:
+        :return:
+        """
         q = self._create_query()
         return self.query(q, limit=limit)

@@ -212,22 +407,45 @@ class Collection:
         """
         sv = self.parent.schema_view
         if sv:
-
+            cls = sv.get_class(self.target_class_name)
+            return cls
         return None

+    @property
     def identifier_attribute_name(self) -> Optional[str]:
         """
         Return the name of the identifier attribute for the collection.

+        AKA the primary key.
+
         :return: The name of the identifier attribute, if one exists.
         """
         cd = self.class_definition()
         if cd:
-            for att in
+            for att in self.parent.schema_view.class_induced_slots(cd.name):
                 if att.identifier:
                     return att.name
         return None

+    def object_identifier(self, obj: OBJECT, auto=True) -> Optional[IDENTIFIER]:
+        """
+        Return the identifier for an object.
+
+        :param obj:
+        :param auto: If True, generate an identifier if one does not exist.
+        :return:
+        """
+        pk = self.identifier_attribute_name
+        if pk in obj:
+            return obj[pk]
+        elif auto:
+            # TODO: use other unique keys if no primary key
+            as_str = str(obj)
+            md5 = hashlib.md5(as_str.encode()).hexdigest()
+            return md5
+        else:
+            return None
+
     def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_size=10) -> ClassDefinition:
         """
         Induce a class definition from a list of objects.
@@ -239,7 +457,7 @@ class Collection:
         :param max_sample_size:
         :return:
         """
-        cd = ClassDefinition(self.
+        cd = ClassDefinition(self.target_class_name)
         keys = defaultdict(list)
         for obj in objs[0:max_sample_size]:
             if isinstance(obj, BaseModel):
@@ -302,7 +520,7 @@ class Collection:
                 array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
                 cd.attributes[k].array = array_expr
         sv = self.parent.schema_view
-        sv.schema.classes[self.
+        sv.schema.classes[self.target_class_name] = cd
         sv.set_modified()
         return cd

@@ -325,3 +543,23 @@ class Collection:
         :return:
         """
         raise NotImplementedError
+
+    def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
+        """
+        Validate the contents of the collection
+
+        :param kwargs:
+        :return: iterator over validation results
+        """
+        from linkml.validator import JsonschemaValidationPlugin, Validator
+
+        validation_plugins = [JsonschemaValidationPlugin(closed=True)]
+        validator = Validator(self.parent.schema_view.schema, validation_plugins=validation_plugins)
+        cd = self.class_definition()
+        if not cd:
+            raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        class_name = cd.name
+        result = self.find(**kwargs)
+        for obj in result.rows:
+            obj = clean_empties(obj)
+            yield from validator.iter_results(obj, class_name)
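
The Collection changes above move naming into CollectionConfig metadata and add retrieval (get_one, find_iter), indexing (attach_indexer, index_objects, search), and validation (iter_validate_collection) entry points. A minimal sketch of that workflow, assuming a DuckDB backend, assuming "simple" is a key accepted by linkml_store.index.get_indexer, and assuming ranked_rows holds (score, object) pairs as the search code above suggests; treat it as illustrative rather than tested output:

from linkml_store.api.client import Client

client = Client()
db = client.attach_database("duckdb", alias="test")
persons = db.create_collection("Person")
persons.insert([
    {"id": "P1", "name": "John", "age_in_years": 30},
    {"id": "P2", "name": "Alice", "age_in_years": 25},
])

# Iterate over all rows; object_identifier falls back to an md5 hash
# when no identifier slot is declared in the schema.
for obj in persons.find_iter():
    print(persons.object_identifier(obj))

# Attach an indexer and search; auto_index populates a hidden
# internal__index__Person__simple collection ("simple" key is an assumption).
persons.attach_indexer("simple", name="simple", auto_index=True)
for score, obj in persons.search("Alice", limit=2).ranked_rows:
    print(score, obj["name"])

# Validate collection contents against the (possibly induced) schema;
# requires the optional linkml validator dependency.
for result in persons.iter_validate_collection():
    print(result)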