linkml-store 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. Click here for more details.

@@ -0,0 +1,7 @@
1
+ from pathlib import Path
2
+
3
+ from linkml_store.api import Client
4
+
5
+ THIS_DIR = Path(__file__).parent
6
+
7
+ __all__ = ["Client"]
@@ -0,0 +1,8 @@
1
+ # flake8: noqa: E402
2
+ from linkml_store.api.collection import Collection
3
+ from linkml_store.api.database import Database
4
+ from linkml_store.api.metadata import MetaData
5
+ from linkml_store.api.client import Client
6
+ # flake8: noqa
7
+
8
+ __all__ = ["Client", "Database", "MetaData", "Collection"]
@@ -0,0 +1,151 @@
1
+ from dataclasses import dataclass
2
+ from typing import Dict, Optional
3
+
4
+ from linkml_runtime import SchemaView
5
+
6
+ from linkml_store.api import Database
7
+ from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
8
+
9
# Dispatch table: URI scheme (the text before the first ":" of a handle, or the
# whole handle when it contains no ":") -> Database implementation class.
HANDLE_MAP = dict(
    duckdb=DuckDBDatabase,
)
12
+
13
+
14
@dataclass
class Client:
    """
    A client provides access to named collections.

    A client keeps a registry of attached databases keyed by alias. Databases
    are created through :meth:`attach_database`, which picks the implementation
    class from ``HANDLE_MAP`` based on the scheme of the handle.

    Examples
    --------
    >>> client = Client()
    >>> db = client.attach_database("duckdb", alias="test")
    >>> collection = db.create_collection("Person")
    >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
    >>> collection.add(objs)
    >>> qr = collection.find()
    >>> len(qr.rows)
    2
    >>> qr.rows[0]["id"]
    'P1'
    >>> qr.rows[1]["name"]
    'Alice'
    >>> qr = collection.find({"name": "John"})
    >>> len(qr.rows)
    1
    >>> qr.rows[0]["name"]
    'John'

    """

    # Optional handle for the client itself; not read by any method of this class.
    handle: Optional[str] = None
    # Registry of attached databases keyed by alias; created lazily on first use.
    _databases: Optional[Dict[str, Database]] = None

    def attach_database(
        self,
        handle: str,
        alias: Optional[str] = None,
        schema_view: Optional[SchemaView] = None,
        recreate_if_exists=False,
        **kwargs,
    ) -> Database:
        """
        Associate a database with a handle.

        Examples
        --------
        >>> client = Client()
        >>> db = client.attach_database("duckdb", alias="memory")
        >>> "memory" in client.databases
        True
        >>> db = client.attach_database("duckdb:///tmp/another.db", alias="disk")
        >>> len(client.databases)
        2
        >>> "disk" in client.databases
        True

        :param handle: handle for the database, e.g. duckdb:///foo.db; a bare
            scheme such as ``duckdb`` is also accepted, in which case the
            database class is constructed with ``handle=None``
        :param alias: alias for the database, e.g foo; when omitted, the handle
            string exactly as passed by the caller is used as the alias
        :param schema_view: schema view to associate with the database
        :param recreate_if_exists: forwarded unchanged to the database constructor
        :param kwargs: additional keyword arguments for the database constructor
        :return: the newly created database
        :raises ValueError: if the scheme of the handle is not in ``HANDLE_MAP``

        """
        # Remember what the caller passed: it is the fallback alias even when
        # the handle handed to the backend is reset to None below.
        requested_handle = handle
        scheme, separator, _ = handle.partition(":")
        if not separator:
            # Scheme-only handle (e.g. "duckdb"): the backend receives no handle.
            handle = None
        if scheme not in HANDLE_MAP:
            raise ValueError(f"Unknown scheme: {scheme}")
        db_class = HANDLE_MAP[scheme]
        db = db_class(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
        if schema_view:
            db.set_schema_view(schema_view)
        if not alias:
            # Previously this fell back to the (possibly None) backend handle,
            # which registered scheme-only databases under the key None.
            alias = requested_handle
        if self._databases is None:
            self._databases = {}
        self._databases[alias] = db
        return db

    def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
        """
        Get a named database.

        Examples
        --------
        >>> client = Client()
        >>> db = client.attach_database("duckdb:///test.db", alias="test")
        >>> retrieved_db = client.get_database("test")
        >>> db == retrieved_db
        True

        :param name: alias of the database; if omitted, the single attached
            database is returned
        :param create_if_not_exists: if the name is unknown, attach a new
            database using ``name`` as its handle instead of raising
        :param kwargs: forwarded to :meth:`attach_database` when a database is created
        :return: the matching (or newly attached) database
        :raises ValueError: if no name is given and zero or several databases
            are attached, or if the name is unknown and
            ``create_if_not_exists`` is false

        """
        if not name:
            if not self._databases:
                raise ValueError("No databases attached and no name provided")
            if len(self._databases) > 1:
                raise ValueError("Ambiguous: No name provided and multiple databases attached")
            return next(iter(self._databases.values()))
        if self._databases is None:
            self._databases = {}
        if name in self._databases:
            return self._databases[name]
        if not create_if_not_exists:
            raise ValueError(f"Database {name} does not exist")
        # Return the attached instance directly rather than looking it up again
        # by name: the alias it was stored under may differ (e.g. an explicit
        # alias passed through kwargs).
        return self.attach_database(name, **kwargs)

    @property
    def databases(self) -> Dict[str, Database]:
        """
        Return all attached databases

        Examples
        --------
        >>> client = Client()
        >>> _ = client.attach_database("duckdb", alias="test1")
        >>> _ = client.attach_database("duckdb", alias="test2")
        >>> len(client.databases)
        2
        >>> "test1" in client.databases
        True
        >>> "test2" in client.databases
        True
        >>> client.databases["test1"].handle
        'duckdb:///:memory:'
        >>> client.databases["test2"].handle
        'duckdb:///:memory:'

        :return: the live alias -> database mapping (created empty on first access)

        """
        if self._databases is None:
            self._databases = {}
        return self._databases
@@ -0,0 +1,327 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Type, Union
6
+
7
+ import numpy as np
8
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
9
+ from linkml_runtime.linkml_model.meta import ArrayExpression
10
+ from pydantic import BaseModel
11
+
12
+ from linkml_store.api.queries import Query, QueryResult
13
+ from linkml_store.index.index import Index
14
+
15
+ if TYPE_CHECKING:
16
+ from linkml_store.api.database import Database
17
+
18
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Anything accepted as a stored object: a plain dict, a pydantic model, or a type.
OBJECT = Union[Dict[str, Any], BaseModel, Type]

# Readability aliases used in signatures; both are plain strings.
IDENTIFIER = FIELD_NAME = str
24
+
25
+
26
@dataclass
class Collection:
    """
    A collection is an organized set of objects of the same or similar type.

    - For relational databases, a collection is typically a table
    - For document databases such as MongoDB, a collection is the native type
    - For a file system, a collection could be a single tabular file such as Parquet or CSV

    In this class the storage operations (``add``, ``delete``, ``delete_where``,
    ``update``, ``query_facets``, ``import_data``, ``export_data``) raise
    ``NotImplementedError``; presumably backend-specific subclasses override them.
    Querying is delegated to the parent database.
    """

    # Name of the collection; also used as the ``from_table`` of generated queries.
    name: str
    # Owning database; queries, schema lookups and index collections go through it.
    parent: Optional["Database"] = None
    # Attached indexes keyed by index name; None until the first attach_index call.
    _indexes: Optional[Dict[str, Index]] = None
    # Flag stored on the collection; not read by any method of this class.
    hidden: Optional[bool] = False

    def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Add one or more objects to the collection

        :param objs: a single object or a list of objects
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
        """
        Delete one or more objects from the collection

        :param objs: a single object or a list of objects
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
        """
        Delete objects that match a query

        :param where: conditions selecting the objects to delete
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def update(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        """
        Update one or more objects in the collection

        :param objs: a single object or a list of objects
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def _create_query(self, **kwargs) -> Query:
        """Build a Query over this collection; kwargs are passed to the Query constructor."""
        return Query(from_table=self.name, **kwargs)

    def query(self, query: Query, **kwargs) -> QueryResult:
        """
        Run a query against the collection

        The query is executed by the parent database.

        :param query: the query to run
        :param kwargs: forwarded to the parent database's ``query``
        :return: the query result
        """
        return self.parent.query(query, **kwargs)

    def query_facets(
        self, where: Optional[Dict] = None, facet_columns: Optional[List[str]] = None
    ) -> Dict[str, Dict[str, int]]:
        """
        Run a query to get facet counts for one or more columns.

        :param where: optional conditions restricting the objects that are counted
        :param facet_columns: names of the columns to get facet counts for
        :return: a dictionary keyed by column name whose values map to integer
            counts (per the return annotation)
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def get(self, ids: Optional[IDENTIFIER], **kwargs) -> QueryResult:
        """
        Fetch objects by identifier.

        :param ids: value matched against the identifier field
        :param kwargs: forwarded to :meth:`query`
        :return: the query result
        :raises ValueError: if no identifier field can be determined
        """
        # ``identifier_field`` is not defined on this class; honour it when a
        # subclass provides it, otherwise fall back to the schema-derived name.
        id_field = getattr(self, "identifier_field", None) or self.identifier_attribute_name()
        if not id_field:
            raise ValueError(f"Cannot determine identifier field for collection {self.name}")
        q = self._create_query(where_clause={id_field: ids})
        return self.query(q, **kwargs)

    def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
        """
        Find objects matching ``where`` (all objects when it is None).

        :param where: where clause for the generated query
        :param kwargs: forwarded to :meth:`query`
        :return: the query result
        """
        query = self._create_query(where_clause=where)
        return self.query(query, **kwargs)

    def search(
        self,
        query: str,
        where: Optional[Any] = None,
        index_name: Optional[str] = None,
        limit: Optional[int] = None,
        **kwargs,
    ) -> QueryResult:
        """
        Search the collection using one of its attached indexes.

        Rows are read from the index's companion collection, their stored index
        column is converted to float arrays, and ranking is delegated to the
        index's ``search`` method.

        :param query: the search text
        :param where: optional conditions applied when reading the indexed rows
        :param index_name: index to use; may be omitted when exactly one index is attached
        :param limit: maximum number of results, passed to the index's ``search``
        :param kwargs: forwarded to ``find`` on the index collection
        :return: a QueryResult whose ``ranked_rows`` holds the index's results
        :raises ValueError: if no index is attached, if several are attached and
            none is named, or if the named index does not exist
        """
        indexes = self._indexes or {}
        if index_name is None:
            if not indexes:
                # Previously len(None) raised TypeError, and an empty mapping
                # was misreported as "Multiple indexes found".
                raise ValueError("No indexes attached. Please attach an index before searching.")
            if len(indexes) > 1:
                raise ValueError("Multiple indexes found. Please specify an index name.")
            index_name = next(iter(indexes))
        ix = indexes.get(index_name)
        if not ix:
            raise ValueError(f"No index named {index_name}")
        # Validate the index before touching the parent database.
        ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
        # limit=-1: presumably "no limit" for the backend -- confirm against Database.query.
        qr = ix_coll.find(where=where, limit=-1, **kwargs)
        index_col = ix.index_field
        vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
        results = ix.search(query, vector_pairs, limit=limit)
        new_qr = QueryResult(num_rows=len(results))
        new_qr.ranked_rows = results
        return new_qr

    def attach_index(self, index: Index, auto_index=True, **kwargs):
        """
        Attach an index to the collection.

        :param index: the index to attach; it must have a name
        :param auto_index: if true, immediately index every object currently in the collection
        :param kwargs: forwarded to :meth:`index_objects`
        :return:
        :raises ValueError: if the index has no name
        """
        index_name = index.name
        if not index_name:
            raise ValueError("Index must have a name")
        if not self._indexes:
            self._indexes = {}
        self._indexes[index_name] = index
        if auto_index:
            all_objs = self.find(limit=-1).rows
            self.index_objects(all_objs, index_name, **kwargs)

    def _index_collection_name(self, index_name: str) -> str:
        """Return the name of the companion collection that stores vectors for ``index_name``."""
        return f"index__{self.name}_{index_name}"

    def index_objects(self, objs: List[OBJECT], index_name: str, **kwargs):
        """
        Index a list of objects

        Each object is copied with one extra key (the index's ``index_field``)
        holding its vector as a list of floats, and the copies are added to the
        index's companion collection.

        :param objs: the objects to index
        :param index_name: name of a previously attached index
        :param kwargs: forwarded to ``add`` on the index collection
        :return:
        :raises ValueError: if no index with that name is attached
        """
        # ``_indexes`` is None until an index is attached; treat that as empty.
        ix = (self._indexes or {}).get(index_name)
        if not ix:
            raise ValueError(f"No index named {index_name}")
        ix_coll = self.parent.get_collection(self._index_collection_name(index_name), create_if_not_exists=True)
        # Convert to plain Python floats so the vectors are backend-neutral lists.
        vectors = [[float(e) for e in v] for v in ix.objects_to_vectors(objs)]
        index_col = ix.index_field
        # TODO: id field
        objects_with_ix = [{**obj, index_col: vector} for obj, vector in zip(objs, vectors)]
        ix_coll.add(objects_with_ix, **kwargs)

    def peek(self, limit: Optional[int] = None) -> QueryResult:
        """
        Return objects from the collection without any filter.

        :param limit: passed to :meth:`query` as ``limit``
        :return: the query result
        """
        q = self._create_query()
        return self.query(q, limit=limit)

    def class_definition(self) -> Optional[ClassDefinition]:
        """
        Return the class definition for the collection.

        :return: the class named after this collection in the parent's schema
            view, or None when the parent has no schema view
        """
        sv = self.parent.schema_view
        if sv:
            return sv.get_class(self.name)
        return None

    def identifier_attribute_name(self) -> Optional[str]:
        """
        Return the name of the identifier attribute for the collection.

        :return: The name of the identifier attribute, if one exists.
        """
        cd = self.class_definition()
        if cd:
            for att in cd.attributes.values():
                if att.identifier:
                    return att.name
        return None

    def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_size=10) -> ClassDefinition:
        """
        Induce a class definition from a list of objects.

        This uses a heuristic procedure to infer the class definition from a list of objects.
        In general it is recommended you explicitly provide a schema.

        Only the first ``max_sample_size`` objects are inspected. The induced
        class is also stored in the parent's schema view under this collection's name.

        :param objs: sample objects (dicts or pydantic models; anything else is skipped with a warning)
        :param max_sample_size: number of leading objects to inspect
        :return: the induced class definition
        :raises ValueError: if a key mixes list and non-list values, or if its
            values imply conflicting ranges
        """
        cd = ClassDefinition(self.name)
        # key -> every value observed for that key across the sample
        keys = defaultdict(list)
        for obj in objs[0:max_sample_size]:
            if isinstance(obj, BaseModel):
                obj = obj.model_dump()
            if not isinstance(obj, dict):
                logger.warning(f"Skipping non-dict object: {obj}")
                continue
            for k, v in obj.items():
                keys[k].append(v)
        for k, vs in keys.items():
            multivalueds = []
            inlineds = []
            rngs = []
            exact_dimensions_list = []
            for v in vs:
                if v is None:
                    continue
                if isinstance(v, np.ndarray):
                    # Arrays are typed as float; the first one seen decides the rank.
                    rngs.append("float")
                    exact_dimensions_list.append(v.shape)
                    break
                if isinstance(v, (list, dict)):
                    multivalueds.append(True)
                    if not v:
                        # Empty container: no element to inspect. Indexing it
                        # used to raise IndexError.
                        continue
                    # The first element (or first dict value) represents the container.
                    v = v[0] if isinstance(v, list) else next(iter(v.values()))
                else:
                    multivalueds.append(False)
                if not v:
                    # Falsy representatives (0, False, "", empty containers)
                    # contribute no range information.
                    continue
                if isinstance(v, str):
                    rng = "string"
                elif isinstance(v, bool):
                    # bool is tested before int because bool is a subclass of int
                    rng = "boolean"
                elif isinstance(v, int):
                    rng = "integer"
                elif isinstance(v, float):
                    rng = "float"
                elif isinstance(v, dict):
                    rng = None
                    inlineds.append(True)
                else:
                    # Unmapped Python type: leave the range unset.
                    rng = None
                    inlineds.append(False)
                rngs.append(rng)
            multivalued = any(multivalueds)
            inlined = any(inlineds)
            if multivalued and False in multivalueds:
                raise ValueError(f"Mixed list non list: {vs} // inferred= {multivalueds}")
            rng = rngs[0] if rngs else None
            for other_rng in rngs:
                if rng != other_rng:
                    raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
            cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
            if exact_dimensions_list:
                array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
                cd.attributes[k].array = array_expr
        # Register the induced class with the parent's schema.
        sv = self.parent.schema_view
        sv.schema.classes[self.name] = cd
        sv.set_modified()
        return cd

    def import_data(self, location: Union[Path, str, TextIO], **kwargs):
        """
        Import data from a file or stream

        :param location: path or open text stream to read from
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def export_data(self, location: Union[Path, str, TextIO], **kwargs):
        """
        Export data to a file or stream

        :param location: path or open text stream to write to
        :param kwargs: implementation-specific options
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError