linkml-store 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,215 @@
+ from abc import ABC
+ from dataclasses import dataclass
+ from typing import Dict, Optional, Sequence
+
+ from linkml_runtime import SchemaView
+
+ from linkml_store.api.collection import Collection
+ from linkml_store.api.metadata import MetaData
+ from linkml_store.api.queries import Query, QueryResult
+
+
+ @dataclass
+ class Database(ABC):
+     """
+     A Database provides access to named collections of data.
+
+     Examples
+     --------
+     >>> from linkml_store.api.client import Client
+     >>> client = Client()
+     >>> db = client.attach_database("duckdb", alias="test")
+     >>> db.handle
+     'duckdb:///:memory:'
+     >>> collection = db.create_collection("Person")
+     >>> len(db.list_collections())
+     1
+     >>> db.get_collection("Person") == collection
+     True
+     >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+     >>> collection.add(objs)
+     >>> qr = collection.find()
+     >>> len(qr.rows)
+     2
+     >>> qr.rows[0]["id"]
+     'P1'
+     >>> qr.rows[1]["name"]
+     'Alice'
+     >>> qr = collection.find({"name": "John"})
+     >>> len(qr.rows)
+     1
+     >>> qr.rows[0]["name"]
+     'John'
+
+     """
+
+     handle: Optional[str] = None
+     recreate_if_exists: Optional[bool] = False
+     _schema_view: Optional[SchemaView] = None
+     _collections: Optional[Dict[str, Collection]] = None
+
+     def store(self, obj: Dict[str, str], **kwargs):
+         """
+         Store an object in the database
+
+         :param obj: object to store
+         :param kwargs: additional arguments
+         """
+         for k, v in obj.items():
+             if not isinstance(v, list):
+                 continue
+             if not v:
+                 continue
+             collection = self.get_collection(k, create_if_not_exists=True)
+             collection.add(v)
+
+     def commit(self, **kwargs):
+         """
+         Commit any pending changes to the database
+         """
+         raise NotImplementedError()
+
+     def close(self, **kwargs):
+         """
+         Close the database and all connection objects
+         """
+         raise NotImplementedError()
+
+     def create_collection(
+         self, name: str, alias: Optional[str] = None, metadata: Optional[MetaData] = None, **kwargs
+     ) -> Collection:
+         """
+         Create a new collection
+
+         >>> from linkml_store.api.client import Client
+         >>> client = Client()
+         >>> db = client.attach_database("duckdb", alias="test")
+         >>> collection = db.create_collection("Person")
+         >>> collection.name
+         'Person'
+
+         :param name: name of the collection
+         :param alias: alias for the collection
+         :param metadata: metadata for the collection
+         :param kwargs: additional arguments
+         """
+         raise NotImplementedError()
+
+     def list_collections(self) -> Sequence[Collection]:
+         """
+         List all collections.
+
+         Examples
+         --------
+         >>> from linkml_store.api.client import Client
+         >>> client = Client()
+         >>> db = client.attach_database("duckdb", alias="test")
+         >>> c1 = db.create_collection("Person")
+         >>> c2 = db.create_collection("Product")
+         >>> collections = db.list_collections()
+         >>> len(collections)
+         2
+         >>> [c.name for c in collections]
+         ['Person', 'Product']
+
+         """
+         if not self._collections:
+             self.init_collections()
+         return list(self._collections.values())
+
+     def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
+         """
+         Get a named collection.
+
+         Examples
+         --------
+         >>> from linkml_store.api.client import Client
+         >>> client = Client()
+         >>> db = client.attach_database("duckdb", alias="test")
+         >>> collection = db.create_collection("Person")
+         >>> db.get_collection("Person") == collection
+         True
+         >>> db.get_collection("NonExistent", create_if_not_exists=False)
+         Traceback (most recent call last):
+         ...
+         KeyError: 'Collection NonExistent does not exist'
+
+         :param name: name of the collection
+         :param create_if_not_exists: create the collection if it does not exist
+
+         """
+         if not self._collections:
+             self.init_collections()
+         if name not in self._collections:
+             if create_if_not_exists:
+                 self._collections[name] = self.create_collection(name)
+             else:
+                 raise KeyError(f"Collection {name} does not exist")
+         return self._collections[name]
+
+     def init_collections(self):
+         """
+         Initialize collections.
+
+         Not typically called directly: consider making hidden
+         :return:
+         """
+         raise NotImplementedError
+
+     def query(self, query: Query, **kwargs) -> QueryResult:
+         """
+         Run a query against the database.
+
+         Examples
+         --------
+         >>> from linkml_store.api.client import Client
+         >>> from linkml_store.api.queries import Query
+         >>> client = Client()
+         >>> db = client.attach_database("duckdb", alias="test")
+         >>> collection = db.create_collection("Person")
+         >>> collection.add([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
+         >>> query = Query(from_table="Person", where_clause={"name": "John"})
+         >>> result = db.query(query)
+         >>> len(result.rows)
+         1
+         >>> result.rows[0]["id"]
+         'P1'
+
+         :param query:
+         :param kwargs:
+         :return:
+
+         """
+         raise NotImplementedError
+
+     @property
+     def schema_view(self) -> SchemaView:
+         """
+         Return a schema view for the named collection
+         """
+         if not self._schema_view:
+             self._schema_view = self.induce_schema_view()
+         return self._schema_view
+
+     def set_schema_view(self, schema_view: SchemaView):
+         self._schema_view = schema_view
+
+     def induce_schema_view(self) -> SchemaView:
+         """
+         Induce a schema view from a schema definition.
+
+         >>> from linkml_store.api.client import Client
+         >>> from linkml_store.api.queries import Query
+         >>> client = Client()
+         >>> db = client.attach_database("duckdb", alias="test")
+         >>> collection = db.create_collection("Person")
+         >>> collection.add([{"id": "P1", "name": "John", "age_in_years": 25},
+         ...                 {"id": "P2", "name": "Alice", "age_in_years": 25}])
+         >>> schema_view = db.induce_schema_view()
+         >>> cd = schema_view.get_class("Person")
+         >>> cd.attributes["id"].range
+         'string'
+
+         :return: A schema view
+         """
+         raise NotImplementedError()
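
Taken together, the doctests in the Database class above describe the intended end-to-end usage of the abstract API. The following is a minimal sketch that simply restates those doctests as a script; it assumes the DuckDB backend shown later in this diff is available and is not itself part of the released code.

    # Illustrative sketch mirroring the Database doctests above (not part of the package).
    from linkml_store.api.client import Client

    client = Client()
    db = client.attach_database("duckdb", alias="test")   # in-memory DuckDB handle
    people = db.create_collection("Person")
    people.add([{"id": "P1", "name": "John", "age_in_years": 30}])
    qr = people.find({"name": "John"})
    print(qr.rows[0]["id"])                                # -> 'P1'
    db.close()
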
@@ -0,0 +1,5 @@
+ from pydantic import BaseModel
+
+
+ class MetaData(BaseModel):
+     pass
@@ -0,0 +1,56 @@
+ from collections import namedtuple
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import pandas as pd
+ from pydantic import BaseModel
+
+ # Between is a named tuple with two values (min, max):
+ # it is used in the Query class to represent a range of values
+ # (for example, in facet groupings).
+ Between = namedtuple("Between", "min max")
+
+ FACET_GROUP_ATOM = Union[str, int, float, Between]
+ FACET_GROUP = Union[FACET_GROUP_ATOM, Tuple[FACET_GROUP_ATOM, ...]]
+
+
+ class Query(BaseModel):
+     """
+     A query object.
+
+     - In SQL this would be a SQL query string
+     """
+
+     from_table: Optional[str]
+     select_cols: Optional[List[str]] = None
+     where_clause: Optional[Union[str, List[str], Dict[str, str]]] = None
+     sort_by: Optional[List[str]] = None
+     limit: Optional[int] = None
+     offset: Optional[int] = None
+     include_facet_counts: bool = False
+     facet_slots: Optional[List[str]] = None
+
+
+ class QueryResult(BaseModel):
+     """
+     A query result
+     """
+
+     query: Optional[Query] = None
+     num_rows: int
+     offset: Optional[int] = 0
+     rows: Optional[List[Dict[str, Any]]] = None
+     ranked_rows: Optional[List[Tuple[float, Dict[str, Any]]]] = None
+     _rows_dataframe: Optional[pd.DataFrame] = None
+     facet_counts: Optional[Dict[str, List[Tuple[FACET_GROUP, int]]]] = None
+
+     @property
+     def rows_dataframe(self) -> pd.DataFrame:
+         if self._rows_dataframe is None and self.rows:
+             self._rows_dataframe = pd.DataFrame(self.rows)
+         return self._rows_dataframe
+
+     def set_rows(self, rows: pd.DataFrame):
+         self._rows_dataframe = rows
+
+     class Config:
+         arbitrary_types_allowed = True
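
Query and QueryResult are plain pydantic models: a Query declares what to fetch (table, columns, where clause, paging, optional facet slots), and a QueryResult carries the row count, the rows, optional facet counts, and a lazily built pandas DataFrame view. Below is a short sketch based only on the fields declared above; the concrete values are illustrative and not part of the released code.

    # Illustrative sketch of the Query / QueryResult models (not part of the package).
    from linkml_store.api.queries import Query, QueryResult

    q = Query(from_table="Person", where_clause={"name": "John"}, limit=10)
    qr = QueryResult(query=q, num_rows=1, rows=[{"id": "P1", "name": "John"}])
    df = qr.rows_dataframe          # DataFrame built on first access from qr.rows
    print(df["id"].tolist())        # -> ['P1']
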
File without changes
File without changes
@@ -0,0 +1,109 @@
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Union
+
+ import sqlalchemy as sqla
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+ from sqlalchemy import Column, Table, delete, insert, text
+ from sqlalchemy.sql.ddl import CreateTable
+
+ from linkml_store.api import Collection
+ from linkml_store.api.collection import OBJECT
+ from linkml_store.api.stores.duckdb.mappings import TMAP
+ from linkml_store.utils.sql_utils import facet_count_sql
+
+
+ @dataclass
+ class DuckDBCollection(Collection):
+     _table_created: bool = None
+
+     def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return
+         cd = self.class_definition()
+         if not cd:
+             cd = self.induce_class_definition_from_objects(objs)
+         self._create_table(cd)
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         col_names = [c.name for c in table.columns]
+         objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+         with engine.connect() as conn:
+             with conn.begin():
+                 conn.execute(insert(table), objs)
+             conn.commit()
+
+     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+         if not isinstance(objs, list):
+             objs = [objs]
+         cd = self.class_definition()
+         if not cd:
+             cd = self.induce_class_definition_from_objects(objs)
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         with engine.connect() as conn:
+             for obj in objs:
+                 conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                 stmt = delete(table).where(*conditions)
+                 stmt = stmt.compile(engine)
+                 conn.execute(stmt)
+             conn.commit()
+         return len(objs)
+
+     def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
+         cd = self.class_definition()
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         with engine.connect() as conn:
+             conditions = [table.c[k] == v for k, v in where.items()]
+             stmt = delete(table).where(*conditions)
+             stmt = stmt.compile(engine)
+             conn.execute(stmt)
+             conn.commit()
+         return 0
+
+     def query_facets(self, where: Dict = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
+         results = {}
+         cd = self.class_definition()
+         with self.parent.engine.connect() as conn:
+             if not facet_columns:
+                 facet_columns = list(self.class_definition().attributes.keys())
+             for col in facet_columns:
+                 if isinstance(col, tuple):
+                     sd = SlotDefinition(name="PLACEHOLDER")
+                 else:
+                     sd = cd.attributes[col]
+                 facet_query = self._create_query(where_clause=where)
+                 facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                 rows = list(conn.execute(text(facet_query_str)))
+                 results[col] = rows
+         return results
+
+     def _sqla_table(self, cd: ClassDefinition) -> Table:
+         metadata_obj = sqla.MetaData()
+         cols = []
+         for att in cd.attributes.values():
+             typ = TMAP.get(att.range, sqla.String)
+             if att.inlined:
+                 typ = sqla.JSON
+             if att.multivalued:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             if att.array:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             col = Column(att.name, typ)
+             cols.append(col)
+         t = Table(self.name, metadata_obj, *cols)
+         return t
+
+     def _create_table(self, cd: ClassDefinition):
+         if self._table_created:
+             return
+         t = self._sqla_table(cd)
+         ct = CreateTable(t)
+         ddl = str(ct.compile(self.parent.engine))
+         with self.parent.engine.connect() as conn:
+             conn.execute(text(ddl))
+             conn.commit()
+         if not self._table_created:
+             self._table_created = True
@@ -0,0 +1,166 @@
+ import json
+ import logging
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import pandas as pd
+ import sqlalchemy
+ from duckdb import DuckDBPyConnection
+ from linkml_runtime import SchemaView
+ from linkml_runtime.linkml_model import SlotDefinition
+ from linkml_runtime.utils.schema_builder import SchemaBuilder
+ from sqlalchemy import text
+
+ from linkml_store.api import Database
+ from linkml_store.api.queries import Query, QueryResult
+ from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+ from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+
+ TYPE_MAP = {
+     "VARCHAR": "string",
+     "BIGINT": "integer",
+     "BOOLEAN": "boolean",
+ }
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def run_query(con: DuckDBPyConnection, query: Query, **kwargs):
+     """
+     Run a query and return the result.
+
+     >>> import duckdb
+     >>> con = duckdb.connect("db/mgi.db")
+     >>> query = Query(from_table="gaf_association", limit=5)
+     >>> result = run_query(con, query)
+     >>> print(result.num_rows)
+     532233
+
+     :param con:
+     :param query:
+     :return:
+     """
+     count_query_str = query_to_sql(query, count=True)
+     num_rows = con.execute(count_query_str).fetchall()[0][0]
+     logger.debug(f"num_rows: {num_rows}")
+     query_str = query_to_sql(query, **kwargs)
+     logger.debug(f"query_str: {query_str}")
+     rows = con.execute(query_str).fetchdf()
+     qr = QueryResult(query=query, num_rows=num_rows)
+     qr.set_rows(rows)
+     return qr
+
+
+ @dataclass
+ class DuckDBDatabase(Database):
+     _connection: DuckDBPyConnection = None
+     _engine: sqlalchemy.Engine = None
+
+     def __post_init__(self):
+         if not self.handle:
+             self.handle = "duckdb:///:memory:"
+
+     @property
+     def engine(self) -> sqlalchemy.Engine:
+         if not self._engine:
+             handle = self.handle
+             if not handle.startswith("duckdb://") and not handle.startswith(":"):
+                 handle = f"duckdb://{handle}"
+             self._engine = sqlalchemy.create_engine(handle)
+         return self._engine
+
+     def commit(self, **kwargs):
+         with self.engine.connect() as conn:
+             conn.commit()
+
+     def close(self, **kwargs):
+         self.engine.dispose()
+
+     def query(self, query: Query, **kwargs) -> QueryResult:
+         json_encoded_cols = []
+         if query.from_table:
+             sv = self._schema_view
+             if sv:
+                 cd = None
+                 for c in self._collections.values():
+                     if c.name == query.from_table:
+                         cd = c.class_definition()
+                         break
+                 if cd:
+                     for att in cd.attributes.values():
+                         if att.inlined:
+                             json_encoded_cols.append(att.name)
+         with self.engine.connect() as conn:
+             count_query_str = text(query_to_sql(query, count=True))
+             num_rows = list(conn.execute(count_query_str))[0][0]
+             logger.debug(f"num_rows: {num_rows}")
+             query_str = query_to_sql(query, **kwargs)  # include offset, limit
+             logger.debug(f"query_str: {query_str}")
+             rows = list(conn.execute(text(query_str)).mappings())
+             qr = QueryResult(query=query, num_rows=num_rows, rows=rows)
+             if json_encoded_cols:
+                 for row in qr.rows:
+                     for col in json_encoded_cols:
+                         if row[col]:
+                             if isinstance(row[col], list):
+                                 for i in range(len(row[col])):
+                                     row[col][i] = json.loads(row[col][i])
+                             else:
+                                 row[col] = json.loads(row[col])
+             qr.set_rows(pd.DataFrame(rows))
+             facet_columns = query.facet_slots
+             if query.include_facet_counts and not facet_columns:
+                 raise ValueError("Facet counts requested but no facet columns specified")
+             if facet_columns:
+                 raise NotImplementedError
+             return qr
+
+     def init_collections(self):
+         # TODO: unify schema introspection
+         schema = introspect_schema(self.engine)
+         table_names = schema.classes.keys()
+         if self._collections is None:
+             self._collections = {}
+         for table_name in table_names:
+             if table_name not in self._collections:
+                 collection = DuckDBCollection(name=table_name, parent=self)
+                 self._collections[table_name] = collection
+
+     def create_collection(self, name: str, alias: Optional[str] = None, **kwargs) -> DuckDBCollection:
+         collection = DuckDBCollection(name=name, parent=self)
+         if not self._collections:
+             self._collections = {}
+         if not alias:
+             alias = name
+         self._collections[alias] = collection
+         return collection
+
+     def induce_schema_view(self) -> SchemaView:
+         # TODO: unify schema introspection
+         sb = SchemaBuilder()
+         schema = sb.schema
+         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
+         qr = self.query(query)
+         if qr.num_rows:
+             table_names = [row["table_name"] for row in qr.rows]
+             for tbl in table_names:
+                 sb.add_class(tbl)
+         query = Query(from_table="information_schema.columns", sort_by=["ordinal_position"])
+         for row in self.query(query, limit=-1).rows:
+             tbl_name = row["table_name"]
+             if tbl_name not in sb.schema.classes:
+                 continue
+             dt = row["data_type"]
+             if dt.endswith("[]"):
+                 dt = dt[0:-2]
+                 multivalued = True
+             else:
+                 multivalued = False
+             rng = TYPE_MAP.get(dt, "string")
+             sd = SlotDefinition(
+                 row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
+             )
+             sb.schema.classes[tbl_name].attributes[sd.name] = sd
+         sb.add_defaults()
+         return SchemaView(schema)
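
DuckDBDatabase resolves its handle to a SQLAlchemy engine (defaulting to an in-memory database), and induce_schema_view rebuilds a LinkML schema by querying DuckDB's information_schema and mapping SQL types through TYPE_MAP. The following is a hedged sketch of that round trip, instantiating the class directly for illustration and assuming the duckdb-engine SQLAlchemy dialect is installed; it is not part of the released code.

    # Illustrative sketch only (not part of the package).
    from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase

    db = DuckDBDatabase()                      # handle defaults to "duckdb:///:memory:"
    people = db.create_collection("Person")
    people.add([{"id": "P1", "name": "John"}])
    sv = db.induce_schema_view()               # introspects information_schema.tables / .columns
    print(sv.get_class("Person").attributes["id"].range)   # -> 'string' (VARCHAR via TYPE_MAP)
    db.close()
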
@@ -0,0 +1,7 @@
+ import sqlalchemy as sqla
+
+ TMAP = {
+     "string": sqla.String,
+     "integer": sqla.Integer,
+     "linkml:Any": sqla.JSON,
+ }
File without changes
@@ -0,0 +1,56 @@
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Union
+
+ from linkml_store.api import Collection
+ from linkml_store.api.collection import OBJECT
+
+
+ @dataclass
+ class MongoDBCollection(Collection):
+     """
+     A wrapper around a MongoDB collection
+     """
+
+     def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return
+         cd = self.class_definition()
+         if not cd:
+             cd = self.induce_class_definition_from_objects(objs)
+         collection = self.parent.database[self.name]
+         collection.insert_many(objs)
+
+     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+         if not isinstance(objs, list):
+             objs = [objs]
+         cd = self.class_definition()
+         if not cd:
+             cd = self.induce_class_definition_from_objects(objs)
+         collection = self.parent.database[self.name]
+         deleted_count = 0
+         for obj in objs:
+             result = collection.delete_one(obj)
+             deleted_count += result.deleted_count
+         return deleted_count
+
+     def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
+         collection = self.parent.database[self.name]
+         result = collection.delete_many(where)
+         return result.deleted_count
+
+     def query_facets(self, where: Dict = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
+         results = {}
+         _cd = self.class_definition()
+         collection = self.parent.database[self.name]
+         if not facet_columns:
+             facet_columns = list(self.class_definition().attributes.keys())
+         for col in facet_columns:
+             facet_pipeline = [
+                 {"$match": where} if where else {"$match": {}},
+                 {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+             ]
+             facet_results = list(collection.aggregate(facet_pipeline))
+             results[col] = [(row["_id"], row["count"]) for row in facet_results]
+         return results
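
The facet query above is a standard $match + $group aggregation. For reference, here is a hedged standalone pymongo sketch of the same pipeline; it assumes a local mongod and a hypothetical test_db database, and is not part of the released code.

    # Illustrative pymongo sketch of the facet pipeline used above (not part of the package).
    from pymongo import MongoClient

    coll = MongoClient()["test_db"]["Person"]
    coll.insert_many([{"name": "John"}, {"name": "Alice"}, {"name": "John"}])
    pipeline = [{"$match": {}}, {"$group": {"_id": "$name", "count": {"$sum": 1}}}]
    print({row["_id"]: row["count"] for row in coll.aggregate(pipeline)})   # e.g. {'John': 2, 'Alice': 1}
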