linkml-store 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store has been flagged as potentially problematic; see the registry page for details.
- linkml_store/api/client.py +32 -5
- linkml_store/api/collection.py +276 -27
- linkml_store/api/config.py +6 -2
- linkml_store/api/database.py +264 -21
- linkml_store/api/stores/chromadb/__init__.py +5 -1
- linkml_store/api/stores/duckdb/__init__.py +9 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +7 -4
- linkml_store/api/stores/duckdb/duckdb_database.py +19 -5
- linkml_store/api/stores/duckdb/mappings.py +1 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +177 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +72 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +31 -10
- linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +150 -15
- linkml_store/index/__init__.py +6 -2
- linkml_store/index/implementations/llm_indexer.py +83 -5
- linkml_store/index/implementations/simple_indexer.py +2 -2
- linkml_store/index/indexer.py +32 -8
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/format_utils.py +139 -8
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/METADATA +7 -1
- linkml_store-0.1.9.dist-info/RECORD +49 -0
- linkml_store-0.1.7.dist-info/RECORD +0 -42
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/filesystem/filesystem_collection.py
ADDED

@@ -0,0 +1,177 @@
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from linkml_store.api import Collection
from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.types import DatabaseType
from linkml_store.utils.query_utils import mongo_query_to_match_function

logger = logging.getLogger(__name__)


class FileSystemCollection(Collection[DatabaseType]):
    path: Optional[Path] = None
    file_format: Optional[str] = None
    encoding: Optional[str] = None
    _objects_list: List[OBJECT] = None
    _object_map: Dict[str, OBJECT] = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        parent: DatabaseType = self.parent
        if not self.path:
            if self.parent:
                self.path = Path(parent.directory_path)
        self._objects_list = []
        self._object_map = {}
        if not self.file_format:
            self.file_format = "json"

    @property
    def path_to_file(self):
        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"

    @property
    def objects_as_list(self) -> List[OBJECT]:
        if self._object_map:
            return list(self._object_map.values())
        else:
            return self._objects_list

    def _set_objects(self, objs: List[OBJECT]):
        pk = self.identifier_attribute_name
        if pk:
            self._object_map = {obj[pk]: obj for obj in objs}
            self._objects_list = []
        else:
            self._objects_list = objs
            self._object_map = {}

    def commit(self):
        path = self.path_to_file
        if not path:
            raise ValueError("Path not set")
        path.parent.mkdir(parents=True, exist_ok=True)
        self._save(path)

    def _save(self, path: Path):
        encoding = self.encoding or "utf-8"
        fmt = self.file_format or "json"
        mode = "w"
        if fmt == "parquet":
            mode = "wb"
            encoding = None
        with open(path, mode, encoding=encoding) as stream:
            if fmt == "json":
                import json

                json.dump(self.objects_as_list, stream, indent=2)
            elif fmt == "jsonl":
                import jsonlines

                writer = jsonlines.Writer(stream)
                writer.write_all(self.objects_as_list)
            elif fmt == "yaml":
                import yaml

                yaml.dump_all(self.objects_as_list, stream)
            elif fmt == "parquet":
                import pandas as pd
                import pyarrow
                import pyarrow.parquet as pq

                df = pd.DataFrame(self.objects_as_list)
                table = pyarrow.Table.from_pandas(df)
                pq.write_table(table, stream)
            elif fmt in {"csv", "tsv"}:
                import csv

                delimiter = "\t" if fmt == "tsv" else ","
                fieldnames = list(self.objects_as_list[0].keys())
                for obj in self.objects_as_list[1:]:
                    fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
                writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
                writer.writeheader()
                for obj in self.objects_as_list:
                    writer.writerow(obj)
            else:
                raise ValueError(f"Unsupported file format: {fmt}")

    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return
        pk = self.identifier_attribute_name
        if pk:
            for obj in objs:
                if pk not in obj:
                    raise ValueError(f"Primary key {pk} not found in object {obj}")
                pk_val = obj[pk]
                self._object_map[pk_val] = obj
        else:
            self._objects_list.extend(objs)

    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
        if not isinstance(objs, list):
            objs = [objs]
        if not objs:
            return 0
        pk = self.identifier_attribute_name
        n = 0
        if pk:
            for obj in objs:
                pk_val = obj[pk]
                if pk_val in self._object_map:
                    del self._object_map[pk_val]
                    n += 1
        else:
            n = len(objs)
            self._objects_list = [o for o in self._objects_list if o not in objs]
            n = n - len(objs)
        return n

    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
        logger.info(f"Deleting from {self.target_class_name} where: {where}")
        if where is None:
            where = {}

        def matches(obj: OBJECT):
            for k, v in where.items():
                if obj.get(k) != v:
                    return False
            return True

        print(type(self))
        print(self)
        print(vars(self))
        curr_objects = [o for o in self.objects_as_list if not matches(o)]
        self._set_objects(curr_objects)

    def query(self, query: Query, **kwargs) -> QueryResult:

        where = query.where_clause or {}
        match = mongo_query_to_match_function(where)
        rows = [o for o in self.objects_as_list if match(o)]
        count = len(rows)
        return QueryResult(query=query, num_rows=count, rows=rows)

    def query_facets(
        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
    ) -> Dict[str, Dict[str, int]]:
        match = mongo_query_to_match_function(where)
        rows = [o for o in self.objects_as_list if match(o)]
        if not facet_columns:
            facet_columns = self.class_definition().attributes.keys()
        facet_results = {c: {} for c in facet_columns}
        for row in rows:
            for fc in facet_columns:
                if fc in row:
                    v = row[fc]
                    if v not in facet_results[fc]:
                        facet_results[fc][v] = 1
                    else:
                        facet_results[fc][v] += 1
        return {fc: list(facet_results[fc].items()) for fc in facet_results}
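The new FileSystemCollection keeps objects either as a plain list or, when an identifier attribute is known, as a primary-key-keyed map (see _set_objects and objects_as_list above). A standalone sketch of that invariant, using plain dicts rather than the linkml-store classes (illustrative only; not part of the diff):

# Standalone sketch of the _set_objects / objects_as_list invariant.
from typing import Dict, List, Optional, Tuple

def set_objects(objs: List[dict], pk: Optional[str]) -> Tuple[List[dict], Dict]:
    if pk:
        # Keyed by primary key: later objects with the same key win,
        # which is what makes insert() an upsert in the pk case above.
        return [], {obj[pk]: obj for obj in objs}
    return objs, {}

objects_list, object_map = set_objects(
    [{"id": "P1"}, {"id": "P2"}, {"id": "P1", "x": 1}], pk="id"
)
assert list(object_map) == ["P1", "P2"]  # the duplicate "P1" collapsed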
linkml_store/api/stores/filesystem/filesystem_database.py
ADDED

@@ -0,0 +1,72 @@
import logging
from pathlib import Path
from typing import Optional

import yaml
from linkml.utils.schema_builder import SchemaBuilder
from linkml_runtime import SchemaView

from linkml_store.api import Database
from linkml_store.api.config import DatabaseConfig
from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
from linkml_store.utils.format_utils import Format, load_objects

logger = logging.getLogger(__name__)


class FileSystemDatabase(Database):
    collection_class = FileSystemCollection

    directory_path: Optional[Path] = None
    default_file_format: Optional[str] = None

    def __init__(self, handle: Optional[str] = None, **kwargs):
        handle = handle.replace("file:", "")
        if handle.startswith("//"):
            handle = handle[2:]
        self.directory_path = Path(handle)
        self.load_metadata()
        super().__init__(handle=handle, **kwargs)

    @property
    def metadata_path(self) -> Path:
        return self.directory_path / ".linkml_metadata.yaml"

    def load_metadata(self):
        if self.metadata_path.exists():
            md_dict = yaml.safe_load(open(self.metadata_path))
            metadata = DatabaseConfig(**md_dict)
        else:
            metadata = DatabaseConfig()
        self.metadata = metadata

    def close(self, **kwargs):
        pass

    def init_collections(self):
        metadata = self.metadata
        if self._collections is None:
            self._collections = {}
        for name, collection_config in metadata.collections.items():
            collection = FileSystemCollection(parent=self, **collection_config.dict())
            self._collections[name] = collection
        path = self.directory_path
        if path.exists():
            for fmt in Format:
                suffix = fmt.value
                logger.info(f"Looking for {suffix} files in {path}")
                for f in path.glob(f"*.{suffix}"):
                    logger.info(f"Found {f}")
                    n = f.stem
                    objs = load_objects(f, suffix, expected_type=list)
                    collection = FileSystemCollection(parent=self, name=n)
                    self._collections[n] = collection
                    collection._set_objects(objs)

    def induce_schema_view(self) -> SchemaView:
        logger.info(f"Inducing schema view for {self.handle}")
        sb = SchemaBuilder()

        for collection_name in self.list_collection_names():
            sb.add_class(collection_name)
        return SchemaView(sb.schema)
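FileSystemDatabase derives its directory from the handle by stripping the `file:` prefix and an optional `//`. A standalone sketch of that parsing (not part of the diff):

# Sketch of the file: handle parsing in FileSystemDatabase.__init__.
from pathlib import Path

def parse_file_handle(handle: str) -> Path:
    handle = handle.replace("file:", "")
    if handle.startswith("//"):
        handle = handle[2:]
    return Path(handle)

assert parse_file_handle("file:.") == Path(".")
assert parse_file_handle("file://tmp/data") == Path("tmp/data")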
linkml_store/api/stores/mongodb/__init__.py
ADDED

@@ -0,0 +1,25 @@
"""
Adapter for MongoDB document store.

Handles have the form: ``mongodb://<host>:<port>/<database>``

To use this, you must have the `pymongo` extra installed.

.. code-block:: bash

    pip install linkml-store[mongodb]

or

.. code-block:: bash

    pip install linkml-store[all]
"""

from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase

__all__ = [
    "MongoDBCollection",
    "MongoDBDatabase",
]
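A hedged usage sketch for the documented handle form (it assumes a local mongod and the `Client.attach_database` entry point from the linkml-store docs, which is not itself part of this diff):

# Hedged sketch: attaching a MongoDB-backed database by handle.
from linkml_store import Client

client = Client()
db = client.attach_database("mongodb://localhost:27017/test", alias="mdb")  # handle form per the docstring above
collection = db.get_collection("persons")
collection.insert({"id": "P1", "occupation": "Architect"})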
linkml_store/api/stores/mongodb/mongodb_collection.py
CHANGED

@@ -13,22 +13,36 @@ logger = logging.getLogger(__name__)
 
 
 class MongoDBCollection(Collection):
+    """
+    Adapter for collections in a MongoDB database.
+
+    .. note::
+
+        You should not use or manipulate this class directly.
+        Instead, use the general :class:`linkml_store.api.Collection`
+    """
 
     @property
     def mongo_collection(self) -> MongoCollection:
         if not self.name:
             raise ValueError("Collection name not set")
-
+        collection_name = self.alias or self.name
+        return self.parent.native_db[collection_name]
 
     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         self.mongo_collection.insert_many(objs)
+        # TODO: allow mapping of _id to id for efficiency
+        for obj in objs:
+            del obj["_id"]
+        self._post_insert_hook(objs)
 
-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def query(self, query: Query, limit: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
-
-
+        limit = limit or query.limit
+        if limit and limit >= 0:
+            cursor = self.mongo_collection.find(mongo_filter).limit(limit)
         else:
             cursor = self.mongo_collection.find(mongo_filter)
 

@@ -62,24 +76,31 @@ class MongoDBCollection(Collection):
             if isinstance(col, tuple):
                 sd = SlotDefinition(name="PLACEHOLDER")
             else:
-
-
-
+                if col in cd.attributes:
+                    sd = cd.attributes[col]
+                else:
+                    logger.info(f"No schema metadata for {col}")
+                    sd = SlotDefinition(name=col)
+            group = {"$group": {"_id": f"${col}", "count": {"$sum": 1}}}
+            if isinstance(col, tuple):
+                q = {k.replace(".", ""): f"${k}" for k in col}
+                group["$group"]["_id"] = q
+            if sd and sd.multivalued:
                 facet_pipeline = [
                     {"$match": where} if where else {"$match": {}},
                     {"$unwind": f"${col}"},
-
+                    group,
                     {"$sort": {"count": -1}},
                     {"$limit": facet_limit},
                 ]
             else:
                 facet_pipeline = [
                     {"$match": where} if where else {"$match": {}},
-
+                    group,
                     {"$sort": {"count": -1}},
                     {"$limit": facet_limit},
                 ]
-
+            logger.info(f"Facet pipeline: {facet_pipeline}")
             facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
             results[col] = [(result["_id"], result["count"]) for result in facet_results]
 
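The added `group` stage gives a single shape for both single-column and tuple-column facets. A sketch of the pipeline built for a tuple of columns (pure dict construction, no MongoDB connection; the column names and limit value are illustrative):

# Shape of the aggregation pipeline built in query_facets above.
col = ("country", "city.name")
group = {
    "$group": {
        # dots are stripped because MongoDB forbids them in field names
        "_id": {k.replace(".", ""): f"${k}" for k in col},
        "count": {"$sum": 1},
    }
}
facet_pipeline = [{"$match": {}}, group, {"$sort": {"count": -1}}, {"$limit": 20}]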
linkml_store/api/stores/mongodb/mongodb_database.py
CHANGED

@@ -29,9 +29,17 @@ class MongoDBDatabase(Database):
 
     def __init__(self, handle: Optional[str] = None, **kwargs):
         if handle is None:
-            handle = "mongodb://localhost:27017"
+            handle = "mongodb://localhost:27017/test"
         super().__init__(handle=handle, **kwargs)
 
+    @property
+    def _db_name(self) -> str:
+        if self.handle:
+            db = self.handle.split("/")[-1]
+        else:
+            db = "default"
+        return db
+
     @property
     def native_client(self) -> MongoClient:
         if self._native_client is None:

@@ -44,7 +52,7 @@ class MongoDBDatabase(Database):
         alias = self.metadata.alias
         if not alias:
             alias = "default"
-        self._native_db = self.native_client[
+        self._native_db = self.native_client[self._db_name]
         return self._native_db
 
     def commit(self, **kwargs):

@@ -58,9 +66,12 @@ class MongoDBDatabase(Database):
         self.native_client.drop_database(self.metadata.alias)
 
     def query(self, query: Query, **kwargs) -> QueryResult:
+        # TODO: DRY
         if query.from_table:
             collection = self.get_collection(query.from_table)
             return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")
 
     def init_collections(self):
        if self._collections is None:
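With the new `_db_name` property, the last path segment of the handle now selects the Mongo database; in miniature:

# Sketch of the _db_name parsing above.
handle = "mongodb://localhost:27017/test"
db_name = handle.split("/")[-1] if handle else "default"
assert db_name == "test"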
linkml_store/cli.py
CHANGED

@@ -11,12 +11,19 @@ from pydantic import BaseModel
 from linkml_store import Client
 from linkml_store.api import Collection, Database
 from linkml_store.api.queries import Query
+from linkml_store.index import get_indexer
 from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
-from linkml_store.utils.format_utils import Format, load_objects, render_output
+from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
 
-index_type_option = click.option(
+index_type_option = click.option(
+    "--index-type",
+    "-t",
+    default="simple",
+    show_default=True,
+    help="Type of index to create. Values: simple, llm",
+)
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +77,9 @@ class ContextSettings(BaseModel):
 format_choice = click.Choice([f.value for f in Format])
 
 
+include_internal_option = click.option("--include-internal/--no-include-internal", default=False, show_default=True)
+
+
 @click.group()
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
@@ -89,6 +99,15 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if not stacktrace:
         sys.tracebacklimit = 0
     logger = logging.getLogger()
+    # Set handler for the root logger to output to the console
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
+
+    # Clear existing handlers to avoid duplicate messages if function runs multiple times
+    logger.handlers = []
+
+    # Add the newly created console handler to the logger
+    logger.addHandler(console_handler)
     if verbose >= 2:
         logger.setLevel(logging.DEBUG)
     elif verbose == 1:
@@ -162,6 +181,7 @@ def insert(ctx, files, object, format):
         objects = yaml.safe_load(object_str)
         collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+        collection.commit()
 
 
 @cli.command()
@@ -193,14 +213,107 @@ def store(ctx, files, object, format):
     click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{db.name}'.")
 
 
+@cli.command(name="import")
+@click.option("--format", "-f", help="Input format")
+@click.pass_context
+@click.argument("files", type=click.Path(exists=True), nargs=-1)
+def import_database(ctx, files, format):
+    """Imports a database from a dump."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    if not files and not object:
+        files = ["-"]
+    for file_path in files:
+        db.import_database(file_path, source_format=format)
+
+
 @cli.command()
-@click.option("--
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
+@click.pass_context
+def export(ctx, output_type, output):
+    """Exports a database to a dump."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    if output_type is None:
+        output_type = guess_format(output)
+    if output_type is None:
+        raise ValueError(f"Output format must be specified; it can't be inferred from {output}.")
+    db.export_database(output, target_format=output_type)
+
+
+@cli.command()
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--other-database", "-D", required=False, help="Path to the other database")
+@click.option("--other-collection", "-X", required=True, help="Name of the other collection")
+@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+@click.pass_context
+def diff(ctx, output, output_type, other_database, other_collection, identifier_attribute):
+    """Diffs two collections to create a patch."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    collection = settings.collection
+    if not collection:
+        raise ValueError("Collection must be specified.")
+    other_db = settings.client.get_database(other_database) if other_database else db
+    other_collection = other_db.get_collection(other_collection)
+    if identifier_attribute:
+        collection.set_identifier_attribute_name(identifier_attribute)
+        other_collection.set_identifier_attribute_name(identifier_attribute)
+    diff = collection.diff(other_collection)
+    write_output(diff, output_type, target=output)
+
+
+@cli.command()
+@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+@click.argument("patch_files", type=click.Path(exists=True), nargs=-1)
+@click.pass_context
+def apply(ctx, patch_files, identifier_attribute):
+    """
+    Apply a patch to a collection.
+    """
+    settings = ctx.obj["settings"]
+    collection = settings.collection
+    if not collection:
+        raise ValueError("Collection must be specified.")
+    if identifier_attribute:
+        collection.set_identifier_attribute_name(identifier_attribute)
+    for patch_file in patch_files:
+        patch_objs = load_objects(patch_file, expected_type=list)
+        collection.apply_patches(patch_objs)
+
+
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.pass_context
 def query(ctx, where, limit, output_type, output):
-    """Query objects from the specified collection.
+    """Query objects from the specified collection.
+
+
+    Leave the query field blank to return all objects in the collection.
+
+    Examples:
+
+        linkml-store -d duckdb:///countries.db -c countries query
+
+    Queries can be specified in YAML, as basic key-value pairs
+
+    Examples:
+
+        linkml-store -d duckdb:///countries.db -c countries query -w 'code: NZ'
+
+    More complex queries can be specified using MongoDB-style query syntax
+
+    Examples:
+
+        linkml-store -d file:. -c persons query -w 'occupation: {$ne: Architect}'
+
+    Finds all people who are not architects.
+    """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
     query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
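The new commands are thin wrappers over Database and Collection methods that also appear in this diff (`import_database`, `export_database`, `diff`, `apply_patches`). A hedged sketch of the equivalent Python calls (the duckdb handle, `attach_database` entry point, and file names are illustrative assumptions, not taken from this diff):

# Hedged sketch of what the new CLI commands call under the hood.
from linkml_store import Client

client = Client()
db = client.attach_database("duckdb:///countries.db")   # handle is an assumption
db.import_database("dump.json", source_format="json")   # linkml-store import
db.export_database("dump.yaml", target_format="yaml")   # linkml-store export

col = db.get_collection("countries")
other = db.get_collection("countries_v2")
patch = col.diff(other)         # linkml-store diff
col.apply_patches(patch)        # linkml-store apply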
@@ -216,9 +329,10 @@ def query(ctx, where, limit, output_type, output):
 
 @cli.command()
 @click.pass_context
-
+@include_internal_option
+def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
-    for collection in db.list_collections():
+    for collection in db.list_collections(**kwargs):
         click.echo(collection.name)
         click.echo(render_output(collection.metadata))
 
@@ -254,7 +368,7 @@ def fq(ctx, where, limit, columns, output_type, output):
 
     def _untuple(key):
         if isinstance(key, tuple):
-            return "+".join(key)
+            return "+".join([str(x) for x in key])
         return key
 
     count_dict = {}
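The `_untuple` fix matters because facet keys can be tuples containing non-strings (e.g. None or numbers), and str.join requires strings; a minimal check:

# Why the str() coercion is needed: "+".join(("US", None)) raises TypeError.
def _untuple(key):
    if isinstance(key, tuple):
        return "+".join([str(x) for x in key])
    return key

assert _untuple(("US", None)) == "US+None"
assert _untuple("US") == "US"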
@@ -277,19 +391,34 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
         raise ValueError(f"Unknown index type: {index_type}")
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.pass_context
+def describe(ctx, where, output_type, output):
+    """
+    Describe the collection schema.
+    """
+    where_clause = yaml.safe_load(where) if where else None
+    collection = ctx.obj["settings"].collection
+    df = collection.find(where_clause, limit=1).rows_dataframe
+    write_output(df.describe(include="all").transpose(), output_type, target=output)
+
+
 @cli.command()
 @index_type_option
+@click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
+@click.option("--text-template", "-T", help="Template for text embeddings")
 @click.pass_context
-def index(ctx, index_type):
+def index(ctx, index_type, **kwargs):
     """
     Create an index over a collection.
 
-
-    :param index_type:
-    :return:
+    By default a simple trigram index is used.
     """
     collection = ctx.obj["settings"].collection
-    ix =
+    ix = get_indexer(index_type, **kwargs)
     collection.attach_indexer(ix)
 
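The new `describe` command reduces to a pandas describe() over the collection's rows; a minimal standalone sketch of the output shape (illustrative data):

# What `linkml-store describe` does with the rows dataframe, in miniature.
import pandas as pd

df = pd.DataFrame([{"code": "NZ", "population": 5_000_000},
                   {"code": "US", "population": 330_000_000}])
print(df.describe(include="all").transpose())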
@@ -322,14 +451,17 @@ def schema(ctx, output_type, output):
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--auto-index/--no-auto-index", default=False, show_default=True, help="Automatically index the collection"
+)
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output):
+def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
-    ix =
+    ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
-    collection.attach_indexer(ix, auto_index=
+    collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
@@ -343,6 +475,9 @@ def search(ctx, search_term, where, limit, index_type, output_type, output):
 @cli.command()
 @click.pass_context
 def indexes(ctx):
+    """
+    Show the indexes for a collection.
+    """
     collection = ctx.obj["settings"].collection
     for name, ix in collection.indexers.items():
         click.echo(f"{name}: {type(ix)}\n{ix.model_json()}")
linkml_store/index/__init__.py
CHANGED

@@ -22,7 +22,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     return INDEXER_CLASSES[name]
 
 
-def get_indexer(name: str,
+def get_indexer(name: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.
 

@@ -30,4 +30,8 @@ def get_indexer(name: str, *args, **kwargs) -> Indexer:
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """
-
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    cls = get_indexer_class(name)
+    kwargs["name"] = name
+    indexer = cls(**kwargs)
+    return indexer
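With the completed body, get_indexer drops None-valued keyword arguments (so unset CLI options pass through harmlessly) and forwards the name to the indexer constructor. A hedged usage sketch (the "simple" name mapping to SimpleIndexer follows the CLI help text above; the name attribute on the result is an assumption from the body's kwargs["name"] = name):

# Hedged sketch of the new get_indexer dispatch.
from linkml_store.index import get_indexer

ix = get_indexer("simple")                       # -> SimpleIndexer(name="simple")
ix2 = get_indexer("simple", text_template=None)  # None-valued kwargs are dropped
assert ix.name == "simple"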