linkml_store-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/cli.py
ADDED
@@ -0,0 +1,1147 @@
import logging
import sys
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional, Tuple

import click
import yaml
from linkml_runtime.dumpers import json_dumper
from linkml_runtime.utils.formatutils import underscore
from pydantic import BaseModel

from linkml_store import Client
from linkml_store.api import Collection, Database
from linkml_store.api.config import ClientConfig
from linkml_store.api.queries import Query
from linkml_store.index import get_indexer
from linkml_store.index.implementations.simple_indexer import SimpleIndexer
from linkml_store.index.indexer import Indexer
from linkml_store.inference import get_inference_engine
from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
from linkml_store.inference.inference_config import InferenceConfig
from linkml_store.inference.inference_engine import ModelSerialization
from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
from linkml_store.utils.object_utils import object_path_update
from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
from linkml_store.plotting.cli import plot_cli

DEFAULT_LOCAL_CONF_PATH = Path("linkml.yaml")
# global path is ~/.linkml.yaml in the user's home directory
DEFAULT_GLOBAL_CONF_PATH = Path("~/.linkml.yaml").expanduser()

index_type_option = click.option(
    "--index-type",
    "-t",
    default="simple",
    show_default=True,
    help="Type of index to create. Values: simple, llm",
)
json_select_query_option = click.option(
    "--json-select-query",
    "-J",
    help="JSON SELECT query",
)

logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore", module="duckdb_engine")


class ContextSettings(BaseModel):
    """
    Context object for CLI commands.
    """

    client: Client
    database_name: Optional[str] = None
    collection_name: Optional[str] = None

    @property
    def database(self) -> Optional[Database]:
        """
        Get the database object.
        :return:
        """
        name = self.database_name
        if name is None:
            # if len(self.client.databases) > 1:
            #     raise ValueError("Database must be specified if there are multiple databases.")
            if not self.client.databases:
                return None
            name = list(self.client.databases.keys())[0]
        return self.client.get_database(name)

    @property
    def collection(self) -> Optional[Collection]:
        """
        Get the collection object.
        :return:
        """
        name = self.collection_name
        if name is None:
            # if len(self.database.list_collections()) > 1:
            #     raise ValueError("Collection must be specified if there are multiple collections.")
            if not self.database:
                return None
            if not self.database.list_collections():
                return None
            name = self.database.list_collections()[0].alias
        return self.database.get_collection(name)

    class Config:
        arbitrary_types_allowed = True


# format_choice = click.Choice(["json", "yaml", "tsv"])
format_choice = click.Choice([f.value for f in Format])


include_internal_option = click.option("--include-internal/--no-include-internal", default=False, show_default=True)


@click.group()
@click.option("--database", "-d", help="Database name")
@click.option("--collection", "-c", help="Collection name")
@click.option("--input", "-i", help="Input file (alternative to database/collection)")
@click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
@click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
@click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
@click.option("-v", "--verbose", count=True)
@click.option("-q", "--quiet/--no-quiet")
@click.option("--base-dir", "-B", help="Base directory for the client configuration")
@click.option(
    "--stacktrace/--no-stacktrace",
    default=False,
    show_default=True,
    help="If set then show full stacktrace on error",
)
@click.pass_context
def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
    """A CLI for interacting with the linkml-store."""
    if not stacktrace:
        sys.tracebacklimit = 0
    logger = logging.getLogger()
    # Set handler for the root logger to output to the console
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))

    # Clear existing handlers to avoid duplicate messages if function runs multiple times
    logger.handlers = []

    # Add the newly created console handler to the logger
    logger.addHandler(console_handler)
    if verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif verbose == 1:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)
    if quiet:
        logger.setLevel(logging.ERROR)
    ctx.ensure_object(dict)
    if input:
        database = "duckdb"  # default: store in duckdb
        if input.startswith("http"):
            parts = input.split("/")
            collection = parts[-1]
            collection = collection.split(".")[0]
        else:
            stem = underscore(Path(input).stem)
            collection = stem
        logger.info(f"Using input file: {input}, default storage is {database} and collection is {collection}")
        config = ClientConfig(databases={"duckdb": {"collections": {collection: {"source": {"local_path": input}}}}})
    if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
        config = DEFAULT_LOCAL_CONF_PATH
    if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
        config = DEFAULT_GLOBAL_CONF_PATH
    if config == ".":
        config = None
    if not collection and database and "::" in database:
        database, collection = database.split("::")

    client = Client().from_config(config, **kwargs) if config else Client()
    settings = ContextSettings(client=client, database_name=database, collection_name=collection)
    ctx.obj["settings"] = settings
    if schema:
        db = settings.database
        db.set_schema_view(schema)
    if settings.database_name:
        db = client.get_database(database)
        if set:
            for expr in set:
                if "=" not in expr:
                    raise ValueError(f"Expression must be of form PARAM=VALUE. Got: {expr}")
                path, val = expr.split("=", 1)
                val = yaml.safe_load(val)
                logger.info(f"Setting {path} to {val}")
                db.metadata = object_path_update(db.metadata, path, val)
    if not settings.database_name:
        # if len(client.databases) != 1:
        #     raise ValueError("Database must be specified if there are multiple databases.")
        if client.databases:
            settings.database_name = list(client.databases.keys())[0]
    if not settings.collection_name:
        # if len(settings.database.list_collections()) != 1:
        #     raise ValueError("Collection must be specified if there are multiple collections.")
        if settings.database and settings.database.list_collections():
            collection = settings.database.list_collections()[0]
            settings.collection_name = collection.alias


@cli.command()
@click.pass_context
def drop(ctx):
    """
    Drop database and all its collections.
    """
    database = ctx.obj["settings"].database
    database.drop()


@cli.command()
@click.argument("files", type=click.Path(), nargs=-1)
@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
@click.option("--format", "-f", type=format_choice, help="Input format")
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
@click.option("--source-field", help="If provided, inject file path source as this field")
@click.option("--glob-files/--no-glob-files", default=False, show_default=True, help="If true, use glob to find files")
@json_select_query_option
@click.pass_context
def insert(ctx, files, glob_files, replace, object, format, source_field, json_select_query):
    """Insert objects from files (JSON, YAML, TSV) into the specified collection.

    Using a configuration:

        linkml-store -C config.yaml -c genes insert data/genes/*.json

    Note: if you don't provide a schema this will be inferred, but it is
    usually better to provide an explicit schema.

    You can use --glob-files if the list of files is too long:

        linkml-store -C config.yaml -c genes insert "data/genes/*.json" --glob-files

    """
    settings = ctx.obj["settings"]
    collection = settings.collection
    if not collection:
        raise ValueError("Collection must be specified.")
    if not files and not object:
        files = ["-"]
    load_objects_args = {}
    if json_select_query:
        load_objects_args["select_query"] = json_select_query
    if glob_files:
        import glob

        new_files = []
        for file_path in files:
            new_files.extend(glob.glob(file_path))
        logger.info(f"Found {len(new_files)} files matching glob pattern {files}")
        files = new_files
    for file_path in files:
        if format:
            objects = load_objects(file_path, format=format, **load_objects_args)
        else:
            objects = load_objects(file_path, **load_objects_args)
        if source_field:
            for obj in objects:
                obj[source_field] = str(file_path)
        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
        if replace:
            collection.replace(objects)
        else:
            collection.insert(objects)
        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
    if object:
        for object_str in object:
            logger.info(f"Parsing: {object_str}")
            objects = yaml.safe_load(object_str)
            if not isinstance(objects, list):
                objects = [objects]
            if replace:
                collection.replace(objects)
            else:
                collection.insert(objects)
            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
    collection.commit()


@cli.command()
@click.argument("files", type=click.Path(exists=True), nargs=-1)
@click.option("--format", "-f", type=format_choice, help="Input format")
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
@json_select_query_option
@click.pass_context
def store(ctx, files, object, format, json_select_query):
    """Store objects from files (JSON, YAML, TSV) into the database.

    Note: this is similar to insert, but a collection does not need to be specified.

    For example, assume that `my-collection` is a dict with multiple keys,
    and we want one collection per key:

        linkml-store -d my.ddb store my-collection.yaml

    Loading JSON (e.g. OBO-JSON), with a --json-select-query:

        linkml-store -d cl.ddb store -J graphs cl.obo.json

    Loading XML (e.g. OWL-XML), with a --json-select-query:

        linkml-store -d cl.ddb store -J Ontology cl.owx

    This is needed because the XML uses a top-level Ontology element.

    """
    settings = ctx.obj["settings"]
    db = settings.database
    if not files and not object:
        files = ["-"]
    load_objects_args = {}
    if json_select_query:
        load_objects_args["select_query"] = json_select_query
    for file_path in files:
        if format:
            objects = load_objects(file_path, format=format, **load_objects_args)
        else:
            objects = load_objects(file_path, **load_objects_args)
        logger.info(f"Inserting {len(objects)} objects from {file_path} into database '{db}'.")
        for obj in objects:
            db.store(obj)
        click.echo(f"Inserted {len(objects)} objects from {file_path} into database '{db}'.")
    if object:
        for object_str in object:
            logger.info(f"Parsing: {object_str}")
            objects = yaml.safe_load(object_str)
            for obj in objects:
                db.store(obj)
            click.echo(f"Inserted {len(objects)} objects from {object_str} into database '{db.name}'.")


@cli.command(name="import")
@click.option("--format", "-f", help="Input format")
@click.pass_context
@click.argument("files", type=click.Path(exists=True), nargs=-1)
def import_database(ctx, files, format):
    """Imports a database from a dump.

    See the `export` command for a full list of supported formats. The same
    formats are generally supported for imports.
    """
    settings = ctx.obj["settings"]
    db = settings.database
    if not files:
        files = ["-"]
    for file_path in files:
        db.import_database(file_path, source_format=format)


@cli.command()
@click.option("--output-type", "-O", type=format_choice, help="Output format (guessed from extension if omitted)")
@click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
@click.pass_context
def export(ctx, output_type, output):
    """Exports a database to a standard dump format.

    Example:

        linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml

    Export format will be guessed from extension if not specified.

    Example:

        linkml-store -d duckdb:///countries.db export -o countries.json

    Tree formats such as YAML and JSON can natively store an entire database; each collection
    will be a distinct key in the database.

    Additionally, native dump formats can be used:

    Example:

        linkml-store -d duckdb:///countries.db export -o countries -O duckdb

    Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
    (see https://duckdb.org/docs/sql/statements/export.html)
    """
    settings = ctx.obj["settings"]
    db = settings.database
    if output_type is None:
        output_type = guess_format(output)
    if output_type is None:
        raise ValueError(f"Output format must be specified; it cannot be inferred from {output}.")
    db.export_database(output, target_format=output_type)


@cli.command()
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--other-database", "-D", required=False, help="Path to the other database")
@click.option("--other-collection", "-X", required=True, help="Name of the other collection")
@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
@click.pass_context
def diff(ctx, output, output_type, other_database, other_collection, identifier_attribute):
    """Diffs two collections to create a patch."""
    settings = ctx.obj["settings"]
    db = settings.database
    collection = settings.collection
    if not collection:
        raise ValueError("Collection must be specified.")
    other_db = settings.client.get_database(other_database) if other_database else db
    other_collection = other_db.get_collection(other_collection)
    if identifier_attribute:
        collection.set_identifier_attribute_name(identifier_attribute)
        other_collection.set_identifier_attribute_name(identifier_attribute)
    diff = collection.diff(other_collection)
    write_output(diff, output_type, target=output)


@cli.command()
@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
@click.argument("patch_files", type=click.Path(exists=True), nargs=-1)
@click.pass_context
def apply(ctx, patch_files, identifier_attribute):
    """
    Apply a patch to a collection.
    """
    settings = ctx.obj["settings"]
    collection = settings.collection
    if not collection:
        raise ValueError("Collection must be specified.")
    if identifier_attribute:
        collection.set_identifier_attribute_name(identifier_attribute)
    for patch_file in patch_files:
        patch_objs = load_objects(patch_file, expected_type=list)
        collection.apply_patches(patch_objs)


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
@click.option("--sql", type=click.STRING, help="Raw SQL query (database-level, collection not required)")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.pass_context
def query(ctx, where, select, sql, limit, output_type, output):
    """Query objects from the specified collection or execute raw SQL.

    Leave the query field blank to return all objects in the collection.

    Examples:

        linkml-store -d duckdb:///countries.db -c countries query

    Queries can be specified in YAML, as basic key-value pairs.

    Examples:

        linkml-store -d duckdb:///countries.db -c countries query -w 'code: NZ'

    More complex queries can be specified using MongoDB-style query syntax.

    Examples:

        linkml-store -d file:. -c persons query -w 'occupation: {$ne: Architect}'

    Finds all people who are not architects.

    Raw SQL queries can be executed against SQL-capable backends (DuckDB, Dremio):

    Examples:

        linkml-store -d duckdb:///countries.db query --sql 'SELECT * FROM countries WHERE code = "NZ"'

        linkml-store -d dremio://lakehouse:32010 query --sql 'SELECT COUNT(*) FROM "gold"."samples"'

    Note: --sql cannot be combined with --where or --select.
    """
    settings = ctx.obj["settings"]

    # Handle raw SQL mode
    if sql:
        if where or select:
            raise click.UsageError("--sql cannot be combined with --where or --select.")

        database = settings.database
        if not database:
            raise click.UsageError("Database must be specified when using --sql.")

        if not database.supports_sql:
            raise click.UsageError(
                f"Database type '{database.__class__.__name__}' does not support raw SQL queries."
            )

        result = database.execute_sql(sql)
        write_output(result.rows, output_type, target=output)
        if output:
            click.echo(f"Query results saved to {output}")
        return

    # Original collection-based query logic
    collection = settings.collection
    if not collection:
        raise click.UsageError(
            "Collection must be specified for non-SQL queries. "
            "Use -c/--collection option or --sql for raw SQL queries."
        )
    where_clause = yaml.safe_load(where) if where else None
    select_clause = yaml.safe_load(select) if select else None
    if select_clause:
        if isinstance(select_clause, str):
            select_clause = [select_clause]
        if not isinstance(select_clause, list):
            raise ValueError(f"SELECT clause must be a list. Got: {select_clause}")
    query_obj = Query(from_table=collection.alias, select_cols=select_clause, where_clause=where_clause, limit=limit)
    result = collection.query(query_obj)
    write_output(result.rows, output_type, target=output)
    if output:
        click.echo(f"Query results saved to {output}")


@cli.command()
@click.pass_context
@include_internal_option
def list_collections(ctx, **kwargs):
    """List the collections in the current database."""
    db = ctx.obj["settings"].database
    for collection in db.list_collections(**kwargs):
        click.echo(collection.alias)
        click.echo(render_output(collection.metadata))


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
@click.option("--facet-min-count", "-M", type=click.INT, help="Minimum count for a facet to be included")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
@click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
@click.pass_context
def fq(ctx, where, limit, columns, output_type, wide, output, **kwargs):
    """
    Query facet counts from the specified collection.

    Assuming your .linkml.yaml includes an entry mapping `phenopackets` to a
    MongoDB database:

    Facet counts (all columns):

        linkml-store -d phenopackets fq

    Nested columns:

        linkml-store -d phenopackets fq -S subject.timeAtLastEncounter.age

    Compound keys:

        linkml-store -d phenopackets fq -S subject.sex+subject.timeAtLastEncounter.age

    (TODO: compound keys do not work on solr)

    """
    collection = ctx.obj["settings"].collection
    where_clause = yaml.safe_load(where) if where else None
    columns = columns.split(",") if columns else None
    if columns:
        columns = [col.strip() for col in columns]
        columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
        logger.info(f"Faceting on columns: {columns}")
    results = collection.query_facets(where_clause, facet_columns=columns, facet_limit=limit, **kwargs)
    logger.info(f"Facet results: {results}")

    def _untuple(key):
        if isinstance(key, tuple):
            return "+".join([str(x) for x in key])
        return key

    if wide:
        results_obj = facet_summary_to_dataframe_unmelted(results)
    else:
        if output_type == Format.PYTHON.value:
            results_obj = results
        elif output_type in [Format.TSV.value, Format.CSV.value]:
            results_obj = []
            for fc, data in results.items():
                for v, c in data:
                    results_obj.append({"facet": fc, "value": v, "count": c})
        else:
            results_obj = {}
            for key, value in results.items():
                value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
                results_obj[_untuple(key)] = value_as_dict
    if output_type == Format.PNG.value:
        if not output:
            raise ValueError("Output file path is required for PNG output")
        from linkml_store.plotting.facet_chart import create_faceted_horizontal_barchart

        create_faceted_horizontal_barchart(results_obj, output)
        click.echo(f"Facet chart saved to {output}")
        return
    output_data = render_output(results_obj, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Query results saved to {output}")
    else:
        click.echo(output_data)


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
@click.pass_context
def groupby(ctx, where, limit, columns, output_type, output, **kwargs):
    """
    Group by columns in the specified collection.

    Assume a simple triple model:

        linkml-store -d cl.ddb -c triple insert cl.owl

    This makes a flat subject/predicate/object table.

    This can be grouped, e.g. by subject:

        linkml-store -d cl.ddb -c triple groupby -S subject

    Or subject and predicate:

        linkml-store -d cl.ddb -c triple groupby -S subject,predicate

    """
    collection = ctx.obj["settings"].collection
    where_clause = yaml.safe_load(where) if where else None
    columns = columns.split(",") if columns else None
    if columns:
        columns = [col.strip() for col in columns]
        columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
        logger.info(f"Group by: {columns}")
    result = collection.group_by(
        group_by_fields=columns,
        where_clause=where_clause,
        agg_map={},
        limit=limit,
        **kwargs,
    )
    logger.info(f"Group by results: {result}")
    output_data = render_output(result.rows, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Query results saved to {output}")
    else:
        click.echo(output_data)


def _get_index(index_type=None, **kwargs) -> Indexer:
    if index_type is None or index_type == "simple":
        return SimpleIndexer(name="test", **kwargs)
    else:
        raise ValueError(f"Unknown index type: {index_type}")


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option(
    "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
)
@click.pass_context
def describe(ctx, where, output_type, output, limit):
    """
    Summarize the contents of the collection (pandas describe-style statistics).
    """
    where_clause = yaml.safe_load(where) if where else None
    collection = ctx.obj["settings"].collection
    df = collection.find(where_clause, limit=limit).rows_dataframe
    write_output(df.describe(include="all").transpose(), output_type, target=output)


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--index", "-I", help="Attributes to index on in pivot")
@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
@click.option("--values", "-V", help="Attributes to use as values in pivot")
@click.pass_context
def pivot(ctx, where, limit, index, columns, values, output_type, output):
    """Pivot query results from the specified collection into a wide table."""
    collection = ctx.obj["settings"].collection
    where_clause = yaml.safe_load(where) if where else None
    column_atts = columns.split(",") if columns else None
    value_atts = values.split(",") if values else None
    index_atts = index.split(",") if index else None
    results = collection.find(where_clause, limit=limit)
    pivoted = defaultdict(dict)
    for row in results.rows:
        index_key = tuple([row.get(att) for att in index_atts])
        column_key = tuple([row.get(att) for att in column_atts])
        value_key = tuple([row.get(att) for att in value_atts])
        pivoted[index_key][column_key] = value_key
    pivoted_objs = []

    def detuple(t: Tuple) -> Any:
        if len(t) == 1:
            return t[0]
        return str(t)

    for index_key, data in pivoted.items():
        obj = {att: key for att, key in zip(index_atts, index_key)}
        for column_key, value_key in data.items():
            obj[detuple(column_key)] = detuple(value_key)
        pivoted_objs.append(obj)
    write_output(pivoted_objs, output_type, target=output)


@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--sample-field", "-I", help="Field to use as the sample identifier")
@click.option("--classification-field", "-L", help="Field to use for classification")
@click.option(
    "--p-value-threshold",
    "-P",
    type=click.FLOAT,
    default=0.05,
    show_default=True,
    help="P-value threshold for enrichment",
)
@click.option(
    "--multiple-testing-correction",
    "-M",
    type=click.STRING,
    default="bh",
    show_default=True,
    help="Multiple test correction method",
)
@click.argument("samples", type=click.STRING, nargs=-1)
@click.pass_context
def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs):
    """Find enriched classification categories for samples in the collection."""
    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

    collection = ctx.obj["settings"].collection
    where_clause = yaml.safe_load(where) if where else None
    column_atts = [sample_field, classification_field]
    results = collection.find(where_clause, select_cols=column_atts, limit=-1)
    df = results.rows_dataframe
    ea = EnrichmentAnalyzer(df, sample_key=sample_field, classification_key=classification_field)
    if not samples:
        samples = df[sample_field].unique()
    enrichment_results = []
    for sample in samples:
        enriched = ea.find_enriched_categories(sample, **kwargs)
        for e in enriched:
            obj = {"sample": sample, **e.model_dump()}
            enrichment_results.append(obj)
    output_data = render_output(enrichment_results, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Enrichment results saved to {output}")
    else:
        click.echo(output_data)


@cli.command()
@click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--target-attribute", "-T", type=click.STRING, multiple=True, help="Target attributes for inference")
@click.option(
    "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
)
@click.option("--training-collection", type=click.STRING, help="Collection to use for training")
@click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
@click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
@click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@click.option("--model-format", "-M", type=click.Choice([x.value for x in ModelSerialization]), help="Format for model")
@click.option("--training-test-data-split", "-S", type=click.Tuple([float, float]), help="Training/test data split")
@click.option(
    "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
)
@click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
@click.option("--query", "-q", type=click.STRING, help="Query object, as YAML")
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for selecting objects to predict")
@click.pass_context
def infer(
    ctx,
    inference_config_file,
    where,
    query,
    evaluation_count,
    evaluation_match_function,
    training_test_data_split,
    training_collection,
    predictor_type,
    target_attribute,
    feature_attributes,
    output_type,
    output,
    model_format,
    export_model,
    load_model,
):
    """
    Predict a complete object from a partial object.

    Currently two main prediction methods are provided: RAG and sklearn.

    ## RAG:

    The RAG approach will use Retrieval Augmented Generation to infer the missing attributes of an object.

    Example:

        linkml-store -i countries.jsonl infer -t rag -q 'name: Uruguay'

    Result:

        capital: Montevideo, code: UY, continent: South America, languages: [Spanish]

    You can pass in configurations as follows:

        linkml-store -i countries.jsonl infer -t rag:llm_config.model_name=llama-3 -q 'name: Uruguay'

    ## SKLearn:

    This uses scikit-learn (defaulting to simple decision trees) to do the prediction.

        linkml-store -i tests/input/iris.csv infer -t sklearn \\
            -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
    """
    where_clause = yaml.safe_load(where) if where else None
    if query:
        query_obj = yaml.safe_load(query)
    else:
        query_obj = None
    collection = ctx.obj["settings"].collection
    if collection:
        atts = collection.class_definition().attributes.keys()
    else:
        atts = []
    if feature_attributes:
        features = feature_attributes.split(",")
        features = [f.strip() for f in features]
    else:
        if query_obj:
            features = query_obj.keys()
        else:
            features = None
    if target_attribute:
        target_attributes = list(target_attribute)
    else:
        target_attributes = [att for att in atts if att not in features]
    if model_format:
        model_format = ModelSerialization(model_format)
    if load_model:
        logger.info(f"Loading predictor from {load_model}")
        predictor = get_inference_engine(predictor_type)
        predictor = type(predictor).load_model(load_model)
    else:
        if inference_config_file:
            config = InferenceConfig.from_file(inference_config_file)
        else:
            config = InferenceConfig(target_attributes=target_attributes, feature_attributes=features)
        if training_test_data_split:
            config.train_test_split = training_test_data_split
        predictor = get_inference_engine(predictor_type, config=config)
        training_collection_obj = collection
        if training_collection:
            training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
        if training_collection_obj:
            logger.info(f"Using collection: {training_collection_obj.alias} for inference")
            split = training_test_data_split or (1.0, 0.0)
            predictor.load_and_split_data(training_collection_obj, split=split)
            predictor.initialize_model()
    if export_model:
        logger.info(f"Exporting model to {export_model} in {model_format}")
        predictor.export_model(export_model, model_format)
    if not query_obj and where_clause is None:
        if not export_model and not evaluation_count:
            raise ValueError("Query or evaluate must be specified if not exporting model")
    if evaluation_count:
        if evaluation_match_function == "score_text_overlap":
            match_function_fn = score_text_overlap
        elif evaluation_match_function is not None:
            raise ValueError(f"Unknown match function: {evaluation_match_function}")
        else:
            match_function_fn = None
        outcome = evaluate_predictor(
            predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
        )
        print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
    if query_obj:
        result = predictor.derive(query_obj)
        dumped_obj = result.model_dump(exclude_none=True)
        write_output([dumped_obj], output_type, target=output)
    if where_clause is not None:
        predicted_objs = []
        for query_obj in collection.find(where_clause).rows:
            result = predictor.derive(query_obj)
            predicted_objs.append(result.predicted_object)
        write_output(predicted_objs, output_type, target=output)


@cli.command()
@index_type_option
@click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
@click.option("--text-template", "-T", help="Template for text embeddings")
@click.option("--name", "-N", help="Index name")
# TODO: Add --model option to specify embedding model (e.g., text-embedding-3-large)
# TODO: Add --batch-size option to control batch processing size
# TODO: Add --index-attributes option to specify which fields to index
# TODO: Add --progress flag to show indexing progress
@click.pass_context
def index(ctx, index_type, **kwargs):
    """
    Create an index over a collection.

    By default a simple trigram index is used.

    TODO: Support additional options for LLM indexer:
    - Model selection (--model text-embedding-3-large)
    - Batch size configuration (--batch-size 100)
    - Index attributes (--index-attributes title,content,author)
    - Progress reporting (--progress)
    """
    collection = ctx.obj["settings"].collection
    ix = get_indexer(index_type, **kwargs)
    collection.attach_indexer(ix)


@cli.command()
@click.pass_context
@click.option("--output-type", "-O", type=format_choice, default="yaml", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
def schema(ctx, output_type, output):
    """
    Show the schema for a database.

    :param ctx:
    :param output_type:
    :param output:
    :return:
    """
    db = ctx.obj["settings"].database
    schema_dict = json_dumper.to_dict(db.schema_view.schema)
    output_data = render_output(schema_dict, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Schema saved to {output}")
    else:
        click.echo(output_data)


@cli.command()
@click.argument("search_term")
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option(
    "--auto-index/--no-auto-index", default=False, show_default=True, help="Automatically index the collection"
)
@index_type_option
@click.option("--index-name", "-N", help="Index name")
@click.pass_context
def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index, index_name):
    """Search objects in the specified collection."""
    collection = ctx.obj["settings"].collection
    ix = get_indexer(index_type)
    logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
    collection.attach_indexer(ix, auto_index=auto_index, name=index_name)
    select_cols = yaml.safe_load(select) if select else None
    if where:
        where = yaml.safe_load(where)
    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
    output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Search results saved to {output}")
    else:
        click.echo(output_data)


@cli.command()
@click.pass_context
def indexes(ctx):
    """
    Show the indexes for a collection.
    """
    collection = ctx.obj["settings"].collection
    for name, ix in collection.indexers.items():
        click.echo(f"{name}: {type(ix)}\n{ix.model_dump_json()}")


@cli.command()
@click.pass_context
@click.option("--source-collection", "-s", required=True, help="Source collection name")
@click.option("--target-collection", "-t", help="Target collection name (defaults to source for intra-collection)")
@click.option("--index-name", "-i", help="Name of index to use (defaults to first available)")
@click.option("--metric", "-m", type=click.Choice(["cosine", "euclidean", "l2", "dot", "manhattan"]), default="cosine", help="Distance metric")
@click.option("--max-matches", "-n", type=int, default=5, help="Maximum matches per item")
@click.option("--similarity-threshold", type=float, help="Minimum similarity threshold")
@click.option("--distance-threshold", type=float, help="Maximum distance threshold")
@click.option("--source-fields", help="Comma-separated list of source fields to include")
@click.option("--target-fields", help="Comma-separated list of target fields to include")
@click.option("--limit", "-l", type=int, help="Limit number of items to process")
@click.option("--output-format", "-f", type=click.Choice(["report", "json", "csv"]), default="report", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
def find_matches(ctx, source_collection, target_collection, index_name, metric, max_matches,
                 similarity_threshold, distance_threshold, source_fields, target_fields,
                 limit, output_format, output):
    """
    Find best matches between embeddings in collections.

    Examples:

        # Find matches between two collections
        linkml-store -d mydb.ddb find-matches -s collection1 -t collection2

        # Find similar items within a single collection
        linkml-store -d mydb.ddb find-matches -s collection1

        # With specific fields and threshold
        linkml-store -d mydb.ddb find-matches -s coll1 -t coll2 \\
            --similarity-threshold 0.8 \\
            --source-fields id,name \\
            --target-fields id,description
    """
    from linkml_store.utils.embedding_matcher import (
        match_embeddings_between_collections,
        match_embeddings_within_collection,
        MatchingConfig,
        DistanceMetric,
        format_matches_report,
    )

    db = ctx.obj["settings"].database

    # Parse field lists
    source_field_list = None
    if source_fields:
        source_field_list = [f.strip() for f in source_fields.split(",")]

    target_field_list = None
    if target_fields:
        target_field_list = [f.strip() for f in target_fields.split(",")]

    # Create config
    config = MatchingConfig(
        metric=DistanceMetric(metric),
        max_matches_per_item=max_matches,
        similarity_threshold=similarity_threshold,
        distance_threshold=distance_threshold,
        source_fields=source_field_list,
        target_fields=target_field_list,
    )

    # Perform matching
    try:
        if target_collection and target_collection != source_collection:
            # Between collections
            click.echo(f"Finding matches between {source_collection} and {target_collection}...")
            results = match_embeddings_between_collections(
                database=db,
                source_collection=source_collection,
                target_collection=target_collection,
                index_name=index_name,
                config=config,
                limit=limit,
            )
        else:
            # Within collection
            click.echo(f"Finding matches within {source_collection}...")
            results = match_embeddings_within_collection(
                database=db,
                collection_name=source_collection,
                index_name=index_name,
                config=config,
                limit=limit,
            )

        # Format output
        if output_format == "report":
            output_text = format_matches_report(results)
        elif output_format == "json":
            import json

            output_text = json.dumps([m.to_dict() for m in results.matches], indent=2)
        elif output_format == "csv":
            df = results.to_dataframe()
            if df is not None:
                output_text = df.to_csv(index=False)
            else:
                click.echo("pandas required for CSV output", err=True)
                return

        # Output results
        if output:
            with open(output, "w") as f:
                f.write(output_text)
            click.echo(f"Results saved to {output}")
        else:
            click.echo(output_text)

        # Summary
        click.echo(f"\nFound {len(results.matches)} total matches")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        import traceback

        traceback.print_exc()


@cli.command()
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option(
    "--collection-only/--no-collection-only",
    default=False,
    show_default=True,
    help="Only validate specified collection",
)
@click.option(
    "--ensure-referential-integrity/--no-ensure-referential-integrity",
    default=True,
    show_default=True,
    help="Ensure referential integrity",
)
@click.pass_context
def validate(ctx, output_type, output, collection_only, **kwargs):
    """Validate objects in the specified collection."""
    if collection_only:
        collection = ctx.obj["settings"].collection
        logger.info(f"Validating collection {collection.alias}")
        validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)]
    else:
        db = ctx.obj["settings"].database
        validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)]
    output_data = render_output(validation_results, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Validation results saved to {output}")
    else:
        click.echo(output_data)


cli.add_command(plot_cli, name="plot")

if __name__ == "__main__":
    cli()
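
Since `cli` above is a standard click group (installed as the `linkml-store` console script via the wheel's `entry_points.txt`), it can also be exercised in-process with click's own test runner. The following is a minimal sketch, not part of the package: `data/countries.jsonl` is a hypothetical input file, and the flags are taken directly from the options and docstrings above.

    # Minimal sketch: drive the cli group in-process with click's test runner.
    # Assumes linkml-store is installed; data/countries.jsonl is a placeholder path.
    from click.testing import CliRunner

    from linkml_store.cli import cli

    runner = CliRunner()
    # Equivalent to the shell command:
    #   linkml-store -i data/countries.jsonl query -w 'code: NZ'
    result = runner.invoke(cli, ["-i", "data/countries.jsonl", "query", "-w", "code: NZ"])
    print(result.output)

This relies only on the `-i/--input` shortcut defined in the group callback (which stages the file into an in-memory DuckDB collection) and the `query` command's YAML-style `-w` filter.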