linkml_store-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/cli.py ADDED
@@ -0,0 +1,1147 @@
1
+ import logging
2
+ import sys
3
+ import warnings
4
+ from collections import defaultdict
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Tuple
7
+
8
+ import click
9
+ import yaml
10
+ from linkml_runtime.dumpers import json_dumper
11
+ from linkml_runtime.utils.formatutils import underscore
12
+ from pydantic import BaseModel
13
+
14
+ from linkml_store import Client
15
+ from linkml_store.api import Collection, Database
16
+ from linkml_store.api.config import ClientConfig
17
+ from linkml_store.api.queries import Query
18
+ from linkml_store.index import get_indexer
19
+ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
20
+ from linkml_store.index.indexer import Indexer
21
+ from linkml_store.inference import get_inference_engine
22
+ from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
23
+ from linkml_store.inference.inference_config import InferenceConfig
24
+ from linkml_store.inference.inference_engine import ModelSerialization
25
+ from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
26
+ from linkml_store.utils.object_utils import object_path_update
27
+ from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
28
+ from linkml_store.plotting.cli import plot_cli
29
+
30
+ DEFAULT_LOCAL_CONF_PATH = Path("linkml.yaml")
31
+ # global path is ~/.linkml.yaml in the user's home directory
32
+ DEFAULT_GLOBAL_CONF_PATH = Path("~/.linkml.yaml").expanduser()
33
+
34
+ index_type_option = click.option(
35
+ "--index-type",
36
+ "-t",
37
+ default="simple",
38
+ show_default=True,
39
+ help="Type of index to create. Values: simple, llm",
40
+ )
41
+ json_select_query_option = click.option(
42
+ "--json-select-query",
43
+ "-J",
44
+ help="JSON SELECT query",
45
+ )
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ warnings.filterwarnings("ignore", module="duckdb_engine")
50
+
51
+
52
+ class ContextSettings(BaseModel):
53
+ """
54
+ Context object for CLI commands.
55
+ """
56
+
57
+ client: Client
58
+ database_name: Optional[str] = None
59
+ collection_name: Optional[str] = None
60
+
61
+ @property
62
+ def database(self) -> Optional[Database]:
63
+ """
64
+ Get the database object.
65
+ :return:
66
+ """
67
+ name = self.database_name
68
+ if name is None:
69
+ # if len(self.client.databases) > 1:
70
+ # raise ValueError("Database must be specified if there are multiple databases.")
71
+ if not self.client.databases:
72
+ return None
73
+ name = list(self.client.databases.keys())[0]
74
+ return self.client.get_database(name)
75
+
76
+ @property
77
+ def collection(self) -> Optional[Collection]:
78
+ """
79
+ Get the collection object.
80
+ :return:
81
+ """
82
+ name = self.collection_name
83
+ if name is None:
84
+ # if len(self.database.list_collections()) > 1:
85
+ # raise ValueError("Collection must be specified if there are multiple collections.")
86
+ if not self.database:
87
+ return None
88
+ if not self.database.list_collections():
89
+ return None
90
+ name = list(self.database.list_collections())[0]
91
+ return self.database.get_collection(name)
92
+
93
+ class Config:
94
+ arbitrary_types_allowed = True
95
+
96
+
97
+ # format_choice = click.Choice(["json", "yaml", "tsv"])
98
+ format_choice = click.Choice([f.value for f in Format])
99
+
100
+
101
+ include_internal_option = click.option("--include-internal/--no-include-internal", default=False, show_default=True)
102
+
103
+
104
+ @click.group()
105
+ @click.option("--database", "-d", help="Database name")
106
+ @click.option("--collection", "-c", help="Collection name")
107
+ @click.option("--input", "-i", help="Input file (alternative to database/collection)")
108
+ @click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
109
+ @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
110
+ @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
111
+ @click.option("-v", "--verbose", count=True)
112
+ @click.option("-q", "--quiet/--no-quiet")
113
+ @click.option("--base-dir", "-B", help="Base directory for the client configuration")
114
+ @click.option(
115
+ "--stacktrace/--no-stacktrace",
116
+ default=False,
117
+ show_default=True,
118
+ help="If set then show full stacktrace on error",
119
+ )
120
+ @click.pass_context
121
+ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
122
+ """A CLI for interacting with the linkml-store."""
123
+ if not stacktrace:
124
+ sys.tracebacklimit = 0
125
+ logger = logging.getLogger()
126
+ # Set handler for the root logger to output to the console
127
+ console_handler = logging.StreamHandler()
128
+ console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
129
+
130
+ # Clear existing handlers to avoid duplicate messages if function runs multiple times
131
+ logger.handlers = []
132
+
133
+ # Add the newly created console handler to the logger
134
+ logger.addHandler(console_handler)
135
+ if verbose >= 2:
136
+ logger.setLevel(logging.DEBUG)
137
+ elif verbose == 1:
138
+ logger.setLevel(logging.INFO)
139
+ else:
140
+ logger.setLevel(logging.WARNING)
141
+ if quiet:
142
+ logger.setLevel(logging.ERROR)
143
+ ctx.ensure_object(dict)
144
+ if input:
145
+ database = "duckdb" # default: store in duckdb
146
+ if input.startswith("http"):
147
+ parts = input.split("/")
148
+ collection = parts[-1]
149
+ collection = collection.split(".")[0]
150
+ else:
151
+ stem = underscore(Path(input).stem)
152
+ collection = stem
153
+ logger.info(f"Using input file: {input}, " f"default storage is {database} and collection is {collection}")
154
+ config = ClientConfig(databases={"duckdb": {"collections": {collection: {"source": {"local_path": input}}}}})
155
+ if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
156
+ config = DEFAULT_LOCAL_CONF_PATH
157
+ if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
158
+ config = DEFAULT_GLOBAL_CONF_PATH
159
+ if config == ".":
160
+ config = None
161
+ if not collection and database and "::" in database:
162
+ database, collection = database.split("::")
163
+
164
+ client = Client().from_config(config, **kwargs) if config else Client()
165
+ settings = ContextSettings(client=client, database_name=database, collection_name=collection)
166
+ ctx.obj["settings"] = settings
167
+ if schema:
168
+ db = settings.database
169
+ db.set_schema_view(schema)
170
+ if settings.database_name:
171
+ db = client.get_database(database)
172
+ if set:
173
+ for expr in set:
174
+ if "=" not in expr:
175
+ raise ValueError(f"Expression must be of form PARAM=VALUE. Got: {expr}")
176
+ path, val = expr.split("=", 1)
177
+ val = yaml.safe_load(val)
178
+ logger.info(f"Setting {path} to {val}")
179
+ db.metadata = object_path_update(db.metadata, path, val)
180
+ if not settings.database_name:
181
+ # if len(client.databases) != 1:
182
+ # raise ValueError("Database must be specified if there are multiple databases.")
183
+ if client.databases:
184
+ settings.database_name = list(client.databases.keys())[0]
185
+ if not settings.collection_name:
186
+ # if len(settings.database.list_collections()) != 1:
187
+ # raise ValueError("Collection must be specified if there are multiple collections.")
188
+ if settings.database and settings.database.list_collections():
189
+ collection = settings.database.list_collections()[0]
190
+ settings.collection_name = collection.alias
191
+
192
+
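A minimal sketch of what this group callback wires up programmatically, assuming a local `linkml.yaml` config defining at least one database (all names here are illustrative):

    from linkml_store import Client

    client = Client().from_config("linkml.yaml")
    # mirror the first-database / first-collection fallback above
    db = client.get_database(list(client.databases.keys())[0])
    collection = db.get_collection(db.list_collections()[0].alias)
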
193
+ @cli.command()
194
+ @click.pass_context
195
+ def drop(ctx):
196
+ """
197
+ Drop database and all its collections.
198
+ """
199
+ database = ctx.obj["settings"].database
200
+ database.drop()
201
+
202
+
203
+ @cli.command()
204
+ @click.argument("files", type=click.Path(), nargs=-1)
205
+ @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
206
+ @click.option("--format", "-f", type=format_choice, help="Input format")
207
+ @click.option("--object", "-i", multiple=True, help="Input object as YAML")
208
+ @click.option("--source-field", help="If provided, inject file path source as this field")
209
+ @click.option("--glob-files/--no-glob-files", default=False, show_default=True, help="If true, use glob to find files")
210
+ @json_select_query_option
211
+ @click.pass_context
212
+ def insert(ctx, files, glob_files, replace, object, format, source_field, json_select_query):
213
+ """Insert objects from files (JSON, YAML, TSV) into the specified collection.
214
+
215
+ Using a configuration:
216
+
217
+ linkml-store -C config.yaml -c genes insert data/genes/*.json
218
+
219
+ Note: if you don't provide a schema, one will be inferred, but it is
220
+ usually better to provide an explicit schema
221
+
222
+ You can use --glob-files if the list of files is too long
223
+
224
+ linkml-store -C config.yaml -c genes insert "data/genes/*.json" --glob-files
225
+
226
+ """
227
+ settings = ctx.obj["settings"]
228
+ collection = settings.collection
229
+ if not collection:
230
+ raise ValueError("Collection must be specified.")
231
+ if not files and not object:
232
+ files = ["-"]
233
+ load_objects_args = {}
234
+ if json_select_query:
235
+ load_objects_args["select_query"] = json_select_query
236
+ if glob_files:
237
+ import glob
238
+ new_files = []
239
+ for file_path in files:
240
+ new_files.extend(glob.glob(file_path))
241
+ logger.info(f"Found {len(new_files)} files matching glob pattern {files}")
242
+ files = new_files
243
+ for file_path in files:
244
+
245
+ if format:
246
+ objects = load_objects(file_path, format=format, **load_objects_args)
247
+ else:
248
+ objects = load_objects(file_path, **load_objects_args)
249
+ if source_field:
250
+ for obj in objects:
251
+ obj[source_field] = str(file_path)
252
+ logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
253
+ if replace:
254
+ collection.replace(objects)
255
+ else:
256
+ collection.insert(objects)
257
+ click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
258
+ if object:
259
+ for object_str in object:
260
+ logger.info(f"Parsing: {object_str}")
261
+ objects = yaml.safe_load(object_str)
262
+ if not isinstance(objects, list):
263
+ objects = [objects]
264
+ if replace:
265
+ collection.replace(objects)
266
+ else:
267
+ collection.insert(objects)
268
+ click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
269
+ collection.commit()
270
+
271
+
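A minimal sketch of the same insert flow through the Python API, assuming `collection` is a handle as in the sketch under the `cli` group and `genes.json` is a hypothetical file holding a list of objects:

    from linkml_store.utils.format_utils import load_objects

    objects = load_objects("genes.json")
    collection.insert(objects)   # or collection.replace(objects) for --replace
    collection.commit()
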
272
+ @cli.command()
273
+ @click.argument("files", type=click.Path(exists=True), nargs=-1)
274
+ @click.option("--format", "-f", type=format_choice, help="Input format")
275
+ @click.option("--object", "-i", multiple=True, help="Input object as YAML")
276
+ @json_select_query_option
277
+ @click.pass_context
278
+ def store(ctx, files, object, format, json_select_query):
279
+ """Store objects from files (JSON, YAML, TSV) into the database.
280
+
281
+ Note: this is similar to insert, but a collection does not need to be specified.
282
+
283
+ For example, assume that `my-collection` is a dict with multiple keys,
284
+ and we want one collection per key:
285
+
286
+ linkml-store -d my.ddb store my-collection.yaml
287
+
288
+ Loading JSON (e.g. OBO-JSON), with a --json-select-query:
289
+
290
+ linkml-store -d cl.ddb store -J graphs cl.obo.json
291
+
292
+ Loading XML (e.g. OWL-XML), with a --json-select-query:
293
+
294
+ linkml-store -d cl.ddb store -J Ontology cl.owx
295
+
296
+ Because the XML uses a top level Ontology, with multiple
297
+
298
+ """
299
+ settings = ctx.obj["settings"]
300
+ db = settings.database
301
+ if not files and not object:
302
+ files = ["-"]
303
+ load_objects_args = {}
304
+ if json_select_query:
305
+ load_objects_args["select_query"] = json_select_query
306
+ for file_path in files:
307
+ if format:
308
+ objects = load_objects(file_path, format=format, **load_objects_args)
309
+ else:
310
+ objects = load_objects(file_path, **load_objects_args)
311
+ logger.info(f"Inserting {len(objects)} objects from {file_path} into database '{db}'.")
312
+ for obj in objects:
313
+ db.store(obj)
314
+ click.echo(f"Inserted {len(objects)} objects from {file_path} into database '{db}'.")
315
+ if object:
316
+ for object_str in object:
317
+ logger.info(f"Parsing: {object_str}")
318
+ objects = yaml.safe_load(object_str)
319
+ for obj in (objects if isinstance(objects, list) else [objects]):
320
+ db.store(obj)
321
+ click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{db.name}'.")
322
+
323
+
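A sketch of the store flow, which hands each loaded object to the database so its top-level keys become collections; the file and select query mirror the OBO-JSON example in the docstring:

    from linkml_store.utils.format_utils import load_objects

    for obj in load_objects("cl.obo.json", select_query="graphs"):
        db.store(obj)   # one collection per top-level key of obj
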
324
+ @cli.command(name="import")
325
+ @click.option("--format", "-f", help="Input format")
326
+ @click.pass_context
327
+ @click.argument("files", type=click.Path(exists=True), nargs=-1)
328
+ def import_database(ctx, files, format):
329
+ """Imports a database from a dump.
330
+
331
+ See the `export` command for a full list of supported formats. The same
332
+ formats are generally supported for imports.
333
+ """
334
+ settings = ctx.obj["settings"]
335
+ db = settings.database
336
+ if not files:
337
+ files = ["-"]
338
+ for file_path in files:
339
+ db.import_database(file_path, source_format=format)
340
+
341
+
342
+ @cli.command()
343
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
344
+ @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
345
+ @click.pass_context
346
+ def export(ctx, output_type, output):
347
+ """Exports a database to a standard dump format.
348
+
349
+ Example:
350
+
351
+ linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml
352
+
353
+ Export format will be guessed from extension if not specified
354
+
355
+ Example:
356
+
357
+ linkml-store -d duckdb:///countries.db export -o countries.json
358
+
359
+ Tree formats such as YAML and JSON can natively store an entire database; each collection
360
+ will be a distinct key in the database.
361
+
362
+ Additionally, native dump formats can be used:
363
+
364
+ Example:
365
+
366
+ linkml-store -d duckdb:///countries.db export -o countries -O duckdb
367
+
368
+ Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
369
+ (see https://duckdb.org/docs/sql/statements/export.html)
370
+ """
371
+ settings = ctx.obj["settings"]
372
+ db = settings.database
373
+ if output_type is None:
374
+ output_type = guess_format(output)
375
+ if output_type is None:
376
+ raise ValueError(f"Output format must be specified can't be inferred from {output}.")
377
+ db.export_database(output, target_format=output_type)
378
+
379
+
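A sketch of an export/import round trip via the Python API, assuming `db` is a populated database handle and `countries.yaml` is a hypothetical dump path:

    from linkml_store.utils.format_utils import guess_format

    fmt = guess_format("countries.yaml")   # inferred from the extension
    db.export_database("countries.yaml", target_format=fmt)
    db.import_database("countries.yaml", source_format=fmt)
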
380
+ @cli.command()
381
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
382
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
383
+ @click.option("--other-database", "-D", required=False, help="Path to the other database")
384
+ @click.option("--other-collection", "-X", required=True, help="Name of the other collection")
385
+ @click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
386
+ @click.pass_context
387
+ def diff(ctx, output, output_type, other_database, other_collection, identifier_attribute):
388
+ """Diffs two collectoons to create a patch."""
389
+ settings = ctx.obj["settings"]
390
+ db = settings.database
391
+ collection = settings.collection
392
+ if not collection:
393
+ raise ValueError("Collection must be specified.")
394
+ other_db = settings.client.get_database(other_database) if other_database else db
395
+ other_collection = other_db.get_collection(other_collection)
396
+ if identifier_attribute:
397
+ collection.set_identifier_attribute_name(identifier_attribute)
398
+ other_collection.set_identifier_attribute_name(identifier_attribute)
399
+ diff = collection.diff(other_collection)
400
+ write_output(diff, output_type, target=output)
401
+
402
+
403
+ @cli.command()
404
+ @click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
405
+ @click.argument("patch_files", type=click.Path(exists=True), nargs=-1)
406
+ @click.pass_context
407
+ def apply(ctx, patch_files, identifier_attribute):
408
+ """
409
+ Apply a patch to a collection.
410
+ """
411
+ settings = ctx.obj["settings"]
412
+ collection = settings.collection
413
+ if not collection:
414
+ raise ValueError("Collection must be specified.")
415
+ if identifier_attribute:
416
+ collection.set_identifier_attribute_name(identifier_attribute)
417
+ for patch_file in patch_files:
418
+ patch_objs = load_objects(patch_file, expected_type=list)
419
+ collection.apply_patches(patch_objs)
420
+
421
+
422
+ @cli.command()
423
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
424
+ @click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
425
+ @click.option("--sql", type=click.STRING, help="Raw SQL query (database-level, collection not required)")
426
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
427
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
428
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
429
+ @click.pass_context
430
+ def query(ctx, where, select, sql, limit, output_type, output):
431
+ """Query objects from the specified collection or execute raw SQL.
432
+
433
+
434
+ Leave the query field blank to return all objects in the collection.
435
+
436
+ Examples:
437
+
438
+ linkml-store -d duckdb:///countries.db -c countries query
439
+
440
+ Queries can be specified in YAML, as basic key-value pairs
441
+
442
+ Examples:
443
+
444
+ linkml-store -d duckdb:///countries.db -c countries query -w 'code: NZ'
445
+
446
+ More complex queries can be specified using MongoDB-style query syntax
447
+
448
+ Examples:
449
+
450
+ linkml-store -d file:. -c persons query -w 'occupation: {$ne: Architect}'
451
+
452
+ Finds all people who are not architects.
453
+
454
+ Raw SQL queries can be executed against SQL-capable backends (DuckDB, Dremio):
455
+
456
+ Examples:
457
+
458
+ linkml-store -d duckdb:///countries.db query --sql 'SELECT * FROM countries WHERE code = "NZ"'
459
+
460
+ linkml-store -d dremio://lakehouse:32010 query --sql 'SELECT COUNT(*) FROM "gold"."samples"'
461
+
462
+ Note: --sql cannot be combined with --where or --select.
463
+ """
464
+ settings = ctx.obj["settings"]
465
+
466
+ # Handle raw SQL mode
467
+ if sql:
468
+ if where or select:
469
+ raise click.UsageError("--sql cannot be combined with --where or --select.")
470
+
471
+ database = settings.database
472
+ if not database:
473
+ raise click.UsageError("Database must be specified when using --sql.")
474
+
475
+ if not database.supports_sql:
476
+ raise click.UsageError(
477
+ f"Database type '{database.__class__.__name__}' does not support raw SQL queries."
478
+ )
479
+
480
+ result = database.execute_sql(sql)
481
+ write_output(result.rows, output_type, target=output)
482
+ if output:
483
+ click.echo(f"Query results saved to {output}")
484
+ return
485
+
486
+ # Original collection-based query logic
487
+ collection = settings.collection
488
+ if not collection:
489
+ raise click.UsageError(
490
+ "Collection must be specified for non-SQL queries. "
491
+ "Use -c/--collection option or --sql for raw SQL queries."
492
+ )
493
+ where_clause = yaml.safe_load(where) if where else None
494
+ select_clause = yaml.safe_load(select) if select else None
495
+ if select_clause:
496
+ if isinstance(select_clause, str):
497
+ select_clause = [select_clause]
498
+ if not isinstance(select_clause, list):
499
+ raise ValueError(f"SELECT clause must be a list. Got: {select_clause}")
500
+ query_obj = Query(from_table=collection.alias, select_cols=select_clause, where_clause=where_clause, limit=limit)
501
+ result = collection.query(query_obj)
502
+ write_output(result.rows, output_type, target=output)
503
+ if output:
504
+ click.echo(f"Query results saved to {output}")
505
+
506
+
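A sketch of the equivalent collection query in Python, using the Query model this command builds; the where clause mirrors the MongoDB-style example above:

    from linkml_store.api.queries import Query

    q = Query(
        from_table=collection.alias,
        where_clause={"occupation": {"$ne": "Architect"}},
        limit=10,
    )
    print(collection.query(q).rows)
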
507
+ @cli.command()
508
+ @click.pass_context
509
+ @include_internal_option
510
+ def list_collections(ctx, **kwargs):
511
+ db = ctx.obj["settings"].database
512
+ for collection in db.list_collections(**kwargs):
513
+ click.echo(collection.alias)
514
+ click.echo(render_output(collection.metadata))
515
+
516
+
517
+ @cli.command()
518
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
519
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
520
+ @click.option("--facet-min-count", "-M", type=click.INT, help="Minimum count for a facet to be included")
521
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
522
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
523
+ @click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
524
+ @click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
525
+ @click.pass_context
526
+ def fq(ctx, where, limit, columns, output_type, wide, output, **kwargs):
527
+ """
528
+ Query facet counts from the specified collection.
529
+
530
+ Assuming your .linkml.yaml includes an entry mapping `phenopackets` to a
531
+ MongoDB database:
532
+
533
+ Facet counts (all columns)
534
+
535
+ linkml-store -d phenopackets fq
536
+
537
+ Nested columns:
538
+
539
+ linkml-store -d phenopackets fq -S subject.timeAtLastEncounter.age
540
+
541
+ Compound keys:
542
+
543
+ linkml-store -d phenopackets fq -S subject.sex+subject.timeAtLastEncounter.age
544
+
545
+ (TODO: compound keys do not work on solr)
546
+
547
+ """
548
+ collection = ctx.obj["settings"].collection
549
+ where_clause = yaml.safe_load(where) if where else None
550
+ columns = columns.split(",") if columns else None
551
+ if columns:
552
+ columns = [col.strip() for col in columns]
553
+ columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
554
+ logger.info(f"Faceting on columns: {columns}")
555
+ results = collection.query_facets(where_clause, facet_columns=columns, facet_limit=limit, **kwargs)
556
+ logger.info(f"Facet results: {results}")
557
+
558
+ def _untuple(key):
559
+ if isinstance(key, tuple):
560
+ return "+".join([str(x) for x in key])
561
+ return key
562
+
563
+ if wide:
564
+ results_obj = facet_summary_to_dataframe_unmelted(results)
565
+ else:
566
+ if output_type == Format.PYTHON.value:
567
+ results_obj = results
568
+ elif output_type in [Format.TSV.value, Format.CSV.value]:
569
+ results_obj = []
570
+ for fc, data in results.items():
571
+ for v, c in data:
572
+ results_obj.append({"facet": fc, "value": v, "count": c})
573
+ else:
574
+ results_obj = {}
575
+ for key, value in results.items():
576
+ value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
577
+ results_obj[_untuple(key)] = value_as_dict
578
+ if output_type == Format.PNG.value:
579
+ if not output:
580
+ raise ValueError("Output file path is required for PNG output")
581
+ from linkml_store.plotting.facet_chart import create_faceted_horizontal_barchart
582
+ create_faceted_horizontal_barchart(results_obj, output)
583
+ click.echo(f"Facet chart saved to {output}")
584
+ return
585
+ output_data = render_output(results_obj, output_type)
586
+ if output:
587
+ with open(output, "w") as f:
588
+ f.write(output_data)
589
+ click.echo(f"Query results saved to {output}")
590
+ else:
591
+ click.echo(output_data)
592
+
593
+
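A sketch of the underlying facet call; as consumed above, `query_facets` returns a mapping from facet column (or column tuple) to (value, count) pairs. The column name is hypothetical:

    facets = collection.query_facets(None, facet_columns=["subject.sex"], facet_limit=20)
    for col, pairs in facets.items():
        for value, count in pairs:
            print(col, value, count)
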
594
+ @cli.command()
595
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
596
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
597
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
598
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
599
+ @click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
600
+ @click.pass_context
601
+ def groupby(ctx, where, limit, columns, output_type, output, **kwargs):
602
+ """
603
+ Group by columns in the specified collection.
604
+
605
+ Assume a simple triple model:
606
+
607
+ linkml-store -d cl.ddb -c triple insert cl.owl
608
+
609
+ This makes a flat subject/predicate/object table
610
+
611
+ This can be grouped, e.g by subject:
612
+
613
+ linkml-store -d cl.ddb -c triple groupby -S subject
614
+
615
+ Or subject and predicate:
616
+
617
+ linkml-store -d cl.ddb -c triple groupby -S subject,predicate
618
+
619
+ """
620
+ collection = ctx.obj["settings"].collection
621
+ where_clause = yaml.safe_load(where) if where else None
622
+ columns = columns.split(",") if columns else None
623
+ if columns:
624
+ columns = [col.strip() for col in columns]
625
+ columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
626
+ logger.info(f"Group by: {columns}")
627
+ result = collection.group_by(
628
+ group_by_fields=columns,
629
+ where_clause=where_clause,
630
+ agg_map={},
631
+ limit=limit,
632
+ **kwargs,
633
+ )
634
+ logger.info(f"Group by results: {result}")
635
+ output_data = render_output(result.rows, output_type)
636
+ if output:
637
+ with open(output, "w") as f:
638
+ f.write(output_data)
639
+ click.echo(f"Query results saved to {output}")
640
+ else:
641
+ click.echo(output_data)
642
+
643
+
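A sketch of the group-by call this command wraps, with hypothetical triple columns:

    from linkml_store.utils.format_utils import render_output

    result = collection.group_by(
        group_by_fields=["subject", "predicate"],
        where_clause=None,
        agg_map={},
        limit=None,
    )
    print(render_output(result.rows, "yaml"))
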
644
+ def _get_index(index_type=None, **kwargs) -> Indexer:
645
+ if index_type is None or index_type == "simple":
646
+ return SimpleIndexer(name="test", **kwargs)
647
+ else:
648
+ raise ValueError(f"Unknown index type: {index_type}")
649
+
650
+
651
+ @cli.command()
652
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
653
+ @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
654
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
655
+ @click.option(
656
+ "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
657
+ )
658
+ @click.pass_context
659
+ def describe(ctx, where, output_type, output, limit):
660
+ """
661
+ Describe the collection schema.
662
+ """
663
+ where_clause = yaml.safe_load(where) if where else None
664
+ collection = ctx.obj["settings"].collection
665
+ df = collection.find(where_clause, limit=limit).rows_dataframe
666
+ write_output(df.describe(include="all").transpose(), output_type, target=output)
667
+
668
+
669
+ @cli.command()
670
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
671
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
672
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
673
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
674
+ @click.option("--index", "-I", help="Attributes to index on in pivot")
675
+ @click.option("--columns", "-A", help="Attributes to use as columns in pivot")
676
+ @click.option("--values", "-V", help="Attributes to use as values in pivot")
677
+ @click.pass_context
678
+ def pivot(ctx, where, limit, index, columns, values, output_type, output):
679
+ collection = ctx.obj["settings"].collection
680
+ where_clause = yaml.safe_load(where) if where else None
681
+ column_atts = columns.split(",") if columns else None
682
+ value_atts = values.split(",") if values else None
683
+ index_atts = index.split(",") if index else None
684
+ results = collection.find(where_clause, limit=limit)
685
+ pivoted = defaultdict(dict)
686
+ for row in results.rows:
687
+ index_key = tuple([row.get(att) for att in index_atts])
688
+ column_key = tuple([row.get(att) for att in column_atts])
689
+ value_key = tuple([row.get(att) for att in value_atts])
690
+ pivoted[index_key][column_key] = value_key
691
+ pivoted_objs = []
692
+
693
+ def detuple(t: Tuple) -> Any:
694
+ if len(t) == 1:
695
+ return t[0]
696
+ return str(t)
697
+
698
+ for index_key, data in pivoted.items():
699
+ obj = {att: key for att, key in zip(index_atts, index_key)}
700
+ for column_key, value_key in data.items():
701
+ obj[detuple(column_key)] = detuple(value_key)
702
+ pivoted_objs.append(obj)
703
+ write_output(pivoted_objs, output_type, target=output)
704
+
705
+
706
+ @cli.command()
707
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
708
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
709
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
710
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
711
+ @click.option("--sample-field", "-I", help="Field to use as the sample identifier")
712
+ @click.option("--classification-field", "-L", help="Field to use as for classification")
713
+ @click.option(
714
+ "--p-value-threshold",
715
+ "-P",
716
+ type=click.FLOAT,
717
+ default=0.05,
718
+ show_default=True,
719
+ help="P-value threshold for enrichment",
720
+ )
721
+ @click.option(
722
+ "--multiple-testing-correction",
723
+ "-M",
724
+ type=click.STRING,
725
+ default="bh",
726
+ show_default=True,
727
+ help="Multiple test correction method",
728
+ )
729
+ @click.argument("samples", type=click.STRING, nargs=-1)
730
+ @click.pass_context
731
+ def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs):
732
+ from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer
733
+
734
+ collection = ctx.obj["settings"].collection
735
+ where_clause = yaml.safe_load(where) if where else None
736
+ column_atts = [sample_field, classification_field]
737
+ results = collection.find(where_clause, select_cols=column_atts, limit=limit if limit is not None else -1)
738
+ df = results.rows_dataframe
739
+ ea = EnrichmentAnalyzer(df, sample_key=sample_field, classification_key=classification_field)
740
+ if not samples:
741
+ samples = df[sample_field].unique()
742
+ enrichment_results = []
743
+ for sample in samples:
744
+ enriched = ea.find_enriched_categories(sample, **kwargs)
745
+ for e in enriched:
746
+ obj = {"sample": sample, **e.model_dump()}
747
+ enrichment_results.append(obj)
748
+ output_data = render_output(enrichment_results, output_type)
749
+ if output:
750
+ with open(output, "w") as f:
751
+ f.write(output_data)
752
+ click.echo(f"Search results saved to {output}")
753
+ else:
754
+ click.echo(output_data)
755
+
756
+
757
+ @cli.command()
758
+ @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
759
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
760
+ @click.option("--target-attribute", "-T", type=click.STRING, multiple=True, help="Target attributes for inference")
761
+ @click.option(
762
+ "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
763
+ )
764
+ @click.option("--training-collection", type=click.STRING, help="Collection to use for training")
765
+ @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
766
+ @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
767
+ @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
768
+ @click.option("--model-format", "-M", type=click.Choice([x.value for x in ModelSerialization]), help="Format for model")
769
+ @click.option("--training-test-data-split", "-S", type=click.Tuple([float, float]), help="Training/test data split")
770
+ @click.option(
771
+ "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
772
+ )
773
+ @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
774
+ @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
775
+ @click.option("--query", "-q", type=click.STRING, help="query term")
776
+ @click.option("--where", "-w", type=click.STRING, help="query term")
777
+ @click.pass_context
778
+ def infer(
779
+ ctx,
780
+ inference_config_file,
781
+ where,
782
+ query,
783
+ evaluation_count,
784
+ evaluation_match_function,
785
+ training_test_data_split,
786
+ training_collection,
787
+ predictor_type,
788
+ target_attribute,
789
+ feature_attributes,
790
+ output_type,
791
+ output,
792
+ model_format,
793
+ export_model,
794
+ load_model,
795
+ ):
796
+ """
797
+ Predict a complete object from a partial object.
798
+
799
+ Currently two main prediction methods are provided: RAG and sklearn
800
+
801
+ ## RAG:
802
+
803
+ The RAG approach uses Retrieval Augmented Generation to infer the missing attributes of an object.
804
+
805
+ Example:
806
+
807
+ linkml-store -i countries.jsonl infer -t rag -q 'name: Uruguay'
808
+
809
+ Result:
810
+
811
+ capital: Montevideo, code: UY, continent: South America, languages: [Spanish]
812
+
813
+ You can pass in configurations as follows:
814
+
815
+ linkml-store -i countries.jsonl infer -t rag:llm_config.model_name=llama-3 -q 'name: Uruguay'
816
+
817
+ ## SKLearn:
818
+
819
+ This uses scikit-learn (defaulting to simple decision trees) to do the prediction.
820
+
821
+ linkml-store -i tests/input/iris.csv infer -t sklearn \
822
+ -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
823
+ """
824
+ where_clause = yaml.safe_load(where) if where else None
825
+ if query:
826
+ query_obj = yaml.safe_load(query)
827
+ else:
828
+ query_obj = None
829
+ collection = ctx.obj["settings"].collection
830
+ if collection:
831
+ atts = collection.class_definition().attributes.keys()
832
+ else:
833
+ atts = []
834
+ if feature_attributes:
835
+ features = feature_attributes.split(",")
836
+ features = [f.strip() for f in features]
837
+ else:
838
+ if query_obj:
839
+ features = query_obj.keys()
840
+ else:
841
+ features = None
842
+ if target_attribute:
843
+ target_attributes = list(target_attribute)
844
+ else:
845
+ target_attributes = [att for att in atts if att not in (features or [])]
846
+ if model_format:
847
+ model_format = ModelSerialization(model_format)
848
+ if load_model:
849
+ logger.info(f"Loading predictor from {load_model}")
850
+ predictor = get_inference_engine(predictor_type)
851
+ predictor = type(predictor).load_model(load_model)
852
+ else:
853
+ if inference_config_file:
854
+ config = InferenceConfig.from_file(inference_config_file)
855
+ else:
856
+ config = InferenceConfig(target_attributes=target_attributes, feature_attributes=features)
857
+ if training_test_data_split:
858
+ config.train_test_split = training_test_data_split
859
+ predictor = get_inference_engine(predictor_type, config=config)
860
+ training_collection_obj = collection
861
+ if training_collection:
862
+ training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
863
+ if training_collection_obj:
864
+ logger.info(f"Using collection: {training_collection_obj.alias} for inference")
865
+ split = training_test_data_split or (1.0, 0.0)
866
+ predictor.load_and_split_data(training_collection_obj, split=split)
867
+ predictor.initialize_model()
868
+ if export_model:
869
+ logger.info(f"Exporting model to {export_model} in {model_format}")
870
+ predictor.export_model(export_model, model_format)
871
+ if not query_obj and where_clause is None:
872
+ if not export_model and not evaluation_count:
873
+ raise ValueError("Query or evaluate must be specified if not exporting model")
874
+ if evaluation_count:
875
+ if evaluation_match_function == "score_text_overlap":
876
+ match_function_fn = score_text_overlap
877
+ elif evaluation_match_function is not None:
878
+ raise ValueError(f"Unknown match function: {evaluation_match_function}")
879
+ else:
880
+ match_function_fn = None
881
+ outcome = evaluate_predictor(
882
+ predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
883
+ )
884
+ print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
885
+ if query_obj:
886
+ result = predictor.derive(query_obj)
887
+ dumped_obj = result.model_dump(exclude_none=True)
888
+ write_output([dumped_obj], output_type, target=output)
889
+ if where_clause is not None:
890
+ predicted_objs = []
891
+ for query_obj in collection.find(where_clause).rows:
892
+ result = predictor.derive(query_obj)
893
+ predicted_objs.append(result.predicted_object)
894
+ write_output(predicted_objs, output_type, target=output)
895
+
896
+
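A sketch of the sklearn path through the inference API, with hypothetical iris attribute names:

    from linkml_store.inference import get_inference_engine
    from linkml_store.inference.inference_config import InferenceConfig

    config = InferenceConfig(
        target_attributes=["species"],
        feature_attributes=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    )
    engine = get_inference_engine("sklearn", config=config)
    engine.load_and_split_data(collection, split=(0.7, 0.3))
    engine.initialize_model()
    result = engine.derive({"sepal_length": 5.1, "sepal_width": 3.5})
    print(result.predicted_object)
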
897
+ @cli.command()
898
+ @index_type_option
899
+ @click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
900
+ @click.option("--text-template", "-T", help="Template for text embeddings")
901
+ @click.option("--name", "-N", help="Index name")
902
+ # TODO: Add --model option to specify embedding model (e.g., text-embedding-3-large)
903
+ # TODO: Add --batch-size option to control batch processing size
904
+ # TODO: Add --index-attributes option to specify which fields to index
905
+ # TODO: Add --progress flag to show indexing progress
906
+ @click.pass_context
907
+ def index(ctx, index_type, **kwargs):
908
+ """
909
+ Create an index over a collection.
910
+
911
+ By default a simple trigram index is used.
912
+
913
+ TODO: Support additional options for LLM indexer:
914
+ - Model selection (--model text-embedding-3-large)
915
+ - Batch size configuration (--batch-size 100)
916
+ - Index attributes (--index-attributes title,content,author)
917
+ - Progress reporting (--progress)
918
+ """
919
+ collection = ctx.obj["settings"].collection
920
+ ix = get_indexer(index_type, **kwargs)
921
+ collection.attach_indexer(ix)
922
+
923
+
924
+ @cli.command()
925
+ @click.pass_context
926
+ @click.option("--output-type", "-O", type=format_choice, default="yaml", help="Output format")
927
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
928
+ def schema(ctx, output_type, output):
929
+ """
930
+ Show the schema for a database
931
+
932
+ :param ctx:
933
+ :param output_type: output format
934
+ :return:
935
+ """
936
+ db = ctx.obj["settings"].database
937
+ schema_dict = json_dumper.to_dict(db.schema_view.schema)
938
+ output_data = render_output(schema_dict, output_type)
939
+ if output:
940
+ with open(output, "w") as f:
941
+ f.write(output_data)
942
+ click.echo(f"Schema saved to {output}")
943
+ else:
944
+ click.echo(output_data)
945
+
946
+
947
+ @cli.command()
948
+ @click.argument("search_term")
949
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
950
+ @click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
951
+ @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
952
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
953
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
954
+ @click.option(
955
+ "--auto-index/--no-auto-index", default=False, show_default=True, help="Automatically index the collection"
956
+ )
957
+ @index_type_option
958
+ @click.option("--index-name", "-N", help="Index name")
959
+ @click.pass_context
960
+ def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index, index_name):
961
+ """Search objects in the specified collection."""
962
+ collection = ctx.obj["settings"].collection
963
+ ix = get_indexer(index_type)
964
+ logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
965
+ collection.attach_indexer(ix, auto_index=auto_index, name=index_name)
966
+ select_cols = yaml.safe_load(select) if select else None
967
+ if where:
968
+ where = yaml.safe_load(where)
969
+ result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
970
+ output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
971
+ if output:
972
+ with open(output, "w") as f:
973
+ f.write(output_data)
974
+ click.echo(f"Search results saved to {output}")
975
+ else:
976
+ click.echo(output_data)
977
+
978
+
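A sketch of the same search via the Python API; `ranked_rows` yields (score, object) pairs, and the search term is illustrative:

    from linkml_store.index import get_indexer

    ix = get_indexer("simple")
    collection.attach_indexer(ix, auto_index=True)
    for score, obj in collection.search("New Zealand", limit=5).ranked_rows:
        print(score, obj)
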
979
+ @cli.command()
980
+ @click.pass_context
981
+ def indexes(ctx):
982
+ """
983
+ Show the indexes for a collection.
984
+ """
985
+ collection = ctx.obj["settings"].collection
986
+ for name, ix in collection.indexers.items():
987
+ click.echo(f"{name}: {type(ix)}\n{ix.model_json()}")
988
+
989
+
990
+
991
+
992
+
993
+ @cli.command()
994
+ @click.pass_context
995
+ @click.option("--source-collection", "-s", required=True, help="Source collection name")
996
+ @click.option("--target-collection", "-t", help="Target collection name (defaults to source for intra-collection)")
997
+ @click.option("--index-name", "-i", help="Name of index to use (defaults to first available)")
998
+ @click.option("--metric", "-m", type=click.Choice(["cosine", "euclidean", "l2", "dot", "manhattan"]), default="cosine", help="Distance metric")
999
+ @click.option("--max-matches", "-n", type=int, default=5, help="Maximum matches per item")
1000
+ @click.option("--similarity-threshold", type=float, help="Minimum similarity threshold")
1001
+ @click.option("--distance-threshold", type=float, help="Maximum distance threshold")
1002
+ @click.option("--source-fields", help="Comma-separated list of source fields to include")
1003
+ @click.option("--target-fields", help="Comma-separated list of target fields to include")
1004
+ @click.option("--limit", "-l", type=int, help="Limit number of items to process")
1005
+ @click.option("--output-format", "-f", type=click.Choice(["report", "json", "csv"]), default="report", help="Output format")
1006
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
1007
+ def find_matches(ctx, source_collection, target_collection, index_name, metric, max_matches,
1008
+ similarity_threshold, distance_threshold, source_fields, target_fields,
1009
+ limit, output_format, output):
1010
+ """
1011
+ Find best matches between embeddings in collections.
1012
+
1013
+ Examples:
1014
+ # Find matches between two collections
1015
+ linkml-store -d mydb.ddb find-matches -s collection1 -t collection2
1016
+
1017
+ # Find similar items within a single collection
1018
+ linkml-store -d mydb.ddb find-matches -s collection1
1019
+
1020
+ # With specific fields and threshold
1021
+ linkml-store -d mydb.ddb find-matches -s coll1 -t coll2 \\
1022
+ --similarity-threshold 0.8 \\
1023
+ --source-fields id,name \\
1024
+ --target-fields id,description
1025
+ """
1026
+ from linkml_store.utils.embedding_matcher import (
1027
+ match_embeddings_between_collections,
1028
+ match_embeddings_within_collection,
1029
+ MatchingConfig,
1030
+ DistanceMetric,
1031
+ format_matches_report
1032
+ )
1033
+
1034
+ db = ctx.obj["settings"].database
1035
+
1036
+ # Parse field lists
1037
+ source_field_list = None
1038
+ if source_fields:
1039
+ source_field_list = [f.strip() for f in source_fields.split(",")]
1040
+
1041
+ target_field_list = None
1042
+ if target_fields:
1043
+ target_field_list = [f.strip() for f in target_fields.split(",")]
1044
+
1045
+ # Create config
1046
+ config = MatchingConfig(
1047
+ metric=DistanceMetric(metric),
1048
+ max_matches_per_item=max_matches,
1049
+ similarity_threshold=similarity_threshold,
1050
+ distance_threshold=distance_threshold,
1051
+ source_fields=source_field_list,
1052
+ target_fields=target_field_list
1053
+ )
1054
+
1055
+ # Perform matching
1056
+ try:
1057
+ if target_collection and target_collection != source_collection:
1058
+ # Between collections
1059
+ click.echo(f"Finding matches between {source_collection} and {target_collection}...")
1060
+ results = match_embeddings_between_collections(
1061
+ database=db,
1062
+ source_collection=source_collection,
1063
+ target_collection=target_collection,
1064
+ index_name=index_name,
1065
+ config=config,
1066
+ limit=limit
1067
+ )
1068
+ else:
1069
+ # Within collection
1070
+ click.echo(f"Finding matches within {source_collection}...")
1071
+ results = match_embeddings_within_collection(
1072
+ database=db,
1073
+ collection_name=source_collection,
1074
+ index_name=index_name,
1075
+ config=config,
1076
+ limit=limit
1077
+ )
1078
+
1079
+ # Format output
1080
+ if output_format == "report":
1081
+ output_text = format_matches_report(results)
1082
+ elif output_format == "json":
1083
+ import json
1084
+ output_text = json.dumps([m.to_dict() for m in results.matches], indent=2)
1085
+ elif output_format == "csv":
1086
+ df = results.to_dataframe()
1087
+ if df is not None:
1088
+ output_text = df.to_csv(index=False)
1089
+ else:
1090
+ click.echo("pandas required for CSV output", err=True)
1091
+ return
1092
+
1093
+ # Output results
1094
+ if output:
1095
+ with open(output, "w") as f:
1096
+ f.write(output_text)
1097
+ click.echo(f"Results saved to {output}")
1098
+ else:
1099
+ click.echo(output_text)
1100
+
1101
+ # Summary
1102
+ click.echo(f"\nFound {len(results.matches)} total matches")
1103
+
1104
+ except Exception as e:
1105
+ click.echo(f"Error: {e}", err=True)
1106
+ import traceback
1107
+ traceback.print_exc()
1108
+
1109
+
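A sketch of the matching helpers this command wraps, with hypothetical collection names; thresholds and field filters are left at the values the CLI would pass when the options are omitted:

    from linkml_store.utils.embedding_matcher import (
        DistanceMetric,
        MatchingConfig,
        format_matches_report,
        match_embeddings_between_collections,
    )

    cfg = MatchingConfig(
        metric=DistanceMetric("cosine"),
        max_matches_per_item=5,
        similarity_threshold=None,
        distance_threshold=None,
        source_fields=None,
        target_fields=None,
    )
    results = match_embeddings_between_collections(
        database=db,
        source_collection="coll1",
        target_collection="coll2",
        index_name=None,
        config=cfg,
        limit=None,
    )
    print(format_matches_report(results))
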
1110
+ @cli.command()
1111
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
1112
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
1113
+ @click.option(
1114
+ "--collection-only/--no-collection-only",
1115
+ default=False,
1116
+ show_default=True,
1117
+ help="Only validate specified collection",
1118
+ )
1119
+ @click.option(
1120
+ "--ensure-referential-integrity/--no-ensure-referential-integrity",
1121
+ default=True,
1122
+ show_default=True,
1123
+ help="Ensure referential integrity",
1124
+ )
1125
+ @click.pass_context
1126
+ def validate(ctx, output_type, output, collection_only, **kwargs):
1127
+ """Validate objects in the specified collection."""
1128
+ if collection_only:
1129
+ collection = ctx.obj["settings"].collection
1130
+ logger.info(f"Validating collection {collection.alias}")
1131
+ validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)]
1132
+ else:
1133
+ db = ctx.obj["settings"].database
1134
+ validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)]
1135
+ output_data = render_output(validation_results, output_type)
1136
+ if output:
1137
+ with open(output, "w") as f:
1138
+ f.write(output_data)
1139
+ click.echo(f"Validation results saved to {output}")
1140
+ else:
1141
+ click.echo(output_data)
1142
+
1143
+
1144
+ cli.add_command(plot_cli, name="plot")
1145
+
1146
+ if __name__ == "__main__":
1147
+ cli()