linkml-store 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release: this version of linkml-store might be problematic.

Files changed (32)
  1. linkml_store/api/client.py +32 -5
  2. linkml_store/api/collection.py +276 -27
  3. linkml_store/api/config.py +6 -2
  4. linkml_store/api/database.py +264 -21
  5. linkml_store/api/stores/chromadb/__init__.py +5 -1
  6. linkml_store/api/stores/duckdb/__init__.py +9 -0
  7. linkml_store/api/stores/duckdb/duckdb_collection.py +7 -4
  8. linkml_store/api/stores/duckdb/duckdb_database.py +19 -5
  9. linkml_store/api/stores/duckdb/mappings.py +1 -0
  10. linkml_store/api/stores/filesystem/__init__.py +15 -0
  11. linkml_store/api/stores/filesystem/filesystem_collection.py +177 -0
  12. linkml_store/api/stores/filesystem/filesystem_database.py +72 -0
  13. linkml_store/api/stores/hdf5/__init__.py +7 -0
  14. linkml_store/api/stores/mongodb/__init__.py +25 -0
  15. linkml_store/api/stores/mongodb/mongodb_collection.py +31 -10
  16. linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
  17. linkml_store/api/types.py +4 -0
  18. linkml_store/cli.py +150 -15
  19. linkml_store/index/__init__.py +6 -2
  20. linkml_store/index/implementations/llm_indexer.py +83 -5
  21. linkml_store/index/implementations/simple_indexer.py +2 -2
  22. linkml_store/index/indexer.py +32 -8
  23. linkml_store/utils/change_utils.py +17 -0
  24. linkml_store/utils/format_utils.py +139 -8
  25. linkml_store/utils/patch_utils.py +126 -0
  26. linkml_store/utils/query_utils.py +89 -0
  27. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/METADATA +7 -1
  28. linkml_store-0.1.9.dist-info/RECORD +49 -0
  29. linkml_store-0.1.7.dist-info/RECORD +0 -42
  30. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
  31. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
  32. {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/filesystem/filesystem_collection.py ADDED
@@ -0,0 +1,177 @@
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from linkml_store.api import Collection
+ from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+ from linkml_store.api.queries import Query, QueryResult
+ from linkml_store.api.types import DatabaseType
+ from linkml_store.utils.query_utils import mongo_query_to_match_function
+
+ logger = logging.getLogger(__name__)
+
+
+ class FileSystemCollection(Collection[DatabaseType]):
+     path: Optional[Path] = None
+     file_format: Optional[str] = None
+     encoding: Optional[str] = None
+     _objects_list: List[OBJECT] = None
+     _object_map: Dict[str, OBJECT] = None
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         parent: DatabaseType = self.parent
+         if not self.path:
+             if self.parent:
+                 self.path = Path(parent.directory_path)
+         self._objects_list = []
+         self._object_map = {}
+         if not self.file_format:
+             self.file_format = "json"
+
+     @property
+     def path_to_file(self):
+         return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+
+     @property
+     def objects_as_list(self) -> List[OBJECT]:
+         if self._object_map:
+             return list(self._object_map.values())
+         else:
+             return self._objects_list
+
+     def _set_objects(self, objs: List[OBJECT]):
+         pk = self.identifier_attribute_name
+         if pk:
+             self._object_map = {obj[pk]: obj for obj in objs}
+             self._objects_list = []
+         else:
+             self._objects_list = objs
+             self._object_map = {}
+
+     def commit(self):
+         path = self.path_to_file
+         if not path:
+             raise ValueError("Path not set")
+         path.parent.mkdir(parents=True, exist_ok=True)
+         self._save(path)
+
+     def _save(self, path: Path):
+         encoding = self.encoding or "utf-8"
+         fmt = self.file_format or "json"
+         mode = "w"
+         if fmt == "parquet":
+             mode = "wb"
+             encoding = None
+         with open(path, mode, encoding=encoding) as stream:
+             if fmt == "json":
+                 import json
+
+                 json.dump(self.objects_as_list, stream, indent=2)
+             elif fmt == "jsonl":
+                 import jsonlines
+
+                 writer = jsonlines.Writer(stream)
+                 writer.write_all(self.objects_as_list)
+             elif fmt == "yaml":
+                 import yaml
+
+                 yaml.dump_all(self.objects_as_list, stream)
+             elif fmt == "parquet":
+                 import pandas as pd
+                 import pyarrow
+                 import pyarrow.parquet as pq
+
+                 df = pd.DataFrame(self.objects_as_list)
+                 table = pyarrow.Table.from_pandas(df)
+                 pq.write_table(table, stream)
+             elif fmt in {"csv", "tsv"}:
+                 import csv
+
+                 delimiter = "\t" if fmt == "tsv" else ","
+                 # header is the ordered union of keys across all objects
+                 fieldnames = list(self.objects_as_list[0].keys())
+                 for obj in self.objects_as_list[1:]:
+                     fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+                 writer = csv.DictWriter(stream, fieldnames=fieldnames, delimiter=delimiter)
+                 writer.writeheader()
+                 for obj in self.objects_as_list:
+                     writer.writerow(obj)
+             else:
+                 raise ValueError(f"Unsupported file format: {fmt}")
+
+     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return
+         pk = self.identifier_attribute_name
+         if pk:
+             for obj in objs:
+                 if pk not in obj:
+                     raise ValueError(f"Primary key {pk} not found in object {obj}")
+                 pk_val = obj[pk]
+                 self._object_map[pk_val] = obj
+         else:
+             self._objects_list.extend(objs)
+
+     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return 0
+         pk = self.identifier_attribute_name
+         n = 0
+         if pk:
+             for obj in objs:
+                 pk_val = obj[pk]
+                 if pk_val in self._object_map:
+                     del self._object_map[pk_val]
+                     n += 1
+         else:
+             # count removals by comparing list length before and after filtering
+             n = len(self._objects_list)
+             self._objects_list = [o for o in self._objects_list if o not in objs]
+             n -= len(self._objects_list)
+         return n
+
+     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+         logger.info(f"Deleting from {self.target_class_name} where: {where}")
+         if where is None:
+             where = {}
+
+         def matches(obj: OBJECT):
+             for k, v in where.items():
+                 if obj.get(k) != v:
+                     return False
+             return True
+
+         curr_objects = [o for o in self.objects_as_list if not matches(o)]
+         n_deleted = len(self.objects_as_list) - len(curr_objects)
+         self._set_objects(curr_objects)
+         return n_deleted
+
+     def query(self, query: Query, **kwargs) -> QueryResult:
+         where = query.where_clause or {}
+         match = mongo_query_to_match_function(where)
+         rows = [o for o in self.objects_as_list if match(o)]
+         count = len(rows)
+         return QueryResult(query=query, num_rows=count, rows=rows)
+
+     def query_facets(
+         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+     ) -> Dict[str, List[Tuple[Any, int]]]:
+         match = mongo_query_to_match_function(where)
+         rows = [o for o in self.objects_as_list if match(o)]
+         if not facet_columns:
+             facet_columns = self.class_definition().attributes.keys()
+         facet_results = {c: {} for c in facet_columns}
+         for row in rows:
+             for fc in facet_columns:
+                 if fc in row:
+                     v = row[fc]
+                     if v not in facet_results[fc]:
+                         facet_results[fc][v] = 1
+                     else:
+                         facet_results[fc][v] += 1
+         return {fc: list(facet_results[fc].items()) for fc in facet_results}
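The collection, then, is an in-memory list (or identifier-keyed map) that is serialized on commit(). A minimal usage sketch, assuming a writable ./data directory and constructing the collection directly with parent/name keyword arguments, as FileSystemDatabase.init_collections does in the next file; the collection name and objects are illustrative:

    from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
    from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase

    db = FileSystemDatabase(handle="file:data")
    coll = FileSystemCollection(parent=db, name="persons")
    coll.insert([{"id": "P1", "occupation": "Architect"},
                 {"id": "P2", "occupation": "Baker"}])
    coll.commit()  # writes ./data/persons.json, since file_format defaults to "json"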
linkml_store/api/stores/filesystem/filesystem_database.py ADDED
@@ -0,0 +1,72 @@
+ import logging
+ from pathlib import Path
+ from typing import Optional
+
+ import yaml
+ from linkml.utils.schema_builder import SchemaBuilder
+ from linkml_runtime import SchemaView
+
+ from linkml_store.api import Database
+ from linkml_store.api.config import DatabaseConfig
+ from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+ from linkml_store.utils.format_utils import Format, load_objects
+
+ logger = logging.getLogger(__name__)
+
+
+ class FileSystemDatabase(Database):
+     collection_class = FileSystemCollection
+
+     directory_path: Optional[Path] = None
+     default_file_format: Optional[str] = None
+
+     def __init__(self, handle: Optional[str] = None, **kwargs):
+         handle = handle.replace("file:", "")
+         if handle.startswith("//"):
+             handle = handle[2:]
+         self.directory_path = Path(handle)
+         self.load_metadata()
+         super().__init__(handle=handle, **kwargs)
+
+     @property
+     def metadata_path(self) -> Path:
+         return self.directory_path / ".linkml_metadata.yaml"
+
+     def load_metadata(self):
+         if self.metadata_path.exists():
+             with open(self.metadata_path) as stream:
+                 md_dict = yaml.safe_load(stream)
+             metadata = DatabaseConfig(**md_dict)
+         else:
+             metadata = DatabaseConfig()
+         self.metadata = metadata
+
+     def close(self, **kwargs):
+         pass
+
+     def init_collections(self):
+         metadata = self.metadata
+         if self._collections is None:
+             self._collections = {}
+         for name, collection_config in metadata.collections.items():
+             collection = FileSystemCollection(parent=self, **collection_config.dict())
+             self._collections[name] = collection
+         path = self.directory_path
+         if path.exists():
+             for fmt in Format:
+                 suffix = fmt.value
+                 logger.info(f"Looking for {suffix} files in {path}")
+                 for f in path.glob(f"*.{suffix}"):
+                     logger.info(f"Found {f}")
+                     n = f.stem
+                     objs = load_objects(f, suffix, expected_type=list)
+                     collection = FileSystemCollection(parent=self, name=n)
+                     self._collections[n] = collection
+                     collection._set_objects(objs)
+
+     def induce_schema_view(self) -> SchemaView:
+         logger.info(f"Inducing schema view for {self.handle}")
+         sb = SchemaBuilder()
+
+         for collection_name in self.list_collection_names():
+             sb.add_class(collection_name)
+         return SchemaView(sb.schema)
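The constructor accepts handles in either file:path or file://path form; a quick standalone sketch of the normalization performed above, with illustrative values:

    def normalize(handle: str) -> str:
        # mirrors FileSystemDatabase.__init__
        handle = handle.replace("file:", "")
        if handle.startswith("//"):
            handle = handle[2:]
        return handle

    assert normalize("file:///tmp/db") == "/tmp/db"
    assert normalize("file:.") == "."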
linkml_store/api/stores/hdf5/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ Adapter for HDF5 file storage.
+
+ .. warning::
+
+     Experimental support for HDF5 storage.
+ """
linkml_store/api/stores/mongodb/__init__.py ADDED
@@ -0,0 +1,25 @@
+ """
+ Adapter for MongoDB document store.
+
+ Handles have the form: ``mongodb://<host>:<port>/<database>``
+
+ To use this, you must have the `pymongo` extra installed.
+
+ .. code-block:: bash
+
+     pip install linkml-store[mongodb]
+
+ or
+
+ .. code-block:: bash
+
+     pip install linkml-store[all]
+ """
+
+ from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
+ from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
+
+ __all__ = [
+     "MongoDBCollection",
+     "MongoDBDatabase",
+ ]
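A minimal usage sketch, assuming a MongoDB server on localhost and the pymongo extra installed; the collection name and document are illustrative:

    from linkml_store.api.queries import Query
    from linkml_store.api.stores.mongodb import MongoDBDatabase

    db = MongoDBDatabase(handle="mongodb://localhost:27017/test")
    coll = db.get_collection("persons")
    coll.insert({"id": "P1", "occupation": "Architect"})
    result = coll.query(Query(from_table="persons", where_clause={"occupation": "Architect"}))
    print(result.num_rows)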
linkml_store/api/stores/mongodb/mongodb_collection.py CHANGED
@@ -13,22 +13,36 @@ logger = logging.getLogger(__name__)
  
  
  class MongoDBCollection(Collection):
+     """
+     Adapter for collections in a MongoDB database.
+
+     .. note::
+
+         You should not use or manipulate this class directly.
+         Instead, use the general :class:`linkml_store.api.Collection`
+     """
  
      @property
      def mongo_collection(self) -> MongoCollection:
          if not self.name:
              raise ValueError("Collection name not set")
-         return self.parent.native_db[self.name]
+         collection_name = self.alias or self.name
+         return self.parent.native_db[collection_name]
  
      def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
          if not isinstance(objs, list):
              objs = [objs]
          self.mongo_collection.insert_many(objs)
+         # TODO: allow mapping of _id to id for efficiency
+         for obj in objs:
+             del obj["_id"]
+         self._post_insert_hook(objs)
  
-     def query(self, query: Query, **kwargs) -> QueryResult:
+     def query(self, query: Query, limit: Optional[int] = None, **kwargs) -> QueryResult:
          mongo_filter = self._build_mongo_filter(query.where_clause)
-         if query.limit:
-             cursor = self.mongo_collection.find(mongo_filter).limit(query.limit)
+         limit = limit or query.limit
+         if limit and limit >= 0:
+             cursor = self.mongo_collection.find(mongo_filter).limit(limit)
          else:
              cursor = self.mongo_collection.find(mongo_filter)
  
@@ -62,24 +76,31 @@ class MongoDBCollection(Collection):
          if isinstance(col, tuple):
              sd = SlotDefinition(name="PLACEHOLDER")
          else:
-             sd = cd.attributes[col]
-
-         if sd.multivalued:
+             if col in cd.attributes:
+                 sd = cd.attributes[col]
+             else:
+                 logger.info(f"No schema metadata for {col}")
+                 sd = SlotDefinition(name=col)
+         group = {"$group": {"_id": f"${col}", "count": {"$sum": 1}}}
+         if isinstance(col, tuple):
+             q = {k.replace(".", ""): f"${k}" for k in col}
+             group["$group"]["_id"] = q
+         if sd and sd.multivalued:
              facet_pipeline = [
                  {"$match": where} if where else {"$match": {}},
                  {"$unwind": f"${col}"},
-                 {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                 group,
                  {"$sort": {"count": -1}},
                  {"$limit": facet_limit},
              ]
          else:
              facet_pipeline = [
                  {"$match": where} if where else {"$match": {}},
-                 {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                 group,
                  {"$sort": {"count": -1}},
                  {"$limit": facet_limit},
              ]
-
+         logger.info(f"Facet pipeline: {facet_pipeline}")
          facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
          results[col] = [(result["_id"], result["count"]) for result in facet_results]
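For a multivalued column the pipeline unwinds the array before grouping, so each element is counted separately. An illustrative pipeline for a hypothetical multivalued column "aliases", with no where clause and an assumed facet limit of 100:

    facet_pipeline = [
        {"$match": {}},
        {"$unwind": "$aliases"},  # one document per array element
        {"$group": {"_id": "$aliases", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},  # most frequent values first
        {"$limit": 100},
    ]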
linkml_store/api/stores/mongodb/mongodb_database.py CHANGED
@@ -29,9 +29,17 @@ class MongoDBDatabase(Database):
  
      def __init__(self, handle: Optional[str] = None, **kwargs):
          if handle is None:
-             handle = "mongodb://localhost:27017"
+             handle = "mongodb://localhost:27017/test"
          super().__init__(handle=handle, **kwargs)
  
+     @property
+     def _db_name(self) -> str:
+         if self.handle:
+             db = self.handle.split("/")[-1]
+         else:
+             db = "default"
+         return db
+
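The database name is simply the last /-separated segment of the handle, which is also why the default handle above gained a trailing /test; without it, the host and port segment would be picked up:

    "mongodb://localhost:27017/test".split("/")[-1]  # -> "test"
    "mongodb://localhost:27017".split("/")[-1]       # -> "localhost:27017"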
      @property
      def native_client(self) -> MongoClient:
          if self._native_client is None:
@@ -44,7 +52,7 @@ class MongoDBDatabase(Database):
          alias = self.metadata.alias
          if not alias:
              alias = "default"
-         self._native_db = self.native_client[alias]
+         self._native_db = self.native_client[self._db_name]
          return self._native_db
  
      def commit(self, **kwargs):
@@ -58,9 +66,12 @@ class MongoDBDatabase(Database):
          self.native_client.drop_database(self.metadata.alias)
  
      def query(self, query: Query, **kwargs) -> QueryResult:
+         # TODO: DRY
          if query.from_table:
              collection = self.get_collection(query.from_table)
              return collection.query(query, **kwargs)
+         else:
+             raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")
  
      def init_collections(self):
          if self._collections is None:
linkml_store/api/types.py ADDED
@@ -0,0 +1,4 @@
+ from typing import TypeVar
+
+ DatabaseType = TypeVar("DatabaseType", bound="Database")  # noqa: F821
+ CollectionType = TypeVar("CollectionType", bound="Collection")  # noqa: F821
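These bound TypeVars let an adapter parameterize the generic Collection with its concrete Database class, as FileSystemCollection(Collection[DatabaseType]) does above. A minimal sketch of the pattern; every name other than the TypeVar is illustrative:

    from typing import Generic, TypeVar

    DatabaseType = TypeVar("DatabaseType", bound="Database")

    class Database:
        ...

    class Collection(Generic[DatabaseType]):
        parent: DatabaseType  # typed as the concrete Database subclass

    class MyDatabase(Database):
        ...

    class MyCollection(Collection[MyDatabase]):
        ...  # type checkers now see self.parent as MyDatabase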
linkml_store/cli.py CHANGED
@@ -11,12 +11,19 @@ from pydantic import BaseModel
  from linkml_store import Client
  from linkml_store.api import Collection, Database
  from linkml_store.api.queries import Query
+ from linkml_store.index import get_indexer
  from linkml_store.index.implementations.simple_indexer import SimpleIndexer
  from linkml_store.index.indexer import Indexer
- from linkml_store.utils.format_utils import Format, load_objects, render_output
+ from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
  from linkml_store.utils.object_utils import object_path_update
  
- index_type_option = click.option("--index-type", "-t")
+ index_type_option = click.option(
+     "--index-type",
+     "-t",
+     default="simple",
+     show_default=True,
+     help="Type of index to create. Values: simple, llm",
+ )
  
  logger = logging.getLogger(__name__)
  
@@ -70,6 +77,9 @@ class ContextSettings(BaseModel):
  format_choice = click.Choice([f.value for f in Format])
  
  
+ include_internal_option = click.option("--include-internal/--no-include-internal", default=False, show_default=True)
+
+
  @click.group()
  @click.option("--database", "-d", help="Database name")
  @click.option("--collection", "-c", help="Collection name")
@@ -89,6 +99,15 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
      if not stacktrace:
          sys.tracebacklimit = 0
      logger = logging.getLogger()
+     # Set handler for the root logger to output to the console
+     console_handler = logging.StreamHandler()
+     console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
+
+     # Clear existing handlers to avoid duplicate messages if function runs multiple times
+     logger.handlers = []
+
+     # Add the newly created console handler to the logger
+     logger.addHandler(console_handler)
      if verbose >= 2:
          logger.setLevel(logging.DEBUG)
      elif verbose == 1:
@@ -162,6 +181,7 @@ def insert(ctx, files, object, format):
          objects = yaml.safe_load(object_str)
          collection.insert(objects)
          click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+     collection.commit()
  
  
  @cli.command()
@@ -193,14 +213,107 @@ def store(ctx, files, object, format):
      click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{db.name}'.")
  
  
+ @cli.command(name="import")
+ @click.option("--format", "-f", help="Input format")
+ @click.pass_context
+ @click.argument("files", type=click.Path(exists=True), nargs=-1)
+ def import_database(ctx, files, format):
+     """Imports a database from a dump."""
+     settings = ctx.obj["settings"]
+     db = settings.database
+     if not files:
+         files = ["-"]
+     for file_path in files:
+         db.import_database(file_path, source_format=format)
+
+
  @cli.command()
- @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+ @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
+ @click.pass_context
+ def export(ctx, output_type, output):
+     """Exports a database to a dump."""
+     settings = ctx.obj["settings"]
+     db = settings.database
+     if output_type is None:
+         output_type = guess_format(output)
+     if output_type is None:
+         raise ValueError(f"Output format must be specified; it can't be inferred from {output}.")
+     db.export_database(output, target_format=output_type)
+
+
+ @cli.command()
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
+ @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+ @click.option("--other-database", "-D", required=False, help="Path to the other database")
+ @click.option("--other-collection", "-X", required=True, help="Name of the other collection")
+ @click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+ @click.pass_context
+ def diff(ctx, output, output_type, other_database, other_collection, identifier_attribute):
+     """Diffs two collections to create a patch."""
+     settings = ctx.obj["settings"]
+     db = settings.database
+     collection = settings.collection
+     if not collection:
+         raise ValueError("Collection must be specified.")
+     other_db = settings.client.get_database(other_database) if other_database else db
+     other_collection = other_db.get_collection(other_collection)
+     if identifier_attribute:
+         collection.set_identifier_attribute_name(identifier_attribute)
+         other_collection.set_identifier_attribute_name(identifier_attribute)
+     diff = collection.diff(other_collection)
+     write_output(diff, output_type, target=output)
+
+
+ @cli.command()
+ @click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+ @click.argument("patch_files", type=click.Path(exists=True), nargs=-1)
+ @click.pass_context
+ def apply(ctx, patch_files, identifier_attribute):
+     """
+     Apply a patch to a collection.
+     """
+     settings = ctx.obj["settings"]
+     collection = settings.collection
+     if not collection:
+         raise ValueError("Collection must be specified.")
+     if identifier_attribute:
+         collection.set_identifier_attribute_name(identifier_attribute)
+     for patch_file in patch_files:
+         patch_objs = load_objects(patch_file, expected_type=list)
+         collection.apply_patches(patch_objs)
+
+
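Together, diff and apply support a patch-based round trip between two collections. An illustrative invocation using the options defined above (database handles, collection names, and file names are hypothetical):

    linkml-store -d duckdb:///old.db -c persons diff -D duckdb:///new.db -X persons -I id -o patch.yaml
    linkml-store -d duckdb:///old.db -c persons apply -I id patch.yaml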
+ @cli.command()
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
  @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
  @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
  @click.option("--output", "-o", type=click.Path(), help="Output file path")
  @click.pass_context
  def query(ctx, where, limit, output_type, output):
-     """Query objects from the specified collection."""
+     """Query objects from the specified collection.
+
+     Leave the query field blank to return all objects in the collection.
+
+     Examples:
+
+         linkml-store -d duckdb:///countries.db -c countries query
+
+     Queries can be specified in YAML, as basic key-value pairs
+
+     Examples:
+
+         linkml-store -d duckdb:///countries.db -c countries query -w 'code: NZ'
+
+     More complex queries can be specified using MongoDB-style query syntax
+
+     Examples:
+
+         linkml-store -d file:. -c persons query -w 'occupation: {$ne: Architect}'
+
+     Finds all people who are not architects.
+     """
      collection = ctx.obj["settings"].collection
      where_clause = yaml.safe_load(where) if where else None
      query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
@@ -216,9 +329,10 @@ def query(ctx, where, limit, output_type, output):
  
  @cli.command()
  @click.pass_context
- def list_collections(ctx):
+ @include_internal_option
+ def list_collections(ctx, **kwargs):
      db = ctx.obj["settings"].database
-     for collection in db.list_collections():
+     for collection in db.list_collections(**kwargs):
          click.echo(collection.name)
          click.echo(render_output(collection.metadata))
  
@@ -254,7 +368,7 @@ def fq(ctx, where, limit, columns, output_type, output):
  
      def _untuple(key):
          if isinstance(key, tuple):
-             return "+".join(key)
+             return "+".join([str(x) for x in key])
          return key
  
      count_dict = {}
@@ -277,19 +391,34 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
          raise ValueError(f"Unknown index type: {index_type}")
  
  
+ @cli.command()
+ @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+ @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
+ @click.option("--output", "-o", type=click.Path(), help="Output file path")
+ @click.pass_context
+ def describe(ctx, where, output_type, output):
+     """
+     Describe the collection schema.
+     """
+     where_clause = yaml.safe_load(where) if where else None
+     collection = ctx.obj["settings"].collection
+     df = collection.find(where_clause, limit=1).rows_dataframe
+     write_output(df.describe(include="all").transpose(), output_type, target=output)
+
+
  @cli.command()
  @index_type_option
+ @click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
+ @click.option("--text-template", "-T", help="Template for text embeddings")
  @click.pass_context
- def index(ctx, index_type):
+ def index(ctx, index_type, **kwargs):
      """
      Create an index over a collection.
  
-     :param ctx:
-     :param index_type:
-     :return:
+     By default a simple trigram index is used.
      """
      collection = ctx.obj["settings"].collection
-     ix = _get_index(index_type)
+     ix = get_indexer(index_type, **kwargs)
      collection.attach_indexer(ix)
  
@@ -322,14 +451,17 @@ def schema(ctx, output_type, output):
  @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
  @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
  @click.option("--output", "-o", type=click.Path(), help="Output file path")
+ @click.option(
+     "--auto-index/--no-auto-index", default=False, show_default=True, help="Automatically index the collection"
+ )
  @index_type_option
  @click.pass_context
- def search(ctx, search_term, where, limit, index_type, output_type, output):
+ def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
      """Search objects in the specified collection."""
      collection = ctx.obj["settings"].collection
-     ix = _get_index(index_type)
+     ix = get_indexer(index_type)
      logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
-     collection.attach_indexer(ix, auto_index=False)
+     collection.attach_indexer(ix, auto_index=auto_index)
      result = collection.search(search_term, where=where, limit=limit)
      output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
      if output:
@@ -343,6 +475,9 @@ def search(ctx, search_term, where, limit, index_type, output_type, output):
  
  @cli.command()
  @click.pass_context
  def indexes(ctx):
+     """
+     Show the indexes for a collection.
+     """
      collection = ctx.obj["settings"].collection
      for name, ix in collection.indexers.items():
          click.echo(f"{name}: {type(ix)}\n{ix.model_json()}")
linkml_store/index/__init__.py CHANGED
@@ -22,7 +22,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
      return INDEXER_CLASSES[name]
  
  
- def get_indexer(name: str, *args, **kwargs) -> Indexer:
+ def get_indexer(name: str, **kwargs) -> Indexer:
      """
      Get an indexer by name.
  
@@ -30,4 +30,8 @@ def get_indexer(name: str, *args, **kwargs) -> Indexer:
      :param kwargs: additional arguments to pass to the indexer
      :return: the indexer
      """
-     return get_indexer_class(name)(*args, **kwargs)
+     kwargs = {k: v for k, v in kwargs.items() if v is not None}
+     cls = get_indexer_class(name)
+     kwargs["name"] = name
+     indexer = cls(**kwargs)
+     return indexer
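Because None-valued kwargs are stripped before construction, CLI options that were not supplied simply fall back to the indexer class's own defaults. A small sketch; the option values are illustrative:

    from linkml_store.index import get_indexer

    # --text-template was not supplied, so it arrives as None and is dropped;
    # the indexer is built with name="llm" plus the remaining keyword arguments
    ix = get_indexer("llm", text_template=None, cached_embeddings_database="embeddings.db")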