linkml-store 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (33)
  1. linkml_store/api/client.py +32 -3
  2. linkml_store/api/collection.py +231 -30
  3. linkml_store/api/config.py +10 -2
  4. linkml_store/api/database.py +305 -19
  5. linkml_store/api/stores/chromadb/__init__.py +7 -0
  6. linkml_store/api/stores/chromadb/chromadb_collection.py +8 -1
  7. linkml_store/api/stores/duckdb/__init__.py +16 -0
  8. linkml_store/api/stores/duckdb/duckdb_collection.py +11 -9
  9. linkml_store/api/stores/duckdb/duckdb_database.py +22 -8
  10. linkml_store/api/stores/duckdb/mappings.py +1 -0
  11. linkml_store/api/stores/filesystem/__init__.py +16 -0
  12. linkml_store/api/stores/filesystem/filesystem_collection.py +142 -0
  13. linkml_store/api/stores/filesystem/filesystem_database.py +36 -0
  14. linkml_store/api/stores/hdf5/__init__.py +7 -0
  15. linkml_store/api/stores/hdf5/hdf5_collection.py +1 -1
  16. linkml_store/api/stores/mongodb/__init__.py +25 -0
  17. linkml_store/api/stores/mongodb/mongodb_collection.py +29 -8
  18. linkml_store/api/stores/solr/__init__.py +3 -0
  19. linkml_store/api/stores/solr/solr_collection.py +2 -1
  20. linkml_store/api/stores/solr/solr_database.py +1 -0
  21. linkml_store/cli.py +64 -10
  22. linkml_store/index/__init__.py +6 -2
  23. linkml_store/index/implementations/llm_indexer.py +83 -5
  24. linkml_store/index/implementations/simple_indexer.py +2 -2
  25. linkml_store/index/indexer.py +32 -8
  26. linkml_store/utils/format_utils.py +52 -2
  27. linkml_store/utils/object_utils.py +9 -1
  28. {linkml_store-0.1.6.dist-info → linkml_store-0.1.8.dist-info}/METADATA +4 -1
  29. linkml_store-0.1.8.dist-info/RECORD +45 -0
  30. linkml_store-0.1.6.dist-info/RECORD +0 -41
  31. {linkml_store-0.1.6.dist-info → linkml_store-0.1.8.dist-info}/LICENSE +0 -0
  32. {linkml_store-0.1.6.dist-info → linkml_store-0.1.8.dist-info}/WHEEL +0 -0
  33. {linkml_store-0.1.6.dist-info → linkml_store-0.1.8.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/duckdb/duckdb_database.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+from pathlib import Path
 from typing import Optional
 
 import pandas as pd
@@ -22,6 +23,7 @@ TYPE_MAP = {
     "DATE": "date",
     "DOUBLE": "float",
     "INTEGER": "integer",
+    "JSON": "Any",
 }
 
 
@@ -33,9 +35,13 @@ class DuckDBDatabase(Database):
     _engine: sqlalchemy.Engine = None
     collection_class = DuckDBCollection
 
-    def __init__(self, handle: Optional[str] = None, **kwargs):
+    def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
         if handle is None:
            handle = "duckdb:///:memory:"
+        if recreate_if_exists:
+            path = Path(handle.replace("duckdb:///", ""))
+            if path.exists():
+                path.unlink()
         super().__init__(handle=handle, **kwargs)
 
     @property
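The new `recreate_if_exists` flag strips the `duckdb:///` prefix and unlinks the file before connecting, giving a clean database on each run. A minimal usage sketch (the path is illustrative):

```python
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase

# any existing file at this path is removed before the engine connects;
# the in-memory handle is unaffected, since ":memory:" never exists on disk
db = DuckDBDatabase(handle="duckdb:///tmp/example.db", recreate_if_exists=True)
```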
@@ -69,16 +75,19 @@ class DuckDBDatabase(Database):
         if qr.num_rows == 0:
             logger.debug(f"Table {query.from_table} not created yet")
             return QueryResult(query=query, num_rows=0, rows=[])
-        sv = self._schema_view
+        if not query.from_table.startswith("information_schema"):
+            sv = self.schema_view
+        else:
+            sv = None
         if sv:
             cd = None
             for c in self._collections.values():
-                if c.name == query.from_table:
+                if c.name == query.from_table or c.metadata.alias == query.from_table:
                     cd = c.class_definition()
                     break
             if cd:
-                for att in cd.attributes.values():
-                    if att.inlined:
+                for att in sv.class_induced_slots(cd.name):
+                    if att.inlined or att.inlined_as_list:
                         json_encoded_cols.append(att.name)
         with self.engine.connect() as conn:
             count_query_str = text(query_to_sql(query, count=True))
@@ -107,7 +116,10 @@ class DuckDBDatabase(Database):
 
     def init_collections(self):
         # TODO: unify schema introspection
-        schema = introspect_schema(self.engine)
+        if not self.schema_view:
+            schema = introspect_schema(self.engine)
+        else:
+            schema = self.schema_view.schema
         table_names = schema.classes.keys()
         if self._collections is None:
             self._collections = {}
@@ -119,7 +131,7 @@ class DuckDBDatabase(Database):
     def induce_schema_view(self) -> SchemaView:
         # TODO: unify schema introspection
         # TODO: handle case where schema is provided in advance
-        logger.info(f"Inducing schema view for {self.metadata.handle}")
+        logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
         sb = SchemaBuilder()
         schema = sb.schema
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
@@ -144,8 +156,10 @@ class DuckDBDatabase(Database):
             sd = SlotDefinition(
                 row["column_name"], required=row["is_nullable"] == "NO", multivalued=multivalued, range=rng
             )
+            if dt == "JSON":
+                sd.inlined_as_list = True
             sb.schema.classes[tbl_name].attributes[sd.name] = sd
-            logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range}")
+            logger.info(f"Introspected slot: {tbl_name}.{sd.name}: {sd.range} FROM {dt}")
         sb.add_defaults()
         for cls_name in schema.classes:
             if cls_name in self.metadata.collections:
linkml_store/api/stores/duckdb/mappings.py CHANGED
@@ -3,5 +3,6 @@ import sqlalchemy as sqla
 TMAP = {
     "string": sqla.String,
     "integer": sqla.Integer,
+    "float": sqla.Float,
     "linkml:Any": sqla.JSON,
 }
linkml_store/api/stores/duckdb/__init__.py CHANGED
@@ -0,0 +1,16 @@
+"""
+Adapter for DuckDB embedded database.
+
+Handles have the form:
+
+- ``duckdb:///<path>`` for a file-based database
+- ``duckdb:///:memory:`` for an in-memory database
+"""
+
+from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
+
+__all__ = [
+    "DuckDBCollection",
+    "DuckDBDatabase",
+]
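The handle forms documented above plug directly into the top-level client. A minimal sketch, assuming the `Client.attach_database` entry point from `linkml_store/api/client.py`:

```python
from linkml_store import Client

client = Client()
db_file = client.attach_database("duckdb:///my_data.db")  # file-backed (path illustrative)
db_mem = client.attach_database("duckdb:///:memory:")     # transient in-memory database
```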
linkml_store/api/stores/filesystem/filesystem_collection.py ADDED
@@ -0,0 +1,142 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+import sqlalchemy as sqla
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from sqlalchemy import Column, Table, delete, insert, inspect, text
+from sqlalchemy.sql.ddl import CreateTable
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query
+from linkml_store.api.stores.duckdb.mappings import TMAP
+from linkml_store.utils.sql_utils import facet_count_sql
+
+logger = logging.getLogger(__name__)
+
+
+class FileSystemCollection(Collection):
+    _table_created: bool = None
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        if not isinstance(objs, list):
+            objs = [objs]
+        if not objs:
+            return
+        cd = self.class_definition()
+        if not cd:
+            cd = self.induce_class_definition_from_objects(objs)
+        self._create_table(cd)
+        table = self._sqla_table(cd)
+        logger.info(f"Inserting into: {self.alias} // T={table.name}")
+        engine = self.parent.engine
+        col_names = [c.name for c in table.columns]
+        objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+        with engine.connect() as conn:
+            with conn.begin():
+                conn.execute(insert(table), objs)
+            conn.commit()
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+        if not isinstance(objs, list):
+            objs = [objs]
+        cd = self.class_definition()
+        if not cd:
+            cd = self.induce_class_definition_from_objects(objs)
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        with engine.connect() as conn:
+            for obj in objs:
+                conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                stmt = delete(table).where(*conditions)
+                stmt = stmt.compile(engine)
+                conn.execute(stmt)
+            conn.commit()
+        return
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        cd = self.class_definition()
+        if not cd:
+            logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated")
+            return 0
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        inspector = inspect(engine)
+        table_exists = table.name in inspector.get_table_names()
+        if not table_exists:
+            logger.info(f"Table {table.name} does not exist, assuming no data")
+            return 0
+        with engine.connect() as conn:
+            conditions = [table.c[k] == v for k, v in where.items()]
+            stmt = delete(table).where(*conditions)
+            stmt = stmt.compile(engine)
+            result = conn.execute(stmt)
+            deleted_rows_count = result.rowcount
+            if deleted_rows_count == 0 and not missing_ok:
+                raise ValueError(f"No rows found for {where}")
+            conn.commit()
+        return deleted_rows_count if deleted_rows_count > -1 else None
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, Dict[str, int]]:
+        results = {}
+        cd = self.class_definition()
+        with self.parent.engine.connect() as conn:
+            if not facet_columns:
+                facet_columns = list(self.class_definition().attributes.keys())
+            for col in facet_columns:
+                logger.debug(f"Faceting on {col}")
+                if isinstance(col, tuple):
+                    sd = SlotDefinition(name="PLACEHOLDER")
+                else:
+                    sd = cd.attributes[col]
+                facet_query = self._create_query(where_clause=where)
+                facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                logger.debug(f"Facet query: {facet_query_str}")
+                rows = list(conn.execute(text(facet_query_str)))
+                results[col] = rows
+        return results
+
+    def _sqla_table(self, cd: ClassDefinition) -> Table:
+        schema_view = self.parent.schema_view
+        metadata_obj = sqla.MetaData()
+        cols = []
+        for att in schema_view.class_induced_slots(cd.name):
+            typ = TMAP.get(att.range, sqla.String)
+            if att.inlined:
+                typ = sqla.JSON
+            if att.multivalued:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            if att.array:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            col = Column(att.name, typ)
+            cols.append(col)
+        t = Table(self.alias, metadata_obj, *cols)
+        return t
+
+    def _create_table(self, cd: ClassDefinition):
+        if self._table_created or self.metadata.is_prepopulated:
+            logger.info(f"Already have table for: {cd.name}")
+            return
+        query = Query(
+            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        )
+        qr = self.parent.query(query)
+        if qr.num_rows > 0:
+            logger.info(f"Table already exists for {cd.name}")
+            self._table_created = True
+            self.metadata.is_prepopulated = True
+            return
+        logger.info(f"Creating table for {cd.name}")
+        t = self._sqla_table(cd)
+        ct = CreateTable(t)
+        ddl = str(ct.compile(self.parent.engine))
+        with self.parent.engine.connect() as conn:
+            conn.execute(text(ddl))
+            conn.commit()
+        self._table_created = True
+        self.metadata.is_prepopulated = True
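`_create_table` renders dialect-specific `CREATE TABLE` DDL by compiling a SQLAlchemy `CreateTable` construct against the parent engine. The same pattern in isolation, as a sketch assuming the `duckdb-engine` SQLAlchemy dialect is installed (table and column names are placeholders):

```python
import sqlalchemy as sqla
from sqlalchemy import Column, Table, text
from sqlalchemy.sql.ddl import CreateTable

engine = sqla.create_engine("duckdb:///:memory:")
metadata_obj = sqla.MetaData()
t = Table("persons", metadata_obj, Column("id", sqla.String), Column("age", sqla.Integer))

ddl = str(CreateTable(t).compile(engine))  # dialect-aware CREATE TABLE string
with engine.connect() as conn:
    conn.execute(text(ddl))
    conn.commit()
```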
linkml_store/api/stores/filesystem/filesystem_database.py ADDED
@@ -0,0 +1,36 @@
+import logging
+from typing import Optional
+
+from linkml_store.api import Collection, Database
+from linkml_store.api.config import CollectionConfig
+from linkml_store.api.stores.duckdb import DuckDBDatabase
+from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+
+logger = logging.getLogger(__name__)
+
+
+class FileSystemDatabase(Database):
+    collection_class = FileSystemCollection
+    wrapped_database: Database = None
+
+    def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
+        self.wrapped_database = DuckDBDatabase("duckdb:///:memory:")
+        super().__init__(handle=handle, **kwargs)
+
+    def commit(self, **kwargs):
+        # TODO: sync
+        pass
+
+    def close(self, **kwargs):
+        self.wrapped_database.close()
+
+    def create_collection(
+        self,
+        name: str,
+        alias: Optional[str] = None,
+        metadata: Optional[CollectionConfig] = None,
+        recreate_if_exists=False,
+        **kwargs,
+    ) -> Collection:
+        wd = self.wrapped_database
+        wd.create_collection()
linkml_store/api/stores/hdf5/__init__.py CHANGED
@@ -0,0 +1,7 @@
+"""
+Adapter for HDF5 file storage.
+
+.. warning::
+
+    Experimental support for HDF5 storage.
+"""
linkml_store/api/stores/hdf5/hdf5_collection.py CHANGED
@@ -46,7 +46,7 @@ class HDF5Collection(Collection):
         return count
 
     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
-        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
         if where is None:
             where = {}
         results = self.query(Query(where_clause=where)).rows
linkml_store/api/stores/mongodb/__init__.py CHANGED
@@ -0,0 +1,25 @@
+"""
+Adapter for MongoDB document store.
+
+Handles have the form: ``mongodb://<host>:<port>/<database>``
+
+To use this, you must have the `pymongo` extra installed.
+
+.. code-block:: bash
+
+    pip install linkml-store[mongodb]
+
+or
+
+.. code-block:: bash
+
+    pip install linkml-store[all]
+"""
+
+from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
+from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
+
+__all__ = [
+    "MongoDBCollection",
+    "MongoDBDatabase",
+]
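As documented above, the handle names the host, port, and database. A minimal sketch of attaching one through the client (connection details illustrative; `persons` is a hypothetical collection name):

```python
from linkml_store import Client

client = Client()
db = client.attach_database("mongodb://localhost:27017/demo_db")
collection = db.get_collection("persons")  # hypothetical collection
```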
linkml_store/api/stores/mongodb/mongodb_collection.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+from copy import copy
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from linkml_runtime.linkml_model import SlotDefinition
@@ -12,6 +13,14 @@ logger = logging.getLogger(__name__)
 
 
 class MongoDBCollection(Collection):
+    """
+    Adapter for collections in a MongoDB database.
+
+    .. note::
+
+        You should not use or manipulate this class directly.
+        Instead, use the general :class:`linkml_store.api.Collection`
+    """
 
     @property
     def mongo_collection(self) -> MongoCollection:
@@ -31,7 +40,12 @@ class MongoDBCollection(Collection):
         else:
             cursor = self.mongo_collection.find(mongo_filter)
 
-        rows = list(cursor)
+        def _as_row(row: dict):
+            row = copy(row)
+            del row["_id"]
+            return row
+
+        rows = [_as_row(row) for row in cursor]
         count = self.mongo_collection.count_documents(mongo_filter)
 
         return QueryResult(query=query, num_rows=count, rows=rows)
@@ -56,24 +70,31 @@
             if isinstance(col, tuple):
                 sd = SlotDefinition(name="PLACEHOLDER")
             else:
-                sd = cd.attributes[col]
-
-            if sd.multivalued:
+                if col in cd.attributes:
+                    sd = cd.attributes[col]
+                else:
+                    logger.info(f"No schema metadata for {col}")
+                    sd = SlotDefinition(name=col)
+            group = {"$group": {"_id": f"${col}", "count": {"$sum": 1}}}
+            if isinstance(col, tuple):
+                q = {k.replace(".", ""): f"${k}" for k in col}
+                group["$group"]["_id"] = q
+            if sd and sd.multivalued:
                 facet_pipeline = [
                     {"$match": where} if where else {"$match": {}},
                     {"$unwind": f"${col}"},
-                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    group,
                     {"$sort": {"count": -1}},
                     {"$limit": facet_limit},
                 ]
             else:
                 facet_pipeline = [
                     {"$match": where} if where else {"$match": {}},
-                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    group,
                     {"$sort": {"count": -1}},
                     {"$limit": facet_limit},
                 ]
-
+            logger.info(f"Facet pipeline: {facet_pipeline}")
             facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
             results[col] = [(result["_id"], result["count"]) for result in facet_results]
 
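With the `$group` stage factored out, a tuple of facet columns groups on a document of field references rather than a single path. An illustrative pipeline for faceting on the pair `("genus", "species")` (field names hypothetical, limit arbitrary):

```python
# what query_facets builds for col = ("genus", "species") with no where clause
facet_pipeline = [
    {"$match": {}},
    {"$group": {"_id": {"genus": "$genus", "species": "$species"}, "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 100},
]
```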
@@ -92,7 +113,7 @@
         return result.deleted_count
 
     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
-        logger.info(f"Deleting from {self._target_class_name} where: {where}")
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
         if where is None:
             where = {}
         result = self.mongo_collection.delete_many(where)
linkml_store/api/stores/solr/__init__.py CHANGED
@@ -0,0 +1,3 @@
+"""
+Wrapper for Solr endpoints.
+"""
linkml_store/api/stores/solr/solr_collection.py CHANGED
@@ -5,6 +5,7 @@ from copy import copy
 from typing import Any, Dict, List, Optional, Union
 
 import requests
+
 from linkml_store.api import Collection
 from linkml_store.api.collection import DEFAULT_FACET_LIMIT
 from linkml_store.api.queries import Query, QueryResult
@@ -119,7 +120,7 @@ class SolrCollection(Collection):
         conditions = []
         if self.parent.metadata.collection_type_slot:
             where_clause = copy(where_clause)
-            where_clause[self.parent.metadata.collection_type_slot] = self._alias
+            where_clause[self.parent.metadata.collection_type_slot] = self.alias
         for field, value in where_clause.items():
             if not isinstance(value, (list, tuple)):
                 value = [value]
linkml_store/api/stores/solr/solr_database.py CHANGED
@@ -2,6 +2,7 @@ import logging
 from typing import Optional
 
 import requests
+
 from linkml_store.api import Collection, Database
 from linkml_store.api.config import CollectionConfig
 from linkml_store.api.queries import Query, QueryResult
linkml_store/cli.py CHANGED
@@ -11,12 +11,19 @@ from pydantic import BaseModel
 from linkml_store import Client
 from linkml_store.api import Collection, Database
 from linkml_store.api.queries import Query
+from linkml_store.index import get_indexer
 from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
-from linkml_store.utils.format_utils import Format, load_objects, render_output
+from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output
 from linkml_store.utils.object_utils import object_path_update
 
-index_type_option = click.option("--index-type", "-t")
+index_type_option = click.option(
+    "--index-type",
+    "-t",
+    default="simple",
+    show_default=True,
+    help="Type of index to create. Values: simple, llm",
+)
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +77,9 @@ class ContextSettings(BaseModel):
 format_choice = click.Choice([f.value for f in Format])
 
 
+include_internal_option = click.option("--include-internal/--no-include-internal", default=False, show_default=True)
+
+
 @click.group()
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
@@ -89,6 +99,15 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if not stacktrace:
         sys.tracebacklimit = 0
     logger = logging.getLogger()
+    # Set handler for the root logger to output to the console
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
+
+    # Clear existing handlers to avoid duplicate messages if function runs multiple times
+    logger.handlers = []
+
+    # Add the newly created console handler to the logger
+    logger.addHandler(console_handler)
     if verbose >= 2:
         logger.setLevel(logging.DEBUG)
     elif verbose == 1:
@@ -193,6 +212,35 @@ def store(ctx, files, object, format):
     click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{db.name}'.")
 
 
+@cli.command(name="import")
+@click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--format", "-f", help="Input format")
+@click.pass_context
+def import_database(ctx, files, format):
+    """Imports a database from a dump."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    if not files and not object:
+        files = ["-"]
+    for file_path in files:
+        db.import_database(file_path, source_format=format)
+
+
+@cli.command()
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
+@click.pass_context
+def export(ctx, output_type, output):
+    """Exports a database to a dump."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    if output_type is None:
+        output_type = guess_format(output)
+    if output_type is None:
+        raise ValueError(f"Output format must be specified can't be inferred from {output}.")
+    db.export_database(output, target_format=output_type)
+
+
 @cli.command()
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@@ -216,9 +264,10 @@ def query(ctx, where, limit, output_type, output):
 
 @cli.command()
 @click.pass_context
-def list_collections(ctx):
+@include_internal_option
+def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
-    for collection in db.list_collections():
+    for collection in db.list_collections(**kwargs):
         click.echo(collection.name)
         click.echo(render_output(collection.metadata))
 
@@ -254,7 +303,7 @@ def fq(ctx, where, limit, columns, output_type, output):
 
     def _untuple(key):
         if isinstance(key, tuple):
-            return "+".join(key)
+            return "+".join([str(x) for x in key])
         return key
 
     count_dict = {}
@@ -279,8 +328,10 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
 
 @cli.command()
 @index_type_option
+@click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
+@click.option("--text-template", "-T", help="Template for text embeddings")
 @click.pass_context
-def index(ctx, index_type):
+def index(ctx, index_type, **kwargs):
     """
     Create an index over a collection.
 
@@ -289,7 +340,7 @@ def index(ctx, index_type):
     :return:
     """
     collection = ctx.obj["settings"].collection
-    ix = _get_index(index_type)
+    ix = get_indexer(index_type, **kwargs)
     collection.attach_indexer(ix)
 
 
@@ -322,14 +373,17 @@ def schema(ctx, output_type, output):
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--auto-index/--no-auto-index", default=False, show_default=True, help="Automatically index the collection"
+)
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output):
+def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
-    ix = _get_index(index_type)
+    ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
-    collection.attach_indexer(ix, auto_index=False)
+    collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
linkml_store/index/__init__.py CHANGED
@@ -22,7 +22,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     return INDEXER_CLASSES[name]
 
 
-def get_indexer(name: str, *args, **kwargs) -> Indexer:
+def get_indexer(name: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.
 
@@ -30,4 +30,8 @@ def get_indexer(name: str, *args, **kwargs) -> Indexer:
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """
-    return get_indexer_class(name)(*args, **kwargs)
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    cls = get_indexer_class(name)
+    kwargs["name"] = name
+    indexer = cls(**kwargs)
+    return indexer
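The revised factory drops `None`-valued keyword arguments, so unset CLI options fall through to the indexer's own defaults, and it passes the registry name to the constructor. A usage sketch, assuming the built-in `simple` indexer:

```python
from linkml_store.index import get_indexer

# text_template=None is discarded rather than overriding the default;
# the instance is constructed as cls(name="simple")
indexer = get_indexer("simple", text_template=None)
```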