linkml-store 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/api/client.py +63 -7
- linkml_store/api/collection.py +138 -30
- linkml_store/api/config.py +48 -6
- linkml_store/api/database.py +45 -27
- linkml_store/api/stores/duckdb/duckdb_collection.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +16 -2
- linkml_store/api/stores/filesystem/filesystem_collection.py +11 -4
- linkml_store/api/stores/filesystem/filesystem_database.py +10 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +6 -2
- linkml_store/api/stores/mongodb/mongodb_database.py +1 -36
- linkml_store/api/stores/solr/solr_collection.py +4 -4
- linkml_store/cli.py +35 -17
- linkml_store/index/__init__.py +16 -2
- linkml_store/index/implementations/llm_indexer.py +2 -1
- linkml_store/index/indexer.py +13 -2
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +68 -7
- linkml_store/utils/pandas_utils.py +40 -0
- linkml_store/utils/sql_utils.py +2 -1
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.11.dist-info}/METADATA +36 -3
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.11.dist-info}/RECORD +24 -22
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.11.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.11.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.11.dist-info}/entry_points.txt +0 -0

linkml_store/api/stores/duckdb/duckdb_database.py
CHANGED

@@ -26,6 +26,8 @@ TYPE_MAP = {
     "JSON": "Any",
 }

+MEMORY_HANDLE = "duckdb:///:memory:"
+

 logger = logging.getLogger(__name__)

@@ -49,7 +51,7 @@ class DuckDBDatabase(Database):

     def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
         if handle is None:
-            handle = "duckdb:///:memory:"
+            handle = MEMORY_HANDLE
         if recreate_if_exists:
             path = Path(handle.replace("duckdb:///", ""))
             if path.exists():
@@ -76,6 +78,17 @@ class DuckDBDatabase(Database):
     def close(self, **kwargs):
         self.engine.dispose()

+    def drop(self, missing_ok=True, **kwargs):
+        self.close()
+        if self.handle == MEMORY_HANDLE:
+            return
+        path = Path(self.handle.replace("duckdb:///", ""))
+        if path.exists():
+            path.unlink()
+        else:
+            if not missing_ok:
+                raise FileNotFoundError(f"Database file not found: {path}")
+
     def query(self, query: Query, **kwargs) -> QueryResult:
         json_encoded_cols = []
         if query.from_table:
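
The new drop method closes the engine, treats the in-memory handle as a no-op, and otherwise unlinks the database file. A minimal usage sketch, assuming the import path shown in this diff and an illustrative /tmp path:

    from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase

    mem_db = DuckDBDatabase()  # handle defaults to MEMORY_HANDLE, so drop() never touches disk
    mem_db.drop()

    file_db = DuckDBDatabase(handle="duckdb:////tmp/example.db")  # illustrative path
    file_db.drop(missing_ok=True)   # removes /tmp/example.db if present, silent otherwise
    # file_db.drop(missing_ok=False) would raise FileNotFoundError if the file is absent
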
@@ -94,7 +107,8 @@ class DuckDBDatabase(Database):
         if sv:
             cd = None
             for c in self._collections.values():
-                if c.name == query.from_table or c.metadata.alias == query.from_table:
+                # if c.name == query.from_table or c.metadata.alias == query.from_table:
+                if c.alias == query.from_table or c.target_class_name == query.from_table:
                     cd = c.class_definition()
                     break
             if cd:

linkml_store/api/stores/filesystem/filesystem_collection.py
CHANGED

@@ -31,7 +31,7 @@ class FileSystemCollection(Collection[DatabaseType]):

     @property
     def path_to_file(self):
-        return Path(self.parent.directory_path) / f"{self.name}.{self.file_format}"
+        return Path(self.parent.directory_path) / f"{self.alias}.{self.file_format}"

     @property
     def objects_as_list(self) -> List[OBJECT]:
@@ -150,13 +150,20 @@ class FileSystemCollection(Collection[DatabaseType]):
         curr_objects = [o for o in self.objects_as_list if not matches(o)]
         self._set_objects(curr_objects)

-    def query(self, query: Query, **kwargs) -> QueryResult:
-
+    def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
+        limit = limit or query.limit
+        offset = offset or query.offset
+        if offset is None:
+            offset = 0
         where = query.where_clause or {}
         match = mongo_query_to_match_function(where)
         rows = [o for o in self.objects_as_list if match(o)]
         count = len(rows)
-        return QueryResult(query=query, num_rows=count, rows=rows)
+        if limit is None or limit < 0:
+            limit = count
+        # TODO: avoid recalculating
+        returned_row = rows[offset : offset + limit]
+        return QueryResult(query=query, num_rows=count, rows=returned_row)

     def query_facets(
         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
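
Note that num_rows reports the total match count while rows holds only the requested page; a missing or negative limit returns everything. The slicing arithmetic in isolation, as a self-contained sketch:

    def paginate(rows, limit=None, offset=None):
        # Mirrors the slicing added to FileSystemCollection.query
        offset = offset or 0
        count = len(rows)
        if limit is None or limit < 0:
            limit = count
        return count, rows[offset : offset + limit]

    total, page = paginate(list(range(10)), limit=3, offset=4)
    assert (total, page) == (10, [4, 5, 6])
    total, page = paginate(list(range(10)), limit=-1)
    assert page == list(range(10))  # negative limit means "no limit"
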

linkml_store/api/stores/filesystem/filesystem_database.py
CHANGED

@@ -9,6 +9,7 @@ from linkml_runtime import SchemaView
 from linkml_store.api import Database
 from linkml_store.api.config import DatabaseConfig
 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.utils.file_utils import safe_remove_directory
 from linkml_store.utils.format_utils import Format, load_objects

 logger = logging.getLogger(__name__)
@@ -20,6 +21,8 @@ class FileSystemDatabase(Database):
     directory_path: Optional[Path] = None
     default_file_format: Optional[str] = None

+    no_backup_on_drop: bool = False
+
     def __init__(self, handle: Optional[str] = None, **kwargs):
         handle = handle.replace("file:", "")
         if handle.startswith("//"):
@@ -43,6 +46,12 @@ class FileSystemDatabase(Database):
     def close(self, **kwargs):
         pass

+    def drop(self, no_backup=False, **kwargs):
+        self.close()
+        path = self.directory_path
+        if path.exists():
+            safe_remove_directory(path, no_backup=self.no_backup_on_drop or no_backup)
+
     def init_collections(self):
         metadata = self.metadata
         if self._collections is None:
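
drop delegates to safe_remove_directory (the new utility added later in this diff), so by default the store directory is preserved as a timestamped backup; no_backup=True, or the new no_backup_on_drop field, skips that. A hedged sketch with an illustrative handle:

    from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase

    db = FileSystemDatabase(handle="file:/tmp/my_store")  # illustrative path
    db.drop()                  # directory moved aside to /tmp/my_store_backup_<timestamp>
    # db.drop(no_backup=True)  # directory removed via a TemporaryDirectory, no backup kept
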
@@ -63,7 +72,7 @@ class FileSystemDatabase(Database):
             self._collections[n] = collection
             collection._set_objects(objs)

-    def induce_schema_view(self) -> SchemaView:
+    def xxxinduce_schema_view(self) -> SchemaView:
         logger.info(f"Inducing schema view for {self.handle}")
         sb = SchemaBuilder()


linkml_store/api/stores/mongodb/mongodb_collection.py
CHANGED

@@ -23,11 +23,15 @@ class MongoDBCollection(Collection):

     @property
     def mongo_collection(self) -> MongoCollection:
-
+        # collection_name = self.alias or self.name
+        collection_name = self.alias
+        if not collection_name:
             raise ValueError("Collection name not set")
-        collection_name = self.alias or self.name
         return self.parent.native_db[collection_name]

+    def _check_if_initialized(self) -> bool:
+        return self.alias in self.parent.native_db.list_collection_names()
+
     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
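
_check_if_initialized relies on pymongo's list_collection_names(), which reports only collections that exist server-side, so a collection that has never received an insert reads as uninitialized. The same check in plain pymongo, as an illustrative sketch:

    from pymongo import MongoClient

    client = MongoClient()  # assumes a local mongod; illustrative only
    native_db = client["example_db"]

    def check_if_initialized(alias: str) -> bool:
        # mirrors MongoDBCollection._check_if_initialized
        return alias in native_db.list_collection_names()

    print(check_if_initialized("persons"))  # False until a first insert creates the collection
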

linkml_store/api/stores/mongodb/mongodb_database.py
CHANGED

@@ -3,9 +3,6 @@
 import logging
 from typing import Optional

-from linkml_runtime import SchemaView
-from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
-from linkml_runtime.utils.schema_builder import SchemaBuilder
 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase

@@ -63,10 +60,9 @@ class MongoDBDatabase(Database):
         self._native_client.close()

     def drop(self, **kwargs):
-        self.native_client.drop_database(self.
+        self.native_client.drop_database(self.native_db.name)

     def query(self, query: Query, **kwargs) -> QueryResult:
-        # TODO: DRY
         if query.from_table:
             collection = self.get_collection(query.from_table)
             return collection.query(query, **kwargs)
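
Deriving the database name from the native handle avoids relying on a separately stored attribute; the equivalent calls in plain pymongo:

    from pymongo import MongoClient

    client = MongoClient()                 # illustrative local connection
    native_db = client["example_db"]
    client.drop_database(native_db.name)   # what MongoDBDatabase.drop now does
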
@@ -81,34 +77,3 @@
         if collection_name not in self._collections:
             collection = MongoDBCollection(name=collection_name, parent=self)
             self._collections[collection_name] = collection
-
-    def induce_schema_view(self) -> SchemaView:
-        logger.info(f"Inducing schema view for {self.handle}")
-        sb = SchemaBuilder()
-        schema = sb.schema
-
-        for collection_name in self.native_db.list_collection_names():
-            sb.add_class(collection_name)
-            mongo_collection = self.native_db[collection_name]
-            sample_doc = mongo_collection.find_one()
-            if sample_doc:
-                for field, value in sample_doc.items():
-                    if field == "_id":
-                        continue
-                    sd = SlotDefinition(field)
-                    if isinstance(value, list):
-                        sd.multivalued = True
-                    if isinstance(value, dict):
-                        sd.inlined = True
-                    sb.schema.classes[collection_name].attributes[sd.name] = sd
-
-        sb.add_defaults()
-        for cls_name in schema.classes:
-            if cls_name in self.metadata.collections:
-                collection_metadata = self.metadata.collections[cls_name]
-                if collection_metadata.attributes:
-                    del schema.classes[cls_name]
-                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
-                    schema.classes[cls.name] = cls
-
-        return SchemaView(schema)

linkml_store/api/stores/solr/solr_collection.py
CHANGED

@@ -18,7 +18,7 @@ class SolrCollection(Collection):
     @property
     def _collection_base(self) -> str:
         if self.parent.use_cores:
-            base_url = f"{self.parent.base_url}/{self.name}"
+            base_url = f"{self.parent.base_url}/{self.alias}"
         else:
             base_url = self.parent.base_url
         return base_url
@@ -37,7 +37,7 @@ class SolrCollection(Collection):
         if not qfs:
             raise ValueError("No searchable slots configured for Solr collection")
         solr_query = self._build_solr_query(where, search_term=query, extra={"defType": index_name, "qf": qfs})
-        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")

         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()
@@ -50,7 +50,7 @@ class SolrCollection(Collection):

     def query(self, query: Query, **kwargs) -> QueryResult:
         solr_query = self._build_solr_query(query)
-        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")

         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()
@@ -69,7 +69,7 @@ class SolrCollection(Collection):
         solr_query["facet.field"] = facet_columns
         solr_query["facet.limit"] = facet_limit

-        logger.info(f"Querying Solr collection {self.name} for facets with query: {solr_query}")
+        logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")

         response = requests.get(f"{self._collection_base}/select", params=solr_query)
         response.raise_for_status()

linkml_store/cli.py
CHANGED

@@ -16,6 +16,7 @@ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
+from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted

 index_type_option = click.option(
     "--index-type",
@@ -87,6 +88,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
 @click.option("-q", "--quiet/--no-quiet")
+@click.option("--base-dir", "-B", help="Base directory for the client configuration")
 @click.option(
     "--stacktrace/--no-stacktrace",
     default=False,
@@ -94,7 +96,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -117,7 +119,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if quiet:
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
-    client = Client().from_config(config) if config else Client()
+    client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
     # DEPRECATED
@@ -150,7 +152,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     # raise ValueError("Collection must be specified if there are multiple collections.")
     if settings.database and settings.database.list_collections():
         collection = settings.database.list_collections()[0]
-        settings.collection_name = collection.name
+        settings.collection_name = collection.alias


 @cli.command()
@@ -180,15 +182,15 @@ def insert(ctx, files, object, format):
             objects = load_objects(file_path, format=format)
         else:
             objects = load_objects(file_path)
-        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         collection.insert(objects)
-        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.name}'.")
+        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
             collection.insert(objects)
-            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()

@@ -324,7 +326,7 @@ def query(ctx, where, limit, output_type, output):
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
-    query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
+    query = Query(from_table=collection.alias, where_clause=where_clause, limit=limit)
     result = collection.query(query)
     output_data = render_output(result.rows, output_type)
     if output:
@@ -341,7 +343,7 @@ def query(ctx, where, limit, output_type, output):
 def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
     for collection in db.list_collections(**kwargs):
-        click.echo(collection.name)
+        click.echo(collection.alias)
         click.echo(render_output(collection.metadata))

@@ -351,8 +353,9 @@ def list_collections(ctx, **kwargs):
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, output):
+def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.

@@ -379,11 +382,22 @@
             return "+".join([str(x) for x in key])
         return key

-
-
-
-
-
+    if wide:
+        results_obj = facet_summary_to_dataframe_unmelted(results)
+    else:
+        if output_type == Format.PYTHON.value:
+            results_obj = results
+        elif output_type in [Format.TSV.value, Format.CSV.value]:
+            results_obj = []
+            for fc, data in results.items():
+                for v, c in data:
+                    results_obj.append({"facet": fc, "value": v, "count": c})
+        else:
+            results_obj = {}
+            for key, value in results.items():
+                value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
+                results_obj[_untuple(key)] = value_as_dict
+    output_data = render_output(results_obj, output_type)
     if output:
         with open(output, "w") as f:
             f.write(output_data)
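
The facet summary maps each facet column to (value, count) pairs; for TSV/CSV output it is flattened to one row per value. That transformation in isolation, with made-up data:

    results = {  # hypothetical facet summary
        "status": [("open", 12), ("closed", 3)],
        "type": [("bug", 9), ("feature", 6)],
    }
    rows = []
    for fc, data in results.items():
        for v, c in data:
            rows.append({"facet": fc, "value": v, "count": c})
    assert rows[0] == {"facet": "status", "value": "open", "count": 12}
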
@@ -403,14 +417,17 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
 @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
+)
 @click.pass_context
-def describe(ctx, where, output_type, output):
+def describe(ctx, where, output_type, output, limit):
     """
     Describe the collection schema.
     """
     where_clause = yaml.safe_load(where) if where else None
     collection = ctx.obj["settings"].collection
-    df = collection.find(where_clause, limit=
+    df = collection.find(where_clause, limit=limit).rows_dataframe
     write_output(df.describe(include="all").transpose(), output_type, target=output)

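
describe now profiles up to --limit rows (default -1, i.e. all rows) via pandas; the pandas side in isolation:

    import pandas as pd

    df = pd.DataFrame([{"name": "a", "age": 30}, {"name": "b", "age": 40}])
    # same call chain the CLI applies to collection.find(...).rows_dataframe
    print(df.describe(include="all").transpose())
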
@@ -468,7 +485,7 @@ def search(ctx, search_term, where, limit, index_type, output_type, output, auto
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
-    logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
+    logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
@@ -498,6 +515,7 @@ def indexes(ctx):
 def validate(ctx, output_type, output):
     """Validate objects in the specified collection."""
     collection = ctx.obj["settings"].collection
+    logger.info(f"Validating collection {collection.alias}")
     validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()]
     output_data = render_output(validation_results, output_type)
     if output:
linkml_store/index/__init__.py
CHANGED

@@ -1,3 +1,14 @@
+"""
+Indexers package.
+
+Indexers allow indexes to be added to existing :class:`Collection` objects.
+
+Current two are supported:
+
+* simple: :class:`SimpleIndexer`
+* llm: :class:`LLMIndexer`
+"""
+
 from typing import Type

 from linkml_store.index.implementations.llm_indexer import LLMIndexer
@@ -14,7 +25,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     """
     Get an indexer class by name.

-    :param name: the name of the indexer
+    :param name: the name of the indexer (simple, llm, ...)
     :return: the indexer class
     """
     if name not in INDEXER_CLASSES:
@@ -26,7 +37,10 @@ def get_indexer(index_type: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.

-
+    >>> simple_indexer = get_indexer("simple")
+    >>> llm_indexer = get_indexer("llm")
+
+    :param name: the name of the indexer (simple, llm, ...)
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """

linkml_store/index/implementations/llm_indexer.py
CHANGED

@@ -74,7 +74,7 @@ class LLMIndexer(Indexer):

         embeddings_client = Client()
         config = CollectionConfig(
-            name=coll_name,
+            alias=coll_name,
             type="Embeddings",
             attributes={
                 "text": {"range": "string"},
@@ -116,6 +116,7 @@ class LLMIndexer(Indexer):
             embeddings_collection.insert(
                 {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
             )
+            embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
             embeddings = model.embed_multi(texts)
linkml_store/index/indexer.py
CHANGED

@@ -11,11 +11,22 @@ logger = logging.getLogger(__name__)


 class TemplateSyntaxEnum(str, Enum):
+    """
+    Template syntax types.
+    """
+
     jinja2 = "jinja2"
     fstring = "fstring"


-def cosine_similarity(vector1, vector2):
+def cosine_similarity(vector1, vector2) -> float:
+    """
+    Calculate the cosine similarity between two vectors
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
     dot_product = np.dot(vector1, vector2)
     norm1 = np.linalg.norm(vector1)
     norm2 = np.linalg.norm(vector2)
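
The hunk ends before the return statement; the standard definition is dot(v1, v2) / (||v1|| * ||v2||). A complete, runnable version under that assumption:

    import numpy as np

    def cosine_similarity(vector1, vector2) -> float:
        # cos(theta) = dot(v1, v2) / (||v1|| * ||v2||)
        dot_product = np.dot(vector1, vector2)
        norm1 = np.linalg.norm(vector1)
        norm2 = np.linalg.norm(vector2)
        return dot_product / (norm1 * norm2)

    assert np.isclose(cosine_similarity([1, 0], [1, 0]), 1.0)  # parallel vectors
    assert np.isclose(cosine_similarity([1, 0], [0, 1]), 0.0)  # orthogonal vectors
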
@@ -24,7 +35,7 @@ def cosine_similarity(vector1, vector2):

 class Indexer(BaseModel):
     """
-    An
+    An indexer operates on a collection in order to search for objects.
     """

     name: Optional[str] = None

linkml_store/utils/file_utils.py
ADDED

@@ -0,0 +1,37 @@
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+def safe_remove_directory(dir_path: Path, no_backup: bool = False) -> Optional[Path]:
+    # Ensure the directory exists
+    if not dir_path.exists():
+        raise FileNotFoundError(f"Directory does not exist: {dir_path}")
+    try:
+        if no_backup:
+            # Move to a temporary directory instead of permanent removal
+            with tempfile.TemporaryDirectory() as tmpdir:
+                tmp_path = Path(tmpdir) / dir_path.name
+                shutil.move(str(dir_path), str(tmp_path))
+                logger.info(f"Directory moved to temporary location: {tmp_path}")
+                # The directory will be automatically removed when exiting the context manager
+            return None
+        else:
+            # Create a backup directory name with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_dir = dir_path.with_name(f"{dir_path.name}_backup_{timestamp}")
+
+            # Move the directory to the backup location
+            shutil.move(str(dir_path), str(backup_dir))
+            logger.info(f"Directory backed up to: {backup_dir}")
+            return backup_dir
+
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        return None
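
Despite the name, no_backup=True still moves the directory, but into a TemporaryDirectory that is deleted on context exit; the default path renames it in place with a timestamp suffix. A usage sketch with an illustrative path:

    from pathlib import Path
    from linkml_store.utils.file_utils import safe_remove_directory

    target = Path("/tmp/demo_store")  # illustrative
    target.mkdir(exist_ok=True)
    backup = safe_remove_directory(target)  # e.g. /tmp/demo_store_backup_20240101_120000
    target.mkdir(exist_ok=True)
    assert safe_remove_directory(target, no_backup=True) is None  # removed outright
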

linkml_store/utils/format_utils.py
CHANGED

@@ -7,8 +7,10 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, TextIO, Type, Union

 import pandas as pd
+import pystow
 import yaml
 from pydantic import BaseModel
+from tabulate import tabulate


 class Format(Enum):
@@ -21,12 +23,40 @@
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
+    TABLE = "table"
+
+
+def load_objects_from_url(
+    url: str,
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    local_path: Optional[str] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    :param url: The URL to the file.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
+    :param local_path: The local path to save the file to.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
+    if not objs:
+        raise ValueError(f"No objects loaded from URL: {url}")
+    return objs


 def load_objects(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path],
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
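
load_objects_from_url downloads via pystow (cached under ~/.data/linkml/linkml-store by default) and then defers to load_objects; note that the local_path parameter is immediately overwritten by the pystow cache path. A usage sketch with an illustrative URL:

    from linkml_store.utils.format_utils import load_objects_from_url

    objs = load_objects_from_url("https://example.org/data/persons.csv", format="csv")
    print(len(objs), objs[0])
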
@@ -37,7 +67,7 @@ def load_objects(

     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into.
+    :param expected_type: The target type to load the objects into, e.g. list
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -48,6 +78,12 @@ def load_objects(

     if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
         format = Format.PARQUET
+    if not format and file_path.endswith(".tsv"):
+        format = Format.TSV
+    if not format and file_path.endswith(".csv"):
+        format = Format.CSV
+    if not format and file_path.endswith(".py"):
+        format = Format.PYTHON

     mode = "r"
     if format == Format.PARQUET:
@@ -68,11 +104,29 @@ def load_objects(
             objs = list(yaml.safe_load_all(f))
         else:
             objs = yaml.safe_load(f)
-    elif format == Format.TSV or format == Format.CSV:
-
-
-
-
+    elif format == Format.TSV or format == Format.CSV:
+        # Skip initial comment lines if comment_char is set
+        if header_comment_token:
+            # Store the original position
+            original_pos = f.tell()
+
+            # Read and store lines until we find a non-comment line
+            lines = []
+            for line in f:
+                if not line.startswith(header_comment_token):
+                    break
+                lines.append(line)
+
+            # Go back to the original position
+            f.seek(original_pos)
+
+            # Skip the comment lines we found
+            for _ in lines:
+                f.readline()
+        if format == Format.TSV:
+            reader = csv.DictReader(f, delimiter="\t")
+        else:
+            reader = csv.DictReader(f)
         objs = list(reader)
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
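
With header_comment_token set, leading lines that start with the token are consumed before csv.DictReader sees the header row. A self-contained sketch using an illustrative file:

    from linkml_store.utils.format_utils import load_objects

    with open("example.tsv", "w") as f:
        f.write("## generated by some pipeline\n")  # preamble line to be skipped
        f.write("name\tage\n")
        f.write("a\t30\n")

    objs = load_objects("example.tsv", header_comment_token="##")
    assert objs == [{"name": "a", "age": "30"}]
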
@@ -151,6 +205,9 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")

+    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
+        data = [data]
+
     if isinstance(data, BaseModel):
         data = data.model_dump()

@@ -158,6 +215,10 @@ def render_output(
         return json.dumps(data, indent=2, default=str)
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
+    elif format == Format.PYTHON:
+        return str(data)
+    elif format == Format.TABLE:
+        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
             return yaml.safe_dump_all(data, sort_keys=False)
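
The new table format renders data through a DataFrame and tabulate's psql style; in isolation:

    import pandas as pd
    from tabulate import tabulate

    data = [{"name": "a", "age": 30}, {"name": "b", "age": 40}]
    # prints a +----+ bordered table, including the DataFrame index column
    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql"))
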