linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store has been flagged as potentially problematic; see the registry advisory for more details.
- linkml_store/api/__init__.py +2 -2
- linkml_store/api/client.py +113 -8
- linkml_store/api/collection.py +272 -34
- linkml_store/api/config.py +101 -0
- linkml_store/api/database.py +282 -18
- linkml_store/api/queries.py +12 -1
- linkml_store/api/stores/chromadb/__init__.py +3 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/duckdb/__init__.py +7 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
- linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
- linkml_store/api/stores/hdf5/__init__.py +0 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
- linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +133 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/cli.py +369 -0
- linkml_store/index/__init__.py +33 -0
- linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
- linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
- linkml_store/index/{index.py → indexer.py} +7 -4
- linkml_store/utils/format_utils.py +93 -0
- linkml_store/utils/object_utils.py +81 -0
- linkml_store/utils/sql_utils.py +46 -7
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
- linkml_store-0.1.7.dist-info/RECORD +42 -0
- linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
- linkml_store/api/metadata.py +0 -5
- linkml_store-0.0.0.dist-info/RECORD +0 -29
- linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
- {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
linkml_store/cli.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
import yaml
|
|
8
|
+
from linkml_runtime.dumpers import json_dumper
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from linkml_store import Client
|
|
12
|
+
from linkml_store.api import Collection, Database
|
|
13
|
+
from linkml_store.api.queries import Query
|
|
14
|
+
from linkml_store.index.implementations.simple_indexer import SimpleIndexer
|
|
15
|
+
from linkml_store.index.indexer import Indexer
|
|
16
|
+
from linkml_store.utils.format_utils import Format, load_objects, render_output
|
|
17
|
+
from linkml_store.utils.object_utils import object_path_update
|
|
18
|
+
|
|
19
|
+
index_type_option = click.option("--index-type", "-t")
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
warnings.filterwarnings("ignore", module="duckdb_engine")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ContextSettings(BaseModel):
    """
    Context object for CLI commands.
    """

    client: Client
    database_name: Optional[str] = None
    collection_name: Optional[str] = None

    @property
    def database(self) -> Optional[Database]:
        """
        Resolve the active database, defaulting to the first registered one.

        :return: the database object, or None if no databases are registered
        """
        db_name = self.database_name
        if db_name is None:
            registered = self.client.databases
            if not registered:
                return None
            db_name = next(iter(registered))
        return self.client.get_database(db_name)

    @property
    def collection(self) -> Optional[Collection]:
        """
        Resolve the active collection, defaulting to the first in the database.

        :return: the collection object, or None if the database has no collections
        """
        coll_name = self.collection_name
        if coll_name is None:
            available = self.database.list_collections()
            if not available:
                return None
            coll_name = list(available)[0]
        return self.database.get_collection(coll_name)

    class Config:
        arbitrary_types_allowed = True
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Choices are derived from the Format enum so CLI options stay in sync
# with the formats that format_utils actually supports.
format_choice = click.Choice([f.value for f in Format])


@click.group()
@click.option("--database", "-d", help="Database name")
@click.option("--collection", "-c", help="Collection name")
@click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
@click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
@click.option("-v", "--verbose", count=True)
@click.option("-q", "--quiet/--no-quiet")
@click.option(
    "--stacktrace/--no-stacktrace",
    default=False,
    show_default=True,
    help="If set then show full stacktrace on error",
)
@click.pass_context
def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set):
    """A CLI for interacting with the linkml-store."""
    if not stacktrace:
        sys.tracebacklimit = 0
    # configure the ROOT logger; -q wins over any -v count
    root_logger = logging.getLogger()
    if quiet:
        root_logger.setLevel(logging.ERROR)
    elif verbose >= 2:
        root_logger.setLevel(logging.DEBUG)
    elif verbose == 1:
        root_logger.setLevel(logging.INFO)
    else:
        root_logger.setLevel(logging.WARNING)
    ctx.ensure_object(dict)
    client = Client().from_config(config) if config else Client()
    settings = ContextSettings(client=client, database_name=database, collection_name=collection)
    ctx.obj["settings"] = settings
    # DEPRECATED: older commands read these raw entries instead of settings
    ctx.obj["client"] = client
    ctx.obj["database"] = database
    ctx.obj["collection"] = collection
    if settings.database_name:
        db = client.get_database(database)
        # apply any --set PATHEXPR=value metadata overrides to the database
        for expr in set or []:
            if "=" not in expr:
                raise ValueError(f"Expression must be of form PARAM=VALUE. Got: {expr}")
            path, val = expr.split("=", 1)
            val = yaml.safe_load(val)
            root_logger.info(f"Setting {path} to {val}")
            db.metadata = object_path_update(db.metadata, path, val)
        # DEPRECATED
        ctx.obj["database_obj"] = db
        if collection:
            ctx.obj["collection_obj"] = db.get_collection(collection)
    # fall back to the first registered database / first collection
    if not settings.database_name and client.databases:
        settings.database_name = next(iter(client.databases))
    if not settings.collection_name:
        if settings.database and settings.database.list_collections():
            first = settings.database.list_collections()[0]
            settings.collection_name = first.name
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@cli.command()
@click.argument("files", type=click.Path(exists=True), nargs=-1)
@click.option("--format", "-f", type=format_choice, help="Input format")
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
@click.pass_context
def insert(ctx, files, object, format):
    """Insert objects from files (JSON, YAML, TSV) into the specified collection."""
    settings = ctx.obj["settings"]
    collection = settings.collection
    if not collection:
        raise ValueError("Collection must be specified.")
    if not files and not object:
        # no inputs at all: read from stdin
        files = ["-"]
    for file_path in files:
        load_kwargs = {"format": format} if format else {}
        objects = load_objects(file_path, **load_kwargs)
        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.name}'.")
        collection.insert(objects)
        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.name}'.")
    for object_str in object:
        logger.info(f"Parsing: {object_str}")
        objects = yaml.safe_load(object_str)
        collection.insert(objects)
        click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@cli.command()
@click.argument("files", type=click.Path(exists=True), nargs=-1)
@click.option("--format", "-f", type=format_choice, help="Input format")
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
@click.pass_context
def store(ctx, files, object, format):
    """Store objects from files (JSON, YAML, TSV) into the specified database."""
    settings = ctx.obj["settings"]
    db = settings.database
    if db is None:
        # storing requires a resolvable database; fail early with a clear message
        raise ValueError("Database must be specified.")
    if not files and not object:
        files = ["-"]
    for file_path in files:
        if format:
            objects = load_objects(file_path, format=format)
        else:
            objects = load_objects(file_path)
        logger.info(f"Inserting {len(objects)} objects from {file_path} into database '{db}'.")
        for obj in objects:
            db.store(obj)
        click.echo(f"Inserted {len(objects)} objects from {file_path} into database '{db}'.")
    for object_str in object:
        logger.info(f"Parsing: {object_str}")
        objects = yaml.safe_load(object_str)
        # BUGFIX: a single YAML mapping is one object; iterating a dict would
        # pass its keys to db.store and report a wrong count
        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            db.store(obj)
        # BUGFIX: message previously said "collection" for a database target
        click.echo(f"Inserted {len(objects)} objects from {object_str} into database '{db.name}'.")
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.pass_context
def query(ctx, where, limit, output_type, output):
    """Query objects from the specified collection."""
    collection = ctx.obj["settings"].collection
    # the WHERE clause is supplied as inline YAML
    where_clause = yaml.safe_load(where) if where else None
    q = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
    result = collection.query(q)
    rendered = render_output(result.rows, output_type)
    if not output:
        click.echo(rendered)
        return
    with open(output, "w") as f:
        f.write(rendered)
    click.echo(f"Query results saved to {output}")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@cli.command()
@click.pass_context
def list_collections(ctx):
    """List each collection in the current database with its metadata."""
    db = ctx.obj["settings"].database
    for coll in db.list_collections():
        click.echo(coll.name)
        click.echo(render_output(coll.metadata))
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@cli.command()
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.option("--columns", "-S", help="Columns to facet on")
@click.pass_context
def fq(ctx, where, limit, columns, output_type, output):
    """
    Query facets from the specified collection.

    Columns are comma-separated; a "+" within a column joins fields into a
    compound facet (represented internally as a tuple).
    """
    collection = ctx.obj["settings"].collection
    where_clause = yaml.safe_load(where) if where else None
    if columns:
        facet_cols = []
        for raw in columns.split(","):
            col = raw.strip()
            facet_cols.append(tuple(col.split("+")) if "+" in col else col)
    else:
        facet_cols = None
    logger.info(f"Faceting on columns: {facet_cols}")
    results = collection.query_facets(where_clause, facet_columns=facet_cols, limit=limit)
    logger.info(f"Facet results: {results}")

    def _flatten(key):
        # compound facet keys come back as tuples; render them as "a+b"
        return "+".join(key) if isinstance(key, tuple) else key

    count_dict = {}
    for key, value in results.items():
        count_dict[_flatten(key)] = {_flatten(v[0:-1]): v[-1] for v in value}
    rendered = render_output(count_dict, output_type)
    if output:
        with open(output, "w") as f:
            f.write(rendered)
        click.echo(f"Query results saved to {output}")
    else:
        click.echo(rendered)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _get_index(index_type=None, **kwargs) -> Indexer:
    """Construct an indexer for the given type name (defaults to 'simple')."""
    if index_type not in (None, "simple"):
        raise ValueError(f"Unknown index type: {index_type}")
    return SimpleIndexer(name="test", **kwargs)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@cli.command()
@index_type_option
@click.pass_context
def index(ctx, index_type):
    """
    Create an index over a collection.

    :param ctx: click context carrying the resolved settings
    :param index_type: indexer type name (defaults to 'simple')
    """
    collection = ctx.obj["settings"].collection
    collection.attach_indexer(_get_index(index_type))
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
@cli.command()
@click.pass_context
@click.option("--output-type", "-O", type=format_choice, default="yaml", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
def schema(ctx, output_type, output):
    """
    Show the schema for a database.

    :param ctx: click context carrying the resolved settings
    :param output_type: serialization format for the schema
    :param output: optional file path; defaults to stdout
    """
    db = ctx.obj["settings"].database
    # serialize the LinkML schema to a plain dict before rendering
    schema_dict = json_dumper.to_dict(db.schema_view.schema)
    output_data = render_output(schema_dict, output_type)
    if output:
        with open(output, "w") as f:
            f.write(output_data)
        click.echo(f"Schema saved to {output}")
    else:
        click.echo(output_data)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
@cli.command()
@click.argument("search_term")
@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
@click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@index_type_option
@click.pass_context
def search(ctx, search_term, where, limit, index_type, output_type, output):
    """Search objects in the specified collection."""
    collection = ctx.obj["settings"].collection
    ix = _get_index(index_type)
    logger.info(f"Attaching index to collection {collection.name}: {ix.model_dump()}")
    # attach without re-indexing; the search call uses the existing index
    collection.attach_indexer(ix, auto_index=False)
    result = collection.search(search_term, where=where, limit=limit)
    rows = [{"score": score, **row} for score, row in result.ranked_rows]
    rendered = render_output(rows, output_type)
    if output:
        with open(output, "w") as f:
            f.write(rendered)
        click.echo(f"Search results saved to {output}")
    else:
        click.echo(rendered)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
@cli.command()
@click.pass_context
def indexes(ctx):
    """List the indexers attached to the current collection."""
    collection = ctx.obj["settings"].collection
    for name, ix in collection.indexers.items():
        # BUGFIX: pydantic v2 models expose model_dump_json(); model_json()
        # does not exist and raised AttributeError
        click.echo(f"{name}: {type(ix)}\n{ix.model_dump_json()}")
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
@cli.command()
@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
@click.option("--output", "-o", type=click.Path(), help="Output file path")
@click.pass_context
def validate(ctx, output_type, output):
    """Validate objects in the specified collection."""
    collection = ctx.obj["settings"].collection
    findings = [json_dumper.to_dict(r) for r in collection.iter_validate_collection()]
    rendered = render_output(findings, output_type)
    if not output:
        click.echo(rendered)
        return
    with open(output, "w") as f:
        f.write(rendered)
    click.echo(f"Validation results saved to {output}")
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
if __name__ == "__main__":
    # direct script entry point (normally invoked via the console_script)
    cli()
|
linkml_store/index/__init__.py
CHANGED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from typing import Type
|
|
2
|
+
|
|
3
|
+
from linkml_store.index.implementations.llm_indexer import LLMIndexer
|
|
4
|
+
from linkml_store.index.implementations.simple_indexer import SimpleIndexer
|
|
5
|
+
from linkml_store.index.indexer import Indexer
|
|
6
|
+
|
|
7
|
+
# Registry of available indexer implementations, keyed by short name.
INDEXER_CLASSES = {
    "simple": SimpleIndexer,
    "llm": LLMIndexer,
}


def get_indexer_class(name: str) -> Type[Indexer]:
    """
    Get an indexer class by name.

    :param name: the name of the indexer
    :return: the indexer class
    :raises ValueError: if the name is not registered
    """
    if name in INDEXER_CLASSES:
        return INDEXER_CLASSES[name]
    raise ValueError(f"Unknown indexer class: {name}")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_indexer(name: str, *args, **kwargs) -> Indexer:
    """
    Get an indexer instance by name.

    :param name: the name of the indexer
    :param args: positional arguments forwarded to the indexer constructor
    :param kwargs: keyword arguments forwarded to the indexer constructor
    :return: the indexer
    """
    cls = get_indexer_class(name)
    return cls(*args, **kwargs)
|
|
@@ -2,13 +2,13 @@ from typing import TYPE_CHECKING, List
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
|
-
from linkml_store.index.
|
|
5
|
+
from linkml_store.index.indexer import INDEX_ITEM, Indexer
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
import llm
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
class
|
|
11
|
+
class LLMIndexer(Indexer):
|
|
12
12
|
"""
|
|
13
13
|
A implementations index wraps the llm library
|
|
14
14
|
"""
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
+
import logging
|
|
2
3
|
|
|
3
4
|
import numpy as np
|
|
4
5
|
|
|
5
|
-
from linkml_store.index.
|
|
6
|
+
from linkml_store.index.indexer import INDEX_ITEM, Indexer
|
|
6
7
|
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
7
9
|
|
|
8
|
-
|
|
10
|
+
|
|
11
|
+
class SimpleIndexer(Indexer):
|
|
9
12
|
"""
|
|
10
13
|
A implementations index that uses a hash function to generate an index from text.
|
|
11
14
|
|
|
@@ -36,5 +39,5 @@ class SimpleIndex(Index):
|
|
|
36
39
|
|
|
37
40
|
# Increment the count at the computed index
|
|
38
41
|
vector[index] += 1.0
|
|
39
|
-
|
|
42
|
+
logger.info(f"Indexed text: {text} as {vector}")
|
|
40
43
|
return vector
|
|
@@ -13,7 +13,7 @@ def cosine_similarity(vector1, vector2):
|
|
|
13
13
|
return dot_product / (norm1 * norm2)
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class
|
|
16
|
+
class Indexer(BaseModel):
|
|
17
17
|
"""
|
|
18
18
|
An index operates on a collection in order to search for objects.
|
|
19
19
|
"""
|
|
@@ -65,7 +65,10 @@ class Index(BaseModel):
|
|
|
65
65
|
|
|
66
66
|
def object_to_text(self, obj: Dict[str, Any]) -> str:
|
|
67
67
|
"""
|
|
68
|
-
|
|
68
|
+
Convert an object to a text representation
|
|
69
|
+
|
|
70
|
+
:param obj:
|
|
71
|
+
:return:
|
|
69
72
|
"""
|
|
70
73
|
if self.index_attributes:
|
|
71
74
|
obj = {k: v for k, v in obj.items() if k in self.index_attributes}
|
|
@@ -77,14 +80,14 @@ class Index(BaseModel):
|
|
|
77
80
|
|
|
78
81
|
def search(
|
|
79
82
|
self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
|
|
80
|
-
) -> List[Tuple[float,
|
|
83
|
+
) -> List[Tuple[float, Any]]:
|
|
81
84
|
"""
|
|
82
85
|
Search the index for a query string
|
|
83
86
|
|
|
84
87
|
:param query: The query string to search for
|
|
85
88
|
:param vectors: A list of indexed items, where each item is a tuple of (id, vector)
|
|
86
89
|
:param limit: The maximum number of results to return (optional)
|
|
87
|
-
:return: A list of item IDs that match the query
|
|
90
|
+
:return: A list of item IDs or objects that match the query
|
|
88
91
|
"""
|
|
89
92
|
|
|
90
93
|
# Convert the query string to a vector
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import json
|
|
3
|
+
import sys
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from io import StringIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Union
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Format(Enum):
    """Supported serialization formats."""

    JSON = "json"
    JSONL = "jsonl"
    YAML = "yaml"
    TSV = "tsv"
    CSV = "csv"


def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None) -> List[Dict[str, Any]]:
    """
    Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.

    :param file_path: path to the file, or "-" to read from stdin
    :param format: the format of the file (Format enum or string value);
        inferred from the file suffix when omitted
    :return: a list of dictionaries representing the loaded objects
        (a single top-level object is wrapped in a one-element list)
    :raises ValueError: if the format cannot be determined
    """
    if isinstance(format, str):
        format = Format(format)

    if isinstance(file_path, Path):
        file_path = str(file_path)

    if file_path == "-":
        # read from stdin; must not be closed below
        f = sys.stdin
    else:
        f = open(file_path)
    try:
        if format == Format.JSON or (not format and file_path.endswith(".json")):
            objs = json.load(f)
        elif format == Format.JSONL or (not format and file_path.endswith(".jsonl")):
            objs = [json.loads(line) for line in f]
        elif format == Format.YAML or (not format and (file_path.endswith(".yaml") or file_path.endswith(".yml"))):
            objs = yaml.safe_load(f)
        elif format == Format.TSV or (not format and file_path.endswith(".tsv")):
            objs = list(csv.DictReader(f, delimiter="\t"))
        elif format == Format.CSV or (not format and file_path.endswith(".csv")):
            objs = list(csv.DictReader(f))
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    finally:
        # BUGFIX: the file handle was previously never closed (leaked, and
        # left open even when an unsupported format raised)
        if f is not sys.stdin:
            f.close()
    if not isinstance(objs, list):
        objs = [objs]
    return objs


def render_output(data: Any, format: Union[Format, str] = Format.YAML) -> str:
    """
    Render output data in JSON, JSONLines, YAML, CSV, or TSV format.

    :param data: the data to be rendered — a list of dicts, a dict, or a
        pydantic model (which is dumped to a dict first)
    :param format: the desired output format (Format enum or string value)
    :return: the rendered output as a string
    :raises ValueError: for unsupported formats
    """
    if isinstance(format, str):
        format = Format(format)

    # duck-typed instead of isinstance(data, BaseModel) so that rendering
    # plain data does not hard-depend on pydantic being importable here
    if hasattr(data, "model_dump"):
        data = data.model_dump()

    if format == Format.JSON:
        return json.dumps(data, indent=2, default=str)
    if format == Format.JSONL:
        return "\n".join(json.dumps(obj) for obj in data)
    if format == Format.YAML:
        return yaml.safe_dump(data, sort_keys=False)
    if format in (Format.TSV, Format.CSV):
        # BUGFIX: previously raised IndexError (data[0]) on an empty result set
        if not data:
            return ""
        delimiter = "\t" if format == Format.TSV else ","
        buf = StringIO()
        writer = csv.DictWriter(buf, fieldnames=data[0].keys(), delimiter=delimiter)
        writer.writeheader()
        writer.writerows(data)
        return buf.getvalue()
    raise ValueError(f"Unsupported output format: {format}")
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from copy import deepcopy
|
|
3
|
+
from typing import Any, Dict, List, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def object_path_update(
    obj: Union["BaseModel", Dict[str, Any]], path: str, value: Any
) -> Union["BaseModel", Dict[str, Any]]:
    """
    Return a copy of a nested object with the field at ``path`` set to ``value``.

    The path to the desired field is given in dot and bracket notation
    (e.g., ``'a[0].b.c[1]'``). Intermediate dicts and list slots are created
    as needed.

    :param obj: the dictionary object (or pydantic model) to be updated
    :param path: the path string indicating where to place the value
    :param value: the value to be set at the specified path
    :return: a new object of the same kind with the update applied.
        BUGFIX (docs): the input is NOT modified in place — a deep copy is
        updated and returned, contrary to the previous docstring.

    **Example**::

        >>> data = {}
        >>> object_path_update(data, 'persons[0].foo.bar', 1)
        {'persons': [{'foo': {'bar': 1}}]}
    """
    if hasattr(obj, "model_dump"):
        # pydantic v2 model: round-trip through a dict and rebuild.
        # (Previously used the deprecated v1 .dict(); the rest of the
        # package uses model_dump().)
        typ = type(obj)
        updated = object_path_update(obj.model_dump(), path, value)
        return typ(**updated)
    obj = deepcopy(obj)
    ret_obj = obj
    parts = path.split(".")
    for part in parts[:-1]:
        if "[" in part:
            key, index = part[:-1].split("[")
            index = int(index)
            obj = obj.setdefault(key, [])
            # pad the list so the target index exists
            while len(obj) <= index:
                obj.append({})
            obj = obj[index]
        else:
            obj = obj.setdefault(part, {})
    last_part = parts[-1]
    if "[" in last_part:
        key, index = last_part[:-1].split("[")
        index = int(index)
        if key not in obj or not isinstance(obj[key], list):
            obj[key] = [{} for _ in range(index + 1)]
        obj[key][index] = value
    else:
        obj[last_part] = value
    return ret_obj
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
    """
    Parse an expression of the form ``'path.to.field=value'`` into (path, value).

    The value portion is decoded as JSON.

    :param expr: the expression to parse
    :return: a (path, parsed value) tuple, or None when the expression has
        no ``=`` or its value is not valid JSON
    """
    path, sep, raw = expr.partition("=")
    if not sep:
        return None
    try:
        return path, json.loads(raw)
    except ValueError:
        return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def clean_empties(value: Union[Dict, List]) -> Any:
    """
    Recursively drop None-valued entries from nested dicts and lists.

    Non-container values are returned unchanged; containers are rebuilt,
    never mutated in place.
    """
    if isinstance(value, dict):
        cleaned = {}
        for key, item in value.items():
            kept = clean_empties(item)
            if kept is not None:
                cleaned[key] = kept
        return cleaned
    if isinstance(value, list):
        return [kept for kept in (clean_empties(item) for item in value) if kept is not None]
    return value
|