linkml-store 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic.
- linkml_store/api/client.py +63 -7
- linkml_store/api/collection.py +152 -32
- linkml_store/api/config.py +49 -6
- linkml_store/api/database.py +77 -30
- linkml_store/api/stores/duckdb/duckdb_collection.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +47 -5
- linkml_store/api/stores/filesystem/filesystem_collection.py +11 -4
- linkml_store/api/stores/filesystem/filesystem_database.py +10 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +6 -2
- linkml_store/api/stores/mongodb/mongodb_database.py +30 -35
- linkml_store/api/stores/solr/solr_collection.py +4 -4
- linkml_store/cli.py +64 -19
- linkml_store/index/__init__.py +16 -2
- linkml_store/index/implementations/llm_indexer.py +2 -1
- linkml_store/index/indexer.py +13 -2
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +200 -21
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/pandas_utils.py +40 -0
- linkml_store/utils/sql_utils.py +9 -3
- linkml_store/webapi/html/generic.html.j2 +25 -28
- linkml_store/webapi/main.py +346 -63
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.12.dist-info}/METADATA +36 -3
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.12.dist-info}/RECORD +27 -24
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.12.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.12.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.10.dist-info → linkml_store-0.1.12.dist-info}/entry_points.txt +0 -0
linkml_store/cli.py
CHANGED
@@ -16,6 +16,7 @@ from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
 from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update
+from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted

 index_type_option = click.option(
     "--index-type",
@@ -87,6 +88,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
 @click.option("-q", "--quiet/--no-quiet")
+@click.option("--base-dir", "-B", help="Base directory for the client configuration")
 @click.option(
     "--stacktrace/--no-stacktrace",
     default=False,
@@ -94,7 +96,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -117,7 +119,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     if quiet:
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
-    client = Client().from_config(config) if config else Client()
+    client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
     # DEPRECATED
@@ -150,7 +152,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
             # raise ValueError("Collection must be specified if there are multiple collections.")
             if settings.database and settings.database.list_collections():
                 collection = settings.database.list_collections()[0]
-                settings.collection_name = collection.
+                settings.collection_name = collection.alias


 @cli.command()
@@ -180,15 +182,15 @@ def insert(ctx, files, object, format):
             objects = load_objects(file_path, format=format)
         else:
             objects = load_objects(file_path)
-        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.
+        logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         collection.insert(objects)
-        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.
+        click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
             collection.insert(objects)
-            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.
+            click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()


@@ -226,7 +228,11 @@ def store(ctx, files, object, format):
 @click.pass_context
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 def import_database(ctx, files, format):
-    """Imports a database from a dump.
+    """Imports a database from a dump.
+
+    See the `export` command for a full list of supported formats. The same
+    formats are generally supported for imports.
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -240,7 +246,30 @@ def import_database(ctx, files, format):
 @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
 @click.pass_context
 def export(ctx, output_type, output):
-    """Exports a database to a dump.
+    """Exports a database to a standard dump format.
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml
+
+    Export format will be guessed from extension if not specified
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries.json
+
+    Tree formats such as YAML and JSON can natively store an entire database; each collection
+    will be a distinct key in the database.
+
+    Additionally, native dump formats can be used:
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries -O duckdb
+
+    Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
+    (see https://duckdb.org/docs/sql/statements/export.html)
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if output_type is None:
@@ -324,7 +353,7 @@ def query(ctx, where, limit, output_type, output):
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
-    query = Query(from_table=collection.
+    query = Query(from_table=collection.alias, where_clause=where_clause, limit=limit)
     result = collection.query(query)
     output_data = render_output(result.rows, output_type)
     if output:
@@ -341,7 +370,7 @@ def query(ctx, where, limit, output_type, output):
 def list_collections(ctx, **kwargs):
     db = ctx.obj["settings"].database
     for collection in db.list_collections(**kwargs):
-        click.echo(collection.
+        click.echo(collection.alias)
         click.echo(render_output(collection.metadata))


@@ -351,8 +380,9 @@ def list_collections(ctx, **kwargs):
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, output):
+def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.

@@ -379,11 +409,22 @@ def fq(ctx, where, limit, columns, output_type, output):
             return "+".join([str(x) for x in key])
         return key

-
-
-
-
-
+    if wide:
+        results_obj = facet_summary_to_dataframe_unmelted(results)
+    else:
+        if output_type == Format.PYTHON.value:
+            results_obj = results
+        elif output_type in [Format.TSV.value, Format.CSV.value]:
+            results_obj = []
+            for fc, data in results.items():
+                for v, c in data:
+                    results_obj.append({"facet": fc, "value": v, "count": c})
+        else:
+            results_obj = {}
+            for key, value in results.items():
+                value_as_dict = {_untuple(v[0:-1]): v[-1] for v in value}
+                results_obj[_untuple(key)] = value_as_dict
+    output_data = render_output(results_obj, output_type)
     if output:
         with open(output, "w") as f:
             f.write(output_data)
@@ -403,14 +444,17 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
 @click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--limit", "-l", default=-1, show_default=True, type=click.INT, help="Maximum number of results to return"
+)
 @click.pass_context
-def describe(ctx, where, output_type, output):
+def describe(ctx, where, output_type, output, limit):
     """
     Describe the collection schema.
     """
     where_clause = yaml.safe_load(where) if where else None
     collection = ctx.obj["settings"].collection
-    df = collection.find(where_clause, limit=
+    df = collection.find(where_clause, limit=limit).rows_dataframe
     write_output(df.describe(include="all").transpose(), output_type, target=output)


@@ -468,7 +512,7 @@ def search(ctx, search_term, where, limit, index_type, output_type, output, auto
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
-    logger.info(f"Attaching index to collection {collection.
+    logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
     result = collection.search(search_term, where=where, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
@@ -498,6 +542,7 @@ def indexes(ctx):
 def validate(ctx, output_type, output):
     """Validate objects in the specified collection."""
     collection = ctx.obj["settings"].collection
+    logger.info(f"Validating collection {collection.alias}")
     validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()]
     output_data = render_output(validation_results, output_type)
     if output:
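For orientation, the new `fq` output handling above consumes a facet summary keyed by facet column. Below is a minimal sketch of the non-wide TSV path, assuming a facet-result shape of column name mapped to (value, count) pairs; the sample data is hypothetical, not taken from the package.

    from linkml_store.utils.format_utils import render_output

    # Hypothetical facet summary: facet column -> list of (value, count) pairs
    results = {"country": [("US", 10), ("FR", 5)], "status": [("active", 12)]}

    # Same flattening as the added TSV/CSV branch in fq()
    rows = []
    for fc, data in results.items():
        for v, c in data:
            rows.append({"facet": fc, "value": v, "count": c})

    print(render_output(rows, "tsv"))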
linkml_store/index/__init__.py
CHANGED
@@ -1,3 +1,14 @@
+"""
+Indexers package.
+
+Indexers allow indexes to be added to existing :class:`Collection` objects.
+
+Current two are supported:
+
+* simple: :class:`SimpleIndexer`
+* llm: :class:`LLMIndexer`
+"""
+
 from typing import Type

 from linkml_store.index.implementations.llm_indexer import LLMIndexer
@@ -14,7 +25,7 @@ def get_indexer_class(name: str) -> Type[Indexer]:
     """
     Get an indexer class by name.

-    :param name: the name of the indexer
+    :param name: the name of the indexer (simple, llm, ...)
     :return: the indexer class
     """
     if name not in INDEXER_CLASSES:
@@ -26,7 +37,10 @@ def get_indexer(index_type: str, **kwargs) -> Indexer:
     """
     Get an indexer by name.

-
+    >>> simple_indexer = get_indexer("simple")
+    >>> llm_indexer = get_indexer("llm")
+
+    :param name: the name of the indexer (simple, llm, ...)
     :param kwargs: additional arguments to pass to the indexer
     :return: the indexer
     """

linkml_store/index/implementations/llm_indexer.py
CHANGED

@@ -74,7 +74,7 @@ class LLMIndexer(Indexer):

         embeddings_client = Client()
         config = CollectionConfig(
-
+            alias=coll_name,
             type="Embeddings",
             attributes={
                 "text": {"range": "string"},
@@ -116,6 +116,7 @@ class LLMIndexer(Indexer):
                 embeddings_collection.insert(
                     {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
                 )
+            embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
             embeddings = model.embed_multi(texts)
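The indexer changes above are normally exercised through `get_indexer` plus the collection-level calls seen in the cli.py diff. A minimal sketch, assuming an existing `collection` object and that `attach_indexer` accepts its defaults:

    from linkml_store.index import get_indexer

    ix = get_indexer("simple")        # or "llm", per the new package docstring
    collection.attach_indexer(ix)     # `collection` is assumed to already exist
    result = collection.search("example term", limit=5)
    for score, obj in result.ranked_rows:
        print(score, obj)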
linkml_store/index/indexer.py
CHANGED
@@ -11,11 +11,22 @@ logger = logging.getLogger(__name__)


 class TemplateSyntaxEnum(str, Enum):
+    """
+    Template syntax types.
+    """
+
     jinja2 = "jinja2"
     fstring = "fstring"


-def cosine_similarity(vector1, vector2):
+def cosine_similarity(vector1, vector2) -> float:
+    """
+    Calculate the cosine similarity between two vectors
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
     dot_product = np.dot(vector1, vector2)
     norm1 = np.linalg.norm(vector1)
     norm2 = np.linalg.norm(vector2)
@@ -24,7 +35,7 @@ def cosine_similarity(vector1, vector2):

 class Indexer(BaseModel):
     """
-    An
+    An indexer operates on a collection in order to search for objects.
     """

     name: Optional[str] = None
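For reference, the newly annotated `cosine_similarity` helper follows the standard NumPy formulation; a quick standalone check with arbitrary vectors:

    import numpy as np

    v1 = np.array([1.0, 0.0, 1.0])
    v2 = np.array([1.0, 1.0, 0.0])

    # Dot product divided by the product of the norms, as in the helper above
    sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    print(round(float(sim), 3))  # 0.5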
linkml_store/utils/file_utils.py
ADDED

@@ -0,0 +1,37 @@
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+def safe_remove_directory(dir_path: Path, no_backup: bool = False) -> Optional[Path]:
+    # Ensure the directory exists
+    if not dir_path.exists():
+        raise FileNotFoundError(f"Directory does not exist: {dir_path}")
+    try:
+        if no_backup:
+            # Move to a temporary directory instead of permanent removal
+            with tempfile.TemporaryDirectory() as tmpdir:
+                tmp_path = Path(tmpdir) / dir_path.name
+                shutil.move(str(dir_path), str(tmp_path))
+                logger.info(f"Directory moved to temporary location: {tmp_path}")
+                # The directory will be automatically removed when exiting the context manager
+            return None
+        else:
+            # Create a backup directory name with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_dir = dir_path.with_name(f"{dir_path.name}_backup_{timestamp}")
+
+            # Move the directory to the backup location
+            shutil.move(str(dir_path), str(backup_dir))
+            logger.info(f"Directory backed up to: {backup_dir}")
+            return backup_dir
+
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        return None
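A usage sketch for the new `safe_remove_directory` helper (the directory names below are hypothetical):

    from pathlib import Path
    from linkml_store.utils.file_utils import safe_remove_directory

    # Default: the directory is moved aside to <name>_backup_<timestamp>
    backup = safe_remove_directory(Path("scratch_db"))
    print(backup)

    # With no_backup=True it is moved into a TemporaryDirectory and discarded
    safe_remove_directory(Path("scratch_db2"), no_backup=True)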
linkml_store/utils/format_utils.py
CHANGED

@@ -1,14 +1,22 @@
 import csv
+import gzip
+import io
 import json
+import logging
 import sys
+import tarfile
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TextIO, Type, Union
+from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union

 import pandas as pd
+import pystow
 import yaml
 from pydantic import BaseModel
+from tabulate import tabulate
+
+logger = logging.getLogger(__name__)


 class Format(Enum):
@@ -21,12 +29,163 @@ class Format(Enum):
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
+    TABLE = "table"
+    SQLDUMP_DUCKDB = "duckdb"
+    SQLDUMP_POSTGRES = "postgres"
+    DUMP_MONGODB = "mongodb"
+
+    @classmethod
+    def guess_format(cls, file_name: str) -> Optional["Format"]:
+        ext = Path(file_name).suffix.lower()
+
+        format_map = {
+            ".json": cls.JSON,
+            ".jsonl": cls.JSONL,
+            ".yaml": cls.YAML,
+            ".yml": cls.YAML,
+            ".tsv": cls.TSV,
+            ".csv": cls.CSV,
+            ".py": cls.PYTHON,
+            ".parquet": cls.PARQUET,
+            ".pq": cls.PARQUET,
+        }
+        fmt = format_map.get(ext, None)
+        if fmt is None:
+            if ext.startswith("."):
+                ext = ext[1:]
+            if ext in [f.value for f in Format]:
+                return Format(ext)
+        return fmt
+
+    def is_dump_format(self):
+        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
+
+
+def load_objects_from_url(
+    url: str,
+    format: Union[Format, str] = None,
+    expected_type: Type = None,
+    local_path: Optional[str] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    :param url: The URL to the file.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
+    :param local_path: The local path to save the file to.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    logger.info(f"synced to {local_path}")
+    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
+    if not objs:
+        raise ValueError(f"No objects loaded from URL: {url}")
+    return objs
+
+
+def process_file(
+    f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """
+    Process a single file and return a list of objects.
+    """
+    if format == Format.JSON:
+        objs = json.load(f)
+    elif format == Format.JSONL:
+        objs = [json.loads(line) for line in f]
+    elif format == Format.YAML:
+        if expected_type and expected_type == list:  # noqa E721
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
+    elif format in [Format.TSV, Format.CSV]:
+        if header_comment_token:
+            while True:
+                pos = f.tell()
+                line = f.readline()
+                if not line.startswith(header_comment_token):
+                    f.seek(pos)
+                    break
+        delimiter = "\t" if format == Format.TSV else ","
+        reader = csv.DictReader(f, delimiter=delimiter)
+        objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
+    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+        raise ValueError(f"Format {format} is not supported for loading objects")
+    else:
+        raise ValueError(f"Unsupported file format: {format}")
+
+    if not isinstance(objs, list):
+        objs = [objs]
+    return objs


 def load_objects(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path],
+    format: Optional[Union[Format, str]] = None,
+    compression: Optional[str] = None,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a file or archive in supported formats.
+    For tgz archives, it processes all files and concatenates the results.
+
+    :param file_path: The path to the file or archive.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+    :param expected_type: The target type to load the objects into, e.g. list
+    :param header_comment_token: Token used for header comments to be skipped
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+
+    if isinstance(format, str):
+        format = Format(format)
+
+    all_objects = []
+
+    if compression == "tgz":
+        with tarfile.open(file_path, "r:gz") as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    f = tar.extractfile(member)
+                    if f:
+                        content = io.TextIOWrapper(f)
+                        member_format = Format.guess_format(member.name) if not format else format
+                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+    else:
+        if Path(file_path).is_dir():
+            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+        mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
+        open_func = gzip.open if compression == "gz" else open
+        format = Format.guess_format(file_path) if not format else format
+        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+            if compression == "gz" and mode == "r":
+                f = io.TextIOWrapper(f)
+            all_objects = process_file(f, format, expected_type, header_comment_token)
+
+    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    return all_objects
+
+
+def xxxload_objects(
+    file_path: Union[str, Path],
+    format: Union[Format, str] = None,
+    compression: Optional[str] = None,
+    expected_type: Type = None,
+    header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -37,7 +196,7 @@ def load_objects(

     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into.
+    :param expected_type: The target type to load the objects into, e.g. list
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -48,6 +207,12 @@ def load_objects(

     if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
         format = Format.PARQUET
+    if not format and file_path.endswith(".tsv"):
+        format = Format.TSV
+    if not format and file_path.endswith(".csv"):
+        format = Format.CSV
+    if not format and file_path.endswith(".py"):
+        format = Format.PYTHON

     mode = "r"
     if format == Format.PARQUET:
@@ -68,11 +233,29 @@ def load_objects(
             objs = list(yaml.safe_load_all(f))
         else:
             objs = yaml.safe_load(f)
-    elif format == Format.TSV or
-
-
-
-
+    elif format == Format.TSV or format == Format.CSV:
+        # Skip initial comment lines if comment_char is set
+        if header_comment_token:
+            # Store the original position
+            original_pos = f.tell()
+
+            # Read and store lines until we find a non-comment line
+            lines = []
+            for line in f:
+                if not line.startswith(header_comment_token):
+                    break
+                lines.append(line)
+
+            # Go back to the original position
+            f.seek(original_pos)
+
+            # Skip the comment lines we found
+            for _ in lines:
+                f.readline()
+        if format == Format.TSV:
+            reader = csv.DictReader(f, delimiter="\t")
+        else:
+            reader = csv.DictReader(f)
         objs = list(reader)
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
@@ -118,7 +301,7 @@ def write_output(


 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -151,6 +334,9 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")

+    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
+        data = [data]
+
     if isinstance(data, BaseModel):
         data = data.model_dump()

@@ -158,6 +344,10 @@ def render_output(
         return json.dumps(data, indent=2, default=str)
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
+    elif format == Format.PYTHON:
+        return str(data)
+    elif format == Format.TABLE:
+        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
             return yaml.safe_dump_all(data, sort_keys=False)
@@ -210,15 +400,4 @@ def guess_format(path: str) -> Optional[Format]:
     :param path: The path to the file.
     :return: The guessed format.
     """
-
-        return Format.JSON
-    elif path.endswith(".jsonl"):
-        return Format.JSONL
-    elif path.endswith(".yaml") or path.endswith(".yml"):
-        return Format.YAML
-    elif path.endswith(".tsv"):
-        return Format.TSV
-    elif path.endswith(".csv"):
-        return Format.CSV
-    else:
-        return None
+    return Format.guess_format(path)
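Taken together, the format_utils changes route format detection and I/O through `Format.guess_format`, `load_objects`, and `render_output`; a small sketch of how they compose (the file name is hypothetical):

    from linkml_store.utils.format_utils import Format, load_objects, render_output

    fmt = Format.guess_format("countries.tsv")        # -> Format.TSV
    objs = load_objects("countries.tsv", format=fmt)  # list of dicts, one per row

    # The new TABLE format pretty-prints via tabulate; PYTHON returns str(data)
    print(render_output(objs, Format.TABLE))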