linkml-store 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/api/client.py +2 -0
- linkml_store/api/collection.py +101 -6
- linkml_store/api/database.py +36 -5
- linkml_store/api/stores/duckdb/duckdb_collection.py +1 -0
- linkml_store/api/stores/filesystem/__init__.py +7 -8
- linkml_store/api/stores/filesystem/filesystem_collection.py +148 -113
- linkml_store/api/stores/filesystem/filesystem_database.py +57 -21
- linkml_store/api/stores/mongodb/mongodb_collection.py +10 -4
- linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +88 -7
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/format_utils.py +89 -8
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/METADATA +4 -1
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/RECORD +20 -16
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.8.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/filesystem/filesystem_database.py
CHANGED
@@ -1,36 +1,72 @@
 import logging
+from pathlib import Path
 from typing import Optional

-
-from
-from
+import yaml
+from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime import SchemaView
+
+from linkml_store.api import Database
+from linkml_store.api.config import DatabaseConfig
 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
+from linkml_store.utils.format_utils import Format, load_objects

 logger = logging.getLogger(__name__)


 class FileSystemDatabase(Database):
     collection_class = FileSystemCollection
-    wrapped_database: Database = None

-
-
+    directory_path: Optional[Path] = None
+    default_file_format: Optional[str] = None
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        handle = handle.replace("file:", "")
+        if handle.startswith("//"):
+            handle = handle[2:]
+        self.directory_path = Path(handle)
+        self.load_metadata()
         super().__init__(handle=handle, **kwargs)

-
-
-
+    @property
+    def metadata_path(self) -> Path:
+        return self.directory_path / ".linkml_metadata.yaml"
+
+    def load_metadata(self):
+        if self.metadata_path.exists():
+            md_dict = yaml.safe_load(open(self.metadata_path))
+            metadata = DatabaseConfig(**md_dict)
+        else:
+            metadata = DatabaseConfig()
+        self.metadata = metadata

     def close(self, **kwargs):
-
-
-    def
-        self
-
-
-
-
-
-
+        pass
+
+    def init_collections(self):
+        metadata = self.metadata
+        if self._collections is None:
+            self._collections = {}
+        for name, collection_config in metadata.collections.items():
+            collection = FileSystemCollection(parent=self, **collection_config.dict())
+            self._collections[name] = collection
+        path = self.directory_path
+        if path.exists():
+            for fmt in Format:
+                suffix = fmt.value
+                logger.info(f"Looking for {suffix} files in {path}")
+                for f in path.glob(f"*.{suffix}"):
+                    logger.info(f"Found {f}")
+                    n = f.stem
+                    objs = load_objects(f, suffix, expected_type=list)
+                    collection = FileSystemCollection(parent=self, name=n)
+                    self._collections[n] = collection
+                    collection._set_objects(objs)
+
+    def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
+        sb = SchemaBuilder()
+
+        for collection_name in self.list_collection_names():
+            sb.add_class(collection_name)
+        return SchemaView(sb.schema)
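The new `file:` handle parsing above is easy to check in isolation. A minimal sketch of the same logic (the helper name `_directory_from_handle` is illustrative, not part of the package):

import pathlib

def _directory_from_handle(handle: str) -> pathlib.Path:
    # Mirrors FileSystemDatabase.__init__ above: strip the "file:"
    # scheme, then any leading "//", and treat the rest as a directory.
    handle = handle.replace("file:", "")
    if handle.startswith("//"):
        handle = handle[2:]
    return pathlib.Path(handle)

assert _directory_from_handle("file:///tmp/mydb") == pathlib.Path("/tmp/mydb")
assert _directory_from_handle("file:.") == pathlib.Path(".")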
linkml_store/api/stores/mongodb/mongodb_collection.py
CHANGED
@@ -26,17 +26,23 @@ class MongoDBCollection(Collection):
     def mongo_collection(self) -> MongoCollection:
         if not self.name:
             raise ValueError("Collection name not set")
-
+        collection_name = self.alias or self.name
+        return self.parent.native_db[collection_name]

     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
         self.mongo_collection.insert_many(objs)
+        # TODO: allow mapping of _id to id for efficiency
+        for obj in objs:
+            del obj["_id"]
+        self._post_insert_hook(objs)

-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def query(self, query: Query, limit: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
-
-
+        limit = limit or query.limit
+        if limit and limit >= 0:
+            cursor = self.mongo_collection.find(mongo_filter).limit(limit)
         else:
             cursor = self.mongo_collection.find(mongo_filter)

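The `del obj["_id"]` loop compensates for PyMongo mutating its input: `insert_many` writes the generated ObjectId back into each dict it is given. A minimal demonstration, assuming a MongoDB server on localhost:27017:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumes a local server
coll = client["test"]["scratch"]
docs = [{"name": "a"}, {"name": "b"}]
coll.insert_many(docs)
print(docs[0])  # {'name': 'a', '_id': ObjectId('...')} -- the input dict was mutated
for d in docs:
    del d["_id"]  # what the new insert() code does before the post-insert hook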
linkml_store/api/stores/mongodb/mongodb_database.py
CHANGED
@@ -29,9 +29,17 @@ class MongoDBDatabase(Database):

     def __init__(self, handle: Optional[str] = None, **kwargs):
         if handle is None:
-            handle = "mongodb://localhost:27017"
+            handle = "mongodb://localhost:27017/test"
         super().__init__(handle=handle, **kwargs)

+    @property
+    def _db_name(self) -> str:
+        if self.handle:
+            db = self.handle.split("/")[-1]
+        else:
+            db = "default"
+        return db
+
     @property
     def native_client(self) -> MongoClient:
         if self._native_client is None:
@@ -44,7 +52,7 @@ class MongoDBDatabase(Database):
             alias = self.metadata.alias
             if not alias:
                 alias = "default"
-            self._native_db = self.native_client[
+            self._native_db = self.native_client[self._db_name]
         return self._native_db

     def commit(self, **kwargs):
@@ -58,9 +66,12 @@ class MongoDBDatabase(Database):
         self.native_client.drop_database(self.metadata.alias)

     def query(self, query: Query, **kwargs) -> QueryResult:
+        # TODO: DRY
         if query.from_table:
             collection = self.get_collection(query.from_table)
             return collection.query(query, **kwargs)
+        else:
+            raise NotImplementedError(f"Querying without a table is not supported in {self.__class__.__name__}")

     def init_collections(self):
         if self._collections is None:
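The new `_db_name` property simply takes the last slash-delimited segment of the handle, which also explains why the default handle gained a `/test` suffix:

# How _db_name derives the database name from the handle:
"mongodb://localhost:27017/test".split("/")[-1]   # -> 'test'
# A handle without a path segment yields the host:port instead,
# which is why the default handle now ends in "/test":
"mongodb://localhost:27017".split("/")[-1]        # -> 'localhost:27017'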
linkml_store/cli.py
CHANGED
@@ -14,7 +14,7 @@ from linkml_store.api.queries import Query
 from linkml_store.index import get_indexer
 from linkml_store.index.implementations.simple_indexer import SimpleIndexer
 from linkml_store.index.indexer import Indexer
-from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output
+from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
 from linkml_store.utils.object_utils import object_path_update

 index_type_option = click.option(
@@ -181,6 +181,7 @@ def insert(ctx, files, object, format):
         objects = yaml.safe_load(object_str)
         collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.name}'.")
+    collection.commit()


 @cli.command()
@@ -213,9 +214,9 @@ def store(ctx, files, object, format):


 @cli.command(name="import")
-@click.argument("files", type=click.Path(exists=True), nargs=-1)
 @click.option("--format", "-f", help="Input format")
 @click.pass_context
+@click.argument("files", type=click.Path(exists=True), nargs=-1)
 def import_database(ctx, files, format):
     """Imports a database from a dump."""
     settings = ctx.obj["settings"]
@@ -242,13 +243,77 @@ def export(ctx, output_type, output):


 @cli.command()
-@click.option("--
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--other-database", "-D", required=False, help="Path to the other database")
+@click.option("--other-collection", "-X", required=True, help="Name of the other collection")
+@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+@click.pass_context
+def diff(ctx, output, output_type, other_database, other_collection, identifier_attribute):
+    """Diffs two collections to create a patch."""
+    settings = ctx.obj["settings"]
+    db = settings.database
+    collection = settings.collection
+    if not collection:
+        raise ValueError("Collection must be specified.")
+    other_db = settings.client.get_database(other_database) if other_database else db
+    other_collection = other_db.get_collection(other_collection)
+    if identifier_attribute:
+        collection.set_identifier_attribute_name(identifier_attribute)
+        other_collection.set_identifier_attribute_name(identifier_attribute)
+    diff = collection.diff(other_collection)
+    write_output(diff, output_type, target=output)
+
+
+@cli.command()
+@click.option("--identifier-attribute", "-I", required=False, help="Primary key name")
+@click.argument("patch_files", type=click.Path(exists=True), nargs=-1)
+@click.pass_context
+def apply(ctx, patch_files, identifier_attribute):
+    """
+    Apply a patch to a collection.
+    """
+    settings = ctx.obj["settings"]
+    collection = settings.collection
+    if not collection:
+        raise ValueError("Collection must be specified.")
+    if identifier_attribute:
+        collection.set_identifier_attribute_name(identifier_attribute)
+    for patch_file in patch_files:
+        patch_objs = load_objects(patch_file, expected_type=list)
+        collection.apply_patches(patch_objs)
+
+
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
 @click.pass_context
 def query(ctx, where, limit, output_type, output):
-    """Query objects from the specified collection.
+    """Query objects from the specified collection.
+
+
+    Leave the query field blank to return all objects in the collection.
+
+    Examples:
+
+        linkml-store -d duckdb:///countries.db -c countries query
+
+    Queries can be specified in YAML, as basic key-value pairs
+
+    Examples:
+
+        linkml-store -d duckdb:///countries.db -c countries query -w 'code: NZ'
+
+    More complex queries can be specified using MongoDB-style query syntax
+
+    Examples:
+
+        linkml-store -d file:. -c persons query -w 'occupation: {$ne: Architect}'
+
+    Finds all people who are not architects.
+    """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
     query = Query(from_table=collection.name, where_clause=where_clause, limit=limit)
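The `-w` option is parsed with `yaml.safe_load`, so both plain key-value filters and MongoDB-style operators arrive as ordinary dicts:

import yaml

# Simple key-value filter:
yaml.safe_load("code: NZ")
# -> {'code': 'NZ'}

# MongoDB-style operator, as in the docstring example:
yaml.safe_load("occupation: {$ne: Architect}")
# -> {'occupation': {'$ne': 'Architect'}}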
@@ -326,6 +391,21 @@ def _get_index(index_type=None, **kwargs) -> Indexer:
         raise ValueError(f"Unknown index type: {index_type}")


+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--output-type", "-O", type=format_choice, default=Format.FORMATTED.value, help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.pass_context
+def describe(ctx, where, output_type, output):
+    """
+    Describe the collection schema.
+    """
+    where_clause = yaml.safe_load(where) if where else None
+    collection = ctx.obj["settings"].collection
+    df = collection.find(where_clause, limit=1).rows_dataframe
+    write_output(df.describe(include="all").transpose(), output_type, target=output)
+
+
 @cli.command()
 @index_type_option
 @click.option("--cached-embeddings-database", "-E", help="Path to the database where embeddings are cached")
@@ -335,9 +415,7 @@ def index(ctx, index_type, **kwargs):
     """
     Create an index over a collection.

-
-    :param index_type:
-    :return:
+    By default a simple trigram index is used.
     """
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type, **kwargs)
@@ -397,6 +475,9 @@ def search(ctx, search_term, where, limit, index_type, output_type, output, auto
 @cli.command()
 @click.pass_context
 def indexes(ctx):
+    """
+    Show the indexes for a collection.
+    """
     collection = ctx.obj["settings"].collection
     for name, ix in collection.indexers.items():
         click.echo(f"{name}: {type(ix)}\n{ix.model_json()}")
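The new `describe` command renders a pandas statistical summary rather than a schema in the strict sense. The output shape, on illustrative data:

import pandas as pd

df = pd.DataFrame([{"name": "Alice", "age": 33}, {"name": "Bob", "age": 25}])
# Same call chain as describe() above: summarize all columns, one row per column.
print(df.describe(include="all").transpose())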
linkml_store/utils/change_utils.py
ADDED
@@ -0,0 +1,17 @@
+from typing import List
+
+from linkml_store.api.collection import OBJECT
+
+
+def insert_operation_to_patches(objs: List[OBJECT], **kwargs):
+    """
+    Translate a list of objects to a list of patches for insertion.
+
+    Note: inserts are always treated as being at the start of a list
+
+    :param objs: objects to insert
+    :param kwargs: additional arguments
+    """
+    patches = []
+    for obj in objs:
+        patches.append({"op": "add", "path": "/0", "value": obj})
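Each generated patch prepends at JSON Pointer path `/0`. Applied with jsonpatch (the library used by `patch_utils` below), the effect is:

import jsonpatch

objs = [{"id": "F2"}, {"id": "F3"}]
patch = jsonpatch.JsonPatch([{"op": "add", "path": "/0", "value": {"id": "F1"}}])
print(patch.apply(objs))
# [{'id': 'F1'}, {'id': 'F2'}, {'id': 'F3'}]  -- new object inserted at the front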
linkml_store/utils/format_utils.py
CHANGED
@@ -4,8 +4,9 @@ import sys
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TextIO, Type, Union

+import pandas as pd
 import yaml
 from pydantic import BaseModel

@@ -20,9 +21,13 @@ class Format(Enum):
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PARQUET = "parquet"
+    FORMATTED = "formatted"


-def load_objects(
+def load_objects(
+    file_path: Union[str, Path], format: Union[Format, str] = None, expected_type: Type = None
+) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.

@@ -32,6 +37,7 @@ def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None)

     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -40,24 +46,39 @@ def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None)
     if isinstance(file_path, Path):
         file_path = str(file_path)

+    if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
+        format = Format.PARQUET
+
+    mode = "r"
+    if format == Format.PARQUET:
+        mode = "rb"
+
     if file_path == "-":
         # set file_path to be a stream from stdin
         f = sys.stdin
     else:
-        f = open(file_path)
+        f = open(file_path, mode)

     if format == Format.JSON or (not format and file_path.endswith(".json")):
         objs = json.load(f)
     elif format == Format.JSONL or (not format and file_path.endswith(".jsonl")):
         objs = [json.loads(line) for line in f]
     elif format == Format.YAML or (not format and (file_path.endswith(".yaml") or file_path.endswith(".yml"))):
-
+        if expected_type and expected_type == list:
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
     elif format == Format.TSV or (not format and file_path.endswith(".tsv")):
         reader = csv.DictReader(f, delimiter="\t")
         objs = list(reader)
     elif format == Format.CSV or (not format and file_path.endswith(".csv")):
         reader = csv.DictReader(f)
         objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
     else:
         raise ValueError(f"Unsupported file format: {file_path}")
     if not isinstance(objs, list):
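A round-trip sketch of the new Parquet path in `load_objects`, using only the pyarrow calls shown above (the file name is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

# Write a small Parquet file...
table = pa.Table.from_pylist([{"code": "NZ", "name": "New Zealand"}])
pq.write_table(table, "countries.parquet")

# ...then read it back the way load_objects does: binary handle, via pandas.
with open("countries.parquet", "rb") as f:
    objs = pq.read_table(f).to_pandas().to_dict(orient="records")
print(objs)  # [{'code': 'NZ', 'name': 'New Zealand'}]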
@@ -65,7 +86,40 @@ def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None)
     return objs


-def
+def write_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
+    format: Union[Format, str] = Format.YAML,
+    target: Optional[Union[TextIO, str, Path]] = None,
+) -> None:
+    """
+    Write output data to a file in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    >>> write_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON, sys.stdout)
+    [
+      {
+        "a": 1,
+        "b": 2
+      },
+      {
+        "a": 3,
+        "b": 4
+      }
+    ]
+    """
+    output_str = render_output(data, format)
+    if target:
+        if isinstance(target, str):
+            with open(target, "w") as target:
+                target.write(output_str)
+        else:
+            target.write(output_str)
+    else:
+        print(output_str)
+
+
+def render_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.

@@ -89,6 +143,14 @@ def render_output(data: Union[List[Dict[str, Any]], Dict[str, Any]], format: Uni
     if isinstance(format, str):
         format = Format(format)

+    if format == Format.FORMATTED:
+        if not isinstance(data, pd.DataFrame):
+            data = pd.DataFrame(data)
+        return str(data)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.to_dict(orient="records")
+
     if isinstance(data, BaseModel):
         data = data.model_dump()

@@ -97,16 +159,19 @@ def render_output(data: Union[List[Dict[str, Any]], Dict[str, Any]], format: Uni
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
     elif format == Format.YAML:
-
+        if isinstance(data, list):
+            return yaml.safe_dump_all(data, sort_keys=False)
+        else:
+            return yaml.safe_dump(data, sort_keys=False)
     elif format == Format.TSV:
         output = StringIO()
-        writer = csv.DictWriter(output, fieldnames=data
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data), delimiter="\t")
         writer.writeheader()
         writer.writerows(data)
         return output.getvalue()
     elif format == Format.CSV:
         output = StringIO()
-        writer = csv.DictWriter(output, fieldnames=data
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data))
         writer.writeheader()
         writer.writerows(data)
         return output.getvalue()
@@ -114,6 +179,22 @@ def render_output(data: Union[List[Dict[str, Any]], Dict[str, Any]], format: Uni
         raise ValueError(f"Unsupported output format: {format}")


+def get_fieldnames(data: List[Dict[str, Any]]) -> List[str]:
+    """
+    Get the fieldnames of a list of dictionaries.
+
+    >>> get_fieldnames([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+    ['a', 'b']
+
+    :param data: The list of dictionaries.
+    :return: The fieldnames.
+    """
+    fieldnames = []
+    for obj in data:
+        fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+    return fieldnames
+
+
 def guess_format(path: str) -> Optional[Format]:
     """
     Guess the format of a file based on its extension.
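For lists, `render_output` now emits a multi-document YAML stream via `safe_dump_all` rather than a single YAML sequence:

import yaml

print(yaml.safe_dump_all([{"a": 1}, {"b": 2}], sort_keys=False))
# a: 1
# ---
# b: 2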
linkml_store/utils/patch_utils.py
ADDED
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional, TypedDict
+
+import jsonpatch
+
+
+class PatchDict(TypedDict):
+    op: str
+    path: str
+    value: Optional[Any]
+    _from: Optional[str]
+
+
+def apply_patches(obj: Any, patches: List[PatchDict], primary_key: Optional[str] = None, in_place=False) -> Any:
+    """
+    Apply a set of patches to an object.
+
+    If the object is a list, the primary key must be specified.
+
+    >>> objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches = [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+    >>> apply_patches(objs, patches, primary_key='id')
+    [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+
+    :param obj: object to patch
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects (if obj is a list)
+    :param in_place: whether to apply the patches in place
+    :return:
+    """
+    if isinstance(obj, dict):
+        patch_obj = jsonpatch.JsonPatch(patches)
+        return patch_obj.apply(obj, in_place=in_place)
+    elif isinstance(obj, list):
+        if not primary_key:
+            raise ValueError("Primary key must be specified for list objects")
+        return apply_patches_to_list(obj, patches, primary_key, in_place=in_place)
+    else:
+        raise ValueError(f"Unsupported object type: {type(obj)}")
+
+
+def apply_patches_to_list(
+    objects: List[Dict[str, Any]], patches: List[PatchDict], primary_key: str, in_place=False
+) -> List[Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects.
+
+    :param objects: list of objects
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects
+    :param in_place: whether to apply the patches in place
+    :return:
+    """
+    objs_as_dict = {obj[primary_key]: obj for obj in objects}
+    result = apply_patches_to_keyed_list(objs_as_dict, patches, in_place=in_place)
+    return list(result.values())
+
+
+def apply_patches_to_keyed_list(
+    objs_as_dict: Dict[str, Dict[str, Any]], patches: List[PatchDict], in_place=False
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects, where the objects are keyed by a primary key
+
+    :param objs_as_dict:
+    :param patches:
+    :param in_place:
+    :return:
+    """
+    patch_obj = jsonpatch.JsonPatch(patches)
+    result = patch_obj.apply(objs_as_dict, in_place=in_place)
+    return result
+
+
+def patches_from_objects_lists(
+    src_objs: List[Dict[str, Any]], dst_objs: List[Dict[str, Any]], primary_key: str, exclude_none=True
+) -> List[PatchDict]:
+    """
+    Generate a set of patches to transform src_objs into tgt_objs.
+
+    >>> src_objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+
+    by default exclude_none is True, so None values are excluded from the patch
+
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': None}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'remove', 'path': '/F2/name'}]
+
+    if exclude_none is False, None values are treated as being set to None
+
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id', exclude_none=False)
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'replace', 'path': '/F2/name', 'value': None}]
+
+    See also: `<https://github.com/orgs/linkml/discussions/1975>`_
+
+    Note the patches are sorted deterministically, first by path, then by operation.
+    This helps ensure operations on the same object are grouped together
+
+    :param src_objs: source objects
+    :param dst_objs: target objects
+    :param primary_key: key to use as the primary key for the objects
+    :param exclude_none: whether to exclude None values from the patch
+    :return:
+    """
+    src_objs_as_dict = {obj[primary_key]: obj for obj in src_objs}
+    dst_objs_as_dict = {obj[primary_key]: obj for obj in dst_objs}
+    if exclude_none:
+        src_objs_as_dict = {k: remove_nones(v) for k, v in src_objs_as_dict.items()}
+        dst_objs_as_dict = {k: remove_nones(v) for k, v in dst_objs_as_dict.items()}
+    patch_obj = jsonpatch.JsonPatch.from_diff(src_objs_as_dict, dst_objs_as_dict)
+    pl = patch_obj.patch
+    return sorted(pl, key=lambda x: (x["path"], x["op"]))
+
+
+def remove_nones(obj: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Remove None values from a dictionary.
+
+    :param obj:
+    :return:
+    """
+    return {k: v for k, v in obj.items() if v is not None}
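The new `diff` and `apply` CLI commands rest on these two helpers. A round-trip sketch, with the import path taken from the file list above:

from linkml_store.utils.patch_utils import apply_patches, patches_from_objects_lists

src = [{"id": "F1", "name": "Cheese"}, {"id": "F2", "name": "Bread"}]
dst = [{"id": "F1", "name": "Toast"}, {"id": "F2", "name": "Bread"}]

# diff: compute a JSON Patch keyed by the identifier attribute...
patches = patches_from_objects_lists(src, dst, primary_key="id")
# [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]

# ...apply: replaying the patch on the source reproduces the target.
assert apply_patches(src, patches, primary_key="id") == dst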