linkml-store 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +32 -5
- linkml_store/api/collection.py +276 -27
- linkml_store/api/config.py +6 -2
- linkml_store/api/database.py +264 -21
- linkml_store/api/stores/chromadb/__init__.py +5 -1
- linkml_store/api/stores/duckdb/__init__.py +9 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +7 -4
- linkml_store/api/stores/duckdb/duckdb_database.py +19 -5
- linkml_store/api/stores/duckdb/mappings.py +1 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +177 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +72 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +31 -10
- linkml_store/api/stores/mongodb/mongodb_database.py +13 -2
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +150 -15
- linkml_store/index/__init__.py +6 -2
- linkml_store/index/implementations/llm_indexer.py +83 -5
- linkml_store/index/implementations/simple_indexer.py +2 -2
- linkml_store/index/indexer.py +32 -8
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/format_utils.py +139 -8
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/METADATA +7 -1
- linkml_store-0.1.9.dist-info/RECORD +49 -0
- linkml_store-0.1.7.dist-info/RECORD +0 -42
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.7.dist-info → linkml_store-0.1.9.dist-info}/entry_points.txt +0 -0
linkml_store/index/implementations/llm_indexer.py
CHANGED
@@ -1,20 +1,34 @@
+import logging
+from pathlib import Path
 from typing import TYPE_CHECKING, List
 
 import numpy as np
 
+from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
 
 if TYPE_CHECKING:
     import llm
 
 
+logger = logging.getLogger(__name__)
+
+
 class LLMIndexer(Indexer):
     """
-
+    An indexer that wraps the llm library.
+
+    This indexer is used to convert text to vectors using the llm library.
+
+    >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+    >>> vector = indexer.text_to_vector("hello")
     """
 
     embedding_model_name: str = "ada-002"
     _embedding_model: "llm.EmbeddingModel" = None
+    cached_embeddings_database: str = None
+    cached_embeddings_collection: str = None
+    cache_queries: bool = False
 
     @property
     def embedding_model(self):
@@ -24,21 +38,85 @@ class LLMIndexer(Indexer):
         self._embedding_model = llm.get_embedding_model(self.embedding_model_name)
         return self._embedding_model
 
-    def text_to_vector(self, text: str) -> INDEX_ITEM:
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
         """
         Convert a text to an indexable object
 
+        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+        >>> vector = indexer.text_to_vector("hello")
+
         :param text:
         :return:
         """
-        return self.texts_to_vectors([text])[0]
+        return self.texts_to_vectors([text], cache=cache, **kwargs)[0]
 
-    def texts_to_vectors(self, texts: List[str]) -> List[INDEX_ITEM]:
+    def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
         Use LLM to embed
 
+        >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
+        >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
+
         :param texts:
         :return:
         """
-
+        logging.info(f"Converting {len(texts)} texts to vectors")
+        model = self.embedding_model
+        if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
+            model_id = model.model_id
+            if not model_id:
+                raise ValueError("Model ID is required to cache embeddings")
+            db_path = Path(self.cached_embeddings_database)
+            coll_name = self.cached_embeddings_collection
+            if not coll_name:
+                coll_name = "all_embeddings"
+            from linkml_store import Client
+
+            embeddings_client = Client()
+            config = CollectionConfig(
+                name=coll_name,
+                type="Embeddings",
+                attributes={
+                    "text": {"range": "string"},
+                    "model_id": {"range": "string"},
+                    "embedding": {"range": "float", "array": {}},
+                },
+            )
+            embeddings_db = embeddings_client.get_database(f"duckdb:///{db_path}")
+            if coll_name in embeddings_db.list_collection_names():
+                # Load existing collection and use its model
+                embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
+            else:
+                embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
+            texts = list(texts)
+            embeddings = list([None] * len(texts))
+            uncached_texts = []
+            n = 0
+            for i in range(len(texts)):
+                # TODO: optimize this
+                text = texts[i]
+                logger.info(f"Looking for cached embedding for {text}")
+                r = embeddings_collection.find({"text": text, "model_id": model_id})
+                if r.num_rows:
+                    embeddings[i] = r.rows[0]["embedding"]
+                    n += 1
+                    logger.info("Found")
+                else:
+                    uncached_texts.append((text, i))
+                    logger.info("NOT Found")
+            logger.info(f"Found {n} cached embeddings")
+            if uncached_texts:
+                logger.info(f"Embedding {len(uncached_texts)} uncached texts")
+                uncached_texts, uncached_indices = zip(*uncached_texts)
+                uncached_embeddings = list(model.embed_multi(uncached_texts))
+                # TODO: combine into a single insert with multiple rows
+                for i, index in enumerate(uncached_indices):
+                    logger.debug(f"Indexing text at {i}")
+                    embeddings[index] = uncached_embeddings[i]
+                    embeddings_collection.insert(
+                        {"text": uncached_texts[i], "embedding": embeddings[index], "model_id": model_id}
+                    )
+        else:
+            logger.info(f"Embedding {len(texts)} texts")
+            embeddings = model.embed_multi(texts)
         return [np.array(v, dtype=float) for v in embeddings]
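A minimal usage sketch for the new embedding cache added above, assuming the `llm` package is configured with a working embedding model; the DuckDB path below is an illustrative assumption, not part of the release.

# Sketch only: exercises the cache-related fields and the new `cache` keyword.
from linkml_store.index.implementations.llm_indexer import LLMIndexer

indexer = LLMIndexer(
    embedding_model_name="ada-002",
    cached_embeddings_database="/tmp/embeddings_cache.db",  # illustrative path
    cached_embeddings_collection="all_embeddings",
)
# The first call embeds via the model and inserts rows into the cache collection;
# repeating it with the same texts should be served from the DuckDB cache.
vectors = indexer.texts_to_vectors(["hello", "goodbye"], cache=True)
vector = indexer.text_to_vector("hello", cache=True)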
linkml_store/index/implementations/simple_indexer.py
CHANGED
@@ -15,7 +15,7 @@ class SimpleIndexer(Indexer):
     This uses a naive method to generate an index from text. It is not suitable for production use.
     """
 
-    def text_to_vector(self, text: str) -> INDEX_ITEM:
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
         """
         This is a naive method purely for testing
 
@@ -39,5 +39,5 @@ class SimpleIndexer(Indexer):
 
         # Increment the count at the computed index
         vector[index] += 1.0
-        logger.
+        logger.debug(f"Indexed text: {text} as {vector}")
         return vector
linkml_store/index/indexer.py
CHANGED
@@ -1,3 +1,5 @@
+import logging
+from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -5,6 +7,13 @@ from pydantic import BaseModel
 
 INDEX_ITEM = np.ndarray
 
+logger = logging.getLogger(__name__)
+
+
+class TemplateSyntaxEnum(str, Enum):
+    jinja2 = "jinja2"
+    fstring = "fstring"
+
 
 def cosine_similarity(vector1, vector2):
     dot_product = np.dot(vector1, vector2)
@@ -21,8 +30,9 @@ class Indexer(BaseModel):
     name: Optional[str] = None
     index_function: Optional[Callable] = None
     distance_function: Optional[Callable] = None
-    index_attributes: Optional[str] = None
+    index_attributes: Optional[List[str]] = None
     text_template: Optional[str] = None
+    text_template_syntax: Optional[TemplateSyntaxEnum] = None
     filter_nulls: Optional[bool] = True
     vector_default_length: Optional[int] = 1000
     index_field: Optional[str] = "__index__"
@@ -41,24 +51,25 @@ class Indexer(BaseModel):
         Convert a list of objects to indexable objects
 
         :param objs:
-        :return:
+        :return: list of vectors
         """
-        return [self.
+        return self.texts_to_vectors([self.object_to_text(obj) for obj in objs])
 
-    def texts_to_vectors(self, texts: List[str]) -> List[INDEX_ITEM]:
+    def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
         Convert a list of texts to indexable objects
 
         :param texts:
         :return:
         """
-        return [self.text_to_vector(text) for text in texts]
+        return [self.text_to_vector(text, cache=cache, **kwargs) for text in texts]
 
-    def text_to_vector(self, text: str) -> INDEX_ITEM:
+    def text_to_vector(self, text: str, cache: bool = None, **kwargs) -> INDEX_ITEM:
         """
         Convert a text to an indexable object
 
         :param text:
+        :param cache:
         :return:
         """
         raise NotImplementedError
@@ -71,11 +82,24 @@ class Indexer(BaseModel):
         :return:
         """
         if self.index_attributes:
+            if len(self.index_attributes) == 1 and not self.text_template:
+                return str(obj[self.index_attributes[0]])
             obj = {k: v for k, v in obj.items() if k in self.index_attributes}
         if self.filter_nulls:
             obj = {k: v for k, v in obj.items() if v is not None}
         if self.text_template:
-
+            syntax = self.text_template_syntax
+            if not syntax:
+                if "{%" in self.text_template or "{{" in self.text_template:
+                    logger.info("Detected Jinja2 syntax in text template")
+                    syntax = TemplateSyntaxEnum.jinja2
+            if syntax and syntax == TemplateSyntaxEnum.jinja2:
+                from jinja2 import Template
+
+                template = Template(self.text_template)
+                return template.render(**obj)
+            else:
+                return self.text_template.format(**obj)
         return str(obj)
 
     def search(
@@ -91,7 +115,7 @@ class Indexer(BaseModel):
         """
 
         # Convert the query string to a vector
-        query_vector = self.text_to_vector(query)
+        query_vector = self.text_to_vector(query, cache=False)
 
         distances = []
 
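A minimal sketch of the new list-valued `index_attributes` and the Jinja2 template detection in `object_to_text`, using `SimpleIndexer` as a stand-in concrete indexer; the field names and values are illustrative assumptions.

# Sketch only: object_to_text keeps the listed attributes and, because the
# template contains "{{", is rendered with Jinja2 (TemplateSyntaxEnum.jinja2).
from linkml_store.index.implementations.simple_indexer import SimpleIndexer

indexer = SimpleIndexer(
    name="demo",
    index_attributes=["name", "description"],
    text_template="{{ name }}: {{ description }}",
)
obj = {"name": "Cheese", "description": "a dairy product", "price": 4.5}
print(indexer.object_to_text(obj))  # expected: "Cheese: a dairy product"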
linkml_store/utils/change_utils.py
ADDED
@@ -0,0 +1,17 @@
+from typing import List
+
+from linkml_store.api.collection import OBJECT
+
+
+def insert_operation_to_patches(objs: List[OBJECT], **kwargs):
+    """
+    Translate a list of objects to a list of patches for insertion.
+
+    Note: inserts are always treated as being at the start of a list
+
+    :param objs: objects to insert
+    :param kwargs: additional arguments
+    """
+    patches = []
+    for obj in objs:
+        patches.append({"op": "add", "path": "/0", "value": obj})
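For orientation, a sketch of the JSON Patch shape that `insert_operation_to_patches` assembles (each inserted object becomes an "add" operation at path "/0"); the data is illustrative, and whether the list is returned is not visible in the hunk above.

# Sketch of the patch shape built above.
objs = [{"id": "F1", "name": "Cheese"}, {"id": "F2", "name": "Bread"}]
patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
# -> [{'op': 'add', 'path': '/0', 'value': {'id': 'F1', 'name': 'Cheese'}},
#     {'op': 'add', 'path': '/0', 'value': {'id': 'F2', 'name': 'Bread'}}]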
linkml_store/utils/format_utils.py
CHANGED
@@ -4,26 +4,40 @@ import sys
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, TextIO, Type, Union
 
+import pandas as pd
 import yaml
 from pydantic import BaseModel
 
 
 class Format(Enum):
+    """
+    Supported generic file formats for loading and rendering objects.
+    """
+
     JSON = "json"
     JSONL = "jsonl"
     YAML = "yaml"
     TSV = "tsv"
     CSV = "csv"
+    PARQUET = "parquet"
+    FORMATTED = "formatted"
 
 
-def load_objects(
+def load_objects(
+    file_path: Union[str, Path], format: Union[Format, str] = None, expected_type: Type = None
+) -> List[Dict[str, Any]]:
     """
     Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
 
+    >>> load_objects("tests/input/test_data/data.csv")
+    [{'id': '1', 'name': 'John', 'age': '30'},
+    {'id': '2', 'name': 'Alice', 'age': '25'}, {'id': '3', 'name': 'Bob', 'age': '35'}]
+
     :param file_path: The path to the file.
     :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(format, str):
@@ -32,24 +46,39 @@ def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None)
     if isinstance(file_path, Path):
         file_path = str(file_path)
 
+    if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
+        format = Format.PARQUET
+
+    mode = "r"
+    if format == Format.PARQUET:
+        mode = "rb"
+
     if file_path == "-":
         # set file_path to be a stream from stdin
         f = sys.stdin
     else:
-        f = open(file_path)
+        f = open(file_path, mode)
 
     if format == Format.JSON or (not format and file_path.endswith(".json")):
         objs = json.load(f)
     elif format == Format.JSONL or (not format and file_path.endswith(".jsonl")):
         objs = [json.loads(line) for line in f]
     elif format == Format.YAML or (not format and (file_path.endswith(".yaml") or file_path.endswith(".yml"))):
-
+        if expected_type and expected_type == list:
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
     elif format == Format.TSV or (not format and file_path.endswith(".tsv")):
         reader = csv.DictReader(f, delimiter="\t")
         objs = list(reader)
     elif format == Format.CSV or (not format and file_path.endswith(".csv")):
         reader = csv.DictReader(f)
         objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
     else:
         raise ValueError(f"Unsupported file format: {file_path}")
     if not isinstance(objs, list):
@@ -57,10 +86,56 @@ def load_objects(file_path: Union[str, Path], format: Union[Format, str] = None)
     return objs
 
 
-def
+def write_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
+    format: Union[Format, str] = Format.YAML,
+    target: Optional[Union[TextIO, str, Path]] = None,
+) -> None:
+    """
+    Write output data to a file in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    >>> write_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON, sys.stdout)
+    [
+      {
+        "a": 1,
+        "b": 2
+      },
+      {
+        "a": 3,
+        "b": 4
+      }
+    ]
+    """
+    output_str = render_output(data, format)
+    if target:
+        if isinstance(target, str):
+            with open(target, "w") as target:
+                target.write(output_str)
+        else:
+            target.write(output_str)
+    else:
+        print(output_str)
+
+
+def render_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
 
+    >>> print(render_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON))
+    [
+      {
+        "a": 1,
+        "b": 2
+      },
+      {
+        "a": 3,
+        "b": 4
+      }
+    ]
+
+
     :param data: The data to be rendered.
     :param format: The desired output format. Can be a Format enum or a string value.
     :return: The rendered output as a string.
@@ -68,6 +143,14 @@ def render_output(data: List[Dict[str, Any]], format: Union[Format, str] = Forma
     if isinstance(format, str):
         format = Format(format)
 
+    if format == Format.FORMATTED:
+        if not isinstance(data, pd.DataFrame):
+            data = pd.DataFrame(data)
+        return str(data)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.to_dict(orient="records")
+
     if isinstance(data, BaseModel):
         data = data.model_dump()
 
@@ -76,18 +159,66 @@ def render_output(data: List[Dict[str, Any]], format: Union[Format, str] = Forma
     elif format == Format.JSONL:
         return "\n".join(json.dumps(obj) for obj in data)
     elif format == Format.YAML:
-
+        if isinstance(data, list):
+            return yaml.safe_dump_all(data, sort_keys=False)
+        else:
+            return yaml.safe_dump(data, sort_keys=False)
     elif format == Format.TSV:
         output = StringIO()
-        writer = csv.DictWriter(output, fieldnames=data
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data), delimiter="\t")
         writer.writeheader()
         writer.writerows(data)
         return output.getvalue()
     elif format == Format.CSV:
         output = StringIO()
-        writer = csv.DictWriter(output, fieldnames=data
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data))
         writer.writeheader()
         writer.writerows(data)
         return output.getvalue()
     else:
         raise ValueError(f"Unsupported output format: {format}")
+
+
+def get_fieldnames(data: List[Dict[str, Any]]) -> List[str]:
+    """
+    Get the fieldnames of a list of dictionaries.
+
+    >>> get_fieldnames([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+    ['a', 'b']
+
+    :param data: The list of dictionaries.
+    :return: The fieldnames.
+    """
+    fieldnames = []
+    for obj in data:
+        fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+    return fieldnames
+
+
+def guess_format(path: str) -> Optional[Format]:
+    """
+    Guess the format of a file based on its extension.
+
+    >>> guess_format("data.json")
+    <Format.JSON: 'json'>
+    >>> guess_format("data.jsonl")
+    <Format.JSONL: 'jsonl'>
+    >>> guess_format("data.yaml")
+    <Format.YAML: 'yaml'>
+    >>> assert not guess_format("data")
+
+    :param path: The path to the file.
+    :return: The guessed format.
+    """
+    if path.endswith(".json"):
+        return Format.JSON
+    elif path.endswith(".jsonl"):
+        return Format.JSONL
+    elif path.endswith(".yaml") or path.endswith(".yml"):
+        return Format.YAML
+    elif path.endswith(".tsv"):
+        return Format.TSV
+    elif path.endswith(".csv"):
+        return Format.CSV
+    else:
+        return None
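A minimal sketch tying together the helpers added to format_utils above (write_output, render_output, get_fieldnames, guess_format); the output path is an illustrative assumption.

# Sketch only: render, write, and re-load a couple of records.
from linkml_store.utils.format_utils import (
    Format,
    guess_format,
    load_objects,
    render_output,
    write_output,
)

rows = [{"id": "1", "name": "John"}, {"id": "2", "name": "Alice"}]
print(render_output(rows, Format.CSV))             # header order comes from get_fieldnames
write_output(rows, Format.JSON, "/tmp/rows.json")  # illustrative target path
assert guess_format("/tmp/rows.json") == Format.JSON
print(load_objects("/tmp/rows.json"))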
linkml_store/utils/patch_utils.py
ADDED
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional, TypedDict
+
+import jsonpatch
+
+
+class PatchDict(TypedDict):
+    op: str
+    path: str
+    value: Optional[Any]
+    _from: Optional[str]
+
+
+def apply_patches(obj: Any, patches: List[PatchDict], primary_key: Optional[str] = None, in_place=False) -> Any:
+    """
+    Apply a set of patches to an object.
+
+    If the object is a list, the primary key must be specified.
+
+    >>> objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches = [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+    >>> apply_patches(objs, patches, primary_key='id')
+    [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+
+    :param obj: object to patch
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects (if obj is a list)
+    :param in_place: whether to apply the patches in place
+    :return:
+    """
+    if isinstance(obj, dict):
+        patch_obj = jsonpatch.JsonPatch(patches)
+        return patch_obj.apply(obj, in_place=in_place)
+    elif isinstance(obj, list):
+        if not primary_key:
+            raise ValueError("Primary key must be specified for list objects")
+        return apply_patches_to_list(obj, patches, primary_key, in_place=in_place)
+    else:
+        raise ValueError(f"Unsupported object type: {type(obj)}")
+
+
+def apply_patches_to_list(
+    objects: List[Dict[str, Any]], patches: List[PatchDict], primary_key: str, in_place=False
+) -> List[Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects.
+
+
+
+    :param objects: list of objects
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects
+    :param in_place: whether to apply the patches in place
+    :return:
+    """
+    objs_as_dict = {obj[primary_key]: obj for obj in objects}
+    result = apply_patches_to_keyed_list(objs_as_dict, patches, in_place=in_place)
+    return list(result.values())
+
+
+def apply_patches_to_keyed_list(
+    objs_as_dict: Dict[str, Dict[str, Any]], patches: List[PatchDict], in_place=False
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects, where the objects are keyed by a primary key
+
+    :param objs_as_dict:
+    :param patches:
+    :param in_place:
+    :return:
+    """
+    patch_obj = jsonpatch.JsonPatch(patches)
+    result = patch_obj.apply(objs_as_dict, in_place=in_place)
+    return result
+
+
+def patches_from_objects_lists(
+    src_objs: List[Dict[str, Any]], dst_objs: List[Dict[str, Any]], primary_key: str, exclude_none=True
+) -> List[PatchDict]:
+    """
+    Generate a set of patches to transform src_objs into tgt_objs.
+
+    >>> src_objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+
+    by default exclude_none is True, so None values are excluded from the patch
+
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': None}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'remove', 'path': '/F2/name'}]
+
+    if exclude_none is False, None values are treated as being set to None
+
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id', exclude_none=False)
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'replace', 'path': '/F2/name', 'value': None}]
+
+    See also: `<https://github.com/orgs/linkml/discussions/1975>`_
+
+    Note the patches are sorted deterministically, first by path, then by operation.
+    This helps ensure operations on the same object are grouped together
+
+    :param src_objs: source objects
+    :param dst_objs: target objects
+    :param primary_key: key to use as the primary key for the objects
+    :param exclude_none: whether to exclude None values from the patch
+    :return:
+    """
+    src_objs_as_dict = {obj[primary_key]: obj for obj in src_objs}
+    dst_objs_as_dict = {obj[primary_key]: obj for obj in dst_objs}
+    if exclude_none:
+        src_objs_as_dict = {k: remove_nones(v) for k, v in src_objs_as_dict.items()}
+        dst_objs_as_dict = {k: remove_nones(v) for k, v in dst_objs_as_dict.items()}
+    patch_obj = jsonpatch.JsonPatch.from_diff(src_objs_as_dict, dst_objs_as_dict)
+    pl = patch_obj.patch
+    return sorted(pl, key=lambda x: (x["path"], x["op"]))
+
+
+def remove_nones(obj: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Remove None values from a dictionary.
+
+    :param obj:
+    :return:
+    """
+    return {k: v for k, v in obj.items() if v is not None}
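A minimal round-trip sketch for the new patch utilities: diff two keyed object lists, then re-apply the resulting patches; the data simply mirrors the doctests shown above.

# Sketch only: generate patches from two lists keyed by "id" and apply them back.
from linkml_store.utils.patch_utils import apply_patches, patches_from_objects_lists

src = [{"id": "F1", "name": "Cheese"}, {"id": "F2", "name": "Bread"}]
dst = [{"id": "F1", "name": "Toast"}, {"id": "F2", "name": "Bread"}]

patches = patches_from_objects_lists(src, dst, primary_key="id")
# -> [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
assert apply_patches(src, patches, primary_key="id") == dst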